diff --git a/dups.py b/dups.py
index cddaec4..6a4ed74 100644
--- a/dups.py
+++ b/dups.py
@@ -24,18 +24,28 @@ from job import Job, JobExtra, Joblog, NewJob
 from settings import Settings
 from shared import SymlinkName
 
+class DupRow:
+    def __init__(self, hash, file, dir, did, fid):
+        ### DupRow Attributes -- note, simple class, no methods ###
+        self.h=hash
+        self.f=file
+        self.d=dir
+        self.did=did
+        self.id=fid
+
 class Duplicates:
-    ip_to_sp_dups_keep={}
-    ip_to_sp_dups_del={}
-    in_same_dups={}
-    per_file_dups=[]
-    per_path_dups=[]
-    preferred={}
-    all_paths=[]
-    storage_paths=[]
-    import_paths=[]
-
     def __init__(self):
+        ### Duplicates Attributes ###
+        self.ip_to_sp_dups_keep={}
+        self.ip_to_sp_dups_del={}
+        self.in_same_dups={}
+        self.per_file_dups=[]
+        self.per_path_dups=[]
+        self.preferred={}
+        self.all_paths=[]
+        self.storage_paths=[]
+        self.import_paths=[]
+
         # per storage path, add entries to view
         settings=Settings.query.first()
         paths = settings.storage_path.split("#")
@@ -68,30 +78,60 @@ class Duplicates:
         return False
 
     def KeepInSameDups( self, obj ):
-        if obj['h'] not in self.ip_to_sp_dups_keep:
-            self.ip_to_sp_dups_keep[obj['h']]= obj
+        if obj.h not in self.ip_to_sp_dups_keep:
+            self.ip_to_sp_dups_keep[obj.h]= obj
         return
 
    def DelInSameDups( self, obj ):
-        if obj['h'] not in self.ip_to_sp_dups_del:
-            self.ip_to_sp_dups_del[obj['h']]=[]
-            self.ip_to_sp_dups_del[obj['h']].append( obj )
+        if obj.h not in self.ip_to_sp_dups_del:
+            self.ip_to_sp_dups_del[obj.h]=[]
+            self.ip_to_sp_dups_del[obj.h].append( obj )
         else:
-            for el in self.ip_to_sp_dups_del[obj['h']]:
-                if el['id'] == obj['id']:
+            for el in self.ip_to_sp_dups_del[obj.h]:
+                if el.id == obj.id:
                     return
-            self.ip_to_sp_dups_del[obj['h']].append( obj )
+            self.ip_to_sp_dups_del[obj.h].append( obj )
         return
 
-    def AddDup( self, row ):
+    def DupInImportAndStoragePath( self, row, dr1, dr2 ):
         if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
-            self.KeepInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
-            self.DelInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
-
+            self.KeepInSameDups( dr1 )
+            self.DelInSameDups( dr2 )
+            return True
 
         if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
-            self.KeepInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
-            self.DelInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
+            self.KeepInSameDups( dr2 )
+            self.DelInSameDups( dr1 )
+            return True
+        return False
+    def AddDup( self, row ):
+        dr1=DupRow( row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1 )
+        dr2=DupRow( row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2 )
+        if self.DupInImportAndStoragePath( row, dr1, dr2 ):
+            return
+
+        if row.hash not in self.in_same_dups:
+            self.in_same_dups[row.hash]=[]
+            self.in_same_dups[row.hash].append( dr1 )
+            self.in_same_dups[row.hash].append( dr2 )
+        else:
+            # process path1 / fname1 -- if that combo is not in the dups[hash], add it
+            found=0
+            for dup in self.in_same_dups[row.hash]:
+                if dup.id == row.id1:
+                    found=1
+                    continue
+            if not found:
+                self.in_same_dups[row.hash].append( dr1 )
+
+            # process path2 / fname2 -- if that combo is not in the dups[hash], add it
+            found=0
+            for dup in self.in_same_dups[row.hash]:
+                if dup.id == row.id2:
+                    found=1
+                    continue
+            if not found:
+                self.in_same_dups[row.hash].append( dr2 )
         return
 
     def Dump(self):
@@ -105,32 +145,21 @@ class Duplicates:
             for d in self.ip_to_sp_dups_del[h]:
                 print( f"Del: {d}" )
         print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
+        if len(self.in_same_dups) > 0:
+            print( "############ Duplicate Files that are in the same Path ###########")
+            cnt=0
+            for h in self.in_same_dups:
+                cnt +=1
+                if len(self.in_same_dups[h])>2:
+                    print( f"hash={h}, keep 1 of these: ", end='')
+                    for d in self.in_same_dups[h]:
+                        print( f"{d.id}, ", end='' )
+                    print ("")
+            print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
         return
 
+"""
-    if row.hash not in dups:
-        dups[row.hash]=[]
-        dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
-        dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
-    else:
-        # process path1 / fname1 -- if that combo is not in the dups[hash], add it
-        found=0
-        for dup in dups[row.hash]:
-            if dup['f'] == row.fname1 and dup['d'] == TrimmedPath(prefix, row.path1):
-                found=1
-                continue
-        if not found:
-            dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
-
-        # process path2 / fname2 -- if that combo is not in the dups[hash], add it
-        found=0
-        for dup in dups[row.hash]:
-            if dup['f'] == row.fname2 and dup['d'] == TrimmedPath(prefix, row.path2):
-                found=1
-                continue
-        if not found:
-            dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
-
 
 @app.route("/fix_dups", methods=["POST"])
 def fix_dups():
     rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )