diff --git a/dups.py b/dups.py
index 7f06fe5..5128e11 100644
--- a/dups.py
+++ b/dups.py
@@ -78,7 +78,7 @@ class DupPathRow:
 # After the 2 passes, we have data structures that allow the web to break up
 # the duplicates into batches to process:
 # 1) auto delete any in the import path that are also in the storage path
-# - carefule here, if we have 2 in the import path and 2+ in the storage path, leave it for manual intervention
+# - careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
 # 2) auto delete any in the storage path that are in a set where 1 of them match the 'YYYY/YYYYMMDD' format, the rest are deleted
 # 3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
 # 4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
@@ -158,7 +158,7 @@ class Duplicates:
         if self.DupInImportAndStoragePath( row, dr1, dr2 ):
             return

-        # if the hast is no dups_to_process, created / append
+        # if the hash is not in dups_to_process, create / append
         if row.hash not in self.dups_to_process:
             self.dups_to_process[row.hash]=[]
         self.dups_to_process[row.hash].append( dr1 )
@@ -182,6 +182,9 @@ class Duplicates:
         self.dups_to_process[row.hash].append( dr2 )
         return

+    # AddDupPath: takes a row from the database, effectively a dup pair in dir1 & dir2;
+    # we process these into the appropriate data structures on this second pass,
+    # working out whether a dir is in the storage path and is the preferred one to keep
     def AddDupPath(self, hash):
         # this gets complex, if this hash is also in a shared imp / sp - then dont deal with it now, let the imp files be deleted and
         # the repeat check_dups validation step catch it as a cleander (potential) for still more duplicates just in sp
@@ -190,10 +193,13 @@ class Duplicates:
             return False
         new=1
         for el in self.per_path_dups:
+            # if this new hash / dup in dpr has the same dirs as an existing per_path_dups row, it is just another file in the same dup dirs...
             if el.d1 == dpr.d1 and el.d2 == dpr.d2:
                 el.count += 2
                 el.hashes = f"{el.hashes},{hash}"
                 new=0
+        # okay, we have a new pair of duplicate dirs... Add them, and if either matches the regex it is preferred
+        # FIXME: what if both do? what if one is in SP and the other not, etc...
         if new:
             self.per_path_dups.append( dpr )
             if re.search( r'\d{4}/\d{8}', dpr.d1):
@@ -202,6 +208,8 @@ class Duplicates:
                 self.preferred_path[dpr.did2]=1
         return True

+    # The second pass processes dups_to_process row by row, looking for per_file_dups and per_path_dups
+    # AND works out the counts used to display overall tallies of files kept, files deleted, and choices still to make
    def SecondPass(self):
        # sort out counts (for ip_to_sp - that is all finished)
        self.uniq_dups = len(self.hashes_processed)
@@ -211,9 +219,12 @@ class Duplicates:
        for hash in self.ip_to_sp_dups_del:
            self.total_dups += len(self.ip_to_sp_dups_del[hash])

+        # okay, go through each duplicate that should be processed (they are stored
+        # by hash, have at least 2 entries but can have more, and can be in
+        # the IP or SP in any combo) - cater for all of that below
        for hash in self.dups_to_process:
-            # more than 2 files (just ask per file) OR only 2 copies, and files are in same dir (so must be diff name, so just ask) OR content same, filename different (ask per file)
-            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
+            # more than 2 files (just ask per file) OR (implied) only 2 copies and the files are in the same dir (so must be diff names, so just ask) OR (implied) only 2 copies in different dirs with different filenames (ask per file)
+            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f):
                self.per_file_dups.append(self.dups_to_process[hash])
                for el in self.dups_to_process[hash]:
                    if re.search( r'\d{4}/\d{8}', el.d):
@@ -222,7 +233,7 @@ class Duplicates:
                # if this combination ALSO has an import path dup, then we have already counted the storage path dup in the earlier keeping count
                if hash in self.ip_to_sp_dups_keep:
                    self.total_dups -= 1
-            # only 2 files, with the same name, different path (ask per path)
+            # only 2 files AND in different paths AND with the same name (ask per path)...
            else:
                # if this dup path is not already being partially handled by an ip <-> sp dup, then add it / count it
                if self.AddDupPath( hash ):
@@ -235,7 +246,7 @@ class Duplicates:
                    self.total_dups += 1
        return

-    # quick debugger to see the data in the data structure
+    # quick debugger to see the data in the data structure (not used by default)
    def Dump(self):
        if len(self.ip_to_sp_dups_keep) > 0:
            print( "############ Files that are in both Import and Storage Paths ###########")
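
For readers following the reordered condition in SecondPass, below is a minimal standalone sketch of the per-file vs per-path decision it encodes. The Entry dataclass and classify helper are hypothetical simplifications for illustration only, not the real DupRow / Duplicates structures from dups.py.

# Hypothetical sketch of the rule described in the updated SecondPass comment.
from dataclasses import dataclass
from typing import List

@dataclass
class Entry:
    d: str   # directory the copy lives in
    f: str   # file name of the copy

def classify(copies: List[Entry]) -> str:
    """'per_file' = user must pick individual files; 'per_path' = user only picks which dir to keep."""
    if len(copies) > 2:
        return "per_file"   # 3 or more copies: always decide file by file
    a, b = copies
    if a.d == b.d:
        return "per_file"   # same dir, so the names must differ
    if a.f != b.f:
        return "per_file"   # different dirs but also different names
    return "per_path"       # exactly 2 copies, same name, different dirs

print(classify([Entry("2019/20190101", "img.jpg"), Entry("misc", "img.jpg")]))   # per_path
print(classify([Entry("misc", "img.jpg"), Entry("misc", "img (1).jpg")]))        # per_file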
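
The FIXME added in AddDupPath asks what should happen when both directories of a per-path pair match the dated layout. The short sketch below shows the '\d{4}/\d{8}' check and the cases it leaves open; preferred_dirs is a hypothetical helper, and the tie-break policy is deliberately left undecided, as in the diff.

# Hypothetical helper illustrating the preferred-dir regex and the open FIXME cases.
import re

DATED_DIR = re.compile(r'\d{4}/\d{8}')   # e.g. "2019/20190101"

def preferred_dirs(d1, d2):
    """Return whichever of the two dirs match the 'YYYY/YYYYMMDD' layout."""
    return [d for d in (d1, d2) if DATED_DIR.search(d)]

print(preferred_dirs("2019/20190101", "misc/backup"))    # one match -> clear preferred dir
print(preferred_dirs("2019/20190101", "2020/20200202"))  # both match -> the FIXME case, needs a policy
print(preferred_dirs("misc/a", "misc/b"))                # neither matches -> user has to choose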