From 7e14280ec5d06049ead9738781e44be8b0ba632c Mon Sep 17 00:00:00 2001
From: Damien De Paoli
Date: Sun, 21 Mar 2021 16:58:54 +1100
Subject: [PATCH] made new/better function to deal with path duplicates /
 reduced duplicate code, fixed overall count vars, improved debugs

---
 dups.py | 83 +++++++++++++++++++++++++--------------------------------
 1 file changed, 36 insertions(+), 47 deletions(-)

diff --git a/dups.py b/dups.py
index f47b464..ec5a881 100644
--- a/dups.py
+++ b/dups.py
@@ -137,6 +137,8 @@ class Duplicates:
     def KeepInSameDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_keep:
             self.ip_to_sp_dups_keep[obj.h]= obj
+            self.overall_dup_cnt += 1
+            self.overall_dup_sets += 1
         return

    # this stores this object into the Delete from same path list (if it is not
@@ -150,6 +152,8 @@ class Duplicates:
            if el.id == obj.id:
                return
        self.ip_to_sp_dups_del[obj.h].append( obj )
+        # only get here if this is a new duplicate to delete, so increment count
+        self.overall_dup_cnt += 1
        return

    # this function takes a duplicate file (in the import path and the storage path)
@@ -203,14 +207,29 @@ class Duplicates:
             self.dups_to_process[row.hash].append( dr2 )
         return

+    def AddDupPath(self, hash):
+        # this gets complex: if this hash is also in a shared imp / sp, then don't deal with it now, let the imp files be deleted and
+        # the repeat check_dups validation step catch it as a cleaner (potential) check for still more duplicates just in sp
+        if hash in self.ip_to_sp_dups_keep:
+            return
+        dpr=DupPathRow( 2, self.dups_to_process[hash][0].d, self.dups_to_process[hash][1].d, self.dups_to_process[hash][0].did, self.dups_to_process[hash][1].did, hash )
+        new=1
+        for el in self.per_path_dups:
+            if el.d1 == dpr.d1 and el.d2 == dpr.d2:
+                self.overall_dup_cnt += 2
+                el.count += 2
+                el.hashes = f"{el.hashes},{hash}"
+                new=0
+        if new:
+            self.per_path_dups.append( dpr )
+            self.overall_dup_sets += 1
+            self.overall_dup_cnt += 2
+        if re.search( r'\d{4}/\d{8}', dpr.d1):
+            self.preferred_path[dpr.did1]=1
+        if re.search( r'\d{4}/\d{8}', dpr.d2):
+            self.preferred_path[dpr.did2]=1
+
     def SecondPass(self):
-        print("################################## second pass starting")
-        d1=""
-        d2=""
-        did1=""
-        did2=""
-        dup_cnt=1
-        hashes=""
         for hash in self.dups_to_process:
             # more than 2 files (just ask per file) OR only 2 copies, and files are in same dir (so must be diff name, so just ask) OR content same, filename different (ask per file)
             if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
@@ -221,30 +240,8 @@ class Duplicates:
                     if re.search( r'\d{4}/\d{8}', el.d):
                         self.preferred_file[hash] = el.id
             # by here we have only 2 files, with the same name, different path (ask per path)
-            elif d1 != self.dups_to_process[hash][0].d:
-                if d1 != '':
-                    self.overall_dup_cnt += dup_cnt
-                    self.overall_dup_sets += 1
-                    self.per_path_dups.append( DupPathRow( dup_cnt, d1, d2, did1, did2, hashes ) )
-                    if re.search( r'\d{4}/\d{8}', d1):
-                        self.preferred_path[did1]=1
-                    if re.search( r'\d{4}/\d{8}', d2):
-                        self.preferred_path[did2]=1
-                dup_cnt=1
-                d1 = self.dups_to_process[hash][0].d
-                d2 = self.dups_to_process[hash][1].d
-                did1 = self.dups_to_process[hash][0].did
-                did2 = self.dups_to_process[hash][1].did
-                hashes = f"{hash},"
             else:
-                dup_cnt += 1
-                hashes += f"{hash},"
-
-        if d1 != '':
-            self.overall_dup_cnt += dup_cnt
-            self.overall_dup_sets += dup_cnt
-            self.per_path_dups.append( DupPathRow( dup_cnt, d1, d2, did1, did2, hashes ) )
-
-        print("#################### second pass FINISHED")
+                self.AddDupPath( hash )
         return

    # quick debugger to see the data in the data structure
@@ -252,25 +249,17 @@ class Duplicates:
        if len(self.ip_to_sp_dups_keep) > 0:
            print( "############ Files that are in both Import and Storage Paths ###########")
            for h in self.ip_to_sp_dups_keep:
-                if len(self.ip_to_sp_dups_del[h])>2:
-                    print( f"(1 file of 2+) hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
-                    for d in self.ip_to_sp_dups_del[h]:
-                        print( f"Del: {d}" )
-                else:
-                    print( f"(1 file of 2) hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
-                    for d in self.ip_to_sp_dups_del[h]:
-                        print( f"Del: {d}" )
-
+                print( f"hash={h} keep 1 of {len(self.ip_to_sp_dups_del[h])+1}, keep: {self.ip_to_sp_dups_keep[h]} | ", end='' )
+                for d in self.ip_to_sp_dups_del[h]:
+                    print( f"Del: {d}", end='' )
+                print( "" )
            print( f"{len(self.ip_to_sp_dups_keep)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )

        if len(self.dups_to_process) > 0:
            print( "############ Duplicate Files that are needing to be futher processed ###########")
            for h in self.dups_to_process:
-                print( f"hash={h}, keep 1 of these: ", end='')
-                for d in self.dups_to_process[h]:
-                    print( f"{d.id}, ", end='' )
-                print ("")
-            print( f"{len(self.dups_to_process)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
+                print( f"hash={h} keep 1 of {len(self.dups_to_process[h])} from: {self.dups_to_process[h]}" )
+            print( f"which is a total of {len(self.dups_to_process)} set(s) of duplicate files to keep only 1 of" )

        if len(self.preferred_file) > 0:
            print( " We have preferred (regexp matched) ###########")
@@ -279,7 +268,7 @@ class Duplicates:
            for d in self.dups_to_process[h]:
                print( f"{d.id}, ", end='' )
            print ("")
-        print( f"{len(self.preferred_file)} duplicate files we will keep as they match the regexp" )
+        print( f"which is a total of {len(self.preferred_file)} duplicate files we will keep as they match the regexp" )

        if len(self.per_path_dups) > 0:
            print( "############ Duplicate Files in Paths that are needing to be futher processed ###########")
@@ -289,5 +278,5 @@ class Duplicates:
            if pair.did1 in self.preferred_path:
                print("Keep dir1")
            if pair.did2 in self.preferred_path:
                print("Keep dir2")
-        print( f"{len(self.per_path_dups)} duplicate files in per path dups" )
-        return
\ No newline at end of file
+        print( f"which is a total of {len(self.per_path_dups)} set(s) of path dups to process" )
+        return