From c4f36713bfc5d4e7d4054969b3f42a06bc98731c Mon Sep 17 00:00:00 2001
From: Damien De Paoli
Date: Sat, 27 Mar 2021 14:24:16 +1100
Subject: [PATCH] improved duplicate counting, removed the concept of sets, and validated the counts against DB selects (they match)

---
 dups.py | 47 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/dups.py b/dups.py
index 0eed195..9f153a3 100644
--- a/dups.py
+++ b/dups.py
@@ -94,8 +94,9 @@ class Duplicates:
         self.all_paths=[]
         self.storage_paths=[]
         self.import_paths=[]
-        self.overall_dup_cnt=0
-        self.overall_dup_sets=0
+        self.hashes_processed={}
+        self.uniq_dups=0
+        self.total_dups=0
 
         # pull apart the storage path Setting, and make array of each for use in TrimmedPath()
         settings=Settings.query.first()
@@ -133,11 +134,13 @@ class Duplicates:
                 return True
         return False
 
-    # this stores this object into the keep from same path list (DDP: could there be more than 1)
+    # this stores this object into the keep from same path list (DDP: sometimes there can be more than one SP, e.g. SP to SP to IP)
+    # for now, rather than dealing with the extra SP, we just delete the IP and force another check_dups after deleting; that pass
+    # will then pick up and process the SP-to-SP pair if still needed -- if there is only SP1 to SP2, the per_path_dup handling catches it
+    # I believe this all works, but it doesn't hurt to run an extra check_dups anyway
     def KeepInIPSPDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_keep:
             self.ip_to_sp_dups_keep[obj.h]= obj
-            self.overall_dup_sets += 1
         return
 
     # this stores this object into the Delete from same path list (if it is not
@@ -151,8 +154,6 @@ class Duplicates:
                 if el.id == obj.id:
                     return
         self.ip_to_sp_dups_del[obj.h].append( obj )
-        # only get here is this is a new duplicate to delete, so increment count
-        self.overall_dup_cnt += 1
         return
 
     # this function takes a duplicate file (in the import path and the storage path)
@@ -172,6 +173,7 @@ class Duplicates:
     # AddDup: takes a row from the database effectively file1 & file2
     # we process these into appropriate data structures on this first pass
     def AddDup( self, row ):
+        self.hashes_processed[row.hash]=1
         dr1=DupRow( row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1 )
         dr2=DupRow( row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2 )
         # if in both import and storage path, just keep the storage path file,
@@ -206,39 +208,50 @@ class Duplicates:
     def AddDupPath(self, hash):
         # this gets complex, if this hash is also in a shared imp / sp - then dont deal with it now, let the imp files be deleted and
         # the repeat check_dups validation step catch it as a cleander (potential) for still more duplicates just in sp
-        if hash in self.ip_to_sp_dups_keep:
-            return
         dpr=DupPathRow( 2, self.dups_to_process[hash][0].d, self.dups_to_process[hash][1].d, self.dups_to_process[hash][0].did, self.dups_to_process[hash][1].did, hash )
+        if hash in self.ip_to_sp_dups_keep:
+            return False
         new=1
         for el in self.per_path_dups:
             if el.d1 == dpr.d1 and el.d2 == dpr.d2:
-                self.overall_dup_cnt += 2
                 el.count += 2
                 el.hashes = f"{el.hashes},{hash}"
                 new=0
         if new:
             self.per_path_dups.append( dpr )
-            self.overall_dup_sets += 1
-            self.overall_dup_cnt += 2
             if re.search( r'\d{4}/\d{8}', dpr.d1):
                 self.preferred_path[dpr.did1]=1
             if re.search( r'\d{4}/\d{8}', dpr.d2):
                 self.preferred_path[dpr.did2]=1
-        return
+        return True
 
     def SecondPass(self):
+        # sort out the counts (the ip_to_sp processing is already finished by this point)
+        self.uniq_dups = len(self.hashes_processed)
+        # total starts with one copy of everything we keep in the sp
+        self.total_dups = len(self.ip_to_sp_dups_keep)
+        # then add every file we delete in the ip that also exists in the sp
+        for hash in self.ip_to_sp_dups_del:
+            self.total_dups += len(self.ip_to_sp_dups_del[hash])
+
         for hash in self.dups_to_process:
             # more than 2 files (just ask per file) OR only 2 copies, and files are in same dir (so must be diff name, so just ask) OR content same, filename different (ask per file)
             if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
-                self.overall_dup_sets += 1
-                self.overall_dup_cnt += len(self.dups_to_process[hash])
                 self.per_file_dups.append(self.dups_to_process[hash])
                 for el in self.dups_to_process[hash]:
                     if re.search( r'\d{4}/\d{8}', el.d):
                         self.preferred_file[hash] = el.id
+                self.total_dups += len(self.dups_to_process[hash])
+                # if this combination ALSO has an import path dup, the storage path copy was already counted in the keep count above
+                if hash in self.ip_to_sp_dups_keep:
+                    self.total_dups -= 1
-            # by here we have only 2 files, with the same name, different path (ask per path)
+            # only 2 files, with the same name, different path (ask per path)
             else:
-                self.AddDupPath( hash )
+                # if this dup path is not already partially handled by an ip <-> sp dup, add it and count it
+                if self.AddDupPath( hash ):
+                    self.total_dups += 2
+                else:
+                    self.total_dups += 1
         return
 
     # quick debugger to see the data in the data structure
@@ -276,4 +289,4 @@ class Duplicates:
             if pair.did2 in self.preferred_path:
                 print("Keep dir2")
         print( f"which is a total of {len(self.per_path_dups)} set(s) of path dups to process" )
-        return
+        return
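
The commit message says the new uniq_dups / total_dups counters were validated against DB selects, but those queries are not part of the patch. Below is a minimal sketch of what such a cross-check could look like, assuming one plausible reading of the counters: uniq_dups is the number of distinct hashes that occur more than once, and total_dups is the number of files belonging to any such hash. The files table and its columns here are hypothetical, purely for illustration; the project's real schema and SQLAlchemy models are not shown in this patch.

# Hypothetical cross-check of uniq_dups / total_dups with plain SQL.
# Table name and columns are assumptions for illustration only.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE files (id INTEGER PRIMARY KEY, hash TEXT, fname TEXT, path TEXT);
    INSERT INTO files (hash, fname, path) VALUES
        ('aaa', 'img1.jpg', '/import/2021'),
        ('aaa', 'img1.jpg', '/storage/2021/20210327'),
        ('bbb', 'img2.jpg', '/storage/2020/20200101'),
        ('bbb', 'img2.jpg', '/storage/2020/20200102'),
        ('ccc', 'img3.jpg', '/storage/2019');
""")

# uniq_dups: number of distinct hashes that appear more than once
uniq_dups = conn.execute("""
    SELECT COUNT(*) FROM (
        SELECT hash FROM files GROUP BY hash HAVING COUNT(*) > 1
    )
""").fetchone()[0]

# total_dups: number of files whose hash belongs to a duplicated group
total_dups = conn.execute("""
    SELECT COUNT(*) FROM files
    WHERE hash IN (SELECT hash FROM files GROUP BY hash HAVING COUNT(*) > 1)
""").fetchone()[0]

print(uniq_dups, total_dups)   # prints 2 4 for the toy rows above

With the toy data this yields 2 duplicate hash groups and 4 files involved, which is the kind of pair of numbers the patched SecondPass() bookkeeping is meant to reproduce.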