diff --git a/dups.py b/dups.py
index ec5a881..0eed195 100644
--- a/dups.py
+++ b/dups.py
@@ -134,16 +134,15 @@ class Duplicates:
         return False
 
     # this stores this object into the keep from same path list (DDP: could there be more than 1)
-    def KeepInSameDups( self, obj ):
+    def KeepInIPSPDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_keep:
             self.ip_to_sp_dups_keep[obj.h]= obj
-            self.overall_dup_cnt += 1
             self.overall_dup_sets += 1
         return
 
     # this stores this object into the Delete from same path list (if it is not
     # already there)
-    def DelInSameDups( self, obj ):
+    def DelInIPSPDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_del:
             self.ip_to_sp_dups_del[obj.h]=[]
         self.ip_to_sp_dups_del[obj.h].append( obj )
@@ -157,16 +156,16 @@ class Duplicates:
         return
 
     # this function takes a duplicate file (in the import path and the storage path)
-    # and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInSameDups()
-    # and then puts the import path file in the delete list (self.ip_to_sp_dups_keep) via self.DelInSameDups()
+    # and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups()
+    # and then puts the import path file in the delete list (self.ip_to_sp_dups_del) via self.DelInIPSPDups()
     def DupInImportAndStoragePath( self, row, dr1, dr2 ):
         if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
-            self.KeepInSameDups( dr1 )
-            self.DelInSameDups( dr2 )
+            self.KeepInIPSPDups( dr1 )
+            self.DelInIPSPDups( dr2 )
             return True
         if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
-            self.KeepInSameDups( dr2 )
-            self.DelInSameDups( dr1 )
+            self.KeepInIPSPDups( dr2 )
+            self.DelInIPSPDups( dr1 )
             return True
         return False
 
@@ -180,9 +179,6 @@ class Duplicates:
         if self.DupInImportAndStoragePath( row, dr1, dr2 ):
             return
 
-        # if we are here, we have duplicates either in the storage path or in
-        # the import path
-
         # if the hash is not in dups_to_process, create / append
         if row.hash not in self.dups_to_process:
             self.dups_to_process[row.hash]=[]
@@ -208,7 +204,7 @@ class Duplicates:
         return
 
     def AddDupPath(self, hash):
-        # this gets complex, if this hash is also in a sahred imp / sp - then dont deal with it now, let the imp files be deleted and
+        # this gets complex: if this hash is also in a shared imp / sp, don't deal with it now; let the imp files be deleted and
         # the repeat check_dups validation step catch it as a (potential) candidate for still more duplicates just in sp
         if hash in self.ip_to_sp_dups_keep:
            return
@@ -228,14 +224,15 @@ class Duplicates:
                 self.preferred_path[dpr.did1]=1
             if re.search( r'\d{4}/\d{8}', dpr.d2):
                 self.preferred_path[dpr.did2]=1
+        return
 
     def SecondPass(self):
         for hash in self.dups_to_process:
            # more than 2 files (ask per file) OR 2 copies with different filenames (ask per file) OR 2 copies in the same dir (names must differ, so just ask)
            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
-                self.per_file_dups.append(self.dups_to_process[hash])
-                self.overall_dup_cnt += len(self.dups_to_process[hash])
                 self.overall_dup_sets += 1
+                self.overall_dup_cnt += len(self.dups_to_process[hash])
+                self.per_file_dups.append(self.dups_to_process[hash])
                 for el in self.dups_to_process[hash]:
                     if re.search( r'\d{4}/\d{8}', el.d):
                         self.preferred_file[hash] = el.id
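
For context, a minimal, self-contained sketch of the flow these hunks touch. All names here (FileRec, STORAGE_ROOT, IMPORT_ROOT, classify, needs_per_file_review) are hypothetical stand-ins, not the real dups.py API; the real code works on query rows and instance state rather than free functions.

    # Hypothetical stand-ins -- not the real dups.py API.
    from collections import defaultdict
    from dataclasses import dataclass

    STORAGE_ROOT = "/photos/storage"   # assumed stand-in for InStoragePath()
    IMPORT_ROOT  = "/photos/import"    # assumed stand-in for InImportPath()

    @dataclass
    class FileRec:          # stand-in for the dr1/dr2 duplicate records
        h: str              # content hash
        d: str              # directory
        f: str              # filename

    ip_to_sp_dups_keep = {}                 # hash -> storage-path copy to keep
    ip_to_sp_dups_del  = defaultdict(list)  # hash -> import-path copies to delete
    dups_to_process    = defaultdict(list)  # hash -> copies deferred to SecondPass

    def classify(rec1, rec2):
        # Mirrors DupInImportAndStoragePath: when one copy lives under the
        # storage path and the other under the import path, keep the former
        # and mark the latter for deletion; anything else is deferred.
        for keep, drop in ((rec1, rec2), (rec2, rec1)):
            if keep.d.startswith(STORAGE_ROOT) and drop.d.startswith(IMPORT_ROOT):
                ip_to_sp_dups_keep.setdefault(keep.h, keep)
                ip_to_sp_dups_del[keep.h].append(drop)
                return
        dups_to_process[rec1.h].extend((rec1, rec2))

    def needs_per_file_review(copies):
        # Mirrors the SecondPass test: more than 2 copies, differing
        # filenames, or two copies in the same directory all require
        # asking the user about each file.
        return (len(copies) > 2
                or copies[0].f != copies[1].f
                or copies[0].d == copies[1].d)

    # Example: the import-path copy is queued for deletion, not review.
    classify(FileRec("abc1", "/photos/storage/2021/20210704", "img.jpg"),
             FileRec("abc1", "/photos/import/card1", "img.jpg"))
    assert "abc1" in ip_to_sp_dups_keep and len(ip_to_sp_dups_del["abc1"]) == 1

Note how the reordering in SecondPass (and the dropped increment in KeepInIPSPDups) makes overall_dup_sets count one per duplicate set while overall_dup_cnt counts the files within a set, so the two counters no longer drift apart.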