From 046c512e6b7c7ba52c7917f7789f0847820faca0 Mon Sep 17 00:00:00 2001
From: Damien De Paoli
Date: Sun, 14 Mar 2021 14:33:22 +1100
Subject: [PATCH] added comments

---
 dups.py | 82 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 65 insertions(+), 17 deletions(-)

diff --git a/dups.py b/dups.py
index 6a4ed74..93d2fa9 100644
--- a/dups.py
+++ b/dups.py
@@ -24,6 +24,9 @@ from job import Job, JobExtra, Joblog, NewJob
 from settings import Settings
 from shared import SymlinkName
 
+# DupRow is a simple 'struct' class to keep the data for one duplicate file,
+# just to avoid intermixing python lists/dicts and to allow consistent use of
+# dot-notation for the fields
 class DupRow:
     def __init__(self, hash, file, dir, did, fid):
         ### DupRow Attributes -- note, simple class, no methods ###
@@ -33,12 +36,35 @@ class DupRow:
         self.did=did
         self.id=fid
 
+# The Duplicates class is used (one instance/object) to process all the
+# 'duplicate' data from the Database and parse it into more usable data
+# structures. This is also needed because the database content shows duplicates
+# more than once, e.g.
+# file1 and file2 are a duplicate, then later file2 and file1 are 'another' duplicate.
+# The class works over the data in 2 passes. The first pass, in AddDup(), finds
+# any files in both the import and storage paths and marks the storage ones to keep,
+# the import ones to delete. Anything else is either a set of files duplicated
+# inside the import path or a set of files duplicated in the storage path.
+# The first pass simply concatenates these into a data structure
+# (dups_to_process) that contains all the duplicates, keyed by the md5 hash.
+#
+# The second pass processes these duplicates to see if there are any in the
+# storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so, mark these to
+# keep and the rest to be deleted.
+#
+# After the 2 passes, we have data structures that allow the web to break up
+# the duplicates into batches to process:
+# 1) auto delete any in the import path that are also in the storage path
+#    - careful here: if we have 2 in the import path and 2+ in the storage path, leave it for manual intervention
+# 2) auto delete any in the storage path that are in a set where 1 of them matches the 'YYYY/YYYYMMDD' format, the rest are deleted
+# 3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
+# 4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose the file to keep
 class Duplicates:
     def __init__(self):
         ### Duplicates Attributes ###
         self.ip_to_sp_dups_keep={}
         self.ip_to_sp_dups_del={}
-        self.in_same_dups={}
+        self.dups_to_process={}
         self.per_file_dups=[]
         self.per_path_dups=[]
         self.preferred={}
@@ -46,42 +72,52 @@ class Duplicates:
         self.storage_paths=[]
         self.import_paths=[]
 
-        # per storage path, add entries to view
+        # pull apart the storage path Setting and build an array of each path for use in TrimmedPath()
         settings=Settings.query.first()
         paths = settings.storage_path.split("#")
         for path in paths:
             prefix = SymlinkName(path,path+'/')
             self.storage_paths.append(prefix)
             self.all_paths.append(prefix)
+        # pull apart the import path Setting and build an array of each path for use in TrimmedPath()
         paths = settings.import_path.split("#")
        for path in paths:
             prefix = SymlinkName(path,path+'/')
             self.import_paths.append(prefix)
             self.all_paths.append(prefix)
 
+    # Strip the front of the path (any match on a storage or import path),
+    # just to make it easier to read when we display it in the web page
     def TrimmedPath( self, path ):
         for p in self.all_paths:
             if re.match( f"^{p}", path ):
                 return path.replace(p, '' )
         return path
 
+    # is this file in the import path?
     def InImportPath( self, path ):
         for p in self.import_paths:
             if re.match( f"^{p}", path ):
                 return True
         return False
 
+    # is this file in the storage path?
     def InStoragePath( self, path ):
         for p in self.storage_paths:
             if re.match( f"^{p}", path ):
                 return True
         return False
 
+    # this stores the object in the keep list, ip_to_sp_dups_keep (only ever 1 per hash)
     def KeepInSameDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_keep:
             self.ip_to_sp_dups_keep[obj.h]= obj
+        else:
+            print( f"DDP: we need to cater for this - 2 files to keep in the storage path; if they are different, pull these out of here and put them in the dups_to_process list to manually process" )
         return
 
+    # this stores the object in the delete list, ip_to_sp_dups_del (if it is not
+    # already there)
     def DelInSameDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_del:
             self.ip_to_sp_dups_del[obj.h]=[]
@@ -93,6 +129,9 @@ class Duplicates:
         self.ip_to_sp_dups_del[obj.h].append( obj )
         return
 
+    # this function takes a duplicate that is in both the import path and the storage path,
+    # puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInSameDups(),
+    # and puts the import path file in the delete list (self.ip_to_sp_dups_del) via self.DelInSameDups()
     def DupInImportAndStoragePath( self, row, dr1, dr2 ):
         if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
             self.KeepInSameDups( dr1 )
@@ -104,35 +143,44 @@ class Duplicates:
             return True
         return False
 
+    # AddDup: takes a row from the database (effectively file1 & file2);
+    # we process these into appropriate data structures on this first pass
     def AddDup( self, row ):
         dr1=DupRow( row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1 )
         dr2=DupRow( row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2 )
+        # if in both import and storage path, just keep the storage path file,
+        # and delete the import path file
         if self.DupInImportAndStoragePath( row, dr1, dr2 ):
             return
 
-        if row.hash not in self.in_same_dups:
-            self.in_same_dups[row.hash]=[]
-            self.in_same_dups[row.hash].append( dr1 )
-            self.in_same_dups[row.hash].append( dr2 )
+        # if we are here, we have duplicates either in the storage path or in
+        # the import path
+
+        # if the hash is not in dups_to_process, create the entry / append
+        if row.hash not in self.dups_to_process:
+            self.dups_to_process[row.hash]=[]
+            self.dups_to_process[row.hash].append( dr1 )
+            self.dups_to_process[row.hash].append( dr2 )
         else:
-            # process path1 / fname1 -- if that combo is not in the dups[hash], add it
+            # process path1 / fname1 -- if that combo is not in the dups_to_process[hash], add it
             found=0
-            for dup in self.in_same_dups[row.hash]:
+            for dup in self.dups_to_process[row.hash]:
                 if dup.id == row.id1:
                     found=1
                     continue
             if not found:
-                self.in_same_dups[row.hash].append( dr1 )
+                self.dups_to_process[row.hash].append( dr1 )
 
-            # process path2 / fname2 -- if that combo is not in the dups[hash], add it
-            for dup in self.in_same_dups[row.hash]:
+            # process path2 / fname2 -- if that combo is not in the dups_to_process[hash], add it
+            for dup in self.dups_to_process[row.hash]:
                 if dup.id == row.id2:
                     found=1
                     continue
             if not found:
-                self.in_same_dups[row.hash].append( dr2 )
+                self.dups_to_process[row.hash].append( dr2 )
         return
 
+    # quick debug helper to see the data in the data structures
     def Dump(self):
         if len(self.ip_to_sp_dups_keep) > 0:
             print( "############ Files that are in both Import and Storage Paths ###########")
@@ -144,14 +192,14 @@ class Duplicates:
             for d in self.ip_to_sp_dups_del[h]:
                 print( f"Del: {d}" )
         print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
-        if len(self.in_same_dups) > 0:
-            print( "############ Duplicate Files that are in the same Path ###########")
+        if len(self.dups_to_process) > 0:
+            print( "############ Duplicate Files that need to be further processed ###########")
         cnt=0
-        for h in self.in_same_dups:
+        for h in self.dups_to_process:
             cnt +=1
-            if len(self.in_same_dups[h])>2:
+            if len(self.dups_to_process[h])>2:
                 print( f"hash={h}, keep 1 of these: ", end='')
-                for d in self.in_same_dups[h]:
+                for d in self.dups_to_process[h]:
                     print( f"{d.id}, ", end='' )
                 print ("")
         print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
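
Note: the comments added in this patch describe a first pass driven through AddDup() and a debug dump via Dump(). As a minimal sketch of how the Duplicates object might be driven, assuming a hypothetical dup_rows iterable of database rows carrying the hash, fname1/fname2, path1/path2, did1/did2 and id1/id2 fields that AddDup() reads (the duplicates query itself is not part of this patch):

    from dups import Duplicates

    dups = Duplicates()           # constructor reads storage/import paths from Settings
    for row in dup_rows:          # dup_rows: assumed result of the duplicates query
        dups.AddDup(row)          # first pass: sort each row into keep / delete / dups_to_process
    dups.Dump()                   # print the resulting data structures for inspection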