updated comments

commit eb5fc0aa84 · parent 61c85acf3c · 2021-08-11 17:01:12 +10:00

dups.py (23 changed lines)

@@ -78,7 +78,7 @@ class DupPathRow:
 # After the 2 passes, we have data structures that allow the web to break up
 # the duplicates into batches to process:
 # 1) auto delete any in the import path that are also in the storage path
-# - carefule here, if we have 2 in the import path and 2+ in the storage path, leave it for manual intervention
+# - careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
 # 2) auto delete any in the storage path that are in a set where 1 of them match the 'YYYY/YYYYMMDD' format, the rest are deleted
 # 3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
 # 4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
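As an aside, the four batches described in this comment block can be pictured as a small classifier. This is a minimal sketch only; path_type, dir and name are made-up field names, not the actual DupPathRow layout:

    import re

    # Minimal sketch of the batching rules above; field names are illustrative.
    def classify(copies):
        in_import = [c for c in copies if c['path_type'] == 'import']
        in_storage = [c for c in copies if c['path_type'] == 'storage']
        # 1) copies in the import path that also exist once in the storage path: auto delete the import ones
        if in_import and len(in_storage) == 1:
            return 'auto_delete_import'
        # ...but with any import copies and 2+ storage copies, leave it for manual intervention
        if in_import and len(in_storage) >= 2:
            return 'manual'
        # 2) exactly one storage copy sits in a YYYY/YYYYMMDD dir: keep it, delete the rest
        dated = [c for c in in_storage if re.search(r'\d{4}/\d{8}', c['dir'])]
        if len(dated) == 1:
            return 'auto_keep_dated'
        # 3) two copies, same file name, different dirs: ask the user per directory pair
        if len(copies) == 2 and copies[0]['name'] == copies[1]['name'] and copies[0]['dir'] != copies[1]['dir']:
            return 'ask_per_path'
        # 4) everything else (3+ copies, differing names, or same dir): ask per file
        return 'ask_per_file'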
@@ -158,7 +158,7 @@ class Duplicates:
         if self.DupInImportAndStoragePath( row, dr1, dr2 ):
             return
-        # if the hast is no dups_to_process, created / append
+        # if the hash is not in dups_to_process, created / append
         if row.hash not in self.dups_to_process:
             self.dups_to_process[row.hash]=[]
         self.dups_to_process[row.hash].append( dr1 )
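The membership check above is the usual dict-of-lists accumulation. The same idea, sketched with collections.defaultdict purely for illustration (the class itself keeps the explicit `not in` test and appends dr2 in a later step with extra conditions):

    from collections import defaultdict

    # Illustrative accumulation only; values here are plain strings, not real DupRow objects.
    dups_to_process = defaultdict(list)

    def add_pair(hash, dr1, dr2):
        # both halves of a duplicate row end up grouped under the shared hash
        dups_to_process[hash].append(dr1)
        dups_to_process[hash].append(dr2)

    add_pair("abc123", "dir1/a.jpg", "dir2/a.jpg")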
@@ -182,6 +182,9 @@ class Duplicates:
         self.dups_to_process[row.hash].append( dr2 )
         return
+    # AddDupPath: takes a row from the database effectively with a dup pair in dir1 & dir2
+    # we process these into appropriate data structures on this second pass
+    # working through if a dir is in the storage path and is
     def AddDupPath(self, hash):
         # this gets complex, if this hash is also in a shared imp / sp - then dont deal with it now, let the imp files be deleted and
         # the repeat check_dups validation step catch it as a cleander (potential) for still more duplicates just in sp
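The "dont deal with it now" note presumably corresponds to an early bail-out just before the lines shown in the next hunk; a hypothetical sketch of that deferral (the real membership test is not visible in this diff):

    # Hypothetical: skip any hash that is already being handled as an
    # import-path <-> storage-path duplicate, and let the repeat check_dups
    # validation pass catch leftover storage-path copies.
    def should_defer(hash, ip_to_sp_dups_keep, ip_to_sp_dups_del):
        return hash in ip_to_sp_dups_keep or hash in ip_to_sp_dups_del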
@@ -190,10 +193,13 @@ class Duplicates:
             return False
         new=1
         for el in self.per_path_dups:
+            # if this new hash / dup in dpr has same dirs as existing per_path_dups row, then just another file in same dup dir...
             if el.d1 == dpr.d1 and el.d2 == dpr.d2:
                 el.count += 2
                 el.hashes = f"{el.hashes},{hash}"
                 new=0
+        # okay, we have a new pair of duplicate dirs... Add them, and if either has a matching regex it's preferred
+        # FIXME: what if both do? what if one is in SP and the other not, etc...
         if new:
             self.per_path_dups.append( dpr )
             if re.search( r'\d{4}/\d{8}', dpr.d1):
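The loop above folds a repeated dir pair into an existing per_path_dups row rather than adding a new one. A standalone sketch of that merge, with SimpleNamespace standing in for the real row object and illustrative values only:

    from types import SimpleNamespace

    # Illustrative merge of a duplicate dir pair into a running per-path list.
    def merge_dir_pair(per_path_dups, dpr, hash):
        for el in per_path_dups:
            if el.d1 == dpr.d1 and el.d2 == dpr.d2:
                el.count += 2                      # two more files, one on each side of the pair
                el.hashes = f"{el.hashes},{hash}"  # remember which hashes this pair covers
                return False                       # existing pair, nothing new to add
        per_path_dups.append(dpr)                  # first time we have seen this dir pair
        return True

    pairs = []
    row = SimpleNamespace(d1="2019/20190101", d2="backup/old", count=2, hashes="abc123")
    merge_dir_pair(pairs, row, "abc123")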
@@ -202,6 +208,8 @@ class Duplicates:
                 self.preferred_path[dpr.did2]=1
         return True
+    # The second pass processes row by row of dups_to_process, looking for per_file_dups and per_path_dups
+    # AND works out counts to display overall tallies of types of keeping and deletion of files, and choices to make
     def SecondPass(self):
         # sort out counts (for ip_to_sp - that is all finished)
         self.uniq_dups = len(self.hashes_processed)
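The re.search calls in AddDupPath are what mark a directory as "preferred" (the YYYY/YYYYMMDD layout from the batching notes at the top of the file); a quick illustration with made-up paths:

    import re

    # A directory whose path contains a YYYY/YYYYMMDD segment is treated as the
    # preferred side of a duplicate pair.
    def is_preferred(path):
        return re.search(r'\d{4}/\d{8}', path) is not None

    print(is_preferred("photos/2021/20210811"))  # True
    print(is_preferred("photos/misc"))           # False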
@@ -211,9 +219,12 @@ class Duplicates:
         for hash in self.ip_to_sp_dups_del:
             self.total_dups += len(self.ip_to_sp_dups_del[hash])
+        # okay, go for each duplicate that should be processed (they are stored
+        # by hash, and have at least 2 entries, but can have more, and be in
+        # the IP or SP and any combo, cater for all below
         for hash in self.dups_to_process:
-            # more than 2 files (just ask per file) OR only 2 copies, and files are in same dir (so must be diff name, so just ask) OR content same, filename different (ask per file)
+            # more than 2 files (just ask per file) OR (implied) only 2 copies, and files are in same dir (so must be diff name, so just ask) OR (implied) on 2 copies in same dir & filename different (ask per file)
-            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
+            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f):
                 self.per_file_dups.append(self.dups_to_process[hash])
                 for el in self.dups_to_process[hash]:
                     if re.search( r'\d{4}/\d{8}', el.d):
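The reordered condition above decides per-file versus per-path handling; restated as a standalone predicate using the same .d (dir) and .f (filename) fields, for readability only:

    # A duplicate set is handled per file when there are more than two copies,
    # when the two copies share a directory, or when their file names differ;
    # otherwise it falls through to per-path (dir pair) handling.
    def needs_per_file_decision(entries):
        if len(entries) > 2:
            return True
        first, second = entries[0], entries[1]
        return first.d == second.d or first.f != second.f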
@@ -222,7 +233,7 @@ class Duplicates:
                 # if this combination ALSO has an import path dup, then we have already counted the storage path dup in the earlier keeping count
                 if hash in self.ip_to_sp_dups_keep:
                     self.total_dups -= 1
-            # only 2 files, with the same name, different path (ask per path)
+            # only 2 files AND different path (ask per path) AND with the same name...
             else:
                 # if this dup path is not already being partially handled by an ip <-> sp dup, then add it / count it
                 if self.AddDupPath( hash ):
@@ -235,7 +246,7 @@ class Duplicates:
                     self.total_dups += 1
         return
-    # quick debugger to see the data in the data structure
+    # quick debugger to see the data in the data structure (not used by default)
     def Dump(self):
         if len(self.ip_to_sp_dups_keep) > 0:
             print( "############ Files that are in both Import and Storage Paths ###########")