updated comments
dups.py
@@ -78,7 +78,7 @@ class DupPathRow:
 # After the 2 passes, we have data structures that allow the web to break up
 # the duplicates into batches to process:
 # 1) auto delete any in the import path that are also in the storage path
-# - carefule here, if we have 2 in the import path and 2+ in the storage path, leave it for manual intervention
+# - careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
 # 2) auto delete any in the storage path that are in a set where 1 of them match the 'YYYY/YYYYMMDD' format, the rest are deleted
 # 3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
 # 4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
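The comment block in this hunk lays out the four batching rules. Below is a minimal standalone sketch of how such a classifier could look; the DupFile record, the classify() helper, the bucket labels, and the rule ordering are all assumptions for illustration, not the module's real types or behaviour.

import re
from collections import namedtuple

# hypothetical record: content hash, directory, filename, and an import-path flag
DupFile = namedtuple("DupFile", "hash d f in_import")

def classify(group):
    """Return which of the four batches a list of same-hash files falls into."""
    imports = [g for g in group if g.in_import]
    storage = [g for g in group if not g.in_import]
    # 1) import copies that also exist in storage are auto deleted,
    #    unless storage still holds 2+ copies (manual intervention)
    if imports and storage:
        return "auto_delete_import" if len(storage) == 1 else "manual"
    # 2) storage-only set where exactly one dir matches YYYY/YYYYMMDD: keep that one
    dated = [g for g in group if re.search(r"\d{4}/\d{8}", g.d)]
    if len(dated) == 1:
        return "auto_keep_dated"
    # 3) exactly 2 copies, same filename, different dir: ask the user per directory
    if len(group) == 2 and group[0].f == group[1].f and group[0].d != group[1].d:
        return "ask_per_dir"
    # 4) everything else (3+ copies, different names, or same dir): ask per file
    return "ask_per_file"

print(classify([DupFile("h1", "2020/20200101", "a.jpg", False),
                DupFile("h1", "unsorted", "a.jpg", False)]))      # auto_keep_dated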
@@ -158,7 +158,7 @@ class Duplicates:
         if self.DupInImportAndStoragePath( row, dr1, dr2 ):
             return
 
-        # if the hast is no dups_to_process, created / append
+        # if the hash is not in dups_to_process, created / append
         if row.hash not in self.dups_to_process:
             self.dups_to_process[row.hash]=[]
         self.dups_to_process[row.hash].append( dr1 )
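The lines above build dups_to_process as a dict keyed by content hash, each value collecting every duplicate row seen with that hash. The sketch below shows the same dict-of-lists pattern using collections.defaultdict as an alternative to the explicit membership check; the row data and field layout are invented for illustration.

from collections import defaultdict

dups_to_process = defaultdict(list)   # hash -> list of (dir, filename) rows

rows = [("abc123", "/photos/2020/20200101", "img1.jpg"),
        ("abc123", "/import/new", "img1.jpg")]

for hash_, d, f in rows:
    # defaultdict creates the empty list on first use, so no "not in" check is needed
    dups_to_process[hash_].append((d, f))

for hash_, entries in dups_to_process.items():
    print(hash_, "has", len(entries), "copies")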
@@ -182,6 +182,9 @@ class Duplicates:
         self.dups_to_process[row.hash].append( dr2 )
         return
 
+    # AddDupPath: takes a row from the database effectively with a dup pair in dir1 & dir2
+    # we process these into appropriate data structures on this second pass
+    # working through if a dir is in the storage path and is
     def AddDupPath(self, hash):
         # this gets complex, if this hash is also in a shared imp / sp - then dont deal with it now, let the imp files be deleted and
         # the repeat check_dups validation step catch it as a cleander (potential) for still more duplicates just in sp
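The new AddDupPath comments describe a deferral rule: a hash that is also part of an import/storage duplicate is skipped for now, on the assumption that a later re-run of the duplicate check will pick up whatever copies remain. A rough sketch of that guard, with a made-up set name and a simplified return contract, might look like this:

# hashes already queued for import-path deletion (hypothetical name)
ip_to_sp_hashes = {"abc123"}

def add_dup_path(hash_, per_path_dups):
    if hash_ in ip_to_sp_hashes:
        return False                   # defer: the next check_dups run will see what remains
    per_path_dups.append(hash_)        # simplified stand-in for building a DupPathRow
    return True

per_path_dups = []
print(add_dup_path("abc123", per_path_dups))   # False - deferred
print(add_dup_path("def456", per_path_dups))   # True  - tracked now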
@@ -190,10 +193,13 @@ class Duplicates:
             return False
         new=1
         for el in self.per_path_dups:
+            # if this new hash / dup in dpr has same dirs as existing per_path_dups row, then just another file in same dup dir...
             if el.d1 == dpr.d1 and el.d2 == dpr.d2:
                 el.count += 2
                 el.hashes = f"{el.hashes},{hash}"
                 new=0
+        # okay, we have a new pair of duplicate dirs... Add them, and if either has matching regex it's preferred
+        # FIXME: what if both do? what if one is in SP and the other not, etc...
         if new:
             self.per_path_dups.append( dpr )
             if re.search( r'\d{4}/\d{8}', dpr.d1):
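This hunk documents the merge-or-append handling of directory pairs and the 'YYYY/YYYYMMDD' preference. The following illustrative sketch mirrors that logic with a made-up PathPair class standing in for DupPathRow and directory strings standing in for the did1/did2 ids; it is not the module's implementation.

import re

class PathPair:                        # made-up stand-in for DupPathRow
    def __init__(self, d1, d2, hash_):
        self.d1, self.d2 = d1, d2
        self.count = 2                 # a new pair means two files
        self.hashes = hash_

per_path_dups = []
preferred_path = {}

def add_pair(d1, d2, hash_):
    for el in per_path_dups:
        if el.d1 == d1 and el.d2 == d2:            # same dir pair: just another file in it
            el.count += 2
            el.hashes = f"{el.hashes},{hash_}"
            return
    per_path_dups.append(PathPair(d1, d2, hash_))  # brand new dir pair
    if re.search(r"\d{4}/\d{8}", d1):              # dated layout is the preferred keeper
        preferred_path[d1] = 1
    elif re.search(r"\d{4}/\d{8}", d2):
        preferred_path[d2] = 1

add_pair("2021/20210315", "incoming", "h1")
add_pair("2021/20210315", "incoming", "h2")
print(per_path_dups[0].count, per_path_dups[0].hashes, preferred_path)   # 4 h1,h2 {'2021/20210315': 1}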
@@ -202,6 +208,8 @@ class Duplicates:
                 self.preferred_path[dpr.did2]=1
         return True
 
+    # The second pass processes row by row of dups_to_process, looking for per_file_dups and per_path_dups
+    # AND works out counts to display overall tallies of types of keeping and deletion of files, and choices to make
     def SecondPass(self):
         # sort out counts (for ip_to_sp - that is all finished)
         self.uniq_dups = len(self.hashes_processed)
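SecondPass starts from the tallies noted above: the number of distinct duplicated hashes plus every file already queued for deletion in the import-to-storage bucket. A tiny sketch of that arithmetic, with invented data:

hashes_processed = {"h1", "h2", "h3"}
ip_to_sp_dups_del = {"h1": ["/import/a.jpg"],
                     "h2": ["/import/b.jpg", "/import/b_copy.jpg"]}

uniq_dups = len(hashes_processed)                                      # 3 distinct duplicated hashes
total_dups = sum(len(files) for files in ip_to_sp_dups_del.values())   # 3 files queued for deletion
print(uniq_dups, total_dups)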
@@ -211,9 +219,12 @@ class Duplicates:
         for hash in self.ip_to_sp_dups_del:
             self.total_dups += len(self.ip_to_sp_dups_del[hash])
 
+        # okay, go for each duplicate that should be processed (they are stored
+        # by hash, and have at least 2 entries, but can have more, and be in
+        # the IP or SP and any combo, cater for all below
         for hash in self.dups_to_process:
-            # more than 2 files (just ask per file) OR only 2 copies, and files are in same dir (so must be diff name, so just ask) OR content same, filename different (ask per file)
-            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
+            # more than 2 files (just ask per file) OR (implied) only 2 copies, and files are in same dir (so must be diff name, so just ask) OR (implied) only 2 copies in different dirs & filename different (ask per file)
+            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f):
                 self.per_file_dups.append(self.dups_to_process[hash])
                 for el in self.dups_to_process[hash]:
                     if re.search( r'\d{4}/\d{8}', el.d):
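The reordered condition in this hunk is the three-way OR that routes a hash to per-file handling. The sketch below restates it over a hypothetical Entry tuple so the fall-through case (exactly two copies, one filename, two directories) is easy to see; it is illustrative only.

from collections import namedtuple

Entry = namedtuple("Entry", "d f")   # directory, filename (illustrative)

def ask_per_file(copies):
    return (len(copies) > 2
            or copies[0].d == copies[1].d     # same dir, so names must differ
            or copies[0].f != copies[1].f)    # names differ across dirs

print(ask_per_file([Entry("a", "x.jpg"), Entry("b", "x.jpg")]))   # False: per-path case
print(ask_per_file([Entry("a", "x.jpg"), Entry("a", "y.jpg")]))   # True: same dir
print(ask_per_file([Entry("a", "x.jpg"), Entry("b", "y.jpg")]))   # True: names differ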
@@ -222,7 +233,7 @@ class Duplicates:
                         # if this combination ALSO has an import path dup, then we have already counted the storage path dup in the earlier keeping count
                         if hash in self.ip_to_sp_dups_keep:
                             self.total_dups -= 1
-            # only 2 files, with the same name, different path (ask per path)
+            # only 2 files AND different path (ask per path) AND with the same name...
             else:
                 # if this dup path is not already being partially handled by an ip <-> sp dup, then add it / count it
                 if self.AddDupPath( hash ):
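The adjustment above avoids counting a storage-path keeper twice when the same hash already appeared in the import/storage bucket. A tiny sketch of that correction, with made-up numbers:

ip_to_sp_dups_keep = {"h1": "/storage/2020/20200101/a.jpg"}
total_dups = 5

hash_ = "h1"
if hash_ in ip_to_sp_dups_keep:
    total_dups -= 1                    # keeper already counted in the earlier pass
print(total_dups)                      # 4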
@@ -235,7 +246,7 @@ class Duplicates:
                     self.total_dups += 1
         return
 
-    # quick debugger to see the data in the data structure
+    # quick debugger to see the data in the data structure (not used by default)
     def Dump(self):
         if len(self.ip_to_sp_dups_keep) > 0:
             print( "############ Files that are in both Import and Storage Paths ###########")