updated comments
dups.py
@@ -78,7 +78,7 @@ class DupPathRow:
 # After the 2 passes, we have data structures that allow the web to break up
 # the duplicates into batches to process:
 # 1) auto delete any in the import path that are also in the storage path
-# - carefule here, if we have 2 in the import path and 2+ in the storage path, leave it for manual intervention
+# - careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
 # 2) auto delete any in the storage path that are in a set where 1 of them match the 'YYYY/YYYYMMDD' format, the rest are deleted
 # 3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
 # 4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
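The comment block in this hunk lays out the four batching rules. Below is a minimal standalone sketch of how such a classifier could look; the DupFile record, the classify() helper, the bucket labels, and the rule ordering are all assumptions for illustration, not the module's real types or behaviour.

import re
from collections import namedtuple

# hypothetical record: content hash, directory, filename, and an import-path flag
DupFile = namedtuple("DupFile", "hash d f in_import")

def classify(group):
    """Return which of the four batches a list of same-hash files falls into."""
    imports = [g for g in group if g.in_import]
    storage = [g for g in group if not g.in_import]
    # 1) import copies that also exist in storage are auto deleted,
    #    unless storage still holds 2+ copies (manual intervention)
    if imports and storage:
        return "auto_delete_import" if len(storage) == 1 else "manual"
    # 2) storage-only set where exactly one dir matches YYYY/YYYYMMDD: keep that one
    dated = [g for g in group if re.search(r"\d{4}/\d{8}", g.d)]
    if len(dated) == 1:
        return "auto_keep_dated"
    # 3) exactly 2 copies, same filename, different dir: ask the user per directory
    if len(group) == 2 and group[0].f == group[1].f and group[0].d != group[1].d:
        return "ask_per_dir"
    # 4) everything else (3+ copies, different names, or same dir): ask per file
    return "ask_per_file"

print(classify([DupFile("h1", "2020/20200101", "a.jpg", False),
                DupFile("h1", "unsorted", "a.jpg", False)]))      # auto_keep_dated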
@@ -158,7 +158,7 @@ class Duplicates:
         if self.DupInImportAndStoragePath( row, dr1, dr2 ):
             return
 
-        # if the hast is no dups_to_process, created / append
+        # if the hash is not in dups_to_process, created / append
         if row.hash not in self.dups_to_process:
             self.dups_to_process[row.hash]=[]
         self.dups_to_process[row.hash].append( dr1 )
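The lines above build dups_to_process as a dict keyed by content hash, each value collecting every duplicate row seen with that hash. The sketch below shows the same dict-of-lists pattern using collections.defaultdict as an alternative to the explicit membership check; the row data and field layout are invented for illustration.

from collections import defaultdict

dups_to_process = defaultdict(list)   # hash -> list of (dir, filename) rows

rows = [("abc123", "/photos/2020/20200101", "img1.jpg"),
        ("abc123", "/import/new", "img1.jpg")]

for hash_, d, f in rows:
    # defaultdict creates the empty list on first use, so no "not in" check is needed
    dups_to_process[hash_].append((d, f))

for hash_, entries in dups_to_process.items():
    print(hash_, "has", len(entries), "copies")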
@@ -182,6 +182,9 @@ class Duplicates:
         self.dups_to_process[row.hash].append( dr2 )
         return
 
+    # AddDupPath: takes a row from the database effectively with a dup pair in dir1 & dir2
+    # we process these into appropriate data structures on this second pass
+    # working through if a dir is in the storage path and is
     def AddDupPath(self, hash):
         # this gets complex, if this hash is also in a shared imp / sp - then dont deal with it now, let the imp files be deleted and
         # the repeat check_dups validation step catch it as a cleander (potential) for still more duplicates just in sp
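The new AddDupPath comments describe a deferral rule: a hash that is also part of an import/storage duplicate is skipped for now, on the assumption that a later re-run of the duplicate check will pick up whatever copies remain. A rough sketch of that guard, with a made-up set name and a simplified return contract, might look like this:

# hashes already queued for import-path deletion (hypothetical name)
ip_to_sp_hashes = {"abc123"}

def add_dup_path(hash_, per_path_dups):
    if hash_ in ip_to_sp_hashes:
        return False                   # defer: the next check_dups run will see what remains
    per_path_dups.append(hash_)        # simplified stand-in for building a DupPathRow
    return True

per_path_dups = []
print(add_dup_path("abc123", per_path_dups))   # False - deferred
print(add_dup_path("def456", per_path_dups))   # True  - tracked now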
@@ -190,10 +193,13 @@ class Duplicates:
             return False
         new=1
         for el in self.per_path_dups:
+            # if this new hash / dup in dpr has same dirs as existing per_path_dups row, then just another file in same dup dir...
             if el.d1 == dpr.d1 and el.d2 == dpr.d2:
                 el.count += 2
                 el.hashes = f"{el.hashes},{hash}"
                 new=0
+        # okay, we have a new pair of duplicate dirs... Add them, and if either has matching regex it's preferred
+        # FIXME: what if both do? what if one is in SP and the other not, etc...
         if new:
             self.per_path_dups.append( dpr )
             if re.search( r'\d{4}/\d{8}', dpr.d1):
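This hunk documents the merge-or-append handling of directory pairs and the 'YYYY/YYYYMMDD' preference. The following illustrative sketch mirrors that logic with a made-up PathPair class standing in for DupPathRow and directory strings standing in for the did1/did2 ids; it is not the module's implementation.

import re

class PathPair:                        # made-up stand-in for DupPathRow
    def __init__(self, d1, d2, hash_):
        self.d1, self.d2 = d1, d2
        self.count = 2                 # a new pair means two files
        self.hashes = hash_

per_path_dups = []
preferred_path = {}

def add_pair(d1, d2, hash_):
    for el in per_path_dups:
        if el.d1 == d1 and el.d2 == d2:            # same dir pair: just another file in it
            el.count += 2
            el.hashes = f"{el.hashes},{hash_}"
            return
    per_path_dups.append(PathPair(d1, d2, hash_))  # brand new dir pair
    if re.search(r"\d{4}/\d{8}", d1):              # dated layout is the preferred keeper
        preferred_path[d1] = 1
    elif re.search(r"\d{4}/\d{8}", d2):
        preferred_path[d2] = 1

add_pair("2021/20210315", "incoming", "h1")
add_pair("2021/20210315", "incoming", "h2")
print(per_path_dups[0].count, per_path_dups[0].hashes, preferred_path)   # 4 h1,h2 {'2021/20210315': 1}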
@@ -202,6 +208,8 @@ class Duplicates:
                 self.preferred_path[dpr.did2]=1
         return True
 
+    # The second pass processes row by row of dups_to_process, looking for per_file_dups and per_path_dups
+    # AND works out counts to display overall tallies of types of keeping and deletion of files, and choices to make
     def SecondPass(self):
         # sort out counts (for ip_to_sp - that is all finished)
         self.uniq_dups = len(self.hashes_processed)
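SecondPass starts from the tallies noted above: the number of distinct duplicated hashes plus every file already queued for deletion in the import-to-storage bucket. A tiny sketch of that arithmetic, with invented data:

hashes_processed = {"h1", "h2", "h3"}
ip_to_sp_dups_del = {"h1": ["/import/a.jpg"],
                     "h2": ["/import/b.jpg", "/import/b_copy.jpg"]}

uniq_dups = len(hashes_processed)                                      # 3 distinct duplicated hashes
total_dups = sum(len(files) for files in ip_to_sp_dups_del.values())   # 3 files queued for deletion
print(uniq_dups, total_dups)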
@@ -211,9 +219,12 @@ class Duplicates:
         for hash in self.ip_to_sp_dups_del:
             self.total_dups += len(self.ip_to_sp_dups_del[hash])
 
+        # okay, go for each duplicate that should be processed (they are stored
+        # by hash, and have at least 2 entries, but can have more, and be in
+        # the IP or SP and any combo, cater for all below
         for hash in self.dups_to_process:
-            # more than 2 files (just ask per file) OR only 2 copies, and files are in same dir (so must be diff name, so just ask) OR content same, filename different (ask per file)
-            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
+            # more than 2 files (just ask per file) OR (implied) only 2 copies, and files are in same dir (so must be diff name, so just ask) OR (implied) only 2 copies in different dirs & filename different (ask per file)
+            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f):
                 self.per_file_dups.append(self.dups_to_process[hash])
                 for el in self.dups_to_process[hash]:
                     if re.search( r'\d{4}/\d{8}', el.d):
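The reordered condition in this hunk is the three-way OR that routes a hash to per-file handling. The sketch below restates it over a hypothetical Entry tuple so the fall-through case (exactly two copies, one filename, two directories) is easy to see; it is illustrative only.

from collections import namedtuple

Entry = namedtuple("Entry", "d f")   # directory, filename (illustrative)

def ask_per_file(copies):
    return (len(copies) > 2
            or copies[0].d == copies[1].d     # same dir, so names must differ
            or copies[0].f != copies[1].f)    # names differ across dirs

print(ask_per_file([Entry("a", "x.jpg"), Entry("b", "x.jpg")]))   # False: per-path case
print(ask_per_file([Entry("a", "x.jpg"), Entry("a", "y.jpg")]))   # True: same dir
print(ask_per_file([Entry("a", "x.jpg"), Entry("b", "y.jpg")]))   # True: names differ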
@@ -222,7 +233,7 @@ class Duplicates:
                         # if this combination ALSO has an import path dup, then we have already counted the storage path dup in the earlier keeping count
                         if hash in self.ip_to_sp_dups_keep:
                             self.total_dups -= 1
-            # only 2 files, with the same name, different path (ask per path)
+            # only 2 files AND different path (ask per path) AND with the same name...
             else:
                 # if this dup path is not already being partially handled by an ip <-> sp dup, then add it / count it
                 if self.AddDupPath( hash ):
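The adjustment above avoids counting a storage-path keeper twice when the same hash already appeared in the import/storage bucket. A tiny sketch of that correction, with made-up numbers:

ip_to_sp_dups_keep = {"h1": "/storage/2020/20200101/a.jpg"}
total_dups = 5

hash_ = "h1"
if hash_ in ip_to_sp_dups_keep:
    total_dups -= 1                    # keeper already counted in the earlier pass
print(total_dups)                      # 4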
@@ -235,7 +246,7 @@ class Duplicates:
                     self.total_dups += 1
         return
 
-    # quick debugger to see the data in the data structure
+    # quick debugger to see the data in the data structure (not used by default)
     def Dump(self):
         if len(self.ip_to_sp_dups_keep) > 0:
             print( "############ Files that are in both Import and Storage Paths ###########")