From 046c512e6b7c7ba52c7917f7789f0847820faca0 Mon Sep 17 00:00:00 2001
From: Damien De Paoli
Date: Sun, 14 Mar 2021 14:33:22 +1100
Subject: [PATCH] added comments

---
 dups.py | 82 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 65 insertions(+), 17 deletions(-)

diff --git a/dups.py b/dups.py
index 6a4ed74..93d2fa9 100644
--- a/dups.py
+++ b/dups.py
@@ -24,6 +24,9 @@ from job import Job, JobExtra, Joblog, NewJob
 from settings import Settings
 from shared import SymlinkName
 
+# DupRow is a simple 'struct' class to keep the data for one duplicate file,
+# just to avoid intermixing python lists/dicts and to allow consistent use of
+# dot-notation for the fields
 class DupRow:
     def __init__(self, hash, file, dir, did, fid):
         ### DupRow Attributes -- note, simple class, no methods ###
@@ -33,12 +36,35 @@ class DupRow:
         self.did=did
         self.id=fid
 
+# The Duplicates class is used (one instance/object) to process all the
+# 'duplicate' data from the Database and parse it into more usable data
+# structures. This is also needed because the database content shows duplicates
+# more than once, e.g.
+# file1 and file2 are a duplicate, then later file2 and file1 are 'another' duplicate.
+# The class works over the data in 2 passes. The first pass, in AddDup(), finds
+# any files in both the import and storage paths and marks the storage ones to keep,
+# the import ones to delete. Anything else is either a set of files duplicated
+# inside the import path or a set of files duplicated in the storage path.
+# The first pass simply concatenates these into a data structure
+# (dups_to_process) that contains all the duplicates, keyed by the md5 hash.
+#
+# The second pass processes these duplicates to see if there are any in the
+# storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so, mark these to
+# keep and the rest to be deleted.
+#
+# After the 2 passes, we have data structures that allow the web to break up
+# the duplicates into batches to process:
+# 1) auto delete any in the import path that are also in the storage path
+#    - careful here: if we have 2 in the import path and 2+ in the storage path, leave it for manual intervention
+# 2) auto delete any in the storage path that are in a set where 1 of them matches the 'YYYY/YYYYMMDD' format, the rest are deleted
+# 3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
+# 4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose the file to keep
 class Duplicates:
     def __init__(self):
         ### Duplicates Attributes ###
         self.ip_to_sp_dups_keep={}
         self.ip_to_sp_dups_del={}
-        self.in_same_dups={}
+        self.dups_to_process={}
         self.per_file_dups=[]
         self.per_path_dups=[]
         self.preferred={}
@@ -46,42 +72,52 @@ class Duplicates:
         self.storage_paths=[]
         self.import_paths=[]
 
-        # per storage path, add entries to view
+        # pull apart the storage path Setting and build an array of each path for use in TrimmedPath()
         settings=Settings.query.first()
         paths = settings.storage_path.split("#")
         for path in paths:
             prefix = SymlinkName(path,path+'/')
             self.storage_paths.append(prefix)
             self.all_paths.append(prefix)
+        # pull apart the import path Setting and build an array of each path for use in TrimmedPath()
         paths = settings.import_path.split("#")
        for path in paths:
             prefix = SymlinkName(path,path+'/')
             self.import_paths.append(prefix)
             self.all_paths.append(prefix)
 
+    # Strip the front of the path (any match on a storage or import path),
+    # just to make it easier to read when we display it in the web page
     def TrimmedPath( self, path ):
         for p in self.all_paths:
             if re.match( f"^{p}", path ):
                 return path.replace(p, '' )
         return path
 
+    # is this file in the import path?
     def InImportPath( self, path ):
         for p in self.import_paths:
             if re.match( f"^{p}", path ):
                 return True
         return False
 
+    # is this file in the storage path?
     def InStoragePath( self, path ):
         for p in self.storage_paths:
             if re.match( f"^{p}", path ):
                 return True
         return False
 
+    # this stores the object in the keep list, ip_to_sp_dups_keep (only ever 1 per hash)
     def KeepInSameDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_keep:
             self.ip_to_sp_dups_keep[obj.h]= obj
+        else:
+            print( f"DDP: we need to cater for this - 2 files to keep in the storage path; if they are different, pull these out of here and put them in the dups_to_process list to manually process" )
         return
 
+    # this stores the object in the delete list, ip_to_sp_dups_del (if it is not
+    # already there)
     def DelInSameDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_del:
             self.ip_to_sp_dups_del[obj.h]=[]
@@ -93,6 +129,9 @@ class Duplicates:
         self.ip_to_sp_dups_del[obj.h].append( obj )
         return
 
+    # this function takes a duplicate that is in both the import path and the storage path,
+    # puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInSameDups(),
+    # and puts the import path file in the delete list (self.ip_to_sp_dups_del) via self.DelInSameDups()
     def DupInImportAndStoragePath( self, row, dr1, dr2 ):
         if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
             self.KeepInSameDups( dr1 )
@@ -104,35 +143,44 @@ class Duplicates:
             return True
         return False
 
+    # AddDup: takes a row from the database (effectively file1 & file2);
+    # we process these into appropriate data structures on this first pass
     def AddDup( self, row ):
         dr1=DupRow( row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1 )
         dr2=DupRow( row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2 )
+        # if in both import and storage path, just keep the storage path file,
+        # and delete the import path file
         if self.DupInImportAndStoragePath( row, dr1, dr2 ):
             return
 
-        if row.hash not in self.in_same_dups:
-            self.in_same_dups[row.hash]=[]
-            self.in_same_dups[row.hash].append( dr1 )
-            self.in_same_dups[row.hash].append( dr2 )
+        # if we are here, we have duplicates either in the storage path or in
+        # the import path
+
+        # if the hash is not in dups_to_process, create the entry / append
+        if row.hash not in self.dups_to_process:
+            self.dups_to_process[row.hash]=[]
+            self.dups_to_process[row.hash].append( dr1 )
+            self.dups_to_process[row.hash].append( dr2 )
         else:
-            # process path1 / fname1 -- if that combo is not in the dups[hash], add it
+            # process path1 / fname1 -- if that combo is not in the dups_to_process[hash], add it
             found=0
-            for dup in self.in_same_dups[row.hash]:
+            for dup in self.dups_to_process[row.hash]:
                 if dup.id == row.id1:
                     found=1
                     continue
             if not found:
-                self.in_same_dups[row.hash].append( dr1 )
+                self.dups_to_process[row.hash].append( dr1 )
 
-            # process path2 / fname2 -- if that combo is not in the dups[hash], add it
-            for dup in self.in_same_dups[row.hash]:
+            # process path2 / fname2 -- if that combo is not in the dups_to_process[hash], add it
+            for dup in self.dups_to_process[row.hash]:
                 if dup.id == row.id2:
                     found=1
                     continue
             if not found:
-                self.in_same_dups[row.hash].append( dr2 )
+                self.dups_to_process[row.hash].append( dr2 )
         return
 
+    # quick debug helper to see the data in the data structures
     def Dump(self):
         if len(self.ip_to_sp_dups_keep) > 0:
             print( "############ Files that are in both Import and Storage Paths ###########")
@@ -144,14 +192,14 @@ class Duplicates:
             for d in self.ip_to_sp_dups_del[h]:
                 print( f"Del: {d}" )
         print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
-        if len(self.in_same_dups) > 0:
-            print( "############ Duplicate Files that are in the same Path ###########")
+        if len(self.dups_to_process) > 0:
+            print( "############ Duplicate Files that need to be further processed ###########")
         cnt=0
-        for h in self.in_same_dups:
+        for h in self.dups_to_process:
             cnt +=1
-            if len(self.in_same_dups[h])>2:
+            if len(self.dups_to_process[h])>2:
                 print( f"hash={h}, keep 1 of these: ", end='')
-                for d in self.in_same_dups[h]:
+                for d in self.dups_to_process[h]:
                     print( f"{d.id}, ", end='' )
                 print ("")
         print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
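
Note: the comments added in this patch describe a first pass driven through AddDup() and a debug dump via Dump(). As a minimal sketch of how the Duplicates object might be driven, assuming a hypothetical dup_rows iterable of database rows carrying the hash, fname1/fname2, path1/path2, did1/did2 and id1/id2 fields that AddDup() reads (the duplicates query itself is not part of this patch):

    from dups import Duplicates

    dups = Duplicates()           # constructor reads storage/import paths from Settings
    for row in dup_rows:          # dup_rows: assumed result of the duplicates query
        dups.AddDup(row)          # first pass: sort each row into keep / delete / dups_to_process
    dups.Dump()                   # print the resulting data structures for inspection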