improved duplicate counting, removed the concept of sets, and validated that the counts are correct via DB selects (they are)
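
(For context on the "validated via DB selects" claim above: a minimal sketch of that kind of cross-check is below. The file table, hash column, and db import path are assumptions for illustration only, not the project's actual schema.)

# hypothetical cross-check of the new counters against the database;
# table/column names and the db handle are assumed, not taken from this repo
from sqlalchemy import text
from app import db   # assumed Flask-SQLAlchemy handle

# uniq_dups should match the number of distinct hashes that occur more than once
uniq_dups_db = db.session.execute(text(
    "SELECT COUNT(*) FROM (SELECT hash FROM file "
    "GROUP BY hash HAVING COUNT(*) > 1) AS dup_hashes"
)).scalar()

# total_dups should roughly match the number of files carrying one of those hashes
total_dups_db = db.session.execute(text(
    "SELECT COUNT(*) FROM file WHERE hash IN "
    "(SELECT hash FROM file GROUP BY hash HAVING COUNT(*) > 1)"
)).scalar()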

2021-03-27 14:24:16 +11:00
parent 7304fc0916
commit c4f36713bf

dups.py

@@ -94,8 +94,9 @@ class Duplicates:
self.all_paths=[]
self.storage_paths=[]
self.import_paths=[]
self.overall_dup_cnt=0
self.overall_dup_sets=0
self.hashes_processed={}
self.uniq_dups=0
self.total_dups=0
# pull apart the storage path Setting, and make an array of each for use in TrimmedPath()
settings=Settings.query.first()
@@ -133,11 +134,13 @@ class Duplicates:
return True
return False
# this stores this object into the keep from same path list (DDP: could there be more than 1)
# this stores this object into the keep from same path list (DDP: sometimes there can be more than 1 SP, e.g. SP to SP to IP)
# for now, rather than dealing with the extra SP, we just delete the IP and force another check_dups after deleting; that pass will then
# pick up and process the SP to SP case - if still needed -- if there is only SP1 to SP2, then the per_path_dup handling will pick it up, and
# I believe this all works, but it doesn't hurt to run an extra check_dups anyway
def KeepInIPSPDups( self, obj ):
if obj.h not in self.ip_to_sp_dups_keep:
self.ip_to_sp_dups_keep[obj.h]= obj
self.overall_dup_sets += 1
return
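
(To make the SP to SP to IP case above concrete, here is a small standalone sketch of the keep/delete bookkeeping; Obj is a simplified stand-in for the real DupRow class, and none of this is the project's code.)

# standalone illustration of the keep/delete bookkeeping described in the comment above
from collections import namedtuple

Obj = namedtuple("Obj", "h f d id")   # hash, filename, trimmed dir, row id

ip_to_sp_dups_keep = {}   # hash -> the storage-path copy we keep
ip_to_sp_dups_del = {}    # hash -> import-path copies queued for deletion

def keep(obj):
    # the first storage-path copy seen for a hash wins; any extra SP copy is left
    # for the follow-up check_dups run to resolve as an SP-to-SP duplicate
    ip_to_sp_dups_keep.setdefault(obj.h, obj)

def delete(obj):
    lst = ip_to_sp_dups_del.setdefault(obj.h, [])
    if all(el.id != obj.id for el in lst):
        lst.append(obj)

# one import-path file duplicated in two storage paths (the SP to SP to IP case)
sp1 = Obj("abc", "x.jpg", "2021/20210301", 1)
sp2 = Obj("abc", "x.jpg", "2021/20210302", 2)
ip  = Obj("abc", "x.jpg", "incoming",      3)

keep(sp1); keep(sp2)   # only sp1 is recorded; sp2 is caught on the next pass
delete(ip)             # the import-path copy is queued for deletion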
# this stores this object into the Delete from same path list (if it is not
@@ -151,8 +154,6 @@ class Duplicates:
if el.id == obj.id:
return
self.ip_to_sp_dups_del[obj.h].append( obj )
# we only get here if this is a new duplicate to delete, so increment the count
self.overall_dup_cnt += 1
return
# this function takes a duplicate file (in the import path and the storage path)
@@ -172,6 +173,7 @@ class Duplicates:
# AddDup: takes a row from the database, effectively file1 & file2
# we process these into appropriate data structures on this first pass
def AddDup( self, row ):
self.hashes_processed[row.hash]=1
dr1=DupRow( row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1 )
dr2=DupRow( row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2 )
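
(The DupRow fields used throughout this diff -- h, f, d, did, id -- can be inferred from the constructor calls above; a rough, hypothetical stand-in for readers without the rest of dups.py:)

# rough stand-in for DupRow, inferred only from how its fields are used in this
# diff; the real class is defined elsewhere in dups.py and may differ
from dataclasses import dataclass

@dataclass
class DupRowSketch:
    h: str      # content hash shared by the duplicate files
    f: str      # file name
    d: str      # directory, with the storage/import path prefix trimmed off
    did: int    # directory id in the database
    id: int     # file row id in the database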
# if in both import and storage path, just keep the storage path file,
@@ -206,39 +208,50 @@ class Duplicates:
def AddDupPath(self, hash):
# this gets complex: if this hash is also in a shared imp/sp dup, then don't deal with it now; let the imp files be deleted and
# the repeat check_dups validation step catch it as a cleaner (potential) pass for any remaining duplicates just in sp
if hash in self.ip_to_sp_dups_keep:
return
dpr=DupPathRow( 2, self.dups_to_process[hash][0].d, self.dups_to_process[hash][1].d, self.dups_to_process[hash][0].did, self.dups_to_process[hash][1].did, hash )
if hash in self.ip_to_sp_dups_keep:
return False
new=1
for el in self.per_path_dups:
if el.d1 == dpr.d1 and el.d2 == dpr.d2:
self.overall_dup_cnt += 2
el.count += 2
el.hashes = f"{el.hashes},{hash}"
new=0
if new:
self.per_path_dups.append( dpr )
self.overall_dup_sets += 1
self.overall_dup_cnt += 2
if re.search( r'\d{4}/\d{8}', dpr.d1):
self.preferred_path[dpr.did1]=1
if re.search( r'\d{4}/\d{8}', dpr.d2):
self.preferred_path[dpr.did2]=1
return
return True
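
(The preferred-path test above keys off directories that look like a four-digit year folder containing an eight-digit dated folder, presumably YYYY/YYYYMMDD; a quick illustration, with made-up paths:)

import re

# directories laid out as <YYYY>/<YYYYMMDD>/... are flagged as "preferred"
print(bool(re.search(r'\d{4}/\d{8}', "2021/20210327/holiday")))   # True
print(bool(re.search(r'\d{4}/\d{8}', "incoming/phone_backup")))   # False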
def SecondPass(self):
# sort out counts (for ip_to_sp - that is all finished)
self.uniq_dups = len(self.hashes_processed)
# total starts with 1 copy of everything we keep in sp
self.total_dups = len(self.ip_to_sp_dups_keep)
# and then add all those we delete in ip that are in sp
for hash in self.ip_to_sp_dups_del:
self.total_dups += len(self.ip_to_sp_dups_del[hash])
for hash in self.dups_to_process:
# more than 2 files (just ask per file) OR only 2 copies, and files are in same dir (so must be diff name, so just ask) OR content same, filename different (ask per file)
if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
self.overall_dup_sets += 1
self.overall_dup_cnt += len(self.dups_to_process[hash])
self.per_file_dups.append(self.dups_to_process[hash])
for el in self.dups_to_process[hash]:
if re.search( r'\d{4}/\d{8}', el.d):
self.preferred_file[hash] = el.id
# by here we have only 2 files, with the same name, different path (ask per path)
self.total_dups += len(self.dups_to_process[hash])
# if this combination ALSO has an import path dup, then we have already counted the storage path dup in the earlier keeping count
if hash in self.ip_to_sp_dups_keep:
self.total_dups -= 1
# only 2 files, with the same name, different path (ask per path)
else:
self.AddDupPath( hash )
# if this dup path is not already being partially handled by an ip <-> sp dup, then add it / count it
if self.AddDupPath( hash ):
self.total_dups += 2
else:
self.total_dups += 1
return
# quick debugger to see the data in the data structure
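
(A hand-worked toy example of the counting rules in SecondPass -- illustrative only, not data from the repo.)

# toy walk-through of the SecondPass counting rules (illustrative only)
#
# hash A: one copy in a storage path + one copy in an import path
#   -> kept SP copy counts 1, deleted IP copy counts 1              => 2
# hash B: two copies, same file name, different storage-path dirs
#   -> handled per path via AddDupPath                              => 2
# hash C: three copies spread across storage paths
#   -> handled per file, every copy counted                         => 3
uniq_dups = 3           # distinct hashes with duplicates: A, B, C
total_dups = 2 + 2 + 3  # each physical duplicate file counted once
assert uniq_dups == 3 and total_dups == 7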