From 8ff61dddfa1ac1eeb1fe260a0543dc9ff1b0660d Mon Sep 17 00:00:00 2001
From: Damien De Paoli
Date: Sat, 13 Mar 2021 12:36:16 +1100
Subject: [PATCH] trialing a new Duplicates class to deal more consistently
 with the various types of duplicates -- mostly to enable "auto" deletion of
 duplicates under specific conditions, e.g. a file in both an import dir and
 a storage dir - just delete the copy from the import dir

---
 dups.py  | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 files.py |  16 ++++-
 2 files changed, 215 insertions(+), 2 deletions(-)
 create mode 100644 dups.py

diff --git a/dups.py b/dups.py
new file mode 100644
index 0000000..cddaec4
--- /dev/null
+++ b/dups.py
@@ -0,0 +1,201 @@
+from wtforms import SubmitField, StringField, HiddenField, validators, Form
+from flask_wtf import FlaskForm
+from flask import request, render_template, redirect, send_from_directory
+from main import db, app, ma
+from sqlalchemy import Sequence
+from sqlalchemy.exc import SQLAlchemyError
+from status import st, Status
+import os
+import glob
+from PIL import Image
+from pymediainfo import MediaInfo
+import hashlib
+import exifread
+import base64
+import numpy
+import cv2
+import time
+import re
+
+################################################################################
+# Local Class imports
+################################################################################
+from job import Job, JobExtra, Joblog, NewJob
+from settings import Settings
+from shared import SymlinkName
+
+class Duplicates:
+    def __init__(self):
+        # per-instance state for one pass over the duplicate rows
+        self.ip_to_sp_dups_keep={}
+        self.ip_to_sp_dups_del={}
+        self.in_same_dups={}
+        self.per_file_dups=[]
+        self.per_path_dups=[]
+        self.preferred={}
+        self.all_paths=[]
+        self.storage_paths=[]
+        self.import_paths=[]
+        # per storage path, add entries to view
+        settings=Settings.query.first()
+        paths = settings.storage_path.split("#")
+        for path in paths:
+            prefix = SymlinkName(path,path+'/')
+            self.storage_paths.append(prefix)
+            self.all_paths.append(prefix)
+        paths = settings.import_path.split("#")
+        for path in paths:
+            prefix = SymlinkName(path,path+'/')
+            self.import_paths.append(prefix)
+            self.all_paths.append(prefix)
+
+    def TrimmedPath( self, path ):
+        for p in self.all_paths:
+            if re.match( f"^{p}", path ):
+                return path.replace(p, '' )
+        return path
+
+    def InImportPath( self, path ):
+        for p in self.import_paths:
+            if re.match( f"^{p}", path ):
+                return True
+        return False
+
+    def InStoragePath( self, path ):
+        for p in self.storage_paths:
+            if re.match( f"^{p}", path ):
+                return True
+        return False
+
+    def KeepInSameDups( self, obj ):
+        if obj['h'] not in self.ip_to_sp_dups_keep:
+            self.ip_to_sp_dups_keep[obj['h']]= obj
+        return
+
+    def DelInSameDups( self, obj ):
+        if obj['h'] not in self.ip_to_sp_dups_del:
+            self.ip_to_sp_dups_del[obj['h']]=[]
+            self.ip_to_sp_dups_del[obj['h']].append( obj )
+        else:
+            for el in self.ip_to_sp_dups_del[obj['h']]:
+                if el['id'] == obj['id']:
+                    return
+            self.ip_to_sp_dups_del[obj['h']].append( obj )
+        return
+
+    def AddDup( self, row ):
+        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
+            self.KeepInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
+            self.DelInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
+
+        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
+            self.KeepInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
+            self.DelInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
+
+        return
+
+    def Dump(self):
+        if len(self.ip_to_sp_dups_keep) > 0:
+            print( "############ Files that are in both Import and Storage Paths ###########")
+            cnt=0
+            for h in self.ip_to_sp_dups_keep:
+                cnt +=1
+                if len(self.ip_to_sp_dups_del[h])>2:
+                    print( f"hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
+                    for d in self.ip_to_sp_dups_del[h]:
+                        print( f"Del: {d}" )
+            print( f"{cnt} sets of duplicate files with at least 1 copy to delete; any set with more than 2 copies to delete is printed above explicitly" )
+        return
+
+"""
+    if row.hash not in dups:
+        dups[row.hash]=[]
+        dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
+        dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
+    else:
+        # process path1 / fname1 -- if that combo is not in the dups[hash], add it
+        found=0
+        for dup in dups[row.hash]:
+            if dup['f'] == row.fname1 and dup['d'] == TrimmedPath(prefix, row.path1):
+                found=1
+                continue
+        if not found:
+            dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
+
+        # process path2 / fname2 -- if that combo is not in the dups[hash], add it
+        found=0
+        for dup in dups[row.hash]:
+            if dup['f'] == row.fname2 and dup['d'] == TrimmedPath(prefix, row.path2):
+                found=1
+                continue
+        if not found:
+            dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
+
+@app.route("/fix_dups", methods=["POST"])
+def fix_dups():
+    rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
+
+    if rows.returns_rows == False:
+        st.SetAlert("success")
+        st.SetMessage(f"Err, no dups - should now clear the FE 'danger' message?")
+        return render_template("base.html")
+
+    jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
+    path=[jex.value for jex in jexes if jex.name == "path"][0]
+    prefix = SymlinkName(path,path+'/')
+    if 'pagesize' not in request.form:
+        pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
+    else:
+        pagesize=int(request.form['pagesize'])
+    dups={}
+    for row in rows:
+        AddDup( prefix+'/', row, dups )
+
+    d1=""
+    d2=""
+    did1=""
+    did2=""
+    str=""
+    dup_cnt=1
+    preferred={}
+    per_file_dups=[]
+    per_path_dups=[]
+    hashes=""
+    overall_dup_cnt=0
+    overall_dup_sets=0
+    for hash in dups:
+        # more than 2 files (just ask per file) OR
+        # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
+        # content same, filename different (just ask per file)
+        if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
+            per_file_dups.append(dups[hash])
+            overall_dup_cnt += len(dups[hash])
+            overall_dup_sets += 1
+            for el in dups[hash]:
+                if re.search( '\d{4}/\d{8}', el['d']):
+                    preferred[hash] = el['id']
+                    if overall_dup_cnt<5:
+                        print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" ) + # by here we have only 2 files, with the same name, different path + # (MOST COMMON, and I think we dont care per file, just per path) + elif d1 != dups[hash][0]['d']: + if d1 != '': + overall_dup_cnt += dup_cnt + overall_dup_sets += 1 + per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes }) + dup_cnt=1 + d1 = dups[hash][0]['d'] + d2 = dups[hash][1]['d'] + did1 = dups[hash][0]['did'] + did2 = dups[hash][1]['did'] + str=f"duplicates found in {d1} and {d2}" + hashes = f"{hash}," + else: + dup_cnt += 1 + hashes += f"{hash}," + + if d1 != '': + overall_dup_cnt += dup_cnt + overall_dup_sets += dup_cnt + per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes }) +""" diff --git a/files.py b/files.py index df5f196..60b16c5 100644 --- a/files.py +++ b/files.py @@ -25,6 +25,7 @@ from person import Person, PersonRefimgLink from refimg import Refimg from settings import Settings from shared import SymlinkName +from dups import Duplicates ################################################################################ # Class describing File in the database, and via sqlalchemy, connected to the DB as well @@ -276,9 +277,11 @@ def fix_dups(): pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0]) else: pagesize=int(request.form['pagesize']) - dups={} + D=Duplicates() for row in rows: - AddDup( prefix+'/', row, dups ) + D.AddDup( row ) + + print( D.Dump() ) d1="" d2="" @@ -292,6 +295,13 @@ def fix_dups(): hashes="" overall_dup_cnt=0 overall_dup_sets=0 + + return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize ) + +""" + dups={} + for row in rows: + AddDup( prefix+'/', row, dups ) for hash in dups: # more than 2 files (just ask per file) OR # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR @@ -330,6 +340,8 @@ def fix_dups(): return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize ) +""" + @app.route("/rm_dups", methods=["POST"]) def rm_dups():