trialing a new Duplicates class to handle the various types of duplicates more consistently -- mostly to enable "auto" deletion of duplicates under specific conditions, e.g. when a file exists in both an import dir and a storage dir, just delete the copy in the import dir

2021-03-13 12:36:16 +11:00
parent 155068ab85
commit 8ff61dddfa
2 changed files with 215 additions and 2 deletions

dups.py (new file, 201 lines)

@@ -0,0 +1,201 @@
from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, redirect, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
from status import st, Status
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
import re
################################################################################
# Local Class imports
################################################################################
from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName
class Duplicates:
    # Groups duplicate-file pairs by type so that specific classes of duplicate
    # (e.g. an import-dir copy of a file that already exists in a storage dir)
    # can be auto-deleted rather than asked about one by one.
    ip_to_sp_dups_keep={}
    ip_to_sp_dups_del={}
    in_same_dups={}
    per_file_dups=[]
    per_path_dups=[]
    preferred={}
    all_paths=[]
    storage_paths=[]
    import_paths=[]

    def __init__(self):
        # build the storage and import path prefix lists from settings
        settings=Settings.query.first()
        paths = settings.storage_path.split("#")
        for path in paths:
            prefix = SymlinkName(path,path+'/')
            self.storage_paths.append(prefix)
            self.all_paths.append(prefix)
        paths = settings.import_path.split("#")
        for path in paths:
            prefix = SymlinkName(path,path+'/')
            self.import_paths.append(prefix)
            self.all_paths.append(prefix)

    def TrimmedPath( self, path ):
        # strip the matching storage/import prefix from a full path
        for p in self.all_paths:
            if re.match( f"^{p}", path ):
                return path.replace(p, '' )
        return path

    def InImportPath( self, path ):
        # True if the path sits under one of the configured import paths
        for p in self.import_paths:
            if re.match( f"^{p}", path ):
                return True
        return False

    def InStoragePath( self, path ):
        # True if the path sits under one of the configured storage paths
        for p in self.storage_paths:
            if re.match( f"^{p}", path ):
                return True
        return False

    def KeepInSameDups( self, obj ):
        # remember the (single) copy to keep for this hash
        if obj['h'] not in self.ip_to_sp_dups_keep:
            self.ip_to_sp_dups_keep[obj['h']]= obj
        return

    def DelInSameDups( self, obj ):
        # remember a copy to delete for this hash, avoiding duplicate entries
        if obj['h'] not in self.ip_to_sp_dups_del:
            self.ip_to_sp_dups_del[obj['h']]=[]
            self.ip_to_sp_dups_del[obj['h']].append( obj )
        else:
            for el in self.ip_to_sp_dups_del[obj['h']]:
                if el['id'] == obj['id']:
                    return
            self.ip_to_sp_dups_del[obj['h']].append( obj )
        return

    def AddDup( self, row ):
        # classify one duplicate pair: if one copy lives in a storage path and
        # the other in an import path, keep the storage copy and mark the
        # import copy for deletion
        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
            self.KeepInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
            self.DelInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
            self.KeepInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
            self.DelInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
        return

    def Dump(self):
        # print a summary of the import-vs-storage duplicates found so far
        if len(self.ip_to_sp_dups_keep) > 0:
            print( "############ Files that are in both Import and Storage Paths ###########")
            cnt=0
            for h in self.ip_to_sp_dups_keep:
                cnt +=1
                if len(self.ip_to_sp_dups_del[h])>2:
                    print( f"hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
                    for d in self.ip_to_sp_dups_del[h]:
                        print( f"Del: {d}" )
            print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
        return
"""
if row.hash not in dups:
dups[row.hash]=[]
dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
else:
# process path1 / fname1 -- if that combo is not in the dups[hash], add it
found=0
for dup in dups[row.hash]:
if dup['f'] == row.fname1 and dup['d'] == TrimmedPath(prefix, row.path1):
found=1
continue
if not found:
dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
# process path2 / fname2 -- if that combo is not in the dups[hash], add it
found=0
for dup in dups[row.hash]:
if dup['f'] == row.fname2 and dup['d'] == TrimmedPath(prefix, row.path2):
found=1
continue
if not found:
dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
@app.route("/fix_dups", methods=["POST"])
def fix_dups():
rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
if rows.returns_rows == False:
st.SetAlert("success")
st.SetMessage(f"Err, no dups - should now clear the FE 'danger' message?")
return render_template("base.html")
jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
path=[jex.value for jex in jexes if jex.name == "path"][0]
prefix = SymlinkName(path,path+'/')
if 'pagesize' not in request.form:
pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
else:
pagesize=int(request.form['pagesize'])
dups={}
for row in rows:
AddDup( prefix+'/', row, dups )
d1=""
d2=""
did1=""
did2=""
str=""
dup_cnt=1
preferred={}
per_file_dups=[]
per_path_dups=[]
hashes=""
overall_dup_cnt=0
overall_dup_sets=0
for hash in dups:
# more than 2 files (just ask per file) OR
# only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
# content same, filename different (just ask per file)
if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
per_file_dups.append(dups[hash])
overall_dup_cnt += len(dups[hash])
overall_dup_sets += 1
for el in dups[hash]:
if re.search( '\d{4}/\d{8}', el['d']):
preferred[hash] = el['id']
if overall_dup_cnt<5:
print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
# by here we have only 2 files, with the same name, different path
# (MOST COMMON, and I think we dont care per file, just per path)
elif d1 != dups[hash][0]['d']:
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += 1
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
dup_cnt=1
d1 = dups[hash][0]['d']
d2 = dups[hash][1]['d']
did1 = dups[hash][0]['did']
did2 = dups[hash][1]['did']
str=f"duplicates found in {d1} and {d2}"
hashes = f"{hash},"
else:
dup_cnt += 1
hashes += f"{hash},"
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += dup_cnt
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
"""


@@ -25,6 +25,7 @@ from person import Person, PersonRefimgLink
 from refimg import Refimg
 from settings import Settings
 from shared import SymlinkName
+from dups import Duplicates
 ################################################################################
 # Class describing File in the database, and via sqlalchemy, connected to the DB as well
@@ -276,9 +277,11 @@ def fix_dups():
         pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
     else:
         pagesize=int(request.form['pagesize'])
-    dups={}
+    D=Duplicates()
     for row in rows:
-        AddDup( prefix+'/', row, dups )
+        D.AddDup( row )
+    print( D.Dump() )
     d1=""
     d2=""
@@ -292,6 +295,13 @@ def fix_dups():
     hashes=""
     overall_dup_cnt=0
     overall_dup_sets=0
+    return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
+    """
+    dups={}
+    for row in rows:
+        AddDup( prefix+'/', row, dups )
     for hash in dups:
         # more than 2 files (just ask per file) OR
         # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
@@ -330,6 +340,8 @@ def fix_dups():
     return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
+    """
 @app.route("/rm_dups", methods=["POST"])
 def rm_dups():