From 8ff61dddfa1ac1eeb1fe260a0543dc9ff1b0660d Mon Sep 17 00:00:00 2001
From: Damien De Paoli
Date: Sat, 13 Mar 2021 12:36:16 +1100
Subject: [PATCH] trialing a new Duplicates class to deal more consistently
 with the various types of duplicates -- mostly to enable "auto" deletion of
 duplicates under specific conditions, e.g. a file in both an import dir and
 a storage dir - just delete the copy from the import dir

---
 dups.py  | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 files.py |  16 ++++-
 2 files changed, 215 insertions(+), 2 deletions(-)
 create mode 100644 dups.py

diff --git a/dups.py b/dups.py
new file mode 100644
index 0000000..cddaec4
--- /dev/null
+++ b/dups.py
@@ -0,0 +1,201 @@
+from wtforms import SubmitField, StringField, HiddenField, validators, Form
+from flask_wtf import FlaskForm
+from flask import request, render_template, redirect, send_from_directory
+from main import db, app, ma
+from sqlalchemy import Sequence
+from sqlalchemy.exc import SQLAlchemyError
+from status import st, Status
+import os
+import glob
+from PIL import Image
+from pymediainfo import MediaInfo
+import hashlib
+import exifread
+import base64
+import numpy
+import cv2
+import time
+import re
+
+################################################################################
+# Local Class imports
+################################################################################
+from job import Job, JobExtra, Joblog, NewJob
+from settings import Settings
+from shared import SymlinkName
+
+class Duplicates:
+    def __init__(self):
+        # per-instance state for one pass over the duplicate rows
+        self.ip_to_sp_dups_keep={}
+        self.ip_to_sp_dups_del={}
+        self.in_same_dups={}
+        self.per_file_dups=[]
+        self.per_path_dups=[]
+        self.preferred={}
+        self.all_paths=[]
+        self.storage_paths=[]
+        self.import_paths=[]
+        # per storage path, add entries to view
+        settings=Settings.query.first()
+        paths = settings.storage_path.split("#")
+        for path in paths:
+            prefix = SymlinkName(path,path+'/')
+            self.storage_paths.append(prefix)
+            self.all_paths.append(prefix)
+        paths = settings.import_path.split("#")
+        for path in paths:
+            prefix = SymlinkName(path,path+'/')
+            self.import_paths.append(prefix)
+            self.all_paths.append(prefix)
+
+    def TrimmedPath( self, path ):
+        for p in self.all_paths:
+            if re.match( f"^{p}", path ):
+                return path.replace(p, '' )
+        return path
+
+    def InImportPath( self, path ):
+        for p in self.import_paths:
+            if re.match( f"^{p}", path ):
+                return True
+        return False
+
+    def InStoragePath( self, path ):
+        for p in self.storage_paths:
+            if re.match( f"^{p}", path ):
+                return True
+        return False
+
+    def KeepInSameDups( self, obj ):
+        if obj['h'] not in self.ip_to_sp_dups_keep:
+            self.ip_to_sp_dups_keep[obj['h']]= obj
+        return
+
+    def DelInSameDups( self, obj ):
+        if obj['h'] not in self.ip_to_sp_dups_del:
+            self.ip_to_sp_dups_del[obj['h']]=[]
+            self.ip_to_sp_dups_del[obj['h']].append( obj )
+        else:
+            for el in self.ip_to_sp_dups_del[obj['h']]:
+                if el['id'] == obj['id']:
+                    return
+            self.ip_to_sp_dups_del[obj['h']].append( obj )
+        return
+
+    def AddDup( self, row ):
+        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
+            self.KeepInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
+            self.DelInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
+
+        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
+            self.KeepInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
+            self.DelInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
+
+        return
+
+    def Dump(self):
+        if len(self.ip_to_sp_dups_keep) > 0:
+            print( "############ Files that are in both Import and Storage Paths ###########")
+            cnt=0
+            for h in self.ip_to_sp_dups_keep:
+                cnt +=1
+                if len(self.ip_to_sp_dups_del[h])>2:
+                    print( f"hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
+                    for d in self.ip_to_sp_dups_del[h]:
+                        print( f"Del: {d}" )
+            print( f"{cnt} sets of duplicate files with at least 1 copy to delete; any set with more than 2 copies to delete is printed above explicitly" )
+        return
+
+"""
+    if row.hash not in dups:
+        dups[row.hash]=[]
+        dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
+        dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
+    else:
+        # process path1 / fname1 -- if that combo is not in the dups[hash], add it
+        found=0
+        for dup in dups[row.hash]:
+            if dup['f'] == row.fname1 and dup['d'] == TrimmedPath(prefix, row.path1):
+                found=1
+                continue
+        if not found:
+            dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
+
+        # process path2 / fname2 -- if that combo is not in the dups[hash], add it
+        found=0
+        for dup in dups[row.hash]:
+            if dup['f'] == row.fname2 and dup['d'] == TrimmedPath(prefix, row.path2):
+                found=1
+                continue
+        if not found:
+            dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
+
+@app.route("/fix_dups", methods=["POST"])
+def fix_dups():
+    rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
+
+    if rows.returns_rows == False:
+        st.SetAlert("success")
+        st.SetMessage(f"Err, no dups - should now clear the FE 'danger' message?")
+        return render_template("base.html")
+
+    jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
+    path=[jex.value for jex in jexes if jex.name == "path"][0]
+    prefix = SymlinkName(path,path+'/')
+    if 'pagesize' not in request.form:
+        pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
+    else:
+        pagesize=int(request.form['pagesize'])
+    dups={}
+    for row in rows:
+        AddDup( prefix+'/', row, dups )
+
+    d1=""
+    d2=""
+    did1=""
+    did2=""
+    str=""
+    dup_cnt=1
+    preferred={}
+    per_file_dups=[]
+    per_path_dups=[]
+    hashes=""
+    overall_dup_cnt=0
+    overall_dup_sets=0
+    for hash in dups:
+        # more than 2 files (just ask per file) OR
+        # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
+        # content same, filename different (just ask per file)
+        if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
+            per_file_dups.append(dups[hash])
+            overall_dup_cnt += len(dups[hash])
+            overall_dup_sets += 1
+            for el in dups[hash]:
+                if re.search( '\d{4}/\d{8}', el['d']):
+                    preferred[hash] = el['id']
+                    if overall_dup_cnt<5:
+                        print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" ) + # by here we have only 2 files, with the same name, different path + # (MOST COMMON, and I think we dont care per file, just per path) + elif d1 != dups[hash][0]['d']: + if d1 != '': + overall_dup_cnt += dup_cnt + overall_dup_sets += 1 + per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes }) + dup_cnt=1 + d1 = dups[hash][0]['d'] + d2 = dups[hash][1]['d'] + did1 = dups[hash][0]['did'] + did2 = dups[hash][1]['did'] + str=f"duplicates found in {d1} and {d2}" + hashes = f"{hash}," + else: + dup_cnt += 1 + hashes += f"{hash}," + + if d1 != '': + overall_dup_cnt += dup_cnt + overall_dup_sets += dup_cnt + per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes }) +""" diff --git a/files.py b/files.py index df5f196..60b16c5 100644 --- a/files.py +++ b/files.py @@ -25,6 +25,7 @@ from person import Person, PersonRefimgLink from refimg import Refimg from settings import Settings from shared import SymlinkName +from dups import Duplicates ################################################################################ # Class describing File in the database, and via sqlalchemy, connected to the DB as well @@ -276,9 +277,11 @@ def fix_dups(): pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0]) else: pagesize=int(request.form['pagesize']) - dups={} + D=Duplicates() for row in rows: - AddDup( prefix+'/', row, dups ) + D.AddDup( row ) + + print( D.Dump() ) d1="" d2="" @@ -292,6 +295,13 @@ def fix_dups(): hashes="" overall_dup_cnt=0 overall_dup_sets=0 + + return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize ) + +""" + dups={} + for row in rows: + AddDup( prefix+'/', row, dups ) for hash in dups: # more than 2 files (just ask per file) OR # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR @@ -330,6 +340,8 @@ def fix_dups(): return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize ) +""" + @app.route("/rm_dups", methods=["POST"]) def rm_dups():