trialing a new Duplicates class to handle the various types of duplicates more consistently -- mostly to enable "auto" deletion of duplicates under specific conditions, e.g. when a file exists in both an import dir and a storage dir, just delete the copy in the import dir

2021-03-13 12:36:16 +11:00
parent 155068ab85
commit 8ff61dddfa
2 changed files with 215 additions and 2 deletions

dups.py (new file, 201 lines)

@@ -0,0 +1,201 @@
from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, redirect, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
from status import st, Status
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
import re
################################################################################
# Local Class imports
################################################################################
from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName
class Duplicates:
    # Groups duplicate-file pairs by type so that specific classes of duplicate
    # (e.g. an import-dir copy of a file that already exists in a storage dir)
    # can be auto-deleted rather than asked about one by one.
    ip_to_sp_dups_keep={}
    ip_to_sp_dups_del={}
    in_same_dups={}
    per_file_dups=[]
    per_path_dups=[]
    preferred={}
    all_paths=[]
    storage_paths=[]
    import_paths=[]

    def __init__(self):
        # build the storage and import path prefix lists from settings
        settings=Settings.query.first()
        paths = settings.storage_path.split("#")
        for path in paths:
            prefix = SymlinkName(path,path+'/')
            self.storage_paths.append(prefix)
            self.all_paths.append(prefix)
        paths = settings.import_path.split("#")
        for path in paths:
            prefix = SymlinkName(path,path+'/')
            self.import_paths.append(prefix)
            self.all_paths.append(prefix)

    def TrimmedPath( self, path ):
        # strip the matching storage/import prefix from a full path
        for p in self.all_paths:
            if re.match( f"^{p}", path ):
                return path.replace(p, '' )
        return path

    def InImportPath( self, path ):
        # True if the path sits under one of the configured import paths
        for p in self.import_paths:
            if re.match( f"^{p}", path ):
                return True
        return False

    def InStoragePath( self, path ):
        # True if the path sits under one of the configured storage paths
        for p in self.storage_paths:
            if re.match( f"^{p}", path ):
                return True
        return False

    def KeepInSameDups( self, obj ):
        # remember the (single) copy to keep for this hash
        if obj['h'] not in self.ip_to_sp_dups_keep:
            self.ip_to_sp_dups_keep[obj['h']]= obj
        return

    def DelInSameDups( self, obj ):
        # remember a copy to delete for this hash, avoiding duplicate entries
        if obj['h'] not in self.ip_to_sp_dups_del:
            self.ip_to_sp_dups_del[obj['h']]=[]
            self.ip_to_sp_dups_del[obj['h']].append( obj )
        else:
            for el in self.ip_to_sp_dups_del[obj['h']]:
                if el['id'] == obj['id']:
                    return
            self.ip_to_sp_dups_del[obj['h']].append( obj )
        return

    def AddDup( self, row ):
        # classify one duplicate pair: if one copy lives in a storage path and
        # the other in an import path, keep the storage copy and mark the
        # import copy for deletion
        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
            self.KeepInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
            self.DelInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
            self.KeepInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
            self.DelInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
        return

    def Dump(self):
        # print a summary of the import-vs-storage duplicates found so far
        if len(self.ip_to_sp_dups_keep) > 0:
            print( "############ Files that are in both Import and Storage Paths ###########")
            cnt=0
            for h in self.ip_to_sp_dups_keep:
                cnt +=1
                if len(self.ip_to_sp_dups_del[h])>2:
                    print( f"hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
                    for d in self.ip_to_sp_dups_del[h]:
                        print( f"Del: {d}" )
            print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
        return
"""
if row.hash not in dups:
dups[row.hash]=[]
dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
else:
# process path1 / fname1 -- if that combo is not in the dups[hash], add it
found=0
for dup in dups[row.hash]:
if dup['f'] == row.fname1 and dup['d'] == TrimmedPath(prefix, row.path1):
found=1
continue
if not found:
dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
# process path2 / fname2 -- if that combo is not in the dups[hash], add it
found=0
for dup in dups[row.hash]:
if dup['f'] == row.fname2 and dup['d'] == TrimmedPath(prefix, row.path2):
found=1
continue
if not found:
dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
@app.route("/fix_dups", methods=["POST"])
def fix_dups():
rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
if rows.returns_rows == False:
st.SetAlert("success")
st.SetMessage(f"Err, no dups - should now clear the FE 'danger' message?")
return render_template("base.html")
jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
path=[jex.value for jex in jexes if jex.name == "path"][0]
prefix = SymlinkName(path,path+'/')
if 'pagesize' not in request.form:
pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
else:
pagesize=int(request.form['pagesize'])
dups={}
for row in rows:
AddDup( prefix+'/', row, dups )
d1=""
d2=""
did1=""
did2=""
str=""
dup_cnt=1
preferred={}
per_file_dups=[]
per_path_dups=[]
hashes=""
overall_dup_cnt=0
overall_dup_sets=0
for hash in dups:
# more than 2 files (just ask per file) OR
# only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
# content same, filename different (just ask per file)
if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
per_file_dups.append(dups[hash])
overall_dup_cnt += len(dups[hash])
overall_dup_sets += 1
for el in dups[hash]:
if re.search( '\d{4}/\d{8}', el['d']):
preferred[hash] = el['id']
if overall_dup_cnt<5:
print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
# by here we have only 2 files, with the same name, different path
# (MOST COMMON, and I think we dont care per file, just per path)
elif d1 != dups[hash][0]['d']:
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += 1
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
dup_cnt=1
d1 = dups[hash][0]['d']
d2 = dups[hash][1]['d']
did1 = dups[hash][0]['did']
did2 = dups[hash][1]['did']
str=f"duplicates found in {d1} and {d2}"
hashes = f"{hash},"
else:
dup_cnt += 1
hashes += f"{hash},"
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += dup_cnt
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
"""


@@ -25,6 +25,7 @@ from person import Person, PersonRefimgLink
 from refimg import Refimg
 from settings import Settings
 from shared import SymlinkName
+from dups import Duplicates
 ################################################################################
 # Class describing File in the database, and via sqlalchemy, connected to the DB as well
@@ -276,9 +277,11 @@ def fix_dups():
         pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
     else:
         pagesize=int(request.form['pagesize'])
-    dups={}
+    D=Duplicates()
     for row in rows:
-        AddDup( prefix+'/', row, dups )
+        D.AddDup( row )
+    print( D.Dump() )
     d1=""
     d2=""
@@ -292,6 +295,13 @@ def fix_dups():
     hashes=""
     overall_dup_cnt=0
     overall_dup_sets=0
+    return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
+    """
+    dups={}
+    for row in rows:
+        AddDup( prefix+'/', row, dups )
     for hash in dups:
         # more than 2 files (just ask per file) OR
         # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
@@ -330,6 +340,8 @@ def fix_dups():
     return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
+    """
 @app.route("/rm_dups", methods=["POST"])
 def rm_dups():