Trialing a new Duplicates class to handle the various types of duplicates more consistently -- mostly to enable "auto" deletion of duplicates under specific conditions, e.g. when a file exists in both an import dir and a storage dir, just delete the copy in the import dir.
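As an illustration of the intended "auto" delete flow (not something this commit does yet), a minimal sketch: it assumes `rows` comes from the existing duplicate query in fix_dups, and delete_entry(entry_id) stands in for a hypothetical helper that removes a single entry.

from dups import Duplicates

def auto_delete_import_dups(rows, delete_entry):
    # Classify every duplicate pair, then keep the storage-path copy and
    # delete each recorded import-path copy of the same hash.
    D = Duplicates()
    for row in rows:
        D.AddDup(row)

    for h, keep in D.ip_to_sp_dups_keep.items():
        for dup in D.ip_to_sp_dups_del.get(h, []):
            print(f"keeping {keep['d']}{keep['f']}, deleting {dup['d']}{dup['f']}")
            delete_entry(dup['id'])    # hypothetical helper, not implemented here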
dups.py (new file, 201 lines)
@@ -0,0 +1,201 @@
from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, redirect, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
from status import st, Status
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
import re

################################################################################
# Local Class imports
################################################################################
from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName

class Duplicates:

    def __init__(self):
        # Per-instance state (initialised here so repeated instantiations do
        # not keep appending to shared class-level lists)
        self.ip_to_sp_dups_keep={}
        self.ip_to_sp_dups_del={}
        self.in_same_dups={}
        self.per_file_dups=[]
        self.per_path_dups=[]
        self.preferred={}
        self.all_paths=[]
        self.storage_paths=[]
        self.import_paths=[]

        # per storage path, add entries to view
        settings=Settings.query.first()
        paths = settings.storage_path.split("#")
        for path in paths:
            prefix = SymlinkName(path,path+'/')
            self.storage_paths.append(prefix)
            self.all_paths.append(prefix)

        paths = settings.import_path.split("#")
        for path in paths:
            prefix = SymlinkName(path,path+'/')
            self.import_paths.append(prefix)
            self.all_paths.append(prefix)

    def TrimmedPath( self, path ):
        # Strip the matching storage/import prefix from a path, if any
        for p in self.all_paths:
            if re.match( f"^{p}", path ):
                return path.replace(p, '' )
        return path

    def InImportPath( self, path ):
        # True if the path falls under one of the configured import paths
        for p in self.import_paths:
            if re.match( f"^{p}", path ):
                return True
        return False

    def InStoragePath( self, path ):
        # True if the path falls under one of the configured storage paths
        for p in self.storage_paths:
            if re.match( f"^{p}", path ):
                return True
        return False

    def KeepInSameDups( self, obj ):
        # Record the copy to keep for this hash (first one recorded wins)
        if obj['h'] not in self.ip_to_sp_dups_keep:
            self.ip_to_sp_dups_keep[obj['h']]= obj
        return

    def DelInSameDups( self, obj ):
        # Record a copy to delete for this hash, skipping ids already recorded
        if obj['h'] not in self.ip_to_sp_dups_del:
            self.ip_to_sp_dups_del[obj['h']]=[]
            self.ip_to_sp_dups_del[obj['h']].append( obj )
        else:
            for el in self.ip_to_sp_dups_del[obj['h']]:
                if el['id'] == obj['id']:
                    return
            self.ip_to_sp_dups_del[obj['h']].append( obj )
        return

    def AddDup( self, row ):
        # A row describes two copies of the same hash. If one copy sits in a
        # storage path and the other in an import path, keep the storage copy
        # and mark the import copy for deletion.
        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
            self.KeepInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
            self.DelInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )

        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
            self.KeepInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
            self.DelInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )

        return

    def Dump(self):
        # Summarise the keep/delete sets for files found in both import and
        # storage paths
        if len(self.ip_to_sp_dups_keep) > 0:
            print( "############ Files that are in both Import and Storage Paths ###########")
        cnt=0
        for h in self.ip_to_sp_dups_keep:
            cnt +=1
            if len(self.ip_to_sp_dups_del[h])>2:
                print( f"hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
                for d in self.ip_to_sp_dups_del[h]:
                    print( f"Del: {d}" )
        print( f"{cnt} sets of duplicate files with at least 1 copy to delete; any set with more than 2 copies to delete is printed above explicitly" )
        return

"""
|
||||||
|
if row.hash not in dups:
|
||||||
|
dups[row.hash]=[]
|
||||||
|
dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
|
||||||
|
dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
|
||||||
|
else:
|
||||||
|
# process path1 / fname1 -- if that combo is not in the dups[hash], add it
|
||||||
|
found=0
|
||||||
|
for dup in dups[row.hash]:
|
||||||
|
if dup['f'] == row.fname1 and dup['d'] == TrimmedPath(prefix, row.path1):
|
||||||
|
found=1
|
||||||
|
continue
|
||||||
|
if not found:
|
||||||
|
dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
|
||||||
|
|
||||||
|
# process path2 / fname2 -- if that combo is not in the dups[hash], add it
|
||||||
|
found=0
|
||||||
|
for dup in dups[row.hash]:
|
||||||
|
if dup['f'] == row.fname2 and dup['d'] == TrimmedPath(prefix, row.path2):
|
||||||
|
found=1
|
||||||
|
continue
|
||||||
|
if not found:
|
||||||
|
dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
|
||||||
|
|
||||||
|
@app.route("/fix_dups", methods=["POST"])
|
||||||
|
def fix_dups():
|
||||||
|
rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
|
||||||
|
|
||||||
|
if rows.returns_rows == False:
|
||||||
|
st.SetAlert("success")
|
||||||
|
st.SetMessage(f"Err, no dups - should now clear the FE 'danger' message?")
|
||||||
|
return render_template("base.html")
|
||||||
|
|
||||||
|
jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
|
||||||
|
path=[jex.value for jex in jexes if jex.name == "path"][0]
|
||||||
|
prefix = SymlinkName(path,path+'/')
|
||||||
|
if 'pagesize' not in request.form:
|
||||||
|
pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
|
||||||
|
else:
|
||||||
|
pagesize=int(request.form['pagesize'])
|
||||||
|
dups={}
|
||||||
|
for row in rows:
|
||||||
|
AddDup( prefix+'/', row, dups )
|
||||||
|
|
||||||
|
d1=""
|
||||||
|
d2=""
|
||||||
|
did1=""
|
||||||
|
did2=""
|
||||||
|
str=""
|
||||||
|
dup_cnt=1
|
||||||
|
preferred={}
|
||||||
|
per_file_dups=[]
|
||||||
|
per_path_dups=[]
|
||||||
|
hashes=""
|
||||||
|
overall_dup_cnt=0
|
||||||
|
overall_dup_sets=0
|
||||||
|
for hash in dups:
|
||||||
|
# more than 2 files (just ask per file) OR
|
||||||
|
# only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
|
||||||
|
# content same, filename different (just ask per file)
|
||||||
|
if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
|
||||||
|
per_file_dups.append(dups[hash])
|
||||||
|
overall_dup_cnt += len(dups[hash])
|
||||||
|
overall_dup_sets += 1
|
||||||
|
for el in dups[hash]:
|
||||||
|
if re.search( '\d{4}/\d{8}', el['d']):
|
||||||
|
preferred[hash] = el['id']
|
||||||
|
if overall_dup_cnt<5:
|
||||||
|
print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
|
||||||
|
# by here we have only 2 files, with the same name, different path
|
||||||
|
# (MOST COMMON, and I think we dont care per file, just per path)
|
||||||
|
elif d1 != dups[hash][0]['d']:
|
||||||
|
if d1 != '':
|
||||||
|
overall_dup_cnt += dup_cnt
|
||||||
|
overall_dup_sets += 1
|
||||||
|
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
|
||||||
|
dup_cnt=1
|
||||||
|
d1 = dups[hash][0]['d']
|
||||||
|
d2 = dups[hash][1]['d']
|
||||||
|
did1 = dups[hash][0]['did']
|
||||||
|
did2 = dups[hash][1]['did']
|
||||||
|
str=f"duplicates found in {d1} and {d2}"
|
||||||
|
hashes = f"{hash},"
|
||||||
|
else:
|
||||||
|
dup_cnt += 1
|
||||||
|
hashes += f"{hash},"
|
||||||
|
|
||||||
|
if d1 != '':
|
||||||
|
overall_dup_cnt += dup_cnt
|
||||||
|
overall_dup_sets += dup_cnt
|
||||||
|
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
|
||||||
|
"""
|
||||||
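For reference, a minimal, hypothetical sketch of exercising the new class outside the Flask app and database: the Row fields mirror the column aliases returned by the duplicate query in fix_dups, and the path prefixes are made up for illustration.

from collections import namedtuple
from dups import Duplicates

Row = namedtuple("Row", "id1 hash path1 did1 fname1 id2 path2 did2 fname2")

D = Duplicates.__new__(Duplicates)          # skip __init__, which reads Settings from the DB
D.storage_paths = ["/mnt/photos/storage/"]  # hypothetical prefixes, normally built via SymlinkName()
D.import_paths  = ["/mnt/photos/import/"]
D.all_paths     = D.storage_paths + D.import_paths
D.ip_to_sp_dups_keep = {}
D.ip_to_sp_dups_del = {}

# Same file present in a storage dir and an import dir: keep storage, delete import
row = Row(id1=1, hash="abc123", path1="/mnt/photos/storage/2023/20230101/", did1=10, fname1="img.jpg",
          id2=2, path2="/mnt/photos/import/", did2=20, fname2="img.jpg")
D.AddDup(row)
D.Dump()    # reports one duplicate set with the import copy marked for deletion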

files.py (16 lines changed)
@@ -25,6 +25,7 @@ from person import Person, PersonRefimgLink
 from refimg import Refimg
 from settings import Settings
 from shared import SymlinkName
+from dups import Duplicates

 ################################################################################
 # Class describing File in the database, and via sqlalchemy, connected to the DB as well
@@ -276,9 +277,11 @@ def fix_dups():
         pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
     else:
         pagesize=int(request.form['pagesize'])
-    dups={}
+    D=Duplicates()
     for row in rows:
-        AddDup( prefix+'/', row, dups )
+        D.AddDup( row )

+    print( D.Dump() )
+
     d1=""
     d2=""
@@ -292,6 +295,13 @@ def fix_dups():
     hashes=""
     overall_dup_cnt=0
     overall_dup_sets=0
+
+    return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
+
+    """
+    dups={}
+    for row in rows:
+        AddDup( prefix+'/', row, dups )
     for hash in dups:
         # more than 2 files (just ask per file) OR
         # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
@@ -330,6 +340,8 @@ def fix_dups():

     return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )

+    """
+
 @app.route("/rm_dups", methods=["POST"])
 def rm_dups():