# photoassistant/dups.py
from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, redirect, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
from status import st, Status
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
import re
################################################################################
# Local Class imports
################################################################################
from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName


class DupRow:
    def __init__(self, hash, file, dir, did, fid):
        ### DupRow Attributes -- note, simple class, no methods ###
        self.h = hash
        self.f = file
        self.d = dir
        self.did = did
        self.id = fid


class Duplicates:
    def __init__(self):
        ### Duplicates Attributes ###
        # hash -> DupRow to keep (the storage-path copy) when a duplicate spans import and storage paths
        self.ip_to_sp_dups_keep = {}
        # hash -> list of DupRows to delete (the import-path copies)
        self.ip_to_sp_dups_del = {}
        # hash -> list of DupRows for duplicate pairs that do not straddle an import and a storage path
        self.in_same_dups = {}
        self.per_file_dups = []
        self.per_path_dups = []
        self.preferred = {}
        self.all_paths = []
        self.storage_paths = []
        self.import_paths = []
        # per storage path, add entries to view
        settings = Settings.query.first()
        paths = settings.storage_path.split("#")
        for path in paths:
            prefix = SymlinkName(path, path + '/')
            self.storage_paths.append(prefix)
            self.all_paths.append(prefix)
        paths = settings.import_path.split("#")
        for path in paths:
            prefix = SymlinkName(path, path + '/')
            self.import_paths.append(prefix)
            self.all_paths.append(prefix)
    def TrimmedPath(self, path):
        # strip the matching import/storage prefix from an absolute path
        # (plain prefix match; the prefixes are literal paths, not patterns)
        for p in self.all_paths:
            if path.startswith(p):
                return path.replace(p, '')
        return path

    def InImportPath(self, path):
        for p in self.import_paths:
            if path.startswith(p):
                return True
        return False

    def InStoragePath(self, path):
        for p in self.storage_paths:
            if path.startswith(p):
                return True
        return False
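
    # Example (hypothetical values, for illustration only): if storage_paths contains
    # "/mnt/photos/storage/", then InStoragePath("/mnt/photos/storage/2020/img.jpg")
    # returns True and TrimmedPath("/mnt/photos/storage/2020/img.jpg") returns "2020/img.jpg".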

    def KeepInSameDups(self, obj):
        if obj.h not in self.ip_to_sp_dups_keep:
            self.ip_to_sp_dups_keep[obj.h] = obj
        return

    def DelInSameDups(self, obj):
        if obj.h not in self.ip_to_sp_dups_del:
            self.ip_to_sp_dups_del[obj.h] = []
            self.ip_to_sp_dups_del[obj.h].append(obj)
        else:
            for el in self.ip_to_sp_dups_del[obj.h]:
                if el.id == obj.id:
                    return
            self.ip_to_sp_dups_del[obj.h].append(obj)
        return

    def DupInImportAndStoragePath(self, row, dr1, dr2):
        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
            self.KeepInSameDups(dr1)
            self.DelInSameDups(dr2)
            return True
        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
            self.KeepInSameDups(dr2)
            self.DelInSameDups(dr1)
            return True
        return False

    def AddDup(self, row):
        dr1 = DupRow(row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1)
        dr2 = DupRow(row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2)
        if self.DupInImportAndStoragePath(row, dr1, dr2):
            return
        if row.hash not in self.in_same_dups:
            self.in_same_dups[row.hash] = []
            self.in_same_dups[row.hash].append(dr1)
            self.in_same_dups[row.hash].append(dr2)
        else:
            # process path1 / fname1 -- if that combo is not in the dups[hash], add it
            found = 0
            for dup in self.in_same_dups[row.hash]:
                if dup.id == row.id1:
                    found = 1
                    break
            if not found:
                self.in_same_dups[row.hash].append(dr1)
            # process path2 / fname2 -- if that combo is not in the dups[hash], add it
            found = 0  # reset the flag, otherwise a hit on id1 suppresses the id2 check
            for dup in self.in_same_dups[row.hash]:
                if dup.id == row.id2:
                    found = 1
                    break
            if not found:
                self.in_same_dups[row.hash].append(dr2)
        return

    def Dump(self):
        if len(self.ip_to_sp_dups_keep) > 0:
            print("############ Files that are in both Import and Storage Paths ###########")
            cnt = 0
            for h in self.ip_to_sp_dups_keep:
                cnt += 1
                if len(self.ip_to_sp_dups_del[h]) > 2:
                    k = self.ip_to_sp_dups_keep[h]
                    print(f"hash={h}, keep: {k.d}/{k.f} (id={k.id})")
                    for d in self.ip_to_sp_dups_del[h]:
                        print(f"Del: {d.d}/{d.f} (id={d.id})")
            print(f"{cnt} sets of duplicate files with at least 1 file to delete; sets with more than 2 files to delete are printed above explicitly")
        if len(self.in_same_dups) > 0:
            print("############ Duplicate Files that are in the same Path ###########")
            cnt = 0
            for h in self.in_same_dups:
                cnt += 1
                if len(self.in_same_dups[h]) > 2:
                    print(f"hash={h}, keep 1 of these: ", end='')
                    for d in self.in_same_dups[h]:
                        print(f"{d.id}, ", end='')
                    print("")
            print(f"{cnt} sets of duplicate files with at least 1 file to delete; sets with more than 2 copies are printed above explicitly")
        return
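
# Usage sketch (illustration only, not called anywhere in this module): AddDup() expects a
# row object whose attributes match the SQL column aliases used in the disabled query below
# (hash, id1, fname1, path1, did1, id2, fname2, path2, did2). The namedtuple and the paths
# here are hypothetical stand-ins for a real result row, and Duplicates() still needs the
# app's database/Settings to construct.
#
#   from collections import namedtuple
#   Row = namedtuple("Row", "hash id1 fname1 path1 did1 id2 fname2 path2 did2")
#   dups = Duplicates()
#   dups.AddDup(Row(hash="abc123",
#                   id1=1, fname1="img_0001.jpg", path1="/mnt/photos/storage/2020/", did1=10,
#                   id2=2, fname2="img_0001.jpg", path2="/mnt/photos/import/sdcard/", did2=20))
#   dups.Dump()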
"""
@app.route("/fix_dups", methods=["POST"])
def fix_dups():
rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
if rows.returns_rows == False:
st.SetAlert("success")
st.SetMessage(f"Err, no dups - should now clear the FE 'danger' message?")
return render_template("base.html")
jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
path=[jex.value for jex in jexes if jex.name == "path"][0]
prefix = SymlinkName(path,path+'/')
if 'pagesize' not in request.form:
pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
else:
pagesize=int(request.form['pagesize'])
dups={}
for row in rows:
AddDup( prefix+'/', row, dups )
d1=""
d2=""
did1=""
did2=""
str=""
dup_cnt=1
preferred={}
per_file_dups=[]
per_path_dups=[]
hashes=""
overall_dup_cnt=0
overall_dup_sets=0
for hash in dups:
# more than 2 files (just ask per file) OR
# only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
# content same, filename different (just ask per file)
if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
per_file_dups.append(dups[hash])
overall_dup_cnt += len(dups[hash])
overall_dup_sets += 1
for el in dups[hash]:
if re.search( '\d{4}/\d{8}', el['d']):
preferred[hash] = el['id']
if overall_dup_cnt<5:
print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
# by here we have only 2 files, with the same name, different path
# (MOST COMMON, and I think we dont care per file, just per path)
elif d1 != dups[hash][0]['d']:
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += 1
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
dup_cnt=1
d1 = dups[hash][0]['d']
d2 = dups[hash][1]['d']
did1 = dups[hash][0]['did']
did2 = dups[hash][1]['did']
str=f"duplicates found in {d1} and {d2}"
hashes = f"{hash},"
else:
dup_cnt += 1
hashes += f"{hash},"
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += dup_cnt
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
"""