from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, redirect, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
from status import st, Status
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
import re

################################################################################
# Local Class imports
################################################################################
from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName


class DupRow:
    def __init__(self, hash, file, dir, did, fid):
        ### DupRow Attributes -- simple value holder ###
        self.h = hash      # content hash
        self.f = file      # file name
        self.d = dir       # trimmed directory path
        self.did = did     # directory entry id
        self.id = fid      # file entry id

    def __repr__(self):
        # readable form for Dump() output
        return f"{self.d}{self.f} (id={self.id})"


class Duplicates:
    def __init__(self):
        ### Duplicates Attributes ###
        self.ip_to_sp_dups_keep = {}   # hash -> DupRow to keep (storage copy)
        self.ip_to_sp_dups_del = {}    # hash -> [DupRow] to delete (import copies)
        self.in_same_dups = {}         # hash -> [DupRow] for sets not split across import and storage
        self.per_file_dups = []
        self.per_path_dups = []
        self.preferred = {}
        self.all_paths = []
        self.storage_paths = []
        self.import_paths = []

        # per storage path, add entries to view
        settings = Settings.query.first()
        paths = settings.storage_path.split("#")
        for path in paths:
            prefix = SymlinkName(path, path + '/')
            self.storage_paths.append(prefix)
            self.all_paths.append(prefix)
        paths = settings.import_path.split("#")
        for path in paths:
            prefix = SymlinkName(path, path + '/')
            self.import_paths.append(prefix)
            self.all_paths.append(prefix)

    def TrimmedPath(self, path):
        # strip the first matching storage/import prefix; startswith() avoids
        # treating path characters as regex metacharacters
        for p in self.all_paths:
            if path.startswith(p):
                return path.replace(p, '', 1)
        return path

    def InImportPath(self, path):
        for p in self.import_paths:
            if path.startswith(p):
                return True
        return False

    def InStoragePath(self, path):
        for p in self.storage_paths:
            if path.startswith(p):
                return True
        return False

    def KeepInSameDups(self, obj):
        if obj.h not in self.ip_to_sp_dups_keep:
            self.ip_to_sp_dups_keep[obj.h] = obj

    def DelInSameDups(self, obj):
        if obj.h not in self.ip_to_sp_dups_del:
            self.ip_to_sp_dups_del[obj.h] = [obj]
            return
        for el in self.ip_to_sp_dups_del[obj.h]:
            if el.id == obj.id:
                return
        self.ip_to_sp_dups_del[obj.h].append(obj)

    def DupInImportAndStoragePath(self, row, dr1, dr2):
        # one copy in a storage path and one in an import path: keep the
        # storage copy, mark the import copy for deletion
        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
            self.KeepInSameDups(dr1)
            self.DelInSameDups(dr2)
            return True
        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
            self.KeepInSameDups(dr2)
            self.DelInSameDups(dr1)
            return True
        return False

    def AddDup(self, row):
        dr1 = DupRow(row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1)
        dr2 = DupRow(row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2)

        if self.DupInImportAndStoragePath(row, dr1, dr2):
            return

        if row.hash not in self.in_same_dups:
            self.in_same_dups[row.hash] = [dr1, dr2]
            return
        # add each path/fname combo only if its entry id is not already
        # recorded for this hash
        if not any(dup.id == row.id1 for dup in self.in_same_dups[row.hash]):
            self.in_same_dups[row.hash].append(dr1)
        if not any(dup.id == row.id2 for dup in self.in_same_dups[row.hash]):
            self.in_same_dups[row.hash].append(dr2)
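    # A worked example of the classification above, with hypothetical paths
    # (assume storage_path "/mnt/photos/" and import_path "/mnt/import/"):
    # a row pairing /mnt/photos/2020/a.jpg with /mnt/import/a.jpg lands in
    # ip_to_sp_dups_keep / ip_to_sp_dups_del (keep the storage copy, delete
    # the import copy), while a row pairing two copies that both live under
    # /mnt/photos/ is accumulated per hash in in_same_dups so the caller can
    # decide which copy survives.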
    def Dump(self):
        if len(self.ip_to_sp_dups_keep) > 0:
            print("############ Files that are in both Import and Storage Paths ###########")
            cnt = 0
            for h in self.ip_to_sp_dups_keep:
                cnt += 1
                if len(self.ip_to_sp_dups_del[h]) > 2:
                    print(f"hash={h}, keep: {self.ip_to_sp_dups_keep[h]}")
                    for d in self.ip_to_sp_dups_del[h]:
                        print(f"Del: {d}")
            print(f"{cnt} sets of duplicate files with at least 1 file to delete; sets with more than 2 entries are printed explicitly above")
        if len(self.in_same_dups) > 0:
            print("############ Duplicate Files that are in the same Path ###########")
            cnt = 0
            for h in self.in_same_dups:
                cnt += 1
                if len(self.in_same_dups[h]) > 2:
                    print(f"hash={h}, keep 1 of these: ", end='')
                    for d in self.in_same_dups[h]:
                        print(f"{d.id}, ", end='')
                    print("")
            print(f"{cnt} sets of duplicate files with at least 1 file to delete; sets with more than 2 entries are printed explicitly above")


# Legacy route, disabled below; it uses an earlier function-style AddDup()
# signature and is kept for reference only.
"""
@app.route("/fix_dups", methods=["POST"])
def fix_dups():
    rows = db.engine.execute(
        "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, "
        "e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, "
        "e2.name as fname2 "
        "from entry e1, file f1, dir d1, entry_dir_link edl1, "
        "entry e2, file f2, dir d2, entry_dir_link edl2 "
        "where e1.id = f1.eid and e2.id = f2.eid "
        "and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id "
        "and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id "
        "and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb "
        "order by path1, fname1")
    if not rows.returns_rows:
        st.SetAlert("success")
        st.SetMessage("Err, no dups - should now clear the FE 'danger' message?")
        return render_template("base.html")

    jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(
        PA_JobManager_Message.id == request.form['fe_msg_id']).all()
    path = [jex.value for jex in jexes if jex.name == "path"][0]
    prefix = SymlinkName(path, path + '/')
    if 'pagesize' not in request.form:
        pagesize = int([jex.value for jex in jexes if jex.name == "pagesize"][0])
    else:
        pagesize = int(request.form['pagesize'])

    dups = {}
    for row in rows:
        AddDup(prefix + '/', row, dups)

    d1 = ""
    d2 = ""
    did1 = ""
    did2 = ""
    str = ""
    dup_cnt = 1
    preferred = {}
    per_file_dups = []
    per_path_dups = []
    hashes = ""
    overall_dup_cnt = 0
    overall_dup_sets = 0
    for hash in dups:
        # more than 2 files (just ask per file) OR
        # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
        # content same, filename different (just ask per file)
        if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
            per_file_dups.append(dups[hash])
            overall_dup_cnt += len(dups[hash])
            overall_dup_sets += 1
            for el in dups[hash]:
                if re.search(r'\d{4}/\d{8}', el['d']):
                    preferred[hash] = el['id']
                    if overall_dup_cnt < 5:
                        print(f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}")
        # by here we have only 2 files, with the same name, different path
        # (MOST COMMON, and I think we dont care per file, just per path)
        elif d1 != dups[hash][0]['d']:
            if d1 != '':
                overall_dup_cnt += dup_cnt
                overall_dup_sets += 1
                per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2,
                                      'did1': did1, 'did2': did2, 'hashes': hashes})
            dup_cnt = 1
            d1 = dups[hash][0]['d']
            d2 = dups[hash][1]['d']
            did1 = dups[hash][0]['did']
            did2 = dups[hash][1]['did']
            str = f"duplicates found in {d1} and {d2}"
            hashes = f"{hash},"
        else:
            dup_cnt += 1
            hashes += f"{hash},"
    if d1 != '':
        overall_dup_cnt += dup_cnt
        overall_dup_sets += dup_cnt
        per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2,
                              'did1': did1, 'did2': did2, 'hashes': hashes})
"""
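# A minimal usage sketch for the Duplicates class above. Assumptions: a Flask
# application context is available and the Settings table holds a populated
# row; `dup_rows` is a hypothetical stand-in for the result of the
# duplicate-finding SQL shown in the disabled fix_dups() route, i.e. rows
# carrying hash, path1/path2, fname1/fname2, did1/did2 and id1/id2 attributes.
#
#     with app.app_context():
#         dups = Duplicates()
#         for row in dup_rows:   # one row per duplicate file pair
#             dups.AddDup(row)
#         dups.Dump()            # print keep/delete candidates per hash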