from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, redirect, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
from status import st, Status
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
import re

################################################################################
# Local Class imports
################################################################################
from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName
from path import PathType


################################################################################
# DupRow is a plain record ('struct') describing one file that takes part in a
# duplicate set.  It exists so the duplicate-handling code can use dot-notation
# consistently instead of mixing lists and dicts.
#
# Attributes (constructor argument in brackets):
#   h   (hash) - hash shared by every file in the duplicate set
#   f   (file) - file name
#   d   (dir)  - directory of the file
#   did (did)  - id of that directory
#   id  (fid)  - id of the file itself
class DupRow:
    def __init__(self, hash, file, dir, did, fid):
        ### DupRow Attributes -- data only, no behaviour beyond __repr__ ###
        self.h, self.f, self.d = hash, file, dir
        self.did, self.id = did, fid

    # short debugging representation: only the file id and directory id
    def __repr__(self):
        return f"DupRow( id: {self.id}, did: {self.did} )"


################################################################################
# DupPathRow is a plain record ('struct') describing a pair of directories that
# hold duplicated files, again so dot-notation can be used throughout.
#
# Attributes:
#   count       - number of duplicate files counted for this directory pair
#   d1, d2      - the two directories
#   did1, did2  - ids of the two directories
#   hashes      - hash (or hashes) of the duplicated content in this pair
class DupPathRow:
    def __init__(self, count, d1, d2, did1, did2, hashes ):
        self.count, self.hashes = count, hashes
        self.d1, self.did1 = d1, did1
        self.d2, self.did2 = d2, did2

    # short debugging representation: only the two directory ids
    def __repr__(self):
        return f"DupPathRow( did1: {self.did1}, did2: {self.did2} )"


################################################################################
# Duplicates class is used with one instance/object to process all the
# 'duplicate' data from the Database, and parse it into more usable data
# structures.  This is needed also, as the database content shows duplicates
# more than once, e.g.
#    file1 and file2 are a duplicate, then later file2 and file1 are 'another' duplicate
# The class passes over the data in 2 passes.  The first pass in AddDup() finds
# any files in the import and storage path and marks the storage ones to keep,
# the import ones to delete.  Anything else is either a set of files duplicated
# inside the import path or a set of files duplicated in the storage path.
# The first pass simply concatenates these into a data structure
# (dups_to_process) that contains all the duplicates with a key of the hash.
#
# The second pass (SecondPass()) processes these duplicates to see if there are
# any in the storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so mark
# these to keep and the rest to be deleted.
#
# After the 2 passes, we have data structures that allow the web to break up
# the duplicates into batches to process:
#  1) auto delete any in the import path that are also in the storage path
#     - careful here, if we have 2 in the import path and 2+ in the storage path, leave it for manual intervention
#  2) auto delete any in the storage path that are in a set where 1 of them match the 'YYYY/YYYYMMDD' format, the rest are deleted
#  3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
#  4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
class Duplicates:
    # directories that look like 'YYYY/YYYYMMDD' are the preferred place to keep a file
    _DATED_DIR_RE = re.compile(r'\d{4}/\d{8}')

    def __init__(self):
        ### Duplicates Attributes ###
        self.ip_to_sp_dups_keep = {}    # hash -> the one storage-path row to keep
        self.ip_to_sp_dups_del = {}     # hash -> list of import-path rows to delete
        self.dups_to_process = {}       # hash -> list of rows that still need a decision
        self.per_file_dups = []         # lists of rows where the user chooses per file
        self.per_path_dups = []         # DupPathRow entries where the user chooses per dir
        self.preferred_file = {}        # hash -> file id living in a 'YYYY/YYYYMMDD' dir
        self.preferred_path = {}        # dir id -> 1 when the dir matches 'YYYY/YYYYMMDD'
        self.hashes_processed = {}      # hash -> 1 for every hash seen by AddDup()
        self.uniq_dups = 0
        self.total_dups = 0
        # file id -> path type id, recorded by AddDup() so SecondPass() can tell
        # which path a file still waiting in dups_to_process lives in
        self._ptype_by_fid = {}
        self.import_ptype_id = PathType.query.filter(PathType.name=='Import').first().id
        self.storage_ptype_id = PathType.query.filter(PathType.name=='Storage').first().id

    # is this file in the import path?  (path_type is a path type id)
    def InImportPath(self, path_type):
        return path_type == self.import_ptype_id

    # is this file in the storage path?  (path_type is a path type id)
    def InStoragePath(self, path_type):
        return path_type == self.storage_ptype_id

    # this stores this object into the keep list, first one seen per hash wins
    # (DDP: sometimes there can be more than 1 SP, e.g SP to SP to IP)
    # for now, by not dealing with the extra SP, we will just delete the IP, and force a check_dups after deleting, it will then
    # pick up and process the SP to SP - if still needed -- if there is only SP1 to SP2, then the per_path_dup will pick it up and
    # I believe this will all work, but doesn't hurt to do an extra check_dups again
    def KeepInIPSPDups(self, obj):
        self.ip_to_sp_dups_keep.setdefault(obj.h, obj)

    # this stores this object into the delete list for its hash (if a row with
    # the same file id is not already there)
    def DelInIPSPDups(self, obj):
        to_delete = self.ip_to_sp_dups_del.setdefault(obj.h, [])
        if all(el.id != obj.id for el in to_delete):
            to_delete.append(obj)

    # this function takes a duplicate pair and, when one file is in the storage
    # path and the other in the import path, puts the storage path file in the
    # keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups() and the
    # import path file in the delete list (self.ip_to_sp_dups_del) via
    # self.DelInIPSPDups().
    # row supplies path_type1 / path_type2; dr1 / dr2 are the matching rows.
    # Returns True when the pair was handled here, False otherwise.
    def DupInImportAndStoragePath(self, row, dr1, dr2):
        if self.InStoragePath(row.path_type1) and self.InImportPath(row.path_type2):
            keep, delete = dr1, dr2
        elif self.InStoragePath(row.path_type2) and self.InImportPath(row.path_type1):
            keep, delete = dr2, dr1
        else:
            return False
        self.KeepInIPSPDups(keep)
        self.DelInIPSPDups(delete)
        return True

    # AddDup: takes a row from the database, effectively file1 & file2
    # (row.hash, row.fname1/2, row.rel_path1/2, row.did1/2, row.id1/2,
    # row.path_type1/2) and files it into the appropriate data structures on
    # this first pass
    def AddDup(self, row):
        self.hashes_processed[row.hash] = 1
        dr1 = DupRow(row.hash, row.fname1, row.rel_path1, row.did1, row.id1)
        dr2 = DupRow(row.hash, row.fname2, row.rel_path2, row.did2, row.id2)
        # remember which path each file is in, SecondPass() needs it for counting
        self._ptype_by_fid[row.id1] = row.path_type1
        self._ptype_by_fid[row.id2] = row.path_type2

        # if in both import and storage path, just keep the storage path file,
        # and del import path file.  This function checks and keeps/dels as needed
        if self.DupInImportAndStoragePath(row, dr1, dr2):
            return

        # if the hash is not in dups_to_process yet, create it with both files
        known = self.dups_to_process.get(row.hash)
        if known is None:
            self.dups_to_process[row.hash] = [dr1, dr2]
            return

        # otherwise add each file only if its id is not already recorded.  Each
        # file gets its own check: previously a single 'found' flag was shared,
        # so finding file1 wrongly suppressed adding a new file2
        for dr in (dr1, dr2):
            if all(dup.id != dr.id for dup in known):
                known.append(dr)

    # AddDupPath: record the 2 files of this hash as a per-directory duplicate.
    # Returns False (recording nothing) when the hash is also an import<->storage
    # duplicate, True once the directory pair has been added / updated.
    def AddDupPath(self, hash):
        # this gets complex, if this hash is also in a shared imp / sp - then dont deal with it now, let the imp files be deleted and
        # the repeat check_dups validation step catch it as a cleaner (potential) for still more duplicates just in sp
        if hash in self.ip_to_sp_dups_keep:
            return False

        first, second = self.dups_to_process[hash][0], self.dups_to_process[hash][1]
        dpr = DupPathRow(2, first.d, second.d, first.did, second.did, hash)

        # same pair of directories already known -> extend it, else add a new one
        for el in self.per_path_dups:
            if el.d1 == dpr.d1 and el.d2 == dpr.d2:
                el.count += 2
                el.hashes = f"{el.hashes},{hash}"
                break
        else:
            self.per_path_dups.append(dpr)

        # a directory matching 'YYYY/YYYYMMDD' is the preferred one to keep
        for directory, dir_id in ((dpr.d1, dpr.did1), (dpr.d2, dpr.did2)):
            if self._DATED_DIR_RE.search(directory):
                self.preferred_path[dir_id] = 1
        return True

    # SecondPass: split dups_to_process into per-file and per-path duplicates,
    # note preferred files/paths and work out uniq_dups / total_dups
    def SecondPass(self):
        # sort out counts (for ip_to_sp - that is all finished)
        self.uniq_dups = len(self.hashes_processed)
        # total starts with 1 copy of everything we keep in sp
        self.total_dups = len(self.ip_to_sp_dups_keep)
        # and then add all those we delete in ip that are in sp
        self.total_dups += sum(len(rows) for rows in self.ip_to_sp_dups_del.values())

        for hash, rows in self.dups_to_process.items():
            # more than 2 files (just ask per file) OR only 2 copies, and files are in same dir (so must be diff name, so just ask) OR content same, filename different (ask per file)
            if len(rows) > 2 or rows[0].f != rows[1].f or rows[0].d == rows[1].d:
                self.per_file_dups.append(rows)
                # the last row in a 'YYYY/YYYYMMDD' dir becomes the preferred file
                for el in rows:
                    if self._DATED_DIR_RE.search(el.d):
                        self.preferred_file[hash] = el.id
                self.total_dups += len(rows)
                # if this combination ALSO has an import path dup, then we have already counted the storage path dup in the earlier keeping count
                if hash in self.ip_to_sp_dups_keep:
                    self.total_dups -= 1
            # only 2 files, with the same name, different path (ask per path)
            elif self.AddDupPath(hash):
                # this dup path is not already being partially handled by an ip <-> sp dup, so count both files
                self.total_dups += 2
            # okay, if we are here, this path combo is also in an IP <-> SP combo.
            # IF, this dup we tried to add was in SP<->SP, then there is another dup
            # to count, if its IP<->IP (as we append these to the del list), then
            # nothing further to count.  The check needs the file's path type: the
            # directory name that used to be passed here could never match
            elif self.InStoragePath(self._ptype_by_fid.get(rows[0].id)):
                self.total_dups += 1

    # quick debugger to see the data in the data structure (prints to stdout)
    def Dump(self):
        if self.ip_to_sp_dups_keep:
            print( "############ Files that are in both Import and Storage Paths ###########")
            for h in self.ip_to_sp_dups_keep:
                print( f"hash={h} keep 1 of {len(self.ip_to_sp_dups_del[h])+1}, keep: {self.ip_to_sp_dups_keep[h]} | ", end='' )
                for d in self.ip_to_sp_dups_del[h]:
                    print( f"Del: {d}", end='' )
                print( "" )
            print( f"{len(self.ip_to_sp_dups_keep)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )

        if self.dups_to_process:
            print( "############ Duplicate Files that are needing to be futher processed ###########")
            for h in self.dups_to_process:
                print( f"hash={h} keep 1 of {len(self.dups_to_process[h])} from: {self.dups_to_process[h]}" )
            print( f"which is a total of {len(self.dups_to_process)} set(s) of duplicate files to keep only 1 of" )

        if self.preferred_file:
            print( " We have preferred (regexp matched) ###########")
            for h in self.preferred_file:
                print( f"hash={h}, keep this one: {self.preferred_file[h]} from ", end='' )
                for d in self.dups_to_process[h]:
                    print( f"{d.id}, ", end='' )
                print( "" )
            print( f"which is a total of {len(self.preferred_file)} duplicate files we will keep as they match the regexp" )

        if self.per_path_dups:
            print( "############ Duplicate Files in Paths that are needing to be futher processed ###########")
            for pair in self.per_path_dups:
                print( f"{pair.count} dups in dir1: {pair.did1} dir2: {pair.did2}" )
                if pair.did1 in self.preferred_path:
                    print("Keep dir1")
                if pair.did2 in self.preferred_path:
                    print("Keep dir2")
            print( f"which is a total of {len(self.per_path_dups)} set(s) of path dups to process" )