from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, redirect, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
from status import st, Status
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
import re

################################################################################
# Local Class imports
################################################################################
from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName

################################################################################
# DupRow is a simple 'struct'-style class that keeps the data for one
# duplicate file. It avoids intermixing python lists/dicts and lets the rest
# of the code consistently use dot-notation on its fields.
class DupRow:

    def __init__(self, hash, file, dir, did, fid):
        ### DupRow Attributes -- note, simple class, no methods ###
        self.h = hash    # md5 hash of the file content
        self.f = file    # file name
        self.d = dir     # (trimmed) directory the file lives in
        self.did = did   # database id of the directory
        self.id = fid    # database id of the file

    def __repr__(self):
        return f"DupRow( id: {self.id}, did: {self.did} )"

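################################################################################
# Illustrative only: the same struct could be expressed as a dataclass, which
# generates __init__ and __repr__ automatically. DupRowDC is a hypothetical
# name and is NOT used by the rest of this module; it is a minimal sketch of
# the alternative design, kept here for comparison.
from dataclasses import dataclass

@dataclass
class DupRowDC:
    h: str      # md5 hash of the file content
    f: str      # file name
    d: str      # (trimmed) directory
    did: int    # database id of the directory
    id: int     # database id of the file
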
################################################################################
# DupPathRow is a simple 'struct'-style class that keeps the data for a pair
# of directories containing duplicate files. Again, it avoids intermixing
# python lists/dicts and lets us consistently use dot-notation on its fields.
class DupPathRow:

    def __init__(self, count, d1, d2, did1, did2, hashes):
        self.count = count     # how many duplicate file pairs share this dir pair
        self.d1 = d1           # first (trimmed) directory
        self.d2 = d2           # second (trimmed) directory
        self.did1 = did1       # database id of the first directory
        self.did2 = did2       # database id of the second directory
        self.hashes = hashes   # comma-separated md5 hashes of the duplicates

    def __repr__(self):
        return f"DupPathRow( did1: {self.did1}, did2: {self.did2} )"

################################################################################
# A single Duplicates instance processes all the 'duplicate' data from the
# database and parses it into more usable data structures. This is needed
# because the database content shows duplicates more than once, e.g. file1
# and file2 are a duplicate, then later file2 and file1 show up as 'another'
# duplicate.
#
# The class passes over the data in 2 passes. The first pass, in AddDup(),
# finds any files present in both the import and storage paths and marks the
# storage ones to keep and the import ones to delete. Anything else is either
# a set of files duplicated inside the import path or a set of files
# duplicated inside the storage path; the first pass simply concatenates
# these into a data structure (dups_to_process) that contains all the
# duplicates keyed by the md5 hash.
#
# The second pass, SecondPass(), processes these duplicates to see if there
# are any in the storage path that follow the pattern 'YYYY/YYYYMMDD' -> if
# so, mark these to keep and the rest to be deleted.
#
# After the 2 passes, we have data structures that allow the web to break up
# the duplicates into batches to process:
# 1) auto delete any in the import path that are also in the storage path
#    - careful here: if we have 2 in the import path and 2+ in the storage
#      path, leave it for manual intervention
# 2) auto delete extras in the storage path for any set where 1 of them
#    matches the 'YYYY/YYYYMMDD' format; the rest are deleted
# 3) a set of directories where there are only 2 duplicate files (with the
#    same file name), just in a different dir - allow the user to choose the
#    dir to keep
# 4) a set of individual files where the user must make a decision (3 or more
#    copies, different filenames, or in the same dir) - allow the user to
#    choose the file to keep
class Duplicates:

    def __init__(self):
        ### Duplicates Attributes ###
        self.ip_to_sp_dups_keep = {}   # hash -> DupRow to keep (storage copy)
        self.ip_to_sp_dups_del = {}    # hash -> [DupRow] to delete (import copies)
        self.dups_to_process = {}      # hash -> [DupRow] still needing a decision
        self.per_file_dups = []        # sets where the user picks a file to keep
        self.per_path_dups = []        # DupPathRow pairs where the user picks a dir
        self.preferred_file = {}       # hash -> file id matching 'YYYY/YYYYMMDD'
        self.preferred_path = {}       # dir id -> 1 if the dir matches 'YYYY/YYYYMMDD'
        self.all_paths = []
        self.storage_paths = []
        self.import_paths = []
        self.overall_dup_cnt = 0
        self.overall_dup_sets = 0

        # pull apart the storage path Setting, and make an array of each
        # prefix for use in TrimmedPath()
        settings = Settings.query.first()
        paths = settings.storage_path.split("#")
        for path in paths:
            prefix = SymlinkName(path, path + '/')
            self.storage_paths.append(prefix)
            self.all_paths.append(prefix)

        # pull apart the import path Setting, and make an array of each
        # prefix for use in TrimmedPath()
        paths = settings.import_path.split("#")
        for path in paths:
            prefix = SymlinkName(path, path + '/')
            self.import_paths.append(prefix)
            self.all_paths.append(prefix)

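    # A hypothetical illustration of the Setting format parsed above: a
    # storage_path value of "/mnt/photos#/mnt/videos" is split on '#' into
    # two paths, SymlinkName() is asked for a prefix for each, and both
    # prefixes land in self.storage_paths and self.all_paths.
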
    # Strip the front of the path: any match on a storage or import path
    # prefix is removed, just to make the path easier to read when we display
    # it in the web page.
    def TrimmedPath(self, path):
        for p in self.all_paths:
            # plain prefix comparison; the prefixes are literal paths, not
            # regexps, so startswith() avoids regex metacharacter surprises
            if path.startswith(p):
                return path.replace(p, '', 1)
        return path

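    # A hypothetical illustration of TrimmedPath(), assuming SymlinkName()
    # yielded the literal prefix '/mnt/photos/' for one of the paths:
    #   TrimmedPath('/mnt/photos/2021/20210101/a.jpg') -> '2021/20210101/a.jpg'
    # A path under no known prefix is returned unchanged.
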
    # is this file in the import path?
    def InImportPath(self, path):
        for p in self.import_paths:
            if path.startswith(p):
                return True
        return False

    # is this file in the storage path?
    def InStoragePath(self, path):
        for p in self.storage_paths:
            if path.startswith(p):
                return True
        return False

    # store this object in the keep list for import/storage duplicates
    # (DDP: could there be more than 1?)
    def KeepInSameDups(self, obj):
        if obj.h not in self.ip_to_sp_dups_keep:
            self.ip_to_sp_dups_keep[obj.h] = obj

    # store this object in the delete list for import/storage duplicates
    # (if it is not already there)
    def DelInSameDups(self, obj):
        if obj.h not in self.ip_to_sp_dups_del:
            self.ip_to_sp_dups_del[obj.h] = []
            self.ip_to_sp_dups_del[obj.h].append(obj)
        else:
            # only append if this file id is not already in the list
            for el in self.ip_to_sp_dups_del[obj.h]:
                if el.id == obj.id:
                    return
            self.ip_to_sp_dups_del[obj.h].append(obj)

    # takes a duplicate file pair (one in the import path, one in the storage
    # path), puts the storage path file in the keep list
    # (self.ip_to_sp_dups_keep) via self.KeepInSameDups(), and puts the
    # import path file in the delete list (self.ip_to_sp_dups_del) via
    # self.DelInSameDups()
    def DupInImportAndStoragePath(self, row, dr1, dr2):
        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
            self.KeepInSameDups(dr1)
            self.DelInSameDups(dr2)
            return True
        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
            self.KeepInSameDups(dr2)
            self.DelInSameDups(dr1)
            return True
        return False

    # AddDup: takes a row from the database, effectively file1 & file2.
    # On this first pass we sort these into the appropriate data structures.
    def AddDup(self, row):
        dr1 = DupRow(row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1)
        dr2 = DupRow(row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2)

        # if in both the import and storage path, just keep the storage path
        # file, and delete the import path file
        if self.DupInImportAndStoragePath(row, dr1, dr2):
            return

        # if we are here, we have duplicates either in the storage path or in
        # the import path

        # if the hash is not in dups_to_process, create the list and append
        if row.hash not in self.dups_to_process:
            self.dups_to_process[row.hash] = []
            self.dups_to_process[row.hash].append(dr1)
            self.dups_to_process[row.hash].append(dr2)
        else:
            # process path1 / fname1 -- if that combo is not already in
            # dups_to_process[hash], add it
            found = 0
            for dup in self.dups_to_process[row.hash]:
                if dup.id == row.id1:
                    found = 1
                    break
            if not found:
                self.dups_to_process[row.hash].append(dr1)

            # process path2 / fname2 -- if that combo is not already in
            # dups_to_process[hash], add it (note: found must be reset here,
            # otherwise a hit on file1 would wrongly suppress adding file2)
            found = 0
            for dup in self.dups_to_process[row.hash]:
                if dup.id == row.id2:
                    found = 1
                    break
            if not found:
                self.dups_to_process[row.hash].append(dr2)

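    # A minimal sketch (hypothetical names, not the real schema) of the shape
    # AddDup() expects: any row object exposing hash, fname1/fname2,
    # path1/path2, did1/did2 and id1/id2 attributes will do, e.g.:
    #
    #   from collections import namedtuple
    #   Row = namedtuple('Row', 'hash fname1 path1 did1 id1 fname2 path2 did2 id2')
    #   dups.AddDup(Row('d41d8cd9...', 'a.jpg', '/import/new', 3, 10,
    #                   'a.jpg', '/store/2021/20210101', 7, 20))
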
    def SecondPass(self):
        print("################################## second pass starting")
        d1 = ""
        d2 = ""
        did1 = ""
        did2 = ""
        dup_cnt = 1
        hashes = ""
        for hash in self.dups_to_process:
            # ask per file when: more than 2 copies (just ask per file), OR
            # only 2 copies with different file names (ask per file), OR only
            # 2 copies in the same dir (so the names must differ, just ask)
            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
                self.per_file_dups.append(self.dups_to_process[hash])
                self.overall_dup_cnt += len(self.dups_to_process[hash])
                self.overall_dup_sets += 1
                for el in self.dups_to_process[hash]:
                    if re.search(r'\d{4}/\d{8}', el.d):
                        self.preferred_file[hash] = el.id
            # by here we have exactly 2 files with the same name in different
            # paths (ask per path); flush the previous dir pair when the dir
            # changes
            elif d1 != self.dups_to_process[hash][0].d:
                if d1 != '':
                    self.overall_dup_cnt += dup_cnt
                    self.overall_dup_sets += 1
                    self.per_path_dups.append(DupPathRow(dup_cnt, d1, d2, did1, did2, hashes))
                    if re.search(r'\d{4}/\d{8}', d1):
                        self.preferred_path[did1] = 1
                    if re.search(r'\d{4}/\d{8}', d2):
                        self.preferred_path[did2] = 1
                    dup_cnt = 1
                d1 = self.dups_to_process[hash][0].d
                d2 = self.dups_to_process[hash][1].d
                did1 = self.dups_to_process[hash][0].did
                did2 = self.dups_to_process[hash][1].did
                hashes = f"{hash},"
            else:
                dup_cnt += 1
                hashes += f"{hash},"

        # flush the final dir pair, if any (counted as 1 set, to match the
        # in-loop accounting above)
        if d1 != '':
            self.overall_dup_cnt += dup_cnt
            self.overall_dup_sets += 1
            self.per_path_dups.append(DupPathRow(dup_cnt, d1, d2, did1, did2, hashes))
        print("#################### second pass FINISHED")

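    # Quick illustration of the 'YYYY/YYYYMMDD' preference regex used above,
    # with hypothetical sample dirs:
    #   re.search(r'\d{4}/\d{8}', '2021/20210101')  -> match (preferred)
    #   re.search(r'\d{4}/\d{8}', 'misc/holiday')   -> None  (not preferred)
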
    # quick debugger to see the data in the data structures
    def Dump(self):
        if len(self.ip_to_sp_dups_keep) > 0:
            print("############ Files that are in both Import and Storage Paths ###########")
            for h in self.ip_to_sp_dups_keep:
                if len(self.ip_to_sp_dups_del[h]) > 2:
                    print(f"hash={h}, keep: {self.ip_to_sp_dups_keep[h]}")
                    for d in self.ip_to_sp_dups_del[h]:
                        print(f"Del: {d}")
            print(f"{len(self.ip_to_sp_dups_keep)} sets of duplicate files to delete at least 1 from; anything with more than 2 to delete is printed above explicitly")

        if len(self.dups_to_process) > 0:
            print("############ Duplicate Files that need to be further processed ###########")
            for h in self.dups_to_process:
                print(f"hash={h}, keep 1 of these: ", end='')
                for d in self.dups_to_process[h]:
                    print(f"{d.id}, ", end='')
                print("")
            print(f"{len(self.dups_to_process)} sets of duplicate files to delete at least 1 from")

        if len(self.preferred_file) > 0:
            for h in self.preferred_file:
                print(f"hash={h}, keep this one: {self.preferred_file[h]} from ", end='')
                for d in self.dups_to_process[h]:
                    print(f"{d.id}, ", end='')
                print("")
            print(f"{len(self.preferred_file)} duplicate files we will keep as they match the regexp")

        if len(self.per_path_dups) > 0:
            for pair in self.per_path_dups:
                print(f"{pair.count} dups in dir1: {pair.did1} dir2: {pair.did2}")
                if pair.did1 in self.preferred_path:
                    print("Keep dir1")
                if pair.did2 in self.preferred_path:
                    print("Keep dir2")
            print(f"{len(self.per_path_dups)} duplicate directory pairs in per path dups")
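
################################################################################
# A minimal usage sketch (assumption: 'duplicate_rows' stands in for the
# iterable of duplicate rows queried from the database elsewhere in the app;
# the exact query is not part of this module):
#
#   dups = Duplicates()
#   for row in duplicate_rows:
#       dups.AddDup(row)      # first pass: sort rows into data structures
#   dups.SecondPass()         # second pass: pick preferred files/paths
#   dups.Dump()               # debug print of the resulting structures
#
# After this, per_file_dups, per_path_dups, preferred_file and preferred_path
# are ready for the web views to batch up and render.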