Update file to use new pylint settings, add type hints, and use docstrings in Google format with a partial OpenAPI spec
dups.py
@@ -1,47 +1,36 @@
from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
""" functions provided to process duplicate photo data from DB into usable data structures """
import re

################################################################################
# Local Class imports
################################################################################
from settings import Settings
from shared import SymlinkName, PA
from shared import PA
from path import PathType

################################################################################
# DupRow class is a simple 'struct' to keep data per duplicate file / just to
# avoid using python list/dicts intermixed, and be able to consistently use
# dot-notation of fields
################################################################################
class DupRow(PA):
    def __init__(self, hash, file, dir, did, fid):
    """ DupRow class is a simple 'struct' to keep data per duplicate file

    Created just to avoid using python list/dicts intermixed, and be able to consistently use
    dot-notation of fields
    """

    def __init__(self, _hash, file, _dir, did, fid):
        ### DupRow Attributes -- note, simple class, no methods ###
        self.h=hash
        self.h=_hash
        self.f=file
        self.d=dir
        self.d=_dir
        self.did=did
        self.id=fid
        return
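
# Illustrative usage sketch (not part of the original file; the hash, filename, directory and ids
# below are made-up values) showing the dot-notation access that DupRow exists to provide:
example_dup = DupRow("9a0364b9e99bb480dd25e1f0284c8555", "IMG_0001.jpg", "import/2020/unsorted", 3, 42)
print(example_dup.h, example_dup.f, example_dup.d, example_dup.did, example_dup.id)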

################################################################################
# DupPathRow class is a simple 'struct' to keep data per files in duplicate paths
# just to avoid using python list/dicts intermixed, and be able to consistently use
# dot-notation of fields
################################################################################
class DupPathRow(PA):
    """ DupPathRow class is a simple 'struct' to keep data per files in duplicate paths

    Created just to avoid using python list/dicts intermixed, and be able to consistently use
    dot-notation of fields
    """
    def __init__(self, count, d1, d2, did1, did2, hashes ):
        self.count=count
        self.d1=d1
@@ -51,33 +40,37 @@ class DupPathRow(PA):
        self.hashes=hashes
        return
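
# Illustrative sketch (not part of the original file; directory names, ids and hashes are made up):
# a DupPathRow summarises one pair of directories that hold the same set of duplicate files.
example_pair = DupPathRow(2, "2019/20190704", "import/backlog", 11, 12,
                          ["9a0364b9e99bb480dd25e1f0284c8555", "0cc175b9c0f1b6a831c399e269772661"])
print(example_pair.count, example_pair.d1, example_pair.hashes)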

################################################################################
# Duplicates class is used with one instance/object to process all the
# 'duplicate' data from the Database, and parse it into more usable data
# structures. This is needed also, as the database content shows duplicates
# more than once, e.g.
# file1 and file2 are a duplicate, then later file2 and file1 are 'another' duplicate
# The class passes over the data in 2 passes. The first pass in AddDup() finds
# any files in the import and storage path and marks the storage ones to keep,
# the import ones to delete. Anything else is either a set of files duplicated
# inside the import path or a set of files duplicated in the storage path
# The first pass simply concatenates these into a data structure
# (im_same_dups) that contains all the duplicates with a key of the md5 hash
#
# The second pass () processes these duplicates to see if there are any in the
# storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so mark these to
# keep and the rest to be deleted.
#
# After the 2 passes, we have data structures that allow the web to break up
# the duplicates into batches to process:
# 1) auto delete any in the import path that are also in the storage path
#    - careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
# 2) auto delete any in the storage path that are in a set where 1 of them matches the 'YYYY/YYYYMMDD' format, the rest are deleted
# 3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
# 4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
################################################################################
class Duplicates(PA):
    """ Duplicates class that has methods to process DB duplicate photo data

    The Duplicates class is used with one instance/object to process all the
    'duplicate' data from the Database, and parse it into more usable data
    structures. This is needed also, as the database content shows duplicates
    more than once, e.g.
    file1 and file2 are a duplicate, then later file2 and file1 are 'another' duplicate
    The class passes over the data in 2 passes. The first pass in AddDup() finds
    any files in the import and storage path and marks the storage ones to keep,
    the import ones to delete. Anything else is either a set of files duplicated
    inside the import path or a set of files duplicated in the storage path
    The first pass simply concatenates these into a data structure
    (im_same_dups) that contains all the duplicates with a key of the md5 hash

    The second pass () processes these duplicates to see if there are any in the
    storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so mark these to
    keep and the rest to be deleted.

    After the 2 passes, we have data structures that allow the web to break up
    the duplicates into batches to process:
    1) auto delete any in the import path that are also in the storage path
       - careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
    2) auto delete any in the storage path that are in a set where 1 of them matches the 'YYYY/YYYYMMDD' format, the rest are deleted
    3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
    4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
    """

    def __init__(self):
        ### Duplicates Attributes ###
        """ initialises all the Duplicates Attributes """
        self.ip_to_sp_dups_keep={}
        self.ip_to_sp_dups_del={}
        self.dups_to_process={}
@@ -90,33 +83,59 @@ class Duplicates(PA):
        self.uniq_dups=0
        self.total_dups=0

        self.import_ptype_id = PathType.query.filter(PathType.name=='Import').first().id
        self.storage_ptype_id = PathType.query.filter(PathType.name=='Storage').first().id
        self.import_ptype_id = PathType.query.filter(PathType.name=="Import").first().id
        self.storage_ptype_id = PathType.query.filter(PathType.name=="Storage").first().id

    # is this file in the import path?
    def InImportPath( self, path_type ):
        """ Is the path being checked an import path

        Args:
            path_type (int): db key for the path_type of the path being checked
        Returns:
            bool: True if this path is an import path
        """
        if path_type == self.import_ptype_id:
            return True
        return False

    # is this file in the storage path?
    def InStoragePath( self, path_type ):
        """ Is the path being checked a storage path

        Args:
            path_type (int): db key for the path_type of the path being checked
        Returns:
            bool: True if this path is a storage path
        """
        if path_type == self.storage_ptype_id:
            return True
        return False
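
    # Illustrative sketch (not from the commit; `row` and its path_type fields are hypothetical):
    # both predicates simply compare a row's path_type id against the PathType ids cached in __init__.
    #   dups = Duplicates()
    #   if dups.InStoragePath(row.path_type1) and dups.InImportPath(row.path_type2):
    #       ...  # the storage copy is kept, the import copy is deleted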

    # this stores this object into the keep from same path list (sometimes there can be more than 1 SP, e.g. SP to SP to IP)
    # for now, by not dealing with the extra SP, we will just delete the IP, and force a check_dups after deleting, it will then
    # pick up and process the SP to SP - if still needed -- if there is only SP1 to SP2, then the per_path_dup will pick it up and
    # I believe this will all work, but doesn't hurt to do an extra check_dups again
    def KeepInIPSPDups( self, obj ):
        """ stores this file into the "keep from same path" list

        sometimes there can be more than 1 SP, e.g. SP to SP to IP
        for now, by not dealing with the extra SP, we will just delete the IP, and force a check_dups after deleting, it will then
        pick up and process the SP to SP - if still needed -- if there is only SP1 to SP2, then the per_path_dup will pick it up and
        I believe this will all work, but doesn't hurt to do an extra check_dups again

        Args:
            obj (DupRow): file that will be stored into the "keep from same path" list
        Returns:
            None
        """
        if obj.h not in self.ip_to_sp_dups_keep:
            self.ip_to_sp_dups_keep[obj.h]= obj
        return

    # this stores this object into the Delete from same path list (if it is not
    # already there)
    def DelInIPSPDups( self, obj ):
        """ stores this object into the Delete from same path list (if it is not already there)

        Args:
            obj (DupRow): file that will be stored into the "Delete from same path" list
        Returns:
            None
        """

        if obj.h not in self.ip_to_sp_dups_del:
            self.ip_to_sp_dups_del[obj.h]=[]
            self.ip_to_sp_dups_del[obj.h].append( obj )
@@ -127,10 +146,21 @@ class Duplicates(PA):
            self.ip_to_sp_dups_del[obj.h].append( obj )
        return
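
    # Illustrative sketch (not from the commit; the hash key is made up) of the shapes built above:
    # one kept DupRow per hash, plus a list of DupRows to delete for that same hash, e.g.
    #   ip_to_sp_dups_keep = {"9a0364b9...": <DupRow in the storage path>}
    #   ip_to_sp_dups_del  = {"9a0364b9...": [<DupRow in the import path>, ...]}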

    # this function takes a duplicate file (in the import path and the storage path)
    # and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups()
    # and then puts the import path file in the delete list (self.ip_to_sp_dups_del) via self.DelInIPSPDups()
    def DupInImportAndStoragePath( self, row, dr1, dr2 ):
        """ handles a duplicate file in import and storage paths, and stores them into keep and delete lists

        this function takes a duplicate file (in the import path and the storage path)
        and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups()
        and then puts the import path file in the delete list (self.ip_to_sp_dups_del) via self.DelInIPSPDups()

        Args:
            row (ORM row): row from the database with a dup pair in dir1 & dir2
            dr1 (DupRow): dup data for file 1 of a duplicate
            dr2 (DupRow): dup data for file 2 of a duplicate

        Returns:
            bool: True if file is in both import and storage path, False otherwise
        """
        if self.InStoragePath(row.path_type1) and self.InImportPath(row.path_type2):
            self.KeepInIPSPDups( dr1 )
            self.DelInIPSPDups( dr2 )
@@ -180,7 +210,7 @@ class Duplicates(PA):

    # AddDupPath: takes a row from the database effectively with a dup pair in dir1 & dir2
    # we process these into appropriate data structures on this second pass
    # working through if a dir is in the storage path and is
    def AddDupPath(self, hash):
        # this gets complex, if this hash is also in a shared imp / sp - then don't deal with it now, let the imp files be deleted and
        # the repeat check_dups validation step catch it as a cleaner (potential) for still more duplicates just in sp
@@ -198,9 +228,9 @@ class Duplicates(PA):
        # FIXME: what if both do? what if one is in SP and the other not, etc...
        if new:
            self.per_path_dups.append( dpr )
            if re.search( r'\d{4}/\d{8}', dpr.d1):
            if re.search( r"\d{4}/\d{8}", dpr.d1):
                self.preferred_path[dpr.did1]=1
            if re.search( r'\d{4}/\d{8}', dpr.d2):
            if re.search( r"\d{4}/\d{8}", dpr.d2):
                self.preferred_path[dpr.did2]=1
        return True
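
        # Illustrative sketch (not from the commit): the 'YYYY/YYYYMMDD' preferred-path test used above.
        #   re.search(r"\d{4}/\d{8}", "2019/20190704")        -> match  => directory is preferred
        #   re.search(r"\d{4}/\d{8}", "import/backlog/misc")  -> None   => directory is not preferred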

@@ -216,7 +246,7 @@ class Duplicates(PA):
        if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f):
            self.per_file_dups.append(self.dups_to_process[hash])
            for el in self.dups_to_process[hash]:
                if re.search( r'\d{4}/\d{8}', el.d):
                if re.search( r"\d{4}/\d{8}", el.d):
                    self.preferred_file[hash] = el.id
        else:
            # will force ask per path
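
            # Illustrative restatement (not from the commit) of the branch above: a duplicate set needs a
            # per-file decision when there are 3+ copies, two copies share a dir, or the filenames differ;
            # otherwise (exactly 2 copies, same filename, different dirs) it is handled per path, e.g.
            #   def needs_per_file_decision(dups):
            #       return len(dups) > 2 or dups[0].d == dups[1].d or dups[0].f != dups[1].f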

@@ -232,9 +262,9 @@ class Duplicates(PA):
        if len(self.ip_to_sp_dups_keep) > 0:
            print( "############ Files that are in both Import and Storage Paths ###########")
            for h in self.ip_to_sp_dups_keep:
                print( f"hash={h} keep 1 of {len(self.ip_to_sp_dups_del[h])+1}, keep: {self.ip_to_sp_dups_keep[h]} | ", end='' )
                print( f"hash={h} keep 1 of {len(self.ip_to_sp_dups_del[h])+1}, keep: {self.ip_to_sp_dups_keep[h]} | ", end="" )
                for d in self.ip_to_sp_dups_del[h]:
                    print( f"Del: {d}", end='' )
                    print( f"Del: {d}", end="" )
                print( "" )
            print( f"{len(self.ip_to_sp_dups_keep)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )

@@ -247,9 +277,9 @@ class Duplicates(PA):
        if len(self.preferred_file) > 0:
            print( " We have preferred (regexp matched) ###########")
            for h in self.preferred_file:
                print( f"hash={h}, keep this one: {self.preferred_file[h]} from ", end='' )
                print( f"hash={h}, keep this one: {self.preferred_file[h]} from ", end="" )
                for d in self.dups_to_process[h]:
                    print( f"{d.id}, ", end='' )
                    print( f"{d.id}, ", end="" )
                print ("")
            print( f"which is a total of {len(self.preferred_file)} duplicate files we will keep as they match the regexp" )
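
# Illustrative driver sketch (not part of the commit): how the two passes described in the class
# docstring might be wired together. AddDup's exact signature is not shown in these hunks, so the
# call below is an assumption, and `dup_rows` stands in for the DB rows of duplicate pairs.
#   dups = Duplicates()
#   for row in dup_rows:                  # first pass: split import-vs-storage pairs into keep/delete lists
#       dups.AddDup(row)
#   for h in list(dups.dups_to_process):  # second pass: prefer 'YYYY/YYYYMMDD' storage dirs
#       dups.AddDupPath(h)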