update file to use new pylint settings, added types and docstrings in Google format with partial OpenAPI spec

2023-06-18 22:02:33 +10:00
parent 2767d7872d
commit b636ac08b8
4 changed files with 348 additions and 188 deletions
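
The two mechanical changes that dominate the dups.py diff below follow from the commit's new pylint settings and docstring convention: parameters that shadowed Python builtins (hash, dir) gain a leading underscore, and docstrings move to the Google format with Args/Returns sections. A minimal sketch of that pattern (illustrative only, not code from this repository; the function name and values are hypothetical):

def find_duplicate(_hash: str, _dir: str) -> bool:
    """ Checks whether a file hash has already been recorded for a directory.
    Args:
        _hash (str): md5 hash of the file contents
        _dir (str): directory the file lives in
    Returns:
        bool: True if this hash was already recorded for the directory
    """
    seen = {"d41d8cd98f00b204e9800998ecf8427e": "/storage/2020/20200101"}
    return seen.get(_hash) == _dir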

dups.py (180 changed lines)

@@ -1,47 +1,36 @@
from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
""" functions provided to process duplicate photo data from DB into usable data structures """
import re
################################################################################
# Local Class imports
################################################################################
from settings import Settings
from shared import SymlinkName, PA
from shared import PA
from path import PathType
################################################################################
# DupRow class is a simple 'struct' to keep data per duplicate file / just to
# avoid using python list/dicts intermixed, and be able to consistently use
# dot-notation of fields
################################################################################
class DupRow(PA):
def __init__(self, hash, file, dir, did, fid):
""" DupRow class is a simple 'struct' to keep data per duplicate file
Created just to avoid using python list/dicts intermixed, and be able to consistently use
dot-notation of fields
"""
def __init__(self, _hash, file, _dir, did, fid):
### DupRow Attributes -- note, simple class, no methods ###
self.h=hash
self.h=_hash
self.f=file
self.d=dir
self.d=_dir
self.did=did
self.id=fid
return
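# Editor's sketch (not part of this commit; values are hypothetical): DupRow is a
# plain record, so a duplicate entry is built once and then read with dot notation.
example_row = DupRow("d41d8cd98f00b204e9800998ecf8427e", "img_0001.jpg", "/import/phone_dump", 7, 42)
assert example_row.h == "d41d8cd98f00b204e9800998ecf8427e"
assert example_row.d == "/import/phone_dump" and example_row.id == 42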
################################################################################
# DupPathRow class is a simple 'struct' to keep data per files in duplicate paths
# just to avoid using python list/dicts intermixed, and be able to consistently use
# dot-notation of fields
################################################################################
class DupPathRow(PA):
""" DupPathRow class is a simple 'struct' to keep data per files in duplicate paths
Created just to avoid using python list/dicts intermixed, and be able to consistently use
dot-notation of fields
"""
def __init__(self, count, d1, d2, did1, did2, hashes ):
self.count=count
self.d1=d1
@@ -51,33 +40,37 @@ class DupPathRow(PA):
self.hashes=hashes
return
################################################################################
# Duplicates class is used with one instance/object to process all the
# 'duplicate' data from the Database, and parse it into more usable data
# structures. This is needed also, as the database content shows duplicates
# more than once, e.g.
# file1 and file2 are a duplicate, then later file2 and file 1 are 'another' duplicate
# The class passes over the data in 2 passes. The first pass in AddDup() finds
# any files in the import and storage path and marks the storage ones to keep,
# the import ones to delete. Anything else is either a set of files duplicated
# inside the import path or set of files duplicated in the storage path
# The first pass, simply concatenates these into a data structure
# (im_same_dups) that contains all the duplicates with a key of the md5 hash
#
# The second pass (), processes these duplicates to see if there are any in the
# storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so mark these to
# keep and the rest to be deleted.
#
# After the 2 passes, we have data structures that allow the web to break up
# the duplicates into batches to process:
# 1) auto delete any in the import path that are also in the storage path
# - careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
# 2) auto delete any in the storage path that are in a set where 1 of them match the 'YYYY/YYYYMMDD' format, the rest are deleted
# 3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
# 4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
################################################################################
class Duplicates(PA):
""" Duplicates class that has methods to process DB duplicate photo data
The Duplicates class is used with one instance/object to process all the
'duplicate' data from the Database, and parse it into more usable data
structures. This is needed also, as the database content shows duplicates
more than once, e.g.
file1 and file2 are a duplicate, then later file2 and file 1 are 'another' duplicate
The class passes over the data in 2 passes. The first pass in AddDup() finds
any files in the import and storage path and marks the storage ones to keep,
the import ones to delete. Anything else is either a set of files duplicated
inside the import path or set of files duplicated in the storage path
The first pass, simply concatenates these into a data structure
(im_same_dups) that contains all the duplicates with a key of the md5 hash
The second pass (), processes these duplicates to see if there are any in the
storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so mark these to
keep and the rest to be deleted.
After the 2 passes, we have data structures that allow the web to break up
the duplicates into batches to process:
1) auto delete any in the import path that are also in the storage path
- careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
2) auto delete any in the storage path that are in a set where 1 of them match the 'YYYY/YYYYMMDD' format, the rest are deleted
3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
"""
def __init__(self):
### Duplicates Attributes ###
""" initialises all the Duplicates Attributes """
self.ip_to_sp_dups_keep={}
self.ip_to_sp_dups_del={}
self.dups_to_process={}
@@ -90,33 +83,59 @@ class Duplicates(PA):
self.uniq_dups=0
self.total_dups=0
self.import_ptype_id = PathType.query.filter(PathType.name=='Import').first().id
self.storage_ptype_id = PathType.query.filter(PathType.name=='Storage').first().id
self.import_ptype_id = PathType.query.filter(PathType.name=="Import").first().id
self.storage_ptype_id = PathType.query.filter(PathType.name=="Storage").first().id
# is this file in the import path?
def InImportPath( self, path_type ):
""" Is the path being checked a import path
Args:
path_type (int): db key for the path_type of the path being checked
Returns:
bool: True if this path is an import path
"""
if path_type == self.import_ptype_id:
return True
return False
# is this file in the storage path?
def InStoragePath( self, path_type ):
""" Is the path being checked a storage path
Args:
path_type (int): db key for the path_type of the path being checked
Returns:
bool: True if this path is a storage path
"""
if path_type == self.storage_ptype_id:
return True
return False
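# Editor's sketch (not part of this commit): the same check can also be written as a
# single comparison; a standalone, hypothetical equivalent of the two predicates above:
def is_path_type(path_type: int, wanted_id: int) -> bool:
    """ Returns True when path_type matches wanted_id (illustrative only) """
    return path_type == wanted_id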
# this stores this object into the keep from same path list (sometimes there can be more than 1 SP, e.g SP to SP to IP)
# for now, by not dealing with the extra SP, we will just delete the IP, and force a check_dups after deleting, it will then
# pick up and process the SP to SP - if still needed -- if there is only SP1 to SP2, then the per_path_dup will pick it up and
# I believe this will all work, but doesn't hurt to do an extra check_dups again
def KeepInIPSPDups( self, obj ):
""" stores this file into the "keep from same path" list
sometimes there can be more than 1 SP, e.g. SP to SP to IP
for now, by not dealing with the extra SP, we just delete the IP and force a check_dups after deleting; that will then
pick up and process the SP to SP duplicates if still needed -- if there is only SP1 to SP2, then the per_path_dup handling will pick it up
I believe this all works, but it doesn't hurt to run an extra check_dups again
Args:
obj (DupRow): file that will be stored into the "keep from same path" list
Returns:
None
"""
if obj.h not in self.ip_to_sp_dups_keep:
self.ip_to_sp_dups_keep[obj.h]= obj
return
# this stores this object into the Delete from same path list (if it is not
# already there)
def DelInIPSPDups( self, obj ):
""" stores this object into the Delete from same path list (if it is not already there)
Args:
obj (DupRow): file that will be stored into the "Delete from same path" list
Returns:
None
"""
if obj.h not in self.ip_to_sp_dups_del:
self.ip_to_sp_dups_del[obj.h]=[]
self.ip_to_sp_dups_del[obj.h].append( obj )
@@ -127,10 +146,21 @@ class Duplicates(PA):
self.ip_to_sp_dups_del[obj.h].append( obj )
return
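# Editor's sketch (not part of this commit; hashes and names are hypothetical): the
# "create the list on first sight, then append" pattern above can also be written
# with dict.setdefault, shown here on plain data.
dups_by_hash = {}
for _h, _name in (("abc", "a.jpg"), ("abc", "b.jpg"), ("def", "c.jpg")):
    dups_by_hash.setdefault(_h, []).append(_name)
# dups_by_hash is now {"abc": ["a.jpg", "b.jpg"], "def": ["c.jpg"]}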
# this function takes a duplicate file (in the import path and the storage path)
# and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups()
# and then puts the import path file in the delete list (self.ip_to_sp_dups_keep) via self.DelInIPSPDups()
def DupInImportAndStoragePath( self, row, dr1, dr2 ):
""" handles a duplicate file in import and storage paths, and stores them into keep lists
this function takes a duplicate file (in the import path and the storage path)
and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups()
and then puts the import path file in the delete list (self.ip_to_sp_dups_del) via self.DelInIPSPDups()
Args:
row (ORM row): row from the database with a dup pair in dir1 & dir2
dr1 (DupRow): dup data for file 1 of a duplicate pair
dr2 (DupRow): dup data for file 2 of a duplicate pair
Returns:
bool: True if file is in both import and storage path, False otherwise
"""
if self.InStoragePath(row.path_type1) and self.InImportPath(row.path_type2):
self.KeepInIPSPDups( dr1 )
self.DelInIPSPDups( dr2 )
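# Editor's sketch (not part of this commit; the path-type ids 1 and 2 are hypothetical):
# the decision the branch above implements -- keep the storage-path copy and delete the
# import-path copy, whichever side of the pair it is on.
def keep_or_delete(path_type1: int, path_type2: int, import_id: int = 1, storage_id: int = 2):
    """ Returns which DupRow to keep/delete for an import/storage pair, else None """
    if path_type1 == storage_id and path_type2 == import_id:
        return ("keep dr1", "delete dr2")
    if path_type2 == storage_id and path_type1 == import_id:
        return ("keep dr2", "delete dr1")
    return None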
@@ -180,7 +210,7 @@ class Duplicates(PA):
# AddDupPath: takes a row from the database effectively with a dup pair in dir1 & dir2
# we process these into appropriate data structures on this second pass
# working through if a dir is in the storage path and is
# working through if a dir is in the storage path and is
def AddDupPath(self, hash):
# this gets complex: if this hash is also in a shared imp / sp, then don't deal with it now; let the imp files be deleted and
# the repeat check_dups validation step catch it as a cleaner (potential) pass for still more duplicates just in sp
@@ -198,9 +228,9 @@ class Duplicates(PA):
# FIXME: what if both do? what if one is in SP and the other not, etc...
if new:
self.per_path_dups.append( dpr )
if re.search( r'\d{4}/\d{8}', dpr.d1):
if re.search( r"\d{4}/\d{8}", dpr.d1):
self.preferred_path[dpr.did1]=1
if re.search( r'\d{4}/\d{8}', dpr.d2):
if re.search( r"\d{4}/\d{8}", dpr.d2):
self.preferred_path[dpr.did2]=1
return True
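# Editor's sketch (not part of this commit; paths are hypothetical): how the
# r"\d{4}/\d{8}" search above separates the preferred 'YYYY/YYYYMMDD' storage layout
# from everything else.
import re
for _d in ("/storage/2020/20200801", "/import/phone_dump", "/storage/2020/misc"):
    print(_d, "-> preferred" if re.search(r"\d{4}/\d{8}", _d) else "-> needs a decision")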
@@ -216,7 +246,7 @@ class Duplicates(PA):
if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f):
self.per_file_dups.append(self.dups_to_process[hash])
for el in self.dups_to_process[hash]:
if re.search( r'\d{4}/\d{8}', el.d):
if re.search( r"\d{4}/\d{8}", el.d):
self.preferred_file[hash] = el.id
else:
# will force ask per path
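# Editor's sketch (not part of this commit): the three conditions above that push a set
# of duplicates into per_file_dups -- more than two copies, two copies in the same dir,
# or differing filenames -- written against a hypothetical list of DupRow-like records.
def needs_per_file_decision(rows) -> bool:
    """ Returns True when the user must pick an individual file to keep """
    return len(rows) > 2 or rows[0].d == rows[1].d or rows[0].f != rows[1].f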
@@ -232,9 +262,9 @@ class Duplicates(PA):
if len(self.ip_to_sp_dups_keep) > 0:
print( "############ Files that are in both Import and Storage Paths ###########")
for h in self.ip_to_sp_dups_keep:
print( f"hash={h} keep 1 of {len(self.ip_to_sp_dups_del[h])+1}, keep: {self.ip_to_sp_dups_keep[h]} | ", end='' )
print( f"hash={h} keep 1 of {len(self.ip_to_sp_dups_del[h])+1}, keep: {self.ip_to_sp_dups_keep[h]} | ", end="" )
for d in self.ip_to_sp_dups_del[h]:
print( f"Del: {d}", end='' )
print( f"Del: {d}", end="" )
print( "" )
print( f"{len(self.ip_to_sp_dups_keep)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
@@ -247,9 +277,9 @@ class Duplicates(PA):
if len(self.preferred_file) > 0:
print( " We have preferred (regexp matched) ###########")
for h in self.preferred_file:
print( f"hash={h}, keep this one: {self.preferred_file[h]} from ", end='' )
print( f"hash={h}, keep this one: {self.preferred_file[h]} from ", end="" )
for d in self.dups_to_process[h]:
print( f"{d.id}, ", end='' )
print( f"{d.id}, ", end="" )
print ("")
print( f"which is a total of {len(self.preferred_file)} duplicate files we will keep as they match the regexp" )