Update file to use new pylint settings, add type hints, and use docstrings in Google format with a partial OpenAPI spec
dups.py
@@ -1,47 +1,36 @@
from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
""" functions provided to process duplicate photo data from DB into usable data structures """
import re

################################################################################
# Local Class imports
################################################################################
from settings import Settings
from shared import SymlinkName, PA
from shared import PA
from path import PathType

################################################################################
# DupRow class is a simple 'struct' to keep data per duplicate file / just to
# avoid using python list/dicts intermixed, and be able to consistently use
# dot-notation of fields
################################################################################
class DupRow(PA):
    def __init__(self, hash, file, dir, did, fid):
    """ DupRow class is a simple 'struct' to keep data per duplicate file

    Created just to avoid using python list/dicts intermixed, and be able to consistently use
    dot-notation of fields
    """

    def __init__(self, _hash, file, _dir, did, fid):
        ### DupRow Attributes -- note, simple class, no methods ###
        self.h=hash
        self.h=_hash
        self.f=file
        self.d=dir
        self.d=_dir
        self.did=did
        self.id=fid
        return
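
# Illustrative usage sketch (not part of the original file; the hash, filename, directory and ids
# below are made-up values) showing the dot-notation access that DupRow exists to provide:
example_dup = DupRow("9a0364b9e99bb480dd25e1f0284c8555", "IMG_0001.jpg", "import/2020/unsorted", 3, 42)
print(example_dup.h, example_dup.f, example_dup.d, example_dup.did, example_dup.id)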

################################################################################
# DupPathRow class is a simple 'struct' to keep data per files in duplicate paths
# just to avoid using python list/dicts intermixed, and be able to consistently use
# dot-notation of fields
################################################################################
class DupPathRow(PA):
    """ DupPathRow class is a simple 'struct' to keep data per files in duplicate paths

    Created just to avoid using python list/dicts intermixed, and be able to consistently use
    dot-notation of fields
    """
    def __init__(self, count, d1, d2, did1, did2, hashes ):
        self.count=count
        self.d1=d1
@@ -51,33 +40,37 @@ class DupPathRow(PA):
        self.hashes=hashes
        return
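
# Illustrative sketch (not part of the original file; directory names, ids and hashes are made up):
# a DupPathRow summarises one pair of directories that hold the same set of duplicate files.
example_pair = DupPathRow(2, "2019/20190704", "import/backlog", 11, 12,
                          ["9a0364b9e99bb480dd25e1f0284c8555", "0cc175b9c0f1b6a831c399e269772661"])
print(example_pair.count, example_pair.d1, example_pair.hashes)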

################################################################################
# Duplicates class is used with one instance/object to process all the
# 'duplicate' data from the Database, and parse it into more usable data
# structures. This is needed also, as the database content shows duplicates
# more than once, e.g.
# file1 and file2 are a duplicate, then later file2 and file1 are 'another' duplicate
# The class passes over the data in 2 passes. The first pass in AddDup() finds
# any files in the import and storage path and marks the storage ones to keep,
# the import ones to delete. Anything else is either a set of files duplicated
# inside the import path or a set of files duplicated in the storage path
# The first pass simply concatenates these into a data structure
# (im_same_dups) that contains all the duplicates with a key of the md5 hash
#
# The second pass () processes these duplicates to see if there are any in the
# storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so mark these to
# keep and the rest to be deleted.
#
# After the 2 passes, we have data structures that allow the web to break up
# the duplicates into batches to process:
# 1) auto delete any in the import path that are also in the storage path
#    - careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
# 2) auto delete any in the storage path that are in a set where 1 of them matches the 'YYYY/YYYYMMDD' format, the rest are deleted
# 3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
# 4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
################################################################################
class Duplicates(PA):
    """ Duplicates class that has methods to process DB duplicate photo data

    The Duplicates class is used with one instance/object to process all the
    'duplicate' data from the Database, and parse it into more usable data
    structures. This is needed also, as the database content shows duplicates
    more than once, e.g.
    file1 and file2 are a duplicate, then later file2 and file1 are 'another' duplicate
    The class passes over the data in 2 passes. The first pass in AddDup() finds
    any files in the import and storage path and marks the storage ones to keep,
    the import ones to delete. Anything else is either a set of files duplicated
    inside the import path or a set of files duplicated in the storage path
    The first pass simply concatenates these into a data structure
    (im_same_dups) that contains all the duplicates with a key of the md5 hash

    The second pass () processes these duplicates to see if there are any in the
    storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so mark these to
    keep and the rest to be deleted.

    After the 2 passes, we have data structures that allow the web to break up
    the duplicates into batches to process:
    1) auto delete any in the import path that are also in the storage path
       - careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
    2) auto delete any in the storage path that are in a set where 1 of them matches the 'YYYY/YYYYMMDD' format, the rest are deleted
    3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
    4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
    """

    def __init__(self):
        ### Duplicates Attributes ###
        """ initialises all the Duplicates Attributes """
        self.ip_to_sp_dups_keep={}
        self.ip_to_sp_dups_del={}
        self.dups_to_process={}
@@ -90,33 +83,59 @@ class Duplicates(PA):
        self.uniq_dups=0
        self.total_dups=0

        self.import_ptype_id = PathType.query.filter(PathType.name=='Import').first().id
        self.storage_ptype_id = PathType.query.filter(PathType.name=='Storage').first().id
        self.import_ptype_id = PathType.query.filter(PathType.name=="Import").first().id
        self.storage_ptype_id = PathType.query.filter(PathType.name=="Storage").first().id

    # is this file in the import path?
    def InImportPath( self, path_type ):
        """ Is the path being checked an import path

        Args:
            path_type (int): db key for the path_type of the path being checked
        Returns:
            bool: True if this path is an import path
        """
        if path_type == self.import_ptype_id:
            return True
        return False

    # is this file in the storage path?
    def InStoragePath( self, path_type ):
        """ Is the path being checked a storage path

        Args:
            path_type (int): db key for the path_type of the path being checked
        Returns:
            bool: True if this path is a storage path
        """
        if path_type == self.storage_ptype_id:
            return True
        return False
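
    # Illustrative sketch (not from the commit; `row` and its path_type fields are hypothetical):
    # both predicates simply compare a row's path_type id against the PathType ids cached in __init__.
    #   dups = Duplicates()
    #   if dups.InStoragePath(row.path_type1) and dups.InImportPath(row.path_type2):
    #       ...  # the storage copy is kept, the import copy is deleted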

    # this stores this object into the keep from same path list (sometimes there can be more than 1 SP, e.g. SP to SP to IP)
    # for now, by not dealing with the extra SP, we will just delete the IP, and force a check_dups after deleting, it will then
    # pick up and process the SP to SP - if still needed -- if there is only SP1 to SP2, then the per_path_dup will pick it up and
    # I believe this will all work, but doesn't hurt to do an extra check_dups again
    def KeepInIPSPDups( self, obj ):
        """ stores this file into the "keep from same path" list

        sometimes there can be more than 1 SP, e.g. SP to SP to IP
        for now, by not dealing with the extra SP, we will just delete the IP, and force a check_dups after deleting, it will then
        pick up and process the SP to SP - if still needed -- if there is only SP1 to SP2, then the per_path_dup will pick it up and
        I believe this will all work, but doesn't hurt to do an extra check_dups again

        Args:
            obj (DupRow): file that will be stored into the "keep from same path" list
        Returns:
            None
        """
        if obj.h not in self.ip_to_sp_dups_keep:
            self.ip_to_sp_dups_keep[obj.h]= obj
        return

    # this stores this object into the Delete from same path list (if it is not
    # already there)
    def DelInIPSPDups( self, obj ):
        """ stores this object into the Delete from same path list (if it is not already there)

        Args:
            obj (DupRow): file that will be stored into the "Delete from same path" list
        Returns:
            None
        """

        if obj.h not in self.ip_to_sp_dups_del:
            self.ip_to_sp_dups_del[obj.h]=[]
            self.ip_to_sp_dups_del[obj.h].append( obj )
@@ -127,10 +146,21 @@ class Duplicates(PA):
            self.ip_to_sp_dups_del[obj.h].append( obj )
        return
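
    # Illustrative sketch (not from the commit; the hash key is made up) of the shapes built above:
    # one kept DupRow per hash, plus a list of DupRows to delete for that same hash, e.g.
    #   ip_to_sp_dups_keep = {"9a0364b9...": <DupRow in the storage path>}
    #   ip_to_sp_dups_del  = {"9a0364b9...": [<DupRow in the import path>, ...]}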

    # this function takes a duplicate file (in the import path and the storage path)
    # and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups()
    # and then puts the import path file in the delete list (self.ip_to_sp_dups_del) via self.DelInIPSPDups()
    def DupInImportAndStoragePath( self, row, dr1, dr2 ):
        """ handles a duplicate file in import and storage paths, and stores them into keep and delete lists

        this function takes a duplicate file (in the import path and the storage path)
        and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups()
        and then puts the import path file in the delete list (self.ip_to_sp_dups_del) via self.DelInIPSPDups()

        Args:
            row (ORM row): row from the database with a dup pair in dir1 & dir2
            dr1 (DupRow): dup data for file 1 of a duplicate
            dr2 (DupRow): dup data for file 2 of a duplicate

        Returns:
            bool: True if file is in both import and storage path, False otherwise
        """
        if self.InStoragePath(row.path_type1) and self.InImportPath(row.path_type2):
            self.KeepInIPSPDups( dr1 )
            self.DelInIPSPDups( dr2 )
@@ -180,7 +210,7 @@ class Duplicates(PA):

    # AddDupPath: takes a row from the database effectively with a dup pair in dir1 & dir2
    # we process these into appropriate data structures on this second pass
    # working through if a dir is in the storage path and is
    def AddDupPath(self, hash):
        # this gets complex, if this hash is also in a shared imp / sp - then don't deal with it now, let the imp files be deleted and
        # the repeat check_dups validation step catch it as a cleaner (potential) for still more duplicates just in sp
@@ -198,9 +228,9 @@ class Duplicates(PA):
        # FIXME: what if both do? what if one is in SP and the other not, etc...
        if new:
            self.per_path_dups.append( dpr )
            if re.search( r'\d{4}/\d{8}', dpr.d1):
            if re.search( r"\d{4}/\d{8}", dpr.d1):
                self.preferred_path[dpr.did1]=1
            if re.search( r'\d{4}/\d{8}', dpr.d2):
            if re.search( r"\d{4}/\d{8}", dpr.d2):
                self.preferred_path[dpr.did2]=1
        return True
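
        # Illustrative sketch (not from the commit): the 'YYYY/YYYYMMDD' preferred-path test used above.
        #   re.search(r"\d{4}/\d{8}", "2019/20190704")        -> match  => directory is preferred
        #   re.search(r"\d{4}/\d{8}", "import/backlog/misc")  -> None   => directory is not preferred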

@@ -216,7 +246,7 @@ class Duplicates(PA):
        if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f):
            self.per_file_dups.append(self.dups_to_process[hash])
            for el in self.dups_to_process[hash]:
                if re.search( r'\d{4}/\d{8}', el.d):
                if re.search( r"\d{4}/\d{8}", el.d):
                    self.preferred_file[hash] = el.id
        else:
            # will force ask per path
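
            # Illustrative restatement (not from the commit) of the branch above: a duplicate set needs a
            # per-file decision when there are 3+ copies, two copies share a dir, or the filenames differ;
            # otherwise (exactly 2 copies, same filename, different dirs) it is handled per path, e.g.
            #   def needs_per_file_decision(dups):
            #       return len(dups) > 2 or dups[0].d == dups[1].d or dups[0].f != dups[1].f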

@@ -232,9 +262,9 @@ class Duplicates(PA):
        if len(self.ip_to_sp_dups_keep) > 0:
            print( "############ Files that are in both Import and Storage Paths ###########")
            for h in self.ip_to_sp_dups_keep:
                print( f"hash={h} keep 1 of {len(self.ip_to_sp_dups_del[h])+1}, keep: {self.ip_to_sp_dups_keep[h]} | ", end='' )
                print( f"hash={h} keep 1 of {len(self.ip_to_sp_dups_del[h])+1}, keep: {self.ip_to_sp_dups_keep[h]} | ", end="" )
                for d in self.ip_to_sp_dups_del[h]:
                    print( f"Del: {d}", end='' )
                    print( f"Del: {d}", end="" )
                print( "" )
            print( f"{len(self.ip_to_sp_dups_keep)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )

@@ -247,9 +277,9 @@ class Duplicates(PA):
        if len(self.preferred_file) > 0:
            print( " We have preferred (regexp matched) ###########")
            for h in self.preferred_file:
                print( f"hash={h}, keep this one: {self.preferred_file[h]} from ", end='' )
                print( f"hash={h}, keep this one: {self.preferred_file[h]} from ", end="" )
                for d in self.dups_to_process[h]:
                    print( f"{d.id}, ", end='' )
                    print( f"{d.id}, ", end="" )
                print ("")
            print( f"which is a total of {len(self.preferred_file)} duplicate files we will keep as they match the regexp" )
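
# Illustrative driver sketch (not part of the commit): how the two passes described in the class
# docstring might be wired together. AddDup's exact signature is not shown in these hunks, so the
# call below is an assumption, and `dup_rows` stands in for the DB rows of duplicate pairs.
#   dups = Duplicates()
#   for row in dup_rows:                  # first pass: split import-vs-storage pairs into keep/delete lists
#       dups.AddDup(row)
#   for h in list(dups.dups_to_process):  # second pass: prefer 'YYYY/YYYYMMDD' storage dirs
#       dups.AddDupPath(h)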