photoassistant/dups.py

""" functions provided to process duplicate photo data from DB into usable data structures """
import re

################################################################################
# Local Class imports
################################################################################
from shared import PA
from path import PathType

################################################################################
class DupRow(PA):
    """ Plain record ('struct') describing one file that takes part in a duplicate set

        Exists purely so the rest of the code can use dot-notation on every field,
        rather than intermixing python lists/dicts.  It deliberately has no methods.

        Attributes:
            h: hash shared by the files in the duplicate set
            f: file name
            d: directory of the file (Duplicates.AddDup passes the row's rel_path here)
            did: id that goes with the directory -- presumably its DB key, verify against caller
            id: id of the file itself
    """

    def __init__(self, _hash, file, _dir, did, fid):
        # what the file is (hash / name / directory) ...
        self.h, self.f, self.d = _hash, file, _dir
        # ... and the ids used to refer back to it
        self.did, self.id = did, fid

################################################################################
class DupPathRow(PA):
    """ Plain record ('struct') describing a pair of directories that hold duplicate files

        Exists purely so the rest of the code can use dot-notation on every field,
        rather than intermixing python lists/dicts.  It deliberately has no methods.

        Attributes:
            count: number of duplicate files accounted for by this pair of directories
            d1, d2: the two directories
            did1, did2: the ids that go with d1 and d2 respectively
            hashes: hash(es) of the duplicated files found in this pair of directories
    """

    def __init__(self, count, d1, d2, did1, did2, hashes ):
        self.count = count
        # the two directories, followed by their matching ids
        self.d1, self.d2 = d1, d2
        self.did1, self.did2 = did1, did2
        self.hashes = hashes

################################################################################
class Duplicates(PA):
    """ Duplicates class that has methods to process DB duplicate photo data

    The Duplicates class is used with one instance/object to process all the
    'duplicate' data from the Database, and parse it into more usable data
    structures.    This is needed also, as the database content shows duplicates
    more than once, e.g.
        file1 and file2 are a duplicate, then later file2 and file 1 are 'another' duplicate
    The class passes over the data in 2 passes.  The first pass in AddDup() finds
    any files in the import and storage path and marks the storage ones to keep,
    the import ones to delete.  Anything else is either a set of files duplicated
    inside the import path or set of files duplicated in the storage path
    The first pass, simply concatenates these into a data structure
    (dups_to_process) that contains all the duplicates with a key of the hash

    The second pass, SecondPass(), processes these duplicates to see if there are
    any in a directory that follows the pattern 'YYYY/YYYYMMDD' -> if so these are
    recorded as the preferred file / path to keep.

    After the 2 passes, we have data structures that allow the web to break up
    the duplicates into batches to process:
       1) auto delete any in the import path that are also in the storage path
          - careful here, if we have any in the import path and 2+ in the storage path, leave it for manual intervention
       2) auto delete any in the storage path that are in a set where 1 of them match the 'YYYY/YYYYMMDD' format, the rest are deleted
       3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
       4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
    """

    # a directory containing 'YYYY/YYYYMMDD' (4 digits, '/', 8 digits) is the preferred
    # place to keep a file; shared by AddDupPath() and SecondPass() so they cannot drift apart
    _DATED_DIR_RE = re.compile(r"\d{4}/\d{8}")

    def __init__(self):
        """ initialises all the Duplicates Attributes """
        # hash -> DupRow of the storage path file to keep (import/storage duplicates)
        self.ip_to_sp_dups_keep={}
        # hash -> list of DupRow for the import path files to delete (import/storage duplicates)
        self.ip_to_sp_dups_del={}
        # hash -> list of DupRow for every other duplicate, filled in by AddDup()
        self.dups_to_process={}
        # list of DupRow lists where the user has to choose per file (SecondPass())
        self.per_file_dups=[]
        # list of DupPathRow where the user can choose per directory (AddDupPath())
        self.per_path_dups=[]
        # hash -> id of the file in a 'YYYY/YYYYMMDD' directory
        self.preferred_file={}
        # directory id -> 1 for directories matching 'YYYY/YYYYMMDD'
        self.preferred_path={}
        # used as sets (value is always 1): every hash / file id seen by AddDup()
        self.hashes_processed={}
        self.eids_processed={}
        # convenience counts, only valid after SecondPass()
        self.uniq_dups=0
        self.total_dups=0

        # NOTE: this will raise AttributeError if either PathType row is missing from the DB
        self.import_ptype_id  = PathType.query.filter(PathType.name=="Import").first().id
        self.storage_ptype_id = PathType.query.filter(PathType.name=="Storage").first().id

    def InImportPath( self, path_type ):
        """ Is the path being checked a import path

        Args:
            path_type (int): db key for the path_type of the path being checked
        Returns:
            bool: True if this path is a import path
        """
        return path_type == self.import_ptype_id

    def InStoragePath( self, path_type ):
        """ Is the path being checked a storage path

        Args:
            path_type (int): db key for the path_type of the path being checked
        Returns:
            bool: True if this path is a storage path
        """
        return path_type == self.storage_ptype_id

    def KeepInIPSPDups( self, obj ):
        """ stores this file as the one to keep for its hash (if there is not one already)

        sometimes there can be more than 1 SP, e.g SP to SP to IP
        for now, by not dealing with the extra SP, we will just delete the IP, and force a check_dups after deleting, it will then
        pick up and process the SP to SP - if still needed -- if there is only SP1 to SP2, then the per_path_dup will pick it up and
        I believe this will all work, but doesn't hurt to do an extra check_dups again

        Args:
            obj (DupRow): file that will be stored into the keep list (self.ip_to_sp_dups_keep)
        Returns:
            None
        """
        # first file seen for a hash wins, any later (extra SP) file is ignored
        self.ip_to_sp_dups_keep.setdefault( obj.h, obj )

    def DelInIPSPDups( self, obj ):
        """ stores this object into the delete list for its hash (if it is not already there)

        Args:
            obj (DupRow): file that will be stored into the delete list (self.ip_to_sp_dups_del)
        Returns:
            None
        """
        to_delete = self.ip_to_sp_dups_del.setdefault( obj.h, [] )
        # the DB reports the same file in more than one row, so de-duplicate on file id
        if all( el.id != obj.id for el in to_delete ):
            to_delete.append( obj )

    def DupInImportAndStoragePath( self, row, dr1, dr2 ):
        """ handles a duplicate file in import and storage paths, and stores them into keep/delete lists

        this function takes a duplicate file (in the import path and the storage path)
        and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups()
        and then puts the import path file in the delete list (self.ip_to_sp_dups_del) via self.DelInIPSPDups()

        Args:
            row (ORM row): row from the database with a dup pair in dir1 & dir2
            dr1 (DupRow): dup data for file 1 of a duplicate
            dr2 (DupRow): dup data for file 2 of a duplicate

        Returns:
            bool: True if file is in both import and storage path, False otherwise
        """
        if self.InStoragePath(row.path_type1) and self.InImportPath(row.path_type2):
            self.KeepInIPSPDups( dr1 )
            self.DelInIPSPDups( dr2 )
            return True
        if self.InStoragePath(row.path_type2) and self.InImportPath(row.path_type1):
            self.KeepInIPSPDups( dr2 )
            self.DelInIPSPDups( dr1 )
            return True
        return False

    def AddDup( self, row ):
        """ first pass: takes a row from the database (effectively file1 & file2) and files it away

        If one file is in the storage path and the other in the import path, the pair
        goes into the keep/delete lists.  Otherwise both files are recorded (once each)
        in self.dups_to_process under the row's hash, for SecondPass() to deal with.

        Args:
            row (ORM row): duplicate pair; reads hash, fname1/2, rel_path1/2, did1/2, id1/2, path_type1/2
        Returns:
            None
        """
        self.hashes_processed[row.hash]=1
        self.eids_processed[row.id1]=1
        self.eids_processed[row.id2]=1
        dr1=DupRow( row.hash, row.fname1, row.rel_path1, row.did1, row.id1 )
        dr2=DupRow( row.hash, row.fname2, row.rel_path2, row.did2, row.id2 )
        # if in both import and storage path, just keep the storage path file,
        # and del import path file.  This function checks and keeps/dels as needed
        if self.DupInImportAndStoragePath( row, dr1, dr2 ):
            return

        known = self.dups_to_process.get( row.hash )
        if known is None:
            # first time we see this hash, both files are new by definition
            self.dups_to_process[row.hash]=[ dr1, dr2 ]
            return

        # hash already known: add each file only if its id is not recorded yet.
        # Each file has to be checked on its own -- file 1 being known already
        # must not stop a new file 2 from being added
        for dr in ( dr1, dr2 ):
            if all( dup.id != dr.id for dup in known ):
                known.append( dr )

    def AddDupPath(self, hash):
        """ second pass helper: records the pair of directories holding the 2 duplicates of this hash

        If the pair of directories is already in self.per_path_dups, that entry's
        count is increased by 2 and the hash is appended to its comma-separated
        hashes, otherwise a new DupPathRow is added.  Either directory that matches
        'YYYY/YYYYMMDD' is recorded in self.preferred_path.

        Args:
            hash: key into self.dups_to_process, which must hold (at least) 2 entries for it
        Returns:
            bool: False if the hash is also an import/storage path duplicate (nothing is recorded), True otherwise
        """
        # this gets complex, if this hash is also in a shared imp / sp - then dont deal with it now, let the imp files be deleted and
        # the repeat check_dups validation step catch it as a cleaner (potential) for still more duplicates just in sp
        if hash in self.ip_to_sp_dups_keep:
            return False

        first, second = self.dups_to_process[hash][0], self.dups_to_process[hash][1]
        dpr=DupPathRow( 2, first.d, second.d, first.did, second.did, hash )

        seen_before=False
        for el in self.per_path_dups:
            # if this new hash / dup in dpr has same dirs as existing per_path_dups row, then just another file in same dup dir...
            if el.d1 == dpr.d1 and el.d2 == dpr.d2:
                el.count += 2
                el.hashes = f"{el.hashes},{hash}"
                seen_before=True
        # okay, we have a new pair of duplicate dirs... Add them
        if not seen_before:
            self.per_path_dups.append( dpr )

        # if either dir has matching regex its preferred
        # FIXME: what if both do? what if one is in SP and the other not, etc...
        if self._DATED_DIR_RE.search( dpr.d1 ):
            self.preferred_path[dpr.did1]=1
        if self._DATED_DIR_RE.search( dpr.d2 ):
            self.preferred_path[dpr.did2]=1
        return True

    def SecondPass(self):
        """ second pass: splits dups_to_process into per_file_dups and per_path_dups

        AND works out counts to display overall tallies (self.uniq_dups is the number
        of hashes seen, self.total_dups the number of file ids seen by AddDup())

        Returns:
            None
        """
        # okay, go for each duplicate that should be processed (they are stored
        # by hash, and have at least 2 entries, but can have more, and be in
        # the IP or SP and any combo, cater for all below
        for hash_key, dups in self.dups_to_process.items():
            # more than 2 files (just ask per file) OR only 2 copies, and files are in same dir
            # (so must be diff name, so just ask) OR only 2 copies & filename different (ask per file)
            ask_per_file = len(dups) > 2 or dups[0].d == dups[1].d or dups[0].f != dups[1].f
            if not ask_per_file:
                # exactly 2 copies, same file name, different dirs: will force ask per path
                self.AddDupPath( hash_key )
                continue

            # will force ask per file
            self.per_file_dups.append( dups )
            for el in dups:
                # if more than one file matches, the last one in the list ends up preferred
                if self._DATED_DIR_RE.search( el.d ):
                    self.preferred_file[hash_key] = el.id

        # provide convenience counts
        self.uniq_dups = len(self.hashes_processed)
        self.total_dups = len(self.eids_processed)

    def Dump(self):
        """ quick debugger to print the data in the data structures (not used by default)

        Returns:
            None
        """
        if self.ip_to_sp_dups_keep:
            print( "############ Files that are in both Import and Storage Paths ###########")
            for h in self.ip_to_sp_dups_keep:
                print( f"hash={h} keep 1 of {len(self.ip_to_sp_dups_del[h])+1}, keep: {self.ip_to_sp_dups_keep[h]} | ", end="" )
                for d in self.ip_to_sp_dups_del[h]:
                    print( f"Del: {d}", end="" )
            print( "" )
            print( f"{len(self.ip_to_sp_dups_keep)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )

        if self.dups_to_process:
            print( "############ Duplicate Files that are needing to be futher processed ###########")
            for h in self.dups_to_process:
                print( f"hash={h} keep 1 of {len(self.dups_to_process[h])} from: {self.dups_to_process[h]}" )
            print( f"which is a total of {len(self.dups_to_process)} set(s) of duplicate files to keep only 1 of" )

        if self.preferred_file:
            print( "     We have preferred (regexp matched) ###########")
            for h in self.preferred_file:
                print( f"hash={h}, keep this one: {self.preferred_file[h]} from ", end="" )
                for d in self.dups_to_process[h]:
                    print( f"{d.id}, ", end="" )
                print ("")
            print( f"which is a total of {len(self.preferred_file)} duplicate files we will keep as they match the regexp" )

        if self.per_path_dups:
            print( "############ Duplicate Files in Paths that are needing to be futher processed ###########")
            for pair in self.per_path_dups:
                print( f"{pair.count} dups in dir1: {pair.did1}  dir2: {pair.did2}" )
                if pair.did1 in self.preferred_path:
                    print("Keep dir1")
                if pair.did2 in self.preferred_path:
                    print("Keep dir2")
            print( f"which is a total of {len(self.per_path_dups)} set(s) of path dups to process" )