photoassistant/dups.py

from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, redirect, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
from status import st, Status
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
import re

################################################################################
# Local Class imports
################################################################################
from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName

# DupRow class is a simple 'struct' to keep data per duplicate file / just to
# avoid using python list/dicts intermixed, and be able to consistently use
# dot-notation of fields
class DupRow:
    def __init__(self, hash, file, dir, did, fid):
        ### DupRow Attributes -- note, simple class, no methods ###
        self.h=hash
        self.f=file
        self.d=dir
        self.did=did
        self.id=fid

# Duplicates class is used with one instance/object to process all the
# 'duplicate' data from the Database, and parse it into more usable data
# structures.    This is needed also, as the database content shows duplicates
# more than once, e.g.
#     file1 and file2 are a duplicate, then later file2 and file 1 are 'another' duplicate
# The class passes over the data in 2 passes.  The first pass in AddDup() finds
# any files in the import and storage path and marks the storage ones to keep,
# the import ones to delete.  Anything else is either a set of files duplicated
# inside the import path or set of files duplicated in the storage path
# The first pass, simply concatenates these into a data structure
# (im_same_dups) that contains all the duplicates with a key of the md5 hash
#
# The second pass (), processes these duplicates to see if there are any in the
# storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so mark these to
# keep and the rest to be deleted.
#
# After the 2 passes, we have data structures that allow the web to break up
# the duplicates into batches to process:
#    1) auto delete any in the import path that are also in the storage path
#       - carefule here, if we have 2 in the import path and 2+ in the storage path, leave it for manual intervention
#    2) auto delete any in the storage path that are in a set where 1 of them match the 'YYYY/YYYYMMDD' format, the rest are deleted
#    3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow user to choose the dir to keep
#    4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow user to choose file to keep
class Duplicates:
    def __init__(self):
        ### Duplicates Attributes ###
        self.ip_to_sp_dups_keep={}
        self.ip_to_sp_dups_del={}
        self.dups_to_process={}
        self.per_file_dups=[]
        self.per_path_dups=[]
        self.preferred={}
        self.all_paths=[]
        self.storage_paths=[]
        self.import_paths=[]

        # pull apart the storage path Setting, and make array of each for use in TrimmedPath()
        settings=Settings.query.first()
        paths = settings.storage_path.split("#")
        for path in paths:
            prefix = SymlinkName(path,path+'/')
            self.storage_paths.append(prefix)
            self.all_paths.append(prefix)
        # pull apart the import path Setting, and make array of each for use in TrimmedPath()
        paths = settings.import_path.split("#")
        for path in paths:
            prefix = SymlinkName(path,path+'/')
            self.import_paths.append(prefix)
            self.all_paths.append(prefix)

    # Strip the front of the path (any match on a storage or import path) is
    # removed.   Just to make it easier to read when we display in the web page
    def TrimmedPath( self, path ):
        for p in self.all_paths:
            if re.match( f"^{p}", path ):
                return path.replace(p, '' )
        return path

    # is this file in the import path?
    def InImportPath( self, path ):
        for p in self.import_paths:
            if re.match( f"^{p}", path ):
                return True
        return False

    # is this file in the storage path?
    def InStoragePath( self, path ):
        for p in self.storage_paths:
            if re.match( f"^{p}", path ):
                return True
        return False

    # this stores this object into the keep from same path list (only ever 1)
    def KeepInSameDups( self, obj ):
        if obj.h not in self.ip_to_sp_dups_keep:
            self.ip_to_sp_dups_keep[obj.h]= obj
        else:
            print( f"DDP: we need to cater for this - 2 files to keep in the storage path, if they are different, then pull these out of here and put them in the in_same_dup list to manually process" )
        return

    # this stores this object into the Delete from same path list (if it is not
    # already there)
    def DelInSameDups( self, obj ):
        if obj.h not in self.ip_to_sp_dups_del:
            self.ip_to_sp_dups_del[obj.h]=[]
            self.ip_to_sp_dups_del[obj.h].append( obj )
        else:
            for el in self.ip_to_sp_dups_del[obj.h]:
                if el.id == obj.id:
                    return
            self.ip_to_sp_dups_del[obj.h].append( obj )
        return

    # this function takes a duplicate file (in the import path and the storage path)
    # and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInSameDups()
    # and then puts the import path file in the delete list (self.ip_to_sp_dups_keep) via self.DelInSameDups()
    def DupInImportAndStoragePath( self, row, dr1, dr2 ):
        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
            self.KeepInSameDups( dr1 )
            self.DelInSameDups( dr2 )
            return True
        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
            self.KeepInSameDups( dr2 )
            self.DelInSameDups( dr1 )
            return True
        return False

    # AddDup: takes a row from the database effectively file1 & file2
    # we process these into appropriate data structures on this first pass
    def AddDup( self, row ):
        dr1=DupRow( row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1 )
        dr2=DupRow( row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2 )
        # if in both import and storage path, just keep the storage path file,
        # and del import path file.
        if self.DupInImportAndStoragePath( row, dr1, dr2 ):
            return

        # if we are here, we have duplicates either in the storage path or in
        # the import path

        # if the hast is no dups_to_process, created / append
        if row.hash not in self.dups_to_process:
            self.dups_to_process[row.hash]=[]
            self.dups_to_process[row.hash].append( dr1 )
            self.dups_to_process[row.hash].append( dr2 )
        else:
            # process path1 / fname1 -- if that combo is not in the dups_to_process[hash], add it
            found=0
            for dup in self.dups_to_process[row.hash]:
                if dup.id == row.id1:
                    found=1
                    continue
            if not found:
                self.dups_to_process[row.hash].append( dr1 )

            # process path2 / fname2 -- if that combo is not in the dups_to_process[hash], add it
            for dup in self.dups_to_process[row.hash]:
                if dup.id == row.id2:
                    found=1
                    continue
            if not found:
                self.dups_to_process[row.hash].append( dr2 )
        return

    # quick debugger to see the data in the data structure
    def Dump(self):
        if len(self.ip_to_sp_dups_keep) > 0:
            print( "############ Files that are in both Import and Storage Paths ###########")
            cnt=0
            for h in self.ip_to_sp_dups_keep:
                cnt +=1
                if len(self.ip_to_sp_dups_del[h])>2:
                    print( f"hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
                    for d in self.ip_to_sp_dups_del[h]:
                        print( f"Del: {d}" )
            print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
        if len(self.dups_to_process) > 0:
            print( "############ Duplicate Files that are needing to be futher processed ###########")
            cnt=0
            for h in self.dups_to_process:
                cnt +=1
                if len(self.dups_to_process[h])>2:
                    print( f"hash={h}, keep 1 of these: ", end='')
                    for d in self.dups_to_process[h]:
                        print( f"{d.id}, ", end='' )
                    print ("")
            print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
        return


"""
@app.route("/fix_dups", methods=["POST"])
def fix_dups():
    rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id  and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )

    if rows.returns_rows == False:
        st.SetAlert("success")
        st.SetMessage(f"Err, no dups - should now clear the FE 'danger' message?")
        return render_template("base.html")

    jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
    path=[jex.value for jex in jexes if jex.name == "path"][0]
    prefix = SymlinkName(path,path+'/')
    if 'pagesize' not in request.form:
        pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
    else:
        pagesize=int(request.form['pagesize'])
    dups={}
    for row in rows:
        AddDup( prefix+'/', row, dups )

    d1=""
    d2=""
    did1=""
    did2=""
    str=""
    dup_cnt=1
    preferred={}
    per_file_dups=[]
    per_path_dups=[]
    hashes=""
    overall_dup_cnt=0
    overall_dup_sets=0
    for hash in dups:
        # more than 2 files (just ask per file) OR
        # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
        # content same, filename different (just ask per file)
        if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
            per_file_dups.append(dups[hash])
            overall_dup_cnt += len(dups[hash])
            overall_dup_sets += 1
            for el in dups[hash]:
                if re.search( '\d{4}/\d{8}', el['d']):
                    preferred[hash] = el['id']
                    if overall_dup_cnt<5:
                        print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
        # by here we have only 2 files, with the same name, different path
        # (MOST COMMON, and I think we dont care per file, just per path)
        elif d1 != dups[hash][0]['d']:
            if d1 != '':
                overall_dup_cnt += dup_cnt
                overall_dup_sets += 1
                per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
                dup_cnt=1
            d1 = dups[hash][0]['d']
            d2 = dups[hash][1]['d']
            did1 = dups[hash][0]['did']
            did2 = dups[hash][1]['did']
            str=f"duplicates found in {d1} and {d2}"
            hashes = f"{hash},"
        else:
            dup_cnt += 1
            hashes += f"{hash},"

    if d1 != '':
        overall_dup_cnt += dup_cnt
        overall_dup_sets += dup_cnt
        per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
"""