# photoassistant/dups.py
from wtforms import SubmitField, StringField, HiddenField, validators, Form
from flask_wtf import FlaskForm
from flask import request, render_template, redirect, send_from_directory
from main import db, app, ma
from sqlalchemy import Sequence
from sqlalchemy.exc import SQLAlchemyError
from status import st, Status
import os
import glob
from PIL import Image
from pymediainfo import MediaInfo
import hashlib
import exifread
import base64
import numpy
import cv2
import time
import re
################################################################################
# Local Class imports
################################################################################
from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName


class DupRow:
    def __init__(self, hash, file, dir, did, fid):
        ### DupRow Attributes -- note, simple class, no methods ###
        self.h = hash
        self.f = file
        self.d = dir
        self.did = did
        self.id = fid


class Duplicates:
    def __init__(self):
        ### Duplicates Attributes ###
        # hash -> DupRow to keep (the storage-path copy) when a duplicate spans import and storage paths
        self.ip_to_sp_dups_keep = {}
        # hash -> list of DupRows to delete (the import-path copies)
        self.ip_to_sp_dups_del = {}
        # hash -> list of DupRows for duplicate pairs that do not straddle an import and a storage path
        self.in_same_dups = {}
        self.per_file_dups = []
        self.per_path_dups = []
        self.preferred = {}
        self.all_paths = []
        self.storage_paths = []
        self.import_paths = []
        # per storage path, add entries to view
        settings = Settings.query.first()
        paths = settings.storage_path.split("#")
        for path in paths:
            prefix = SymlinkName(path, path + '/')
            self.storage_paths.append(prefix)
            self.all_paths.append(prefix)
        paths = settings.import_path.split("#")
        for path in paths:
            prefix = SymlinkName(path, path + '/')
            self.import_paths.append(prefix)
            self.all_paths.append(prefix)
    def TrimmedPath(self, path):
        # strip the matching import/storage prefix from an absolute path
        # (plain prefix match; the prefixes are literal paths, not patterns)
        for p in self.all_paths:
            if path.startswith(p):
                return path.replace(p, '')
        return path

    def InImportPath(self, path):
        for p in self.import_paths:
            if path.startswith(p):
                return True
        return False

    def InStoragePath(self, path):
        for p in self.storage_paths:
            if path.startswith(p):
                return True
        return False
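
    # Example (hypothetical values, for illustration only): if storage_paths contains
    # "/mnt/photos/storage/", then InStoragePath("/mnt/photos/storage/2020/img.jpg")
    # returns True and TrimmedPath("/mnt/photos/storage/2020/img.jpg") returns "2020/img.jpg".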

    def KeepInSameDups(self, obj):
        if obj.h not in self.ip_to_sp_dups_keep:
            self.ip_to_sp_dups_keep[obj.h] = obj
        return

    def DelInSameDups(self, obj):
        if obj.h not in self.ip_to_sp_dups_del:
            self.ip_to_sp_dups_del[obj.h] = []
            self.ip_to_sp_dups_del[obj.h].append(obj)
        else:
            for el in self.ip_to_sp_dups_del[obj.h]:
                if el.id == obj.id:
                    return
            self.ip_to_sp_dups_del[obj.h].append(obj)
        return

    def DupInImportAndStoragePath(self, row, dr1, dr2):
        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
            self.KeepInSameDups(dr1)
            self.DelInSameDups(dr2)
            return True
        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
            self.KeepInSameDups(dr2)
            self.DelInSameDups(dr1)
            return True
        return False

    def AddDup(self, row):
        dr1 = DupRow(row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1)
        dr2 = DupRow(row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2)
        if self.DupInImportAndStoragePath(row, dr1, dr2):
            return
        if row.hash not in self.in_same_dups:
            self.in_same_dups[row.hash] = []
            self.in_same_dups[row.hash].append(dr1)
            self.in_same_dups[row.hash].append(dr2)
        else:
            # process path1 / fname1 -- if that combo is not in the dups[hash], add it
            found = 0
            for dup in self.in_same_dups[row.hash]:
                if dup.id == row.id1:
                    found = 1
                    break
            if not found:
                self.in_same_dups[row.hash].append(dr1)
            # process path2 / fname2 -- if that combo is not in the dups[hash], add it
            found = 0  # reset the flag, otherwise a hit on id1 suppresses the id2 check
            for dup in self.in_same_dups[row.hash]:
                if dup.id == row.id2:
                    found = 1
                    break
            if not found:
                self.in_same_dups[row.hash].append(dr2)
        return

    def Dump(self):
        if len(self.ip_to_sp_dups_keep) > 0:
            print("############ Files that are in both Import and Storage Paths ###########")
            cnt = 0
            for h in self.ip_to_sp_dups_keep:
                cnt += 1
                if len(self.ip_to_sp_dups_del[h]) > 2:
                    k = self.ip_to_sp_dups_keep[h]
                    print(f"hash={h}, keep: {k.d}/{k.f} (id={k.id})")
                    for d in self.ip_to_sp_dups_del[h]:
                        print(f"Del: {d.d}/{d.f} (id={d.id})")
            print(f"{cnt} sets of duplicate files with at least 1 file to delete; sets with more than 2 files to delete are printed above explicitly")
        if len(self.in_same_dups) > 0:
            print("############ Duplicate Files that are in the same Path ###########")
            cnt = 0
            for h in self.in_same_dups:
                cnt += 1
                if len(self.in_same_dups[h]) > 2:
                    print(f"hash={h}, keep 1 of these: ", end='')
                    for d in self.in_same_dups[h]:
                        print(f"{d.id}, ", end='')
                    print("")
            print(f"{cnt} sets of duplicate files with at least 1 file to delete; sets with more than 2 copies are printed above explicitly")
        return
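
# Usage sketch (illustration only, not called anywhere in this module): AddDup() expects a
# row object whose attributes match the SQL column aliases used in the disabled query below
# (hash, id1, fname1, path1, did1, id2, fname2, path2, did2). The namedtuple and the paths
# here are hypothetical stand-ins for a real result row, and Duplicates() still needs the
# app's database/Settings to construct.
#
#   from collections import namedtuple
#   Row = namedtuple("Row", "hash id1 fname1 path1 did1 id2 fname2 path2 did2")
#   dups = Duplicates()
#   dups.AddDup(Row(hash="abc123",
#                   id1=1, fname1="img_0001.jpg", path1="/mnt/photos/storage/2020/", did1=10,
#                   id2=2, fname2="img_0001.jpg", path2="/mnt/photos/import/sdcard/", did2=20))
#   dups.Dump()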
"""
@app.route("/fix_dups", methods=["POST"])
def fix_dups():
rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
if rows.returns_rows == False:
st.SetAlert("success")
st.SetMessage(f"Err, no dups - should now clear the FE 'danger' message?")
return render_template("base.html")
jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
path=[jex.value for jex in jexes if jex.name == "path"][0]
prefix = SymlinkName(path,path+'/')
if 'pagesize' not in request.form:
pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
else:
pagesize=int(request.form['pagesize'])
dups={}
for row in rows:
AddDup( prefix+'/', row, dups )
d1=""
d2=""
did1=""
did2=""
str=""
dup_cnt=1
preferred={}
per_file_dups=[]
per_path_dups=[]
hashes=""
overall_dup_cnt=0
overall_dup_sets=0
for hash in dups:
# more than 2 files (just ask per file) OR
# only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
# content same, filename different (just ask per file)
if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
per_file_dups.append(dups[hash])
overall_dup_cnt += len(dups[hash])
overall_dup_sets += 1
for el in dups[hash]:
if re.search( '\d{4}/\d{8}', el['d']):
preferred[hash] = el['id']
if overall_dup_cnt<5:
print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
# by here we have only 2 files, with the same name, different path
# (MOST COMMON, and I think we dont care per file, just per path)
elif d1 != dups[hash][0]['d']:
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += 1
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
dup_cnt=1
d1 = dups[hash][0]['d']
d2 = dups[hash][1]['d']
did1 = dups[hash][0]['did']
did2 = dups[hash][1]['did']
str=f"duplicates found in {d1} and {d2}"
hashes = f"{hash},"
else:
dup_cnt += 1
hashes += f"{hash},"
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += dup_cnt
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
"""