fixed bugs 47, 48, 49 and reduced the complexity of dup counting as well. Now removes dirs that become empty due to duplicate removals

This commit is contained in:
2021-09-04 19:50:49 +10:00
parent 4a9c3b4aa3
commit eb0145467f
5 changed files with 69 additions and 56 deletions

5
BUGs

@@ -1,8 +1,3 @@
### Next: 49
BUG-45: when deleting, the .pa_bin path has c:\ ... in it, not the real path
-- due to multiple Bin paths, really should only have one... Prob. best to re-jig Settings to have a Base path (diff for me and Cam), then only 1 Bin path of base; can allow multiple import / storage as it works anyway
BUG-46: when duplicate is in both import dir x 2 and at least 1 storage (maybe more), it tries to delete/keep it twice
BUG-47: maybe related to BUG-46, but the counts for deleting are broken (saying 2 manual choices to delete 0 / keep 0 files)
SEEMS related... file is in Storage and choices in Import, Storage keeps file first, Import choices are then
thrown away (definitely causes BUG-46, and I think causes the broken counts too)
BUG-48: initial code for CleanUpDirInDB does not even seem to be called, let alone work

16
TODO

@@ -1,22 +1,20 @@
## GENERAL
* remove dirs after the duplicate cleanup removes all its content
* Face matching:
- upgrade to face distance per face per file [DONE]
- face locations:
START FROM SCRATCH for prod so all images have face_locn data
- need to reconsider whether current distance algorithm gives best match - can I do better?
* could look to remove the hand fixing of json.loads of array data --> seems you can make your own datatype in the ORM, and it can do the conversion every time you use it (see the hedged TypeDecorator sketch after this list)
- https://stackoverflow.com/questions/28143557/sqlalchemy-convert-column-value-back-and-forth-between-internal-and-database-fo
* per file you could select an unknown face and add it as a ref img to an existing person, or make a new person and attach?
* from menu, we could try to get smart/fancy... say find face with largest size, check it vs. other faces, if it matches more than say 10? we offer it up as a required ref img, then cut that face (with margin) out and use it as a new ref image / person
* fix up logging in general
* comment your code
* js files
* html files?
* more OO goodness :)
* viewer needs to allow toggle to scan_model (and prob. right-click on file... AI (with CNN) / AI (with hog))
- make the form-select AI_Model actually do the change (but need more mem on mara really)
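
A minimal sketch of the custom ORM datatype idea from the stackoverflow link above, assuming a SQLAlchemy TypeDecorator; the JSONList name and the face_locations column are illustrative only, not the real PA schema:

import json
from sqlalchemy.types import TypeDecorator, Text

class JSONList(TypeDecorator):
    # stores a Python list as JSON text and decodes it automatically on load,
    # removing the need for hand json.loads calls in application code
    impl = Text
    cache_ok = True

    def process_bind_param(self, value, dialect):
        # Python -> DB: serialise the list (None stays None)
        return json.dumps(value) if value is not None else None

    def process_result_value(self, value, dialect):
        # DB -> Python: decode straight back to a list
        return json.loads(value) if value is not None else None

# hypothetical usage on a model column:
#   face_locations = Column(JSONList)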
## DB
* Dir can have date in the DB, so we can do Oldest/Newest dirs in Folder view
@@ -41,15 +39,17 @@
### AI
* faces per file (need a threshold for too many?) OR
* consider size of bbox of face / 'high-quality' faces -- if face is too small in image, don't match it
* if we have a high-qual face, we could show this on a page and have UI to create ref img / person for it
### UI
??? iPads can't do selections and contextMenus, do I want to re-factor to cater for this?
- partial fix, double-click / tap allows viewing (most useful context-menu feature)
* viewer needs to allow toggle to scan_model (and prob. right-click on file... AI (with CNN) / AI (with hog))
- make the form-select AI_Model actually do the change (but need more mem on mara really)
For AI / rescan:
way to override per file:
the model used
the model used [partial - UI done, need mem on mara]
the threshold used?
maybe per file you could select an unknown face and add it as a ref img to an existing person, or make a new person and attach?

32
dups.py

@@ -88,6 +88,7 @@ class Duplicates(PA):
self.preferred_file={}
self.preferred_path={}
self.hashes_processed={}
self.eids_processed={}
self.uniq_dups=0
self.total_dups=0
@@ -146,6 +147,8 @@ class Duplicates(PA):
# we process these into appropriate data structures on this first pass
def AddDup( self, row ):
self.hashes_processed[row.hash]=1
self.eids_processed[row.id1]=1
self.eids_processed[row.id2]=1
dr1=DupRow( row.hash, row.fname1, row.rel_path1, row.did1, row.id1 )
dr2=DupRow( row.hash, row.fname2, row.rel_path2, row.did2, row.id2 )
# if in both import and storage path, just keep the storage path file,
@@ -206,39 +209,24 @@ class Duplicates(PA):
# The second pass processes row by row of dups_to_process, looking for per_file_dups and per_path_dups
# AND works out counts to display overall tallies of types of keeping and deletion of files, and choices to make
def SecondPass(self):
# sort out counts (for ip_to_sp - that is all finished)
self.uniq_dups = len(self.hashes_processed)
# total starts with 1 copy of everything we keep in sp
self.total_dups = len(self.ip_to_sp_dups_keep)
# and then add all those we delete in ip that are in sp
for hash in self.ip_to_sp_dups_del:
self.total_dups += len(self.ip_to_sp_dups_del[hash])
# okay, go for each duplicate that should be processed (they are stored
# by hash, and have at least 2 entries, but can have more, and be in
# the IP or SP and any combo, cater for all below
for hash in self.dups_to_process:
# more than 2 files (just ask per file) OR (implied) only 2 copies and files are in the same dir (so must be diff names, so just ask) OR (implied) only 2 copies in diff dirs with different filenames (ask per file)
# will force ask per file
if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f):
self.per_file_dups.append(self.dups_to_process[hash])
for el in self.dups_to_process[hash]:
if re.search( r'\d{4}/\d{8}', el.d):
self.preferred_file[hash] = el.id
self.total_dups += len(self.dups_to_process[hash])
# if this combination ALSO has an import path dup, then we have already counted the storage path dup in the earlier keeping count
if hash in self.ip_to_sp_dups_keep:
self.total_dups -= 1
# only 2 files AND different path (ask per path) AND with the same name...
else:
# if this dup path is not already being partially handled by an ip <-> sp dup, then add it / count it
if self.AddDupPath( hash ):
self.total_dups += 2
else:
# okay, if we are here, this path combo is also in an IP <-> SP combo.
# IF, this dup we tried to add was in SP<->SP, then there
# is another dup to count, if its IP<->IP (as we append these to the del list), then nothing further to count
if self.InStoragePath(self.dups_to_process[hash][0].d):
self.total_dups += 1
# will force ask per path
self.AddDupPath( hash )
# provide convenience counts
self.uniq_dups = len(self.hashes_processed)
self.total_dups = len(self.eids_processed)
return
# quick debugger to see the data in the data structure (not used by default)
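
A hedged illustration of the simplified counting above (not PA code; the hash and ids are made up): three copies of one file arrive as two dup rows, but because eids_processed keys each entry id only once, both tallies fall out of plain len() calls:

# illustrative only: two dup rows describing three physical copies of one hash
hashes_processed = {}
eids_processed = {}
for h, id1, id2 in [("abc123", 10, 11), ("abc123", 10, 12)]:
    hashes_processed[h] = 1
    eids_processed[id1] = 1
    eids_processed[id2] = 1
uniq_dups = len(hashes_processed)    # 1 unique duplicated file
total_dups = len(eids_processed)     # 3 copies on disk in total
assert (uniq_dups, total_dups) == (1, 3)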


@@ -815,25 +815,55 @@ def ResetExistsOnFS(job, path):
# Convenience function to remove a file from the database - and its associated links
# used when scanning and a file has been removed out from under PA, or when we remove duplicates
####################################################################################################################################
def RemoveFileFromDB(id):
session.query(EntryDirLink).filter(EntryDirLink.entry_id==id).delete()
session.query(File).filter(File.eid==id).delete()
session.query(Entry).filter(Entry.id==id).delete()
def RemoveEmptyDirFromFS( job, del_me ):
try:
os.rmdir( del_me.FullPathOnFS() )
except Exception as e:
print( f"ERROR: Failed to remove file from filesystem - which={del_me.name}, err: {e}")
return
def RemoveEmptyDirFromDB( job, del_me ):
session.query(EntryDirLink).filter(EntryDirLink.entry_id==del_me.id).delete()
session.query(PathDirLink).filter(PathDirLink.dir_eid==del_me.id).delete()
session.query(Dir).filter(Dir.eid==del_me.id).delete()
session.query(Entry).filter(Entry.id==del_me.id).delete()
AddLogForJob( job, f"INFO: Removing {del_me.name} from system as removing duplicates has left it empty" )
return
####################################################################################################################################
# Actually moves the physical file from its current real directory to a subdir of the recycle bin path
# Convenience function called after we delete an entry from the DB (always starts as a file deletion); if that leaves an empty dir,
# this func will delete it and then check that the parent dir is non-empty, recursively, so this could trigger a cascading deletion of
# empty dirs in a hierarchy; if the entry deletion leaves a dir with content, just finish
####################################################################################################################################
def RemoveFileFromFS( del_me ):
try:
settings = session.query(Settings).first()
dst_dir=settings.recycle_bin_path + '/' + del_me.in_dir.in_path.path_prefix.replace('static/','') + '/' + del_me.in_dir.rel_path + '/'
os.makedirs( dst_dir,mode=0o777, exist_ok=True )
src=del_me.FullPathOnFS()
dst=dst_dir + '/' + del_me.name
os.replace( src, dst )
except Exception as e:
print( f"ERROR: Failed to remove file from filesystem - which={src}, err: {e}")
def CleanUpDirInDB(job, d):
session.commit()
print( f"CleanUpDirInDB(): checking dir: {d.name} ({d.id})" )
content = session.query(Entry).join(EntryDirLink).filter(EntryDirLink.dir_eid==d.id).first()
if not content:
print( f" Dir {d.name} - {d.id} is empty - removing it" )
# get an Entry from DB (in_dir is a Dir)
parent_dir = session.query(Entry).get(d.in_dir.eid)
# okay remove this empty dir
RemoveEmptyDirFromFS( job, d )
RemoveEmptyDirFromDB( job, d)
print( f" Dir {d.name} is in {parent_dir.name} ({parent_dir.id}) -> check next" )
# check to see if removing the empty dir has left the parent dir empty
CleanUpDirInDB(job, parent_dir)
else:
print( f"There is content (first entry: {content.name}) in {d.name} - finished for this dir" )
return
####################################################################################################################################
# Convenience function to remove a file from the database - and its associated links
# used when scanning and a file has been removed out from under PA, or when we remove duplicates
####################################################################################################################################
def RemoveFileFromDB(job, del_me):
parent_dir=del_me.in_dir
session.query(EntryDirLink).filter(EntryDirLink.entry_id==del_me.id).delete()
session.query(File).filter(File.eid==del_me.id).delete()
session.query(Entry).filter(Entry.id==del_me.id).delete()
AddLogForJob( job, f"INFO: Removing {rm.name} from system as it is no longer on the file system")
CleanUpDirInDB(job, parent_dir)
return
####################################################################################################################################
@@ -927,8 +957,10 @@ def MoveFileToRecycleBin(job,del_me):
parent_dir=new_dir
part_rel_path += "/"
parent_dir = session.query(Entry).get(del_me.in_dir.eid)
del_me.in_dir = new_dir
AddLogForJob(job, f"Deleted file: {del_me.name} - (moved to {os.path.dirname(del_me.FullPathOnFS())})" )
CleanUpDirInDB(job, parent_dir)
return
####################################################################################################################################
@@ -986,14 +1018,12 @@ def HandleAnyFSDeletions(job):
rms = session.query(Entry).filter(Entry.exists_on_fs==False,Entry.type_id!=dtype.id).all()
rm_cnt=0
for rm in rms:
RemoveFileFromDB(rm.id)
AddLogForJob( job, f"INFO: Removing {rm.name} from system as it is no longer on the file system")
RemoveFileFromDB(job, rm)
rm_cnt+=1
rmdirs = session.query(Entry).filter(Entry.exists_on_fs==False,Entry.type_id==1).order_by(Entry.id.desc()).all()
for rmdir in rmdirs:
RemoveFileFromDB(rmdir.id)
AddLogForJob( job, f"INFO: Removing {rmdir.name} from system as it is no longer on the file system")
RemoveFileFromDB(job, rmdir)
rm_cnt+=1
return rm_cnt
@@ -1542,7 +1572,7 @@ def InitialValidationChecks():
rbp_exists=1
ptype = session.query(PathType).filter(PathType.name=='Bin').first().id
symlink=CreateSymlink(job,ptype,path)
path, dirs, files = next(os.walk(path))
root, dirs, files = next(os.walk(path))
if len(dirs) + len(files) > 0:
AddLogForJob(job, "INFO: the bin path contains content, cannot process to know where original deletes were form - skipping content!" )
AddLogForJob(job, "TODO: could be smart about what is known in the DB vs on the FS, and change below to an ERROR if it is one")


@@ -121,6 +121,6 @@ insert into PERSON values ( (select nextval('PERSON_ID_SEQ')), 'mum', 'Mandy', '
insert into PERSON values ( (select nextval('PERSON_ID_SEQ')), 'cam', 'Cameron', 'De Paoli' );
insert into PERSON values ( (select nextval('PERSON_ID_SEQ')), 'mich', 'Michelle', 'De Paoli' );
-- DEV:
insert into SETTINGS ( id, import_path, storage_path, recycle_bin_path, default_refimg_model, default_scan_model, default_threshold ) values ( (select nextval('SETTINGS_ID_SEQ')), '/home/ddp/src/photoassistant/images_to_process/#c:/Users/cam/Desktop/code/python/photoassistant/photos/#/home/ddp/src/photoassistant/new_img_dir/', '/home/ddp/src/photoassistant/storage/#c:/Users/cam/Desktop/code/python/photoassistant/storage/', '/home/ddp/src/photoassistant/.pa_bin/#c:/Users/cam/Desktop/code/python/photoassistant/.pa_bin/', 2, 1, '0.55' );
insert into SETTINGS ( id, import_path, storage_path, recycle_bin_path, default_refimg_model, default_scan_model, default_threshold ) values ( (select nextval('SETTINGS_ID_SEQ')), '/home/ddp/src/photoassistant/images_to_process/#c:/Users/cam/Desktop/code/python/photoassistant/photos/#/home/ddp/src/photoassistant/new_img_dir/', '/home/ddp/src/photoassistant/storage/#c:/Users/cam/Desktop/code/python/photoassistant/storage/', '/home/ddp/src/photoassistant/.pa_bin/', 2, 1, '0.55' );
-- PROD:
--insert into SETTINGS ( id, import_path, storage_path, recycle_bin_path, default_refimg_model, default_scan_model, default_threshold ) values ( (select nextval('SETTINGS_ID_SEQ')), '/export/docker/storage/Camera_uploads/', '/export/docker/storage/photos/', '/export/docker/storage/.pa_bin/', 2, 1, '0.55' );