diff --git a/BUGs b/BUGs
index 1b9233a..9be332f 100644
--- a/BUGs
+++ b/BUGs
@@ -1,8 +1,3 @@
 ### Next: 49
 BUG-45: when deleting, the .pa_bin path has c:\ ... in it, not the real path
         -- due to multiple Bin paths, really only should have one... Prob. best to re-jig Setting to have Base path (diff for me and Cam), then only 1 Bin path of base, can allow multiple import / storage as it works anyway
-BUG-46: when duplicate is in both import dir x 2 and at least 1 storage (maybe more), it tries to delete/keep it twice
-BUG-47: maybe related to BUG-46, but the counts for deleting are broken (saying 2 manual choices to delete 0 / keep 0 files)
-        SEEMS related... file is in Storage and choices in Import, Storage keeps file first, Import choices are then
-        thrown away (definitely caseus BUG-46, and I think causes the counts too)
-BUg-48: initial code to CleanUpDirInDB does not even seem to be called, let alone work
diff --git a/TODO b/TODO
index 99e8893..90d98c4 100644
--- a/TODO
+++ b/TODO
@@ -1,22 +1,20 @@
 ## GENERAL

- * remove dirs after the duplicate cleanup removes all its content
-
 * Face matching:
    - upgrade to face distance per face per file [DONE]
    - face locations: START FORM SCRATCH for prod so all images have face_locn data
+   - need to reconsider whether current distance algorithm gives best match - can I do better?
 * could look to remove the hand fixing of json.loads of array data --> seems you can make your own datatype in the ORM, and it can do the conversion every time you use it
    - https://stackoverflow.com/questions/28143557/sqlalchemy-convert-column-value-back-and-forth-between-internal-and-database-fo
+ * per file you could select an unknown face and add it as a ref img to an existing person, or make a new person and attach?
+ * from menu, we could try to get smart/fancy... say find face with largest size, check it vs. other faces, if it matches more than say 10? we offer it up as a required ref img, then cut that face (with margin) out and use it as a new ref image / person
+ * fix up logging in general
 * comment your code
    * js files
    * html files?

- * more OO goodness :)
- * viewer needs to allow toggle to scan_model (and prob. right-click on file... AI (with CNN) AI (with hog)
-    - make the form-select AI_Model actually do the change (but need more mem on mara really)
-
 ## DB
 * Dir can have date in the DB, so we can do Oldest/Newest dirs in Folder view
@@ -41,15 +39,17 @@
 ### AI
 * faces per file (need a threshold for too many? OR
 * consider size of bbox of face / 'high-quality' faces -- if face is too small in image, dont match it
- * if we have a high-qual face, we could show this on a page and have UI to create ref img / person for it

 ### UI
 ??? ipads can't do selections and contextMenus, do I want to re-factor to cater for this?
     - partial fix, double-click / tap allows viewing (most useful context-menu feature)
+ * viewer needs to allow toggle to scan_model (and prob. right-click on file... AI (with CNN) AI (with hog)
+    - make the form-select AI_Model actually do the change (but need more mem on mara really)
+
 For AI / rescan: way to override per file:
-    the model used
+    the model used [partial - UI done, need mem on mara]
     the threshold used?
     maybe on the per file you could select an unknown face and add it as a ref img to a existing person, or make a new person and attach?
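A note on the json.loads TODO item above: the linked Stack Overflow approach is a SQLAlchemy TypeDecorator, which moves the list <-> JSON-string conversion into the column type so the hand fixing disappears at every call site. A minimal sketch, assuming a Text column and a hypothetical JSONEncodedList name (the real model/column names may differ):

import json
from sqlalchemy.types import TypeDecorator, Text

class JSONEncodedList(TypeDecorator):
    """Store a Python list as a JSON string and decode it again on load."""
    impl = Text
    cache_ok = True

    def process_bind_param(self, value, dialect):
        # list -> JSON text on the way into the DB (None passes through)
        return json.dumps(value) if value is not None else None

    def process_result_value(self, value, dialect):
        # JSON text -> list on the way out, so call sites stop doing json.loads by hand
        return json.loads(value) if value is not None else None

# usage sketch on a mapped class, e.g. a face-location column:
#   face_locn = Column(JSONEncodedList)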
diff --git a/dups.py b/dups.py
index 669285a..2679c9c 100644
--- a/dups.py
+++ b/dups.py
@@ -88,6 +88,7 @@ class Duplicates(PA):
         self.preferred_file={}
         self.preferred_path={}
         self.hashes_processed={}
+        self.eids_processed={}
         self.uniq_dups=0
         self.total_dups=0

@@ -146,6 +147,8 @@ class Duplicates(PA):
     # we process these into appropriate data structures on this first pass
     def AddDup( self, row ):
         self.hashes_processed[row.hash]=1
+        self.eids_processed[row.id1]=1
+        self.eids_processed[row.id2]=1
         dr1=DupRow( row.hash, row.fname1, row.rel_path1, row.did1, row.id1 )
         dr2=DupRow( row.hash, row.fname2, row.rel_path2, row.did2, row.id2 )
         # if in both import and storage path, just keep the storage path file,
@@ -206,39 +209,24 @@ class Duplicates(PA):
     # The second pass processes row by row of dups_to_process, looking for per_file_dups and per_path_dups
     # AND works out counts to display overall tallies of types of keeping and deletion of files, and choices to make
     def SecondPass(self):
-        # sort out counts (for ip_to_sp - that is all finished)
-        self.uniq_dups = len(self.hashes_processed)
-        # total starts with 1 copy of everything we keep in sp
-        self.total_dups = len(self.ip_to_sp_dups_keep)
-        # and then add all those we delete in ip that are in sp
-        for hash in self.ip_to_sp_dups_del:
-            self.total_dups += len(self.ip_to_sp_dups_del[hash])
-
         # okay, go for each duplicate that should be processed (they are stored
         # by hash, and have at least 2 entries, but can have more, and be in
         # the IP or SP and any combo, cater for all below
         for hash in self.dups_to_process:
             # more than 2 files (just ask per file) OR (implied) only 2 copies, and files are in same dir (so must be diff name, so just ask) OR (implied) on 2 copies in same dir & filename different (ask per file)
+            # will force ask per file
             if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f):
                 self.per_file_dups.append(self.dups_to_process[hash])
                 for el in self.dups_to_process[hash]:
                     if re.search( r'\d{4}/\d{8}', el.d):
                         self.preferred_file[hash] = el.id
-                self.total_dups += len(self.dups_to_process[hash])
-                # if this combination ALSO has an import path dup, then we have already counted the storage path dup in the earlier keeping count
-                if hash in self.ip_to_sp_dups_keep:
-                    self.total_dups -= 1
             # only 2 files AND different path (ask per path) AND with the same name...
             else:
-                # if this dup path is not already being partially handled by an ip <-> sp dup, then add it / count it
-                if self.AddDupPath( hash ):
-                    self.total_dups += 2
-                else:
-                    # okay, if we are here, this path combo is also in an IP <-> SP combo.
-                    # IF, this dup we tried to add was in SP<->SP, then there
-                    # is another dup to count, if its IP<->IP (as we append these to the del list), then nothing further to count
-                    if self.InStoragePath(self.dups_to_process[hash][0].d):
-                        self.total_dups += 1
+                # will force ask per path
+                self.AddDupPath( hash )
+
+        # provide convenience counts
+        self.uniq_dups = len(self.hashes_processed)
+        self.total_dups = len(self.eids_processed)

         return

     # quick debugger to see the data in the data structure (not used by default)
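The rewritten SecondPass above drops the running-tally arithmetic that produced the BUG-46/BUG-47 miscounts and instead derives both numbers from dict sizes filled in by AddDup. A standalone sketch of that idea, with made-up hashes and entry ids:

# AddDup records every hash and every entry id it sees; SecondPass just takes the dict sizes.
rows = [
    ("hashA", 101, 102),   # same content found as (101,102) ...
    ("hashA", 102, 103),   # ... and again as (102,103): file 102 appears in two pairs
    ("hashB", 201, 202),
]
hashes_processed, eids_processed = {}, {}
for h, id1, id2 in rows:
    hashes_processed[h] = 1
    eids_processed[id1] = 1
    eids_processed[id2] = 1
uniq_dups  = len(hashes_processed)   # distinct duplicated contents
total_dups = len(eids_processed)     # distinct files involved - 102 is not double-counted
print(uniq_dups, total_dups)         # -> 2 5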
diff --git a/pa_job_manager.py b/pa_job_manager.py
index c1b8f9e..eae082e 100644
--- a/pa_job_manager.py
+++ b/pa_job_manager.py
@@ -815,25 +815,55 @@ def ResetExistsOnFS(job, path):
 # Convenience function to remove a file from the database - and its associated links
 # used when scanning and a file has been removed out from under PA, or when we remove duplicates
 ####################################################################################################################################
-def RemoveFileFromDB(id):
-    session.query(EntryDirLink).filter(EntryDirLink.entry_id==id).delete()
-    session.query(File).filter(File.eid==id).delete()
-    session.query(Entry).filter(Entry.id==id).delete()
+def RemoveEmptyDirFromFS( job, del_me ):
+    try:
+        os.rmdir( del_me.FullPathOnFS() )
+    except Exception as e:
+        print( f"ERROR: Failed to remove directory from filesystem - which={del_me.name}, err: {e}")
+    return
+
+def RemoveEmptyDirFromDB( job, del_me ):
+    session.query(EntryDirLink).filter(EntryDirLink.entry_id==del_me.id).delete()
+    session.query(PathDirLink).filter(PathDirLink.dir_eid==del_me.id).delete()
+    session.query(Dir).filter(Dir.eid==del_me.id).delete()
+    session.query(Entry).filter(Entry.id==del_me.id).delete()
+    AddLogForJob( job, f"INFO: Removing {del_me.name} from system as removing duplicates has left it empty" )
     return

 ####################################################################################################################################
-# Actually moves the physical file from its current real directory to a subdir of the recycle bin path
+# Convenience function called after we delete an entry from the DB (always starts as a file deletion): if that leaves an empty dir,
+# this func will delete it and then check the parent dir the same way, so it can trigger a cascading deletion of empty dirs up the
+# hierarchy; if the entry deletion leaves a dir with content, just finish
 ####################################################################################################################################
-def RemoveFileFromFS( del_me ):
-    try:
-        settings = session.query(Settings).first()
-        dst_dir=settings.recycle_bin_path + '/' + del_me.in_dir.in_path.path_prefix.replace('static/','') + '/' + del_me.in_dir.rel_path + '/'
-        os.makedirs( dst_dir,mode=0o777, exist_ok=True )
-        src=del_me.FullPathOnFS()
-        dst=dst_dir + '/' + del_me.name
-        os.replace( src, dst )
-    except Exception as e:
-        print( f"ERROR: Failed to remove file from filesystem - which={src}, err: {e}")
+def CleanUpDirInDB(job, d):
+    session.commit()
+    print( f"CleanUpDirInDB(): checking dir: {d.name} ({d.id})" )
+    content = session.query(Entry).join(EntryDirLink).filter(EntryDirLink.dir_eid==d.id).first()
+    if not content:
+        print( f"  Dir {d.name} - {d.id} is empty - removing it" )
+        # get an Entry from DB (in_dir is a Dir)
+        parent_dir = session.query(Entry).get(d.in_dir.eid)
+        # okay remove this empty dir
+        RemoveEmptyDirFromFS( job, d )
+        RemoveEmptyDirFromDB( job, d )
+        print( f"  Dir {d.name} is in {parent_dir.name} ({parent_dir.id}) -> check next" )
+        # check to see if removing the empty dir has left the parent dir empty
+        CleanUpDirInDB(job, parent_dir)
+    else:
+        print( f"There is content (first entry: {content.name}) in {d.name} - finished for this dir" )
+    return
+
+####################################################################################################################################
+# Convenience function to remove a file from the database - and its associated links
+# used when scanning and a file has been removed out from under PA, or when we remove duplicates
+####################################################################################################################################
+def RemoveFileFromDB(job, del_me):
+    parent_dir=del_me.in_dir
+    session.query(EntryDirLink).filter(EntryDirLink.entry_id==del_me.id).delete()
+    session.query(File).filter(File.eid==del_me.id).delete()
+    session.query(Entry).filter(Entry.id==del_me.id).delete()
+    AddLogForJob( job, f"INFO: Removing {del_me.name} from system as it is no longer on the file system")
+    CleanUpDirInDB(job, parent_dir)
     return

 ####################################################################################################################################
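The recursion in CleanUpDirInDB only ever walks upward until it reaches a parent that still has content. The same cascading idea reduced to the filesystem alone (no Entry/Dir rows) looks roughly like this; remove_empty_parents and stop_at are illustrative names, with stop_at standing in for a guard so the walk never climbs past the storage root:

import os

def remove_empty_parents(path, stop_at):
    # remove path if it is empty, then keep climbing while each parent is left empty
    path, stop_at = os.path.abspath(path), os.path.abspath(stop_at)
    while path != stop_at and not os.listdir(path):
        os.rmdir(path)                   # same call the FS-side removal makes
        path = os.path.dirname(path)     # re-check the parent on the next pass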
@@ -927,8 +957,10 @@
         parent_dir=new_dir
         part_rel_path += "/"

+    parent_dir = session.query(Entry).get(del_me.in_dir.eid)
     del_me.in_dir = new_dir
     AddLogForJob(job, f"Deleted file: {del_me.name} - (moved to {os.path.dirname(del_me.FullPathOnFS())})" )
+    CleanUpDirInDB(job, parent_dir)
     return

 ####################################################################################################################################
@@ -986,14 +1018,12 @@
     rms = session.query(Entry).filter(Entry.exists_on_fs==False,Entry.type_id!=dtype.id).all()
     rm_cnt=0
     for rm in rms:
-        RemoveFileFromDB(rm.id)
-        AddLogForJob( job, f"INFO: Removing {rm.name} from system as it is no longer on the file system")
+        RemoveFileFromDB(job, rm)
         rm_cnt+=1

     rmdirs = session.query(Entry).filter(Entry.exists_on_fs==False,Entry.type_id==1).order_by(Entry.id.desc()).all()
     for rmdir in rmdirs:
-        RemoveFileFromDB(rmdir.id)
-        AddLogForJob( job, f"INFO: Removing {rmdir.name} from system as it is no longer on the file system")
+        RemoveFileFromDB(job, rmdir)
         rm_cnt+=1

     return rm_cnt
@@ -1542,7 +1572,7 @@ def InitialValidationChecks():
         rbp_exists=1
         ptype = session.query(PathType).filter(PathType.name=='Bin').first().id
         symlink=CreateSymlink(job,ptype,path)
-        path, dirs, files = next(os.walk(path))
+        root, dirs, files = next(os.walk(path))
         if len(dirs) + len(files) > 0:
             AddLogForJob(job, "INFO: the bin path contains content, cannot process to know where original deletes were form - skipping content!" )
             AddLogForJob(job, "TODO: could be smart about what is known in the DB vs on the FS, and change below to an ERROR if it is one")
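For the InitialValidationChecks change above: next(os.walk(...)) returns one (root, dirs, files) tuple for the top level only, and unpacking into root instead of path stops the configured bin path variable from being rebound. A throwaway sketch of the emptiness check it feeds (the temp dir and file name are made up):

import os, tempfile

p = tempfile.mkdtemp()                   # stand-in for the recycle bin path
root, dirs, files = next(os.walk(p))     # first tuple = top level of p only
print(len(dirs) + len(files) > 0)        # False: freshly created dir is empty

open(os.path.join(p, "old_delete.jpg"), "w").close()
root, dirs, files = next(os.walk(p))
print(len(dirs) + len(files) > 0)        # True: bin already has content, so the job skips it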
diff --git a/tables.sql b/tables.sql
index dc33289..9681a64 100644
--- a/tables.sql
+++ b/tables.sql
@@ -121,6 +121,6 @@ insert into PERSON values ( (select nextval('PERSON_ID_SEQ')), 'mum', 'Mandy', '
 insert into PERSON values ( (select nextval('PERSON_ID_SEQ')), 'cam', 'Cameron', 'De Paoli' );
 insert into PERSON values ( (select nextval('PERSON_ID_SEQ')), 'mich', 'Michelle', 'De Paoli' );
 -- DEV:
-insert into SETTINGS ( id, import_path, storage_path, recycle_bin_path, default_refimg_model, default_scan_model, default_threshold ) values ( (select nextval('SETTINGS_ID_SEQ')), '/home/ddp/src/photoassistant/images_to_process/#c:/Users/cam/Desktop/code/python/photoassistant/photos/#/home/ddp/src/photoassistant/new_img_dir/', '/home/ddp/src/photoassistant/storage/#c:/Users/cam/Desktop/code/python/photoassistant/storage/', '/home/ddp/src/photoassistant/.pa_bin/#c:/Users/cam/Desktop/code/python/photoassistant/.pa_bin/', 2, 1, '0.55' );
+insert into SETTINGS ( id, import_path, storage_path, recycle_bin_path, default_refimg_model, default_scan_model, default_threshold ) values ( (select nextval('SETTINGS_ID_SEQ')), '/home/ddp/src/photoassistant/images_to_process/#c:/Users/cam/Desktop/code/python/photoassistant/photos/#/home/ddp/src/photoassistant/new_img_dir/', '/home/ddp/src/photoassistant/storage/#c:/Users/cam/Desktop/code/python/photoassistant/storage/', '/home/ddp/src/photoassistant/.pa_bin/', 2, 1, '0.55' );
 -- PROD:
 --insert into SETTINGS ( id, import_path, storage_path, recycle_bin_path, default_refimg_model, default_scan_model, default_threshold ) values ( (select nextval('SETTINGS_ID_SEQ')), '/export/docker/storage/Camera_uploads/', '/export/docker/storage/photos/', '/export/docker/storage/.pa_bin/', 2, 1, '0.55' );