From 1499f2ca615cdce3d6944399be2b287588d66410 Mon Sep 17 00:00:00 2001
From: Damien De Paoli
Date: Thu, 11 Feb 2021 20:09:24 +1100
Subject: [PATCH] fixed BUG-21: file structure broken after rescan; also the
 beginnings of a new job that checks for duplicate files and has the back-end
 job indicate to the front-end that there are duplicates; the basic route is
 in the f/e, but not built yet

---
 BUGs                | 61 +--------------------------------------------
 TODO                |  2 ++
 files.py            | 23 +++++++++++++++++
 pa_job_manager.py   | 41 ++++++++++++++++++++++++------
 templates/base.html |  9 +++++--
 5 files changed, 66 insertions(+), 70 deletions(-)

diff --git a/BUGs b/BUGs
index b6c6243..3952233 100644
--- a/BUGs
+++ b/BUGs
@@ -1,61 +1,2 @@
-### Next: 21
+### Next: 22
 
-BUG-21: the datastructure of dir/files is actually quite broken on a real import (seems to be on second/subsequent imports) -- the code to remove deleted files is not 'seeing' all files?
-pa=# select count(edl.dir_eid), d.path_prefix from entry_dir_link edl, dir d where edl.dir_eid = d.eid and edl.dir_eid in ( select eid from dir ) group by d.path_prefix;
- count |                    path_prefix
--------+----------------------------------------------------
-   101 | static/CAM_UPLOADS/M's Phone/Camera Roll
-     1 | static/CAM_UPLOADS/M's Phone
-  2979 | static/CAM_UPLOADS/Mandy's Phone/Camera Roll
-     1 | static/CAM_UPLOADS
-   675 | static/CAM_UPLOADS/M's Galaxy A51/Camera Roll
-  3656 | static/CAM_UPLOADS/Damien's Phone/Camera Roll
-     1 | static/CAM_UPLOADS/Damien's Phone/Camera Roll/0000
-     1 | static/CAM_UPLOADS/M's Galaxy A51
-     1 | static/CAM_UPLOADS/Mandy's Phone
-     1 | static/CAM_UPLOADS/Damien's Phone
-(10 rows)
-
-pa=# select * from dir;
- eid  |                    path_prefix                     | num_files |  last_import_date
-------+----------------------------------------------------+-----------+--------------------
-    2 | static/CAM_UPLOADS/Mandy's Phone                   |         1 | 1613024867.8238187
-    3 | static/CAM_UPLOADS/Mandy's Phone/Camera Roll       |      2999 | 1613024872.385247
- 3003 | static/CAM_UPLOADS/M's Galaxy A51                  |         1 | 1613024872.387184
- 3004 | static/CAM_UPLOADS/M's Galaxy A51/Camera Roll      |      2400 | 1613024875.9811678
- 5405 | static/CAM_UPLOADS/Damien's Phone                  |         1 | 1613024875.983697
- 5406 | static/CAM_UPLOADS/Damien's Phone/Camera Roll      |      3658 | 1613024883.1730359
- 9058 | static/CAM_UPLOADS/Damien's Phone/Camera Roll/0000 |         1 | 1613024883.1779747
- 9066 | static/CAM_UPLOADS/M's Phone                       |         1 | 1613024883.1806386
- 9067 | static/CAM_UPLOADS/M's Phone/Camera Roll           |       101 | 1613024883.3877454
-    1 | static/CAM_UPLOADS                                 |      9167 | 1613024867.8217578
-
-
-############
-SO FILE counts are really broken -- a slight bit off on a couple of dirs, but take M's Galaxy A51: it's 675 in the DB while the file tree walk saw 2400 -- and an ls -lR confirms...
-
-
-checking by hand for the first file in the M51 dir:
-
-ddp@mara:~/src/photoassistant$ ls -l /export/docker/storage/photos/CAM_UPLOADS/M\'s\ Galaxy\ A51/Camera\ Roll/20190105_175219.jpg
--rw-r--r-- 1 mythtv mythtv 2.6M Dec 28 20:37 "/export/docker/storage/photos/CAM_UPLOADS/M's Galaxy A51/Camera Roll/20190105_175219.jpg"
-
-pa=# select * from entry where name = '20190105_175219.jpg';
- id  |        name         | type_id | exists_on_fs
------+---------------------+---------+--------------
- 951 | 20190105_175219.jpg |       2 | t
-(1 row)
-
-pa=# select * from entry_dir_link where entry_id = 951;
- entry_id | dir_eid
-----------+---------
-      951 |       3
-(1 row)
-
-pa=# select * from dir where eid = 3;
- eid |                 path_prefix                  | num_files | last_import_date
------+----------------------------------------------+-----------+-------------------
-   3 | static/CAM_UPLOADS/Mandy's Phone/Camera Roll |      2999 | 1613024872.385247
-
-
-so the file really is in the M51 dir, but in the DB it is in Mandy's Phone dir instead...
diff --git a/TODO b/TODO
index e5c1237..3e5207d 100644
--- a/TODO
+++ b/TODO
@@ -13,6 +13,8 @@
    - without debugs: import == 04:03, getfiledetails == 0:35:36 -- not a sig diff
    - with exifread & debug: import == 04:26
 
+ * CheckForDups() needs to allow the f/end to actually do the work, and then clear the MessageToFE() as well
+
  * try again with walk to go through loop once quickly just to add up files,
    * then start the import dir counting up / progress
diff --git a/files.py b/files.py
index 6f1508c..eac4477 100644
--- a/files.py
+++ b/files.py
@@ -159,6 +159,29 @@ def forcescan():
     st.SetMessage("force scan & rebuild data for files in: Job #{} (Click the link to follow progress)".format( job.id, job.id) )
     return render_template("base.html")
 
+@app.route("/fix_dups", methods=["GET"])
+def fix_dups():
+#    dups = list(db.session.execute( "select d1.path_prefix as path1, e1.name as fname1, d2.path_prefix as path2, e2.name as name2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id order by path1, fname1;" ))
+
+#    if len(dups) > 0:
+#        ActionForFE( job, dups, "danger", "Found duplicate(s), click here to finalise import by removing duplicates" )
+#    p1=""
+#    done=list()
+#    for dup in dups:
+#        if p1 != dup.path1:
+#            p1 = dup.path1
+#            p2 = dup.path2
+#            # this is the flip-side of a previous p1 <-> p2 dup (this p2 is a previous p1)
+#            if p2 in done:
+#                continue
+#            done.append(p1)
+#            print(f"Duplicates in: {p1} <-> {p2}")
+
+    st.SetAlert("warning")
+    st.SetMessage("Not Yet!")
+    return render_template("base.html")
+
+
 @app.route("/move_files", methods=["POST"])
 def move_files():
     st.SetAlert("warning")
diff --git a/pa_job_manager.py b/pa_job_manager.py
index cc5c7b3..541000d 100644
--- a/pa_job_manager.py
+++ b/pa_job_manager.py
@@ -233,8 +233,9 @@ def MessageToFE( job_id, alert, message ):
     msg = PA_JobManager_FE_Message( job_id=job_id, alert=alert, message=message)
     session.add(msg)
     session.commit()
+    return
 
-def ProcessImportDirs(parent_job=None):
+def ProcessImportDirs(parent_job):
     settings = session.query(Settings).first()
     if settings == None:
         raise Exception("Cannot create file data with no settings / import path is missing")
@@ -263,15 +264,23 @@ def ProcessImportDirs(parent_job=None):
         session.commit()
         if parent_job:
             AddLogForJob(parent_job, "adding job id={} {} (wait for: {})".format( job2.id, job2.name, job2.wait_for ) )
 
-        """
+        jex3=JobExtra( name="path", value=path )
-        job3=Job(start_time=now, last_update=now, name="processai", state="New", wait_for=job2.id, pa_job_state="New", current_file_num=0 )
+        job3=Job(start_time=now, last_update=now, name="checkdups", state="New", wait_for=job2.id, pa_job_state="New", current_file_num=0 )
         job3.extra.append(jex3)
         session.add(job3)
         session.commit()
         if parent_job:
             AddLogForJob(parent_job, "adding job id={} {} (wait for: {})".format( job3.id, job3.name, job3.wait_for ) )
         """
+        jex4=JobExtra( name="path", value=path )
+        job4=Job(start_time=now, last_update=now, name="processai", state="New", wait_for=job2.id, pa_job_state="New", current_file_num=0 )
+        job4.extra.append(jex4)
+        session.add(job4)
+        session.commit()
+        if parent_job:
+            AddLogForJob(parent_job, "adding job id={} {} (wait for: {})".format( job4.id, job4.name, job4.wait_for ) )
+        """
     HandleJobs()
     return
@@ -298,6 +307,8 @@ def RunJob(job):
         JobImportDir(job)
     elif job.name =="getfiledetails":
         JobGetFileDetails(job)
+    elif job.name == "checkdups":
+        CheckForDups(job)
     elif job.name == "processai":
         JobProcessAI(job)
     else:
@@ -429,6 +440,7 @@ def AddDir(job, dirname, path_prefix, in_dir):
 def AddFile(job, fname, type_str, fsize, in_dir, year, month, day, woy ):
     e=session.query(Entry).join(EntryDirLink).join(Dir).filter(Entry.name==fname,Dir.eid==in_dir.eid).first()
     if e:
+        print( f"################################################ FILE EXISTS ALREADY: {fname} -- {in_dir.path_prefix} {e}" )
         e.exists_on_fs=True
         return e
     ftype = session.query(FileType).filter(FileType.name==type_str).first()
@@ -503,7 +515,6 @@ def GetDateFromFile(file, stat):
         year, month, day, _, _, _, _, _, _ = datetime.fromtimestamp(stat.st_ctime).timetuple()
     c=date(year, month, day).isocalendar()
     woy=c[1]
-    print(f"DEL ME: year={year}, month={month}, day={day}")
     return year, month, day, woy
@@ -536,7 +547,6 @@ def JobImportDir(job):
             root=root[0:-1]
         dir=AddDir(job, os.path.basename(root), pp, parent_dir)
-        parent_dir=dir
         for basename in files:
             # commit every 100 files to see progress being made but not hammer the database
             if job.current_file_num % 100 == 0:
@@ -559,15 +569,15 @@
                 year, month, day, woy = GetDateFromFile(fname, stat)
                 e=AddFile( job, basename, type_str, fsize, dir, year, month, day, woy )
             else:
-                e=session.query(Entry).filter(Entry.name==basename).first()
+                e=session.query(Entry).join(EntryDirLink).join(Dir).filter(Entry.name==basename,Dir.eid==dir.eid).first()
                 e.exists_on_fs=True
                 if DEBUG==1: print("DEBUG: {} - {} is OLDER than {}".format( basename, stat.st_ctime, dir.last_import_date ), basename )
             job.current_file=basename
             job.current_file_num+=1
-        dir.num_files=len(files)+len(subdirs)
         dir.last_import_date = time.time()
+        parent_dir=dir
 
     job.num_files=overall_file_cnt
     job.current_file_num=overall_file_cnt
@@ -814,9 +824,24 @@ def GenVideoThumbnail(job, file):
             return None
     return thumbnail
 
+def CheckForDups(job):
+    path=[jex.value for jex in job.extra if jex.name == "path"][0]
+    path='static'+'/'+os.path.basename(path[0:-1])
+    AddLogForJob( job, f"Check for duplicates in import path: {path}" )
+    res = session.execute( "select count(e1.name) as count from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and d1.path_prefix like :pat and f1.hash = f2.hash and e1.id != e2.id", {"pat": f"%{path}%"} )
e2.id" ) + for row in res: + if row.count > 0: + MessageToFE( job.id, "danger", "Found duplicate(s), click  here  to finalise import by removing duplicates" ) + if __name__ == "__main__": print("INFO: PA job manager starting - listening on {}:{}".format( PA_JOB_MANAGER_HOST, PA_JOB_MANAGER_PORT) ) - ProcessImportDirs() + +##### have to test the the lines below (to force a scan on startup) + now=datetime.now(pytz.utc) + job=Job(start_time=now, last_update=now, name="scannow", state="New", wait_for=None, pa_job_state="New", current_file_num=0, num_files=0 ) + session.add(job) + session.commit() + HandleJobs() with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind((PA_JOB_MANAGER_HOST, PA_JOB_MANAGER_PORT)) s.listen() diff --git a/templates/base.html b/templates/base.html index 2f4404a..22487c9 100644 --- a/templates/base.html +++ b/templates/base.html @@ -110,9 +110,14 @@ {% if GetJM_Message() != None %} {% set msg=GetJM_Message() %}
- Job #{{msg.job_id}}: {{msg.message|safe}} + {% if msg.job_id %} + Job #{{msg.job_id}}: + {% endif %} + {{msg.message|safe}}
- {% set dont_print=ClearJM_Message(msg.id) %} + {% if msg.alert != "danger" %} + {% set dont_print=ClearJM_Message(msg.id) %} + {% endif %} {% endif %} {% endif %}
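
############
Follow-up sketch for the duplicate handling (not part of the patch): CheckForDups() now flags duplicates to the f/e, and the TODO notes that the f/end still "needs to actually do the work". Below is a minimal sketch of what that work could look like, assuming the entry / file / dir / entry_dir_link schema and the SQLAlchemy 1.x-style session.execute(sql, params) calls used above; find_duplicate_pairs / remove_duplicates and the keep-the-lower-id-copy policy are illustrative assumptions, not the committed design.

# sketch only -- function names and the delete policy are assumptions;
# the schema (entry, file, dir, entry_dir_link, file.hash) is from the patch
DUP_SQL = """
select e1.id as id1, d1.path_prefix as path1, e1.name as fname1,
       e2.id as id2, d2.path_prefix as path2, e2.name as fname2
from entry e1, file f1, dir d1, entry_dir_link edl1,
     entry e2, file f2, dir d2, entry_dir_link edl2
where e1.id = f1.eid and e2.id = f2.eid
  and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id
  and d2.eid = edl2.dir_eid and edl2.entry_id = e2.id
  and f1.hash = f2.hash
  and e1.id < e2.id            -- '<' rather than '!=': each pair is reported once
  and d1.path_prefix like :pat
order by path1, fname1
"""

def find_duplicate_pairs(session, path):
    # bound :pat parameter rather than f-string interpolation, so path
    # prefixes containing quotes (e.g. "M's Phone") cannot break the SQL
    return list(session.execute(DUP_SQL, {"pat": "%" + path + "%"}))

def remove_duplicates(session, path, dry_run=True):
    # keeps the lower-id entry of each pair and deletes the other's DB rows;
    # deleting the duplicate file from disk would be a separate step
    for dup in find_duplicate_pairs(session, path):
        print(f"dup: {dup.path1}/{dup.fname1} <-> {dup.path2}/{dup.fname2}")
        if not dry_run:
            session.execute("delete from entry_dir_link where entry_id = :id", {"id": dup.id2})
            session.execute("delete from file where eid = :id", {"id": dup.id2})
            session.execute("delete from entry where id = :id", {"id": dup.id2})
    if not dry_run:
        session.commit()

Because e1.id < e2.id reports each duplicate pair exactly once, the p1/p2 "done" bookkeeping in the commented-out fix_dups route becomes unnecessary; /fix_dups could simply call remove_duplicates() and then clear the "danger" message, as the TODO suggests.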