From 76aee3a10ac418848e9c25a9611945e13e36afe4 Mon Sep 17 00:00:00 2001
From: Damien De Paoli
Date: Sat, 6 Mar 2021 17:18:11 +1100
Subject: [PATCH] okay, fix_dups page now has functioning pagination, highlights
 regex-matching "good" files as green, and just a file as yellow if we can't
 find the right one, so it easily shows where to really pay attention. Has a
 DBox-based help page, and overall just a better UI/UX

---
 TODO                | 12 +++----
 files.py            | 28 +++++++++--------
 templates/base.html | 12 +++----
 templates/dups.html | 77 ++++++++++++++++++++++++++++++---------------
 4 files changed, 77 insertions(+), 52 deletions(-)

diff --git a/TODO b/TODO
index 03be52b..6b6e2d7 100644
--- a/TODO
+++ b/TODO
@@ -1,12 +1,11 @@
 ## GENERAL
- * fix_dups, etc. need to know path so we don't guess import_path or storage_path to remove the prefix from the keep/del alerts
- * pagination in dups, needs to be a drop-down and take affect on page on change
  * SymlinkName - use it from shared everywhere, never do path_prefix by hand use this function
  * AddJobForLog can absorb DEBUGs, etc. in fact fix up logging in general
  * comment your code
  * do we need to make some funcs/code into OO?
  * scan_sp needs to be in scannow
  * need a way for page to show we are in import_path or storage_path
+ * storage_path viewing needs to be by folder / not a big grab bag of files (by default)
 
 ## DB
 Need to think about...
@@ -21,10 +20,9 @@
     ignore *thumb*
   scan storage_dir
- * need to find / remove duplicate files from inside storage_dir and itself, and in import_dir and in storage_dir
-   implications --
-     VIEWING: need to view import dir and view storage dir as separate menu items AND make it clear what you are looking at in header
-     MOVING/COPYING: need to be smart, its a file move/copy depending on file systems (if import_dir/storage_dir on same fs, we can use mv - much faster)
+ * need to find / remove duplicate files from inside storage_dir and import_dir
+   -- in fact not sure what will happen if I try this right now, I think it might sort of work, only the dup display per file won't be able to
+      use jex.path for all sets of files, only those dups in the original source of the scan
 
 -- started on some basic optimisations (commit logs every 100 logs, not each log)
  - with debugs: import = 04:11, getfiledetails== 0:35:35
@@ -33,7 +31,7 @@
 
 *** Need to use thread-safe sessions per Thread, half-assed version did not work
 
- need a manual button to restart it in the GUI,
+ need a manual button to restart a job in the GUI,
  (based on file-level optims, just run the job as new and it will optim over already done parts and continue)
 
 Future:
diff --git a/files.py b/files.py
index c90d524..df5f196 100644
--- a/files.py
+++ b/files.py
@@ -15,6 +15,7 @@ import base64
 import numpy
 import cv2
 import time
+import re
 
 ################################################################################
 # Local Class imports
@@ -271,7 +272,10 @@ def fix_dups():
     jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
     path=[jex.value for jex in jexes if jex.name == "path"][0]
     prefix = SymlinkName(path,path+'/')
-    pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
+    if 'pagesize' not in request.form:
+        pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
+    else:
+        pagesize=int(request.form['pagesize'])
     dups={}
     for row in rows:
         AddDup( prefix+'/', row, dups )
@@ -282,27 +286,25 @@ def fix_dups():
     did2=""
     str=""
     dup_cnt=1
+    preferred={}
     per_file_dups=[]
     per_path_dups=[]
     hashes=""
     overall_dup_cnt=0
     overall_dup_sets=0
     for hash in dups:
-        # more than 2 files (just ask per file)
-        if len(dups[hash]) > 2:
-            per_file_dups.append(dups[hash])
-            overall_dup_cnt += len(dups[hash])
-            overall_dup_sets += 1
-        # only 2 copies, and files are in same dir (so must be diff name, so just ask)
-        elif dups[hash][0]['d'] == dups[hash][1]['d']:
-            per_file_dups.append(dups[hash])
-            overall_dup_cnt += len(dups[hash])
-            overall_dup_sets += 1
+        # more than 2 files (just ask per file) OR
+        # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
         # content same, filename different (just ask per file)
-        elif dups[hash][0]['f'] != dups[hash][1]['f']:
+        if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
             per_file_dups.append(dups[hash])
             overall_dup_cnt += len(dups[hash])
             overall_dup_sets += 1
+            for el in dups[hash]:
+                if re.search( r'\d{4}/\d{8}', el['d']):
+                    preferred[hash] = el['id']
+                    if overall_dup_cnt<5:
+                        print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
         # by here we have only 2 files, with the same name, different path
         # (MOST COMMON, and I think we dont care per file, just per path)
         elif d1 != dups[hash][0]['d']:
@@ -326,7 +328,7 @@ def fix_dups():
         overall_dup_sets += dup_cnt
         per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
 
-    return render_template("dups.html", per_file_dups=per_file_dups, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
+    return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
 
 @app.route("/rm_dups", methods=["POST"])
 def rm_dups():
diff --git a/templates/base.html b/templates/base.html
index 177d42e..03665b4 100644
--- a/templates/base.html
+++ b/templates/base.html
@@ -11,6 +11,11 @@
+
+
+
+
+
 {% import "bootstrap/wtf.html" as wtf %}
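
The core of the files.py change above is the "preferred copy" pick: within each duplicate set, any copy that sits in a YYYY/YYYYMMDD-style dated folder is remembered in the preferred dict, and that mapping is handed to dups.html for the green/yellow highlighting described in the commit message. Below is a minimal, standalone Python sketch of just that selection step, under the assumption that each entry uses the same {'id', 'd', 'f'} shape that AddDup() builds in files.py; the helper name pick_preferred and the sample data are illustrative only, not part of the patch.

import re

# A directory that looks like YYYY/YYYYMMDD is treated as the "good" copy to keep.
# (Same pattern as the patched loop in fix_dups(); hypothetical standalone wrapper.)
PREFERRED_DIR = re.compile(r'\d{4}/\d{8}')

def pick_preferred(dups):
    """Return {hash: file_id} for each duplicate set that has a copy in a dated folder."""
    preferred = {}
    for h, files in dups.items():
        for el in files:
            if PREFERRED_DIR.search(el['d']):
                preferred[h] = el['id']   # last matching copy wins, as in the patched loop
    return preferred

if __name__ == "__main__":
    # Fake duplicate set: one copy in a dated storage folder, one in an import dump.
    dups = {
        'abc123': [
            {'id': 1, 'd': '2019/20190406', 'f': 'IMG_0001.jpg'},
            {'id': 2, 'd': 'phone_backup/camera', 'f': 'IMG_0001.jpg'},
        ],
    }
    print(pick_preferred(dups))   # -> {'abc123': 1}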