From 76aee3a10ac418848e9c25a9611945e13e36afe4 Mon Sep 17 00:00:00 2001
From: Damien De Paoli
Date: Sat, 6 Mar 2021 17:18:11 +1100
Subject: [PATCH] okay, fix_dups page now has functioning pagination, highlights
 regex-matching "good" files as green, and just a file as yellow if we can't
 find the right one, so it easily shows where to really pay attention. Has a
 DBox-based help page, and overall just a better UI/UX

---
 TODO                | 12 +++----
 files.py            | 28 +++++++++--------
 templates/base.html | 12 +++----
 templates/dups.html | 77 ++++++++++++++++++++++++++++++---------------
 4 files changed, 77 insertions(+), 52 deletions(-)

diff --git a/TODO b/TODO
index 03be52b..6b6e2d7 100644
--- a/TODO
+++ b/TODO
@@ -1,12 +1,11 @@
 ## GENERAL
- * fix_dups, etc. need to know path so we don't guess import_path or storage_path to remove the prefix from the keep/del alerts
- * pagination in dups, needs to be a drop-down and take affect on page on change
  * SymlinkName - use it from shared everywhere, never do path_prefix by hand use this function
  * AddJobForLog can absorb DEBUGs, etc. in fact fix up logging in general
  * comment your code
  * do we need to make some funcs/code into OO?
  * scan_sp needs to be in scannow
  * need a way for page to show we are in import_path or storage_path
+ * storage_path viewing needs to be by folder / not a big grab bag of files (by default)
 
 ## DB
 Need to think about...
@@ -21,10 +20,9 @@
     ignore *thumb*
   scan storage_dir
- * need to find / remove duplicate files from inside storage_dir and itself, and in import_dir and in storage_dir
-   implications --
-     VIEWING: need to view import dir and view storage dir as separate menu items AND make it clear what you are looking at in header
-     MOVING/COPYING: need to be smart, its a file move/copy depending on file systems (if import_dir/storage_dir on same fs, we can use mv - much faster)
+ * need to find / remove duplicate files from inside storage_dir and import_dir
+   -- in fact not sure what will happen if I try this right now, I think it might sort of work, only the dup display per file won't be able to
+      use jex.path for all sets of files, only those dups in the original source of the scan
 
 -- started on some basic optimisations (commit logs every 100 logs, not each log)
  - with debugs: import = 04:11, getfiledetails== 0:35:35
@@ -33,7 +31,7 @@
 
 *** Need to use thread-safe sessions per Thread, half-assed version did not work
 
- need a manual button to restart it in the GUI,
+ need a manual button to restart a job in the GUI,
  (based on file-level optims, just run the job as new and it will optim over already done parts and continue)
 
 Future:
diff --git a/files.py b/files.py
index c90d524..df5f196 100644
--- a/files.py
+++ b/files.py
@@ -15,6 +15,7 @@ import base64
 import numpy
 import cv2
 import time
+import re
 
 ################################################################################
 # Local Class imports
@@ -271,7 +272,10 @@ def fix_dups():
     jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
     path=[jex.value for jex in jexes if jex.name == "path"][0]
     prefix = SymlinkName(path,path+'/')
-    pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
+    if 'pagesize' not in request.form:
+        pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
+    else:
+        pagesize=int(request.form['pagesize'])
     dups={}
     for row in rows:
         AddDup( prefix+'/', row, dups )
@@ -282,27 +286,25 @@ def fix_dups():
     did2=""
     str=""
     dup_cnt=1
+    preferred={}
     per_file_dups=[]
     per_path_dups=[]
     hashes=""
     overall_dup_cnt=0
     overall_dup_sets=0
     for hash in dups:
-        # more than 2 files (just ask per file)
-        if len(dups[hash]) > 2:
-            per_file_dups.append(dups[hash])
-            overall_dup_cnt += len(dups[hash])
-            overall_dup_sets += 1
-        # only 2 copies, and files are in same dir (so must be diff name, so just ask)
-        elif dups[hash][0]['d'] == dups[hash][1]['d']:
-            per_file_dups.append(dups[hash])
-            overall_dup_cnt += len(dups[hash])
-            overall_dup_sets += 1
+        # more than 2 files (just ask per file) OR
+        # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
         # content same, filename different (just ask per file)
-        elif dups[hash][0]['f'] != dups[hash][1]['f']:
+        if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
             per_file_dups.append(dups[hash])
             overall_dup_cnt += len(dups[hash])
             overall_dup_sets += 1
+            for el in dups[hash]:
+                if re.search( r'\d{4}/\d{8}', el['d']):
+                    preferred[hash] = el['id']
+                    if overall_dup_cnt<5:
+                        print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
         # by here we have only 2 files, with the same name, different path
         # (MOST COMMON, and I think we dont care per file, just per path)
         elif d1 != dups[hash][0]['d']:
@@ -326,7 +328,7 @@ def fix_dups():
         overall_dup_sets += dup_cnt
         per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
 
-    return render_template("dups.html", per_file_dups=per_file_dups, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
+    return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
 
 @app.route("/rm_dups", methods=["POST"])
 def rm_dups():
diff --git a/templates/base.html b/templates/base.html
index 177d42e..03665b4 100644
--- a/templates/base.html
+++ b/templates/base.html
@@ -11,6 +11,11 @@
+
+
+
+
+
 {% import "bootstrap/wtf.html" as wtf %}
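
The core of the files.py change above is the "preferred copy" pick: within each duplicate set, any copy that sits in a YYYY/YYYYMMDD-style dated folder is remembered in the preferred dict, and that mapping is handed to dups.html for the green/yellow highlighting described in the commit message. Below is a minimal, standalone Python sketch of just that selection step, under the assumption that each entry uses the same {'id', 'd', 'f'} shape that AddDup() builds in files.py; the helper name pick_preferred and the sample data are illustrative only, not part of the patch.

import re

# A directory that looks like YYYY/YYYYMMDD is treated as the "good" copy to keep.
# (Same pattern as the patched loop in fix_dups(); hypothetical standalone wrapper.)
PREFERRED_DIR = re.compile(r'\d{4}/\d{8}')

def pick_preferred(dups):
    """Return {hash: file_id} for each duplicate set that has a copy in a dated folder."""
    preferred = {}
    for h, files in dups.items():
        for el in files:
            if PREFERRED_DIR.search(el['d']):
                preferred[h] = el['id']   # last matching copy wins, as in the patched loop
    return preferred

if __name__ == "__main__":
    # Fake duplicate set: one copy in a dated storage folder, one in an import dump.
    dups = {
        'abc123': [
            {'id': 1, 'd': '2019/20190406', 'f': 'IMG_0001.jpg'},
            {'id': 2, 'd': 'phone_backup/camera', 'f': 'IMG_0001.jpg'},
        ],
    }
    print(pick_preferred(dups))   # -> {'abc123': 1}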