diff --git a/dups.py b/dups.py
index 93d2fa9..774272a 100644
--- a/dups.py
+++ b/dups.py
@@ -24,6 +24,7 @@
 from job import Job, JobExtra, Joblog, NewJob
 from settings import Settings
 from shared import SymlinkName
+################################################################################
 # DupRow class is a simple 'struct' to keep data per duplicate file / just to
 # avoid using python list/dicts intermixed, and be able to consistently use
 # dot-notation of fields
@@ -35,7 +36,28 @@ class DupRow:
         self.d=dir
         self.did=did
         self.id=fid
-
+        return
+
+    def __repr__(self):
+        return f"DupRow( id: {self.id}, did: {self.did} )"
+
+################################################################################
+# DupPathRow class is a simple 'struct' to keep data per pair of paths that hold
+# duplicate files, just to avoid using python list/dicts intermixed, and be able
+# to consistently use dot-notation of fields
+class DupPathRow:
+    def __init__(self, count, d1, d2, did1, did2, hashes ):
+        self.count=count
+        self.d1=d1
+        self.d2=d2
+        self.did1=did1
+        self.did2=did2
+        self.hashes=hashes
+
+    def __repr__(self):
+        return f"DupPathRow( did1: {self.did1}, did2: {self.did2} )"
+
+################################################################################
 # Duplicates class is used with one instance/object to process all the
 # 'duplicate' data from the Database, and parse it into more usable data
 # structures. This is needed also, as the database content shows duplicates
@@ -67,10 +89,13 @@ class Duplicates:
         self.dups_to_process={}
         self.per_file_dups=[]
         self.per_path_dups=[]
-        self.preferred={}
+        self.preferred_file={}
+        self.preferred_path={}
         self.all_paths=[]
         self.storage_paths=[]
         self.import_paths=[]
+        self.overall_dup_cnt=0
+        self.overall_dup_sets=0
 
         # pull apart the storage path Setting, and make array of each for use in TrimmedPath()
         settings=Settings.query.first()
@@ -108,12 +133,10 @@ class Duplicates:
                 return True
         return False
 
-    # this stores this object into the keep from same path list (only ever 1)
+    # this stores this object into the keep from same path list (DDP: could there be more than 1?)
    def KeepInSameDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_keep:
             self.ip_to_sp_dups_keep[obj.h]= obj
-        else:
-            print( f"DDP: we need to cater for this - 2 files to keep in the storage path, if they are different, then pull these out of here and put them in the in_same_dup list to manually process" )
         return
 
     # this stores this object into the Delete from same path list (if it is not
@@ -180,98 +203,95 @@
                 self.dups_to_process[row.hash].append( dr2 )
         return
 
+    def SecondPass(self):
+        print("################################## second pass starting")
+        d1=""
+        d2=""
+        did1=""
+        did2=""
+        msg=""
+        dup_cnt=1
+        hashes=""
+        for hash in self.dups_to_process:
+            if self.overall_dup_cnt<2:
+                print(f"process {hash}")
+            # more than 2 files (just ask per file) OR
+            # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
+            # content same, filename different (just ask per file)
+            if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
+                self.per_file_dups.append(self.dups_to_process[hash])
+                self.overall_dup_cnt += len(self.dups_to_process[hash])
+                self.overall_dup_sets += 1
+                if self.overall_dup_cnt<2:
+                    print( f"process as len(el)={len(self.dups_to_process[hash])}" )
+                for el in self.dups_to_process[hash]:
+                    if re.search( r'\d{4}/\d{8}', el.d):
+                        self.preferred_file[hash] = el.id
+                        if self.overall_dup_cnt<25:
+                            print( f"{self.dups_to_process[hash]} <- keeping {el.id} -- {self.preferred_file[hash]}" )
+            # by here we have only 2 files, with the same name, different path
+            # (MOST COMMON, and I think we dont care per file, just per path)
+            elif d1 != self.dups_to_process[hash][0].d:
+                if d1 != '':
+                    self.overall_dup_cnt += dup_cnt
+                    self.overall_dup_sets += 1
+                    self.per_path_dups.append( DupPathRow( dup_cnt, d1, d2, did1, did2, hashes ) )
+                    if re.search( r'\d{4}/\d{8}', d1):
+                        self.preferred_path[did1]=1
+                    if re.search( r'\d{4}/\d{8}', d2):
+                        self.preferred_path[did2]=1
+                dup_cnt=1
+                d1 = self.dups_to_process[hash][0].d
+                d2 = self.dups_to_process[hash][1].d
+                did1 = self.dups_to_process[hash][0].did
+                did2 = self.dups_to_process[hash][1].did
+                msg=f"duplicates found in {d1} and {d2}"
+                hashes = f"{hash},"
+            else:
+                dup_cnt += 1
+                hashes += f"{hash},"
+
+        if d1 != '':
+            self.overall_dup_cnt += dup_cnt
+            self.overall_dup_sets += 1    # the final, unflushed path pair is one more set (matches the += 1 inside the loop)
+            self.per_path_dups.append( DupPathRow( dup_cnt, d1, d2, did1, did2, hashes ) )
+        print("#################### second pass FINISHED")
+        return
+
     # quick debugger to see the data in the data structure
     def Dump(self):
         if len(self.ip_to_sp_dups_keep) > 0:
             print( "############ Files that are in both Import and Storage Paths ###########")
-            cnt=0
             for h in self.ip_to_sp_dups_keep:
-                cnt +=1
                 if len(self.ip_to_sp_dups_del[h])>2:
                     print( f"hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
                     for d in self.ip_to_sp_dups_del[h]:
                         print( f"Del: {d}" )
-            print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
+            print( f"{len(self.ip_to_sp_dups_keep)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
+
         if len(self.dups_to_process) > 0:
             print( "############ Duplicate Files that are needing to be futher processed ###########")
-            cnt=0
             for h in self.dups_to_process:
-                cnt +=1
-                if len(self.dups_to_process[h])>2:
-                    print( f"hash={h}, keep 1 of these: ", end='')
-                    for d in self.dups_to_process[h]:
-                        print( f"{d.id}, ", end='' )
-                    print ("")
-            print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
+                print( f"hash={h}, keep 1 of these: ", end='')
+                for d in self.dups_to_process[h]:
+                    print( f"{d.id}, ", end='' )
+                print ("")
+            print( f"{len(self.dups_to_process)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
+
+        if len(self.preferred_file) > 0:
+            for h in self.preferred_file:
+                print( f"hash={h}, keep this one: {self.preferred_file[h]} from ", end='' )
+                for d in self.dups_to_process[h]:
+                    print( f"{d.id}, ", end='' )
+                print ("")
+            print( f"{len(self.preferred_file)} duplicate files we will keep as they match the regexp" )
+
+        if len(self.per_path_dups) > 0:
+            for pair in self.per_path_dups:
+                print( f"{pair.count} dups in dir1: {pair.did1} dir2: {pair.did2}" )
+                if pair.did1 in self.preferred_path:
+                    print("Keep dir1")
+                if pair.did2 in self.preferred_path:
+                    print("Keep dir2")
+            print( f"{len(self.per_path_dups)} duplicate files in per path dups" )
         return
-
-
-"""
-@app.route("/fix_dups", methods=["POST"])
-def fix_dups():
-    rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
-
-    if rows.returns_rows == False:
-        st.SetAlert("success")
-        st.SetMessage(f"Err, no dups - should now clear the FE 'danger' message?")
-        return render_template("base.html")
-
-    jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
-    path=[jex.value for jex in jexes if jex.name == "path"][0]
-    prefix = SymlinkName(path,path+'/')
-    if 'pagesize' not in request.form:
-        pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
-    else:
-        pagesize=int(request.form['pagesize'])
-    dups={}
-    for row in rows:
-        AddDup( prefix+'/', row, dups )
-
-    d1=""
-    d2=""
-    did1=""
-    did2=""
-    str=""
-    dup_cnt=1
-    preferred={}
-    per_file_dups=[]
-    per_path_dups=[]
-    hashes=""
-    overall_dup_cnt=0
-    overall_dup_sets=0
-    for hash in dups:
-        # more than 2 files (just ask per file) OR
-        # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
-        # content same, filename different (just ask per file)
-        if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
-            per_file_dups.append(dups[hash])
-            overall_dup_cnt += len(dups[hash])
-            overall_dup_sets += 1
-            for el in dups[hash]:
-                if re.search( '\d{4}/\d{8}', el['d']):
-                    preferred[hash] = el['id']
-                    if overall_dup_cnt<5:
-                        print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
-        # by here we have only 2 files, with the same name, different path
-        # (MOST COMMON, and I think we dont care per file, just per path)
-        elif d1 != dups[hash][0]['d']:
-            if d1 != '':
-                overall_dup_cnt += dup_cnt
-                overall_dup_sets += 1
-                per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
-            dup_cnt=1
-            d1 = dups[hash][0]['d']
-            d2 = dups[hash][1]['d']
-            did1 = dups[hash][0]['did']
-            did2 = dups[hash][1]['did']
-            str=f"duplicates found in {d1} and {d2}"
-            hashes = f"{hash},"
-        else:
-            dup_cnt += 1
-            hashes += f"{hash},"
-
-    if d1 != '':
-        overall_dup_cnt += dup_cnt
-        overall_dup_sets += dup_cnt
-        per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
-"""
diff --git a/files.py b/files.py
index 60b16c5..631c3d1 100644
--- a/files.py
+++ b/files.py
@@ -281,66 +281,10 @@ def fix_dups():
     for row in rows:
         D.AddDup( row )
 
-    print( D.Dump() )
+    D.SecondPass()
+#    print( D.Dump() )
 
-    d1=""
-    d2=""
-    did1=""
-    did2=""
-    str=""
-    dup_cnt=1
-    preferred={}
-    per_file_dups=[]
-    per_path_dups=[]
-    hashes=""
-    overall_dup_cnt=0
-    overall_dup_sets=0
-
-    return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
-
-"""
-    dups={}
-    for row in rows:
-        AddDup( prefix+'/', row, dups )
-    for hash in dups:
-        # more than 2 files (just ask per file) OR
-        # only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
-        # content same, filename different (just ask per file)
-        if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
-            per_file_dups.append(dups[hash])
-            overall_dup_cnt += len(dups[hash])
-            overall_dup_sets += 1
-            for el in dups[hash]:
-                if re.search( '\d{4}/\d{8}', el['d']):
-                    preferred[hash] = el['id']
-                    if overall_dup_cnt<5:
-                        print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
-        # by here we have only 2 files, with the same name, different path
-        # (MOST COMMON, and I think we dont care per file, just per path)
-        elif d1 != dups[hash][0]['d']:
-            if d1 != '':
-                overall_dup_cnt += dup_cnt
-                overall_dup_sets += 1
-                per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
-            dup_cnt=1
-            d1 = dups[hash][0]['d']
-            d2 = dups[hash][1]['d']
-            did1 = dups[hash][0]['did']
-            did2 = dups[hash][1]['did']
-            str=f"duplicates found in {d1} and {d2}"
-            hashes = f"{hash},"
-        else:
-            dup_cnt += 1
-            hashes += f"{hash},"
-
-    if d1 != '':
-        overall_dup_cnt += dup_cnt
-        overall_dup_sets += dup_cnt
-        per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
-
-    return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
-
-"""
+    return render_template("dups.html", per_file_dups=D.per_file_dups, preferred=D.preferred_file, per_path_dups=D.per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=D.overall_dup_cnt, overall_dup_sets=D.overall_dup_sets, pagesize=pagesize )
 
 @app.route("/rm_dups", methods=["POST"])
 def rm_dups():
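
Note, separate from the patch above: the per-hash classification rules that SecondPass() applies are easy to sanity-check in isolation. The sketch below is an illustrative, self-contained approximation only; the DupRow namedtuple, the classify() helper and the sample data are hypothetical stand-ins, not the application's classes or database rows.

# sanity_check_dups.py -- standalone sketch, not part of dups.py
import re
from collections import namedtuple

# Stand-in for the real DupRow: hash, filename, directory, dir id, file id
DupRow = namedtuple("DupRow", "h f d did id")

# Same pattern SecondPass uses to prefer copies stored under YYYY/YYYYMMDD dirs
DATED_DIR = re.compile(r'\d{4}/\d{8}')

def classify(dups_by_hash):
    per_file, per_path, preferred_file = [], {}, {}
    for h, rows in dups_by_hash.items():
        more_than_two = len(rows) > 2
        same_dir      = rows[0].d == rows[1].d
        diff_name     = rows[0].f != rows[1].f
        if more_than_two or same_dir or diff_name:
            # handled per file; remember any copy already sitting in a dated dir
            per_file.append(rows)
            for r in rows:
                if DATED_DIR.search(r.d):
                    preferred_file[h] = r.id
        else:
            # exactly 2 copies, same name, different dirs: group by the dir pair
            per_path.setdefault((rows[0].did, rows[1].did), []).append(h)
    return per_file, per_path, preferred_file

if __name__ == "__main__":
    sample = {
        "aa11": [DupRow("aa11", "x.jpg", "import/misc",   7, 1),
                 DupRow("aa11", "x.jpg", "2019/20190409", 9, 2)],
        "bb22": [DupRow("bb22", "a.jpg", "import/misc",   7, 3),
                 DupRow("bb22", "a.jpg", "import/other",  8, 4),
                 DupRow("bb22", "a.jpg", "2019/20190409", 9, 5)],
    }
    per_file, per_path, preferred = classify(sample)
    print(per_file)    # bb22's three copies -> handled per file
    print(per_path)    # {(7, 9): ['aa11']}  -> one directory pair
    print(preferred)   # {'bb22': 5}         -> the copy in 2019/20190409

One deliberate difference: the patch relies on the query's ORDER BY so that hashes from the same directory pair arrive consecutively and get flushed into one DupPathRow, whereas the sketch groups by the (did1, did2) key, which gives the same per-path grouping without depending on row order.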