First pass at using the Duplicates class, rather than fix_dups doing all the dup work. The HTML is still shown 'preferred' only, and does not yet know there are separate preferred files and preferred dirs.

This commit is contained in:
2021-03-15 20:36:10 +11:00
parent 046c512e6b
commit 08dc646371
2 changed files with 110 additions and 146 deletions

dups.py

@@ -24,6 +24,7 @@ from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName
################################################################################
# DupRow class is a simple 'struct' to keep data per duplicate file, just to
# avoid intermixing python lists/dicts and to be able to consistently use
# dot-notation for the fields
@@ -35,7 +36,28 @@ class DupRow:
self.d=dir
self.did=did
self.id=fid
return
def __repr__(self):
return f"DupRow( id: {self.id}, did: {self.did} )"
################################################################################
# DupPathRow class is a simple 'struct' to keep data per pair of paths that hold
# duplicate files, again to avoid intermixing python lists/dicts and to be able to
# consistently use dot-notation for the fields
class DupPathRow:
def __init__(self, count, d1, d2, did1, did2, hashes ):
self.count=count
self.d1=d1
self.d2=d2
self.did1=did1
self.did2=did2
self.hashes=hashes
def __repr__(self):
return f"DupPathRow( did1: {self.did1}, did2: {self.did2} )"
################################################################################
# A single instance of the Duplicates class processes all the 'duplicate'
# data from the Database, and parses it into more usable data
# structures. This is also needed because the database content shows duplicates
@@ -67,10 +89,13 @@ class Duplicates:
self.dups_to_process={}
self.per_file_dups=[]
self.per_path_dups=[]
self.preferred={}
self.preferred_file={}
self.preferred_path={}
self.all_paths=[]
self.storage_paths=[]
self.import_paths=[]
self.overall_dup_cnt=0
self.overall_dup_sets=0
# pull apart the storage path Setting, and make array of each for use in TrimmedPath()
settings=Settings.query.first()
@@ -108,12 +133,10 @@ class Duplicates:
return True
return False
# this stores this object into the keep from same path list (only ever 1)
# this stores this object into the keep from same path list (DDP: could there be more than 1?)
def KeepInSameDups( self, obj ):
if obj.h not in self.ip_to_sp_dups_keep:
self.ip_to_sp_dups_keep[obj.h]= obj
else:
print( f"DDP: we need to cater for this - 2 files to keep in the storage path, if they are different, then pull these out of here and put them in the in_same_dup list to manually process" )
return
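A standalone restatement of the keep rule with a plain dict (not the real class): the first object seen for a hash becomes the keeper, and a second candidate for the same hash is only reported, matching the DDP note above.
keep = {}
def keep_in_same_dups( obj ):
    if obj["h"] not in keep:
        keep[obj["h"]] = obj
    else:
        print( f"DDP: second keeper for {obj['h']} - would need manual handling" )

keep_in_same_dups( {"h": "abc123", "f": "a.jpg"} )   # stored as the keeper
keep_in_same_dups( {"h": "abc123", "f": "b.jpg"} )   # only prints the warning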
# this stores this object into the Delete from same path list (if it is not
@@ -180,98 +203,95 @@ class Duplicates:
self.dups_to_process[row.hash].append( dr2 )
return
def SecondPass(self):
print("################################## second pass starting")
d1=""
d2=""
did1=""
did2=""
str=""
dup_cnt=1
hashes=""
for hash in self.dups_to_process:
if self.overall_dup_cnt<2:
print(f"process {hash}")
# more than 2 files (just ask per file) OR
# only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
# content same, filename different (just ask per file)
if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
self.per_file_dups.append(self.dups_to_process[hash])
self.overall_dup_cnt += len(self.dups_to_process[hash])
self.overall_dup_sets += 1
if self.overall_dup_cnt<2:
print( f"process as len(el)={len(self.dups_to_process[hash])}" )
for el in self.dups_to_process[hash]:
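# a dir matching four digits / eight digits (presumably the YYYY/YYYYMMDD storage layout)
# marks the copy we prefer to keep for this hash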
if re.search( r'\d{4}/\d{8}', el.d):
self.preferred_file[hash] = el.id
if self.overall_dup_cnt<25:
print( f"{self.dups_to_process[hash]} <- keeping {el.id} -- {self.preferred_file[hash]}" )
# by here we have only 2 files, with the same name, different path
# (MOST COMMON, and I think we don't care per file, just per path)
elif d1 != self.dups_to_process[hash][0].d:
if d1 != '':
self.overall_dup_cnt += dup_cnt
self.overall_dup_sets += 1
self.per_path_dups.append( DupPathRow( dup_cnt, d1, d2, did1, did2, hashes ) )
if re.search( r'\d{4}/\d{8}', d1):
self.preferred_path[did1]=1
if re.search( r'\d{4}/\d{8}', d2):
self.preferred_path[did2]=1
dup_cnt=1
d1 = self.dups_to_process[hash][0].d
d2 = self.dups_to_process[hash][1].d
did1 = self.dups_to_process[hash][0].did
did2 = self.dups_to_process[hash][1].did
str=f"duplicates found in {d1} and {d2}"
hashes = f"{hash},"
else:
dup_cnt += 1
hashes += f"{hash},"
if d1 != '':
self.overall_dup_cnt += dup_cnt
self.overall_dup_sets += dup_cnt
self.per_path_dups.append( DupPathRow( dup_cnt, d1, d2, did1, did2, hashes ) )
print("#################### second pass FINISHED")
return
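The per-file vs per-path split in SecondPass can be read as a single predicate; a standalone restatement with plain dicts (not the real DupRow objects):
# ask per *file* when: more than 2 copies, OR the two filenames differ, OR both copies
# are in the same directory; otherwise (exactly 2 copies, same name, different dirs)
# the set is rolled up per *path* pair
def ask_per_file( copies ):
    return ( len(copies) > 2
             or copies[0]["f"] != copies[1]["f"]
             or copies[0]["d"] == copies[1]["d"] )

print( ask_per_file( [ {"f": "a.jpg", "d": "x"}, {"f": "a.jpg", "d": "y"} ] ) )   # False -> per-path
print( ask_per_file( [ {"f": "a.jpg", "d": "x"}, {"f": "b.jpg", "d": "x"} ] ) )   # True  -> per-file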
# quick debugger to see the data in the data structure
def Dump(self):
if len(self.ip_to_sp_dups_keep) > 0:
print( "############ Files that are in both Import and Storage Paths ###########")
cnt=0
for h in self.ip_to_sp_dups_keep:
cnt +=1
if len(self.ip_to_sp_dups_del[h])>2:
print( f"hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
for d in self.ip_to_sp_dups_del[h]:
print( f"Del: {d}" )
print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
print( f"{len(self.ip_to_sp_dups_keep)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
if len(self.dups_to_process) > 0:
print( "############ Duplicate Files that are needing to be futher processed ###########")
cnt=0
for h in self.dups_to_process:
cnt +=1
if len(self.dups_to_process[h])>2:
print( f"hash={h}, keep 1 of these: ", end='')
for d in self.dups_to_process[h]:
print( f"{d.id}, ", end='' )
print ("")
print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
print( f"hash={h}, keep 1 of these: ", end='')
for d in self.dups_to_process[h]:
print( f"{d.id}, ", end='' )
print ("")
print( f"{len(self.dups_to_process)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
if len(self.preferred_file) > 0:
for h in self.preferred_file:
print( f"hash={h}, keep this one: {self.preferred_file[h]} from ", end='' )
for d in self.dups_to_process[h]:
print( f"{d.id}, ", end='' )
print ("")
print( f"{len(self.preferred_file)} duplicate files we will keep as they match the regexp" )
if len(self.per_path_dups) > 0:
for pair in self.per_path_dups:
print( f"{pair.count} dups in dir1: {pair.did1} dir2: {pair.did2}" )
if pair.did1 in self.preferred_path:
print("Keep dir1")
if pair.did2 in self.preferred_path:
print("Keep dir2")
print( f"{len(self.per_path_dups)} duplicate files in per path dups" )
return
"""
@app.route("/fix_dups", methods=["POST"])
def fix_dups():
rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
if rows.returns_rows == False:
st.SetAlert("success")
st.SetMessage(f"Err, no dups - should now clear the FE 'danger' message?")
return render_template("base.html")
jexes = JobExtra.query.join(Job).join(PA_JobManager_Message).filter(PA_JobManager_Message.id==request.form['fe_msg_id']).all()
path=[jex.value for jex in jexes if jex.name == "path"][0]
prefix = SymlinkName(path,path+'/')
if 'pagesize' not in request.form:
pagesize=int([jex.value for jex in jexes if jex.name == "pagesize"][0])
else:
pagesize=int(request.form['pagesize'])
dups={}
for row in rows:
AddDup( prefix+'/', row, dups )
d1=""
d2=""
did1=""
did2=""
str=""
dup_cnt=1
preferred={}
per_file_dups=[]
per_path_dups=[]
hashes=""
overall_dup_cnt=0
overall_dup_sets=0
for hash in dups:
# more than 2 files (just ask per file) OR
# only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
# content same, filename different (just ask per file)
if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
per_file_dups.append(dups[hash])
overall_dup_cnt += len(dups[hash])
overall_dup_sets += 1
for el in dups[hash]:
if re.search( '\d{4}/\d{8}', el['d']):
preferred[hash] = el['id']
if overall_dup_cnt<5:
print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
# by here we have only 2 files, with the same name, different path
# (MOST COMMON, and I think we dont care per file, just per path)
elif d1 != dups[hash][0]['d']:
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += 1
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
dup_cnt=1
d1 = dups[hash][0]['d']
d2 = dups[hash][1]['d']
did1 = dups[hash][0]['did']
did2 = dups[hash][1]['did']
str=f"duplicates found in {d1} and {d2}"
hashes = f"{hash},"
else:
dup_cnt += 1
hashes += f"{hash},"
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += dup_cnt
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
"""


@@ -281,66 +281,10 @@ def fix_dups():
for row in rows:
D.AddDup( row )
print( D.Dump() )
D.SecondPass()
# print( D.Dump() )
d1=""
d2=""
did1=""
did2=""
str=""
dup_cnt=1
preferred={}
per_file_dups=[]
per_path_dups=[]
hashes=""
overall_dup_cnt=0
overall_dup_sets=0
return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
"""
dups={}
for row in rows:
AddDup( prefix+'/', row, dups )
for hash in dups:
# more than 2 files (just ask per file) OR
# only 2 copies, and files are in same dir (so must be diff name, so just ask) OR
# content same, filename different (just ask per file)
if (len(dups[hash]) > 2) or (dups[hash][0]['d'] == dups[hash][1]['d']) or (dups[hash][0]['f'] != dups[hash][1]['f']):
per_file_dups.append(dups[hash])
overall_dup_cnt += len(dups[hash])
overall_dup_sets += 1
for el in dups[hash]:
if re.search( '\d{4}/\d{8}', el['d']):
preferred[hash] = el['id']
if overall_dup_cnt<5:
print( f"{dups[hash]} <- keeping {el['d']} -- {preferred[hash]}" )
# by here we have only 2 files, with the same name, different path
# (MOST COMMON, and I think we dont care per file, just per path)
elif d1 != dups[hash][0]['d']:
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += 1
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
dup_cnt=1
d1 = dups[hash][0]['d']
d2 = dups[hash][1]['d']
did1 = dups[hash][0]['did']
did2 = dups[hash][1]['did']
str=f"duplicates found in {d1} and {d2}"
hashes = f"{hash},"
else:
dup_cnt += 1
hashes += f"{hash},"
if d1 != '':
overall_dup_cnt += dup_cnt
overall_dup_sets += dup_cnt
per_path_dups.append({'count': dup_cnt, 'd1': d1, 'd2': d2, 'did1': did1, 'did2': did2, 'hashes' : hashes })
return render_template("dups.html", per_file_dups=per_file_dups, preferred=preferred, per_path_dups=per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=overall_dup_cnt, overall_dup_sets=overall_dup_sets, pagesize=pagesize )
"""
return render_template("dups.html", per_file_dups=D.per_file_dups, preferred=D.preferred_file, per_path_dups=D.per_path_dups, fe_msg_id=request.form['fe_msg_id'], overall_dup_cnt=D.overall_dup_cnt, overall_dup_sets=D.overall_dup_sets, pagesize=pagesize )
@app.route("/rm_dups", methods=["POST"])
def rm_dups():