Updated dups.py to add a DupRow class that stores duplicate-row data so dot-notation access works consistently in the Duplicates class. Also moved attribute initialization to the top of __init__ — partly for consistency, but also because DupRow requires it (class-level attributes would be shared across all DupRow instances, which would not work). Broke the logic into a couple more functions for readability, and restored the first-pass duplicate-row creation in AddDup — that method still needs a better name.
This commit is contained in:
122
dups.py
122
dups.py
@@ -24,18 +24,28 @@ from job import Job, JobExtra, Joblog, NewJob
|
|||||||
from settings import Settings
|
from settings import Settings
|
||||||
from shared import SymlinkName
|
from shared import SymlinkName
|
||||||
|
|
||||||
|
class DupRow:
|
||||||
|
def __init__(self, hash, file, dir, did, fid):
|
||||||
|
### DupRow Attributes -- note, simple class, no methods ###
|
||||||
|
self.h=hash
|
||||||
|
self.f=file
|
||||||
|
self.d=dir
|
||||||
|
self.did=did
|
||||||
|
self.id=fid
|
||||||
|
|
||||||
class Duplicates:
|
class Duplicates:
|
||||||
ip_to_sp_dups_keep={}
|
|
||||||
ip_to_sp_dups_del={}
|
|
||||||
in_same_dups={}
|
|
||||||
per_file_dups=[]
|
|
||||||
per_path_dups=[]
|
|
||||||
preferred={}
|
|
||||||
all_paths=[]
|
|
||||||
storage_paths=[]
|
|
||||||
import_paths=[]
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
### Duplicates Attributes ###
|
||||||
|
self.ip_to_sp_dups_keep={}
|
||||||
|
self.ip_to_sp_dups_del={}
|
||||||
|
self.in_same_dups={}
|
||||||
|
self.per_file_dups=[]
|
||||||
|
self.per_path_dups=[]
|
||||||
|
self.preferred={}
|
||||||
|
self.all_paths=[]
|
||||||
|
self.storage_paths=[]
|
||||||
|
self.import_paths=[]
|
||||||
|
|
||||||
# per storage path, add entries to view
|
# per storage path, add entries to view
|
||||||
settings=Settings.query.first()
|
settings=Settings.query.first()
|
||||||
paths = settings.storage_path.split("#")
|
paths = settings.storage_path.split("#")
|
||||||
@@ -68,30 +78,59 @@ class Duplicates:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def KeepInSameDups( self, obj ):
|
def KeepInSameDups( self, obj ):
|
||||||
if obj['h'] not in self.ip_to_sp_dups_keep:
|
if obj.h not in self.ip_to_sp_dups_keep:
|
||||||
self.ip_to_sp_dups_keep[obj['h']]= obj
|
self.ip_to_sp_dups_keep[obj.h]= obj
|
||||||
return
|
return
|
||||||
|
|
||||||
def DelInSameDups( self, obj ):
|
def DelInSameDups( self, obj ):
|
||||||
if obj['h'] not in self.ip_to_sp_dups_del:
|
if obj.h not in self.ip_to_sp_dups_del:
|
||||||
self.ip_to_sp_dups_del[obj['h']]=[]
|
self.ip_to_sp_dups_del[obj.h]=[]
|
||||||
self.ip_to_sp_dups_del[obj['h']].append( obj )
|
self.ip_to_sp_dups_del[obj.h].append( obj )
|
||||||
else:
|
else:
|
||||||
for el in self.ip_to_sp_dups_del[obj['h']]:
|
for el in self.ip_to_sp_dups_del[obj.h]:
|
||||||
if el['id'] == obj['id']:
|
if el.id == obj.id:
|
||||||
return
|
return
|
||||||
self.ip_to_sp_dups_del[obj['h']].append( obj )
|
self.ip_to_sp_dups_del[obj.h].append( obj )
|
||||||
return
|
return
|
||||||
|
|
||||||
def AddDup( self, row ):
|
def DupInImportAndStoragePath( self, row, dr1, dr2 ):
|
||||||
if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
|
if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
|
||||||
self.KeepInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
|
self.KeepInSameDups( dr1 )
|
||||||
self.DelInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
|
self.DelInSameDups( dr2 )
|
||||||
|
return True
|
||||||
if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
|
if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
|
||||||
self.KeepInSameDups( { 'f': row.fname2, 'd':self.TrimmedPath(row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
|
self.KeepInSameDups( dr2 )
|
||||||
self.DelInSameDups( { 'f': row.fname1, 'd':self.TrimmedPath(row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
|
self.DelInSameDups( dr1 )
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def AddDup( self, row ):
|
||||||
|
dr1=DupRow( row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1 )
|
||||||
|
dr2=DupRow( row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2 )
|
||||||
|
if self.DupInImportAndStoragePath( row, dr1, dr2 ):
|
||||||
|
return
|
||||||
|
|
||||||
|
if row.hash not in self.in_same_dups:
|
||||||
|
self.in_same_dups[row.hash]=[]
|
||||||
|
self.in_same_dups[row.hash].append( dr1 )
|
||||||
|
self.in_same_dups[row.hash].append( dr2 )
|
||||||
|
else:
|
||||||
|
# process path1 / fname1 -- if that combo is not in the dups[hash], add it
|
||||||
|
found=0
|
||||||
|
for dup in self.in_same_dups[row.hash]:
|
||||||
|
if dup.id == row.id1:
|
||||||
|
found=1
|
||||||
|
continue
|
||||||
|
if not found:
|
||||||
|
self.in_same_dups[row.hash].append( dr1 )
|
||||||
|
|
||||||
|
# process path2 / fname2 -- if that combo is not in the dups[hash], add it
|
||||||
|
for dup in self.in_same_dups[row.hash]:
|
||||||
|
if dup.id == row.id2:
|
||||||
|
found=1
|
||||||
|
continue
|
||||||
|
if not found:
|
||||||
|
self.in_same_dups[row.hash].append( dr2 )
|
||||||
return
|
return
|
||||||
|
|
||||||
def Dump(self):
|
def Dump(self):
|
||||||
@@ -105,32 +144,21 @@ class Duplicates:
|
|||||||
for d in self.ip_to_sp_dups_del[h]:
|
for d in self.ip_to_sp_dups_del[h]:
|
||||||
print( f"Del: {d}" )
|
print( f"Del: {d}" )
|
||||||
print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
|
print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
|
||||||
|
if len(self.in_same_dups) > 0:
|
||||||
|
print( "############ Duplicate Files that are in the same Path ###########")
|
||||||
|
cnt=0
|
||||||
|
for h in self.in_same_dups:
|
||||||
|
cnt +=1
|
||||||
|
if len(self.in_same_dups[h])>2:
|
||||||
|
print( f"hash={h}, keep 1 of these: ", end='')
|
||||||
|
for d in self.in_same_dups[h]:
|
||||||
|
print( f"{d.id}, ", end='' )
|
||||||
|
print ("")
|
||||||
|
print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if row.hash not in dups:
|
|
||||||
dups[row.hash]=[]
|
|
||||||
dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
|
|
||||||
dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
|
|
||||||
else:
|
|
||||||
# process path1 / fname1 -- if that combo is not in the dups[hash], add it
|
|
||||||
found=0
|
|
||||||
for dup in dups[row.hash]:
|
|
||||||
if dup['f'] == row.fname1 and dup['d'] == TrimmedPath(prefix, row.path1):
|
|
||||||
found=1
|
|
||||||
continue
|
|
||||||
if not found:
|
|
||||||
dups[row.hash].append( { 'f': row.fname1, 'd':TrimmedPath(prefix, row.path1), 'did': row.did1, 'h':row.hash, 'id':row.id1 } )
|
|
||||||
|
|
||||||
# process path2 / fname2 -- if that combo is not in the dups[hash], add it
|
|
||||||
found=0
|
|
||||||
for dup in dups[row.hash]:
|
|
||||||
if dup['f'] == row.fname2 and dup['d'] == TrimmedPath(prefix, row.path2):
|
|
||||||
found=1
|
|
||||||
continue
|
|
||||||
if not found:
|
|
||||||
dups[row.hash].append( { 'f': row.fname2, 'd':TrimmedPath(prefix, row.path2), 'did': row.did2, 'h':row.hash, 'id':row.id2 } )
|
|
||||||
|
|
||||||
@app.route("/fix_dups", methods=["POST"])
|
@app.route("/fix_dups", methods=["POST"])
|
||||||
def fix_dups():
|
def fix_dups():
|
||||||
rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
|
rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
|
||||||
|
|||||||
Reference in New Issue
Block a user