added comments

commit 046c512e6b (parent f88ef2542f)
2021-03-14 14:33:22 +11:00

dups.py

@@ -24,6 +24,9 @@ from job import Job, JobExtra, Joblog, NewJob
 from settings import Settings
 from shared import SymlinkName

+# DupRow is a simple 'struct'-style class that keeps the data for one
+# duplicate file. It avoids intermixing Python lists/dicts and lets us
+# consistently use dot-notation for the fields.
 class DupRow:
     def __init__(self, hash, file, dir, did, fid):
         ### DupRow Attributes -- note, simple class, no methods ###
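
An aside on the 'struct' comment above: a dataclass would give the same dot-notation access with less hand-written boilerplate. A minimal sketch, not part of this commit (the f and d field names are assumed from the constructor arguments, since those attribute lines fall outside this hunk):

    from dataclasses import dataclass

    @dataclass
    class DupRowSketch:   # hypothetical equivalent of DupRow
        h: str            # md5 hash of the file contents
        f: str            # file name (assumed attribute name)
        d: str            # trimmed directory (assumed attribute name)
        did: int          # directory id in the database
        id: int           # file id in the database
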
@@ -33,12 +36,35 @@ class DupRow:
         self.did=did
         self.id=fid

+# The Duplicates class is used, as a single instance/object, to process all the
+# 'duplicate' data from the database and parse it into more usable data
+# structures. This is also needed because the database content shows duplicates
+# more than once, e.g.
+# file1 and file2 are a duplicate, then later file2 and file1 are 'another' duplicate.
+# The class passes over the data in 2 passes. The first pass, in AddDup(), finds
+# any files in both the import and storage paths and marks the storage ones to
+# keep and the import ones to delete. Anything else is either a set of files
+# duplicated inside the import path or a set of files duplicated in the storage path.
+# The first pass simply concatenates these into a data structure
+# (dups_to_process) that contains all the duplicates, keyed by the md5 hash.
+#
+# The second pass processes these duplicates to see if there are any in the
+# storage path that follow the pattern 'YYYY/YYYYMMDD' -> if so, mark these to
+# keep and the rest to be deleted.
+#
+# After the 2 passes, we have data structures that allow the web to break up
+# the duplicates into batches to process:
+# 1) auto delete any in the import path that are also in the storage path
+#    - careful here: if we have 2 in the import path and 2+ in the storage path, leave it for manual intervention
+# 2) auto delete any in the storage path that are in a set where 1 of them matches the 'YYYY/YYYYMMDD' format; the rest are deleted
+# 3) a set of directories where there are only 2 duplicate files (with the same file name), just in a different dir - allow the user to choose the dir to keep
+# 4) a set of individual files where I want the user to make a decision (3 or more copies, those with different filenames, or in the same dir) - allow the user to choose the file to keep
 class Duplicates:
     def __init__(self):
         ### Duplicates Attributes ###
         self.ip_to_sp_dups_keep={}
         self.ip_to_sp_dups_del={}
-        self.in_same_dups={}
+        self.dups_to_process={}
         self.per_file_dups=[]
         self.per_path_dups=[]
         self.preferred={}
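
The class comment above describes a second pass that prefers storage-path copies living under a 'YYYY/YYYYMMDD' directory; that pass is not shown in this diff. A minimal sketch of what such a check could look like (hypothetical helper, name and placement assumed; it only checks the shape of the trimmed directory, not that the year parts agree):

    import re

    # e.g. '2021/20210314' -> True, 'holidays/bali' -> False
    DATE_DIR_RE = re.compile(r"^\d{4}/\d{8}$")

    def LooksLikeDateDir(trimmed_dir):
        # trimmed_dir is a directory already passed through TrimmedPath()
        return DATE_DIR_RE.match(trimmed_dir) is not None
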
@@ -46,42 +72,52 @@ class Duplicates:
         self.storage_paths=[]
         self.import_paths=[]
-        # per storage path, add entries to view
+        # pull apart the storage path Setting, and make an array of each for use in TrimmedPath()
         settings=Settings.query.first()
         paths = settings.storage_path.split("#")
         for path in paths:
             prefix = SymlinkName(path,path+'/')
             self.storage_paths.append(prefix)
             self.all_paths.append(prefix)

+        # pull apart the import path Setting, and make an array of each for use in TrimmedPath()
         paths = settings.import_path.split("#")
         for path in paths:
             prefix = SymlinkName(path,path+'/')
             self.import_paths.append(prefix)
             self.all_paths.append(prefix)

+    # Strip the front of the path: any match on a storage or import path is
+    # removed, just to make it easier to read when displayed in the web page.
     def TrimmedPath( self, path ):
         for p in self.all_paths:
             if re.match( f"^{p}", path ):
                 return path.replace(p, '' )
         return path

+    # is this file in the import path?
     def InImportPath( self, path ):
         for p in self.import_paths:
             if re.match( f"^{p}", path ):
                 return True
         return False

+    # is this file in the storage path?
     def InStoragePath( self, path ):
         for p in self.storage_paths:
             if re.match( f"^{p}", path ):
                 return True
         return False

+    # store this object in the keep-from-same-path list (only ever 1 per hash)
     def KeepInSameDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_keep:
             self.ip_to_sp_dups_keep[obj.h]= obj
+        else:
+            print( f"DDP: we need to cater for this - 2 files to keep in the storage path; if they are different, pull these out of here and put them in the dups_to_process list to manually process" )
         return

+    # store this object in the delete-from-same-path list (if it is not
+    # already there)
     def DelInSameDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_del:
             self.ip_to_sp_dups_del[obj.h]=[]
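
One note on the path helpers above: they build a regex straight from the prefix, so re.match( f"^{p}", path ) can misbehave if a configured path ever contains regex metacharacters such as '(' or '+', and path.replace(p, '') removes every occurrence of the prefix, not just the leading one. A sketch of the same idea with plain string operations (assuming self.all_paths holds literal directory prefixes, as built in __init__):

    def TrimmedPath( self, path ):
        # strip only a leading storage/import prefix, treated as a literal string
        for p in self.all_paths:
            if path.startswith(p):
                return path[len(p):]
        return path
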
@@ -93,6 +129,9 @@ class Duplicates:
             self.ip_to_sp_dups_del[obj.h].append( obj )
         return

+    # this function takes a duplicate file pair (one in the import path and one in the storage path),
+    # puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInSameDups()
+    # and puts the import path file in the delete list (self.ip_to_sp_dups_del) via self.DelInSameDups()
     def DupInImportAndStoragePath( self, row, dr1, dr2 ):
         if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
             self.KeepInSameDups( dr1 )
@@ -104,35 +143,44 @@ class Duplicates:
             return True
         return False

+    # AddDup: takes a row from the database, effectively file1 & file2;
+    # we process these into the appropriate data structures on this first pass
     def AddDup( self, row ):
         dr1=DupRow( row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1 )
         dr2=DupRow( row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2 )
+        # if in both the import and storage path, just keep the storage path file,
+        # and delete the import path file
         if self.DupInImportAndStoragePath( row, dr1, dr2 ):
             return
-        if row.hash not in self.in_same_dups:
-            self.in_same_dups[row.hash]=[]
-            self.in_same_dups[row.hash].append( dr1 )
-            self.in_same_dups[row.hash].append( dr2 )
+        # if we are here, we have duplicates either in the storage path or in
+        # the import path
+
+        # if the hash is not in dups_to_process, create it / append
+        if row.hash not in self.dups_to_process:
+            self.dups_to_process[row.hash]=[]
+            self.dups_to_process[row.hash].append( dr1 )
+            self.dups_to_process[row.hash].append( dr2 )
         else:
-            # process path1 / fname1 -- if that combo is not in the dups[hash], add it
+            # process path1 / fname1 -- if that combo is not in the dups_to_process[hash], add it
             found=0
-            for dup in self.in_same_dups[row.hash]:
+            for dup in self.dups_to_process[row.hash]:
                 if dup.id == row.id1:
                     found=1
                     continue
             if not found:
-                self.in_same_dups[row.hash].append( dr1 )
+                self.dups_to_process[row.hash].append( dr1 )
-            # process path2 / fname2 -- if that combo is not in the dups[hash], add it
+            # process path2 / fname2 -- if that combo is not in the dups_to_process[hash], add it
-            for dup in self.in_same_dups[row.hash]:
+            for dup in self.dups_to_process[row.hash]:
                 if dup.id == row.id2:
                     found=1
                     continue
             if not found:
-                self.in_same_dups[row.hash].append( dr2 )
+                self.dups_to_process[row.hash].append( dr2 )
         return

+    # quick debugger to see the data in the data structures
     def Dump(self):
         if len(self.ip_to_sp_dups_keep) > 0:
             print( "############ Files that are in both Import and Storage Paths ###########")
@@ -144,14 +192,14 @@ class Duplicates:
                     for d in self.ip_to_sp_dups_del[h]:
                         print( f"Del: {d}" )
             print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
-        if len(self.in_same_dups) > 0:
+        if len(self.dups_to_process) > 0:
-            print( "############ Duplicate Files that are in the same Path ###########")
+            print( "############ Duplicate Files that need to be further processed ###########")
             cnt=0
-            for h in self.in_same_dups:
+            for h in self.dups_to_process:
                 cnt +=1
-                if len(self.in_same_dups[h])>2:
+                if len(self.dups_to_process[h])>2:
                     print( f"hash={h}, keep 1 of these: ", end='')
-                    for d in self.in_same_dups[h]:
+                    for d in self.dups_to_process[h]:
                         print( f"{d.id}, ", end='' )
                     print ("")
             print( f"{cnt} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )