fixed up dup code to work with paths, added path_types throughout, and updated the TODO to be clear on what's next

2021-04-17 17:43:42 +10:00
parent 477aa4e5b8
commit 3237e3bf8f
6 changed files with 68 additions and 77 deletions
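
Note: the diff below imports PathType from path and looks up the 'Import' and 'Storage' rows by name. The model itself is not part of this commit, so the sketch here is only an assumption of its shape (a Flask-SQLAlchemy lookup table with id and name columns, matching the Model.query style used elsewhere in dups.py), included to make the new query lines easier to read.

# Assumed sketch of the PathType model defined in path.py (not in this commit):
# a small lookup table mapping a path-type name to an id.
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class PathType(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(32), unique=True)   # e.g. 'Import' or 'Storage'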

dups.py (49 changed lines)

@@ -23,6 +23,7 @@ import re
 from job import Job, JobExtra, Joblog, NewJob
 from settings import Settings
 from shared import SymlinkName
+from path import PathType
 ################################################################################
 # DupRow class is a simple 'struct' to keep data per duplicate file / just to
@@ -91,47 +92,23 @@ class Duplicates:
         self.per_path_dups=[]
         self.preferred_file={}
         self.preferred_path={}
-        self.all_paths=[]
-        self.storage_paths=[]
-        self.import_paths=[]
         self.hashes_processed={}
         self.uniq_dups=0
         self.total_dups=0
-        # pull apart the storage path Setting, and make array of each for use in TrimmedPath()
-        settings=Settings.query.first()
-        paths = settings.storage_path.split("#")
-        for path in paths:
-            prefix = SymlinkName(path,path+'/')
-            self.storage_paths.append(prefix)
-            self.all_paths.append(prefix)
-        # pull apart the import path Setting, and make array of each for use in TrimmedPath()
-        paths = settings.import_path.split("#")
-        for path in paths:
-            prefix = SymlinkName(path,path+'/')
-            self.import_paths.append(prefix)
-            self.all_paths.append(prefix)
-    # Strip the front of the path (any match on a storage or import path) is
-    # removed. Just to make it easier to read when we display in the web page
-    def TrimmedPath( self, path ):
-        for p in self.all_paths:
-            if re.match( f"^{p}", path ):
-                return path.replace(p, '' )
-        return path
+        self.import_ptype_id = PathType.query.filter(PathType.name=='Import').first().id
+        self.storage_ptype_id = PathType.query.filter(PathType.name=='Storage').first().id
     # is this file in the import path?
-    def InImportPath( self, path ):
-        for p in self.import_paths:
-            if re.match( f"^{p}", path ):
-                return True
+    def InImportPath( self, path_type ):
+        if path_type == self.import_ptype_id:
+            return True
         return False
     # is this file in the storage path?
-    def InStoragePath( self, path ):
-        for p in self.storage_paths:
-            if re.match( f"^{p}", path ):
-                return True
+    def InStoragePath( self, path_type ):
+        if path_type == self.storage_ptype_id:
+            return True
         return False
     # this stores this object into the keep from same path list (DDP: sometimes there can be more than 1 SP, e.g SP to SP to IP)
@@ -160,11 +137,11 @@ class Duplicates:
     # and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups()
     # and then puts the import path file in the delete list (self.ip_to_sp_dups_keep) via self.DelInIPSPDups()
     def DupInImportAndStoragePath( self, row, dr1, dr2 ):
-        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
+        if self.InStoragePath(row.path_type1) and self.InImportPath(row.path_type2):
             self.KeepInIPSPDups( dr1 )
             self.DelInIPSPDups( dr2 )
             return True
-        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
+        if self.InStoragePath(row.path_type2) and self.InImportPath(row.path_type1):
             self.KeepInIPSPDups( dr2 )
             self.DelInIPSPDups( dr1 )
             return True
@@ -174,8 +151,8 @@ class Duplicates:
     # we process these into appropriate data structures on this first pass
     def AddDup( self, row ):
         self.hashes_processed[row.hash]=1
-        dr1=DupRow( row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1 )
-        dr2=DupRow( row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2 )
+        dr1=DupRow( row.hash, row.fname1, row.rel_path1, row.did1, row.id1 )
+        dr2=DupRow( row.hash, row.fname2, row.rel_path2, row.did2, row.id2 )
         # if in both import and storage path, just keep the storage path file,
         # and del import path file.
         if self.DupInImportAndStoragePath( row, dr1, dr2 ):
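
Net effect of the dups.py changes: the old code rebuilt storage/import prefix lists from Settings on every Duplicates() construction and regex-matched each file path against them (TrimmedPath and the loop bodies of InImportPath/InStoragePath), while the new code expects the query to supply a pre-trimmed rel_path plus a path_type id per file, so classification becomes a single integer comparison against the cached PathType ids. A minimal stand-alone sketch of the new check, with hard-coded ids standing in for the real PathType rows (an assumption, for illustration only):

# Sketch only: hard-coded ids stand in for the PathType rows cached in __init__.
import re

IMPORT_PTYPE_ID = 1
STORAGE_PTYPE_ID = 2

def in_import_path(path_type):
    # new style: one integer comparison per file
    return path_type == IMPORT_PTYPE_ID

def in_storage_path(path_type):
    return path_type == STORAGE_PTYPE_ID

def in_import_path_old(path, import_paths):
    # old style, for contrast: regex-match the absolute path against every configured prefix
    return any(re.match(f"^{p}", path) for p in import_paths)

print(in_import_path(IMPORT_PTYPE_ID))                    # True
print(in_import_path_old("/import/a.jpg", ["/import/"]))  # True

Besides being cheaper per row, the id comparison keeps the duplicate logic independent of how the storage_path/import_path settings happen to be spelled.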