Fixed bugs 47, 48, 49 and reduced the complexity of dup counting as well. Now removes dirs that become empty due to duplicate removals.
dups.py (32 changed lines)
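Note: the empty-directory cleanup mentioned in the commit message is not visible in the hunks below. A minimal sketch of one way such a cleanup can work (names and approach are illustrative assumptions, not the actual dups.py code):

    import os

    def prune_empty_dirs(deleted_file, stop_root):
        # Hypothetical helper, not from dups.py: after a duplicate file is
        # deleted, walk up from its directory and remove each directory that
        # is now empty, stopping at the tree root that must be kept.
        d = os.path.dirname(deleted_file)
        while d and d != stop_root and os.path.isdir(d) and not os.listdir(d):
            os.rmdir(d)
            d = os.path.dirname(d)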
@@ -88,6 +88,7 @@ class Duplicates(PA):
         self.preferred_file={}
         self.preferred_path={}
         self.hashes_processed={}
+        self.eids_processed={}
         self.uniq_dups=0
         self.total_dups=0
 
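The new eids_processed dict follows the same dict-as-set idiom the class already uses for hashes_processed: only the keys matter, 1 is a dummy value, and re-inserting an existing key is a no-op. A standalone illustration (not dups.py code):

    seen = {}
    seen['file-42'] = 1
    seen['file-42'] = 1      # same key again: still a single entry
    assert len(seen) == 1    # so len() counts each id exactly once

A set() would behave identically here; the dict simply matches the style of the surrounding code.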
@@ -146,6 +147,8 @@ class Duplicates(PA):
     # we process these into appropriate data structures on this first pass
     def AddDup( self, row ):
         self.hashes_processed[row.hash]=1
+        self.eids_processed[row.id1]=1
+        self.eids_processed[row.id2]=1
         dr1=DupRow( row.hash, row.fname1, row.rel_path1, row.did1, row.id1 )
         dr2=DupRow( row.hash, row.fname2, row.rel_path2, row.did2, row.id2 )
         # if in both import and storage path, just keep the storage path file,
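Each dup row describes a pair, so a group of N identical files arrives as several overlapping pairs; registering the hash and both ids per row still yields exactly one hash key and N id keys. A standalone sketch of the effect (stand-in tuples, not the real query rows):

    hashes, eids = {}, {}
    # three copies of one file arrive as the pairwise rows (a,b), (a,c), (b,c)
    rows = [('h1', 'a', 'b'), ('h1', 'a', 'c'), ('h1', 'b', 'c')]
    for h, id1, id2 in rows:
        hashes[h] = 1   # registered three times, stored once
        eids[id1] = 1
        eids[id2] = 1
    print(len(hashes), len(eids))   # -> 1 3

This is what lets SecondPass in the next hunk replace the old incremental arithmetic with two len() calls: uniq_dups counts distinct hashes, total_dups counts distinct file ids.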
@@ -206,39 +209,24 @@ class Duplicates(PA):
     # The second pass processes row by row of dups_to_process, looking for per_file_dups and per_path_dups
     # AND works out counts to display overall tallies of types of keeping and deletion of files, and choices to make
     def SecondPass(self):
-        # sort out counts (for ip_to_sp - that is all finished)
-        self.uniq_dups = len(self.hashes_processed)
-        # total starts with 1 copy of everything we keep in sp
-        self.total_dups = len(self.ip_to_sp_dups_keep)
-        # and then add all those we delete in ip that are in sp
-        for hash in self.ip_to_sp_dups_del:
-            self.total_dups += len(self.ip_to_sp_dups_del[hash])
-
         # okay, go for each duplicate that should be processed (they are stored
         # by hash, and have at least 2 entries, but can have more, and be in
         # the IP or SP in any combo, cater for all below
         for hash in self.dups_to_process:
             # more than 2 files (just ask per file) OR (implied) only 2 copies in the same dir (so names must differ, just ask) OR (implied) only 2 copies & filenames different (ask per file)
+            # will force ask per file
             if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f):
                 self.per_file_dups.append(self.dups_to_process[hash])
                 for el in self.dups_to_process[hash]:
                     if re.search( r'\d{4}/\d{8}', el.d):
                         self.preferred_file[hash] = el.id
-                self.total_dups += len(self.dups_to_process[hash])
-                # if this combination ALSO has an import path dup, then we have already counted the storage path dup in the earlier keeping count
-                if hash in self.ip_to_sp_dups_keep:
-                    self.total_dups -= 1
             # only 2 files AND different path (ask per path) AND with the same name...
             else:
-                # if this dup path is not already being partially handled by an ip <-> sp dup, then add it / count it
-                if self.AddDupPath( hash ):
-                    self.total_dups += 2
-                else:
-                    # okay, if we are here, this path combo is also in an IP <-> SP combo.
-                    # IF the dup we tried to add was SP<->SP, then there
-                    # is another dup to count; if it's IP<->IP (as we append these to the del list), then nothing further to count
-                    if self.InStoragePath(self.dups_to_process[hash][0].d):
-                        self.total_dups += 1
-
+                # will force ask per path
+                self.AddDupPath( hash )
+
+        # provide convenience counts
+        self.uniq_dups = len(self.hashes_processed)
+        self.total_dups = len(self.eids_processed)
         return
     # quick debugger to see the data in the data structure (not used by default)
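A standalone sketch of the per-file / per-path split above, using a namedtuple stand-in for DupRow (only the .f filename and .d directory fields matter here; the stand-in and sample values are illustrative, not dups.py code):

    import re
    from collections import namedtuple

    E = namedtuple('E', 'f d')   # stand-in: f = filename, d = directory

    def choose(group):
        # mirrors the SecondPass condition: >2 copies, or 2 copies in the
        # same dir, or 2 copies with different names -> ask per file;
        # otherwise 2 same-named copies in different dirs -> ask per path
        if len(group) > 2 or group[0].d == group[1].d or group[0].f != group[1].f:
            return 'per_file'
        return 'per_path'

    print(choose([E('a.jpg', 'x'), E('a.jpg', 'y')]))                  # per_path
    print(choose([E('a.jpg', 'x'), E('b.jpg', 'x')]))                  # per_file (same dir)
    print(choose([E('a.jpg', 'x'), E('b.jpg', 'y')]))                  # per_file (names differ)
    print(choose([E('a.jpg', 'x'), E('a.jpg', 'y'), E('a.jpg', 'z')])) # per_file (>2 copies)

    # the preferred-file pick keys off dated storage-path dirs (YYYY/YYYYMMDD):
    print(bool(re.search(r'\d{4}/\d{8}', '2019/20190406')))            # True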