made new/better function to deal with path duplicates / reduced duplicate code, fixed overall count vars, improved debugs
dups.py | 79
@@ -137,6 +137,8 @@ class Duplicates:
     def KeepInSameDups( self, obj ):
         if obj.h not in self.ip_to_sp_dups_keep:
             self.ip_to_sp_dups_keep[obj.h]= obj
+            self.overall_dup_cnt += 1
+            self.overall_dup_sets += 1
         return
 
     # this stores this object into the Delete from same path list (if it is not
@@ -150,6 +152,8 @@ class Duplicates:
             if el.id == obj.id:
                 return
         self.ip_to_sp_dups_del[obj.h].append( obj )
+        # only get here is this is a new duplicate to delete, so increment count
+        self.overall_dup_cnt += 1
         return
 
     # this function takes a duplicate file (in the import path and the storage path)
@@ -203,14 +207,29 @@ class Duplicates:
             self.dups_to_process[row.hash].append( dr2 )
         return
 
+    def AddDupPath(self, hash):
+        # this gets complex, if this hash is also in a sahred imp / sp - then dont deal with it now, let the imp files be deleted and
+        # the repeat check_dups validation step catch it as a cleander (potential) for still more duplicates just in sp
+        if hash in self.ip_to_sp_dups_keep:
+            return
+        dpr=DupPathRow( 2, self.dups_to_process[hash][0].d, self.dups_to_process[hash][1].d, self.dups_to_process[hash][0].did, self.dups_to_process[hash][1].did, hash )
+        new=1
+        for el in self.per_path_dups:
+            if el.d1 == dpr.d1 and el.d2 == dpr.d2:
+                self.overall_dup_cnt += 2
+                el.count += 2
+                el.hashes = f"{el.hashes},{hash}"
+                new=0
+        if new:
+            self.per_path_dups.append( dpr )
+            self.overall_dup_sets += 1
+            self.overall_dup_cnt += 2
+            if re.search( r'\d{4}/\d{8}', dpr.d1):
+                self.preferred_path[dpr.did1]=1
+            if re.search( r'\d{4}/\d{8}', dpr.d2):
+                self.preferred_path[dpr.did2]=1
+
     def SecondPass(self):
-        print("################################## second pass starting")
-        d1=""
-        d2=""
-        did1=""
-        did2=""
-        dup_cnt=1
-        hashes=""
         for hash in self.dups_to_process:
             # more than 2 files (just ask per file) OR only 2 copies, and files are in same dir (so must be diff name, so just ask) OR content same, filename different (ask per file)
             if (len(self.dups_to_process[hash]) > 2) or (self.dups_to_process[hash][0].f != self.dups_to_process[hash][1].f) or (self.dups_to_process[hash][0].d == self.dups_to_process[hash][1].d):
@@ -221,30 +240,8 @@ class Duplicates:
                     if re.search( r'\d{4}/\d{8}', el.d):
                         self.preferred_file[hash] = el.id
             # by here we have only 2 files, with the same name, different path (ask per path)
-            elif d1 != self.dups_to_process[hash][0].d:
-                if d1 != '':
-                    self.overall_dup_cnt += dup_cnt
-                    self.overall_dup_sets += 1
-                    self.per_path_dups.append( DupPathRow( dup_cnt, d1, d2, did1, did2, hashes ) )
-                    if re.search( r'\d{4}/\d{8}', d1):
-                        self.preferred_path[did1]=1
-                    if re.search( r'\d{4}/\d{8}', d2):
-                        self.preferred_path[did2]=1
-                dup_cnt=1
-                d1 = self.dups_to_process[hash][0].d
-                d2 = self.dups_to_process[hash][1].d
-                did1 = self.dups_to_process[hash][0].did
-                did2 = self.dups_to_process[hash][1].did
-                hashes = f"{hash},"
             else:
-                dup_cnt += 1
-                hashes += f"{hash},"
-
-        if d1 != '':
-            self.overall_dup_cnt += dup_cnt
-            self.overall_dup_sets += dup_cnt
-            self.per_path_dups.append( DupPathRow( dup_cnt, d1, d2, did1, did2, hashes ) )
-        print("#################### second pass FINISHED")
+                self.AddDupPath( hash )
         return
 
     # quick debugger to see the data in the data structure
@@ -252,25 +249,17 @@ class Duplicates:
         if len(self.ip_to_sp_dups_keep) > 0:
             print( "############ Files that are in both Import and Storage Paths ###########")
             for h in self.ip_to_sp_dups_keep:
-                if len(self.ip_to_sp_dups_del[h])>2:
-                    print( f"(1 file of 2+) hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
-                    for d in self.ip_to_sp_dups_del[h]:
-                        print( f"Del: {d}" )
-                else:
-                    print( f"(1 file of 2) hash={h}, keep: {self.ip_to_sp_dups_keep[h]}" )
-                    for d in self.ip_to_sp_dups_del[h]:
-                        print( f"Del: {d}" )
-
+                print( f"hash={h} keep 1 of {len(self.ip_to_sp_dups_del[h])+1}, keep: {self.ip_to_sp_dups_keep[h]} | ", end='' )
+                for d in self.ip_to_sp_dups_del[h]:
+                    print( f"Del: {d}", end='' )
+                print( "" )
             print( f"{len(self.ip_to_sp_dups_keep)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
 
         if len(self.dups_to_process) > 0:
             print( "############ Duplicate Files that are needing to be futher processed ###########")
             for h in self.dups_to_process:
-                print( f"hash={h}, keep 1 of these: ", end='')
-                for d in self.dups_to_process[h]:
-                    print( f"{d.id}, ", end='' )
-                print ("")
-            print( f"{len(self.dups_to_process)} sets of duplicate files to delete at least 1, anything with 2 or more dups is printed above explicitly" )
+                print( f"hash={h} keep 1 of {len(self.dups_to_process[h])} from: {self.dups_to_process[h]}" )
+            print( f"which is a total of {len(self.dups_to_process)} set(s) of duplicate files to keep only 1 of" )
 
         if len(self.preferred_file) > 0:
             print( " We have preferred (regexp matched) ###########")
@@ -279,7 +268,7 @@ class Duplicates:
                 for d in self.dups_to_process[h]:
                     print( f"{d.id}, ", end='' )
                 print ("")
-            print( f"{len(self.preferred_file)} duplicate files we will keep as they match the regexp" )
+            print( f"which is a total of {len(self.preferred_file)} duplicate files we will keep as they match the regexp" )
 
         if len(self.per_path_dups) > 0:
             print( "############ Duplicate Files in Paths that are needing to be futher processed ###########")
@@ -289,5 +278,5 @@ class Duplicates:
                     print("Keep dir1")
                 if pair.did2 in self.preferred_path:
                     print("Keep dir2")
-            print( f"{len(self.per_path_dups)} duplicate files in per path dups" )
+            print( f"which is a total of {len(self.per_path_dups)} set(s) of path dups to process" )
         return
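
For context, the new AddDupPath groups two-copy duplicates by their (d1, d2) directory pair: if the pair is already known it bumps the count and appends the hash, otherwise it records a new row and flags date-structured (YYYY/YYYYMMDD) directories as preferred. The snippet below is only a minimal standalone sketch of that idea, not the committed code; the DupPathRow fields are assumed from the constructor calls above, and the overall_dup_cnt / overall_dup_sets bookkeeping is left out.

# Standalone approximation of the path-pair grouping in AddDupPath (not the committed code).
import re
from dataclasses import dataclass

@dataclass
class DupPathRow:                 # field names assumed from the calls in the diff
    count: int
    d1: str
    d2: str
    did1: int
    did2: int
    hashes: str

def add_dup_path(per_path_dups, preferred_path, d1, d2, did1, did2, hash):
    # If this (d1, d2) pair was already seen, fold the new hash into the existing row.
    for el in per_path_dups:
        if el.d1 == d1 and el.d2 == d2:
            el.count += 2
            el.hashes = f"{el.hashes},{hash}"
            return
    # Otherwise start a new row and flag date-structured paths (YYYY/YYYYMMDD) as preferred.
    per_path_dups.append(DupPathRow(2, d1, d2, did1, did2, hash))
    if re.search(r'\d{4}/\d{8}', d1):
        preferred_path[did1] = 1
    if re.search(r'\d{4}/\d{8}', d2):
        preferred_path[did2] = 1

# Example: two hashes duplicated across the same pair of directories collapse into one row.
pairs, preferred = [], {}
add_dup_path(pairs, preferred, "2023/20230101", "backup/old", 10, 20, "aaa")
add_dup_path(pairs, preferred, "2023/20230101", "backup/old", 10, 20, "bbb")
print(pairs[0].count, pairs[0].hashes)   # 4 aaa,bbb
print(preferred)                         # {10: 1}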