fixed up the dup code to work with paths, added path_types throughout, and updated the TODO to be clear on what comes next
TODO (31 changed lines)
@@ -1,26 +1,19 @@
## GENERAL

* need a path_details_dir_link table (path_details -> type 'import/storage/recycle')
- need to make some FILE/DIR, etc. funcs into OO
-- is there more to do here? (should probably see how often I reference fields of FILE/DIR, etc. directly, and get rid of them in most instances)
- stop fudging the path in DIR (to add static), and just add 'static/' + path_details_dir.prefix + dir.rel_path
-- this might be done, but reconsider the idea that 'static' is hard-coded? -- fine as long as it is in one spot
-- because I want to change the recycle bin to be .pa_bin/IMPORT/xxxx and .pa_bin/STORAGE/xxxx (to address the potential issue where import_path is /xxx/photos and storage_path is /yyy/photos...)
??? could this cause issues in other spots, like the de-dup trim path... in fact, we probably need to revisit TrimPath anyway!
- ignore *thumb* -- but consider how we do this so we don't screw up 'dir/job counts'
  and other stuff like .pa_bin if it's in a storage/import folder?
* storage_path viewing needs to be by folder, not a big grab bag of files (by default)
* issue where someone could call IP .../Imp/photos and SP .../Sto/photos, and then static/photos is ambiguous:
-- TODO: prefix paths with static/<ptype.name>/ so that files are in static/<ptype.name>/in_path.pp/dir.rel_path/ (see the sketch after this diff)
-- then deleting (below) would just move the path_prefix from static/storage to .pa_bin/storage, etc.
-- need to be able to view the recycle bin (should be simple when we have path_types) &&& should be able to consolidate the files_ip/files_sp/files_rb? route-handling functions
-- could also allow undelete per file / show content as another Files->View, and more like storage (i.e. show folders)
* storage_path viewing needs to be by folder, not a big grab bag of files (by default - DONE)
-- BUG: issue with view by Day, etc.: we print out the day even if the Entry is not in the cwd
-- mostly done. Need to toggle the view if I want, and when viewing the storage area, change single-click to view the file again and right-click to be my context menu
-> could this also be a small symbol near the icons mentioned below (disk/?/bin)?
* doing actual file deletes is needed again [DONE]
- decided a recycle bin would be good [DONE]
- when we put files in the recycle bin, they need to stay in the DB and just have their root/base path moved (so they can be viewed as per above/below) <--- TO BE DONE
  AND need to be able to view the recycle bin (should be simple when we have path_types) &&& should be able to consolidate the files_ip/files_sp/files_rb? route-handling functions
- could also allow undelete per file / show content as another Files->View, and more like storage (i.e. show folders)
-- TODO: Need to toggle the view if I want, and when viewing the storage area, change single-click to view the file again and right-click to be my context menu
* need a way for search results to show we found something in import_path or storage_path:
- now we can use the in_path, then have a series of icons, e.g. disk for storage, ? for import, and bin for recycling (before the blue path) -- maybe even show different colours, e.g. info for import, primary for storage and danger for bin?
* handle thumbs:
- need to ignore *thumb* -- but consider how we do this so we don't screw up 'dir/job counts'
  and potentially other stuff like .pa_bin if it's in a storage/import folder?
* AddJobForLog can absorb DEBUGs, etc.; in fact, fix up logging in general
* comment your code
* need a way for the page to show we are in import_path or storage_path:
- now we can use the in_path (which needs a type!!!), then have a series of icons, e.g. disk for storage, ? for import, and bin for recycling (before the blue path) -- maybe even show different colours, e.g. info for import, primary for storage and danger for bin?

## DB
Need to think about...
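To make the static/<ptype.name>/ idea in the TODO concrete, here is a minimal sketch of how a file's URL could be assembled, assuming the Entry -> Dir -> Path -> PathType relationships this commit adds; StaticUrl is a hypothetical name, not project code.

    # Hypothetical sketch only: assembles static/<ptype.name>/<path_prefix>/<rel_path>/<name>
    # using the in_dir / in_path / type relationships defined later in this commit.
    def StaticUrl(entry):
        d = entry.in_dir
        parts = ["static", d.in_path.type.name, d.in_path.path_prefix, d.rel_path, entry.name]
        # skip empty segments (rel_path can be '') so we never emit '//'
        return "/".join(p for p in parts if p)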
dups.py (49 changed lines)
@@ -23,6 +23,7 @@ import re
from job import Job, JobExtra, Joblog, NewJob
from settings import Settings
from shared import SymlinkName
from path import PathType

################################################################################
# DupRow class is a simple 'struct' to keep data per duplicate file / just to
@@ -91,47 +92,23 @@ class Duplicates:
        self.per_path_dups=[]
        self.preferred_file={}
        self.preferred_path={}
        self.all_paths=[]
        self.storage_paths=[]
        self.import_paths=[]
        self.hashes_processed={}
        self.uniq_dups=0
        self.total_dups=0

        # pull apart the storage path Setting, and make array of each for use in TrimmedPath()
        settings=Settings.query.first()
        paths = settings.storage_path.split("#")
        for path in paths:
            prefix = SymlinkName(path,path+'/')
            self.storage_paths.append(prefix)
            self.all_paths.append(prefix)
        # pull apart the import path Setting, and make array of each for use in TrimmedPath()
        paths = settings.import_path.split("#")
        for path in paths:
            prefix = SymlinkName(path,path+'/')
            self.import_paths.append(prefix)
            self.all_paths.append(prefix)

    # Strip the front of the path (any match on a storage or import path) is
    # removed. Just to make it easier to read when we display in the web page
    def TrimmedPath( self, path ):
        for p in self.all_paths:
            if re.match( f"^{p}", path ):
                return path.replace(p, '' )
        return path
        self.import_ptype_id = PathType.query.filter(PathType.name=='Import').first().id
        self.storage_ptype_id = PathType.query.filter(PathType.name=='Storage').first().id

    # is this file in the import path?
    def InImportPath( self, path ):
        for p in self.import_paths:
            if re.match( f"^{p}", path ):
                return True
    def InImportPath( self, path_type ):
        if path_type == self.import_ptype_id:
            return True
        return False

    # is this file in the storage path?
    def InStoragePath( self, path ):
        for p in self.storage_paths:
            if re.match( f"^{p}", path ):
                return True
    def InStoragePath( self, path_type ):
        if path_type == self.storage_ptype_id:
            return True
        return False

    # this stores this object into the keep from same path list (DDP: sometimes there can be more than 1 SP, e.g SP to SP to IP)
@@ -160,11 +137,11 @@ class Duplicates:
    # and then puts the storage path file in the keep list (self.ip_to_sp_dups_keep) via self.KeepInIPSPDups()
    # and then puts the import path file in the delete list (self.ip_to_sp_dups_keep) via self.DelInIPSPDups()
    def DupInImportAndStoragePath( self, row, dr1, dr2 ):
        if self.InStoragePath(row.path1) and self.InImportPath(row.path2):
        if self.InStoragePath(row.path_type1) and self.InImportPath(row.path_type2):
            self.KeepInIPSPDups( dr1 )
            self.DelInIPSPDups( dr2 )
            return True
        if self.InStoragePath(row.path2) and self.InImportPath(row.path1):
        if self.InStoragePath(row.path_type2) and self.InImportPath(row.path_type1):
            self.KeepInIPSPDups( dr2 )
            self.DelInIPSPDups( dr1 )
            return True
@@ -174,8 +151,8 @@ class Duplicates:
    # we process these into appropriate data structures on this first pass
    def AddDup( self, row ):
        self.hashes_processed[row.hash]=1
        dr1=DupRow( row.hash, row.fname1, self.TrimmedPath(row.path1), row.did1, row.id1 )
        dr2=DupRow( row.hash, row.fname2, self.TrimmedPath(row.path2), row.did2, row.id2 )
        dr1=DupRow( row.hash, row.fname1, row.rel_path1, row.did1, row.id1 )
        dr2=DupRow( row.hash, row.fname2, row.rel_path2, row.did2, row.id2 )
        # if in both import and storage path, just keep the storage path file,
        # and del import path file.
        if self.DupInImportAndStoragePath( row, dr1, dr2 ):
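The switch above from prefix matching to path-type ids in InImportPath/InStoragePath removes a real failure mode. A small illustration with invented paths:

    # Illustration with made-up values: the old re.match-on-prefix approach
    # misclassifies a file when one configured path is a prefix of another
    # (and it also treats regex metacharacters in the prefix as patterns,
    # since the prefix was never re.escape()d). Comparing an integer
    # path_type id has neither problem.
    import re
    storage_prefix = "/data/photos"
    import_file    = "/data/photos_incoming/img.jpg"   # lives under an import path
    print(bool(re.match(f"^{storage_prefix}", import_file)))  # True -> wrongly 'in storage'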
files.py (12 changed lines)
@@ -21,6 +21,7 @@ import re
# Local Class imports
################################################################################
from job import Job, JobExtra, Joblog, NewJob
from path import PathType, Path
from person import Person, PersonRefimgLink
from refimg import Refimg
from settings import Settings
@@ -41,15 +42,6 @@ class PathDirLink(db.Model):
    def __repr__(self):
        return f"<path_id: {self.path_id}, dir_eid: {self.dir_eid}>"

class Path(db.Model):
    __tablename__ = "path"
    id = db.Column(db.Integer, db.Sequence('path_id_seq'), primary_key=True )
    path_prefix = db.Column(db.String, unique=True, nullable=False )
    num_files = db.Column(db.Integer)

    def __repr__(self):
        return f"<id: {self.id}, path_prefix: {self.path_prefix}, num_files={self.num_files}>"

class EntryDirLink(db.Model):
    __tablename__ = "entry_dir_link"
    entry_id = db.Column(db.Integer, db.ForeignKey("entry.id"), primary_key=True )
@@ -277,7 +269,7 @@ def scan_sp():

@app.route("/fix_dups", methods=["POST"])
def fix_dups():
    rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.path_prefix as path1, d1.eid as did1, e1.name as fname1, e2.id as id2, d2.path_prefix as path2, d2.eid as did2, e2.name as fname2 from entry e1, file f1, dir d1, entry_dir_link edl1, entry e2, file f2, dir d2, entry_dir_link edl2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )
    rows = db.engine.execute( "select e1.id as id1, f1.hash, d1.rel_path as rel_path1, d1.eid as did1, e1.name as fname1, p1.id as path1, p1.type_id as path_type1, e2.id as id2, d2.rel_path as rel_path2, d2.eid as did2, e2.name as fname2, p2.id as path2, p2.type_id as path_type2 from entry e1, file f1, dir d1, entry_dir_link edl1, path_dir_link pdl1, path p1, entry e2, file f2, dir d2, entry_dir_link edl2, path_dir_link pdl2, path p2 where e1.id = f1.eid and e2.id = f2.eid and d1.eid = edl1.dir_eid and edl1.entry_id = e1.id and edl2.dir_eid = d2.eid and edl2.entry_id = e2.id and p1.id = pdl1.path_id and pdl1.dir_eid = d1.eid and p2.id = pdl2.path_id and pdl2.dir_eid = d2.eid and f1.hash = f2.hash and e1.id != e2.id and f1.size_mb = f2.size_mb order by path1, fname1" )

    if rows.returns_rows == False:
        st.SetAlert("success")
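For reference, the new fix_dups query is easier to audit when written with explicit JOIN syntax. The following is a readability sketch intended to be equivalent to the one-line query above; it is an untested assumption, not what the commit ships, and if adopted it would be passed to db.engine.execute exactly like the original.

    # Readability-only sketch; assumed equivalent to the one-line fix_dups query.
    FIX_DUPS_SQL = """
    select e1.id as id1, f1.hash, d1.rel_path as rel_path1, d1.eid as did1, e1.name as fname1,
           p1.id as path1, p1.type_id as path_type1,
           e2.id as id2, d2.rel_path as rel_path2, d2.eid as did2, e2.name as fname2,
           p2.id as path2, p2.type_id as path_type2
      from entry e1
      join file f1             on f1.eid = e1.id
      join entry_dir_link edl1 on edl1.entry_id = e1.id
      join dir d1              on d1.eid = edl1.dir_eid
      join path_dir_link pdl1  on pdl1.dir_eid = d1.eid
      join path p1             on p1.id = pdl1.path_id
      join file f2             on f2.hash = f1.hash and f2.size_mb = f1.size_mb
      join entry e2            on e2.id = f2.eid and e2.id != e1.id
      join entry_dir_link edl2 on edl2.entry_id = e2.id
      join dir d2              on d2.eid = edl2.dir_eid
      join path_dir_link pdl2  on pdl2.dir_eid = d2.eid
      join path p2             on p2.id = pdl2.path_id
     order by path1, fname1
    """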
path.py
@@ -73,6 +73,15 @@ Base = declarative_base()
# Class describing File in the database, and via sqlalchemy, connected to the DB as well
# This has to match one-for-one the DB table
################################################################################
class PathType(Base):
    __tablename__ = "path_type"
    id = Column(Integer, Sequence('path_type_id_seq'), primary_key=True )
    name = Column(String, unique=True, nullable=False )

    def __repr__(self):
        return f"<id: {self.id}, name={self.name}>"


class PathDirLink(Base):
    __tablename__ = "path_dir_link"
    path_id = Column(Integer, ForeignKey("path.id"), primary_key=True )
@@ -92,6 +101,8 @@ class EntryDirLink(Base):
class Path(Base):
    __tablename__ = "path"
    id = Column(Integer, Sequence('path_id_seq'), primary_key=True )
    type_id = Column(Integer, ForeignKey("path_type.id"))
    type = relationship("PathType")
    path_prefix = Column(String, unique=True, nullable=False )
    num_files = Column(Integer)

@@ -121,7 +132,6 @@ class Entry(Base):
    in_dir = relationship ("Dir", secondary="entry_dir_link", uselist=False )

    def FullPathOnFS(self):
        print( f"(FullPathOnFS: pp={self.in_dir.in_path.path_prefix}, rp={self.in_dir.rel_path}, n={self.name})" )
        s=self.in_dir.in_path.path_prefix + '/'
        if len(self.in_dir.rel_path) > 0:
            s += self.in_dir.rel_path + '/'
@@ -274,7 +284,8 @@ def ProcessStorageDirs(parent_job):
    if settings == None:
        raise Exception("Cannot create file data with no settings / import path is missing")
    paths = settings.storage_path.split("#")
    JobsForPaths( parent_job, paths )
    ptype = session.query(PathType).filter(PathType.name=='Storage').first().id
    JobsForPaths( parent_job, paths, ptype )
    return

def ProcessImportDirs(parent_job):
@@ -282,10 +293,11 @@ def ProcessImportDirs(parent_job):
    if settings == None:
        raise Exception("Cannot create file data with no settings / import path is missing")
    paths = settings.import_path.split("#")
    JobsForPaths( parent_job, paths )
    ptype = session.query(PathType).filter(PathType.name=='Import').first().id
    JobsForPaths( parent_job, paths, ptype )
    return

def JobsForPaths( parent_job, paths ):
def JobsForPaths( parent_job, paths, ptype ):
    now=datetime.now(pytz.utc)
    # make new set of Jobs per path... HandleJobs will make them run later
    for path in paths:
@@ -295,8 +307,10 @@ def JobsForPaths( parent_job, paths ):
            cfn=p.num_files

        jex=JobExtra( name="path", value=path )
        jex2=JobExtra( name="path_type", value=ptype )
        job=Job(start_time=now, last_update=now, name="importdir", state="New", wait_for=None, pa_job_state="New", current_file_num=0, num_files=cfn )
        job.extra.append(jex)
        job.extra.append(jex2)
        session.add(job)
        session.commit()
        if parent_job:
@@ -573,10 +587,9 @@ def GetDateFromFile(file, stat):
def JobImportDir(job):
    JobProgressState( job, "In Progress" )
    settings = session.query(Settings).first()
    if settings == None:
        raise Exception("Cannot create file data with no settings / paths missing")
    path=[jex.value for jex in job.extra if jex.name == "path"][0]
    AddLogForJob(job, "Checking Directory: {}".format( path ) )
    path_type=[jex.value for jex in job.extra if jex.name == "path_type"][0]
    AddLogForJob(job, f"Checking '{path_type}' Directory: {path}" )
    if DEBUG==1:
        print("DEBUG: Checking Directory: {}".format( path ) )
    if not os.path.exists( path ):
@@ -584,7 +597,7 @@ def JobImportDir(job):
        return
    symlink=CreateSymlink(job,path)

    path_obj=Path( path_prefix=symlink, num_files=0 )
    path_obj=Path( path_prefix=symlink, num_files=0, type_id=path_type )
    session.add(path_obj)
    ResetExistsOnFS(job, symlink)

@@ -922,7 +935,6 @@ def RemoveDups(job):
    cd_jobs=session.query(Job).filter(Job.name=='checkdups').filter(Job.pa_job_state=='New').all()
    for j in cd_jobs:
        FinishJob(j, "Just removed duplicates - so no need to do any other checkdups, we will force 1 last one after the remove step", "Withdrawn")
        print("here-loop")
    session.commit()

    dup_cnt=0
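ProcessStorageDirs and ProcessImportDirs now repeat the same PathType lookup, and .first().id raises a bare AttributeError if the seed row is missing. A small hypothetical helper (PathTypeId is not project code) could centralise this with a clearer error:

    # Hypothetical helper, not part of the commit: one place to resolve a
    # PathType name to its id, with an explicit error if the row was never seeded.
    def PathTypeId(name):
        pt = session.query(PathType).filter(PathType.name == name).first()
        if pt is None:
            raise Exception(f"PATH_TYPE row missing for '{name}' - check tables.sql seed data")
        return pt.id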
tables.sql (16 changed lines)
@@ -4,8 +4,11 @@ create table SETTINGS( ID integer, IMPORT_PATH varchar, STORAGE_PATH varchar, RE

create table FILE_TYPE ( ID integer, NAME varchar(32) unique, constraint PK_FILE_TYPE_ID primary key(ID) );

create table PATH ( ID integer, PATH_PREFIX varchar(1024), NUM_FILES integer,
  constraint PK_PATH_ID primary key(ID) );
create table PATH_TYPE ( ID integer, NAME varchar(16) unique, constraint PK_PATH_TYPE_ID primary key(ID) );

create table PATH ( ID integer, TYPE_ID integer, PATH_PREFIX varchar(1024), NUM_FILES integer,
  constraint PK_PATH_ID primary key(ID),
  constraint FK_PATH_TYPE_TYPE_ID foreign key (TYPE_ID) references PATH_TYPE(ID) );

create table ENTRY( ID integer, NAME varchar(128), TYPE_ID integer, EXISTS_ON_FS boolean,
  constraint PK_ENTRY_ID primary key(ID),
@@ -64,6 +67,7 @@ create table PA_JOB_MANAGER_FE_MESSAGE ( ID integer, JOB_ID integer, ALERT varch
  constraint FK_PA_JOB_MANAGER_FE_MESSAGE_JOB_ID foreign key(JOB_ID) references JOB(ID) );

create sequence PATH_ID_SEQ;
create sequence PATH_TYPE_ID_SEQ;
create sequence FILE_ID_SEQ;
create sequence FILE_TYPE_ID_SEQ;
create sequence JOBEXTRA_ID_SEQ;
@@ -75,9 +79,15 @@ create sequence SETTINGS_ID_SEQ;
create sequence PA_JOB_MANAGER_ID_SEQ;
create sequence PA_JOB_MANAGER_FE_MESSAGE_ID_SEQ;

insert into FILE_TYPE values ( (select nextval('FILE_TYPE_ID_SEQ')), 'Directory' );
-- default data for types of paths
insert into PATH_TYPE values ( (select nextval('PATH_TYPE_ID_SEQ')), 'Import' );
insert into PATH_TYPE values ( (select nextval('PATH_TYPE_ID_SEQ')), 'Storage' );
insert into PATH_TYPE values ( (select nextval('PATH_TYPE_ID_SEQ')), 'Bin' );

-- default data for types of files
insert into FILE_TYPE values ( (select nextval('FILE_TYPE_ID_SEQ')), 'Image' );
insert into FILE_TYPE values ( (select nextval('FILE_TYPE_ID_SEQ')), 'Video' );
insert into FILE_TYPE values ( (select nextval('FILE_TYPE_ID_SEQ')), 'Directory' );
insert into FILE_TYPE values ( (select nextval('FILE_TYPE_ID_SEQ')), 'Unknown' );

-- fake data only for making testing easier
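With the TYPE_ID foreign key and the seed rows above in place, the Path.type relationship added in this commit can be sanity-checked from a session. A quick illustrative example, assuming the session setup used elsewhere in the codebase:

    # Illustrative check: every Path row should resolve to one of the seeded
    # PATH_TYPE names ('Import', 'Storage', 'Bin') via the new relationship.
    for p in session.query(Path).all():
        print(p.id, p.type.name if p.type else "untyped", p.path_prefix)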
(template diff; filename not captured)
@@ -16,6 +16,13 @@
    <input type="hidden" name="term" id="view_term" value="{{search_term}}">
{% endif %}
<div class="row">
    {% if "files_ip" in request.url %}
        I
    {% elif "files_sp" in request.url %}
        S
    {% else %}
        R
    {% endif %}
    {% if folders %}
        <div class="my-auto">
            <span class="alert alert-primary">In: {{cwd}}</span>
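The I/S/R letters above are placeholders for the icon-and-colour idea floated in the TODO (info for import, primary for storage, danger for bin). A sketch of how that mapping might be centralised in Python; the dict name and icon identifiers are invented for illustration:

    # Invented sketch: map a PathType name to an (icon, bootstrap-colour) pair
    # so the template can render a badge instead of the bare I/S/R letters.
    PATH_BADGES = {
        "Import":  ("question", "info"),
        "Storage": ("disk",     "primary"),
        "Bin":     ("bin",      "danger"),
    }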