Compare commits

..

5 Commits

3 changed files with 72 additions and 21 deletions

4
BUGs
View File

@@ -1,5 +1,5 @@
### Next: 146
BUG-140: When db is restarted underneath PA, it crashes job mgr... It should just accept timeouts, and keep trying to reconnect every 2? mins
### Next: 147
BUG-146: with an empty DB, I see 'No files in Path!' twice (for file*, except for files_rbp)
BUG-118: can move files from Bin path, but it leaves the del_file entry for it - need to remove it
BUG-117: when search returns files that can be deleted and/or restored, the icon stays as delete and tries to delete!
BUG-106: can't add trudy /pat? as refimgs via FaceDBox

11
main.py
View File

@@ -66,12 +66,19 @@ app.config['LDAP_USER_DN'] = 'ou=users'
app.config['LDAP_GROUP_DN'] = 'ou=groups'
app.config['LDAP_USER_RDN_ATTR'] = 'uid'
app.config['LDAP_USER_LOGIN_ATTR'] = 'uid'
app.config['LDAP_BIND_USER_DN'] = None
app.config['LDAP_BIND_USER_PASSWORD'] = None
app.config['LDAP_GROUP_OBJECT_FILTER'] = '(objectclass=posixGroup)'
app.config['LDAP_BIND_USER_DN'] = None
app.config['LDAP_BIND_USER_PASSWORD'] = None
# stop db restarts from causing stales and client-side 'server errors' - it's a
# touch hacky, e.g. it issues a SELECT 1 before EVERY request; likely should
# ditch this and just have a short-lived pool, but need to work out if/where I
# can catch the right exception myself and then don't need this, but for now...
app.config['SQLALCHEMY_ENGINE_OPTIONS'] = {
"pool_pre_ping": True,
"pool_recycle": 280, # Good practice to include this with pre-ping
}
db = SQLAlchemy(app) # create the (flask) sqlalchemy connection
ma = Marshmallow(app) # set up Marshmallow - data marshalling / serialising

View File

@@ -20,6 +20,7 @@ from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session
from contextlib import contextmanager
### LOCAL FILE IMPORTS ###
from shared import DB_URL, PA_JOB_MANAGER_HOST, PA_JOB_MANAGER_PORT, THUMBSIZE, SymlinkName, GenThumb, SECS_IN_A_DAY, PA_EXIF_ROTATER, PA
@@ -66,21 +67,41 @@ override_tbls={ "face_no_match_override", "face_force_match_override", "disconne
# this is required to handle the duplicate processing code
sys.setrecursionlimit(50000)
# a Manager, which the Session will use for connection resources
some_engine = create_engine(DB_URL)
# create a configured "Session" class
#Session = sessionmaker(bind=some_engine)
# 1. Add pool_pre_ping and pool_recycle here to handle db container disappearing underneath us
some_engine = create_engine(
DB_URL,
pool_pre_ping=True, # check DB connection is still active before use
pool_recycle=300, # churn connections regardless every 5 mins
pool_size=20, # Parallel-ready base pool
max_overflow=10 # Burst capacity for high socket traffic
)
# create a Session
session_factory = sessionmaker(bind=some_engine)
Session = scoped_session(session_factory)
session = Session()
# HACK: need to remove this and use 'sess' as an actual param everywhere, but there are 200+ call sites, so quick fix until retired
session = Session
# this is a way to handle a session failing
@contextmanager
def PA_db_session():
    """Yield a transactional DB session scoped to the `with` body.

    On clean exit the work is committed; if the body raises, the
    transaction is rolled back and the exception re-raised. In every
    case the scoped-session registry entry is removed afterwards,
    returning the underlying connection to the pool.
    """
    db_sess = Session()  # obtain a fresh session from the scoped registry
    try:
        yield db_sess
        db_sess.commit()  # inside try: a failed commit is rolled back too
    except Exception:
        db_sess.rollback()
        raise
    finally:
        Session.remove()  # tear down the session; connection goes back to the pool
# this creates the Base (like db model in flask)
Base = declarative_base()
################################################################################
# Class describing PathType & in the database (via sqlalchemy)
# series of pre-defined types of paths (import, storage, bin)
@@ -2751,7 +2772,13 @@ if __name__ == "__main__":
InitialValidationChecks()
HandleJobs(True)
# Initial job run on startup (hence True in 1st param)
try:
with PA_db_session() as sess:
HandleJobs(True)
except Exception as e:
PAprint(f"ERROR: Initial job handle failed: {e}")
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind((PA_JOB_MANAGER_HOST, PA_JOB_MANAGER_PORT))
# force timeout every 1 day so we can run scheduled jobs
@@ -2759,18 +2786,35 @@ if __name__ == "__main__":
s.listen()
while True:
try:
# 1. Wait for connection
conn, addr = s.accept()
if DEBUG:
PAprint( f"accept finished, tout={s.timeout}" )
PAprint(f"Connection accepted from {addr}")
# 2. Process Jobs after a successful socket connection
with PA_db_session() as sess:
HandleJobs(False)
# Check for scheduled tasks as well
if ScheduledJobs():
HandleJobs(False)
except socket.timeout:
if DEBUG:
PAprint( f"timeout occurred, tout={s.timeout}" )
if ScheduledJobs():
HandleJobs(False)
PAprint("Socket timeout (Daily maintenance window) reached.")
# 3. Process Scheduled Jobs during the timeout
try:
with PA_db_session() as sess:
if ScheduledJobs():
HandleJobs(False)
except sqlalchemy.exc.OperationalError:
PAprint("DB Connection lost during scheduled task window. Retrying next cycle.")
continue
else:
HandleJobs(False)
# in case we constantly have jobs running, the '1 day' last import might be missed, so check it after each job too
if ScheduledJobs():
HandleJobs(False)
except (sqlalchemy.exc.OperationalError, sqlalchemy.exc.InterfaceError) as e:
# This catches the DB container restart specifically
PAprint(f"DATABASE ERROR: Connection lost. Retrying... {e}")
time.sleep(5) # Brief pause before next socket listen
except Exception as e:
PAprint(f"UNEXPECTED ERROR: {e}")