Fixed BUG-11: replaced the file_person_link table with the file_refimg_link table. This enables some optimisations, and AI jobs can now be re-run without crashing. Several further optimisations could still be done - see TODO.

2021-01-25 01:05:30 +11:00
parent 0829a98376
commit 18b8a30140
9 changed files with 108 additions and 89 deletions
--- a/pa_job_manager.py
+++ b/pa_job_manager.py
@@ -14,7 +14,7 @@
 ### SQLALCHEMY IMPORTS ###

 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, Integer, String, Sequence, Float, ForeignKey, DateTime, LargeBinary
+from sqlalchemy import Column, Integer, String, Sequence, Float, ForeignKey, DateTime, LargeBinary, Boolean
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.orm import relationship
 from sqlalchemy import create_engine
@@ -97,14 +97,23 @@ class Entry(Base):
    def __repr__(self):
        return "<id: {}, name: {}, type={}, dir_details={}, file_details={}, in_dir={}>".format(self.id, self.name, self.type, self.dir_details, self.file_details, self.in_dir)

+class FileRefimgLink(Base):
+    __tablename__ = "file_refimg_link"
+    file_id = Column(Integer, ForeignKey('file.eid'), unique=True, nullable=False, primary_key=True)
+    refimg_id = Column(Integer, ForeignKey('refimg.id'), unique=True, nullable=False, primary_key=True)
+    when_processed = Column(Float)
+    matched = Column(Boolean)
+
+    def __repr__(self):
+        return f"<file_id: {self.file_id}, refimg_id: {self.refimg_id} when_processed={self.when_processed}, matched={self.matched}"
+
 class File(Base):
    __tablename__ = "file"
    eid = Column(Integer, ForeignKey("entry.id"), primary_key=True )
    size_mb = Column(Integer, unique=False, nullable=False)
    hash = Column(Integer, unique=True, nullable=True)
    thumbnail = Column(String, unique=False, nullable=True)
-#   DDP: need bytea? in db (see other DDP comment)
-#    faces = 
+    faces = Column( LargeBinary )
    faces_created_on = Column(Float)

    def __repr__(self):
@@ -126,7 +135,7 @@ class Settings(Base):
    def __repr__(self):
        return "<id: {}, import_path: {}>".format(self.id, self.import_path )
        
-class Person_Refimg_Link(Base):
+class PersonRefimgLink(Base):
    __tablename__ = "person_refimg_link"
    person_id = Column(Integer, ForeignKey('person.id'), unique=True, nullable=False, primary_key=True)
    refimg_id = Column(Integer, ForeignKey('refimg.id'), unique=True, nullable=False, primary_key=True)
@@ -140,7 +149,7 @@ class Person(Base):
    tag = Column(String(48), unique=False, nullable=False)
    surname = Column(String(48), unique=False, nullable=False)
    firstname = Column(String(48), unique=False, nullable=False)
-    refimg = relationship('Refimg', secondary=Person_Refimg_Link.__table__)
+    refimg = relationship('Refimg', secondary=PersonRefimgLink.__table__)

    def __repr__(self):
        return "<tag: {}, firstname: {}, surname: {}, refimg: {}>".format(self.tag,self.firstname, self.surname, self.refimg)
@@ -155,14 +164,6 @@ class Refimg(Base):
    def __repr__(self):
        return f"<id: {id}, fname: {fname}, created_on: {created_on}, encodings: {encodings}>"

-class File_Person_Link(Base):
-    __tablename__ = "file_person_link"
-    file_id = Column(Integer, ForeignKey('file.eid'), unique=True, nullable=False, primary_key=True)
-    person_id = Column(Integer, ForeignKey('person.id'), unique=True, nullable=False, primary_key=True)
-
-    def __repr__(self):
-        return "<file_id: {}, person_id: {}>".format(self.file_id, self.person_id)
-


 ################################################################################
@@ -360,7 +361,7 @@ def JobScanNow(job):

 def JobForceScan(job):
    JobProgressState( job, "In Progress" )
-    session.query(File_Person_Link).delete()
+    session.query(FileRefimgLink).delete()
    session.query(EntryDirLink).delete()
    session.query(Dir).delete()
    session.query(File).delete()
@@ -481,13 +482,8 @@ def JobImportDir(job):
    return

 def JobProcessAI(job):
-    print ("DDP: HACK - to allow re-running jobs for now, del FPL");
-    session.query(File_Person_Link).delete()
-    #### (delete the above 2 lines)
-
    path=[jex.value for jex in job.extra if jex.name == "path"][0]
    path = SymlinkName(path, '/')
-    print('REMOVE AFTER TESTING ON WINDOWS... path=',path)
    d=session.query(Dir).filter(Dir.path_prefix==path).first()
    job.num_files=d.num_files
    for e in FilesInDir( path ):
@@ -518,26 +514,33 @@ def ProcessAI(job, e):
    for person in people:
        generateKnownEncodings(person)
    
+
    file = e.in_dir[0].path_prefix + '/' + e.name
    stat = os.stat(file)
    # only find faces if we have not already OR file is newer than when we found faces before
    if not e.file_details[0].faces_created_on or stat.st_ctime > e.file_details[0].faces_created_on:
        session.add(e)
-        im = Image.open(file)
-        try:
-            im = ImageOps.exif_transpose(im)
-        except:
-            print("DEBUG: looks like image does not have exif")
+        im_orig = Image.open(file)
+        im = ImageOps.exif_transpose(im_orig)

        faces = generateUnknownEncodings(im)
-#       DDP: uncomment the below to optimise, but I need to store the faces into the DB, not sure how right now
-##### is this really 0? or will there be many with the many faces?
-#       if its many, should we do a faces_file_link???
-#        e.file_details[0].faces = faces[0].tobytes()
-#        e.file_details[0].faces_created_on=time.time()
-#   else:
-#       faces=numpy.frombuffer(e.file_details[0].faces,dtype=numpy.float64)      
-
+        e.file_details[0].faces_created_on=time.time()
+        if faces:
+            flat_faces = numpy.array(faces)
+            e.file_details[0].faces = flat_faces.tobytes()
+        else:
+            e.file_details[0].faces = None
+            return
+    else:
+        if not e.file_details[0].faces:
+            print("OPTIM: This image has no faces, skip it")
+            return
+        recover=numpy.frombuffer(e.file_details[0].faces,dtype=numpy.float64)      
+        real_recover=numpy.reshape(recover,(-1,128))      
+        l=[]
+        for el in real_recover:
+            l.append(numpy.array(el))
+        faces = l
    for unknown_encoding in faces:
        for person in people:
            lookForPersonInImage(job, person, unknown_encoding, e)
@@ -546,25 +549,34 @@ def ProcessAI(job, e):

 def lookForPersonInImage(job, person, unknown_encoding, e):
    for refimg in person.refimg:
-        ###
-        # need a date_stamp in refimg_file_link, but we currently have a person_file_link
-        #   should consider whether we break this into just a scan ( id, refimg, file, date, threshold, etc.)
-        ###
+        # lets see if we have tried this check before
+        frl=session.query(FileRefimgLink).filter(FileRefimgLink.file_id==e.id, FileRefimgLink.refimg_id==refimg.id).first()
+        if not frl:
+            frl = FileRefimgLink(refimg_id=refimg.id, file_id=e.file_details[0].eid)
+        else: 
+            stat=os.stat(e.in_dir[0].path_prefix+'/'+ e.name)
+            # file & refimg are not newer then we dont need to check
+            if frl.matched and stat.st_ctime < frl.when_processed and refimg.created_on < frl.when_processed:
+                print("OPTIM: lookForPersonInImage: file has a previous match, and the file & refimg  haven't changed")
+                return
+
+        session.add(frl)
+        frl.matched=False
+        frl.when_processed=time.time()
        deserialized_bytes = numpy.frombuffer(refimg.encodings, dtype=numpy.float64)
        results = compareAI(deserialized_bytes, unknown_encoding)
        if results[0]:
            print(f'Found a match between: {person.tag} and {e.name}')
            AddLogForJob(job, f'Found a match between: {person.tag} and {e.name}')
-            fpl = File_Person_Link(person_id=person.id, file_id=e.file_details[0].eid)
-            session.add(fpl)
+            frl.matched=True
            return

 def generateUnknownEncodings(im):
    unknown_image = numpy.array(im)
    face_locations = face_recognition.face_locations(unknown_image)
+    if not face_locations:
+        return None
    unknown_encodings = face_recognition.face_encodings(unknown_image, known_face_locations=face_locations)
-    # should save these to the db 
-    # file.locations = face_locations
    return unknown_encodings


@@ -573,7 +585,7 @@ def generateKnownEncodings(person):
        file = 'reference_images/'+refimg.fname
        stat = os.stat(file)
        if refimg.created_on and stat.st_ctime < refimg.created_on:
-            print("DEBUG: skipping re-creating encoding for refimg because file has changed since we did this before")
+            print("OPTIM: skipping re-creating encoding for refimg because file has not changed")
            continue
        img = face_recognition.load_image_file(file)
        location = face_recognition.face_locations(img)