Fixed BUG-11: switched from the file_person_link table to the file_refimg_link table. This enables some optimisations, and AI jobs can now be re-run without crashing. Several further optimisations could still be made - see TODO.

This commit is contained in:
2021-01-25 01:05:30 +11:00
parent 0829a98376
commit 18b8a30140
9 changed files with 108 additions and 89 deletions

View File

@@ -14,7 +14,7 @@
### SQLALCHEMY IMPORTS ###
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Sequence, Float, ForeignKey, DateTime, LargeBinary
from sqlalchemy import Column, Integer, String, Sequence, Float, ForeignKey, DateTime, LargeBinary, Boolean
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
@@ -97,14 +97,23 @@ class Entry(Base):
def __repr__(self):
    """Return a debug string listing this entry's id, name, type and linked details."""
    template = "<id: {}, name: {}, type={}, dir_details={}, file_details={}, in_dir={}>"
    values = (
        self.id,
        self.name,
        self.type,
        self.dir_details,
        self.file_details,
        self.in_dir,
    )
    return template.format(*values)
class FileRefimgLink(Base):
    """One row per (file, reference image) comparison.

    Records when a file was last compared against a reference image and
    whether that comparison produced a match.
    """
    __tablename__ = "file_refimg_link"
    # NOTE(review): unique=True on each column individually (in addition to the
    # composite primary key) would allow a file to be linked to only ONE refimg
    # if the table is created from this model - confirm against the real schema.
    file_id = Column(Integer, ForeignKey('file.eid'), unique=True, nullable=False, primary_key=True)
    refimg_id = Column(Integer, ForeignKey('refimg.id'), unique=True, nullable=False, primary_key=True)
    # timestamp of the last comparison (written from time.time() by lookForPersonInImage)
    when_processed = Column(Float)
    # True when the last comparison found the reference face in the file
    matched = Column(Boolean)

    def __repr__(self):
        """Return a debug string with both keys, the timestamp and the match flag."""
        # fixed: the original string never closed its '<...>' bracket and
        # omitted the comma between refimg_id and when_processed
        return (
            f"<file_id: {self.file_id}, refimg_id: {self.refimg_id}, "
            f"when_processed={self.when_processed}, matched={self.matched}>"
        )
class File(Base):
__tablename__ = "file"
eid = Column(Integer, ForeignKey("entry.id"), primary_key=True )
size_mb = Column(Integer, unique=False, nullable=False)
hash = Column(Integer, unique=True, nullable=True)
thumbnail = Column(String, unique=False, nullable=True)
# DDP: need bytea? in db (see other DDP comment)
# faces =
faces = Column( LargeBinary )
faces_created_on = Column(Float)
def __repr__(self):
@@ -126,7 +135,7 @@ class Settings(Base):
def __repr__(self):
    """Return a debug string with the settings row id and its import path."""
    template = "<id: {}, import_path: {}>"
    return template.format(self.id, self.import_path)
class Person_Refimg_Link(Base):
class PersonRefimgLink(Base):
__tablename__ = "person_refimg_link"
person_id = Column(Integer, ForeignKey('person.id'), unique=True, nullable=False, primary_key=True)
refimg_id = Column(Integer, ForeignKey('refimg.id'), unique=True, nullable=False, primary_key=True)
@@ -140,7 +149,7 @@ class Person(Base):
tag = Column(String(48), unique=False, nullable=False)
surname = Column(String(48), unique=False, nullable=False)
firstname = Column(String(48), unique=False, nullable=False)
refimg = relationship('Refimg', secondary=Person_Refimg_Link.__table__)
refimg = relationship('Refimg', secondary=PersonRefimgLink.__table__)
def __repr__(self):
    """Return a debug string with the person's tag, names and reference images."""
    template = "<tag: {}, firstname: {}, surname: {}, refimg: {}>"
    values = (self.tag, self.firstname, self.surname, self.refimg)
    return template.format(*values)
@@ -155,14 +164,6 @@ class Refimg(Base):
def __repr__(self):
    """Return a debug string with this reference image's id, file name, timestamp and encodings."""
    # fixed: the original f-string used bare names (id, fname, created_on,
    # encodings) instead of the instance attributes, so it printed the builtin
    # `id` function and looked up non-existent globals for the rest
    return (
        f"<id: {self.id}, fname: {self.fname}, "
        f"created_on: {self.created_on}, encodings: {self.encodings}>"
    )
class File_Person_Link(Base):
    """Association row linking a file to a person found in it."""
    __tablename__ = "file_person_link"
    file_id = Column(
        Integer,
        ForeignKey('file.eid'),
        unique=True,
        nullable=False,
        primary_key=True,
    )
    person_id = Column(
        Integer,
        ForeignKey('person.id'),
        unique=True,
        nullable=False,
        primary_key=True,
    )

    def __repr__(self):
        """Return a debug string with both foreign keys."""
        template = "<file_id: {}, person_id: {}>"
        return template.format(self.file_id, self.person_id)
################################################################################
@@ -360,7 +361,7 @@ def JobScanNow(job):
def JobForceScan(job):
JobProgressState( job, "In Progress" )
session.query(File_Person_Link).delete()
session.query(FileRefimgLink).delete()
session.query(EntryDirLink).delete()
session.query(Dir).delete()
session.query(File).delete()
@@ -481,13 +482,8 @@ def JobImportDir(job):
return
def JobProcessAI(job):
print ("DDP: HACK - to allow re-running jobs for now, del FPL");
session.query(File_Person_Link).delete()
#### (delete the above 2 lines)
path=[jex.value for jex in job.extra if jex.name == "path"][0]
path = SymlinkName(path, '/')
print('REMOVE AFTER TESTING ON WINDOWS... path=',path)
d=session.query(Dir).filter(Dir.path_prefix==path).first()
job.num_files=d.num_files
for e in FilesInDir( path ):
@@ -518,26 +514,33 @@ def ProcessAI(job, e):
for person in people:
generateKnownEncodings(person)
file = e.in_dir[0].path_prefix + '/' + e.name
stat = os.stat(file)
# only find faces if we have not already OR file is newer than when we found faces before
if not e.file_details[0].faces_created_on or stat.st_ctime > e.file_details[0].faces_created_on:
session.add(e)
im = Image.open(file)
try:
im = ImageOps.exif_transpose(im)
except:
print("DEBUG: looks like image does not have exif")
im_orig = Image.open(file)
im = ImageOps.exif_transpose(im_orig)
faces = generateUnknownEncodings(im)
# DDP: uncomment the below to optimise, but I need to store the faces into the DB, not sure how right now
##### is this really 0? or will there be many with the many faces?
# if its many, should we do a faces_file_link???
# e.file_details[0].faces = faces[0].tobytes()
# e.file_details[0].faces_created_on=time.time()
# else:
# faces=numpy.frombuffer(e.file_details[0].faces,dtype=numpy.float64)
e.file_details[0].faces_created_on=time.time()
if faces:
flat_faces = numpy.array(faces)
e.file_details[0].faces = flat_faces.tobytes()
else:
e.file_details[0].faces = None
return
else:
if not e.file_details[0].faces:
print("OPTIM: This image has no faces, skip it")
return
recover=numpy.frombuffer(e.file_details[0].faces,dtype=numpy.float64)
real_recover=numpy.reshape(recover,(-1,128))
l=[]
for el in real_recover:
l.append(numpy.array(el))
faces = l
for unknown_encoding in faces:
for person in people:
lookForPersonInImage(job, person, unknown_encoding, e)
@@ -546,25 +549,34 @@ def ProcessAI(job, e):
# Compare one face encoding taken from entry `e` against the reference images of
# `person`, recording each comparison in a FileRefimgLink row (when_processed,
# matched) and logging a match against `job`.
# NOTE(review): indentation was lost in this view, so the exact nesting of the
# `return` statements and of the session.add()/matched lines cannot be confirmed here.
def lookForPersonInImage(job, person, unknown_encoding, e):
for refimg in person.refimg:
###
# need a date_stamp in refimg_file_link, but we currently have a person_file_link
# should consider whether we break this into just a scan ( id, refimg, file, date, threshold, etc.)
###
# lets see if we have tried this check before
# NOTE(review): the lookup keys on e.id while the insert below uses e.file_details[0].eid;
# presumably these are the same value (File.eid is a foreign key to entry.id) - confirm
frl=session.query(FileRefimgLink).filter(FileRefimgLink.file_id==e.id, FileRefimgLink.refimg_id==refimg.id).first()
if not frl:
# first time this file/refimg pair is seen: start a new link row
frl = FileRefimgLink(refimg_id=refimg.id, file_id=e.file_details[0].eid)
else:
stat=os.stat(e.in_dir[0].path_prefix+'/'+ e.name)
# file & refimg are not newer then we dont need to check
# NOTE(review): the shortcut only fires when the previous result was a match;
# a previous non-match is compared again even if nothing changed
if frl.matched and stat.st_ctime < frl.when_processed and refimg.created_on < frl.when_processed:
print("OPTIM: lookForPersonInImage: file has a previous match, and the file & refimg haven't changed")
return
# (re)record this comparison: assume no match until compareAI says otherwise
session.add(frl)
frl.matched=False
frl.when_processed=time.time()
# refimg.encodings is stored as raw bytes; reinterpret it as float64 values
deserialized_bytes = numpy.frombuffer(refimg.encodings, dtype=numpy.float64)
results = compareAI(deserialized_bytes, unknown_encoding)
# only the first element of the compareAI result is inspected
if results[0]:
print(f'Found a match between: {person.tag} and {e.name}')
AddLogForJob(job, f'Found a match between: {person.tag} and {e.name}')
fpl = File_Person_Link(person_id=person.id, file_id=e.file_details[0].eid)
session.add(fpl)
frl.matched=True
return
def generateUnknownEncodings(im):
    """Locate faces in an image and return their encodings.

    im is converted with numpy.array() before detection.
    Returns None when face_recognition.face_locations() finds nothing,
    otherwise the result of face_recognition.face_encodings() for the
    locations that were found.
    """
    pixels = numpy.array(im)
    found_locations = face_recognition.face_locations(pixels)
    if found_locations:
        # should save these to the db
        # file.locations = face_locations
        return face_recognition.face_encodings(pixels, known_face_locations=found_locations)
    return None
@@ -573,7 +585,7 @@ def generateKnownEncodings(person):
file = 'reference_images/'+refimg.fname
stat = os.stat(file)
if refimg.created_on and stat.st_ctime < refimg.created_on:
print("DEBUG: skipping re-creating encoding for refimg because file has changed since we did this before")
print("OPTIM: skipping re-creating encoding for refimg because file has not changed")
continue
img = face_recognition.load_image_file(file)
location = face_recognition.face_locations(img)