From 96b9a6b5ca9b2e3ca5067d602e46d153838406a0 Mon Sep 17 00:00:00 2001 From: Damien De Paoli Date: Mon, 25 Jan 2021 12:08:08 +1100 Subject: [PATCH] fixed BUG-16 - Hash optimisation (last_hash_date is now in File not Dir), also converted a few .format() to f" --- BUGs | 1 - pa_job_manager.py | 46 ++++++++++++++++++++++------------------------ tables.sql | 4 ++-- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/BUGs b/BUGs index 52e4a62..4882752 100644 --- a/BUGs +++ b/BUGs @@ -1,3 +1,2 @@ ### Next: 19 -BUG-16: now we dont do dir level optimising for genfiledetails, need to stat the file / check dates that way to optimise BUG-17: I think it won't handle me deleting files after scan diff --git a/pa_job_manager.py b/pa_job_manager.py index 93fc4bf..358e7ee 100644 --- a/pa_job_manager.py +++ b/pa_job_manager.py @@ -70,7 +70,7 @@ class EntryDirLink(Base): dir_eid = Column(Integer, ForeignKey("dir.eid"), primary_key=True ) def __repr__(self): - return "".format(self.entry_id, self.dir_eid) + return f"" class Dir(Base): __tablename__ = "dir" @@ -78,11 +78,10 @@ class Dir(Base): path_prefix = Column(String, unique=False, nullable=False ) num_files = Column(Integer) last_import_date = Column(Float) - last_hash_date = Column(Float) files = relationship("Entry", secondary="entry_dir_link") def __repr__(self): - return "".format(self.eid, self.path_prefix, self.num_files, self.last_import_date, self.last_hash_date) + return f"" class Entry(Base): __tablename__ = "entry" @@ -95,7 +94,7 @@ class Entry(Base): in_dir = relationship ("Dir", secondary="entry_dir_link" ) def __repr__(self): - return "".format(self.id, self.name, self.type, self.dir_details, self.file_details, self.in_dir) + return f"" class FileRefimgLink(Base): __tablename__ = "file_refimg_link" @@ -113,11 +112,12 @@ class File(Base): size_mb = Column(Integer, unique=False, nullable=False) hash = Column(Integer, unique=True, nullable=True) thumbnail = Column(String, unique=False, nullable=True) + 
last_hash_date = Column(Float) faces = Column( LargeBinary ) faces_created_on = Column(Float) def __repr__(self): - return "".format(self.eid, self.size_mb, self.hash ) + return f"" class FileType(Base): __tablename__ = "file_type" @@ -125,7 +125,7 @@ class FileType(Base): name = Column(String, unique=True, nullable=False ) def __repr__(self): - return "".format(self.id, self.name ) + return f"" class Settings(Base): __tablename__ = "settings" @@ -133,7 +133,7 @@ class Settings(Base): import_path = Column(String) def __repr__(self): - return "".format(self.id, self.import_path ) + return f"" class PersonRefimgLink(Base): __tablename__ = "person_refimg_link" @@ -141,7 +141,7 @@ class PersonRefimgLink(Base): refimg_id = Column(Integer, ForeignKey('refimg.id'), unique=True, nullable=False, primary_key=True) def __repr__(self): - return "".format(self.person_id, self.refimg_id) + return f"" class Person(Base): __tablename__ = "person" @@ -152,7 +152,7 @@ class Person(Base): refimg = relationship('Refimg', secondary=PersonRefimgLink.__table__) def __repr__(self): - return "".format(self.tag,self.firstname, self.surname, self.refimg) + return f"" class Refimg(Base): __tablename__ = "refimg" @@ -162,7 +162,7 @@ class Refimg(Base): created_on = Column(Float) def __repr__(self): - return f"" + return f"" @@ -397,7 +397,7 @@ def AddDir(job, dirname, path_prefix, in_dir): dir=session.query(Dir).filter(Dir.path_prefix==path_prefix).first() if dir: return dir - dir=Dir( path_prefix=path_prefix, num_files=0, last_import_date=0, last_hash_date=0 ) + dir=Dir( path_prefix=path_prefix, num_files=0, last_import_date=0 ) dtype=session.query(FileType).filter(FileType.name=='Directory').first() e=Entry( name=dirname, type=dtype ) e.dir_details.append(dir) @@ -417,7 +417,7 @@ def AddFile(job, fname, type_str, fsize, in_dir ): return e ftype = session.query(FileType).filter(FileType.name==type_str).first() e=Entry( name=fname, type=ftype ) - f=File( size_mb=fsize ) + f=File( 
size_mb=fsize, last_hash_date=0, faces_created_on=0 ) e.file_details.append(f) e.in_dir.append(in_dir) AddLogForJob(job, "Found new file: {}".format(fname) ) @@ -501,6 +501,12 @@ def FilesInDir( path ): return d.files def GenHashAndThumb(job, e): + stat = os.stat( e.in_dir[0].path_prefix + '/' + e.name ) + if stat.st_ctime < e.file_details[0].last_hash_date: + print(f"OPTIM: GenHashAndThumb {e.name} file is older than last hash, skip this") + job.current_file_num+=1 + return + e.file_details[0].hash = md5( job, e.in_dir[0].path_prefix+'/'+ e.name ) if e.type.name == 'Image': e.file_details[0].thumbnail = GenImageThumbnail( job, e.in_dir[0].path_prefix+'/'+ e.name ) @@ -508,6 +514,7 @@ def GenHashAndThumb(job, e): e.file_details[0].thumbnail = GenVideoThumbnail( job, e.in_dir[0].path_prefix+'/'+ e.name ) elif e.type.name == 'Unknown': job.current_file_num+=1 + e.file_details[0].last_hash_date = time.time() return def ProcessAI(job, e): @@ -518,8 +525,8 @@ def ProcessAI(job, e): file = e.in_dir[0].path_prefix + '/' + e.name stat = os.stat(file) - # only find faces if we have not already OR file is newer than when we found faces before - if not e.file_details[0].faces_created_on or stat.st_ctime > e.file_details[0].faces_created_on: + # find if file is newer than when we found faces before (fyi: first time faces_created_on == 0) + if stat.st_ctime > e.file_details[0].faces_created_on: session.add(e) im_orig = Image.open(file) im = ImageOps.exif_transpose(im_orig) @@ -561,7 +568,7 @@ def lookForPersonInImage(job, person, unknown_encoding, e): stat=os.stat(e.in_dir[0].path_prefix+'/'+ e.name) # file & refimg are not newer then we dont need to check if frl.matched and stat.st_ctime < frl.when_processed and refimg.created_on < frl.when_processed: - print("OPTIM: lookForPersonInImage: file has a previous match, and the file & refimg haven't changed") + print(f"OPTIM: lookForPersonInImage: file {e.name} has a previous match for: {refimg.fname}, and the file & refimg haven't 
changed") return session.add(frl) @@ -622,15 +629,6 @@ def JobGetFileDetails(job): if DEBUG==1: print("DEBUG: JobGetFileDetails for path={}".format( path ) ) dir=session.query(Dir).filter(Dir.path_prefix==path).first() - stat=os.stat( path ) - if stat.st_ctime < dir.last_hash_date: - session.add(dir) - dir.last_hash_date = time.time() - FinishJob(job, "{} has not changed since last hashing - finished job".format(dir.path_prefix)) - if DEBUG==1: - print ("DEBUG: skip this dir {} as it has not changed since last hashing".format(dir.path_prefix)) - return - dir.last_hash_date = time.time() job.current_file_num = 0 job.num_files = dir.num_files session.commit() diff --git a/tables.sql b/tables.sql index a394ef1..f1190e1 100644 --- a/tables.sql +++ b/tables.sql @@ -8,11 +8,11 @@ create table ENTRY( ID integer, NAME varchar(128), TYPE_ID integer, constraint PK_ENTRY_ID primary key(ID), constraint FK_FILE_TYPE_TYPE_ID foreign key (TYPE_ID) references FILE_TYPE(ID) ); -create table FILE ( EID integer, SIZE_MB integer, HASH varchar(34), THUMBNAIL varchar, FACES_CREATED_ON float, FACES bytea, +create table FILE ( EID integer, SIZE_MB integer, HASH varchar(34), THUMBNAIL varchar, FACES_CREATED_ON float, FACES bytea, LAST_HASH_DATE float, constraint PK_FILE_ID primary key(EID), constraint FK_FILE_ENTRY_ID foreign key (EID) references ENTRY(ID) ); -create table DIR ( EID integer, PATH_PREFIX varchar(256), NUM_FILES integer, LAST_IMPORT_DATE float, LAST_HASH_DATE float, +create table DIR ( EID integer, PATH_PREFIX varchar(256), NUM_FILES integer, LAST_IMPORT_DATE float, constraint PK_DIR_EID primary key(EID), constraint FK_DIR_ENTRY_ID foreign key (EID) references ENTRY(ID) );