feat: Add advanced pocketbook detection heuristic

Added a heuristic that checks for a specific generator meta tag
which PocketBook writes into its exported XHTML file.
This commit is contained in:
Marty Oehme 2024-01-24 14:56:30 +01:00
parent 6a8f8a03bc
commit c8e8453b68
Signed by: Marty
GPG Key ID: EDBF2ED917B2EF6A
1 changed file with 13 additions and 3 deletions

View File

@ -12,7 +12,19 @@ logger = papis.logging.get_logger(__name__)
class PocketBookExtractor:
def can_process(self, filename: Path) -> bool:
    """Decide whether *filename* looks like a PocketBook bookmarks export.

    The check is a three-step heuristic, each step short-circuiting to
    ``False``:

    1. The file content, as returned by ``self._read_file``, must be
       non-empty.
    2. libmagic must classify that content as ``text/xml``.
    3. The parsed document must contain a ``<meta>`` element with
       ``name="generator"`` and
       ``content="PocketBook Bookmarks Export"``.

    Args:
        filename: Path of the candidate file.

    Returns:
        ``True`` only if all three checks pass, ``False`` otherwise.
    """
    # NOTE: the former one-line check
    # (`return magic.from_file(filename, mime=True) == "text/xml"`) has been
    # removed; left in place as the first statement it made everything
    # below unreachable.
    content = self._read_file(filename)
    if not content:
        return False

    # Cheap MIME sniff first, so clearly non-XML input is never parsed.
    if magic.from_buffer(content, mime=True) != "text/xml":
        return False

    html = BeautifulSoup(content, features="xml")
    generator_tag = html.find(
        "meta", {"name": "generator", "content": "PocketBook Bookmarks Export"}
    )
    return bool(generator_tag)
def run(self, filename: Path) -> list[Annotation]:
"""Extract annotations from pocketbook html file.
@ -64,5 +76,3 @@ class PocketBookExtractor:
except FileNotFoundError:
logger.error(f"Could not open file {filename} for extraction.")
return ""