feat: Add pocketbook extraction

This commit is contained in:
Marty Oehme 2024-01-24 08:55:43 +01:00
parent ddb34fca7b
commit c53cd563b7
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
7 changed files with 78 additions and 14 deletions

View file

@ -9,15 +9,19 @@ from papis.document import Document
TEXT_SIMILARITY_MINIMUM = 0.75 TEXT_SIMILARITY_MINIMUM = 0.75
COLOR_SIMILARITY_MINIMUM = 0.833 COLOR_SIMILARITY_MINIMUM = 0.833
COLORS = { COLORS: dict[str, tuple[float, float, float]] = {
"red": (1, 0, 0),
"green": (0, 1, 0),
"blue": (0, 0, 1), "blue": (0, 0, 1),
"green": (0, 1, 0),
"red": (1, 0, 0),
"cyan": (0, 1, 1),
"yellow": (1, 1, 0), "yellow": (1, 1, 0),
"magenta": (1, 0, 1),
"purple": (0.5, 0, 0.5), "purple": (0.5, 0, 0.5),
"pink": (1, 0.75, 0.8),
"orange": (1, 0.65, 0), "orange": (1, 0.65, 0),
} }
@dataclass @dataclass
class Annotation: class Annotation:
"""A PDF annotation object. """A PDF annotation object.

View file

@ -98,7 +98,7 @@ def _add_annots_to_note(
f.write("\n") f.write("\n")
logger.info( logger.info(
f"Wrote {len(new_annotations)} " f"Wrote {len(new_annotations)} "
f"{'annotation' if len(new_annotations) == 1 else 'annotations'} " f"{'line' if len(new_annotations) == 1 else 'lines'} "
f"to {papis.document.describe(document)}" f"to {papis.document.describe(document)}"
) )

View file

@ -47,6 +47,6 @@ def start(
if not file_available: if not file_available:
# have to remove curlys or papis logger gets upset # have to remove curlys or papis logger gets upset
desc = re.sub("[{}]", "", papis.document.describe(document)) desc = re.sub("[{}]", "", papis.document.describe(document))
logger.warning("Did not find suitable file for document: " f"{desc}") logger.info(f"No {type(extractor)} file for document: {desc}")
return annotations return annotations

View file

@ -1,6 +1,8 @@
from papis_extract.extraction import Extractor from papis_extract.extraction import Extractor
from papis_extract.extractors import pdf from papis_extract.extractors import pdf
from papis_extract.extractors.pocketbook import PocketBookExtractor
all_extractors: dict[str, Extractor] = { all_extractors: dict[str, Extractor] = {
"pdf": pdf.PdfExtractor(), "pdf": pdf.PdfExtractor(),
"pocketbook": PocketBookExtractor(),
} }

View file

@ -1,5 +1,4 @@
from pathlib import Path from pathlib import Path
from typing import Any, Optional
import fitz import fitz
import Levenshtein import Levenshtein
@ -43,7 +42,7 @@ class PdfExtractor:
file=str(filename), file=str(filename),
content=quote or "", content=quote or "",
note=note or "", note=note or "",
colors=col, color=col,
type=annot.type[1], type=annot.type[1],
page=(page.number or 0) + 1, page=(page.number or 0) + 1,
) )
@ -54,14 +53,12 @@ class PdfExtractor:
) )
return annotations return annotations
def _is_pdf(self, fname: Path) -> bool: def _is_pdf(self, fname: Path) -> bool:
"""Check if file is a pdf, using mime type.""" """Check if file is a pdf, using mime type."""
return magic.from_file(fname, mime=True) == "application/pdf" return magic.from_file(fname, mime=True) == "application/pdf"
def _retrieve_annotation_content(
def _retrieve_annotation_content(self, self, page: fitz.Page, annotation: fitz.Annot
page: fitz.Page, annotation: fitz.Annot
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
"""Gets the text content of an annotation. """Gets the text content of an annotation.
@ -77,7 +74,8 @@ class PdfExtractor:
# highlight with selection in note # highlight with selection in note
minimum_similarity = ( minimum_similarity = (
papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0 papis.config.getfloat("minimum_similarity_content", "plugins.extract")
or 1.0
) )
if Levenshtein.ratio(content, written) > minimum_similarity: if Levenshtein.ratio(content, written) > minimum_similarity:
return (content, None) return (content, None)
@ -92,5 +90,3 @@ class PdfExtractor:
return (written, None) return (written, None)
# just a highlight without any text # just a highlight without any text
return (None, None) return (None, None)

View file

@ -0,0 +1,60 @@
from pathlib import Path
import magic
import papis.config
import papis.logging
from bs4 import BeautifulSoup
from papis_extract.annotation import COLORS, Annotation
logger = papis.logging.get_logger(__name__)
class PocketBookExtractor:
    """Extractor for annotation exports produced by the PocketBook app."""

    # Prefix of the CSS class that carries a bookmark's highlight color.
    _COLOR_CLASS_PREFIX = "bm-color-"

    def can_process(self, filename: Path) -> bool:
        """Return True if libmagic reports the file's MIME type as text/xml."""
        return magic.from_file(filename, mime=True) == "text/xml"

    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from pocketbook html file.

        Export annotations from the pocketbook app and add the exported
        html file to a papis document.

        Returns all readable annotations contained in the file
        passed in, with highlights, notes and pages if available.
        Returns an empty list if the file cannot be opened.
        """
        try:
            # Read as bytes so the parser can honor the encoding declared in
            # the document instead of depending on the locale's default.
            with open(filename, "rb") as f:
                html = BeautifulSoup(f.read(), features="xml")
        except OSError:
            # Covers missing files as well as permission and other I/O errors.
            logger.error(f"Could not open file {filename} for extraction.")
            return []

        annotations: list[Annotation] = [
            Annotation(
                file=str(filename),
                content=self._text_of(bm, "div.bm-text>p"),
                note=self._text_of(bm, "div.bm-note>p"),
                color=self._color_of(bm),
                type="Highlight",
                page=self._parse_page(self._text_of(bm, "p.bm-page")),
            )
            for bm in html.select("div.bookmark")
        ]

        logger.debug(
            f"Found {len(annotations)} "
            f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
        )
        return annotations

    @staticmethod
    def _text_of(element, selector: str) -> str:
        """Return the text of the first match for selector, or an empty string."""
        found = element.select_one(selector)
        return found.text if found is not None else ""

    @staticmethod
    def _parse_page(raw: str) -> int:
        """Convert a page label to an int; 0 when it is missing or not numeric."""
        try:
            return int(raw)
        except ValueError:
            # NOTE(review): 0 is used as "page unknown" - confirm this matches
            # how Annotation consumers treat a missing page.
            return 0

    @classmethod
    def _color_of(cls, bookmark) -> tuple[float, float, float]:
        """Map the bookmark's ``bm-color-*`` class to an RGB tuple.

        Falls back to black (0, 0, 0) when no color class is present or the
        color name is not contained in COLORS.
        """
        raw_classes = bookmark.attrs.get("class", "")
        # The XML parser yields a plain string; other parsers may yield a list.
        if isinstance(raw_classes, str):
            class_names = raw_classes.split()
        else:
            class_names = list(raw_classes)

        for name in class_names:
            if name.startswith(cls._COLOR_CLASS_PREFIX):
                color_name = name.removeprefix(cls._COLOR_CLASS_PREFIX)
                return COLORS.get(color_name, (0, 0, 0))
        return (0, 0, 0)

View file

@ -19,9 +19,11 @@ click = "^8.1.7"
whoosh = { version = "^2.7.4", optional = true } whoosh = { version = "^2.7.4", optional = true }
python-magic = "^0.4.27" python-magic = "^0.4.27"
chevron = "^0.14.0" chevron = "^0.14.0"
beautifulsoup4 = { version = "^4.12.3", optional = true }
[tool.poetry.extras] [tool.poetry.extras]
whoosh = ["whoosh"] whoosh = ["whoosh"]
pocketbook = ["beautifulsoup4"]
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
pytest = "^7.4.0" pytest = "^7.4.0"