refactor: Use generator for PDF extractor

This commit is contained in:
Marty Oehme 2025-09-11 17:25:20 +02:00
parent ff36d30f91
commit e46219151b
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -1,6 +1,7 @@
# pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false # pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
from collections.abc import Generator
from pathlib import Path from pathlib import Path
from typing import cast from typing import NamedTuple, cast
import Levenshtein import Levenshtein
import magic import magic
@ -14,6 +15,11 @@ from papis_extract.exceptions import ExtractionError
logger = papis.logging.get_logger(__name__) logger = papis.logging.get_logger(__name__)
class PdfAnnot(NamedTuple):
    """A single PDF annotation bundled with the page object it travels with.

    Being a ``NamedTuple``, an instance can be unpacked positionally as
    ``page, annot`` or accessed through the field names.
    """

    # The page object stored alongside the annotation.
    page: mu.Page
    # The annotation object itself.
    annot: mu.Annot
class PdfExtractor: class PdfExtractor:
def can_process(self, filename: Path) -> bool: def can_process(self, filename: Path) -> bool:
if not filename.is_file(): if not filename.is_file():
@ -29,32 +35,24 @@ class PdfExtractor:
""" """
annotations: list[Annotation] = [] annotations: list[Annotation] = []
try: try:
with mu.Document(filename) as doc: for page, annot in self._all_pdf_annots(filename):
for page in doc: # pyright: ignore [reportUnknownVariableType] - missing stub quote, note = self._get_annotation_content(page, annot)
annot: mu.Annot if not quote and not note:
for annot in page.annots(): continue
quote, note = self._retrieve_annotation_content(page, annot)
if not quote and not note: color = self._get_correct_color(annot)
continue page_nr: int = cast("int", page.number or 0)
color: tuple[float, float, float] = cast( highlight_type: str = cast("str", annot.type[1] or "")
"tuple[float, float, float]",
( a = Annotation(
annot.colors.get("fill") file=str(filename),
or annot.colors.get("stroke") content=quote or "",
or (0.0, 0.0, 0.0) note=note or "",
), color=color,
) type=highlight_type,
page_nr: int = cast("int", page.number or 0) page=page_nr,
highlight_type: str = cast("str", annot.type[1] or "") )
a = Annotation( annotations.append(a)
file=str(filename),
content=quote or "",
note=note or "",
color=color,
type=highlight_type,
page=page_nr,
)
annotations.append(a)
logger.debug( logger.debug(
f"Found {len(annotations)} " f"Found {len(annotations)} "
f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}." f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
@ -65,11 +63,18 @@ class PdfExtractor:
return annotations return annotations
def _all_pdf_annots(self, filename: Path) -> Generator[PdfAnnot]:
    """Lazily walk a PDF and yield each annotation together with its page.

    The document is opened with ``mu.Document`` as a context manager and
    the ``yield`` happens inside that ``with`` block, so the document stays
    open for as long as the generator is being consumed and is closed once
    the generator finishes or is closed.

    Args:
        filename: Path of the PDF file to open.

    Yields:
        One ``PdfAnnot(page, annot)`` per annotation, page by page.
    """
    with mu.Document(filename) as document:
        for current_page in document:
            # Declared up front so the loop variable has a known type.
            found: mu.Annot
            for found in current_page.annots():
                yield PdfAnnot(page=current_page, annot=found)
def _is_pdf(self, fname: Path) -> bool:
    """Report whether the MIME type detected for ``fname`` is a PDF.

    The type is obtained from ``magic.from_file(..., mime=True)`` and
    compared against ``application/pdf``.
    """
    detected_mime = magic.from_file(fname, mime=True)
    return detected_mime == "application/pdf"
def _retrieve_annotation_content( def _get_annotation_content(
self, page: mu.Page, annotation: mu.Annot self, page: mu.Page, annotation: mu.Annot
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
"""Gets the text content of an annotation. """Gets the text content of an annotation.
@ -102,3 +107,10 @@ class PdfExtractor:
return (written, None) return (written, None)
# just a highlight without any text # just a highlight without any text
return (None, None) return (None, None)
def _get_correct_color(self, annot: mu.Annot):
    """Choose the colour to record for an annotation.

    The ``"fill"`` entry of ``annot.colors`` is preferred. If it is missing
    or empty the ``"stroke"`` entry is used, and if that is missing or empty
    too the result is black, ``(0.0, 0.0, 0.0)``.

    Returns:
        The selected colour, typed as a 3-tuple of floats.
    """
    palette = annot.colors
    chosen = palette.get("fill") or palette.get("stroke") or (0.0, 0.0, 0.0)
    # cast() only informs the type checker; the value is returned unchanged.
    # NOTE(review): nothing here verifies that the colour really has three
    # components -- confirm against the PDF library's colour model.
    return cast("tuple[float, float, float]", chosen)