From e46219151bf8278d701464e717a1d81e53ca3ea1 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Thu, 11 Sep 2025 17:25:20 +0200
Subject: [PATCH] refactor: Use generator for PDF extractor

---
 papis_extract/extractors/pdf.py | 68 +++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/papis_extract/extractors/pdf.py b/papis_extract/extractors/pdf.py
index 6d9acc2..5df854e 100644
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@@ -1,6 +1,7 @@
 # pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
+from collections.abc import Generator
 from pathlib import Path
-from typing import cast
+from typing import NamedTuple, cast
 
 import Levenshtein
 import magic
@@ -14,6 +15,11 @@ from papis_extract.exceptions import ExtractionError
 logger = papis.logging.get_logger(__name__)
 
 
+class PdfAnnot(NamedTuple):
+    page: mu.Page
+    annot: mu.Annot
+
+
 class PdfExtractor:
     def can_process(self, filename: Path) -> bool:
         if not filename.is_file():
@@ -29,32 +35,24 @@ class PdfExtractor:
         """
         annotations: list[Annotation] = []
         try:
-            with mu.Document(filename) as doc:
-                for page in doc:  # pyright: ignore [reportUnknownVariableType] - missing stub
-                    annot: mu.Annot
-                    for annot in page.annots():
-                        quote, note = self._retrieve_annotation_content(page, annot)
-                        if not quote and not note:
-                            continue
-                        color: tuple[float, float, float] = cast(
-                            "tuple[float, float, float]",
-                            (
-                                annot.colors.get("fill")
-                                or annot.colors.get("stroke")
-                                or (0.0, 0.0, 0.0)
-                            ),
-                        )
-                        page_nr: int = cast("int", page.number or 0)
-                        highlight_type: str = cast("str", annot.type[1] or "")
-                        a = Annotation(
-                            file=str(filename),
-                            content=quote or "",
-                            note=note or "",
-                            color=color,
-                            type=highlight_type,
-                            page=page_nr,
-                        )
-                        annotations.append(a)
+            for page, annot in self._all_pdf_annots(filename):
+                quote, note = self._get_annotation_content(page, annot)
+                if not quote and not note:
+                    continue
+
+                color = self._get_correct_color(annot)
+                page_nr: int = cast("int", page.number or 0)
+                highlight_type: str = cast("str", annot.type[1] or "")
+
+                a = Annotation(
+                    file=str(filename),
+                    content=quote or "",
+                    note=note or "",
+                    color=color,
+                    type=highlight_type,
+                    page=page_nr,
+                )
+                annotations.append(a)
             logger.debug(
                 f"Found {len(annotations)} "
                 f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
@@ -65,11 +63,18 @@ class PdfExtractor:
 
         return annotations
 
+    def _all_pdf_annots(self, filename: Path) -> Generator[PdfAnnot]:
+        with mu.Document(filename) as doc:
+            for page in doc:
+                annot: mu.Annot
+                for annot in page.annots():
+                    yield PdfAnnot(page, annot)
+
     def _is_pdf(self, fname: Path) -> bool:
         """Check if file is a pdf, using mime type."""
         return magic.from_file(fname, mime=True) == "application/pdf"
 
-    def _retrieve_annotation_content(
+    def _get_annotation_content(
         self, page: mu.Page, annotation: mu.Annot
     ) -> tuple[str | None, str | None]:
         """Gets the text content of an annotation.
@@ -102,3 +107,10 @@ class PdfExtractor:
             return (written, None)
         # just a highlight without any text
         return (None, None)
+
+    def _get_correct_color(self, annot: mu.Annot):
+        color: tuple[float, float, float] = cast(
+            "tuple[float, float, float]",
+            (annot.colors.get("fill") or annot.colors.get("stroke") or (0.0, 0.0, 0.0)),
+        )
+        return color