refactor: Use generator for PDF extractor

2025-09-11 17:25:20 +02:00 · 2025-09-11 17:25:20 +02:00 · e46219151b
commit e46219151b
parent ff36d30f91
1 changed files with 40 additions and 28 deletions
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@ -1,6 +1,7 @@
 # pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
+from collections.abc import Generator
 from pathlib import Path
-from typing import cast
+from typing import NamedTuple, cast

 import Levenshtein
 import magic
@ -14,6 +15,11 @@ from papis_extract.exceptions import ExtractionError
 logger = papis.logging.get_logger(__name__)


+class PdfAnnot(NamedTuple):
+    """Pair of a PDF page and one annotation found on that page."""
+
+    # The page object the annotation was read from.
+    page: mu.Page
+    # The annotation object itself.
+    annot: mu.Annot
+
 class PdfExtractor:
    def can_process(self, filename: Path) -> bool:
        if not filename.is_file():
@ -29,32 +35,24 @@ class PdfExtractor:
        """
        annotations: list[Annotation] = []
        try:
-            with mu.Document(filename) as doc:
-                for page in doc:  # pyright: ignore [reportUnknownVariableType] - missing stub
-                    annot: mu.Annot
-                    for annot in page.annots():
-                        quote, note = self._retrieve_annotation_content(page, annot)
-                        if not quote and not note:
-                            continue
-                        color: tuple[float, float, float] = cast(
-                            "tuple[float, float, float]",
-                            (
-                                annot.colors.get("fill")
-                                or annot.colors.get("stroke")
-                                or (0.0, 0.0, 0.0)
-                            ),
-                        )
-                        page_nr: int = cast("int", page.number or 0)
-                        highlight_type: str = cast("str", annot.type[1] or "")
-                        a = Annotation(
-                            file=str(filename),
-                            content=quote or "",
-                            note=note or "",
-                            color=color,
-                            type=highlight_type,
-                            page=page_nr,
-                        )
-                        annotations.append(a)
+            for page, annot in self._all_pdf_annots(filename):
+                quote, note = self._get_annotation_content(page, annot)
+                if not quote and not note:
+                    continue
+
+                color = self._get_correct_color(annot)
+                page_nr: int = cast("int", page.number or 0)
+                highlight_type: str = cast("str", annot.type[1] or "")
+
+                a = Annotation(
+                    file=str(filename),
+                    content=quote or "",
+                    note=note or "",
+                    color=color,
+                    type=highlight_type,
+                    page=page_nr,
+                )
+                annotations.append(a)
            logger.debug(
                f"Found {len(annotations)} "
                f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
@ -65,11 +63,18 @@ class PdfExtractor:

        return annotations

+    def _all_pdf_annots(self, filename: Path) -> Generator[PdfAnnot]:
+        """Lazily yield every annotation of the document together with its page.
+
+        The document is opened inside the generator, so it is still open
+        whenever a yielded pair is handed to the caller; the ``with`` block is
+        only left once iteration finishes or the generator is closed.
+
+        Args:
+            filename: Path of the file handed to ``mu.Document``.
+
+        Yields:
+            One ``PdfAnnot(page, annot)`` per annotation, page by page.
+        """
+        with mu.Document(filename) as doc:
+            for page in doc:
+                # Annotation only: gives the loop variable below a known type.
+                annot: mu.Annot
+                for annot in page.annots():
+                    yield PdfAnnot(page, annot)
+
    def _is_pdf(self, fname: Path) -> bool:
        """Check if file is a pdf, using mime type.

        Returns True only when ``magic.from_file`` reports the mime type
        ``application/pdf`` for ``fname``.
        """
        return magic.from_file(fname, mime=True) == "application/pdf"

-    def _retrieve_annotation_content(
+    def _get_annotation_content(
        self, page: mu.Page, annotation: mu.Annot
    ) -> tuple[str | None, str | None]:
        """Gets the text content of an annotation.
@ -102,3 +107,10 @@ class PdfExtractor:
            return (written, None)
        # just a highlight without any text
        return (None, None)
+
+    def _get_correct_color(self, annot: mu.Annot) -> tuple[float, float, float]:
+        """Return the colour to record for an annotation.
+
+        Looks up ``"fill"`` first and ``"stroke"`` second in ``annot.colors``
+        and uses the first truthy value; when neither is set the result is
+        ``(0.0, 0.0, 0.0)``.
+        """
+        # ``cast`` only informs the type checker; the value is not converted.
+        color: tuple[float, float, float] = cast(
+            "tuple[float, float, float]",
+            (annot.colors.get("fill") or annot.colors.get("stroke") or (0.0, 0.0, 0.0)),
+        )
+        return color