From e46219151bf8278d701464e717a1d81e53ca3ea1 Mon Sep 17 00:00:00 2001
From: Marty Oehme <contact@martyoeh.me>
Date: Thu, 11 Sep 2025 17:25:20 +0200
Subject: [PATCH] refactor: Use generator for PDF extractor

---
 papis_extract/extractors/pdf.py | 68 +++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/papis_extract/extractors/pdf.py b/papis_extract/extractors/pdf.py
index 6d9acc2..5df854e 100644
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@@ -1,6 +1,7 @@
 # pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
+from collections.abc import Generator
 from pathlib import Path
-from typing import cast
+from typing import NamedTuple, cast
 
 import Levenshtein
 import magic
@@ -14,6 +15,11 @@ from papis_extract.exceptions import ExtractionError
 logger = papis.logging.get_logger(__name__)
 
 
+class PdfAnnot(NamedTuple):  # pairs one annotation with the page it came from
+    page: mu.Page  # page whose annots() produced the annotation
+    annot: mu.Annot  # the annotation itself
+
+
 class PdfExtractor:
     def can_process(self, filename: Path) -> bool:
         if not filename.is_file():
@@ -29,32 +35,24 @@ class PdfExtractor:
         """
         annotations: list[Annotation] = []
         try:
-            with mu.Document(filename) as doc:
-                for page in doc:  # pyright: ignore [reportUnknownVariableType] - missing stub
-                    annot: mu.Annot
-                    for annot in page.annots():
-                        quote, note = self._retrieve_annotation_content(page, annot)
-                        if not quote and not note:
-                            continue
-                        color: tuple[float, float, float] = cast(
-                            "tuple[float, float, float]",
-                            (
-                                annot.colors.get("fill")
-                                or annot.colors.get("stroke")
-                                or (0.0, 0.0, 0.0)
-                            ),
-                        )
-                        page_nr: int = cast("int", page.number or 0)
-                        highlight_type: str = cast("str", annot.type[1] or "")
-                        a = Annotation(
-                            file=str(filename),
-                            content=quote or "",
-                            note=note or "",
-                            color=color,
-                            type=highlight_type,
-                            page=page_nr,
-                        )
-                        annotations.append(a)
+            for page, annot in self._all_pdf_annots(filename):
+                quote, note = self._get_annotation_content(page, annot)
+                if not quote and not note:
+                    continue
+
+                color = self._get_correct_color(annot)
+                page_nr: int = cast("int", page.number or 0)
+                highlight_type: str = cast("str", annot.type[1] or "")
+
+                a = Annotation(
+                    file=str(filename),
+                    content=quote or "",
+                    note=note or "",
+                    color=color,
+                    type=highlight_type,
+                    page=page_nr,
+                )
+                annotations.append(a)
             logger.debug(
                 f"Found {len(annotations)} "
                 f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
@@ -65,11 +63,18 @@ class PdfExtractor:
 
         return annotations
 
+    def _all_pdf_annots(self, filename: Path) -> Generator[PdfAnnot]:  # lazy iterator
+        with mu.Document(filename) as doc:  # kept open between yields
+            for page in doc:
+                annot: mu.Annot  # declares the type of the loop variable below
+                for annot in page.annots():
+                    yield PdfAnnot(page, annot)  # one (page, annot) pair per step
+
     def _is_pdf(self, fname: Path) -> bool:
         """Check if file is a pdf, using mime type."""
         return magic.from_file(fname, mime=True) == "application/pdf"
 
-    def _retrieve_annotation_content(
+    def _get_annotation_content(
         self, page: mu.Page, annotation: mu.Annot
     ) -> tuple[str | None, str | None]:
         """Gets the text content of an annotation.
@@ -102,3 +107,10 @@ class PdfExtractor:
             return (written, None)
         # just a highlight without any text
         return (None, None)
+
+    def _get_correct_color(self, annot: mu.Annot) -> tuple[float, float, float]:
+        """Return the annot's fill color, else stroke color, else (0.0, 0.0, 0.0)."""
+        return cast(
+            "tuple[float, float, float]",
+            (annot.colors.get("fill") or annot.colors.get("stroke") or (0.0, 0.0, 0.0)),
+        )