refactor: Use generator for PDF extractor

2025-09-11 17:25:20 +02:00 · 2025-09-11 17:25:20 +02:00 · e46219151b
commit e46219151b
parent ff36d30f91
1 changed file with 40 additions and 28 deletions
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@ -1,6 +1,7 @@
 # pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
 from collections.abc import Generator
 from pathlib import Path
-from typing import cast
+from typing import NamedTuple, cast
 import Levenshtein
 import magic
@ -14,6 +15,11 @@ from papis_extract.exceptions import ExtractionError
 logger = papis.logging.get_logger(__name__)
 class PdfAnnot(NamedTuple):
    """A single PDF annotation paired with the page it was found on.

    Being a NamedTuple, it can be unpacked directly as ``page, annot``.
    """

    # The page whose ``annots()`` iterator produced the annotation.
    page: mu.Page
    # The annotation object itself.
    annot: mu.Annot
 class PdfExtractor:
    def can_process(self, filename: Path) -> bool:
        if not filename.is_file():
@ -29,23 +35,15 @@ class PdfExtractor:
        """
        annotations: list[Annotation] = []
        try:
-            with mu.Document(filename) as doc:
+            for page, annot in self._all_pdf_annots(filename):
-                for page in doc:  # pyright: ignore [reportUnknownVariableType] - missing stub
+                quote, note = self._get_annotation_content(page, annot)
                    annot: mu.Annot
                    for annot in page.annots():
                        quote, note = self._retrieve_annotation_content(page, annot)
                if not quote and not note:
                    continue
-                        color: tuple[float, float, float] = cast(
+
-                            "tuple[float, float, float]",
+                color = self._get_correct_color(annot)
                            (
                                annot.colors.get("fill")
                                or annot.colors.get("stroke")
                                or (0.0, 0.0, 0.0)
                            ),
                        )
                page_nr: int = cast("int", page.number or 0)
                highlight_type: str = cast("str", annot.type[1] or "")
                a = Annotation(
                    file=str(filename),
                    content=quote or "",
@ -65,11 +63,18 @@ class PdfExtractor:
        return annotations
    def _all_pdf_annots(self, filename: Path) -> Generator[PdfAnnot]:
        """Lazily yield every annotation in a PDF together with its page.

        Opens ``filename`` as a document and walks its pages in iteration
        order, yielding one ``PdfAnnot(page, annot)`` per annotation returned
        by ``page.annots()``.

        Args:
            filename: Path of the PDF file to open.

        Yields:
            A ``PdfAnnot`` for each annotation on each page.
        """
        # The yield sits inside the ``with`` block, so the document remains
        # open while the generator is suspended and is only closed once the
        # generator is exhausted or finalized.
        # NOTE(review): yielded pages/annots presumably should not be used
        # after the document is closed -- confirm against the PyMuPDF docs.
        with mu.Document(filename) as doc:
            for page in doc:
                annot: mu.Annot
                for annot in page.annots():
                    yield PdfAnnot(page, annot)
    def _is_pdf(self, fname: Path) -> bool:
        """Check if a file is a PDF, judged by its detected mime type.

        Args:
            fname: Path of the file to inspect.

        Returns:
            True if ``magic.from_file`` reports the mime type
            ``application/pdf`` for the file, False otherwise.
        """
        return magic.from_file(fname, mime=True) == "application/pdf"
-    def _retrieve_annotation_content(
+    def _get_annotation_content(
        self, page: mu.Page, annotation: mu.Annot
    ) -> tuple[str | None, str | None]:
        """Gets the text content of an annotation.
@ -102,3 +107,10 @@ class PdfExtractor:
            return (written, None)
        # just a highlight without any text
        return (None, None)
    def _get_correct_color(self, annot: mu.Annot) -> tuple[float, float, float]:
        """Pick the color to report for an annotation.

        Looks up the ``"fill"`` entry of ``annot.colors`` first, then the
        ``"stroke"`` entry, and falls back to ``(0.0, 0.0, 0.0)`` if neither
        is usable.

        Args:
            annot: The annotation whose ``colors`` mapping is inspected.

        Returns:
            The chosen color as a 3-tuple of floats.
        """
        # ``or`` chaining means a falsy entry (missing, None or empty) is
        # skipped just like an absent one. The cast only informs the type
        # checker; it performs no runtime conversion or validation.
        color: tuple[float, float, float] = cast(
            "tuple[float, float, float]",
            (annot.colors.get("fill") or annot.colors.get("stroke") or (0.0, 0.0, 0.0)),
        )
        return color