refactor: Use generator for PDF extractor

2025-09-11 17:25:20 +02:00 · 2025-09-11 17:25:20 +02:00 · e46219151b
commit e46219151b
parent ff36d30f91
1 changed files with 40 additions and 28 deletions
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@ -1,6 +1,7 @@
 # pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
 from collections.abc import Generator
 from pathlib import Path
-from typing import cast
+from typing import NamedTuple, cast
 import Levenshtein
 import magic
@ -14,6 +15,11 @@ from papis_extract.exceptions import ExtractionError
 logger = papis.logging.get_logger(__name__)
 class PdfAnnot(NamedTuple):
    """A PDF page paired with one annotation taken from that page."""
    # The page the annotation was read from.
    page: mu.Page
    # A single annotation obtained from that page's ``annots()``.
    annot: mu.Annot
 class PdfExtractor:
    def can_process(self, filename: Path) -> bool:
        if not filename.is_file():
@ -29,32 +35,24 @@ class PdfExtractor:
        """
        annotations: list[Annotation] = []
        try:
-            with mu.Document(filename) as doc:
+            for page, annot in self._all_pdf_annots(filename):
-                for page in doc:  # pyright: ignore [reportUnknownVariableType] - missing stub
+                quote, note = self._get_annotation_content(page, annot)
-                    annot: mu.Annot
+                if not quote and not note:
-                    for annot in page.annots():
+                    continue
-                        quote, note = self._retrieve_annotation_content(page, annot)
+
-                        if not quote and not note:
+                color = self._get_correct_color(annot)
-                            continue
+                page_nr: int = cast("int", page.number or 0)
-                        color: tuple[float, float, float] = cast(
+                highlight_type: str = cast("str", annot.type[1] or "")
-                            "tuple[float, float, float]",
+
-                            (
+                a = Annotation(
-                                annot.colors.get("fill")
+                    file=str(filename),
-                                or annot.colors.get("stroke")
+                    content=quote or "",
-                                or (0.0, 0.0, 0.0)
+                    note=note or "",
-                            ),
+                    color=color,
-                        )
+                    type=highlight_type,
-                        page_nr: int = cast("int", page.number or 0)
+                    page=page_nr,
-                        highlight_type: str = cast("str", annot.type[1] or "")
+                )
-                        a = Annotation(
+                annotations.append(a)
                            file=str(filename),
                            content=quote or "",
                            note=note or "",
                            color=color,
                            type=highlight_type,
                            page=page_nr,
                        )
                        annotations.append(a)
            logger.debug(
                f"Found {len(annotations)} "
                f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
@ -65,11 +63,18 @@ class PdfExtractor:
        return annotations
    def _all_pdf_annots(self, filename: Path) -> Generator[PdfAnnot]:
        """Yield every annotation of a PDF together with the page it sits on.

        The file is opened with ``mu.Document`` and walked page by page; each
        annotation returned by a page's ``annots()`` is handed out as a
        ``PdfAnnot(page, annot)`` pair.

        Because this is a generator, the ``with`` block holding the document
        is still active whenever a pair is handed to the caller, and is only
        left once iteration finishes or the generator is closed.
        """
        with mu.Document(filename) as pdf:
            for pdf_page in pdf:
                # Declared up front so the loop variable has a known type.
                page_annot: mu.Annot
                for page_annot in pdf_page.annots():
                    yield PdfAnnot(page=pdf_page, annot=page_annot)
    def _is_pdf(self, fname: Path) -> bool:
        """Tell whether ``fname`` is a PDF, judged by its detected MIME type."""
        mime_type: str = magic.from_file(fname, mime=True)
        return mime_type == "application/pdf"
-    def _retrieve_annotation_content(
+    def _get_annotation_content(
        self, page: mu.Page, annotation: mu.Annot
    ) -> tuple[str | None, str | None]:
        """Gets the text content of an annotation.
@ -102,3 +107,10 @@ class PdfExtractor:
            return (written, None)
        # just a highlight without any text
        return (None, None)
    def _get_correct_color(self, annot: mu.Annot) -> tuple[float, float, float]:
        """Return the color to record for an annotation.

        The annotation's ``fill`` color is preferred. If that entry is missing
        or falsy, its ``stroke`` color is used instead, and if neither yields
        a value the result falls back to ``(0.0, 0.0, 0.0)``.
        """
        # The cast only informs the type checker: the looked-up value is
        # passed through unchanged and its length is not validated here.
        return cast(
            "tuple[float, float, float]",
            annot.colors.get("fill") or annot.colors.get("stroke") or (0.0, 0.0, 0.0),
        )