refactor: Remove pymupdf coupling in extraction

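PyMuPDF is only needed for PDF extraction, which is handled by its own extractor plugin. The overall extraction routine therefore does not need any knowledge of PyMuPDF: the PDF extractor now wraps the library's `FileDataError` in a generic `ExtractionError`, and that is the only exception the extraction routine has to catch.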
2024-06-14 14:59:39 +02:00 · 2024-06-14 14:59:39 +02:00 · 8093259551
commit 8093259551
parent 7261e7d80c
3 changed files with 52 additions and 29 deletions
--- a/papis_extract/extraction.py
+++ b/papis_extract/extraction.py
@ -2,13 +2,13 @@ import re
 from pathlib import Path
 from typing import Protocol

-import fitz
 import papis.config
 import papis.document
 import papis.logging
 from papis.document import Document

 from papis_extract.annotation import Annotation
+from papis_extract.extractors import ExtractionError

 logger = papis.logging.get_logger(__name__)

@ -39,8 +39,8 @@ def start(

        try:
            annotations.extend(extractor.run(fname))
-        except fitz.FileDataError as e:
-            print(f"File structure errors for {file}.\n{e}")
+        except ExtractionError as e:
+            print(f"File extraction errors for {file}.\n{e}")

    if not file_available:
        # have to remove curlys or papis logger gets upset
--- a/papis_extract/extractors/init.py
+++ b/papis_extract/extractors/init.py
@ -16,3 +16,13 @@ if find_spec("bs4") and find_spec("magic"):
    all_extractors["pocketbook"] = PocketBookExtractor()
 else:
    logger.debug("pocketbook extractor not activated.")
+
+
+class ExtractionError(Exception):
+    """Raised for exceptions during extraction.
+
+    Something went wrong during the extraction process in the extractor
+    run routine itself.
+    """
+
+    pass
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@ -7,6 +7,7 @@ import papis.config
 import papis.logging

 from papis_extract.annotation import Annotation
+from papis_extract.extractors import ExtractionError

 logger = papis.logging.get_logger(__name__)

@ -26,31 +27,43 @@ class PdfExtractor:
        Returns all readable annotations contained in the file
        passed in. Only returns Highlight or Text annotations.
        """
-        annotations = []
-        with fitz.Document(filename) as doc:
-            for page in doc:
+        annotations: list[Annotation] = []
+        try:
+            with mu.Document(filename) as doc:
+                for page in doc:  # pyright: ignore [reportUnknownVariableType] - missing stub
+                    page = cast(mu.Page, page)
+                    annot: mu.Annot
                    for annot in page.annots():
                        quote, note = self._retrieve_annotation_content(page, annot)
                        if not quote and not note:
                            continue
-                    col = (
+                        color: tuple[float, float, float] = cast(
+                            tuple[float, float, float],
+                            (
                                annot.colors.get("fill")
                                or annot.colors.get("stroke")
                                or (0.0, 0.0, 0.0)
+                            ),
                        )
+                        page_nr: int = cast(int, page.number or 0)
+                        highlight_type: str = cast(str, annot.type[1] or "")
                        a = Annotation(
                            file=str(filename),
                            content=quote or "",
                            note=note or "",
-                        color=col,
-                        type=annot.type[1],
-                        page=(page.number or 0) + 1,
+                            color=color,
+                            type=highlight_type,
+                            page=page_nr,
                        )
                        annotations.append(a)
            logger.debug(
                f"Found {len(annotations)} "
                f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
            )
+
+        except mu.FileDataError as e:
+            raise ExtractionError
+
        return annotations

    def _is_pdf(self, fname: Path) -> bool:
@ -58,7 +71,7 @@ class PdfExtractor:
        return magic.from_file(fname, mime=True) == "application/pdf"

    def _retrieve_annotation_content(
-        self, page: fitz.Page, annotation: fitz.Annot
+        self, page: mu.Page, annotation: mu.Annot
    ) -> tuple[str | None, str | None]:
        """Gets the text content of an annotation.