From 809325955185da98e12edbcc3ced11fc63ff16dc Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Fri, 14 Jun 2024 14:59:39 +0200
Subject: [PATCH] refactor: Remove pymupdf coupling in extraction

The library is only needed for PDF extraction, which is handled by its
own extractor plugin. The overall extraction routine does not need to
know that pymupdf exists.
---
 papis_extract/extraction.py          |  6 +--
 papis_extract/extractors/__init__.py | 10 +++++
 papis_extract/extractors/pdf.py      | 65 +++++++++++++++++-----------
 3 files changed, 52 insertions(+), 29 deletions(-)

diff --git a/papis_extract/extraction.py b/papis_extract/extraction.py
index 1c69026..74a6636 100644
--- a/papis_extract/extraction.py
+++ b/papis_extract/extraction.py
@@ -2,13 +2,13 @@ import re
 from pathlib import Path
 from typing import Protocol
 
-import fitz
 import papis.config
 import papis.document
 import papis.logging
 from papis.document import Document
 
 from papis_extract.annotation import Annotation
+from papis_extract.extractors import ExtractionError
 
 logger = papis.logging.get_logger(__name__)
 
@@ -39,8 +39,8 @@ def start(
 
         try:
             annotations.extend(extractor.run(fname))
-        except fitz.FileDataError as e:
-            print(f"File structure errors for {file}.\n{e}")
+        except ExtractionError as e:  # raised by extractors; backend-agnostic
+            print(f"File extraction errors for {file}.\n{e}")
 
     if not file_available:
         # have to remove curlys or papis logger gets upset
diff --git a/papis_extract/extractors/__init__.py b/papis_extract/extractors/__init__.py
index 47ead33..40af703 100644
--- a/papis_extract/extractors/__init__.py
+++ b/papis_extract/extractors/__init__.py
@@ -16,3 +16,13 @@ if find_spec("bs4") and find_spec("magic"):
     all_extractors["pocketbook"] = PocketBookExtractor()
 else:
     logger.debug("pocketbook extractor not activated.")
+
+
+class ExtractionError(Exception):
+    """Raised when an extractor fails while extracting from a file.
+
+    Extractors wrap backend-specific errors (e.g. pymupdf's) in this type so
+    the overall extraction routine needs no knowledge of the backend library.
+    """
+
+    pass
diff --git a/papis_extract/extractors/pdf.py b/papis_extract/extractors/pdf.py
index a5f8de4..f61cefc 100644
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@@ -7,6 +7,7 @@ import papis.config
 import papis.logging
 
 from papis_extract.annotation import Annotation
+from papis_extract.extractors import ExtractionError
 
 logger = papis.logging.get_logger(__name__)
 
@@ -26,31 +27,43 @@ class PdfExtractor:
         Returns all readable annotations contained in the file
         passed in. Only returns Highlight or Text annotations.
         """
-        annotations = []
-        with fitz.Document(filename) as doc:
-            for page in doc:
-                for annot in page.annots():
-                    quote, note = self._retrieve_annotation_content(page, annot)
-                    if not quote and not note:
-                        continue
-                    col = (
-                        annot.colors.get("fill")
-                        or annot.colors.get("stroke")
-                        or (0.0, 0.0, 0.0)
-                    )
-                    a = Annotation(
-                        file=str(filename),
-                        content=quote or "",
-                        note=note or "",
-                        color=col,
-                        type=annot.type[1],
-                        page=(page.number or 0) + 1,
-                    )
-                    annotations.append(a)
-        logger.debug(
-            f"Found {len(annotations)} "
-            f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
-        )
+        annotations: list[Annotation] = []
+        try:
+            with mu.Document(filename) as doc:
+                for page in doc:  # pyright: ignore [reportUnknownVariableType] - missing stub
+                    page = cast(mu.Page, page)
+                    annot: mu.Annot
+                    for annot in page.annots():
+                        quote, note = self._retrieve_annotation_content(page, annot)
+                        if not quote and not note:
+                            continue
+                        color: tuple[float, float, float] = cast(
+                            tuple[float, float, float],
+                            (
+                                annot.colors.get("fill")
+                                or annot.colors.get("stroke")
+                                or (0.0, 0.0, 0.0)
+                            ),
+                        )
+                        page_nr: int = cast(int, page.number or 0) + 1  # 1-based
+                        highlight_type: str = cast(str, annot.type[1] or "")
+                        a = Annotation(
+                            file=str(filename),
+                            content=quote or "",
+                            note=note or "",
+                            color=color,
+                            type=highlight_type,
+                            page=page_nr,
+                        )
+                        annotations.append(a)
+            logger.debug(
+                f"Found {len(annotations)} "
+                f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
+            )
+
+        except mu.FileDataError as e:  # keep pymupdf errors inside this plugin
+            raise ExtractionError(str(e)) from e  # keep the message for callers
+
         return annotations
 
     def _is_pdf(self, fname: Path) -> bool:
@@ -58,7 +71,7 @@ class PdfExtractor:
         return magic.from_file(fname, mime=True) == "application/pdf"
 
     def _retrieve_annotation_content(
-        self, page: fitz.Page, annotation: fitz.Annot
+        self, page: mu.Page, annotation: mu.Annot
     ) -> tuple[str | None, str | None]:
         """Gets the text content of an annotation.