From 809325955185da98e12edbcc3ced11fc63ff16dc Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Fri, 14 Jun 2024 14:59:39 +0200 Subject: [PATCH] refactor: Remove pymupdf coupling in extraction The library is only needed for pdf extraction which is taken care of in its own extractor plugin. In the overall extraction routine we do not need any knowledge of the existence of pymupdf. --- papis_extract/extraction.py | 6 +-- papis_extract/extractors/__init__.py | 10 +++++ papis_extract/extractors/pdf.py | 65 +++++++++++++++++----------- 3 files changed, 52 insertions(+), 29 deletions(-) diff --git a/papis_extract/extraction.py b/papis_extract/extraction.py index 1c69026..74a6636 100644 --- a/papis_extract/extraction.py +++ b/papis_extract/extraction.py @@ -2,13 +2,13 @@ import re from pathlib import Path from typing import Protocol -import fitz import papis.config import papis.document import papis.logging from papis.document import Document from papis_extract.annotation import Annotation +from papis_extract.extractors import ExtractionError logger = papis.logging.get_logger(__name__) @@ -39,8 +39,8 @@ def start( try: annotations.extend(extractor.run(fname)) - except fitz.FileDataError as e: - print(f"File structure errors for {file}.\n{e}") + except ExtractionError as e: + print(f"File extraction errors for {file}.\n{e}") if not file_available: # have to remove curlys or papis logger gets upset diff --git a/papis_extract/extractors/__init__.py b/papis_extract/extractors/__init__.py index 47ead33..40af703 100644 --- a/papis_extract/extractors/__init__.py +++ b/papis_extract/extractors/__init__.py @@ -16,3 +16,13 @@ if find_spec("bs4") and find_spec("magic"): all_extractors["pocketbook"] = PocketBookExtractor() else: logger.debug("pocketbook extractor not activated.") + + +class ExtractionError(Exception): + """Raised for exceptions during extraction. + + Something went wrong during the extraction process in the extractor + run routine itself. + """ + + pass diff --git a/papis_extract/extractors/pdf.py b/papis_extract/extractors/pdf.py index a5f8de4..f61cefc 100644 --- a/papis_extract/extractors/pdf.py +++ b/papis_extract/extractors/pdf.py @@ -7,6 +7,7 @@ import papis.config import papis.logging from papis_extract.annotation import Annotation +from papis_extract.extractors import ExtractionError logger = papis.logging.get_logger(__name__) @@ -26,31 +27,43 @@ class PdfExtractor: Returns all readable annotations contained in the file passed in. Only returns Highlight or Text annotations. """ - annotations = [] - with fitz.Document(filename) as doc: - for page in doc: - for annot in page.annots(): - quote, note = self._retrieve_annotation_content(page, annot) - if not quote and not note: - continue - col = ( - annot.colors.get("fill") - or annot.colors.get("stroke") - or (0.0, 0.0, 0.0) - ) - a = Annotation( - file=str(filename), - content=quote or "", - note=note or "", - color=col, - type=annot.type[1], - page=(page.number or 0) + 1, - ) - annotations.append(a) - logger.debug( - f"Found {len(annotations)} " - f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}." - ) + annotations: list[Annotation] = [] + try: + with mu.Document(filename) as doc: + for page in doc: # pyright: ignore [reportUnknownVariableType] - missing stub + page = cast(mu.Page, page) + annot: mu.Annot + for annot in page.annots(): + quote, note = self._retrieve_annotation_content(page, annot) + if not quote and not note: + continue + color: tuple[float, float, float] = cast( + tuple[float, float, float], + ( + annot.colors.get("fill") + or annot.colors.get("stroke") + or (0.0, 0.0, 0.0) + ), + ) + page_nr: int = cast(int, page.number or 0) + highlight_type: str = cast(str, annot.type[1] or "") + a = Annotation( + file=str(filename), + content=quote or "", + note=note or "", + color=color, + type=highlight_type, + page=page_nr, + ) + annotations.append(a) + logger.debug( + f"Found {len(annotations)} " + f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}." + ) + + except mu.FileDataError as e: + raise ExtractionError + return annotations def _is_pdf(self, fname: Path) -> bool: @@ -58,7 +71,7 @@ class PdfExtractor: return magic.from_file(fname, mime=True) == "application/pdf" def _retrieve_annotation_content( - self, page: fitz.Page, annotation: fitz.Annot + self, page: mu.Page, annotation: mu.Annot ) -> tuple[str | None, str | None]: """Gets the text content of an annotation.