diff --git a/papis_extract/extraction.py b/papis_extract/extraction.py
index 1c69026..74a6636 100644
--- a/papis_extract/extraction.py
+++ b/papis_extract/extraction.py
@@ -2,13 +2,13 @@ import re
 from pathlib import Path
 from typing import Protocol
 
-import fitz
 import papis.config
 import papis.document
 import papis.logging
 from papis.document import Document
 
 from papis_extract.annotation import Annotation
+from papis_extract.extractors import ExtractionError
 
 logger = papis.logging.get_logger(__name__)
 
@@ -39,8 +39,8 @@ def start(
 
         try:
             annotations.extend(extractor.run(fname))
-        except fitz.FileDataError as e:
-            print(f"File structure errors for {file}.\n{e}")
+        except ExtractionError as e:
+            print(f"File extraction errors for {file}.\n{e}")
 
     if not file_available:
         # have to remove curlys or papis logger gets upset
diff --git a/papis_extract/extractors/__init__.py b/papis_extract/extractors/__init__.py
index 47ead33..40af703 100644
--- a/papis_extract/extractors/__init__.py
+++ b/papis_extract/extractors/__init__.py
@@ -16,3 +16,13 @@ if find_spec("bs4") and find_spec("magic"):
     all_extractors["pocketbook"] = PocketBookExtractor()
 else:
     logger.debug("pocketbook extractor not activated.")
+
+
+class ExtractionError(Exception):
+    """Raised for exceptions during extraction.
+
+    Something went wrong during the extraction process in the extractor
+    run routine itself.
+    """
+
+    pass
diff --git a/papis_extract/extractors/pdf.py b/papis_extract/extractors/pdf.py
index a5f8de4..f61cefc 100644
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@@ -7,6 +7,7 @@ import papis.config
 import papis.logging
 
 from papis_extract.annotation import Annotation
+from papis_extract.extractors import ExtractionError
 
 logger = papis.logging.get_logger(__name__)
 
@@ -26,31 +27,43 @@ class PdfExtractor:
         Returns all readable annotations contained in the file
         passed in. Only returns Highlight or Text annotations.
         """
-        annotations = []
-        with fitz.Document(filename) as doc:
-            for page in doc:
-                for annot in page.annots():
-                    quote, note = self._retrieve_annotation_content(page, annot)
-                    if not quote and not note:
-                        continue
-                    col = (
-                        annot.colors.get("fill")
-                        or annot.colors.get("stroke")
-                        or (0.0, 0.0, 0.0)
-                    )
-                    a = Annotation(
-                        file=str(filename),
-                        content=quote or "",
-                        note=note or "",
-                        color=col,
-                        type=annot.type[1],
-                        page=(page.number or 0) + 1,
-                    )
-                    annotations.append(a)
-        logger.debug(
-            f"Found {len(annotations)} "
-            f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
-        )
+        annotations: list[Annotation] = []
+        try:
+            with mu.Document(filename) as doc:
+                for page in doc:  # pyright: ignore [reportUnknownVariableType] - missing stub
+                    page = cast(mu.Page, page)
+                    annot: mu.Annot
+                    for annot in page.annots():
+                        quote, note = self._retrieve_annotation_content(page, annot)
+                        if not quote and not note:
+                            continue
+                        color: tuple[float, float, float] = cast(
+                            tuple[float, float, float],
+                            (
+                                annot.colors.get("fill")
+                                or annot.colors.get("stroke")
+                                or (0.0, 0.0, 0.0)
+                            ),
+                        )
+                        page_nr: int = cast(int, page.number or 0) + 1  # 1-based
+                        highlight_type: str = cast(str, annot.type[1] or "")
+                        a = Annotation(
+                            file=str(filename),
+                            content=quote or "",
+                            note=note or "",
+                            color=color,
+                            type=highlight_type,
+                            page=page_nr,
+                        )
+                        annotations.append(a)
+            logger.debug(
+                f"Found {len(annotations)} "
+                f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
+            )
+
+        except mu.FileDataError as e:
+            raise ExtractionError(str(e)) from e  # keep the parser's message and cause
+
         return annotations
 
     def _is_pdf(self, fname: Path) -> bool:
@@ -58,7 +71,7 @@ class PdfExtractor:
         return magic.from_file(fname, mime=True) == "application/pdf"
 
     def _retrieve_annotation_content(
-        self, page: fitz.Page, annotation: fitz.Annot
+        self, page: mu.Page, annotation: mu.Annot
     ) -> tuple[str | None, str | None]:
         """Gets the text content of an annotation.
 