refactor: Remove pymupdf coupling in extraction

The library is only needed for PDF extraction, which is handled by
its own extractor plugin. The overall extraction routine does not
need any knowledge of the existence of pymupdf.
This commit is contained in:
Marty Oehme 2024-06-14 14:59:39 +02:00
parent 7261e7d80c
commit 8093259551
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
3 changed files with 52 additions and 29 deletions

View file

@ -2,13 +2,13 @@ import re
from pathlib import Path from pathlib import Path
from typing import Protocol from typing import Protocol
import fitz
import papis.config import papis.config
import papis.document import papis.document
import papis.logging import papis.logging
from papis.document import Document from papis.document import Document
from papis_extract.annotation import Annotation from papis_extract.annotation import Annotation
from papis_extract.extractors import ExtractionError
logger = papis.logging.get_logger(__name__) logger = papis.logging.get_logger(__name__)
@ -39,8 +39,8 @@ def start(
try: try:
annotations.extend(extractor.run(fname)) annotations.extend(extractor.run(fname))
except fitz.FileDataError as e: except ExtractionError as e:
print(f"File structure errors for {file}.\n{e}") print(f"File extraction errors for {file}.\n{e}")
if not file_available: if not file_available:
# have to remove curlys or papis logger gets upset # have to remove curlys or papis logger gets upset

View file

@ -16,3 +16,13 @@ if find_spec("bs4") and find_spec("magic"):
all_extractors["pocketbook"] = PocketBookExtractor() all_extractors["pocketbook"] = PocketBookExtractor()
else: else:
logger.debug("pocketbook extractor not activated.") logger.debug("pocketbook extractor not activated.")
class ExtractionError(Exception):
    """Raised for exceptions during extraction.

    Something went wrong during the extraction process in the extractor
    run routine itself. Extractors raise this instead of exposing
    errors specific to their backend library, so the overall extraction
    routine only needs to handle this one exception type.
    """

View file

@ -7,6 +7,7 @@ import papis.config
import papis.logging import papis.logging
from papis_extract.annotation import Annotation from papis_extract.annotation import Annotation
from papis_extract.extractors import ExtractionError
logger = papis.logging.get_logger(__name__) logger = papis.logging.get_logger(__name__)
@ -26,31 +27,43 @@ class PdfExtractor:
Returns all readable annotations contained in the file Returns all readable annotations contained in the file
passed in. Only returns Highlight or Text annotations. passed in. Only returns Highlight or Text annotations.
""" """
annotations = [] annotations: list[Annotation] = []
with fitz.Document(filename) as doc: try:
for page in doc: with mu.Document(filename) as doc:
for annot in page.annots(): for page in doc: # pyright: ignore [reportUnknownVariableType] - missing stub
quote, note = self._retrieve_annotation_content(page, annot) page = cast(mu.Page, page)
if not quote and not note: annot: mu.Annot
continue for annot in page.annots():
col = ( quote, note = self._retrieve_annotation_content(page, annot)
annot.colors.get("fill") if not quote and not note:
or annot.colors.get("stroke") continue
or (0.0, 0.0, 0.0) color: tuple[float, float, float] = cast(
) tuple[float, float, float],
a = Annotation( (
file=str(filename), annot.colors.get("fill")
content=quote or "", or annot.colors.get("stroke")
note=note or "", or (0.0, 0.0, 0.0)
color=col, ),
type=annot.type[1], )
page=(page.number or 0) + 1, page_nr: int = cast(int, page.number or 0)
) highlight_type: str = cast(str, annot.type[1] or "")
annotations.append(a) a = Annotation(
logger.debug( file=str(filename),
f"Found {len(annotations)} " content=quote or "",
f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}." note=note or "",
) color=color,
type=highlight_type,
page=page_nr,
)
annotations.append(a)
logger.debug(
f"Found {len(annotations)} "
f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
)
except mu.FileDataError as e:
raise ExtractionError
return annotations return annotations
def _is_pdf(self, fname: Path) -> bool: def _is_pdf(self, fname: Path) -> bool:
@ -58,7 +71,7 @@ class PdfExtractor:
return magic.from_file(fname, mime=True) == "application/pdf" return magic.from_file(fname, mime=True) == "application/pdf"
def _retrieve_annotation_content( def _retrieve_annotation_content(
self, page: fitz.Page, annotation: fitz.Annot self, page: mu.Page, annotation: mu.Annot
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
"""Gets the text content of an annotation. """Gets the text content of an annotation.