refactor: Remove pymupdf coupling in extraction

The library is only needed for PDF extraction, which is handled by its
own extractor plugin. The overall extraction routine does not need to
know that pymupdf exists.
This commit is contained in:
Marty Oehme 2024-06-14 14:59:39 +02:00
parent 7261e7d80c
commit 8093259551
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
3 changed files with 52 additions and 29 deletions

View file

@ -2,13 +2,13 @@ import re
from pathlib import Path
from typing import Protocol
import fitz
import papis.config
import papis.document
import papis.logging
from papis.document import Document
from papis_extract.annotation import Annotation
from papis_extract.extractors import ExtractionError
logger = papis.logging.get_logger(__name__)
@ -39,8 +39,8 @@ def start(
try:
annotations.extend(extractor.run(fname))
except fitz.FileDataError as e:
print(f"File structure errors for {file}.\n{e}")
except ExtractionError as e:
print(f"File extraction errors for {file}.\n{e}")
if not file_available:
# have to remove curlys or papis logger gets upset

View file

@ -16,3 +16,13 @@ if find_spec("bs4") and find_spec("magic"):
all_extractors["pocketbook"] = PocketBookExtractor()
else:
logger.debug("pocketbook extractor not activated.")
class ExtractionError(Exception):
    """Signal a failure that occurred while extracting annotations.

    Used when something goes wrong inside an extractor's own run
    routine, i.e. during the extraction process itself.
    """

View file

@ -7,6 +7,7 @@ import papis.config
import papis.logging
from papis_extract.annotation import Annotation
from papis_extract.extractors import ExtractionError
logger = papis.logging.get_logger(__name__)
@ -26,31 +27,43 @@ class PdfExtractor:
Returns all readable annotations contained in the file
passed in. Only returns Highlight or Text annotations.
"""
annotations = []
with fitz.Document(filename) as doc:
for page in doc:
annotations: list[Annotation] = []
try:
with mu.Document(filename) as doc:
for page in doc: # pyright: ignore [reportUnknownVariableType] - missing stub
page = cast(mu.Page, page)
annot: mu.Annot
for annot in page.annots():
quote, note = self._retrieve_annotation_content(page, annot)
if not quote and not note:
continue
col = (
color: tuple[float, float, float] = cast(
tuple[float, float, float],
(
annot.colors.get("fill")
or annot.colors.get("stroke")
or (0.0, 0.0, 0.0)
),
)
page_nr: int = cast(int, page.number or 0)
highlight_type: str = cast(str, annot.type[1] or "")
a = Annotation(
file=str(filename),
content=quote or "",
note=note or "",
color=col,
type=annot.type[1],
page=(page.number or 0) + 1,
color=color,
type=highlight_type,
page=page_nr,
)
annotations.append(a)
logger.debug(
f"Found {len(annotations)} "
f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
)
except mu.FileDataError as e:
raise ExtractionError
return annotations
def _is_pdf(self, fname: Path) -> bool:
@ -58,7 +71,7 @@ class PdfExtractor:
return magic.from_file(fname, mime=True) == "application/pdf"
def _retrieve_annotation_content(
self, page: fitz.Page, annotation: fitz.Annot
self, page: mu.Page, annotation: mu.Annot
) -> tuple[str | None, str | None]:
"""Gets the text content of an annotation.