refactor: Remove pymupdf coupling in extraction
The library is only needed for PDF extraction, which is handled by its own extractor plugin. The overall extraction routine does not need to know that pymupdf exists.
This commit is contained in:
parent
7261e7d80c
commit
8093259551
3 changed files with 52 additions and 29 deletions
|
@ -2,13 +2,13 @@ import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Protocol
|
from typing import Protocol
|
||||||
|
|
||||||
import fitz
|
|
||||||
import papis.config
|
import papis.config
|
||||||
import papis.document
|
import papis.document
|
||||||
import papis.logging
|
import papis.logging
|
||||||
from papis.document import Document
|
from papis.document import Document
|
||||||
|
|
||||||
from papis_extract.annotation import Annotation
|
from papis_extract.annotation import Annotation
|
||||||
|
from papis_extract.extractors import ExtractionError
|
||||||
|
|
||||||
logger = papis.logging.get_logger(__name__)
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
@ -39,8 +39,8 @@ def start(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
annotations.extend(extractor.run(fname))
|
annotations.extend(extractor.run(fname))
|
||||||
except fitz.FileDataError as e:
|
except ExtractionError as e:
|
||||||
print(f"File structure errors for {file}.\n{e}")
|
print(f"File extraction errors for {file}.\n{e}")
|
||||||
|
|
||||||
if not file_available:
|
if not file_available:
|
||||||
# have to remove curlys or papis logger gets upset
|
# have to remove curlys or papis logger gets upset
|
||||||
|
|
|
@ -16,3 +16,13 @@ if find_spec("bs4") and find_spec("magic"):
|
||||||
all_extractors["pocketbook"] = PocketBookExtractor()
|
all_extractors["pocketbook"] = PocketBookExtractor()
|
||||||
else:
|
else:
|
||||||
logger.debug("pocketbook extractor not activated.")
|
logger.debug("pocketbook extractor not activated.")
|
||||||
|
|
||||||
|
|
||||||
|
class ExtractionError(Exception):
|
||||||
|
"""Raised for exceptions during extraction.
|
||||||
|
|
||||||
|
Something went wrong during the extraction process in the extractor
|
||||||
|
run routine itself.
|
||||||
|
"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
|
@ -7,6 +7,7 @@ import papis.config
|
||||||
import papis.logging
|
import papis.logging
|
||||||
|
|
||||||
from papis_extract.annotation import Annotation
|
from papis_extract.annotation import Annotation
|
||||||
|
from papis_extract.extractors import ExtractionError
|
||||||
|
|
||||||
logger = papis.logging.get_logger(__name__)
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
@ -26,31 +27,43 @@ class PdfExtractor:
|
||||||
Returns all readable annotations contained in the file
|
Returns all readable annotations contained in the file
|
||||||
passed in. Only returns Highlight or Text annotations.
|
passed in. Only returns Highlight or Text annotations.
|
||||||
"""
|
"""
|
||||||
annotations = []
|
annotations: list[Annotation] = []
|
||||||
with fitz.Document(filename) as doc:
|
try:
|
||||||
for page in doc:
|
with mu.Document(filename) as doc:
|
||||||
for annot in page.annots():
|
for page in doc: # pyright: ignore [reportUnknownVariableType] - missing stub
|
||||||
quote, note = self._retrieve_annotation_content(page, annot)
|
page = cast(mu.Page, page)
|
||||||
if not quote and not note:
|
annot: mu.Annot
|
||||||
continue
|
for annot in page.annots():
|
||||||
col = (
|
quote, note = self._retrieve_annotation_content(page, annot)
|
||||||
annot.colors.get("fill")
|
if not quote and not note:
|
||||||
or annot.colors.get("stroke")
|
continue
|
||||||
or (0.0, 0.0, 0.0)
|
color: tuple[float, float, float] = cast(
|
||||||
)
|
tuple[float, float, float],
|
||||||
a = Annotation(
|
(
|
||||||
file=str(filename),
|
annot.colors.get("fill")
|
||||||
content=quote or "",
|
or annot.colors.get("stroke")
|
||||||
note=note or "",
|
or (0.0, 0.0, 0.0)
|
||||||
color=col,
|
),
|
||||||
type=annot.type[1],
|
)
|
||||||
page=(page.number or 0) + 1,
|
page_nr: int = cast(int, page.number or 0)
|
||||||
)
|
highlight_type: str = cast(str, annot.type[1] or "")
|
||||||
annotations.append(a)
|
a = Annotation(
|
||||||
logger.debug(
|
file=str(filename),
|
||||||
f"Found {len(annotations)} "
|
content=quote or "",
|
||||||
f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
|
note=note or "",
|
||||||
)
|
color=color,
|
||||||
|
type=highlight_type,
|
||||||
|
page=page_nr,
|
||||||
|
)
|
||||||
|
annotations.append(a)
|
||||||
|
logger.debug(
|
||||||
|
f"Found {len(annotations)} "
|
||||||
|
f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
|
||||||
|
)
|
||||||
|
|
||||||
|
except mu.FileDataError as e:
|
||||||
|
raise ExtractionError
|
||||||
|
|
||||||
return annotations
|
return annotations
|
||||||
|
|
||||||
def _is_pdf(self, fname: Path) -> bool:
|
def _is_pdf(self, fname: Path) -> bool:
|
||||||
|
@ -58,7 +71,7 @@ class PdfExtractor:
|
||||||
return magic.from_file(fname, mime=True) == "application/pdf"
|
return magic.from_file(fname, mime=True) == "application/pdf"
|
||||||
|
|
||||||
def _retrieve_annotation_content(
|
def _retrieve_annotation_content(
|
||||||
self, page: fitz.Page, annotation: fitz.Annot
|
self, page: mu.Page, annotation: mu.Annot
|
||||||
) -> tuple[str | None, str | None]:
|
) -> tuple[str | None, str | None]:
|
||||||
"""Gets the text content of an annotation.
|
"""Gets the text content of an annotation.
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue