papis-extract/papis_extract/extractors/pdf.py

116 lines
4 KiB
Python

# pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
from collections.abc import Generator
from pathlib import Path
from typing import NamedTuple, cast
import Levenshtein
import magic
import papis.config
import papis.logging
import pymupdf as mu
from papis_extract.annotation import Annotation
from papis_extract.exceptions import ExtractionError
# Module-level logger, obtained from papis' logging helper under this module's name.
logger = papis.logging.get_logger(__name__)
class PdfAnnot(NamedTuple):
    """Pairing of a PDF annotation with the page it was read from."""

    # Page object the annotation was obtained from.
    page: mu.Page
    # The annotation itself.
    annot: mu.Annot
class PdfExtractor:
    """Extract annotations from PDF files using PyMuPDF."""

    def can_process(self, filename: Path) -> bool:
        """Return whether ``filename`` is an existing file detected as a PDF.

        Logs an error and returns ``False`` when the path is not a regular
        file; otherwise the decision is delegated to the mime-type check.
        """
        if not filename.is_file():
            logger.error(f"File {filename} not readable.")
            return False
        return self._is_pdf(filename)

    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from a file.

        Returns one ``Annotation`` for every PDF annotation that yields a
        quote and/or a note; annotations that yield neither are skipped.
        No filtering by annotation type is applied here.

        Raises:
            ExtractionError: If PyMuPDF raises ``FileDataError`` while the
                document is being read.
        """
        annotations: list[Annotation] = []
        try:
            for page, annot in self._all_pdf_annots(filename):
                quote, note = self._get_annotation_content(page, annot)
                if not quote and not note:
                    continue
                color = self._get_correct_color(annot)
                page_nr: int = cast("int", page.number or 0)
                highlight_type: str = cast("str", annot.type[1] or "")
                annotations.append(
                    Annotation(
                        file=str(filename),
                        content=quote or "",
                        note=note or "",
                        color=color,
                        type=highlight_type,
                        page=page_nr,
                    )
                )
        except mu.FileDataError as err:
            # Chain explicitly so the PyMuPDF failure stays attached as the cause.
            raise ExtractionError from err

        logger.debug(
            f"Found {len(annotations)} "
            f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
        )
        return annotations

    def _all_pdf_annots(self, filename: Path) -> Generator[PdfAnnot]:
        """Yield every annotation of the document together with its page.

        The document is opened in a ``with`` block around the iteration, so
        it remains open for as long as the generator is being consumed.
        """
        with mu.Document(filename) as doc:
            for page in doc:
                annot: mu.Annot
                for annot in page.annots():
                    yield PdfAnnot(page, annot)

    def _is_pdf(self, fname: Path) -> bool:
        """Check if file is a pdf, using mime type."""
        return magic.from_file(fname, mime=True) == "application/pdf"

    def _get_annotation_content(
        self, page: mu.Page, annotation: mu.Annot
    ) -> tuple[str | None, str | None]:
        """Get the text content of an annotation as a ``(quote, note)`` pair.

        Two texts are considered: the annotation's own ``content`` entry and
        the page text inside the annotation rectangle. Newlines in both are
        replaced by spaces. If the two are more similar (Levenshtein ratio)
        than the configured ``minimum_similarity_content`` they are treated
        as the same text and only returned once, as the quote. Otherwise
        whichever of them is non-empty is returned in its slot, and
        ``(None, None)`` if both are empty.
        """
        content = cast("str", annotation.info["content"].replace("\n", " "))
        written = page.get_textbox(annotation.rect).replace("\n", " ")

        # Fall back to 1.0 only when nothing is configured; an explicit 0.0
        # must be honoured rather than silently replaced.
        configured = papis.config.getfloat(
            "minimum_similarity_content", "plugins.extract"
        )
        minimum_similarity = 1.0 if configured is None else configured

        # highlight with selection in note
        # (strict comparison: a threshold of 1.0 can never be exceeded,
        # assuming the ratio is normalised to at most 1.0)
        if Levenshtein.ratio(content, written) > minimum_similarity:
            return (content, None)
        # both a highlight and a note
        if content and written:
            return (written, content)
        # an independent note, not a highlight
        if content:
            return (None, content)
        # highlight with selection not in note
        if written:
            return (written, None)
        # just a highlight without any text
        return (None, None)

    def _get_correct_color(self, annot: mu.Annot) -> tuple[float, float, float]:
        """Return the annotation's color.

        Uses the ``fill`` entry of ``annot.colors`` when it is truthy, then
        the ``stroke`` entry, and finally ``(0.0, 0.0, 0.0)``.
        """
        color: tuple[float, float, float] = cast(
            "tuple[float, float, float]",
            (annot.colors.get("fill") or annot.colors.get("stroke") or (0.0, 0.0, 0.0)),
        )
        return color