2024-06-14 13:13:24 +00:00
|
|
|
# pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
|
2024-01-20 17:02:18 +00:00
|
|
|
from pathlib import Path
|
2024-06-14 13:13:24 +00:00
|
|
|
from typing import cast
|
2024-01-20 17:02:18 +00:00
|
|
|
|
|
|
|
import Levenshtein
|
|
|
|
import magic
|
|
|
|
import papis.config
|
|
|
|
import papis.logging
|
2024-06-14 13:13:24 +00:00
|
|
|
import pymupdf as mu
|
2024-01-20 17:02:18 +00:00
|
|
|
|
|
|
|
from papis_extract.annotation import Annotation
|
2024-06-14 12:59:39 +00:00
|
|
|
from papis_extract.extractors import ExtractionError
|
2024-01-20 17:02:18 +00:00
|
|
|
|
|
|
|
# Module-level logger for this extractor, obtained through papis' logging helper.
logger = papis.logging.get_logger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
class PdfExtractor:
    """Extractor that reads annotations out of PDF files via pymupdf."""

    def can_process(self, filename: Path) -> bool:
        """Report whether ``filename`` is an existing file with a PDF mime type.

        Logs an error and returns False when the path is not a regular file.
        Returns False without logging when the file exists but its mime type
        is not ``application/pdf``.
        """
        if not filename.is_file():
            logger.error(f"File {str(filename)} not readable.")
            return False
        return self._is_pdf(filename)

    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from a file.

        Opens ``filename`` with pymupdf and visits every annotation on every
        page. An annotation is kept when it yields a quote, a note, or both
        (see ``_retrieve_annotation_content``); annotations that yield neither
        are skipped. No filtering on the annotation type takes place here.

        Returns:
            One ``Annotation`` per kept annotation, in the order encountered.

        Raises:
            ExtractionError: if pymupdf raises ``FileDataError`` while the
                document is being opened or read. The original error is
                attached as the cause.
        """
        annotations: list[Annotation] = []
        try:
            with mu.Document(filename) as doc:
                for (
                    page
                ) in doc:  # pyright: ignore [reportUnknownVariableType] - missing stub
                    page = cast(mu.Page, page)
                    annot: mu.Annot
                    for annot in page.annots():
                        quote, note = self._retrieve_annotation_content(page, annot)
                        if not quote and not note:
                            continue

                        # Prefer the fill colour, then the stroke colour, and
                        # fall back to black when neither is set.
                        color: tuple[float, float, float] = cast(
                            tuple[float, float, float],
                            (
                                annot.colors.get("fill")
                                or annot.colors.get("stroke")
                                or (0.0, 0.0, 0.0)
                            ),
                        )
                        # Falsy page numbers / type names collapse to 0 / "".
                        page_nr: int = cast(int, page.number or 0)
                        highlight_type: str = cast(str, annot.type[1] or "")

                        annotations.append(
                            Annotation(
                                file=str(filename),
                                content=quote or "",
                                note=note or "",
                                color=color,
                                type=highlight_type,
                                page=page_nr,
                            )
                        )
        except mu.FileDataError as err:
            # Keep the pymupdf error as the explicit cause for debugging.
            raise ExtractionError from err

        logger.debug(
            f"Found {len(annotations)} "
            f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
        )
        return annotations

    def _is_pdf(self, fname: Path) -> bool:
        """Check if file is a pdf, using mime type."""
        return magic.from_file(fname, mime=True) == "application/pdf"

    def _retrieve_annotation_content(
        self, page: mu.Page, annotation: mu.Annot
    ) -> tuple[str | None, str | None]:
        """Gets the text content of an annotation.

        Compares the annotation's own ``content`` field with the page text
        found inside the annotation rectangle (newlines replaced by spaces in
        both) and decides which of the two to report, using the Levenshtein
        ratio against the ``minimum_similarity_content`` option of the
        ``plugins.extract`` config section. When that option is unset the
        threshold is 1.0; an explicitly configured value (including 0.0) is
        honoured as given.

        Returns:
            A ``(quote, note)`` tuple:

            * ``(content, None)`` if both texts are more similar than the
              threshold, i.e. the note merely repeats the selection;
            * ``(written, content)`` if both are non-empty and differ;
            * ``(None, content)`` if only the annotation content exists;
            * ``(written, None)`` if only the page text exists;
            * ``(None, None)`` if neither exists.
        """
        content = cast(str, annotation.info["content"].replace("\n", " "))
        written = page.get_textbox(annotation.rect).replace("\n", " ")

        configured = papis.config.getfloat(
            "minimum_similarity_content", "plugins.extract"
        )
        # Test for None explicitly: `or 1.0` would discard a configured 0.0.
        minimum_similarity = 1.0 if configured is None else configured

        # highlight with selection in note
        if Levenshtein.ratio(content, written) > minimum_similarity:
            return (content, None)
        # both a highlight and a note
        if content and written:
            return (written, content)
        # an independent note, not a highlight
        if content:
            return (None, content)
        # highlight with selection not in note
        if written:
            return (written, None)
        # just a highlight without any text
        return (None, None)
|