refactor: Use generator for PDF extractor
This commit is contained in:
parent
ff36d30f91
commit
e46219151b
1 changed files with 40 additions and 28 deletions
|
|
@ -1,6 +1,7 @@
|
|||
# pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
from typing import NamedTuple, cast
|
||||
|
||||
import Levenshtein
|
||||
import magic
|
||||
|
|
@ -14,6 +15,11 @@ from papis_extract.exceptions import ExtractionError
|
|||
logger = papis.logging.get_logger(__name__)
|
||||
|
||||
|
||||
class PdfAnnot(NamedTuple):
    """A single PDF annotation paired with the page it was read from."""

    # Page on which the annotation was found.
    page: mu.Page
    # The annotation object itself.
    annot: mu.Annot
|
||||
|
||||
|
||||
class PdfExtractor:
|
||||
def can_process(self, filename: Path) -> bool:
|
||||
if not filename.is_file():
|
||||
|
|
@ -29,32 +35,24 @@ class PdfExtractor:
|
|||
"""
|
||||
annotations: list[Annotation] = []
|
||||
try:
|
||||
with mu.Document(filename) as doc:
|
||||
for page in doc: # pyright: ignore [reportUnknownVariableType] - missing stub
|
||||
annot: mu.Annot
|
||||
for annot in page.annots():
|
||||
quote, note = self._retrieve_annotation_content(page, annot)
|
||||
if not quote and not note:
|
||||
continue
|
||||
color: tuple[float, float, float] = cast(
|
||||
"tuple[float, float, float]",
|
||||
(
|
||||
annot.colors.get("fill")
|
||||
or annot.colors.get("stroke")
|
||||
or (0.0, 0.0, 0.0)
|
||||
),
|
||||
)
|
||||
page_nr: int = cast("int", page.number or 0)
|
||||
highlight_type: str = cast("str", annot.type[1] or "")
|
||||
a = Annotation(
|
||||
file=str(filename),
|
||||
content=quote or "",
|
||||
note=note or "",
|
||||
color=color,
|
||||
type=highlight_type,
|
||||
page=page_nr,
|
||||
)
|
||||
annotations.append(a)
|
||||
for page, annot in self._all_pdf_annots(filename):
|
||||
quote, note = self._get_annotation_content(page, annot)
|
||||
if not quote and not note:
|
||||
continue
|
||||
|
||||
color = self._get_correct_color(annot)
|
||||
page_nr: int = cast("int", page.number or 0)
|
||||
highlight_type: str = cast("str", annot.type[1] or "")
|
||||
|
||||
a = Annotation(
|
||||
file=str(filename),
|
||||
content=quote or "",
|
||||
note=note or "",
|
||||
color=color,
|
||||
type=highlight_type,
|
||||
page=page_nr,
|
||||
)
|
||||
annotations.append(a)
|
||||
logger.debug(
|
||||
f"Found {len(annotations)} "
|
||||
f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
|
||||
|
|
@ -65,11 +63,18 @@ class PdfExtractor:
|
|||
|
||||
return annotations
|
||||
|
||||
def _all_pdf_annots(self, filename: Path) -> Generator[PdfAnnot]:
    """Lazily yield every annotation of a PDF together with its page.

    The document at `filename` is opened once iteration starts and stays
    open (inside the `with` block) while the generator is being consumed.
    Pages are visited in the order the document iterates them, and each
    annotation is yielded as a `PdfAnnot` alongside the page it came from.
    """
    with mu.Document(filename) as pdf:
        for pdf_page in pdf:
            # Declared up front so the loop variable carries a type.
            found: mu.Annot
            for found in pdf_page.annots():
                yield PdfAnnot(page=pdf_page, annot=found)
|
||||
|
||||
def _is_pdf(self, fname: Path) -> bool:
    """Tell whether the file's detected MIME type is that of a PDF.

    The MIME type is obtained from the `magic` module and compared
    against "application/pdf".
    """
    mime_type = magic.from_file(fname, mime=True)
    return mime_type == "application/pdf"
|
||||
|
||||
def _retrieve_annotation_content(
|
||||
def _get_annotation_content(
|
||||
self, page: mu.Page, annotation: mu.Annot
|
||||
) -> tuple[str | None, str | None]:
|
||||
"""Gets the text content of an annotation.
|
||||
|
|
@ -102,3 +107,10 @@ class PdfExtractor:
|
|||
return (written, None)
|
||||
# just a highlight without any text
|
||||
return (None, None)
|
||||
|
||||
def _get_correct_color(self, annot: mu.Annot):
|
||||
color: tuple[float, float, float] = cast(
|
||||
"tuple[float, float, float]",
|
||||
(annot.colors.get("fill") or annot.colors.get("stroke") or (0.0, 0.0, 0.0)),
|
||||
)
|
||||
return color
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue