refactor: Use generator for PDF extractor
This commit is contained in:
parent
ff36d30f91
commit
e46219151b
1 changed files with 40 additions and 28 deletions
|
|
@ -1,6 +1,7 @@
|
||||||
# pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
|
# pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
|
||||||
|
from collections.abc import Generator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import cast
|
from typing import NamedTuple, cast
|
||||||
|
|
||||||
import Levenshtein
|
import Levenshtein
|
||||||
import magic
|
import magic
|
||||||
|
|
@ -14,6 +15,11 @@ from papis_extract.exceptions import ExtractionError
|
||||||
logger = papis.logging.get_logger(__name__)
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PdfAnnot(NamedTuple):
    """A single PDF annotation paired with the page it belongs to."""

    # Page object the annotation is attached to.
    page: mu.Page
    # The annotation object itself.
    annot: mu.Annot
|
||||||
|
|
||||||
|
|
||||||
class PdfExtractor:
|
class PdfExtractor:
|
||||||
def can_process(self, filename: Path) -> bool:
|
def can_process(self, filename: Path) -> bool:
|
||||||
if not filename.is_file():
|
if not filename.is_file():
|
||||||
|
|
@ -29,23 +35,15 @@ class PdfExtractor:
|
||||||
"""
|
"""
|
||||||
annotations: list[Annotation] = []
|
annotations: list[Annotation] = []
|
||||||
try:
|
try:
|
||||||
with mu.Document(filename) as doc:
|
for page, annot in self._all_pdf_annots(filename):
|
||||||
for page in doc: # pyright: ignore [reportUnknownVariableType] - missing stub
|
quote, note = self._get_annotation_content(page, annot)
|
||||||
annot: mu.Annot
|
|
||||||
for annot in page.annots():
|
|
||||||
quote, note = self._retrieve_annotation_content(page, annot)
|
|
||||||
if not quote and not note:
|
if not quote and not note:
|
||||||
continue
|
continue
|
||||||
color: tuple[float, float, float] = cast(
|
|
||||||
"tuple[float, float, float]",
|
color = self._get_correct_color(annot)
|
||||||
(
|
|
||||||
annot.colors.get("fill")
|
|
||||||
or annot.colors.get("stroke")
|
|
||||||
or (0.0, 0.0, 0.0)
|
|
||||||
),
|
|
||||||
)
|
|
||||||
page_nr: int = cast("int", page.number or 0)
|
page_nr: int = cast("int", page.number or 0)
|
||||||
highlight_type: str = cast("str", annot.type[1] or "")
|
highlight_type: str = cast("str", annot.type[1] or "")
|
||||||
|
|
||||||
a = Annotation(
|
a = Annotation(
|
||||||
file=str(filename),
|
file=str(filename),
|
||||||
content=quote or "",
|
content=quote or "",
|
||||||
|
|
@ -65,11 +63,18 @@ class PdfExtractor:
|
||||||
|
|
||||||
return annotations
|
return annotations
|
||||||
|
|
||||||
|
def _all_pdf_annots(self, filename: Path) -> Generator[PdfAnnot]:
    """Lazily yield every annotation of a PDF together with its page.

    The document is opened once iteration starts and is kept open by the
    surrounding ``with`` block while annotations are being yielded.

    Args:
        filename: Path of the document handed to ``mu.Document``.

    Yields:
        One ``PdfAnnot`` per annotation, visiting pages in document order.
    """
    with mu.Document(filename) as document:
        for current_page in document:
            # Declared up front so the loop variable has a known type.
            current_annot: mu.Annot
            for current_annot in current_page.annots():
                yield PdfAnnot(page=current_page, annot=current_annot)
|
||||||
|
|
||||||
def _is_pdf(self, fname: Path) -> bool:
    """Tell whether the file's detected mime type is that of a PDF."""
    detected_mime = magic.from_file(fname, mime=True)
    return detected_mime == "application/pdf"
|
||||||
|
|
||||||
def _retrieve_annotation_content(
|
def _get_annotation_content(
|
||||||
self, page: mu.Page, annotation: mu.Annot
|
self, page: mu.Page, annotation: mu.Annot
|
||||||
) -> tuple[str | None, str | None]:
|
) -> tuple[str | None, str | None]:
|
||||||
"""Gets the text content of an annotation.
|
"""Gets the text content of an annotation.
|
||||||
|
|
@ -102,3 +107,10 @@ class PdfExtractor:
|
||||||
return (written, None)
|
return (written, None)
|
||||||
# just a highlight without any text
|
# just a highlight without any text
|
||||||
return (None, None)
|
return (None, None)
|
||||||
|
|
||||||
|
def _get_correct_color(self, annot: mu.Annot) -> tuple[float, float, float]:
    """Pick the color that represents an annotation.

    The ``"fill"`` entry of ``annot.colors`` is preferred, then the
    ``"stroke"`` entry; black ``(0.0, 0.0, 0.0)`` is used when both are
    missing or empty.

    Args:
        annot: The annotation whose ``colors`` mapping is inspected.

    Returns:
        The first truthy color found, or black as the fallback.
    """
    # NOTE(review): ``cast`` only informs the type checker; nothing here
    # verifies that the stored value really has three components. Confirm
    # against the PyMuPDF ``Annot.colors`` documentation.
    return cast(
        "tuple[float, float, float]",
        annot.colors.get("fill") or annot.colors.get("stroke") or (0.0, 0.0, 0.0),
    )
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue