papis-extract/papis_extract/extractor.py
Marty Oehme 765de505bb
refactor: Remove AnnotatedDocument class
The AnnotatedDocument class was, essentially, a simple tuple of a document
and a list of annotations. While not bad in a vacuum, it is unwieldy and
passing this around instead of a document, annotations, or both where
necessary is more restrictive and frankly unnecessary.

This commit removes the data class and any instances of its use. Instead,
we now pass the individual components around to anything that needs them.
This also frees us up to pass only annotations around for example.

We also do not iterate through the selected papis documents to work on
in each exporter anymore (since we only pass a single document), but
in the main function itself. This leads to less duplication and makes
the overall run function the overall single source of iteration through
selected documents. Everything else only knows about a single document -
the one it is operating on - which seems much neater.

For now, it does not change much, but should make later work on extra
exporters or extractors easier.
2024-01-20 16:36:24 +01:00

165 lines
5 KiB
Python

import re
from pathlib import Path
from typing import Any, Optional
import Levenshtein
import magic
import fitz
import papis.logging
import papis.config
import papis.document
from papis.document import Document
from papis_extract.annotation import Annotation
logger = papis.logging.get_logger(__name__)
def start(
document: Document,
) -> list[Annotation]:
"""Extract all annotations from passed documents.
Returns all annotations contained in the papis
documents passed in.
"""
annotations: list[Annotation] = []
found_pdf: bool = False
for file in document.get_files():
fname = Path(file)
if not _is_file_processable(fname):
break
found_pdf = True
try:
annotations.extend(extract(fname))
except fitz.FileDataError as e:
print(f"File structure errors for {file}.\n{e}")
if not found_pdf:
# have to remove curlys or papis logger gets upset
desc = re.sub("[{}]", "", papis.document.describe(document))
logger.warning("Did not find suitable PDF file for document: " f"{desc}")
return annotations
def extract(filename: Path) -> list[Annotation]:
"""Extract annotations from a file.
Returns all readable annotations contained in the file
passed in. Only returns Highlight or Text annotations.
"""
annotations = []
with fitz.Document(filename) as doc:
for page in doc:
for annot in page.annots():
quote, note = _retrieve_annotation_content(page, annot)
if not quote and not note:
continue
col = (
annot.colors.get("fill")
or annot.colors.get("stroke")
or (0.0, 0.0, 0.0)
)
a = Annotation(
file=str(filename),
text=quote or "",
content=note or "",
colors=col,
type=annot.type[1],
page=(page.number or 0) + 1,
)
a.tag = _tag_from_colorname(a.colorname or "")
annotations.append(a)
logger.debug(
f"Found {len(annotations)} "
f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
)
return annotations
def is_pdf(fname: Path) -> bool:
"""Check if file is a pdf, using mime type."""
return magic.from_file(fname, mime=True) == "application/pdf"
def _is_file_processable(fname: Path) -> bool:
if not fname.is_file():
logger.error(f"File {str(fname)} not readable.")
return False
if not is_pdf(fname):
return False
return True
def _tag_from_colorname(colorname: str) -> str:
color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
if not color_mapping:
return ""
return color_mapping.get(colorname, "")
def _retrieve_annotation_content(
page: fitz.Page, annotation: fitz.Annot
) -> tuple[str | None, str | None]:
"""Gets the text content of an annotation.
Returns the actual content of an annotation. Sometimes
that is only the written words, sometimes that is only
annotation notes, sometimes it is both. Runs a similarity
comparison between strings to find out whether they
should both be included or are the same, using
Levenshtein distance.
"""
content = annotation.info["content"].replace("\n", " ")
written = page.get_textbox(annotation.rect).replace("\n", " ")
# highlight with selection in note
minimum_similarity = (
papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0
)
if Levenshtein.ratio(content, written) > minimum_similarity:
return (content, None)
# both a highlight and a note
elif content and written:
return (written, content)
# an independent note, not a highlight
elif content:
return (None, content)
# highlight with selection not in note
elif written:
return (written, None)
# just a highlight without any text
return (None, None)
# mimics the functions in papis.config.{getlist,getint,getfloat} etc.
def getdict(key: str, section: Optional[str] = None) -> dict[str, str]:
"""Dict getter
:returns: A python dict
:raises SyntaxError: Whenever the parsed syntax is either not a valid
python object or a valid python dict.
"""
rawvalue: Any = papis.config.general_get(key, section=section)
if isinstance(rawvalue, dict):
return rawvalue
try:
rawvalue = eval(rawvalue)
except Exception:
raise SyntaxError(
"The key '{}' must be a valid Python object: {}".format(key, rawvalue)
)
else:
if not isinstance(rawvalue, dict):
raise SyntaxError(
"The key '{}' must be a valid Python dict. Got: {} (type {!r})".format(
key, rawvalue, type(rawvalue).__name__
)
)
return rawvalue