Marty Oehme
765de505bb
The AnnotatedDocument class was, essentially, a simple tuple of a document and a list of annotations. While not bad in a vacuum, it is unwieldy, and passing it around instead of a document, its annotations, or both where necessary is more restrictive and frankly unnecessary. This commit removes the data class and any instances of its use. Instead, we now pass the individual components around to anything that needs them. This also frees us up to pass only annotations around, for example. We also no longer iterate through the selected papis documents in each exporter (since we only pass a single document), but in the main function itself. This leads to less duplication and makes the run function the single source of iteration through the selected documents. Everything else only knows about a single document - the one it is operating on - which seems much neater. For now, it does not change much, but it should make later work on extra exporters or extractors easier.
165 lines
5 KiB
Python
165 lines
5 KiB
Python
import re
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import Levenshtein
|
|
import magic
|
|
import fitz
|
|
import papis.logging
|
|
import papis.config
|
|
import papis.document
|
|
from papis.document import Document
|
|
|
|
from papis_extract.annotation import Annotation
|
|
|
|
logger = papis.logging.get_logger(__name__)
|
|
|
|
|
|
def start(
    document: Document,
) -> list[Annotation]:
    """Extract all annotations from the files of a single papis document.

    Walks over every file attached to the document, skips those that are
    not processable and gathers the annotations of all remaining files.
    A warning is logged if not a single processable file was found.

    :param document: The papis document whose files are searched.
    :returns: The annotations found across all processable files of the
        document; empty if there were none.
    """
    annotations: list[Annotation] = []
    found_pdf: bool = False
    for file in document.get_files():
        fname = Path(file)
        if not _is_file_processable(fname):
            # Skip only this file. A document may list other attachments
            # before its PDF, so stopping here would hide later PDFs.
            continue
        found_pdf = True

        try:
            annotations.extend(extract(fname))
        except fitz.FileDataError as e:
            # A broken file should not abort extraction of the other files.
            print(f"File structure errors for {file}.\n{e}")

    if not found_pdf:
        # have to remove curlys or papis logger gets upset
        desc = re.sub("[{}]", "", papis.document.describe(document))
        logger.warning("Did not find suitable PDF file for document: " f"{desc}")

    return annotations
|
|
|
|
|
|
def extract(filename: Path) -> list[Annotation]:
    """Collect the annotations stored in a single file.

    Opens the file with fitz and visits the annotations of every page.
    Annotations which provide neither quoted text nor a note are left out.
    Each kept annotation receives the tag configured for its color name.
    """
    found: list[Annotation] = []
    with fitz.Document(filename) as pdf:
        for page in pdf:
            for annot in page.annots():
                quote, note = _retrieve_annotation_content(page, annot)
                if quote or note:
                    # prefer the fill color, then the stroke color, then black
                    color = annot.colors.get("fill")
                    if not color:
                        color = annot.colors.get("stroke")
                    if not color:
                        color = (0.0, 0.0, 0.0)

                    entry = Annotation(
                        file=str(filename),
                        text=quote or "",
                        content=note or "",
                        colors=color,
                        type=annot.type[1],
                        # page numbers are reported starting from 1
                        page=(page.number or 0) + 1,
                    )
                    entry.tag = _tag_from_colorname(entry.colorname or "")
                    found.append(entry)

    count = len(found)
    noun = "annotation" if count == 1 else "annotations"
    logger.debug(f"Found {count} " f"{noun} for {filename}.")
    return found
|
|
|
|
|
|
def is_pdf(fname: Path) -> bool:
    """Tell whether the mime type detected for the file is that of a PDF."""
    mime_type = magic.from_file(fname, mime=True)
    return mime_type == "application/pdf"
|
|
|
|
|
|
def _is_file_processable(fname: Path) -> bool:
    """Decide whether a file can be handed to the extractor.

    A path that is not an existing regular file is reported through the
    logger and rejected. Any other file is accepted only if it is a PDF.
    """
    if fname.is_file():
        return is_pdf(fname)

    logger.error(f"File {str(fname)} not readable.")
    return False
|
|
|
|
|
|
def _tag_from_colorname(colorname: str) -> str:
    """Look up the tag configured for a color name.

    Reads the ``tags`` mapping of the ``plugins.extract`` section and
    returns its entry for ``colorname``. The result is an empty string when
    the mapping is empty or holds no entry for that color.
    """
    tags_by_color: dict[str, str] = getdict("tags", "plugins.extract")
    return tags_by_color.get(colorname, "") if tags_by_color else ""
|
|
|
|
|
|
def _retrieve_annotation_content(
    page: fitz.Page, annotation: fitz.Annot
) -> tuple[str | None, str | None]:
    """Work out the quoted text and the note belonging to an annotation.

    Two strings are compared: the annotation's own ``content`` entry and
    the page text inside the annotation's rectangle, both with newlines
    turned into spaces. If their Levenshtein ratio exceeds the configured
    ``minimum_similarity_content`` they are treated as the same text and
    only the annotation content is returned, as the quote.

    Returns a ``(quote, note)`` tuple in which a missing part is ``None``.
    """
    note_text = annotation.info["content"].replace("\n", " ")
    page_text = page.get_textbox(annotation.rect).replace("\n", " ")

    # falls back to 1.0 if the option is unset (or set to zero)
    threshold = (
        papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0
    )

    # the note merely repeats the highlighted selection
    if Levenshtein.ratio(note_text, page_text) > threshold:
        return (note_text, None)

    # Covers the remaining cases at once: highlight plus note, a lone note,
    # a lone highlight, or nothing at all (empty strings become None).
    return (page_text or None, note_text or None)
|
|
|
|
|
|
# mimics the functions in papis.config.{getlist,getint,getfloat} etc.
|
|
def getdict(key: str, section: Optional[str] = None) -> dict[str, str]:
    """Dict getter

    Fetches the option ``key`` through ``papis.config.general_get``. A value
    which already is a dict is returned unchanged; anything else is
    evaluated as a Python expression that has to produce a dict.

    :param key: Name of the configuration option to read.
    :param section: Configuration section handed on to
        ``papis.config.general_get``.
    :returns: A python dict
    :raises SyntaxError: Whenever the parsed syntax is either not a valid
        python object or a valid python dict.
    """
    rawvalue: Any = papis.config.general_get(key, section=section)
    if isinstance(rawvalue, dict):
        return rawvalue

    try:
        # NOTE(review): eval() runs arbitrary code found in the configuration
        # value. Left unchanged here to keep accepting every expression that
        # worked before; consider ast.literal_eval if only literals are needed.
        parsed = eval(rawvalue)
    except Exception as err:
        # Chain the original error so the real cause stays in the traceback.
        raise SyntaxError(
            "The key '{}' must be a valid Python object: {}".format(key, rawvalue)
        ) from err

    if not isinstance(parsed, dict):
        raise SyntaxError(
            "The key '{}' must be a valid Python dict. Got: {} (type {!r})".format(
                key, parsed, type(parsed).__name__
            )
        )

    return parsed
|