2023-08-29 10:40:36 +00:00
|
|
|
import re
|
2023-08-28 08:28:06 +00:00
|
|
|
from pathlib import Path
|
2023-08-28 14:41:18 +00:00
|
|
|
from typing import Any, Optional
|
2023-08-28 08:28:06 +00:00
|
|
|
|
|
|
|
import Levenshtein
|
2023-08-29 10:40:36 +00:00
|
|
|
import magic
|
2023-08-28 08:28:06 +00:00
|
|
|
import fitz_new as fitz
|
2023-08-28 10:53:03 +00:00
|
|
|
import papis.logging
|
2023-08-28 08:28:06 +00:00
|
|
|
import papis.config
|
2023-08-29 10:40:36 +00:00
|
|
|
import papis.document
|
|
|
|
from papis.document import Document
|
2023-08-28 08:28:06 +00:00
|
|
|
|
2023-08-29 10:40:36 +00:00
|
|
|
from papis_extract.annotation_data import Annotation, AnnotatedDocument
|
2023-08-28 08:28:06 +00:00
|
|
|
|
2023-08-28 10:53:03 +00:00
|
|
|
logger = papis.logging.get_logger(__name__)
|
|
|
|
|
2023-08-29 20:23:52 +00:00
|
|
|
|
2023-08-29 10:40:36 +00:00
|
|
|
def start(
    documents: list[Document],
) -> list[AnnotatedDocument]:
    """Extract all annotations from passed documents.

    Looks at every file attached to each of the papis documents
    passed in and collects the annotations of those files which
    are processable PDFs. A file which can not be processed is
    skipped on its own, the remaining files of the document are
    still examined.

    :param documents: The papis documents to search for annotations.
    :returns: One ``AnnotatedDocument`` per passed document, in the
        same order, holding the annotations found in its files
        (an empty list if none were found).
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                # only skip this one file; a later attachment of the
                # same document may still be a valid PDF
                continue
            found_pdf = True

            try:
                annotations.extend(extract(fname))
            except fitz.FileDataError as e:
                # report through the module logger like every other problem
                # in this file; have to remove curlys or papis logger gets upset
                msg = re.sub("[{}]", "", f"File structure errors for {file}.\n{e}")
                logger.error(msg)

        if not found_pdf:
            # have to remove curlys or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
        output.append(AnnotatedDocument(doc, annotations))
    return output
|
|
|
|
|
2023-08-29 20:23:52 +00:00
|
|
|
|
2023-08-29 10:40:36 +00:00
|
|
|
def extract(filename: Path) -> list[Annotation]:
    """Collect the annotations of a single file.

    Opens the file as a fitz document and visits the annotations of
    every page. Each annotation for which a quote or a note could be
    retrieved is turned into an ``Annotation``; annotations yielding
    neither are left out.

    :param filename: Path of the file to read annotations from.
    :returns: The annotations found, in page order.
    """
    annotations = []
    with fitz.Document(filename) as pdf:
        for page in pdf:
            for annot in page.annots():
                quote, note = _retrieve_annotation_content(page, annot)
                if quote or note:
                    # prefer the fill color, then the stroke color, and
                    # fall back to black if the annotation carries neither
                    fill = annot.colors.get("fill")
                    if fill:
                        col = fill
                    else:
                        col = annot.colors.get("stroke") or (0.0, 0.0, 0.0)

                    # fitz page numbers are treated as 0-based here and
                    # shifted by one for the stored page
                    page_nr = (page.number or 0) + 1
                    a = Annotation(
                        file=str(filename),
                        text=quote or "",
                        content=note or "",
                        colors=col,
                        type=annot.type[1],
                        page=page_nr,
                    )
                    a.tag = _tag_from_colorname(a.colorname or "")
                    annotations.append(a)
        logger.debug(
            f"Found {len(annotations)} "
            f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
        )
    return annotations
|
|
|
|
|
|
|
|
|
2023-08-29 10:40:36 +00:00
|
|
|
def is_pdf(fname: Path) -> bool:
    """Check if file is a pdf, using mime type.

    :param fname: Path of the file to inspect.
    :returns: True if the mime type detected for the file is
        ``application/pdf``, False otherwise.
    """
    # hand over a plain string: older python-magic releases may only
    # accept str or bytes file names, not path-like objects
    return magic.from_file(str(fname), mime=True) == "application/pdf"
|
|
|
|
|
|
|
|
|
|
|
|
def _is_file_processable(fname: Path) -> bool:
    """Tell whether annotations can be extracted from a file.

    A file is processable if it exists as a regular file and is
    detected to be a PDF. A path which is not a file is additionally
    reported as an error in the log.

    :param fname: Path of the file to check.
    :returns: True if the file exists and is a PDF, False otherwise.
    """
    if fname.is_file():
        return is_pdf(fname)

    logger.error(f"File {str(fname)} not readable.")
    return False
|
|
|
|
|
2023-08-29 20:23:52 +00:00
|
|
|
|
2023-08-28 14:41:18 +00:00
|
|
|
def _tag_from_colorname(colorname: str) -> str:
    """Look up the tag configured for a color name.

    Reads the ``tags`` dict from the ``plugins.extract`` configuration
    section and returns the entry stored for the color name.

    :param colorname: Name of the color to find a tag for.
    :returns: The configured tag, or an empty string if the mapping
        is empty or has no entry for the color name.
    """
    tags_by_color: dict[str, str] = getdict("tags", "plugins.extract")
    return tags_by_color.get(colorname, "") if tags_by_color else ""
|
2023-08-28 08:28:06 +00:00
|
|
|
|
|
|
|
|
2023-08-28 14:41:18 +00:00
|
|
|
def _retrieve_annotation_content(
    page: fitz.Page, annotation: fitz.Annot
) -> tuple[str | None, str | None]:
    """Gets the text content of an annotation.

    Compares the note stored in the annotation with the text written
    on the page underneath the annotation's rectangle, using the
    Levenshtein ratio. If the two are more similar than the
    ``minimum_similarity_content`` option allows, they are treated as
    the same text and only the note is returned, in the quote position.
    Otherwise whichever of the two is non-empty is returned in its own
    position.

    :param page: The page the annotation is placed on.
    :param annotation: The annotation to read the content of.
    :returns: A ``(quote, note)`` tuple, either part being None if
        there is no text for it.
    """
    # line breaks are flattened to spaces in both texts
    note_text = annotation.info["content"].replace("\n", " ")
    page_text = page.get_textbox(annotation.rect).replace("\n", " ")

    threshold = (
        papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0
    )

    # highlight with selection in note: both texts count as the same
    if Levenshtein.ratio(note_text, page_text) > threshold:
        return (note_text, None)

    # covers the remaining combinations at once: highlight plus note,
    # independent note, highlight only, or no text at all
    return (page_text or None, note_text or None)
|
|
|
|
|
2023-08-29 10:15:10 +00:00
|
|
|
|
2023-08-28 14:41:18 +00:00
|
|
|
# mimics the functions in papis.config.{getlist,getint,getfloat} etc.
|
|
|
|
def getdict(key: str, section: Optional[str] = None) -> dict[str, str]:
    """Dict getter

    Reads the raw value of the option through
    ``papis.config.general_get``. A value which already is a dict is
    returned unchanged, any other value is evaluated as a Python
    expression whose result has to be a dict.

    :param key: Name of the configuration option to read.
    :param section: Configuration section to read the option from.
    :returns: A python dict
    :raises SyntaxError: Whenever the parsed syntax is either not a valid
        python object or a valid python dict.
    """
    rawvalue: Any = papis.config.general_get(key, section=section)
    if isinstance(rawvalue, dict):
        return rawvalue

    try:
        # NOTE(review): eval runs whatever expression the configuration
        # value contains; only safe as long as the configuration is trusted.
        # Kept as is so existing configuration values keep working.
        parsed = eval(rawvalue)
    except Exception as err:
        # chain the cause so the actual evaluation error stays visible
        raise SyntaxError(
            "The key '{}' must be a valid Python object: {}".format(key, rawvalue)
        ) from err

    if not isinstance(parsed, dict):
        raise SyntaxError(
            "The key '{}' must be a valid Python dict. Got: {} (type {!r})".format(
                key, parsed, type(parsed).__name__
            )
        )

    return parsed
|