initial commit

This commit is contained in:
Marty Oehme 2023-08-28 10:28:06 +02:00
commit a22cc635b2
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
9 changed files with 1596 additions and 0 deletions

125
papis_extract/__init__.py Normal file
View file

@ -0,0 +1,125 @@
from pathlib import Path
import click
import fitz_new as fitz
import magic
import papis.cli
import papis.config
import papis.document
from papis.document import Document
import papis.logging
import papis.notes
import papis.strings
from papis_extract import extractor, exporter
from papis_extract.annotation_data import Annotation, AnnotatedDocument
# Module-level logger obtained through papis' logging helper.
logger = papis.logging.get_logger(__name__)
# Default values for the "plugins.extract" configuration section. They are
# registered with papis right below so that lookups in that section have a
# fallback value.
DEFAULT_OPTIONS = {
    "plugins.extract": {
        "tags": {"important": "red", "toread": "blue"},
        "on_import": False,
        "minimum_similarity": 0.75, # for checking against existing annotations
        "minimum_similarity_content": 0.9, # for checking if highlight or note
        "minimum_similarity_color": 0.833 # for matching tag to color
    }
}
# Import-time side effect: make the defaults above known to papis.
papis.config.register_default_settings(DEFAULT_OPTIONS)
@click.command("extract")
@click.help_option("-h", "--help")
@papis.cli.query_argument()
@papis.cli.doc_folder_option()
@papis.cli.git_option(help="Add changes made to the notes files")
@papis.cli.all_option()
@click.option(
    "--manual/--no-manual",
    "-m",
    help="Open each note in editor for manual editing after extracting its annotations",
)
@click.option(
    "--write/--no-write",
    "-w",
    help="Write annotations to notes instead of only printing results to stdout",
)
def main(
    query: str,
    _all: bool,
    doc_folder: str,
    manual: bool,
    write: bool,
    git: bool,
) -> None:
    """Extract annotations from any pdf document

    The extract plugin allows manual or automatic extraction of all annotations
    contained in the pdf documents belonging to entries of the papis library.
    It can write those changes to stdout or directly create and update notes
    for papis documents.

    It adds a `papis extract` subcommand through which it is invoked, but can
    optionally run whenever a new document is imported for a papis entry.
    """
    documents = papis.cli.handle_doc_folder_query_all_sort(
        query, doc_folder, sort_field=None, sort_reverse=False, _all=_all
    )
    if not documents:
        logger.warning(papis.strings.no_documents_retrieved_message)
        return

    doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)

    # --write persists the annotations into the documents' notes; without it
    # the results are only printed.
    if write:
        exporter.to_notes(doc_annotations, edit=manual, git=git)
    else:
        exporter.to_stdout(doc_annotations)
def is_pdf(fname: Path) -> bool:
    """Tell whether the file at ``fname`` is detected as a PDF.

    The decision is based on the MIME type reported by ``magic.from_file``.
    """
    detected_mime = magic.from_file(fname, mime=True)
    return detected_mime == "application/pdf"
def _get_annotations_for_documents(
    documents: list[Document],
) -> list[AnnotatedDocument]:
    """Collect the annotations of every processable file of each document.

    For each document all attached files are inspected. Files which are
    missing or not PDFs are skipped, the remaining ones are handed to
    ``extractor.start``. A warning is logged for documents without any
    processable file.

    Returns one ``AnnotatedDocument`` per input document, in input order;
    its annotation list is empty if nothing could be extracted.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                # Only skip this file: a later file of the same document
                # may still be a readable PDF.
                continue
            found_pdf = True
            try:
                annotations.extend(extractor.start(fname))
            except fitz.FileDataError as e:
                # Report through the logger so the message does not get
                # mixed into annotation output printed on stdout.
                logger.error(f"File structure errors for {file}.\n{e}")
        if not found_pdf:
            logger.warning(
                "Did not find suitable PDF file for document: "
                f"{papis.document.describe(doc)}"
            )
        output.append(AnnotatedDocument(doc, annotations))
    return output
def _is_file_processable(fname: Path) -> bool:
    """Check that ``fname`` is an existing regular file and a PDF.

    Logs an error when the path is not a file; a non-PDF file is rejected
    silently.
    """
    if fname.is_file():
        return is_pdf(fname)
    logger.error(f"File {str(fname)} not readable.")
    return False

View file

@ -0,0 +1,98 @@
import re
import math
from dataclasses import dataclass, field
import papis.config
from papis.document import Document
# Similarity thresholds. NOTE(review): neither constant is referenced in the
# visible code, which reads its thresholds from the papis configuration
# instead - confirm whether they are still needed.
TEXT_SIMILARITY_MINIMUM = 0.75
COLOR_SIMILARITY_MINIMUM = 0.833
# Named reference colors as RGB tuples with components between 0 and 1.
# Annotation.colorname compares an annotation's color against each entry.
COLORS = {
    "red": (1, 0, 0),
    "green": (0, 1, 0),
    "blue": (0, 0, 1),
    "yellow": (1, 1, 0),
    "purple": (0.5, 0, 0.5),
    "orange": (1, 0.65, 0),
}
@dataclass
class Annotation:
    """A PDF annotation object"""

    # Path of the file the annotation belongs to.
    file: str
    # Annotation type name.
    type: str = "Highlight"
    # Quoted text; substituted for the {quote} marker when formatting.
    text: str = ""
    # Note text; substituted for the {note} marker when formatting.
    content: str = ""
    # Page number; substituted for the {page} marker when formatting.
    page: int = 1
    # Color vectors keyed by "stroke" and/or "fill".
    colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
    # Tag; substituted for the {tag} marker when formatting.
    tag: str = ""

    def format(self, formatting: str) -> str:
        """Return a formatted string of the annotation.

        Given a provided formatting pattern, this method returns the annotation
        formatted with the correct marker replacements and removals, ready
        for display or writing.

        The markers ``{quote}``, ``{note}``, ``{page}``, ``{tag}`` and
        ``{newline}`` are replaced by the respective value. A container such
        as ``{%quote_container ... %}`` is unwrapped if the annotation has
        the respective value (quote, note or tag) and removed together with
        its contents otherwise.
        """
        output = formatting
        replacements = {
            r"{quote}": self.text,
            r"{note}": self.content,
            r"{page}": str(self.page),
            r"{newline}": "\n",
            r"{tag}": self.tag,
        }
        # Longest markers first so that no marker can shadow a longer one.
        pattern = re.compile(
            "|".join(
                [re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
            ),
            flags=re.DOTALL,
        )
        patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
        patt_note_container = re.compile(r"{%note_container(.*?)%}")
        patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
        # Resolve the containers first; the markers inside surviving
        # containers are then filled in by the single pass below.
        output = patt_quote_container.sub(r"\1" if self.text else "", output)
        output = patt_note_container.sub(r"\1" if self.content else "", output)
        output = patt_tag_container.sub(r"\1" if self.tag else "", output)
        # A function replacement inserts the values verbatim, so backslashes
        # in annotation text are not interpreted as regex escapes.
        return pattern.sub(lambda x: replacements[x.group(0)], output)

    @property
    def colorname(self):
        """Return the stringified version of the annotation color.

        Finds the closest named color to the annotation and returns it,
        using euclidian distance between the two color vectors.

        Returns the name of the most similar entry of ``COLORS`` or ``None``
        if no entry is more similar than the ``minimum_similarity_color``
        setting. The stroke color is preferred over the fill color, and
        black is assumed if neither is set.
        """
        annot_colors = (
            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
        )
        nearest = None
        # With the 1.0 fallback no color can qualify, as the comparison
        # below is strict.
        minimum_similarity = (
            papis.config.getfloat("minimum_similarity_color", "plugins.extract")
            or 1.0
        )
        for name, values in COLORS.items():
            similarity_ratio = self._color_similarity_ratio(values, annot_colors)
            # Raising the bar on every hit leaves the best match in `nearest`.
            if similarity_ratio > minimum_similarity:
                minimum_similarity = similarity_ratio
                nearest = name
        return nearest

    def _color_similarity_ratio(self, color_one, color_two) -> float:
        """Return the similarity of two colors as a float of at most 1.

        Takes two rgb color tuples made of floats between 0 and 1,
        e.g. (1, 0.65, 0) for orange, and returns 1 minus a third of
        their euclidian distance. Identical colors yield 1; full black
        against full white yields roughly 0.42, since their distance is
        sqrt(3) rather than 3.

        Color vectors with differing numbers of components (for example a
        single gray value against an rgb triple) cannot be compared and
        yield 0.0 instead of raising.
        """
        first, second = list(color_one), list(color_two)
        if len(first) != len(second):
            # math.dist raises ValueError on mismatched dimensions.
            return 0.0
        return 1 - (math.dist(first, second) / 3)
@dataclass
class AnnotatedDocument:
    """Pairs a papis document with a list of annotations."""

    # The papis document.
    document: Document
    # The annotations associated with the document; may be empty.
    annotations: list[Annotation]

128
papis_extract/exporter.py Normal file
View file

@ -0,0 +1,128 @@
import papis.logging
import papis.document
import papis.notes
import papis.commands.edit
import papis.api
import papis.git
import papis.config
import Levenshtein
from papis_extract.annotation_data import AnnotatedDocument, Annotation
# Module-level logger obtained through papis' logging helper.
logger = papis.logging.get_logger(__name__)
def _format_annotation(annotation: Annotation) -> str:
note = f"NOTE: {annotation.content}" if annotation.content else ""
return f"> {annotation.text}\n {note}"
def to_stdout(annots: list[AnnotatedDocument]) -> None:
    """Print the annotations of every document to stdout.

    Each document with at least one annotation gets a header consisting of
    its description framed by lines of "=" (as long as its title), followed
    by its formatted annotations. Documents without annotations are skipped.
    """
    for annotated in annots or ():
        if annotated.annotations:
            title_decoration = "=" * len(annotated.document.get("title", ""))
            print(
                f"{title_decoration}\n{papis.document.describe(annotated.document)}\n{title_decoration}\n"
            )
            for annotation in annotated.annotations:
                print(_format_annotation(annotation))
            # visually separate this document from the next one
            print("\n")
def to_notes(annots: list[AnnotatedDocument], edit: bool, git: bool) -> None:
    """Write annotations into document notes.

    Permanently writes the given annotations into notes
    belonging to papis documents. Creates new notes for
    documents missing a note field or appends to existing.

    Documents without annotations are skipped. If `edit` is set, the note
    of each document is opened for editing after writing. If `git` is set,
    it is passed on to both the writing and the editing step.
    """
    if not annots:
        return

    for entry in annots:
        if not entry.annotations:
            continue

        formatted_annotations: list[str] = [
            _format_annotation(a) for a in entry.annotations
        ]
        # Pass the git flag on, otherwise written notes are only ever
        # committed as a by-product of the manual edit below.
        _add_annots_to_note(entry.document, formatted_annotations, git=git)

        if edit:
            papis.commands.edit.edit_notes(entry.document, git=git)
def _add_annots_to_note(
    document: papis.document.Document,
    formatted_annotations: list[str],
    git: bool = False,
) -> None:
    """Append annotations missing from a document's note to its end.

    Reads the note file, filters out annotations it already seems to
    contain and appends the rest. Nothing is written if no annotation
    remains. With `git` set, the note and the document's info file are
    committed afterwards, provided the document has a main folder.
    """
    logger.debug("Adding annotations to note.")
    notes_path = papis.notes.notes_path_ensured(document)

    with open(notes_path, "r") as note_in:
        current_lines: list[str] = note_in.readlines()

    to_append: list[str] = _drop_existing_annotations(
        formatted_annotations, current_lines
    )
    if not to_append:
        return

    # add newline if theres no empty space at file end
    needs_separator = len(current_lines) > 0 and current_lines[-1].strip() != ""
    with open(notes_path, "a") as note_out:
        if needs_separator:
            note_out.write("\n")
        note_out.write("\n".join(to_append))
        note_out.write("\n")
        logger.info(
            f"Wrote {len(to_append)} annotations "
            f"to {papis.document.describe(document)}"
        )

    if not git:
        return
    msg = "Update notes for '{0}'".format(papis.document.describe(document))
    folder = document.get_main_folder()
    if folder:
        papis.git.add_and_commit_resources(
            folder, [notes_path, document.get_info_file()], msg
        )
def _drop_existing_annotations(
    formatted_annotations: list[str], file_lines: list[str]
) -> list[str]:
    """Return the annotations not yet present among `file_lines`.

    Only the first line of each formatted annotation is compared against
    the given lines, using the ``minimum_similarity`` setting (1.0 if it
    is unset) as threshold.
    """
    threshold = (
        papis.config.getfloat("minimum_similarity", "plugins.extract") or 1.0
    )
    return [
        candidate
        for candidate in formatted_annotations
        if not _test_similarity(candidate.splitlines()[0], file_lines, threshold)
    ]
def _test_similarity(
    string: str, lines: list[str], minimum_similarity: float = 1.0
) -> bool:
    """Tell whether any line resembles `string` closely enough.

    A line counts as similar if its Levenshtein ratio to `string` is
    strictly greater than `minimum_similarity`.
    """
    return any(
        Levenshtein.ratio(string, candidate) > minimum_similarity
        for candidate in lines
    )

View file

@ -0,0 +1,66 @@
from pathlib import Path
import Levenshtein
import fitz_new as fitz
import papis.config
from papis_extract.annotation_data import Annotation
# Lookup table from color name to tag, used by _tag_from_colorname. It is
# empty here, so unless it is filled elsewhere every lookup yields "".
COLOR_MAPPING = {}
def start(filename: Path) -> list[Annotation]:
    """Extract annotations from a file.

    Opens the file with fitz and walks over every annotation of every
    page, turning each one into an Annotation carrying its quote, note,
    colors, type name, 1-based page number and the tag derived from its
    color name.
    """

    def _to_annotation(page, annot) -> Annotation:
        # Build a single Annotation from a fitz annotation on `page`.
        quote, note = _retrieve_annotation_content(page, annot)
        extracted = Annotation(
            file=str(filename),
            text=quote,
            content=note,
            colors=annot.colors,
            type=annot.type[1],
            page=(page.number or 0) + 1,
        )
        extracted.tag = _tag_from_colorname(extracted.colorname)
        return extracted

    collected = []
    with fitz.Document(filename) as pdf:
        for page in pdf:
            for annot in page.annots():
                collected.append(_to_annotation(page, annot))
    return collected
def _tag_from_colorname(colorname):
    """Return the tag registered for `colorname` in COLOR_MAPPING, or ""."""
    if colorname in COLOR_MAPPING:
        return COLOR_MAPPING[colorname]
    return ""
def _retrieve_annotation_content(page, annotation):
    """Gets the text content of an annotation.

    Returns a (quote, note) tuple. The note stored in the annotation and
    the text found on the page inside the annotation's rectangle are
    compared by Levenshtein ratio: if they are more similar than the
    ``minimum_similarity_content`` setting they count as the same text
    and only a quote is returned. Otherwise whichever of the two exists
    is returned in its respective slot. Newlines are replaced by spaces
    in both.
    """
    note_text = annotation.info["content"].replace("\n", " ")
    highlighted = page.get_textbox(annotation.rect).replace("\n", " ")

    threshold = (
        papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0
    )

    # highlight with selection in note
    if Levenshtein.ratio(note_text, highlighted) > threshold:
        return (note_text, "")
    # highlight with selection not in note
    if not note_text:
        return (highlighted, "")
    # an independent note, not a highlight
    if not highlighted:
        return ("", note_text)
    # both a highlight and a note
    return (highlighted, note_text)