initial commit
This commit is contained in:
commit
a22cc635b2
9 changed files with 1596 additions and 0 deletions
125
papis_extract/__init__.py
Normal file
125
papis_extract/__init__.py
Normal file
|
|
@@ -0,0 +1,125 @@
|
|||
from pathlib import Path
|
||||
|
||||
import click
|
||||
import fitz_new as fitz
|
||||
import magic
|
||||
import papis.cli
|
||||
import papis.config
|
||||
import papis.document
|
||||
from papis.document import Document
|
||||
import papis.logging
|
||||
import papis.notes
|
||||
import papis.strings
|
||||
|
||||
from papis_extract import extractor, exporter
|
||||
from papis_extract.annotation_data import Annotation, AnnotatedDocument
|
||||
|
||||
# Module-level logger obtained through papis' logging helper.
logger = papis.logging.get_logger(__name__)

# Default values for the "plugins.extract" configuration section; they are
# registered with papis right below.
DEFAULT_OPTIONS = {
    "plugins.extract": {
        "tags": {"important": "red", "toread": "blue"},
        "on_import": False,
        # for checking against existing annotations
        "minimum_similarity": 0.75,
        # for checking if highlight or note
        "minimum_similarity_content": 0.9,
        # for matching tag to color
        "minimum_similarity_color": 0.833,
    }
}
papis.config.register_default_settings(DEFAULT_OPTIONS)
|
||||
|
||||
|
||||
@click.command("extract")
@click.help_option("-h", "--help")
@papis.cli.query_argument()
@papis.cli.doc_folder_option()
@papis.cli.git_option(help="Add changes made to the notes files")
@papis.cli.all_option()
@click.option(
    "--manual/--no-manual",
    "-m",
    help="Open each note in editor for manual editing after extracting its annotations",
)
@click.option(
    "--write/--no-write",
    "-w",
    help="Write annotations to the document notes instead of only printing them to stdout",
)
def main(
    query: str,
    _all: bool,
    doc_folder: str,
    manual: bool,
    write: bool,
    git: bool,
) -> None:
    """Extract annotations from any pdf document

    The extract plugin allows manual or automatic extraction of all annotations
    contained in the pdf documents belonging to entries of the papis library.
    It can write those changes to stdout or directly create and update notes
    for papis documents.

    It adds a `papis extract` subcommand through which it is invoked, but can
    optionally run whenever a new document is imported for a papis entry.
    """
    # NOTE: click uses the docstring above as the command's help text.
    documents = papis.cli.handle_doc_folder_query_all_sort(
        query, doc_folder, sort_field=None, sort_reverse=False, _all=_all
    )
    if not documents:
        logger.warning(papis.strings.no_documents_retrieved_message)
        return

    doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)

    # With --write the annotations are stored in the notes (optionally opening
    # each note in the editor); otherwise they are only printed.
    if write:
        exporter.to_notes(doc_annotations, edit=manual, git=git)
    else:
        exporter.to_stdout(doc_annotations)
|
||||
|
||||
|
||||
def is_pdf(fname: Path) -> bool:
    """Return True when ``magic.from_file`` reports the MIME type "application/pdf"."""
    mime_type = magic.from_file(fname, mime=True)
    return mime_type == "application/pdf"
|
||||
|
||||
|
||||
def _get_annotations_for_documents(
    documents: list[Document],
) -> list[AnnotatedDocument]:
    """Collect the annotations of every processable file of each document.

    For each document, all files returned by ``doc.get_files()`` are checked
    with ``_is_file_processable``; the annotations extracted from the
    processable ones are gathered into a single ``AnnotatedDocument``.
    A warning is logged for documents without any processable file. Every
    document yields exactly one entry in the result, possibly with an empty
    annotation list.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                # Skip only this file: a later file of the same document may
                # still be a usable PDF.
                continue
            found_pdf = True

            try:
                annotations.extend(extractor.start(fname))
            except fitz.FileDataError as e:
                # Report through the module logger like every other message.
                logger.error(f"File structure errors for {file}.\n{e}")

        if not found_pdf:
            logger.warning(
                "Did not find suitable PDF file for document: "
                f"{papis.document.describe(doc)}"
            )
        output.append(AnnotatedDocument(doc, annotations))
    return output
|
||||
|
||||
|
||||
def _is_file_processable(fname: Path) -> bool:
    """Tell whether ``fname`` is an existing regular file that ``is_pdf`` accepts.

    A path that is not a file is logged as an error and rejected.
    """
    if fname.is_file():
        return is_pdf(fname)

    logger.error(f"File {str(fname)} not readable.")
    return False
|
||||
98
papis_extract/annotation_data.py
Normal file
98
papis_extract/annotation_data.py
Normal file
|
|
@@ -0,0 +1,98 @@
|
|||
import re
|
||||
import math
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import papis.config
|
||||
from papis.document import Document
|
||||
|
||||
# Module-level similarity constants.
TEXT_SIMILARITY_MINIMUM = 0.75
COLOR_SIMILARITY_MINIMUM = 0.833

# Named reference colors as RGB tuples with components between 0 and 1.
COLORS = {
    "red": (1, 0, 0),
    "green": (0, 1, 0),
    "blue": (0, 0, 1),
    "yellow": (1, 1, 0),
    "purple": (0.5, 0, 0.5),
    "orange": (1, 0.65, 0),
}
|
||||
|
||||
|
||||
@dataclass
class Annotation:
    """A PDF annotation object"""

    # Path of the file the annotation was read from.
    file: str
    # Annotation type name.
    type: str = "Highlight"
    # Quoted text of the annotation (fills the "{quote}" marker).
    text: str = ""
    # Note attached to the annotation (fills the "{note}" marker).
    content: str = ""
    # Page number (fills the "{page}" marker).
    page: int = 1
    # Color dictionary; the "stroke" and "fill" keys are consulted by
    # ``colorname``.
    colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
    # Tag assigned to the annotation (fills the "{tag}" marker).
    tag: str = ""

    # Patterns used by ``format``; compiled once instead of on every call.
    # They are plain class attributes (no annotation), so the dataclass
    # machinery does not treat them as fields.
    _MARKER = re.compile(r"\{(?:newline|quote|note|page|tag)\}")
    _QUOTE_CONTAINER = re.compile(r"\{%quote_container(.*?)%\}")
    _NOTE_CONTAINER = re.compile(r"\{%note_container(.*?)%\}")
    _TAG_CONTAINER = re.compile(r"\{%tag_container(.*?)%\}")

    def format(self, formatting: str) -> str:
        """Return a formatted string of the annotation.

        Given a provided formatting pattern, this method returns the annotation
        formatted with the correct marker replacements and removals, ready
        for display or writing.

        ``{%quote_container ...%}``, ``{%note_container ...%}`` and
        ``{%tag_container ...%}`` sections are kept (without their wrapper)
        only when the quote, note or tag respectively is non-empty; otherwise
        the whole section is dropped. Afterwards the markers ``{quote}``,
        ``{note}``, ``{page}``, ``{newline}`` and ``{tag}`` are substituted.
        """
        replacements = {
            "{quote}": self.text,
            "{note}": self.content,
            "{page}": str(self.page),
            "{newline}": "\n",
            "{tag}": self.tag,
        }
        output = self._QUOTE_CONTAINER.sub(r"\1" if self.text else "", formatting)
        output = self._NOTE_CONTAINER.sub(r"\1" if self.content else "", output)
        output = self._TAG_CONTAINER.sub(r"\1" if self.tag else "", output)
        # Single pass, so text inserted for one marker is never re-scanned.
        return self._MARKER.sub(lambda m: replacements[m.group(0)], output)

    @property
    def colorname(self):
        """Return the stringified version of the annotation color.

        Finds the closest named color in ``COLORS`` to the annotation's
        stroke (or, failing that, fill) color using
        ``_color_similarity_ratio``. Returns ``None`` when no named color
        reaches the configured ``minimum_similarity_color`` or when the
        annotation color cannot be interpreted as gray, RGB or CMYK.
        """
        raw_color = (
            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
        )
        # Colors are not guaranteed to have three components; comparing
        # vectors of different length would make math.dist raise ValueError.
        annot_rgb = self._as_rgb(raw_color)
        if annot_rgb is None:
            return None

        threshold = (
            papis.config.getfloat("minimum_similarity_color", "plugins.extract")
            or 1.0
        )
        nearest = None
        best_ratio = threshold
        for name, values in COLORS.items():
            similarity_ratio = self._color_similarity_ratio(values, annot_rgb)
            # Inclusive threshold: with a threshold of 1.0 an exact color
            # match must still be accepted. Ties keep the first color found.
            if similarity_ratio >= threshold and (
                nearest is None or similarity_ratio > best_ratio
            ):
                best_ratio = similarity_ratio
                nearest = name
        return nearest

    @staticmethod
    def _as_rgb(color):
        """Convert a gray, RGB or CMYK component sequence to an RGB tuple.

        One component is read as gray, three as RGB and four as CMYK (naive
        conversion). Any other number of components yields ``None``.
        """
        components = tuple(color)
        if len(components) == 3:
            return components
        if len(components) == 1:
            return components * 3
        if len(components) == 4:
            c, m, y, k = components
            return ((1 - c) * (1 - k), (1 - m) * (1 - k), (1 - y) * (1 - k))
        return None

    def _color_similarity_ratio(self, color_one, color_two):
        """Return the similarity of two colors as a float.

        Takes two rgb color tuples made of floats between 0 and 1,
        e.g. (1, 0.65, 0) for orange, and returns one minus a third of
        their euclidean distance. Identical colors therefore give 1.
        Because the distance between full black and full white is
        sqrt(3), that pair gives roughly 0.42 rather than 0; the scaling
        is kept as is so that configured thresholds keep their meaning.
        """
        return 1 - (math.dist(tuple(color_one), tuple(color_two)) / 3)
|
||||
|
||||
|
||||
@dataclass
class AnnotatedDocument:
    """Pairs a papis document with a list of annotations."""

    # The papis document.
    document: Document
    # Annotation objects associated with the document.
    annotations: list[Annotation]
|
||||
128
papis_extract/exporter.py
Normal file
128
papis_extract/exporter.py
Normal file
|
|
@@ -0,0 +1,128 @@
|
|||
import papis.logging
|
||||
import papis.document
|
||||
import papis.notes
|
||||
import papis.commands.edit
|
||||
import papis.api
|
||||
import papis.git
|
||||
import papis.config
|
||||
import Levenshtein
|
||||
|
||||
from papis_extract.annotation_data import AnnotatedDocument, Annotation
|
||||
|
||||
# Module-level logger obtained through papis' logging helper.
logger = papis.logging.get_logger(__name__)
|
||||
|
||||
|
||||
def _format_annotation(annotation: Annotation) -> str:
    """Render an annotation as a "> quote" line followed by a note line.

    The second line holds "NOTE: <content>" when the annotation has content
    and is otherwise left with only its leading space.
    """
    note = ""
    if annotation.content:
        note = f"NOTE: {annotation.content}"
    return f"> {annotation.text}\n {note}"
|
||||
|
||||
|
||||
def to_stdout(annots: list[AnnotatedDocument]) -> None:
    """Print the annotations of every document to stdout.

    Each document that has annotations gets a header (its description framed
    by "=" lines as long as its title), followed by one formatted entry per
    annotation. Documents without annotations are skipped.
    """
    for entry in annots or []:
        if entry.annotations:
            title_decoration = "=" * len(entry.document.get("title", ""))
            print(
                f"{title_decoration}\n{papis.document.describe(entry.document)}\n{title_decoration}\n"
            )
            for annotation in entry.annotations:
                print(_format_annotation(annotation))
            print("\n")
|
||||
|
||||
|
||||
def to_notes(annots: list[AnnotatedDocument], edit: bool, git: bool) -> None:
    """Write annotations into document notes.

    Permanently writes the given annotations into notes
    belonging to papis documents. Creates new notes for
    documents missing a note field or appends to existing.

    Documents without annotations are skipped. When ``edit`` is set, the
    note of each processed document is opened for editing afterwards.
    ``git`` is forwarded both to the note-writing step and to the editor
    call so that written notes are committed even without ``edit``.
    """
    if not annots:
        return

    for entry in annots:
        if not entry.annotations:
            continue

        formatted_annotations: list[str] = [
            _format_annotation(a) for a in entry.annotations
        ]

        # Forward the git flag; otherwise it would only take effect together
        # with the manual edit below.
        _add_annots_to_note(entry.document, formatted_annotations, git=git)

        if edit:
            papis.commands.edit.edit_notes(entry.document, git=git)
|
||||
|
||||
|
||||
def _add_annots_to_note(
    document: papis.document.Document,
    formatted_annotations: list[str],
    git: bool = False,
) -> None:
    """Append annotations that are not yet in the note to its end.

    Reads the document's note file, filters out the annotations that
    ``_drop_existing_annotations`` considers already present and appends
    the remaining ones. Nothing is written when no annotation is new.
    With ``git`` set, the note and the document's info file are committed.
    """
    logger.debug("Adding annotations to note.")
    notes_path = papis.notes.notes_path_ensured(document)

    with open(notes_path, "r") as file_read:
        existing: list[str] = file_read.readlines()

    new_annotations: list[str] = _drop_existing_annotations(
        formatted_annotations, existing
    )
    if not new_annotations:
        return

    with open(notes_path, "a") as f:
        # add newline if theres no empty space at file end
        if existing and existing[-1].strip():
            f.write("\n")
        f.write("\n".join(new_annotations))
        f.write("\n")

    logger.info(
        f"Wrote {len(new_annotations)} annotations "
        f"to {papis.document.describe(document)}"
    )

    if not git:
        return

    msg = "Update notes for '{0}'".format(papis.document.describe(document))
    folder = document.get_main_folder()
    if folder:
        papis.git.add_and_commit_resources(
            folder, [notes_path, document.get_info_file()], msg
        )
|
||||
|
||||
|
||||
def _drop_existing_annotations(
    formatted_annotations: list[str], file_lines: list[str]
) -> list[str]:
    """Return the annotations whose first line is not yet found in ``file_lines``.

    The first line of each formatted annotation is compared against all
    file lines with ``_test_similarity``, using the configured
    ``minimum_similarity`` (1.0 when the setting yields no usable value).
    """
    threshold = (
        papis.config.getfloat("minimum_similarity", "plugins.extract") or 1.0
    )

    return [
        candidate
        for candidate in formatted_annotations
        if not _test_similarity(candidate.splitlines()[0], file_lines, threshold)
    ]
|
||||
|
||||
|
||||
def _test_similarity(
    string: str, lines: list[str], minimum_similarity: float = 1.0
) -> bool:
    """Tell whether ``string`` is similar enough to any of ``lines``.

    Each line is compared with ``Levenshtein.ratio`` after its trailing line
    ending has been removed, so lines coming straight from ``readlines()``
    are not penalised for their newline. The threshold is inclusive:
    with the default of 1.0 an identical line counts as a match.

    Returns True as soon as one line reaches ``minimum_similarity``,
    False otherwise (including for an empty ``lines``).
    """
    return any(
        Levenshtein.ratio(string, line.rstrip("\r\n")) >= minimum_similarity
        for line in lines
    )
|
||||
66
papis_extract/extractor.py
Normal file
66
papis_extract/extractor.py
Normal file
|
|
@@ -0,0 +1,66 @@
|
|||
from pathlib import Path
|
||||
|
||||
import Levenshtein
|
||||
import fitz_new as fitz
|
||||
import papis.config
|
||||
|
||||
from papis_extract.annotation_data import Annotation
|
||||
|
||||
# Lookup table read by _tag_from_colorname (color name -> tag). It is empty
# here, so every lookup falls back to the empty string.
# NOTE(review): presumably meant to be filled from the "tags" setting — confirm.
COLOR_MAPPING = {}
|
||||
|
||||
|
||||
def start(filename: Path) -> list[Annotation]:
    """Extract annotations from a file.

    Opens the file with ``fitz.Document`` and builds one ``Annotation`` for
    every annotation reported by ``page.annots()`` on every page, carrying
    its quote/note text, colors, type name and 1-based page number. The tag
    is derived from the annotation's color name. No filtering by annotation
    type is applied here.
    """
    collected = []
    with fitz.Document(filename) as doc:
        for page in doc:
            page_number = (page.number or 0) + 1
            for annot in page.annots():
                quote, note = _retrieve_annotation_content(page, annot)
                entry = Annotation(
                    file=str(filename),
                    type=annot.type[1],
                    text=quote,
                    content=note,
                    page=page_number,
                    colors=annot.colors,
                )
                entry.tag = _tag_from_colorname(entry.colorname)
                collected.append(entry)
    return collected
|
||||
|
||||
|
||||
def _tag_from_colorname(colorname):
    """Return the entry of ``COLOR_MAPPING`` for ``colorname``, or "" if there is none."""
    if colorname in COLOR_MAPPING:
        return COLOR_MAPPING[colorname]
    return ""
|
||||
|
||||
|
||||
def _retrieve_annotation_content(page, annotation):
    """Gets the text content of an annotation.

    Returns a ``(quote, note)`` tuple. Sometimes the annotation only
    covers written words on the page, sometimes it only carries an
    annotation note, sometimes both. A Levenshtein similarity
    comparison decides whether note and written words are the same
    text (then only the quote is returned) or should both be included.

    The comparison threshold is the ``minimum_similarity_content``
    setting (1.0 when it yields no usable value) and is inclusive, so a
    threshold of 1.0 still recognises identical strings.
    """
    content = annotation.info["content"].replace("\n", " ")
    written = page.get_textbox(annotation.rect).replace("\n", " ")

    minimum_similarity = (
        papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0
    )

    # highlight with selection in note
    if Levenshtein.ratio(content, written) >= minimum_similarity:
        return (content, "")
    # an independent note, not a highlight
    if content and not written:
        return ("", content)
    # both a highlight and a note
    if content:
        return (written, content)
    # highlight with selection not in note
    return (written, "")
|
||||
Loading…
Add table
Add a link
Reference in a new issue