initial commit

2023-08-28 10:28:06 +02:00 · 2023-08-28 10:28:06 +02:00 · a22cc635b2
commit a22cc635b2
9 changed files with 1596 additions and 0 deletions
--- a/papis_extract/init.py
+++ b/papis_extract/init.py
@ -0,0 +1,125 @@
+from pathlib import Path
+
+import click
+import fitz_new as fitz
+import magic
+import papis.cli
+import papis.config
+import papis.document
+from papis.document import Document
+import papis.logging
+import papis.notes
+import papis.strings
+
+from papis_extract import extractor, exporter
+from papis_extract.annotation_data import Annotation, AnnotatedDocument
+
+# Module-level logger obtained through the papis logging helper.
+logger = papis.logging.get_logger(__name__)
+
+# Default values for the "plugins.extract" configuration section; they are
+# registered with papis right below so config lookups can fall back to them.
+DEFAULT_OPTIONS = {
+    "plugins.extract": {
+        "tags": {"important": "red", "toread": "blue"},
+        "on_import": False,
+        # for checking against existing annotations
+        "minimum_similarity": 0.75,
+        # for checking if highlight or note
+        "minimum_similarity_content": 0.9,
+        # for matching tag to color
+        "minimum_similarity_color": 0.833,
+    }
+}
+papis.config.register_default_settings(DEFAULT_OPTIONS)
+
+
+@click.command("extract")
+@click.help_option("-h", "--help")
+@papis.cli.query_argument()
+@papis.cli.doc_folder_option()
+@papis.cli.git_option(help="Add changes made to the notes files")
+@papis.cli.all_option()
+@click.option(
+    "--manual/--no-manual",
+    "-m",
+    help="Open each note in editor for manual editing after extracting its annotations",
+)
+@click.option(
+    "--write/--no-write",
+    "-w",
+    help="Write annotations to notes instead of only printing results to stdout",
+)
+def main(
+    query: str,
+    _all: bool,
+    doc_folder: str,
+    manual: bool,
+    write: bool,
+    git: bool,
+) -> None:
+    """Extract annotations from any pdf document
+
+    The extract plugin allows manual or automatic extraction of all annotations
+    contained in the pdf documents belonging to entries of the papis library.
+    It can write those changes to stdout or directly create and update notes
+    for papis documents.
+
+    It adds a `papis extract` subcommand through which it is invoked, but can
+    optionally run whenever a new document is imported for a papis entry.
+    """
+    # NOTE: the docstring above doubles as the command's --help text, so the
+    # parameters are described here instead:
+    #   query      -- papis query selecting the documents to process
+    #   _all       -- forwarded to the papis document picker as `_all`
+    #   doc_folder -- forwarded to the papis document picker
+    #   manual     -- open each note in the editor after writing (needs --write)
+    #   write      -- write into notes when set, otherwise print to stdout
+    #   git        -- forwarded to the notes exporter
+    documents = papis.cli.handle_doc_folder_query_all_sort(
+        query, doc_folder, sort_field=None, sort_reverse=False, _all=_all
+    )
+    if not documents:
+        logger.warning(papis.strings.no_documents_retrieved_message)
+        return
+
+    doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
+
+    if write:
+        exporter.to_notes(doc_annotations, edit=manual, git=git)
+    else:
+        exporter.to_stdout(doc_annotations)
+
+
+def is_pdf(fname: Path) -> bool:
+    """Return True when the MIME type detected for `fname` is application/pdf."""
+    detected_mime = magic.from_file(fname, mime=True)
+    return detected_mime == "application/pdf"
+
+
+def _get_annotations_for_documents(
+    documents: list[Document],
+) -> list[AnnotatedDocument]:
+    """Extract the annotations of every processable file of each document.
+
+    For each document all attached files are inspected; files that are not
+    processable are skipped individually and the annotations of all remaining
+    files are collected. A warning is logged for documents without any
+    processable file.
+
+    Returns one AnnotatedDocument per input document, in input order. Its
+    annotation list is empty when nothing could be extracted.
+    """
+    output: list[AnnotatedDocument] = []
+    for doc in documents:
+        annotations: list[Annotation] = []
+        found_pdf: bool = False
+        for file in doc.get_files():
+            fname = Path(file)
+            # Skip only this file: a later file of the same document may
+            # still be a readable PDF, so do not abort the whole loop.
+            if not _is_file_processable(fname):
+                continue
+            found_pdf = True
+
+            try:
+                annotations.extend(extractor.start(fname))
+            except fitz.FileDataError as e:
+                # Report through the module logger like every other problem
+                # in this file, and carry on with the next file.
+                logger.error(f"File structure errors for {file}.\n{e}")
+
+        if not found_pdf:
+            logger.warning(
+                "Did not find suitable PDF file for document: "
+                f"{papis.document.describe(doc)}"
+            )
+        output.append(AnnotatedDocument(doc, annotations))
+    return output
+
+
+def _is_file_processable(fname: Path) -> bool:
+    """Tell whether `fname` is an existing regular file detected as a PDF.
+
+    Logs an error when the path is not a file; a file that exists but is not
+    a PDF is rejected silently.
+    """
+    if fname.is_file():
+        return is_pdf(fname)
+    logger.error(f"File {str(fname)} not readable.")
+    return False
--- a/papis_extract/annotation_data.py
+++ b/papis_extract/annotation_data.py
@ -0,0 +1,98 @@
+import re
+import math
+from dataclasses import dataclass, field
+
+import papis.config
+from papis.document import Document
+
+# Similarity thresholds as ratios between 0 and 1.
+# NOTE(review): neither constant is referenced in the visible code, which
+# reads its thresholds from the papis configuration instead.
+TEXT_SIMILARITY_MINIMUM = 0.75
+COLOR_SIMILARITY_MINIMUM = 0.833
+
+# Named reference colors as (red, green, blue) tuples, each channel in 0..1.
+COLORS = dict(
+    red=(1, 0, 0),
+    green=(0, 1, 0),
+    blue=(0, 0, 1),
+    yellow=(1, 1, 0),
+    purple=(0.5, 0, 0.5),
+    orange=(1, 0.65, 0),
+)
+
+
+@dataclass
+class Annotation:
+    """A single annotation extracted from a PDF file."""
+
+    # path of the file the annotation was read from
+    file: str
+    # annotation type name
+    type: str = "Highlight"
+    # substituted for the {quote} marker when formatting
+    text: str = ""
+    # substituted for the {note} marker when formatting
+    content: str = ""
+    page: int = 1
+    # color tuples keyed by role; "stroke" and "fill" are the keys read here
+    colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
+    tag: str = ""
+
+    def format(self, formatting):
+        """Render the annotation according to a formatting pattern.
+
+        Two kinds of placeholders are understood:
+
+        * conditional containers ``{%quote_container...%}``,
+          ``{%note_container...%}`` and ``{%tag_container...%}``, whose inner
+          text is kept only if the matching value (text, content, tag) is
+          non-empty and dropped otherwise;
+        * plain markers ``{quote}``, ``{note}``, ``{page}``, ``{newline}`` and
+          ``{tag}``, which are replaced by the corresponding value.
+
+        Containers are resolved first, markers afterwards. Returns the
+        resulting string.
+        """
+        containers = (
+            (r"{%quote_container(.*?)%}", self.text),
+            (r"{%note_container(.*?)%}", self.content),
+            (r"{%tag_container(.*?)%}", self.tag),
+        )
+        rendered = formatting
+        for container_regex, value in containers:
+            # keep the captured inner text only when there is a value to show
+            kept = r"\1" if value else ""
+            rendered = re.sub(container_regex, kept, rendered)
+
+        markers = {
+            r"{quote}": self.text,
+            r"{note}": self.content,
+            r"{page}": str(self.page),
+            r"{newline}": "\n",
+            r"{tag}": self.tag,
+        }
+        # longest marker first, so a longer marker is never shadowed by a
+        # shorter alternative
+        ordered_markers = sorted(markers, key=len, reverse=True)
+        marker_regex = re.compile(
+            "|".join(map(re.escape, ordered_markers)),
+            flags=re.DOTALL,
+        )
+        return marker_regex.sub(lambda match: markers[match.group(0)], rendered)
+
+    @property
+    def colorname(self):
+        """Name of the reference color closest to the annotation color.
+
+        Uses the "stroke" color, falling back to "fill" and then to black.
+        Every entry of COLORS is compared against it; the name with the
+        highest similarity is returned, provided that similarity exceeds the
+        configured ``minimum_similarity_color`` (1.0 if that setting is
+        unset or zero). Returns None when no color is similar enough.
+        """
+        annotation_rgb = (
+            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
+        )
+        best_name = None
+        best_ratio = (
+            papis.config.getfloat("minimum_similarity_color", "plugins.extract")
+            or 1.0
+        )
+        for candidate, candidate_rgb in COLORS.items():
+            ratio = self._color_similarity_ratio(candidate_rgb, annotation_rgb)
+            if ratio > best_ratio:
+                best_name, best_ratio = candidate, ratio
+        return best_name
+
+    def _color_similarity_ratio(self, color_one, color_two):
+        """Return a similarity score for two rgb color tuples.
+
+        Both colors are sequences of floats between 0 and 1, e.g.
+        (1, 0.65, 0) for orange. The score is one minus a third of their
+        Euclidean distance, so identical colors score 1.
+
+        NOTE(review): the distance is divided by 3 although the largest
+        possible distance between two such colors is sqrt(3); full black
+        against full white therefore scores about 0.42 rather than 0.
+        """
+        distance = math.dist(list(color_one), list(color_two))
+        return 1 - (abs(distance) / 3)
+
+
+@dataclass
+class AnnotatedDocument:
+    """Pairs a papis document with the annotations collected for it."""
+
+    # the papis document the annotations belong to
+    document: Document
+    # annotations gathered for the document; may be empty
+    annotations: list[Annotation]
--- a/papis_extract/exporter.py
+++ b/papis_extract/exporter.py
@ -0,0 +1,128 @@
+import papis.logging
+import papis.document
+import papis.notes
+import papis.commands.edit
+import papis.api
+import papis.git
+import papis.config
+import Levenshtein
+
+from papis_extract.annotation_data import AnnotatedDocument, Annotation
+
+# Module-level logger obtained through the papis logging helper.
+logger = papis.logging.get_logger(__name__)
+
+
+def _format_annotation(annotation: Annotation) -> str:
+    """Render an annotation as a quote line followed by an indented note line.
+
+    The first line is the annotation text prefixed with "> ". The second line
+    starts with two spaces and carries "NOTE: <content>" when the annotation
+    has content; otherwise it holds just the two spaces.
+    """
+    if annotation.content:
+        note_part = f"NOTE: {annotation.content}"
+    else:
+        note_part = ""
+    return f"> {annotation.text}\n  {note_part}"
+
+
+def to_stdout(annots: list[AnnotatedDocument]) -> None:
+    """Print the annotations of every document to stdout.
+
+    Each document with at least one annotation gets a header made of its
+    description framed by two rules of "=" (as long as the document title),
+    followed by its formatted annotations and a blank separator. Documents
+    without annotations are skipped.
+    """
+    for entry in annots or ():
+        if entry.annotations:
+            rule = "=" * len(entry.document.get("title", ""))
+            print(
+                f"{rule}\n{papis.document.describe(entry.document)}\n{rule}\n"
+            )
+            for annotation in entry.annotations:
+                print(_format_annotation(annotation))
+            print("\n")
+
+
+def to_notes(annots: list[AnnotatedDocument], edit: bool, git: bool) -> None:
+    """Write annotations into document notes.
+
+    Permanently writes the given annotations into notes
+    belonging to papis documents. Creates new notes for
+    documents missing a note field or appends to existing.
+
+    Documents without annotations are skipped. When `edit` is set, the note
+    of each processed document is opened for editing after writing. `git` is
+    forwarded both to the note writer and to the edit command.
+    """
+    if not annots:
+        return
+
+    for entry in annots:
+        if not entry.annotations:
+            continue
+
+        formatted_annotations: list[str] = [
+            _format_annotation(a) for a in entry.annotations
+        ]
+
+        # Forward the git flag: without it the writer always uses its default
+        # (False) and extracted annotations are never committed on their own.
+        _add_annots_to_note(entry.document, formatted_annotations, git=git)
+
+        if edit:
+            papis.commands.edit.edit_notes(entry.document, git=git)
+
+
+def _add_annots_to_note(
+    document: papis.document.Document,
+    formatted_annotations: list[str],
+    git: bool = False,
+) -> None:
+    """Append annotations that are not yet in the note to the note file.
+
+    Reads the document's note, filters out the annotations that are already
+    present and appends the rest at the end of the file. Nothing is written
+    or committed when no new annotation remains. With `git` set, the note
+    and the document's info file are committed afterwards, provided the
+    document has a main folder.
+    """
+    logger.debug("Adding annotations to note.")
+    notes_path = papis.notes.notes_path_ensured(document)
+
+    with open(notes_path, "r") as note_file:
+        current_lines: list[str] = note_file.readlines()
+
+    to_append: list[str] = _drop_existing_annotations(
+        formatted_annotations, current_lines
+    )
+    if not to_append:
+        return
+
+    # a separating newline is needed unless the note is empty or its last
+    # line is already blank
+    needs_separator = len(current_lines) > 0 and current_lines[-1].strip() != ""
+    with open(notes_path, "a") as note_file:
+        if needs_separator:
+            note_file.write("\n")
+        note_file.write("\n".join(to_append) + "\n")
+        logger.info(
+            f"Wrote {len(to_append)} annotations "
+            f"to {papis.document.describe(document)}"
+        )
+
+    if not git:
+        return
+    msg = "Update notes for '{0}'".format(papis.document.describe(document))
+    folder = document.get_main_folder()
+    if folder:
+        papis.git.add_and_commit_resources(
+            folder, [notes_path, document.get_info_file()], msg
+        )
+
+
+def _drop_existing_annotations(
+    formatted_annotations: list[str], file_lines: list[str]
+) -> list[str]:
+    """Return the annotations whose first line is not yet among `file_lines`.
+
+    Only the first line of each formatted annotation is compared with the
+    given lines. The comparison threshold is the ``minimum_similarity``
+    setting of the plugin, or 1.0 when that setting is unset or zero.
+    """
+    threshold = (
+        papis.config.getfloat("minimum_similarity", "plugins.extract") or 1.0
+    )
+    return [
+        annotation
+        for annotation in formatted_annotations
+        if not _test_similarity(annotation.splitlines()[0], file_lines, threshold)
+    ]
+
+
+def _test_similarity(
+    string: str, lines: list[str], minimum_similarity: float = 1.0
+) -> bool:
+    """Tell whether any of `lines` is similar enough to `string`.
+
+    Similarity is the Levenshtein ratio between `string` and a line, with
+    the line's trailing line break removed first so that lines obtained from
+    `readlines()` compare equal to their newline-free counterpart.
+
+    Returns True as soon as one line reaches `minimum_similarity`. The
+    comparison is inclusive, so the default threshold of 1.0 matches
+    identical strings. Returns False for an empty `lines`.
+    """
+    return any(
+        Levenshtein.ratio(string, line.rstrip("\r\n")) >= minimum_similarity
+        for line in lines
+    )
--- a/papis_extract/extractor.py
+++ b/papis_extract/extractor.py
@ -0,0 +1,66 @@
+from pathlib import Path
+
+import Levenshtein
+import fitz_new as fitz
+import papis.config
+
+from papis_extract.annotation_data import Annotation
+
+# Lookup table from color name to tag, used to tag extracted annotations.
+# NOTE(review): the table is empty and nothing in this file fills it, so
+# every lookup yields the empty tag -- confirm whether it is meant to be
+# populated from the plugin's "tags" setting.
+COLOR_MAPPING = {}
+
+
+def start(filename: Path) -> list[Annotation]:
+    """Collect the annotations stored in a file.
+
+    Opens the file as a document and walks over all annotations of all
+    pages. Each one becomes an Annotation carrying its quote and note text,
+    its colors, its type name, its 1-based page number and the tag looked up
+    for its color name. No filtering by annotation type happens here.
+
+    Returns the annotations in page order.
+    """
+    collected = []
+    with fitz.Document(filename) as pdf:
+        for page in pdf:
+            for annot in page.annots():
+                quote, note = _retrieve_annotation_content(page, annot)
+                entry = Annotation(
+                    file=str(filename),
+                    text=quote,
+                    content=note,
+                    colors=annot.colors,
+                    type=annot.type[1],
+                    # a missing page number is treated as the first page
+                    page=(page.number or 0) + 1,
+                )
+                # the color name is derived from the finished annotation
+                entry.tag = _tag_from_colorname(entry.colorname)
+                collected.append(entry)
+    return collected
+
+
+def _tag_from_colorname(colorname):
+    """Return the tag mapped to `colorname`, or "" when there is none."""
+    if colorname in COLOR_MAPPING:
+        return COLOR_MAPPING[colorname]
+    return ""
+
+
+def _retrieve_annotation_content(page, annotation):
+    """Return the (quote, note) text pair of an annotation.
+
+    Two texts are read, each with line breaks turned into spaces: the
+    annotation's own "content" info entry and the page text inside the
+    annotation rectangle. They are combined as follows:
+
+    * if their Levenshtein ratio exceeds the ``minimum_similarity_content``
+      setting (1.0 when unset or zero), the content merely repeats the
+      selection and is returned as the quote, with an empty note;
+    * content without page text is a free-standing note: ("", content);
+    * content together with page text is a quote plus note;
+    * page text alone is a quote with an empty note.
+    """
+    note_text = annotation.info["content"].replace("\n", " ")
+    page_text = page.get_textbox(annotation.rect).replace("\n", " ")
+
+    threshold = (
+        papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0
+    )
+    # highlight whose note just repeats the selected text
+    if Levenshtein.ratio(note_text, page_text) > threshold:
+        return (note_text, "")
+    # highlight without any note
+    if not note_text:
+        return (page_text, "")
+    # note that is not attached to any text
+    if not page_text:
+        return ("", note_text)
+    # highlight accompanied by a separate note
+    return (page_text, note_text)