From e325b89c9b2699e7d9e1333ce5a7f7a926c3b168 Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Tue, 29 Aug 2023 12:40:36 +0200
Subject: [PATCH 1/3] Move all extraction logic into extractor module

The publicly accessible default interface now only contains
the command-line interface and a single run function.
---
 papis_extract/__init__.py  | 64 +++++++++-----------------------------
 papis_extract/extractor.py | 52 +++++++++++++++++++++++++++++--
 2 files changed, 64 insertions(+), 52 deletions(-)

diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py
index 0b44c48..af3a834 100644
--- a/papis_extract/__init__.py
+++ b/papis_extract/__init__.py
@@ -1,19 +1,14 @@
-from pathlib import Path
-import re
-
 import click
-import fitz_new as fitz
-import magic
 import papis.cli
 import papis.config
 import papis.document
-from papis.document import Document
 import papis.logging
 import papis.notes
 import papis.strings
+from papis.document import Document
 
 from papis_extract import extractor, exporter
-from papis_extract.annotation_data import Annotation, AnnotatedDocument
+from papis_extract.annotation_data import AnnotatedDocument
 
 logger = papis.logging.get_logger(__name__)
 
@@ -76,50 +71,19 @@ def main(
         logger.warning(papis.strings.no_documents_retrieved_message)
         return
 
-    doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
+    run(documents, edit=manual, write=write, git=git)
+
+
+def run(
+    documents: list[Document],
+    edit: bool = False,
+    write: bool = False,
+    git: bool = False,
+) -> None:
+
+    doc_annotations: list[AnnotatedDocument] = extractor.start(documents)
 
     if write:
-        exporter.to_notes(doc_annotations, edit=manual, git=git)
+        exporter.to_notes(doc_annotations, edit=edit, git=git)
     else:
         exporter.to_stdout(doc_annotations)
-
-    # note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
-
-
-def is_pdf(fname: Path) -> bool:
-    return magic.from_file(fname, mime=True) == "application/pdf"
-
-
-def _get_annotations_for_documents(
-    documents: list[Document],
-) -> list[AnnotatedDocument]:
-    output: list[AnnotatedDocument] = []
-    for doc in documents:
-        annotations: list[Annotation] = []
-        found_pdf: bool = False
-        for file in doc.get_files():
-            fname = Path(file)
-            if not _is_file_processable(fname):
-                break
-            found_pdf = True
-
-            try:
-                annotations.extend(extractor.start(fname))
-            except fitz.FileDataError as e:
-                print(f"File structure errors for {file}.\n{e}")
-
-        if not found_pdf:
-            # have to remove curlys or papis logger gets upset
-            desc = re.sub("[{}]", "", papis.document.describe(doc))
-            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
-        output.append(AnnotatedDocument(doc, annotations))
-    return output
-
-
-def _is_file_processable(fname: Path) -> bool:
-    if not fname.is_file():
-        logger.error(f"File {str(fname)} not readable.")
-        return False
-    if not is_pdf(fname):
-        return False
-    return True
diff --git a/papis_extract/extractor.py b/papis_extract/extractor.py
index 88ff012..51d6ade 100644
--- a/papis_extract/extractor.py
+++ b/papis_extract/extractor.py
@@ -1,17 +1,51 @@
+import re
 from pathlib import Path
 from typing import Any, Optional
 
 import Levenshtein
+import magic
 import fitz_new as fitz
 import papis.logging
 import papis.config
+import papis.document
+from papis.document import Document
 
-from papis_extract.annotation_data import Annotation
+from papis_extract.annotation_data import Annotation, AnnotatedDocument
 
 logger = papis.logging.get_logger(__name__)
 
+def start(
+    documents: list[Document],
+) -> list[AnnotatedDocument]:
+    """Extract all annotations from passed documents.
 
-def start(filename: Path) -> list[Annotation]:
+    Returns all annotations contained in the papis 
+    documents passed in.
+    """
+
+    output: list[AnnotatedDocument] = []
+    for doc in documents:
+        annotations: list[Annotation] = []
+        found_pdf: bool = False
+        for file in doc.get_files():
+            fname = Path(file)
+            if not _is_file_processable(fname):
+                break
+            found_pdf = True
+
+            try:
+                annotations.extend(extract(fname))
+            except fitz.FileDataError as e:
+                print(f"File structure errors for {file}.\n{e}")
+
+        if not found_pdf:
+            # have to remove curlys or papis logger gets upset
+            desc = re.sub("[{}]", "", papis.document.describe(doc))
+            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
+        output.append(AnnotatedDocument(doc, annotations))
+    return output
+
+def extract(filename: Path) -> list[Annotation]:
     """Extract annotations from a file.
 
     Returns all readable annotations contained in the file
@@ -41,6 +75,20 @@ def start(filename: Path) -> list[Annotation]:
     return annotations
 
 
+def is_pdf(fname: Path) -> bool:
+    return magic.from_file(fname, mime=True) == "application/pdf"
+
+
+
+
+def _is_file_processable(fname: Path) -> bool:
+    if not fname.is_file():
+        logger.error(f"File {str(fname)} not readable.")
+        return False
+    if not is_pdf(fname):
+        return False
+    return True
+
 def _tag_from_colorname(colorname: str) -> str:
     color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
     if not color_mapping:

From 256117d45175ded87c3887ec5ae74c05442f8780 Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Tue, 29 Aug 2023 13:49:22 +0200
Subject: [PATCH 2/3] Add mustache templating

Add the mustache templating engine (chevron) so that custom
formatting strings can be provided.
---
 papis_extract/annotation_data.py | 42 +++++++++++-----------------
 poetry.lock                      | 13 ++++++++-
 pyproject.toml                   |  1 +
 tests/test_annotation.py         | 48 ++++++++++++++++++++++++++++++--
 4 files changed, 75 insertions(+), 29 deletions(-)

diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py
index 298f964..26e63cd 100644
--- a/papis_extract/annotation_data.py
+++ b/papis_extract/annotation_data.py
@@ -1,9 +1,9 @@
-import re
 import math
 from dataclasses import dataclass, field
 
 import papis.config
 from papis.document import Document
+import chevron
 
 TEXT_SIMILARITY_MINIMUM = 0.75
 COLOR_SIMILARITY_MINIMUM = 0.833
@@ -23,12 +23,13 @@ class Annotation:
     """A PDF annotation object"""
 
     file: str
-    type: str = "Highlight"
-    text: str = ""
-    content: str = ""
-    page: int = 1
     colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
+    content: str = ""
+    page: int = 0
     tag: str = ""
+    text: str = ""
+    type: str = "Highlight"
+    minimum_similarity_color: float = 1.0
 
     def format(self, formatting):
         """Return a formatted string of the annotation.
@@ -37,27 +38,15 @@ class Annotation:
         formatted with the correct marker replacements and removals, ready
         for display or writing.
         """
-        output = formatting
-        replacements = {
-            r"{quote}": self.text,
-            r"{note}": self.content,
-            r"{page}": str(self.page),
-            r"{newline}": "\n",
-            r"{tag}": self.tag,
+        data = {
+            "file": self.file,
+            "quote": self.text,
+            "note": self.content,
+            "page": self.page,
+            "tag": self.tag,
+            "type": self.type,
         }
-        pattern = re.compile(
-            "|".join(
-                [re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
-            ),
-            flags=re.DOTALL,
-        )
-        patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
-        patt_note_container = re.compile(r"{%note_container(.*?)%}")
-        patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
-        output = patt_quote_container.sub(r"\1" if self.text else "", output)
-        output = patt_note_container.sub(r"\1" if self.content else "", output)
-        output = patt_tag_container.sub(r"\1" if self.tag else "", output)
-        return pattern.sub(lambda x: replacements[x.group(0)], output)
+        return chevron.render(formatting, data)
 
     @property
     def colorname(self):
@@ -73,9 +62,10 @@ class Annotation:
         minimum_similarity = (
             papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0
         )
+        minimum_similarity = self.minimum_similarity_color
         for name, values in COLORS.items():
             similarity_ratio = self._color_similarity_ratio(values, annot_colors)
-            if similarity_ratio > minimum_similarity:
+            if similarity_ratio >= minimum_similarity:
                 minimum_similarity = similarity_ratio
                 nearest = name
         return nearest
diff --git a/poetry.lock b/poetry.lock
index 8d6734d..1336bfc 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -147,6 +147,17 @@ files = [
     {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
 ]
 
+[[package]]
+name = "chevron"
+version = "0.14.0"
+description = "Mustache templating language renderer"
+optional = false
+python-versions = "*"
+files = [
+    {file = "chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443"},
+    {file = "chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf"},
+]
+
 [[package]]
 name = "click"
 version = "8.1.7"
@@ -980,4 +991,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "d519605837788792d06ffc7bca7a92b315612ca6052227c53c558ec49dffec9f"
+content-hash = "a3af36ed2941235df158c20ba9b66bdf5a0af0554235fd004ec77c3e88def3c3"
diff --git a/pyproject.toml b/pyproject.toml
index 4ca3257..8ee68a7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ papis = "^0.13"
 click = "^8.1.7"
 whoosh = "^2.7.4"
 python-magic = "^0.4.27"
+chevron = "^0.14.0"
 
 [tool.poetry.plugins."papis.command"]
 extract = "papis_extract:main"
diff --git a/tests/test_annotation.py b/tests/test_annotation.py
index 72c7a75..d9f6188 100644
--- a/tests/test_annotation.py
+++ b/tests/test_annotation.py
@@ -1,7 +1,51 @@
+import pytest
 from papis_extract.annotation_data import Annotation
 
 
-def test_matches_colorname_exact():
-    sut = Annotation("testfile", colors={"stroke": (1.0, 0.0, 0.0)})
+@pytest.mark.parametrize(
+    "fmt_string,expected",
+    [
+        ("{{quote}}", "I am the text value"),
+        (
+            "> {{quote}}\n{{#note}}Note: {{note}}{{/note}}",
+            "> I am the text value\nNote: Whereas I represent the note",
+        ),
+        (
+            "{{#note}}Note: {{note}}{{/note}}{{#page}}, p. {{page}}{{/page}}",
+            "Note: Whereas I represent the note",
+        ),
+    ],
+)
+def test_formatting(fmt_string, expected):
+    sut = Annotation(
+        "myfile",
+        text="I am the text value",
+        content="Whereas I represent the note",
+    )
+
+    assert sut.format(fmt_string) == expected
+
+def test_colorname_matches_exact():
+    sut = Annotation(
+        "testfile", colors={"stroke": (1.0,0.0,0.0)}, minimum_similarity_color=1.0
+    )
+    c_name = sut.colorname
+    assert c_name == "red"
+
+# TODO inject closeness value instead of relying on default
+@pytest.mark.parametrize(
+    "color_value",
+    [
+        (1.0, 0.0, 0.0),
+        (0.9, 0.0, 0.0),
+        (0.8, 0.0, 0.0),
+        (0.7, 0.0, 0.0),
+        (0.51, 0.0, 0.0),
+    ],
+)
+def test_matches_inexact_colorname(color_value):
+    sut = Annotation(
+        "testfile", colors={"stroke": color_value}, minimum_similarity_color=0.833
+    )
     c_name = sut.colorname
     assert c_name == "red"

From 20873e6ef88184c60a25b2fd5d2f2294fad35650 Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Tue, 29 Aug 2023 22:23:52 +0200
Subject: [PATCH 3/3] Change annotation color to simple rgb tuple

---
 papis_extract/annotation_data.py |  4 ++--
 papis_extract/extractor.py       | 14 ++++++++++----
 tests/test_annotation.py         |  4 ++--
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py
index 26e63cd..fa06e0d 100644
--- a/papis_extract/annotation_data.py
+++ b/papis_extract/annotation_data.py
@@ -23,7 +23,7 @@ class Annotation:
     """A PDF annotation object"""
 
     file: str
-    colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
+    colors: tuple[float, float, float] = field(default_factory=lambda: (0.0, 0.0, 0.0))
     content: str = ""
     page: int = 0
     tag: str = ""
@@ -56,7 +56,7 @@ class Annotation:
         using euclidian distance between the two color vectors.
         """
         annot_colors = (
-            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
+            self.colors or (0.0, 0.0, 0.0)
         )
         nearest = None
         minimum_similarity = (
diff --git a/papis_extract/extractor.py b/papis_extract/extractor.py
index 51d6ade..a4bc536 100644
--- a/papis_extract/extractor.py
+++ b/papis_extract/extractor.py
@@ -14,12 +14,13 @@ from papis_extract.annotation_data import Annotation, AnnotatedDocument
 
 logger = papis.logging.get_logger(__name__)
 
+
 def start(
     documents: list[Document],
 ) -> list[AnnotatedDocument]:
     """Extract all annotations from passed documents.
 
-    Returns all annotations contained in the papis 
+    Returns all annotations contained in the papis
     documents passed in.
     """
 
@@ -45,6 +46,7 @@ def start(
         output.append(AnnotatedDocument(doc, annotations))
     return output
 
+
 def extract(filename: Path) -> list[Annotation]:
     """Extract annotations from a file.
 
@@ -58,11 +60,16 @@ def extract(filename: Path) -> list[Annotation]:
                 quote, note = _retrieve_annotation_content(page, annot)
                 if not quote and not note:
                     continue
+                col = (
+                    annot.colors.get("fill")
+                    or annot.colors.get("stroke")
+                    or (0.0, 0.0, 0.0)
+                )
                 a = Annotation(
                     file=str(filename),
                     text=quote or "",
                     content=note or "",
-                    colors=annot.colors,
+                    colors=col,
                     type=annot.type[1],
                     page=(page.number or 0) + 1,
                 )
@@ -79,8 +86,6 @@ def is_pdf(fname: Path) -> bool:
     return magic.from_file(fname, mime=True) == "application/pdf"
 
 
-
-
 def _is_file_processable(fname: Path) -> bool:
     if not fname.is_file():
         logger.error(f"File {str(fname)} not readable.")
@@ -89,6 +94,7 @@ def _is_file_processable(fname: Path) -> bool:
         return False
     return True
 
+
 def _tag_from_colorname(colorname: str) -> str:
     color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
     if not color_mapping:
diff --git a/tests/test_annotation.py b/tests/test_annotation.py
index d9f6188..542c3a8 100644
--- a/tests/test_annotation.py
+++ b/tests/test_annotation.py
@@ -27,7 +27,7 @@ def test_formatting(fmt_string, expected):
 
 def test_colorname_matches_exact():
     sut = Annotation(
-        "testfile", colors={"stroke": (1.0,0.0,0.0)}, minimum_similarity_color=1.0
+        "testfile", colors=(1.0,0.0,0.0), minimum_similarity_color=1.0
     )
     c_name = sut.colorname
     assert c_name == "red"
@@ -45,7 +45,7 @@ def test_colorname_matches_exact():
 )
 def test_matches_inexact_colorname(color_value):
     sut = Annotation(
-        "testfile", colors={"stroke": color_value}, minimum_similarity_color=0.833
+        "testfile", colors=color_value, minimum_similarity_color=0.833
     )
     c_name = sut.colorname
     assert c_name == "red"