Change annotation color to simple rgb tuple

Add mustache templating
Added mustache templating engine to be able to provide custom formatting strings.
2023-08-29 22:23:52 +02:00 · 2023-08-29 13:49:22 +02:00 · 2023-08-29 12:40:36 +02:00
6 changed files with 147 additions and 83 deletions
--- a/papis_extract/init.py
+++ b/papis_extract/init.py
@ -1,19 +1,14 @@
-from pathlib import Path
-import re
-
 import click
-import fitz_new as fitz
-import magic
 import papis.cli
 import papis.config
 import papis.document
-from papis.document import Document
 import papis.logging
 import papis.notes
 import papis.strings
+from papis.document import Document

 from papis_extract import extractor, exporter
-from papis_extract.annotation_data import Annotation, AnnotatedDocument
+from papis_extract.annotation_data import AnnotatedDocument

 logger = papis.logging.get_logger(__name__)

@ -76,50 +71,19 @@ def main(
        logger.warning(papis.strings.no_documents_retrieved_message)
        return

-    doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
+    run(documents, edit=manual, write=write, git=git)
+
+
+def run(
+    documents: list[Document],
+    edit: bool = False,
+    write: bool = False,
+    git: bool = False,
+) -> None:
+
+    doc_annotations: list[AnnotatedDocument] = extractor.start(documents)

    if write:
-        exporter.to_notes(doc_annotations, edit=manual, git=git)
+        exporter.to_notes(doc_annotations, edit=edit, git=git)
    else:
        exporter.to_stdout(doc_annotations)
-
-    # note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
-
-
-def is_pdf(fname: Path) -> bool:
-    return magic.from_file(fname, mime=True) == "application/pdf"
-
-
-def _get_annotations_for_documents(
-    documents: list[Document],
-) -> list[AnnotatedDocument]:
-    output: list[AnnotatedDocument] = []
-    for doc in documents:
-        annotations: list[Annotation] = []
-        found_pdf: bool = False
-        for file in doc.get_files():
-            fname = Path(file)
-            if not _is_file_processable(fname):
-                break
-            found_pdf = True
-
-            try:
-                annotations.extend(extractor.start(fname))
-            except fitz.FileDataError as e:
-                print(f"File structure errors for {file}.\n{e}")
-
-        if not found_pdf:
-            # have to remove curlys or papis logger gets upset
-            desc = re.sub("[{}]", "", papis.document.describe(doc))
-            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
-        output.append(AnnotatedDocument(doc, annotations))
-    return output
-
-
-def _is_file_processable(fname: Path) -> bool:
-    if not fname.is_file():
-        logger.error(f"File {str(fname)} not readable.")
-        return False
-    if not is_pdf(fname):
-        return False
-    return True
--- a/papis_extract/annotation_data.py
+++ b/papis_extract/annotation_data.py
@ -1,9 +1,9 @@
-import re
 import math
 from dataclasses import dataclass, field

 import papis.config
 from papis.document import Document
+import chevron

 TEXT_SIMILARITY_MINIMUM = 0.75
 COLOR_SIMILARITY_MINIMUM = 0.833
@ -23,12 +23,13 @@ class Annotation:
    """A PDF annotation object"""

    file: str
-    type: str = "Highlight"
-    text: str = ""
+    colors: tuple[float, float, float] = field(default_factory=lambda: (0.0, 0.0, 0.0))
    content: str = ""
-    page: int = 1
-    colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
+    page: int = 0
    tag: str = ""
+    text: str = ""
+    type: str = "Highlight"
+    minimum_similarity_color: float = 1.0

    def format(self, formatting):
        """Return a formatted string of the annotation.
@ -37,27 +38,15 @@ class Annotation:
        formatted with the correct marker replacements and removals, ready
        for display or writing.
        """
-        output = formatting
-        replacements = {
-            r"{quote}": self.text,
-            r"{note}": self.content,
-            r"{page}": str(self.page),
-            r"{newline}": "\n",
-            r"{tag}": self.tag,
+        data = {
+            "file": self.file,
+            "quote": self.text,
+            "note": self.content,
+            "page": self.page,
+            "tag": self.tag,
+            "type": self.type,
        }
-        pattern = re.compile(
-            "|".join(
-                [re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
-            ),
-            flags=re.DOTALL,
-        )
-        patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
-        patt_note_container = re.compile(r"{%note_container(.*?)%}")
-        patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
-        output = patt_quote_container.sub(r"\1" if self.text else "", output)
-        output = patt_note_container.sub(r"\1" if self.content else "", output)
-        output = patt_tag_container.sub(r"\1" if self.tag else "", output)
-        return pattern.sub(lambda x: replacements[x.group(0)], output)
+        return chevron.render(formatting, data)

    @property
    def colorname(self):
@ -67,15 +56,16 @@ class Annotation:
        using euclidian distance between the two color vectors.
        """
        annot_colors = (
-            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
+            self.colors or (0.0, 0.0, 0.0)
        )
        nearest = None
        minimum_similarity = (
            papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0
        )
+        minimum_similarity = self.minimum_similarity_color
        for name, values in COLORS.items():
            similarity_ratio = self._color_similarity_ratio(values, annot_colors)
-            if similarity_ratio > minimum_similarity:
+            if similarity_ratio >= minimum_similarity:
                minimum_similarity = similarity_ratio
                nearest = name
        return nearest
--- a/papis_extract/extractor.py
+++ b/papis_extract/extractor.py
@ -1,17 +1,53 @@
+import re
 from pathlib import Path
 from typing import Any, Optional

 import Levenshtein
+import magic
 import fitz_new as fitz
 import papis.logging
 import papis.config
+import papis.document
+from papis.document import Document

-from papis_extract.annotation_data import Annotation
+from papis_extract.annotation_data import Annotation, AnnotatedDocument

 logger = papis.logging.get_logger(__name__)


-def start(filename: Path) -> list[Annotation]:
+def start(
+    documents: list[Document],
+) -> list[AnnotatedDocument]:
+    """Extract all annotations from passed documents.
+
+    Returns all annotations contained in the papis
+    documents passed in.
+    """
+
+    output: list[AnnotatedDocument] = []
+    for doc in documents:
+        annotations: list[Annotation] = []
+        found_pdf: bool = False
+        for file in doc.get_files():
+            fname = Path(file)
+            if not _is_file_processable(fname):
+                break
+            found_pdf = True
+
+            try:
+                annotations.extend(extract(fname))
+            except fitz.FileDataError as e:
+                print(f"File structure errors for {file}.\n{e}")
+
+        if not found_pdf:
+            # have to remove curlys or papis logger gets upset
+            desc = re.sub("[{}]", "", papis.document.describe(doc))
+            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
+        output.append(AnnotatedDocument(doc, annotations))
+    return output
+
+
+def extract(filename: Path) -> list[Annotation]:
    """Extract annotations from a file.

    Returns all readable annotations contained in the file
@ -24,11 +60,16 @@ def start(filename: Path) -> list[Annotation]:
                quote, note = _retrieve_annotation_content(page, annot)
                if not quote and not note:
                    continue
+                col = (
+                    annot.colors.get("fill")
+                    or annot.colors.get("stroke")
+                    or (0.0, 0.0, 0.0)
+                )
                a = Annotation(
                    file=str(filename),
                    text=quote or "",
                    content=note or "",
-                    colors=annot.colors,
+                    colors=col,
                    type=annot.type[1],
                    page=(page.number or 0) + 1,
                )
@ -41,6 +82,19 @@ def start(filename: Path) -> list[Annotation]:
    return annotations


+def is_pdf(fname: Path) -> bool:
+    return magic.from_file(fname, mime=True) == "application/pdf"
+
+
+def _is_file_processable(fname: Path) -> bool:
+    if not fname.is_file():
+        logger.error(f"File {str(fname)} not readable.")
+        return False
+    if not is_pdf(fname):
+        return False
+    return True
+
+
 def _tag_from_colorname(colorname: str) -> str:
    color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
    if not color_mapping:
--- a/poetry.lock
+++ b/poetry.lock
@ -147,6 +147,17 @@ files = [
    {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
 ]

+[[package]]
+name = "chevron"
+version = "0.14.0"
+description = "Mustache templating language renderer"
+optional = false
+python-versions = "*"
+files = [
+    {file = "chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443"},
+    {file = "chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf"},
+]
+
 [[package]]
 name = "click"
 version = "8.1.7"
@ -980,4 +991,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "d519605837788792d06ffc7bca7a92b315612ca6052227c53c558ec49dffec9f"
+content-hash = "a3af36ed2941235df158c20ba9b66bdf5a0af0554235fd004ec77c3e88def3c3"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -14,6 +14,7 @@ papis = "^0.13"
 click = "^8.1.7"
 whoosh = "^2.7.4"
 python-magic = "^0.4.27"
+chevron = "^0.14.0"

 [tool.poetry.plugins."papis.command"]
 extract = "papis_extract:main"
--- a/tests/test_annotation.py
+++ b/tests/test_annotation.py
@ -1,7 +1,51 @@
+import pytest
 from papis_extract.annotation_data import Annotation


-def test_matches_colorname_exact():
-    sut = Annotation("testfile", colors={"stroke": (1.0, 0.0, 0.0)})
+@pytest.mark.parametrize(
+    "fmt_string,expected",
+    [
+        ("{{quote}}", "I am the text value"),
+        (
+            "> {{quote}}\n{{#note}}Note: {{note}}{{/note}}",
+            "> I am the text value\nNote: Whereas I represent the note",
+        ),
+        (
+            "{{#note}}Note: {{note}}{{/note}}{{#page}}, p. {{page}}{{/page}}",
+            "Note: Whereas I represent the note",
+        ),
+    ],
+)
+def test_formatting(fmt_string, expected):
+    sut = Annotation(
+        "myfile",
+        text="I am the text value",
+        content="Whereas I represent the note",
+    )
+
+    assert sut.format(fmt_string) == expected
+
+def test_colorname_matches_exact():
+    sut = Annotation(
+        "testfile", colors=(1.0,0.0,0.0), minimum_similarity_color=1.0
+    )
+    c_name = sut.colorname
+    assert c_name == "red"
+
+# TODO inject closeness value instead of relying on default
+@pytest.mark.parametrize(
+    "color_value",
+    [
+        (1.0, 0.0, 0.0),
+        (0.9, 0.0, 0.0),
+        (0.8, 0.0, 0.0),
+        (0.7, 0.0, 0.0),
+        (0.51, 0.0, 0.0),
+    ],
+)
+def test_matches_inexact_colorname(color_value):
+    sut = Annotation(
+        "testfile", colors=color_value, minimum_similarity_color=0.833
+    )
    c_name = sut.colorname
    assert c_name == "red"
Author	SHA1	Message	Date
Marty Oehme	20873e6ef8	Change annotation color to simple rgb tuple (CI: some checks failed; ci/woodpecker/push/test: unknown status; ci/woodpecker/push/lint: pipeline failed; ci/woodpecker/push/static_analysis: pipeline was successful)	2023-08-29 22:23:52 +02:00
Marty Oehme	256117d451	Add mustache templating Added mustache templating engine to be able to provide custom formatting strings.	2023-08-29 13:49:22 +02:00
Marty Oehme	e325b89c9b	Move all extraction logic into extractor module. The publicly accessible default interface only contains the command line command interface and a single run function.	2023-08-29 12:40:36 +02:00