6 changed files with 84 additions and 148 deletions
--- a/papis_extract/init.py
+++ b/papis_extract/init.py
@ -1,14 +1,19 @@
 from pathlib import Path
 import re
 import click
 import fitz_new as fitz
 import magic
 import papis.cli
 import papis.config
 import papis.document
 from papis.document import Document
 import papis.logging
 import papis.notes
 import papis.strings
 from papis.document import Document
 from papis_extract import extractor, exporter
-from papis_extract.annotation_data import AnnotatedDocument
+from papis_extract.annotation_data import Annotation, AnnotatedDocument
 logger = papis.logging.get_logger(__name__)
@ -71,19 +76,50 @@ def main(
        logger.warning(papis.strings.no_documents_retrieved_message)
        return
-    run(documents, edit=manual, write=write, git=git)
+    doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
 def run(
    documents: list[Document],
    edit: bool = False,
    write: bool = False,
    git: bool = False,
 ) -> None:
    doc_annotations: list[AnnotatedDocument] = extractor.start(documents)
    if write:
-        exporter.to_notes(doc_annotations, edit=edit, git=git)
+        exporter.to_notes(doc_annotations, edit=manual, git=git)
    else:
        exporter.to_stdout(doc_annotations)
    # note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
 def is_pdf(fname: Path) -> bool:
    return magic.from_file(fname, mime=True) == "application/pdf"
 def _get_annotations_for_documents(
    documents: list[Document],
 ) -> list[AnnotatedDocument]:
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                break
            found_pdf = True
            try:
                annotations.extend(extractor.start(fname))
            except fitz.FileDataError as e:
                print(f"File structure errors for {file}.\n{e}")
        if not found_pdf:
            # have to remove curlys or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
        output.append(AnnotatedDocument(doc, annotations))
    return output
 def _is_file_processable(fname: Path) -> bool:
    if not fname.is_file():
        logger.error(f"File {str(fname)} not readable.")
        return False
    if not is_pdf(fname):
        return False
    return True
--- a/papis_extract/annotation_data.py
+++ b/papis_extract/annotation_data.py
@ -1,9 +1,9 @@
 import re
 import math
 from dataclasses import dataclass, field
 import papis.config
 from papis.document import Document
 import chevron
 TEXT_SIMILARITY_MINIMUM = 0.75
 COLOR_SIMILARITY_MINIMUM = 0.833
@ -23,13 +23,12 @@ class Annotation:
    """A PDF annotation object"""
    file: str
    colors: tuple[float, float, float] = field(default_factory=lambda: (0.0, 0.0, 0.0))
    content: str = ""
    page: int = 0
    tag: str = ""
    text: str = ""
    type: str = "Highlight"
-    minimum_similarity_color: float = 1.0
+    text: str = ""
    content: str = ""
    page: int = 1
    colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
    tag: str = ""
    def format(self, formatting):
        """Return a formatted string of the annotation.
@ -38,15 +37,27 @@ class Annotation:
        formatted with the correct marker replacements and removals, ready
        for display or writing.
        """
-        data = {
+        output = formatting
-            "file": self.file,
+        replacements = {
-            "quote": self.text,
+            r"{quote}": self.text,
-            "note": self.content,
+            r"{note}": self.content,
-            "page": self.page,
+            r"{page}": str(self.page),
-            "tag": self.tag,
+            r"{newline}": "\n",
-            "type": self.type,
+            r"{tag}": self.tag,
        }
-        return chevron.render(formatting, data)
+        pattern = re.compile(
            "|".join(
                [re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
            ),
            flags=re.DOTALL,
        )
        patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
        patt_note_container = re.compile(r"{%note_container(.*?)%}")
        patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
        output = patt_quote_container.sub(r"\1" if self.text else "", output)
        output = patt_note_container.sub(r"\1" if self.content else "", output)
        output = patt_tag_container.sub(r"\1" if self.tag else "", output)
        return pattern.sub(lambda x: replacements[x.group(0)], output)
    @property
    def colorname(self):
@ -56,16 +67,15 @@ class Annotation:
        using euclidian distance between the two color vectors.
        """
        annot_colors = (
-            self.colors or (0.0, 0.0, 0.0)
+            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
        )
        nearest = None
        minimum_similarity = (
            papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0
        )
        minimum_similarity = self.minimum_similarity_color
        for name, values in COLORS.items():
            similarity_ratio = self._color_similarity_ratio(values, annot_colors)
-            if similarity_ratio >= minimum_similarity:
+            if similarity_ratio > minimum_similarity:
                minimum_similarity = similarity_ratio
                nearest = name
        return nearest
--- a/papis_extract/extractor.py
+++ b/papis_extract/extractor.py
@ -1,53 +1,17 @@
 import re
 from pathlib import Path
 from typing import Any, Optional
 import Levenshtein
 import magic
 import fitz_new as fitz
 import papis.logging
 import papis.config
 import papis.document
 from papis.document import Document
-from papis_extract.annotation_data import Annotation, AnnotatedDocument
+from papis_extract.annotation_data import Annotation
 logger = papis.logging.get_logger(__name__)
-def start(
+def start(filename: Path) -> list[Annotation]:
    documents: list[Document],
 ) -> list[AnnotatedDocument]:
    """Extract all annotations from passed documents.
    Returns all annotations contained in the papis
    documents passed in.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                break
            found_pdf = True
            try:
                annotations.extend(extract(fname))
            except fitz.FileDataError as e:
                print(f"File structure errors for {file}.\n{e}")
        if not found_pdf:
            # have to remove curlys or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
        output.append(AnnotatedDocument(doc, annotations))
    return output
 def extract(filename: Path) -> list[Annotation]:
    """Extract annotations from a file.
    Returns all readable annotations contained in the file
@ -60,16 +24,11 @@ def extract(filename: Path) -> list[Annotation]:
                quote, note = _retrieve_annotation_content(page, annot)
                if not quote and not note:
                    continue
                col = (
                    annot.colors.get("fill")
                    or annot.colors.get("stroke")
                    or (0.0, 0.0, 0.0)
                )
                a = Annotation(
                    file=str(filename),
                    text=quote or "",
                    content=note or "",
-                    colors=col,
+                    colors=annot.colors,
                    type=annot.type[1],
                    page=(page.number or 0) + 1,
                )
@ -82,19 +41,6 @@ def extract(filename: Path) -> list[Annotation]:
    return annotations
 def is_pdf(fname: Path) -> bool:
    return magic.from_file(fname, mime=True) == "application/pdf"
 def _is_file_processable(fname: Path) -> bool:
    if not fname.is_file():
        logger.error(f"File {str(fname)} not readable.")
        return False
    if not is_pdf(fname):
        return False
    return True
 def _tag_from_colorname(colorname: str) -> str:
    color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
    if not color_mapping:
--- a/poetry.lock
+++ b/poetry.lock
@ -147,17 +147,6 @@ files = [
    {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
 ]
 [[package]]
 name = "chevron"
 version = "0.14.0"
 description = "Mustache templating language renderer"
 optional = false
 python-versions = "*"
 files = [
    {file = "chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443"},
    {file = "chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf"},
 ]
 [[package]]
 name = "click"
 version = "8.1.7"
@ -991,4 +980,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "a3af36ed2941235df158c20ba9b66bdf5a0af0554235fd004ec77c3e88def3c3"
+content-hash = "d519605837788792d06ffc7bca7a92b315612ca6052227c53c558ec49dffec9f"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -14,7 +14,6 @@ papis = "^0.13"
 click = "^8.1.7"
 whoosh = "^2.7.4"
 python-magic = "^0.4.27"
 chevron = "^0.14.0"
 [tool.poetry.plugins."papis.command"]
 extract = "papis_extract:main"
--- a/tests/test_annotation.py
+++ b/tests/test_annotation.py
@ -1,51 +1,7 @@
 import pytest
 from papis_extract.annotation_data import Annotation
-@pytest.mark.parametrize(
+def test_matches_colorname_exact():
-    "fmt_string,expected",
+    sut = Annotation("testfile", colors={"stroke": (1.0, 0.0, 0.0)})
    [
        ("{{quote}}", "I am the text value"),
        (
            "> {{quote}}\n{{#note}}Note: {{note}}{{/note}}",
            "> I am the text value\nNote: Whereas I represent the note",
        ),
        (
            "{{#note}}Note: {{note}}{{/note}}{{#page}}, p. {{page}}{{/page}}",
            "Note: Whereas I represent the note",
        ),
    ],
 )
 def test_formatting(fmt_string, expected):
    sut = Annotation(
        "myfile",
        text="I am the text value",
        content="Whereas I represent the note",
    )
    assert sut.format(fmt_string) == expected
 def test_colorname_matches_exact():
    sut = Annotation(
        "testfile", colors=(1.0,0.0,0.0), minimum_similarity_color=1.0
    )
    c_name = sut.colorname
    assert c_name == "red"
 # TODO inject closeness value instead of relying on default
@pytest.mark.parametrize(
    "color_value",
    [
        (1.0, 0.0, 0.0),
        (0.9, 0.0, 0.0),
        (0.8, 0.0, 0.0),
        (0.7, 0.0, 0.0),
        (0.51, 0.0, 0.0),
    ],
 )
 def test_matches_inexact_colorname(color_value):
    sut = Annotation(
        "testfile", colors=color_value, minimum_similarity_color=0.833
    )
    c_name = sut.colorname
    assert c_name == "red"