refactor: Move formatting logic to formatters

Formatters (previously templates) were pure data containers before, containing the 'template' for how things should be formatted using mustache. The formatting itself was done a) in the exporters and b) in the annotations. This spread of formatting code has now been consolidated into the Formatter, which can coherently format a whole output instead of just individual annotations. A formatter contains references to all documents and their contained annotations and will format everything at once by default, but the formatting function can also be invoked with a reference to a specific annotated document to format only that one. This commit puts more separation between the concerns of exporter and formatter and makes formatting purely a concern of the formatters and annotation objects.
2023-09-19 21:43:19 +02:00 · 2023-09-19 21:43:19 +02:00 · 5a6d672c76
commit 5a6d672c76
parent 66f937e2a8
6 changed files with 138 additions and 101 deletions
--- a/README.md
+++ b/README.md
@ -177,8 +177,8 @@ Known issues to be fixed:
 - [x] Speed?
    - should be fine, on my machine (old i5 laptop) it takes around 90s for ~1000 documents with ~4000 annotations
 - [x] ensure all cmdline options do what they should
- [ ] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals
- [ ] docstrings, docstrings!
+- [x] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals
+- [x] docstrings, docstrings!
 - [ ] testing testing testing!!
    - [ ] refactor into some better abstractions (e.g. Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation)

@ -188,7 +188,7 @@ features to be implemented:
    - [x] static analysis (lint, typecheck etc) on pushes
    - [x] test pipeline on master pushes
    - [ ] release pipeline to pypi on tags
- [ ] add page number if available
+- [x] add page number if available
    - exists in Annotation, just need to place in output
 - [ ] show overall amount of extractions at the end
 - [ ] custom formatting decided by user
@ -233,14 +233,18 @@ I am not sure if there is much I can do about these issues for now.
 and for myself whenever I forget. The basic building blocks currently in here are three:

 - extractors
-: extract data from a source file attached to a papis document.
+: Extract data from a source file attached to a papis document.
+
+- annotations
+: The actual extracted blocks of text, containing some metadata 
+  info as well, such as their color, type, page.

 - exporters
-: put the extracted data somewhere like stdout or into your notes.
+: Put the extracted data somewhere. For now stdout or into your notes.

- templates
-: make sure the exporter saves the data according to your preferred layout, 
-such as a markdown syntax or csv-structure.
+- formatters
+: Make sure the exporter saves the data according to your preferred layout,
+  such as a markdown syntax or csv-structure.

 Splitting it into those three building blocks makes it easier to recombine them in any way,
 should someone want to save highlights as csv data in their notes, 
--- a/papis_extract/init.py
+++ b/papis_extract/init.py
@ -8,8 +8,7 @@ import papis.strings
 from papis.document import Document

 from papis_extract import extractor, exporter
-from papis_extract.annotation_data import AnnotatedDocument
-from papis_extract.templating import Csv, Markdown, Templating
+from papis_extract.formatter import MarkdownFormatter, Formatter

 logger = papis.logging.get_logger(__name__)

@ -39,8 +38,7 @@ papis.config.register_default_settings(DEFAULT_OPTIONS)
@click.option(
    "--manual/--no-manual",
    "-m",
-    help=
-    "Open each note in editor for manual editing after extracting its annotations.",
+    help="Open note in editor for manual editing after annotation extraction.",
 )
@click.option(
    "--template",
@ -82,23 +80,19 @@ def main(
        return

    if template == "csv":
-        template_type = Csv()
-    else:
-        template_type = Markdown()
-
-    run(documents, edit=manual, write=write, git=git, template=template_type)
+        raise NotImplementedError
+    run(documents, edit=manual, write=write, git=git, template=MarkdownFormatter())


 def run(
    documents: list[Document],
+    template: Formatter,
    edit: bool = False,
    write: bool = False,
    git: bool = False,
-    template: Templating = Markdown(),
 ) -> None:
-    doc_annotations: list[AnnotatedDocument] = extractor.start(documents)
-
+    template.annotated_docs = extractor.start(documents)
    if write:
-        exporter.to_notes(doc_annotations, template, edit=edit, git=git)
+        exporter.to_notes(template, edit=edit, git=git)
    else:
-        exporter.to_stdout(doc_annotations, template)
+        exporter.to_stdout(template)
--- a/papis_extract/annotation_data.py
+++ b/papis_extract/annotation_data.py
@ -5,8 +5,6 @@ import papis.config
 from papis.document import Document
 import chevron

-from papis_extract.templating import Templating
-
 TEXT_SIMILARITY_MINIMUM = 0.75
 COLOR_SIMILARITY_MINIMUM = 0.833

@ -36,7 +34,7 @@ class Annotation:
    type: str = "Highlight"
    minimum_similarity_color: float = 1.0

-    def format(self, template: Templating):
+    def format(self, template: str, doc: Document = Document()):
        """Return a formatted string of the annotation.

        Given a provided formatting pattern, this method returns the annotation
@ -50,8 +48,9 @@ class Annotation:
            "page": self.page,
            "tag": self.tag,
            "type": self.type,
+            "doc": doc,
        }
-        return chevron.render(template.string, data)
+        return chevron.render(template, data)

    @property
    def colorname(self):
@ -89,6 +88,7 @@ class AnnotatedDocument:
    """Contains all annotations belonging to a single papis document.

    Combines a document with a list of annotations which belong to it."""
+
    document: Document
    annotations: list[Annotation]

--- a/papis_extract/exporter.py
+++ b/papis_extract/exporter.py
@ -7,61 +7,33 @@ import papis.git
 import papis.config
 import Levenshtein

-from papis_extract.annotation_data import AnnotatedDocument
-from papis_extract.templating import Templating
+from papis_extract.formatter import Formatter

 logger = papis.logging.get_logger(__name__)


-def to_stdout(annots: list[AnnotatedDocument], template: Templating) -> None:
+def to_stdout(template: Formatter) -> None:
    """Pretty print annotations to stdout.

    Gives a nice human-readable representations of
    the annotations in somewhat of a list form.
    Not intended for machine-readability.
    """
-    if not annots:
-        return
-
-    last = annots[-1]
-    for entry in annots:
-        if not entry.annotations:
-            continue
-
-        title_decoration = (
-            f"{'=' * len(entry.document.get('title', ''))}   "
-            f"{'-' * len(entry.document.get('author', ''))}"
-        )
-        print(
-            f"{title_decoration}\n{papis.document.describe(entry.document)}\n{title_decoration}\n"
-        )
-        for a in entry.annotations:
-            print(a.format(template))
-
-        if entry != last:
-            print("\n")
+    output:str = template.execute()
+    print(output.rstrip('\n'))


-def to_notes(
-    annots: list[AnnotatedDocument], template: Templating, edit: bool, git: bool
-) -> None:
+def to_notes(template: Formatter, edit: bool, git: bool) -> None:
    """Write annotations into document notes.

    Permanently writes the given annotations into notes
    belonging to papis documents. Creates new notes for
    documents missing a note field or appends to existing.
    """
-    if not annots:
-        return
-
-    for entry in annots:
-        if not entry.annotations:
-            continue
-
-        formatted_annotations: list[str] = []
-        for a in entry.annotations:
-            formatted_annotations.append(a.format(template))
-
+    annotated_docs = template.annotated_docs
+    for entry in annotated_docs:
+        formatted_annotations = template.execute(entry).split("\n")
+        if formatted_annotations:
            _add_annots_to_note(entry.document, formatted_annotations)

        if edit:
@ -130,7 +102,9 @@ def _drop_existing_annotations(
    remaining: list[str] = []
    for an in formatted_annotations:
        an_split = an.splitlines()
-        if not _test_similarity(an_split[0], file_lines, minimum_similarity):
+        if an_split and not _test_similarity(
+            an_split[0], file_lines, minimum_similarity
+        ):
            remaining.append(an)

    return remaining
--- a/papis_extract/formatter.py
+++ b/papis_extract/formatter.py
@ -0,0 +1,100 @@
+from dataclasses import dataclass, field
+from typing import Protocol
+
+from papis_extract.annotation_data import AnnotatedDocument
+
+
+@dataclass
+class Formatter(Protocol):
+    annotated_docs: list[AnnotatedDocument]
+    header: str
+    string: str
+    footer: str
+
+    def execute(self, doc: AnnotatedDocument | None = None) -> str:
+        raise NotImplementedError
+
+
+@dataclass
+class MarkdownFormatter:
+    annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list())
+    header: str = ""
+    string: str = (
+        "{{#tag}}#{{tag}}\n{{/tag}}"
+        "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n"
+        "{{#note}}  NOTE: {{note}}{{/note}}"
+    )
+    footer: str = ""
+
+    def execute(self, doc: AnnotatedDocument | None = None) -> str:
+        output = ""
+        documents = self.annotated_docs if doc is None else [doc]
+        last = documents[-1]
+        for entry in documents:
+            if not entry.annotations:
+                continue
+
+            title_decoration = (
+                f"{'=' * len(entry.document.get('title', ''))}   "
+                f"{'-' * len(entry.document.get('author', ''))}"
+            )
+            output += (
+                f"{title_decoration}\n"
+                f"{entry.document['title']} - {entry.document['author']}\n"
+                f"{title_decoration}\n\n"
+            )
+            for a in entry.annotations:
+                output += a.format(self.string)
+
+            if entry != last:
+                print(f"entry: {entry}, last: {last}")
+                output += "\n\n\n"
+
+        return output
+
+@dataclass
+class CountFormatter:
+    annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list())
+    header: str = ""
+    string: str = ""
+    footer: str = ""
+
+    def execute(self, doc: AnnotatedDocument | None = None) -> str:
+        output = ""
+        documents = self.annotated_docs if doc is None else [doc]
+        last = documents[-1]
+        for entry in documents:
+            if not entry.annotations:
+                continue
+
+            title_decoration = (
+                f"{'=' * len(entry.document.get('title', ''))}   "
+                f"{'-' * len(entry.document.get('author', ''))}"
+            )
+            output += (
+                f"{title_decoration}\n"
+                f"{entry.document['title']} - {entry.document['author']}\n"
+                f"{title_decoration}\n\n"
+            )
+            for a in entry.annotations:
+                output += a.format(self.string)
+
+            if entry != last:
+                print(f"entry: {entry}, last: {last}")
+                output += "\n\n\n"
+
+        return output
+
+@dataclass
+class CsvFormatter:
+    header: str = "type, tag, page, quote, note, file"
+    string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}"
+    footer: str = ""
+
+
+@dataclass
+class CustomFormatter:
+    def __init__(self, header: str = "", string: str = "", footer: str = "") -> None:
+        self.header = header
+        self.string = string
+        self.footer = footer
--- a/papis_extract/templating.py
+++ b/papis_extract/templating.py
@ -1,35 +0,0 @@
-from dataclasses import dataclass
-from typing import Protocol
-
-
-@dataclass
-class Templating(Protocol):
-    header: str
-    string: str
-    footer: str
-
-
-@dataclass
-class Markdown:
-    header: str = ""
-    string: str = (
-        "{{#tag}}#{{tag}}\n{{/tag}}"
-        "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n"
-        "{{#note}}  NOTE: {{note}}{{/note}}"
-    )
-    footer: str = ""
-
-
-@dataclass
-class Csv:
-    header: str = "type, tag, page, quote, note, file"
-    string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}"
-    footer: str = ""
-
-
-@dataclass
-class Custom:
-    def __init__(self, header: str = "", string: str = "", footer: str = "") -> None:
-        self.header = header
-        self.string = string
-        self.footer = footer