refactor: Move formatting logic to formatters

Formatters (previously templates) were pure data containers before, continating the 'template' for how things should be formatted using mustache. The formatting would be done a) in the exporters and b) in the annotations. This spread of formatting has now been consolidated into the Formatter, which fixes the overall spread of formatting code and now can coherently format a whole output instead of just individual annotations. A formatter contains references to all documents and contained annotations and will format everything at once by default, but the formatting function can be invoked with reference to a specific annotated document to only format that. This commit should put more separation into the concerns of exporter and formatter and made formatting a concern purely of the formatters and annotation objects.
2023-09-19 21:43:19 +02:00 · 2023-09-19 21:43:19 +02:00 · 5a6d672c76
commit 5a6d672c76
parent 66f937e2a8
6 changed files with 138 additions and 101 deletions
--- a/README.md
+++ b/README.md
@ -177,8 +177,8 @@ Known issues to be fixed:
 - [x] Speed?
    - should be fine, on my machine (old i5 laptop) it takes around 90s for ~1000 documents with ~4000 annotations
 - [x] ensure all cmdline options do what they should
- [ ] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals
+- [x] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals
- [ ] docstrings, docstrings!
+- [x] docstrings, docstrings!
 - [ ] testing testing testing!!
    - [ ] refactor into some better abstractions (e.g. Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation)
@ -188,7 +188,7 @@ features to be implemented:
    - [x] static analysis (lint, typecheck etc) on pushes
    - [x] test pipeline on master pushes
    - [ ] release pipeline to pypi on tags
- [ ] add page number if available
+- [x] add page number if available
    - exists in Annotation, just need to place in output
 - [ ] show overall amount of extractions at the end
 - [ ] custom formatting decided by user
@ -233,13 +233,17 @@ I am not sure if there is much I can do about these issues for now.
 and for myself whenever I forget. The basic building blocks currently in here are three:
 - extractors
-: extract data from a source file attached to a papis document.
+: Extract data from a source file attached to a papis document.
 - annotations
 : The actual extracted blocks of text, containing some metadata 
  info as well, such as their color, type, page.
 - exporters
-: put the extracted data somewhere like stdout or into your notes.
+: Put the extracted data somewhere. For now stdout or into your notes.
- templates
+- formatters
-: make sure the exporter saves the data according to your preferred layout, 
+: Make sure the exporter saves the data according to your preferred layout,
  such as a markdown syntax or csv-structure.
 Splitting it into those three building blocks makes it easier to recombine them in any way,
--- a/papis_extract/init.py
+++ b/papis_extract/init.py
@ -8,8 +8,7 @@ import papis.strings
 from papis.document import Document
 from papis_extract import extractor, exporter
-from papis_extract.annotation_data import AnnotatedDocument
+from papis_extract.formatter import MarkdownFormatter, Formatter
 from papis_extract.templating import Csv, Markdown, Templating
 logger = papis.logging.get_logger(__name__)
@ -39,8 +38,7 @@ papis.config.register_default_settings(DEFAULT_OPTIONS)
@click.option(
    "--manual/--no-manual",
    "-m",
-    help=
+    help="Open note in editor for manual editing after annotation extraction.",
    "Open each note in editor for manual editing after extracting its annotations.",
 )
@click.option(
    "--template",
@ -82,23 +80,19 @@ def main(
        return
    if template == "csv":
-        template_type = Csv()
+        raise NotImplementedError
-    else:
+    run(documents, edit=manual, write=write, git=git, template=MarkdownFormatter())
        template_type = Markdown()
    run(documents, edit=manual, write=write, git=git, template=template_type)
 def run(
    documents: list[Document],
    template: Formatter,
    edit: bool = False,
    write: bool = False,
    git: bool = False,
    template: Templating = Markdown(),
 ) -> None:
-    doc_annotations: list[AnnotatedDocument] = extractor.start(documents)
+    template.annotated_docs = extractor.start(documents)
    if write:
-        exporter.to_notes(doc_annotations, template, edit=edit, git=git)
+        exporter.to_notes(template, edit=edit, git=git)
    else:
-        exporter.to_stdout(doc_annotations, template)
+        exporter.to_stdout(template)
--- a/papis_extract/annotation_data.py
+++ b/papis_extract/annotation_data.py
@ -5,8 +5,6 @@ import papis.config
 from papis.document import Document
 import chevron
 from papis_extract.templating import Templating
 TEXT_SIMILARITY_MINIMUM = 0.75
 COLOR_SIMILARITY_MINIMUM = 0.833
@ -36,7 +34,7 @@ class Annotation:
    type: str = "Highlight"
    minimum_similarity_color: float = 1.0
-    def format(self, template: Templating):
+    def format(self, template: str, doc: Document = Document()):
        """Return a formatted string of the annotation.
        Given a provided formatting pattern, this method returns the annotation
@ -50,8 +48,9 @@ class Annotation:
            "page": self.page,
            "tag": self.tag,
            "type": self.type,
            "doc": doc,
        }
-        return chevron.render(template.string, data)
+        return chevron.render(template, data)
    @property
    def colorname(self):
@ -89,6 +88,7 @@ class AnnotatedDocument:
    """Contains all annotations belonging to a single papis document.
    Combines a document with a list of annotations which belong to it."""
    document: Document
    annotations: list[Annotation]
--- a/papis_extract/exporter.py
+++ b/papis_extract/exporter.py
@ -7,61 +7,33 @@ import papis.git
 import papis.config
 import Levenshtein
-from papis_extract.annotation_data import AnnotatedDocument
+from papis_extract.formatter import Formatter
 from papis_extract.templating import Templating
 logger = papis.logging.get_logger(__name__)
-def to_stdout(annots: list[AnnotatedDocument], template: Templating) -> None:
+def to_stdout(template: Formatter) -> None:
    """Pretty print annotations to stdout.
    Gives a nice human-readable representations of
    the annotations in somewhat of a list form.
    Not intended for machine-readability.
    """
-    if not annots:
+    output:str = template.execute()
-        return
+    print(output.rstrip('\n'))
    last = annots[-1]
    for entry in annots:
        if not entry.annotations:
            continue
        title_decoration = (
            f"{'=' * len(entry.document.get('title', ''))}   "
            f"{'-' * len(entry.document.get('author', ''))}"
        )
        print(
            f"{title_decoration}\n{papis.document.describe(entry.document)}\n{title_decoration}\n"
        )
        for a in entry.annotations:
            print(a.format(template))
        if entry != last:
            print("\n")
-def to_notes(
+def to_notes(template: Formatter, edit: bool, git: bool) -> None:
    annots: list[AnnotatedDocument], template: Templating, edit: bool, git: bool
 ) -> None:
    """Write annotations into document notes.
    Permanently writes the given annotations into notes
    belonging to papis documents. Creates new notes for
    documents missing a note field or appends to existing.
    """
-    if not annots:
+    annotated_docs = template.annotated_docs
-        return
+    for entry in annotated_docs:
-
+        formatted_annotations = template.execute(entry).split("\n")
-    for entry in annots:
+        if formatted_annotations:
        if not entry.annotations:
            continue
        formatted_annotations: list[str] = []
        for a in entry.annotations:
            formatted_annotations.append(a.format(template))
            _add_annots_to_note(entry.document, formatted_annotations)
        if edit:
@ -130,7 +102,9 @@ def _drop_existing_annotations(
    remaining: list[str] = []
    for an in formatted_annotations:
        an_split = an.splitlines()
-        if not _test_similarity(an_split[0], file_lines, minimum_similarity):
+        if an_split and not _test_similarity(
            an_split[0], file_lines, minimum_similarity
        ):
            remaining.append(an)
    return remaining
--- a/papis_extract/formatter.py
+++ b/papis_extract/formatter.py
@ -0,0 +1,100 @@
 from dataclasses import dataclass, field
 from typing import Protocol
 from papis_extract.annotation_data import AnnotatedDocument
@dataclass
 class Formatter(Protocol):
    annotated_docs: list[AnnotatedDocument]
    header: str
    string: str
    footer: str
    def execute(self, doc: AnnotatedDocument | None = None) -> str:
        raise NotImplementedError
@dataclass
 class MarkdownFormatter:
    annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list())
    header: str = ""
    string: str = (
        "{{#tag}}#{{tag}}\n{{/tag}}"
        "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n"
        "{{#note}}  NOTE: {{note}}{{/note}}"
    )
    footer: str = ""
    def execute(self, doc: AnnotatedDocument | None = None) -> str:
        output = ""
        documents = self.annotated_docs if doc is None else [doc]
        last = documents[-1]
        for entry in documents:
            if not entry.annotations:
                continue
            title_decoration = (
                f"{'=' * len(entry.document.get('title', ''))}   "
                f"{'-' * len(entry.document.get('author', ''))}"
            )
            output += (
                f"{title_decoration}\n"
                f"{entry.document['title']} - {entry.document['author']}\n"
                f"{title_decoration}\n\n"
            )
            for a in entry.annotations:
                output += a.format(self.string)
            if entry != last:
                print(f"entry: {entry}, last: {last}")
                output += "\n\n\n"
        return output
@dataclass
 class CountFormatter:
    annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list())
    header: str = ""
    string: str = ""
    footer: str = ""
    def execute(self, doc: AnnotatedDocument | None = None) -> str:
        output = ""
        documents = self.annotated_docs if doc is None else [doc]
        last = documents[-1]
        for entry in documents:
            if not entry.annotations:
                continue
            title_decoration = (
                f"{'=' * len(entry.document.get('title', ''))}   "
                f"{'-' * len(entry.document.get('author', ''))}"
            )
            output += (
                f"{title_decoration}\n"
                f"{entry.document['title']} - {entry.document['author']}\n"
                f"{title_decoration}\n\n"
            )
            for a in entry.annotations:
                output += a.format(self.string)
            if entry != last:
                print(f"entry: {entry}, last: {last}")
                output += "\n\n\n"
        return output
@dataclass
 class CsvFormatter:
    header: str = "type, tag, page, quote, note, file"
    string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}"
    footer: str = ""
@dataclass
 class CustomFormatter:
    def __init__(self, header: str = "", string: str = "", footer: str = "") -> None:
        self.header = header
        self.string = string
        self.footer = footer
--- a/papis_extract/templating.py
+++ b/papis_extract/templating.py
@ -1,35 +0,0 @@
 from dataclasses import dataclass
 from typing import Protocol
@dataclass
 class Templating(Protocol):
    header: str
    string: str
    footer: str
@dataclass
 class Markdown:
    header: str = ""
    string: str = (
        "{{#tag}}#{{tag}}\n{{/tag}}"
        "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n"
        "{{#note}}  NOTE: {{note}}{{/note}}"
    )
    footer: str = ""
@dataclass
 class Csv:
    header: str = "type, tag, page, quote, note, file"
    string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}"
    footer: str = ""
@dataclass
 class Custom:
    def __init__(self, header: str = "", string: str = "", footer: str = "") -> None:
        self.header = header
        self.string = string
        self.footer = footer