refactor: Move formatting logic to formatters

Formatters (previously templates) were pure data containers before,
continating the 'template' for how things should be formatted using
mustache. The formatting would be done a) in the exporters and b) in the
annotations.

This spread of formatting has now been consolidated into the Formatter,
which fixes the overall spread of formatting code and now can coherently
format a whole output instead of just individual annotations.

A formatter contains references to all documents and contained
annotations and will format everything at once by default, but the
formatting function can be invoked with reference to a specific
annotated document to only format that.

This commit should put more separation into the concerns of exporter and
formatter and made formatting a concern purely of the formatters and
annotation objects.
This commit is contained in:
Marty Oehme 2023-09-19 21:43:19 +02:00
parent 66f937e2a8
commit 5a6d672c76
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
6 changed files with 138 additions and 101 deletions

View file

@ -177,8 +177,8 @@ Known issues to be fixed:
- [x] Speed? - [x] Speed?
- should be fine, on my machine (old i5 laptop) it takes around 90s for ~1000 documents with ~4000 annotations - should be fine, on my machine (old i5 laptop) it takes around 90s for ~1000 documents with ~4000 annotations
- [x] ensure all cmdline options do what they should - [x] ensure all cmdline options do what they should
- [ ] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals - [x] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals
- [ ] docstrings, docstrings! - [x] docstrings, docstrings!
- [ ] testing testing testing!! - [ ] testing testing testing!!
- [ ] refactor into some better abstractions (e.g. Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation) - [ ] refactor into some better abstractions (e.g. Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation)
@ -188,7 +188,7 @@ features to be implemented:
- [x] static analysis (lint, typecheck etc) on pushes - [x] static analysis (lint, typecheck etc) on pushes
- [x] test pipeline on master pushes - [x] test pipeline on master pushes
- [ ] release pipeline to pypi on tags - [ ] release pipeline to pypi on tags
- [ ] add page number if available - [x] add page number if available
- exists in Annotation, just need to place in output - exists in Annotation, just need to place in output
- [ ] show overall amount of extractions at the end - [ ] show overall amount of extractions at the end
- [ ] custom formatting decided by user - [ ] custom formatting decided by user
@ -233,14 +233,18 @@ I am not sure if there is much I can do about these issues for now.
and for myself whenever I forget. The basic building blocks currently in here are three: and for myself whenever I forget. The basic building blocks currently in here are three:
- extractors - extractors
: extract data from a source file attached to a papis document. : Extract data from a source file attached to a papis document.
- annotations
: The actual extracted blocks of text, containing some metadata
info as well, such as their color, type, page.
- exporters - exporters
: put the extracted data somewhere like stdout or into your notes. : Put the extracted data somewhere. For now stdout or into your notes.
- templates - formatters
: make sure the exporter saves the data according to your preferred layout, : Make sure the exporter saves the data according to your preferred layout,
such as a markdown syntax or csv-structure. such as a markdown syntax or csv-structure.
Splitting it into those three building blocks makes it easier to recombine them in any way, Splitting it into those three building blocks makes it easier to recombine them in any way,
should someone want to save highlights as csv data in their notes, should someone want to save highlights as csv data in their notes,

View file

@ -8,8 +8,7 @@ import papis.strings
from papis.document import Document from papis.document import Document
from papis_extract import extractor, exporter from papis_extract import extractor, exporter
from papis_extract.annotation_data import AnnotatedDocument from papis_extract.formatter import MarkdownFormatter, Formatter
from papis_extract.templating import Csv, Markdown, Templating
logger = papis.logging.get_logger(__name__) logger = papis.logging.get_logger(__name__)
@ -39,8 +38,7 @@ papis.config.register_default_settings(DEFAULT_OPTIONS)
@click.option( @click.option(
"--manual/--no-manual", "--manual/--no-manual",
"-m", "-m",
help= help="Open note in editor for manual editing after annotation extraction.",
"Open each note in editor for manual editing after extracting its annotations.",
) )
@click.option( @click.option(
"--template", "--template",
@ -82,23 +80,19 @@ def main(
return return
if template == "csv": if template == "csv":
template_type = Csv() raise NotImplementedError
else: run(documents, edit=manual, write=write, git=git, template=MarkdownFormatter())
template_type = Markdown()
run(documents, edit=manual, write=write, git=git, template=template_type)
def run( def run(
documents: list[Document], documents: list[Document],
template: Formatter,
edit: bool = False, edit: bool = False,
write: bool = False, write: bool = False,
git: bool = False, git: bool = False,
template: Templating = Markdown(),
) -> None: ) -> None:
doc_annotations: list[AnnotatedDocument] = extractor.start(documents) template.annotated_docs = extractor.start(documents)
if write: if write:
exporter.to_notes(doc_annotations, template, edit=edit, git=git) exporter.to_notes(template, edit=edit, git=git)
else: else:
exporter.to_stdout(doc_annotations, template) exporter.to_stdout(template)

View file

@ -5,8 +5,6 @@ import papis.config
from papis.document import Document from papis.document import Document
import chevron import chevron
from papis_extract.templating import Templating
TEXT_SIMILARITY_MINIMUM = 0.75 TEXT_SIMILARITY_MINIMUM = 0.75
COLOR_SIMILARITY_MINIMUM = 0.833 COLOR_SIMILARITY_MINIMUM = 0.833
@ -36,7 +34,7 @@ class Annotation:
type: str = "Highlight" type: str = "Highlight"
minimum_similarity_color: float = 1.0 minimum_similarity_color: float = 1.0
def format(self, template: Templating): def format(self, template: str, doc: Document = Document()):
"""Return a formatted string of the annotation. """Return a formatted string of the annotation.
Given a provided formatting pattern, this method returns the annotation Given a provided formatting pattern, this method returns the annotation
@ -50,8 +48,9 @@ class Annotation:
"page": self.page, "page": self.page,
"tag": self.tag, "tag": self.tag,
"type": self.type, "type": self.type,
"doc": doc,
} }
return chevron.render(template.string, data) return chevron.render(template, data)
@property @property
def colorname(self): def colorname(self):
@ -89,6 +88,7 @@ class AnnotatedDocument:
"""Contains all annotations belonging to a single papis document. """Contains all annotations belonging to a single papis document.
Combines a document with a list of annotations which belong to it.""" Combines a document with a list of annotations which belong to it."""
document: Document document: Document
annotations: list[Annotation] annotations: list[Annotation]

View file

@ -7,61 +7,33 @@ import papis.git
import papis.config import papis.config
import Levenshtein import Levenshtein
from papis_extract.annotation_data import AnnotatedDocument from papis_extract.formatter import Formatter
from papis_extract.templating import Templating
logger = papis.logging.get_logger(__name__) logger = papis.logging.get_logger(__name__)
def to_stdout(annots: list[AnnotatedDocument], template: Templating) -> None: def to_stdout(template: Formatter) -> None:
"""Pretty print annotations to stdout. """Pretty print annotations to stdout.
Gives a nice human-readable representations of Gives a nice human-readable representations of
the annotations in somewhat of a list form. the annotations in somewhat of a list form.
Not intended for machine-readability. Not intended for machine-readability.
""" """
if not annots: output:str = template.execute()
return print(output.rstrip('\n'))
last = annots[-1]
for entry in annots:
if not entry.annotations:
continue
title_decoration = (
f"{'=' * len(entry.document.get('title', ''))} "
f"{'-' * len(entry.document.get('author', ''))}"
)
print(
f"{title_decoration}\n{papis.document.describe(entry.document)}\n{title_decoration}\n"
)
for a in entry.annotations:
print(a.format(template))
if entry != last:
print("\n")
def to_notes( def to_notes(template: Formatter, edit: bool, git: bool) -> None:
annots: list[AnnotatedDocument], template: Templating, edit: bool, git: bool
) -> None:
"""Write annotations into document notes. """Write annotations into document notes.
Permanently writes the given annotations into notes Permanently writes the given annotations into notes
belonging to papis documents. Creates new notes for belonging to papis documents. Creates new notes for
documents missing a note field or appends to existing. documents missing a note field or appends to existing.
""" """
if not annots: annotated_docs = template.annotated_docs
return for entry in annotated_docs:
formatted_annotations = template.execute(entry).split("\n")
for entry in annots: if formatted_annotations:
if not entry.annotations:
continue
formatted_annotations: list[str] = []
for a in entry.annotations:
formatted_annotations.append(a.format(template))
_add_annots_to_note(entry.document, formatted_annotations) _add_annots_to_note(entry.document, formatted_annotations)
if edit: if edit:
@ -130,7 +102,9 @@ def _drop_existing_annotations(
remaining: list[str] = [] remaining: list[str] = []
for an in formatted_annotations: for an in formatted_annotations:
an_split = an.splitlines() an_split = an.splitlines()
if not _test_similarity(an_split[0], file_lines, minimum_similarity): if an_split and not _test_similarity(
an_split[0], file_lines, minimum_similarity
):
remaining.append(an) remaining.append(an)
return remaining return remaining

100
papis_extract/formatter.py Normal file
View file

@ -0,0 +1,100 @@
from dataclasses import dataclass, field
from typing import Protocol
from papis_extract.annotation_data import AnnotatedDocument
@dataclass
class Formatter(Protocol):
annotated_docs: list[AnnotatedDocument]
header: str
string: str
footer: str
def execute(self, doc: AnnotatedDocument | None = None) -> str:
raise NotImplementedError
@dataclass
class MarkdownFormatter:
annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list())
header: str = ""
string: str = (
"{{#tag}}#{{tag}}\n{{/tag}}"
"{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n"
"{{#note}} NOTE: {{note}}{{/note}}"
)
footer: str = ""
def execute(self, doc: AnnotatedDocument | None = None) -> str:
output = ""
documents = self.annotated_docs if doc is None else [doc]
last = documents[-1]
for entry in documents:
if not entry.annotations:
continue
title_decoration = (
f"{'=' * len(entry.document.get('title', ''))} "
f"{'-' * len(entry.document.get('author', ''))}"
)
output += (
f"{title_decoration}\n"
f"{entry.document['title']} - {entry.document['author']}\n"
f"{title_decoration}\n\n"
)
for a in entry.annotations:
output += a.format(self.string)
if entry != last:
print(f"entry: {entry}, last: {last}")
output += "\n\n\n"
return output
@dataclass
class CountFormatter:
annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list())
header: str = ""
string: str = ""
footer: str = ""
def execute(self, doc: AnnotatedDocument | None = None) -> str:
output = ""
documents = self.annotated_docs if doc is None else [doc]
last = documents[-1]
for entry in documents:
if not entry.annotations:
continue
title_decoration = (
f"{'=' * len(entry.document.get('title', ''))} "
f"{'-' * len(entry.document.get('author', ''))}"
)
output += (
f"{title_decoration}\n"
f"{entry.document['title']} - {entry.document['author']}\n"
f"{title_decoration}\n\n"
)
for a in entry.annotations:
output += a.format(self.string)
if entry != last:
print(f"entry: {entry}, last: {last}")
output += "\n\n\n"
return output
@dataclass
class CsvFormatter:
header: str = "type, tag, page, quote, note, file"
string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}"
footer: str = ""
@dataclass
class CustomFormatter:
def __init__(self, header: str = "", string: str = "", footer: str = "") -> None:
self.header = header
self.string = string
self.footer = footer

View file

@ -1,35 +0,0 @@
from dataclasses import dataclass
from typing import Protocol
@dataclass
class Templating(Protocol):
header: str
string: str
footer: str
@dataclass
class Markdown:
header: str = ""
string: str = (
"{{#tag}}#{{tag}}\n{{/tag}}"
"{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n"
"{{#note}} NOTE: {{note}}{{/note}}"
)
footer: str = ""
@dataclass
class Csv:
header: str = "type, tag, page, quote, note, file"
string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}"
footer: str = ""
@dataclass
class Custom:
def __init__(self, header: str = "", string: str = "", footer: str = "") -> None:
self.header = header
self.string = string
self.footer = footer