refactor: Move formatting logic to formatters
Formatters (previously templates) were pure data containers before, containing the 'template' for how things should be formatted using mustache. The formatting would be done a) in the exporters and b) in the annotations. This spread of formatting has now been consolidated into the Formatter, which fixes the overall spread of formatting code and can now coherently format a whole output instead of just individual annotations. A formatter contains references to all documents and their contained annotations and will format everything at once by default, but the formatting function can be invoked with reference to a specific annotated document to only format that. This commit should put more separation between the concerns of exporter and formatter and make formatting purely a concern of the formatters and annotation objects.
This commit is contained in:
parent
66f937e2a8
commit
5a6d672c76
6 changed files with 138 additions and 101 deletions
18
README.md
18
README.md
|
@ -177,8 +177,8 @@ Known issues to be fixed:
|
||||||
- [x] Speed?
|
- [x] Speed?
|
||||||
- should be fine, on my machine (old i5 laptop) it takes around 90s for ~1000 documents with ~4000 annotations
|
- should be fine, on my machine (old i5 laptop) it takes around 90s for ~1000 documents with ~4000 annotations
|
||||||
- [x] ensure all cmdline options do what they should
|
- [x] ensure all cmdline options do what they should
|
||||||
- [ ] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals
|
- [x] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals
|
||||||
- [ ] docstrings, docstrings!
|
- [x] docstrings, docstrings!
|
||||||
- [ ] testing testing testing!!
|
- [ ] testing testing testing!!
|
||||||
- [ ] refactor into some better abstractions (e.g. Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation)
|
- [ ] refactor into some better abstractions (e.g. Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation)
|
||||||
|
|
||||||
|
@ -188,7 +188,7 @@ features to be implemented:
|
||||||
- [x] static analysis (lint, typecheck etc) on pushes
|
- [x] static analysis (lint, typecheck etc) on pushes
|
||||||
- [x] test pipeline on master pushes
|
- [x] test pipeline on master pushes
|
||||||
- [ ] release pipeline to pypi on tags
|
- [ ] release pipeline to pypi on tags
|
||||||
- [ ] add page number if available
|
- [x] add page number if available
|
||||||
- exists in Annotation, just need to place in output
|
- exists in Annotation, just need to place in output
|
||||||
- [ ] show overall amount of extractions at the end
|
- [ ] show overall amount of extractions at the end
|
||||||
- [ ] custom formatting decided by user
|
- [ ] custom formatting decided by user
|
||||||
|
@ -233,13 +233,17 @@ I am not sure if there is much I can do about these issues for now.
|
||||||
and for myself whenever I forget. The basic building blocks currently in here are three:
|
and for myself whenever I forget. The basic building blocks currently in here are three:
|
||||||
|
|
||||||
- extractors
|
- extractors
|
||||||
: extract data from a source file attached to a papis document.
|
: Extract data from a source file attached to a papis document.
|
||||||
|
|
||||||
|
- annotations
|
||||||
|
: The actual extracted blocks of text, containing some metadata
|
||||||
|
info as well, such as their color, type, page.
|
||||||
|
|
||||||
- exporters
|
- exporters
|
||||||
: put the extracted data somewhere like stdout or into your notes.
|
: Put the extracted data somewhere. For now stdout or into your notes.
|
||||||
|
|
||||||
- templates
|
- formatters
|
||||||
: make sure the exporter saves the data according to your preferred layout,
|
: Make sure the exporter saves the data according to your preferred layout,
|
||||||
such as a markdown syntax or csv-structure.
|
such as a markdown syntax or csv-structure.
|
||||||
|
|
||||||
Splitting it into those three building blocks makes it easier to recombine them in any way,
|
Splitting it into those three building blocks makes it easier to recombine them in any way,
|
||||||
|
|
|
@ -8,8 +8,7 @@ import papis.strings
|
||||||
from papis.document import Document
|
from papis.document import Document
|
||||||
|
|
||||||
from papis_extract import extractor, exporter
|
from papis_extract import extractor, exporter
|
||||||
from papis_extract.annotation_data import AnnotatedDocument
|
from papis_extract.formatter import MarkdownFormatter, Formatter
|
||||||
from papis_extract.templating import Csv, Markdown, Templating
|
|
||||||
|
|
||||||
logger = papis.logging.get_logger(__name__)
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
@ -39,8 +38,7 @@ papis.config.register_default_settings(DEFAULT_OPTIONS)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--manual/--no-manual",
|
"--manual/--no-manual",
|
||||||
"-m",
|
"-m",
|
||||||
help=
|
help="Open note in editor for manual editing after annotation extraction.",
|
||||||
"Open each note in editor for manual editing after extracting its annotations.",
|
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--template",
|
"--template",
|
||||||
|
@ -82,23 +80,19 @@ def main(
|
||||||
return
|
return
|
||||||
|
|
||||||
if template == "csv":
|
if template == "csv":
|
||||||
template_type = Csv()
|
raise NotImplementedError
|
||||||
else:
|
run(documents, edit=manual, write=write, git=git, template=MarkdownFormatter())
|
||||||
template_type = Markdown()
|
|
||||||
|
|
||||||
run(documents, edit=manual, write=write, git=git, template=template_type)
|
|
||||||
|
|
||||||
|
|
||||||
def run(
|
def run(
|
||||||
documents: list[Document],
|
documents: list[Document],
|
||||||
|
template: Formatter,
|
||||||
edit: bool = False,
|
edit: bool = False,
|
||||||
write: bool = False,
|
write: bool = False,
|
||||||
git: bool = False,
|
git: bool = False,
|
||||||
template: Templating = Markdown(),
|
|
||||||
) -> None:
|
) -> None:
|
||||||
doc_annotations: list[AnnotatedDocument] = extractor.start(documents)
|
template.annotated_docs = extractor.start(documents)
|
||||||
|
|
||||||
if write:
|
if write:
|
||||||
exporter.to_notes(doc_annotations, template, edit=edit, git=git)
|
exporter.to_notes(template, edit=edit, git=git)
|
||||||
else:
|
else:
|
||||||
exporter.to_stdout(doc_annotations, template)
|
exporter.to_stdout(template)
|
||||||
|
|
|
@ -5,8 +5,6 @@ import papis.config
|
||||||
from papis.document import Document
|
from papis.document import Document
|
||||||
import chevron
|
import chevron
|
||||||
|
|
||||||
from papis_extract.templating import Templating
|
|
||||||
|
|
||||||
TEXT_SIMILARITY_MINIMUM = 0.75
|
TEXT_SIMILARITY_MINIMUM = 0.75
|
||||||
COLOR_SIMILARITY_MINIMUM = 0.833
|
COLOR_SIMILARITY_MINIMUM = 0.833
|
||||||
|
|
||||||
|
@ -36,7 +34,7 @@ class Annotation:
|
||||||
type: str = "Highlight"
|
type: str = "Highlight"
|
||||||
minimum_similarity_color: float = 1.0
|
minimum_similarity_color: float = 1.0
|
||||||
|
|
||||||
def format(self, template: Templating):
|
def format(self, template: str, doc: Document = Document()):
|
||||||
"""Return a formatted string of the annotation.
|
"""Return a formatted string of the annotation.
|
||||||
|
|
||||||
Given a provided formatting pattern, this method returns the annotation
|
Given a provided formatting pattern, this method returns the annotation
|
||||||
|
@ -50,8 +48,9 @@ class Annotation:
|
||||||
"page": self.page,
|
"page": self.page,
|
||||||
"tag": self.tag,
|
"tag": self.tag,
|
||||||
"type": self.type,
|
"type": self.type,
|
||||||
|
"doc": doc,
|
||||||
}
|
}
|
||||||
return chevron.render(template.string, data)
|
return chevron.render(template, data)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def colorname(self):
|
def colorname(self):
|
||||||
|
@ -89,6 +88,7 @@ class AnnotatedDocument:
|
||||||
"""Contains all annotations belonging to a single papis document.
|
"""Contains all annotations belonging to a single papis document.
|
||||||
|
|
||||||
Combines a document with a list of annotations which belong to it."""
|
Combines a document with a list of annotations which belong to it."""
|
||||||
|
|
||||||
document: Document
|
document: Document
|
||||||
annotations: list[Annotation]
|
annotations: list[Annotation]
|
||||||
|
|
||||||
|
|
|
@ -7,61 +7,33 @@ import papis.git
|
||||||
import papis.config
|
import papis.config
|
||||||
import Levenshtein
|
import Levenshtein
|
||||||
|
|
||||||
from papis_extract.annotation_data import AnnotatedDocument
|
from papis_extract.formatter import Formatter
|
||||||
from papis_extract.templating import Templating
|
|
||||||
|
|
||||||
logger = papis.logging.get_logger(__name__)
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def to_stdout(annots: list[AnnotatedDocument], template: Templating) -> None:
|
def to_stdout(template: Formatter) -> None:
|
||||||
"""Pretty print annotations to stdout.
|
"""Pretty print annotations to stdout.
|
||||||
|
|
||||||
Gives a nice human-readable representations of
|
Gives a nice human-readable representations of
|
||||||
the annotations in somewhat of a list form.
|
the annotations in somewhat of a list form.
|
||||||
Not intended for machine-readability.
|
Not intended for machine-readability.
|
||||||
"""
|
"""
|
||||||
if not annots:
|
output:str = template.execute()
|
||||||
return
|
print(output.rstrip('\n'))
|
||||||
|
|
||||||
last = annots[-1]
|
|
||||||
for entry in annots:
|
|
||||||
if not entry.annotations:
|
|
||||||
continue
|
|
||||||
|
|
||||||
title_decoration = (
|
|
||||||
f"{'=' * len(entry.document.get('title', ''))} "
|
|
||||||
f"{'-' * len(entry.document.get('author', ''))}"
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
f"{title_decoration}\n{papis.document.describe(entry.document)}\n{title_decoration}\n"
|
|
||||||
)
|
|
||||||
for a in entry.annotations:
|
|
||||||
print(a.format(template))
|
|
||||||
|
|
||||||
if entry != last:
|
|
||||||
print("\n")
|
|
||||||
|
|
||||||
|
|
||||||
def to_notes(
|
def to_notes(template: Formatter, edit: bool, git: bool) -> None:
|
||||||
annots: list[AnnotatedDocument], template: Templating, edit: bool, git: bool
|
|
||||||
) -> None:
|
|
||||||
"""Write annotations into document notes.
|
"""Write annotations into document notes.
|
||||||
|
|
||||||
Permanently writes the given annotations into notes
|
Permanently writes the given annotations into notes
|
||||||
belonging to papis documents. Creates new notes for
|
belonging to papis documents. Creates new notes for
|
||||||
documents missing a note field or appends to existing.
|
documents missing a note field or appends to existing.
|
||||||
"""
|
"""
|
||||||
if not annots:
|
annotated_docs = template.annotated_docs
|
||||||
return
|
for entry in annotated_docs:
|
||||||
|
formatted_annotations = template.execute(entry).split("\n")
|
||||||
for entry in annots:
|
if formatted_annotations:
|
||||||
if not entry.annotations:
|
|
||||||
continue
|
|
||||||
|
|
||||||
formatted_annotations: list[str] = []
|
|
||||||
for a in entry.annotations:
|
|
||||||
formatted_annotations.append(a.format(template))
|
|
||||||
|
|
||||||
_add_annots_to_note(entry.document, formatted_annotations)
|
_add_annots_to_note(entry.document, formatted_annotations)
|
||||||
|
|
||||||
if edit:
|
if edit:
|
||||||
|
@ -130,7 +102,9 @@ def _drop_existing_annotations(
|
||||||
remaining: list[str] = []
|
remaining: list[str] = []
|
||||||
for an in formatted_annotations:
|
for an in formatted_annotations:
|
||||||
an_split = an.splitlines()
|
an_split = an.splitlines()
|
||||||
if not _test_similarity(an_split[0], file_lines, minimum_similarity):
|
if an_split and not _test_similarity(
|
||||||
|
an_split[0], file_lines, minimum_similarity
|
||||||
|
):
|
||||||
remaining.append(an)
|
remaining.append(an)
|
||||||
|
|
||||||
return remaining
|
return remaining
|
||||||
|
|
100
papis_extract/formatter.py
Normal file
100
papis_extract/formatter.py
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Protocol
|
||||||
|
|
||||||
|
from papis_extract.annotation_data import AnnotatedDocument
|
||||||
|
|
||||||
|
|
||||||
|
class Formatter(Protocol):
    """Structural interface for objects that turn annotations into text.

    A formatter carries the annotated documents it should format together
    with the header, per-annotation template string and footer, and exposes
    ``execute`` to produce the formatted output.

    This is a pure interface: it is intentionally not a dataclass, so no
    generated ``__eq__``/``__repr__``/``__dataclass_fields__`` members are
    attached to a class that is only used for structural typing.
    """

    # Documents (with their annotations) formatted when ``execute`` is
    # called without an explicit document.
    annotated_docs: list[AnnotatedDocument]
    # Text pieces of the template: header, per-annotation string, footer.
    header: str
    string: str
    footer: str

    def execute(self, doc: AnnotatedDocument | None = None) -> str:
        """Return the formatted output as a single string.

        Args:
            doc: A single annotated document to format. When ``None``,
                implementations format all of ``annotated_docs``.

        Raises:
            NotImplementedError: Always, when this default body is reached;
                concrete formatters provide their own implementation.
        """
        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MarkdownFormatter:
    """Format annotated documents as markdown-style text.

    Every document that has at least one annotation is rendered as a
    decorated ``title - author`` heading followed by its annotations, each
    rendered through the mustache template stored in ``string``.

    Attributes:
        annotated_docs: Documents formatted when ``execute`` is called
            without an argument.
        header: Kept alongside the template; not used by ``execute``.
        string: Mustache template passed to each annotation's ``format``.
        footer: Kept alongside the template; not used by ``execute``.
    """

    annotated_docs: list[AnnotatedDocument] = field(default_factory=list)
    header: str = ""
    string: str = (
        "{{#tag}}#{{tag}}\n{{/tag}}"
        "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n"
        "{{#note}} NOTE: {{note}}{{/note}}"
    )
    footer: str = ""

    def execute(self, doc: AnnotatedDocument | None = None) -> str:
        """Render documents and their annotations into one string.

        Args:
            doc: A single annotated document to format. When ``None``,
                every entry of ``annotated_docs`` is formatted.

        Returns:
            The sections of all documents that have at least one
            annotation, separated by three newlines. An empty string when
            there is nothing to format (no documents, or none of them has
            annotations).
        """
        documents = self.annotated_docs if doc is None else [doc]
        # Joining only the non-empty sections keeps separators strictly
        # *between* rendered documents: no trailing separator when the last
        # document has no annotations, and no IndexError on an empty list.
        sections = [
            self._format_entry(entry) for entry in documents if entry.annotations
        ]
        return "\n\n\n".join(sections)

    def _format_entry(self, entry: AnnotatedDocument) -> str:
        """Return the heading and all formatted annotations of one document."""
        # A missing title/author is treated as an empty string for both the
        # decoration and the heading text, so the two always agree.
        title = entry.document.get("title", "")
        author = entry.document.get("author", "")
        title_decoration = f"{'=' * len(title)} {'-' * len(author)}"
        heading = f"{title_decoration}\n{title} - {author}\n{title_decoration}\n\n"
        return heading + "".join(a.format(self.string) for a in entry.annotations)
|
||||||
|
|
||||||
|
@dataclass
class CountFormatter:
    """Format annotated documents using an (by default empty) template.

    The rendering logic is the same as for the markdown formatter: every
    document that has at least one annotation gets a decorated
    ``title - author`` heading followed by its annotations rendered through
    ``string``.
    NOTE(review): the name suggests this should report annotation counts,
    but no counting is implemented here -- confirm the intended output.

    Attributes:
        annotated_docs: Documents formatted when ``execute`` is called
            without an argument.
        header: Kept alongside the template; not used by ``execute``.
        string: Mustache template passed to each annotation's ``format``.
        footer: Kept alongside the template; not used by ``execute``.
    """

    annotated_docs: list[AnnotatedDocument] = field(default_factory=list)
    header: str = ""
    string: str = ""
    footer: str = ""

    def execute(self, doc: AnnotatedDocument | None = None) -> str:
        """Render documents and their annotations into one string.

        Args:
            doc: A single annotated document to format. When ``None``,
                every entry of ``annotated_docs`` is formatted.

        Returns:
            The sections of all documents that have at least one
            annotation, separated by three newlines. An empty string when
            there is nothing to format.
        """
        documents = self.annotated_docs if doc is None else [doc]
        # Joining only the non-empty sections avoids indexing into an empty
        # list and never leaves a dangling separator at the end.
        sections = [
            self._format_entry(entry) for entry in documents if entry.annotations
        ]
        return "\n\n\n".join(sections)

    def _format_entry(self, entry: AnnotatedDocument) -> str:
        """Return the heading and all formatted annotations of one document."""
        # A missing title/author is treated as an empty string for both the
        # decoration and the heading text, so the two always agree.
        title = entry.document.get("title", "")
        author = entry.document.get("author", "")
        title_decoration = f"{'=' * len(title)} {'-' * len(author)}"
        heading = f"{title_decoration}\n{title} - {author}\n{title_decoration}\n\n"
        return heading + "".join(a.format(self.string) for a in entry.annotations)
|
||||||
|
|
||||||
|
# Data container for a CSV-style layout: a column header line, a mustache
# row template and a footer.
# NOTE(review): unlike the other formatters in this file it defines neither
# `annotated_docs` nor `execute`, so it does not provide the members listed
# by the Formatter protocol -- confirm whether that is intentional.
@dataclass
|
||||||
|
class CsvFormatter:
|
||||||
|
# Column names, comma separated, in the same order as the row template.
header: str = "type, tag, page, quote, note, file"
|
||||||
|
# Mustache template for one row; one placeholder per column.
string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}"
|
||||||
|
# No footer by default.
footer: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CustomFormatter:
    """User-supplied template pieces for formatting annotations.

    The three pieces are declared as real dataclass fields so that the
    generated ``__init__``, ``__repr__`` and ``__eq__`` all take them into
    account. Construction is unchanged: ``CustomFormatter(header, string,
    footer)``, positionally or by keyword, each defaulting to ``""``.

    Attributes:
        header: Text for the start of the output.
        string: Template for a single annotation.
        footer: Text for the end of the output.
    """

    header: str = ""
    string: str = ""
    footer: str = ""
|
|
@ -1,35 +0,0 @@
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Protocol
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Templating(Protocol):
|
|
||||||
header: str
|
|
||||||
string: str
|
|
||||||
footer: str
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Markdown:
|
|
||||||
header: str = ""
|
|
||||||
string: str = (
|
|
||||||
"{{#tag}}#{{tag}}\n{{/tag}}"
|
|
||||||
"{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n"
|
|
||||||
"{{#note}} NOTE: {{note}}{{/note}}"
|
|
||||||
)
|
|
||||||
footer: str = ""
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Csv:
|
|
||||||
header: str = "type, tag, page, quote, note, file"
|
|
||||||
string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}"
|
|
||||||
footer: str = ""
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Custom:
|
|
||||||
def __init__(self, header: str = "", string: str = "", footer: str = "") -> None:
|
|
||||||
self.header = header
|
|
||||||
self.string = string
|
|
||||||
self.footer = footer
|
|
Loading…
Reference in a new issue