From 5a6d672c76f7ccb058d894f95550c5236f6067a6 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 19 Sep 2023 21:43:19 +0200 Subject: [PATCH] refactor: Move formatting logic to formatters Formatters (previously templates) were pure data containers before, continating the 'template' for how things should be formatted using mustache. The formatting would be done a) in the exporters and b) in the annotations. This spread of formatting has now been consolidated into the Formatter, which fixes the overall spread of formatting code and now can coherently format a whole output instead of just individual annotations. A formatter contains references to all documents and contained annotations and will format everything at once by default, but the formatting function can be invoked with reference to a specific annotated document to only format that. This commit should put more separation into the concerns of exporter and formatter and made formatting a concern purely of the formatters and annotation objects. --- README.md | 20 ++++--- papis_extract/__init__.py | 22 +++---- papis_extract/annotation_data.py | 8 +-- papis_extract/exporter.py | 54 +++++------------ papis_extract/formatter.py | 100 +++++++++++++++++++++++++++++++ papis_extract/templating.py | 35 ----------- 6 files changed, 138 insertions(+), 101 deletions(-) create mode 100644 papis_extract/formatter.py delete mode 100644 papis_extract/templating.py diff --git a/README.md b/README.md index 61a6f47..89052bf 100644 --- a/README.md +++ b/README.md @@ -177,8 +177,8 @@ Known issues to be fixed: - [x] Speed? - should be fine, on my machine (old i5 laptop) it takes around 90s for ~1000 documents with ~4000 annotations - [x] ensure all cmdline options do what they should -- [ ] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals -- [ ] docstrings, docstrings! +- [x] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals +- [x] docstrings, docstrings! - [ ] testing testing testing!! - [ ] refactor into some better abstractions (e.g. Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation) @@ -188,7 +188,7 @@ features to be implemented: - [x] static analysis (lint, typecheck etc) on pushes - [x] test pipeline on master pushes - [ ] release pipeline to pypi on tags -- [ ] add page number if available +- [x] add page number if available - exists in Annotation, just need to place in output - [ ] show overall amount of extractions at the end - [ ] custom formatting decided by user @@ -233,14 +233,18 @@ I am not sure if there is much I can do about these issues for now. and for myself whenever I forget. The basic building blocks currently in here are three: - extractors -: extract data from a source file attached to a papis document. +: Extract data from a source file attached to a papis document. + +- annotations +: The actual extracted blocks of text, containing some metadata + info as well, such as their color, type, page. - exporters -: put the extracted data somewhere like stdout or into your notes. +: Put the extracted data somewhere. For now stdout or into your notes. -- templates -: make sure the exporter saves the data according to your preferred layout, -such as a markdown syntax or csv-structure. +- formatters +: Make sure the exporter saves the data according to your preferred layout, + such as a markdown syntax or csv-structure. Splitting it into those three building blocks makes it easier to recombine them in any way, should someone want to save highlights as csv data in their notes, diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index 6379b27..026e613 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -8,8 +8,7 @@ import papis.strings from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.annotation_data import AnnotatedDocument -from papis_extract.templating import Csv, Markdown, Templating +from papis_extract.formatter import MarkdownFormatter, Formatter logger = papis.logging.get_logger(__name__) @@ -39,8 +38,7 @@ papis.config.register_default_settings(DEFAULT_OPTIONS) @click.option( "--manual/--no-manual", "-m", - help= - "Open each note in editor for manual editing after extracting its annotations.", + help="Open note in editor for manual editing after annotation extraction.", ) @click.option( "--template", @@ -82,23 +80,19 @@ def main( return if template == "csv": - template_type = Csv() - else: - template_type = Markdown() - - run(documents, edit=manual, write=write, git=git, template=template_type) + raise NotImplementedError + run(documents, edit=manual, write=write, git=git, template=MarkdownFormatter()) def run( documents: list[Document], + template: Formatter, edit: bool = False, write: bool = False, git: bool = False, - template: Templating = Markdown(), ) -> None: - doc_annotations: list[AnnotatedDocument] = extractor.start(documents) - + template.annotated_docs = extractor.start(documents) if write: - exporter.to_notes(doc_annotations, template, edit=edit, git=git) + exporter.to_notes(template, edit=edit, git=git) else: - exporter.to_stdout(doc_annotations, template) + exporter.to_stdout(template) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index 5fd5546..f0bad88 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py @@ -5,8 +5,6 @@ import papis.config from papis.document import Document import chevron -from papis_extract.templating import Templating - TEXT_SIMILARITY_MINIMUM = 0.75 COLOR_SIMILARITY_MINIMUM = 0.833 @@ -36,7 +34,7 @@ class Annotation: type: str = "Highlight" minimum_similarity_color: float = 1.0 - def format(self, template: Templating): + def format(self, template: str, doc: Document = Document()): """Return a formatted string of the annotation. Given a provided formatting pattern, this method returns the annotation @@ -50,8 +48,9 @@ class Annotation: "page": self.page, "tag": self.tag, "type": self.type, + "doc": doc, } - return chevron.render(template.string, data) + return chevron.render(template, data) @property def colorname(self): @@ -89,6 +88,7 @@ class AnnotatedDocument: """Contains all annotations belonging to a single papis document. Combines a document with a list of annotations which belong to it.""" + document: Document annotations: list[Annotation] diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py index 8e05c6c..14ee617 100644 --- a/papis_extract/exporter.py +++ b/papis_extract/exporter.py @@ -7,62 +7,34 @@ import papis.git import papis.config import Levenshtein -from papis_extract.annotation_data import AnnotatedDocument -from papis_extract.templating import Templating +from papis_extract.formatter import Formatter logger = papis.logging.get_logger(__name__) -def to_stdout(annots: list[AnnotatedDocument], template: Templating) -> None: +def to_stdout(template: Formatter) -> None: """Pretty print annotations to stdout. Gives a nice human-readable representations of the annotations in somewhat of a list form. Not intended for machine-readability. """ - if not annots: - return - - last = annots[-1] - for entry in annots: - if not entry.annotations: - continue - - title_decoration = ( - f"{'=' * len(entry.document.get('title', ''))} " - f"{'-' * len(entry.document.get('author', ''))}" - ) - print( - f"{title_decoration}\n{papis.document.describe(entry.document)}\n{title_decoration}\n" - ) - for a in entry.annotations: - print(a.format(template)) - - if entry != last: - print("\n") + output:str = template.execute() + print(output.rstrip('\n')) -def to_notes( - annots: list[AnnotatedDocument], template: Templating, edit: bool, git: bool -) -> None: +def to_notes(template: Formatter, edit: bool, git: bool) -> None: """Write annotations into document notes. Permanently writes the given annotations into notes belonging to papis documents. Creates new notes for documents missing a note field or appends to existing. """ - if not annots: - return - - for entry in annots: - if not entry.annotations: - continue - - formatted_annotations: list[str] = [] - for a in entry.annotations: - formatted_annotations.append(a.format(template)) - - _add_annots_to_note(entry.document, formatted_annotations) + annotated_docs = template.annotated_docs + for entry in annotated_docs: + formatted_annotations = template.execute(entry).split("\n") + if formatted_annotations: + _add_annots_to_note(entry.document, formatted_annotations) if edit: papis.commands.edit.edit_notes(entry.document, git=git) @@ -117,7 +89,7 @@ def _drop_existing_annotations( ) -> list[str]: """Returns the input annotations dropping any existing. - Takes a list of formatted annotations and a list of strings + Takes a list of formatted annotations and a list of strings (most probably existing lines in a file). If anny annotations match an existing line closely enough, they will be dropped. @@ -130,7 +102,9 @@ def _drop_existing_annotations( remaining: list[str] = [] for an in formatted_annotations: an_split = an.splitlines() - if not _test_similarity(an_split[0], file_lines, minimum_similarity): + if an_split and not _test_similarity( + an_split[0], file_lines, minimum_similarity + ): remaining.append(an) return remaining diff --git a/papis_extract/formatter.py b/papis_extract/formatter.py new file mode 100644 index 0000000..3b921dc --- /dev/null +++ b/papis_extract/formatter.py @@ -0,0 +1,100 @@ +from dataclasses import dataclass, field +from typing import Protocol + +from papis_extract.annotation_data import AnnotatedDocument + + +@dataclass +class Formatter(Protocol): + annotated_docs: list[AnnotatedDocument] + header: str + string: str + footer: str + + def execute(self, doc: AnnotatedDocument | None = None) -> str: + raise NotImplementedError + + +@dataclass +class MarkdownFormatter: + annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) + header: str = "" + string: str = ( + "{{#tag}}#{{tag}}\n{{/tag}}" + "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n" + "{{#note}} NOTE: {{note}}{{/note}}" + ) + footer: str = "" + + def execute(self, doc: AnnotatedDocument | None = None) -> str: + output = "" + documents = self.annotated_docs if doc is None else [doc] + last = documents[-1] + for entry in documents: + if not entry.annotations: + continue + + title_decoration = ( + f"{'=' * len(entry.document.get('title', ''))} " + f"{'-' * len(entry.document.get('author', ''))}" + ) + output += ( + f"{title_decoration}\n" + f"{entry.document['title']} - {entry.document['author']}\n" + f"{title_decoration}\n\n" + ) + for a in entry.annotations: + output += a.format(self.string) + + if entry != last: + print(f"entry: {entry}, last: {last}") + output += "\n\n\n" + + return output + +@dataclass +class CountFormatter: + annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) + header: str = "" + string: str = "" + footer: str = "" + + def execute(self, doc: AnnotatedDocument | None = None) -> str: + output = "" + documents = self.annotated_docs if doc is None else [doc] + last = documents[-1] + for entry in documents: + if not entry.annotations: + continue + + title_decoration = ( + f"{'=' * len(entry.document.get('title', ''))} " + f"{'-' * len(entry.document.get('author', ''))}" + ) + output += ( + f"{title_decoration}\n" + f"{entry.document['title']} - {entry.document['author']}\n" + f"{title_decoration}\n\n" + ) + for a in entry.annotations: + output += a.format(self.string) + + if entry != last: + print(f"entry: {entry}, last: {last}") + output += "\n\n\n" + + return output + +@dataclass +class CsvFormatter: + header: str = "type, tag, page, quote, note, file" + string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}" + footer: str = "" + + +@dataclass +class CustomFormatter: + def __init__(self, header: str = "", string: str = "", footer: str = "") -> None: + self.header = header + self.string = string + self.footer = footer diff --git a/papis_extract/templating.py b/papis_extract/templating.py deleted file mode 100644 index c8abf7f..0000000 --- a/papis_extract/templating.py +++ /dev/null @@ -1,35 +0,0 @@ -from dataclasses import dataclass -from typing import Protocol - - -@dataclass -class Templating(Protocol): - header: str - string: str - footer: str - - -@dataclass -class Markdown: - header: str = "" - string: str = ( - "{{#tag}}#{{tag}}\n{{/tag}}" - "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n" - "{{#note}} NOTE: {{note}}{{/note}}" - ) - footer: str = "" - - -@dataclass -class Csv: - header: str = "type, tag, page, quote, note, file" - string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}" - footer: str = "" - - -@dataclass -class Custom: - def __init__(self, header: str = "", string: str = "", footer: str = "") -> None: - self.header = header - self.string = string - self.footer = footer