From c2aec7add605519769ea5d63be4a9615767946d9 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Wed, 12 Jun 2024 11:45:35 +0200 Subject: [PATCH] feat: Notify formatters if formatting first entry This allows headers to be created by a formatter, which will *only* be added to the very first entry created and not to each entry. Currently, for example, this is used to emit the CSV header once, rather than repeating it for each document in turn. --- papis_extract/exporters/notes.py | 7 +++++-- papis_extract/exporters/stdout.py | 4 +++- papis_extract/formatter.py | 26 ++++++++++++++++++++++---- tests/test_formatting.py | 9 +++++++++ 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/papis_extract/exporters/notes.py b/papis_extract/exporters/notes.py index a218db7..22b4ce0 100644 --- a/papis_extract/exporters/notes.py +++ b/papis_extract/exporters/notes.py @@ -28,7 +28,8 @@ class NotesExporter: documents missing a note field or appends to existing. """ for doc, annots in annot_docs: - formatted_annotations = self.formatter(doc, annots).split("\n") + # first always true since we write single doc per note + formatted_annotations: list[str] = self.formatter(doc, annots, first=True).split("\n") if formatted_annotations: self._add_annots_to_note(doc, formatted_annotations, force=self.force) @@ -80,7 +81,9 @@ class NotesExporter: # add newline if theres no empty space at file end if len(existing) > 0 and existing[-1].strip() != "": f.write("\n") - f.write("\n\n".join(new_annotations)) + # FIXME this either joins them too close or moves them too far apart + # We need a better algorithm which knows what a full 'annotation' is. 
+ f.write("\n".join(new_annotations)) f.write("\n") logger.info( f"Wrote {len(new_annotations)} " diff --git a/papis_extract/exporters/stdout.py b/papis_extract/exporters/stdout.py index ed1cabf..f8a229f 100644 --- a/papis_extract/exporters/stdout.py +++ b/papis_extract/exporters/stdout.py @@ -20,7 +20,9 @@ class StdoutExporter: the annotations in somewhat of a list form. Not intended for machine-readability. """ + first_entry = True for doc, annots in annot_docs: - output: str = self.formatter(doc, annots) + output: str = self.formatter(doc, annots, first=first_entry) if output: print("{output}\n".format(output=output.rstrip("\n"))) + first_entry = False diff --git a/papis_extract/formatter.py b/papis_extract/formatter.py index 986d691..18e15b2 100644 --- a/papis_extract/formatter.py +++ b/papis_extract/formatter.py @@ -1,15 +1,29 @@ -from collections.abc import Callable - +from typing import Protocol from papis.document import Document from papis_extract.annotation import Annotation -Formatter = Callable[[Document, list[Annotation]], str] +class Formatter(Protocol): + """Basic formatter protocol. + + Every valid formatter must implement at least this protocol. + A formatter is a function which receives a document and a list + of annotations and spits them out in some formatted way. + + Formatters additionally must take the (often optional) passed + parameter 'first' which signals to the formatter that the current + document entry is the very first one to be printed in whatever + exporter is used, if multiple entries are printed. + This can be useful for adding a header if necessary for the format. + """ + def __call__(self, document: Document, annotations: list[Annotation], first: bool) -> str: + ... 
def format_markdown( document: Document = Document(), annotations: list[Annotation] = [], + first: bool = False, headings: str = "setext", # setext | atx | None ) -> str: if not annotations: @@ -43,6 +57,7 @@ def format_markdown( def format_markdown_atx( document: Document = Document(), annotations: list[Annotation] = [], + first: bool = False, ) -> str: return format_markdown(document, annotations, headings="atx") @@ -50,6 +65,7 @@ def format_markdown_atx( def format_markdown_setext( document: Document = Document(), annotations: list[Annotation] = [], + first: bool = False, ) -> str: return format_markdown(document, annotations, headings="setext") @@ -57,6 +73,7 @@ def format_markdown_setext( def format_count( document: Document = Document(), annotations: list[Annotation] = [], + first: bool = False, ) -> str: if not annotations: return "" @@ -76,13 +93,14 @@ def format_count( def format_csv( document: Document = Document(), annotations: list[Annotation] = [], + first: bool = False, ) -> str: header: str = "type,tag,page,quote,note,author,title,ref,file" template: str = ( '{{type}},{{tag}},{{page}},"{{quote}}","{{note}}",' '"{{doc.author}}","{{doc.title}}","{{doc.ref}}","{{file}}"' ) - output = f"{header}\n" + output = f"{header}\n" if first else "" if not annotations: return "" diff --git a/tests/test_formatting.py b/tests/test_formatting.py index b018041..ac7f2c3 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -54,6 +54,15 @@ def test_count_default(): def test_csv_default(): fmt = format_csv assert fmt(document, annotations) == ( + 'Highlight,,0,"my lovely text","","document-author",' + '"document-title","","myfile.pdf"\n' + 'Highlight,,0,"my second text","with note","document-author",' + '"document-title","","myfile.pdf"' + ) + +def test_csv_with_header(): + fmt = format_csv + assert fmt(document, annotations, first=True) == ( "type,tag,page,quote,note,author,title,ref,file\n" 'Highlight,,0,"my lovely text","","document-author",' 
'"document-title","","myfile.pdf"\n'