diff --git a/README.md b/README.md index 61a6f47..89052bf 100644 --- a/README.md +++ b/README.md @@ -177,8 +177,8 @@ Known issues to be fixed: - [x] Speed? - should be fine, on my machine (old i5 laptop) it takes around 90s for ~1000 documents with ~4000 annotations - [x] ensure all cmdline options do what they should -- [ ] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals -- [ ] docstrings, docstrings! +- [x] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals +- [x] docstrings, docstrings! - [ ] testing testing testing!! - [ ] refactor into some better abstractions (e.g. Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation) @@ -188,7 +188,7 @@ features to be implemented: - [x] static analysis (lint, typecheck etc) on pushes - [x] test pipeline on master pushes - [ ] release pipeline to pypi on tags -- [ ] add page number if available +- [x] add page number if available - exists in Annotation, just need to place in output - [ ] show overall amount of extractions at the end - [ ] custom formatting decided by user @@ -233,14 +233,18 @@ I am not sure if there is much I can do about these issues for now. and for myself whenever I forget. The basic building blocks currently in here are three: - extractors -: extract data from a source file attached to a papis document. +: Extract data from a source file attached to a papis document. + +- annotations +: The actual extracted blocks of text, containing some metadata + info as well, such as their color, type, page. - exporters -: put the extracted data somewhere like stdout or into your notes. +: Put the extracted data somewhere. For now stdout or into your notes. -- templates -: make sure the exporter saves the data according to your preferred layout, -such as a markdown syntax or csv-structure. +- formatters +: Make sure the exporter saves the data according to your preferred layout, + such as a markdown syntax or csv-structure. Splitting it into those three building blocks makes it easier to recombine them in any way, should someone want to save highlights as csv data in their notes, diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index 6379b27..026e613 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -8,8 +8,7 @@ import papis.strings from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.annotation_data import AnnotatedDocument -from papis_extract.templating import Csv, Markdown, Templating +from papis_extract.formatter import MarkdownFormatter, Formatter logger = papis.logging.get_logger(__name__) @@ -39,8 +38,7 @@ papis.config.register_default_settings(DEFAULT_OPTIONS) @click.option( "--manual/--no-manual", "-m", - help= - "Open each note in editor for manual editing after extracting its annotations.", + help="Open note in editor for manual editing after annotation extraction.", ) @click.option( "--template", @@ -82,23 +80,19 @@ def main( return if template == "csv": - template_type = Csv() - else: - template_type = Markdown() - - run(documents, edit=manual, write=write, git=git, template=template_type) + raise NotImplementedError + run(documents, edit=manual, write=write, git=git, template=MarkdownFormatter()) def run( documents: list[Document], + template: Formatter, edit: bool = False, write: bool = False, git: bool = False, - template: Templating = Markdown(), ) -> None: - doc_annotations: list[AnnotatedDocument] = extractor.start(documents) - + template.annotated_docs = extractor.start(documents) if write: - exporter.to_notes(doc_annotations, template, edit=edit, git=git) + exporter.to_notes(template, edit=edit, git=git) else: - exporter.to_stdout(doc_annotations, template) + exporter.to_stdout(template) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index 5fd5546..f0bad88 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py @@ -5,8 +5,6 @@ import papis.config from papis.document import Document import chevron -from papis_extract.templating import Templating - TEXT_SIMILARITY_MINIMUM = 0.75 COLOR_SIMILARITY_MINIMUM = 0.833 @@ -36,7 +34,7 @@ class Annotation: type: str = "Highlight" minimum_similarity_color: float = 1.0 - def format(self, template: Templating): + def format(self, template: str, doc: Document = Document()): """Return a formatted string of the annotation. Given a provided formatting pattern, this method returns the annotation @@ -50,8 +48,9 @@ class Annotation: "page": self.page, "tag": self.tag, "type": self.type, + "doc": doc, } - return chevron.render(template.string, data) + return chevron.render(template, data) @property def colorname(self): @@ -89,6 +88,7 @@ class AnnotatedDocument: """Contains all annotations belonging to a single papis document. Combines a document with a list of annotations which belong to it.""" + document: Document annotations: list[Annotation] diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py index 8e05c6c..14ee617 100644 --- a/papis_extract/exporter.py +++ b/papis_extract/exporter.py @@ -7,62 +7,34 @@ import papis.git import papis.config import Levenshtein -from papis_extract.annotation_data import AnnotatedDocument -from papis_extract.templating import Templating +from papis_extract.formatter import Formatter logger = papis.logging.get_logger(__name__) -def to_stdout(annots: list[AnnotatedDocument], template: Templating) -> None: +def to_stdout(template: Formatter) -> None: """Pretty print annotations to stdout. Gives a nice human-readable representations of the annotations in somewhat of a list form. Not intended for machine-readability. """ - if not annots: - return - - last = annots[-1] - for entry in annots: - if not entry.annotations: - continue - - title_decoration = ( - f"{'=' * len(entry.document.get('title', ''))} " - f"{'-' * len(entry.document.get('author', ''))}" - ) - print( - f"{title_decoration}\n{papis.document.describe(entry.document)}\n{title_decoration}\n" - ) - for a in entry.annotations: - print(a.format(template)) - - if entry != last: - print("\n") + output:str = template.execute() + print(output.rstrip('\n')) -def to_notes( - annots: list[AnnotatedDocument], template: Templating, edit: bool, git: bool -) -> None: +def to_notes(template: Formatter, edit: bool, git: bool) -> None: """Write annotations into document notes. Permanently writes the given annotations into notes belonging to papis documents. Creates new notes for documents missing a note field or appends to existing. """ - if not annots: - return - - for entry in annots: - if not entry.annotations: - continue - - formatted_annotations: list[str] = [] - for a in entry.annotations: - formatted_annotations.append(a.format(template)) - - _add_annots_to_note(entry.document, formatted_annotations) + annotated_docs = template.annotated_docs + for entry in annotated_docs: + formatted_annotations = template.execute(entry).split("\n") + if formatted_annotations: + _add_annots_to_note(entry.document, formatted_annotations) if edit: papis.commands.edit.edit_notes(entry.document, git=git) @@ -117,7 +89,7 @@ def _drop_existing_annotations( ) -> list[str]: """Returns the input annotations dropping any existing. - Takes a list of formatted annotations and a list of strings + Takes a list of formatted annotations and a list of strings (most probably existing lines in a file). If anny annotations match an existing line closely enough, they will be dropped. @@ -130,7 +102,9 @@ def _drop_existing_annotations( remaining: list[str] = [] for an in formatted_annotations: an_split = an.splitlines() - if not _test_similarity(an_split[0], file_lines, minimum_similarity): + if an_split and not _test_similarity( + an_split[0], file_lines, minimum_similarity + ): remaining.append(an) return remaining diff --git a/papis_extract/formatter.py b/papis_extract/formatter.py new file mode 100644 index 0000000..3b921dc --- /dev/null +++ b/papis_extract/formatter.py @@ -0,0 +1,100 @@ +from dataclasses import dataclass, field +from typing import Protocol + +from papis_extract.annotation_data import AnnotatedDocument + + +@dataclass +class Formatter(Protocol): + annotated_docs: list[AnnotatedDocument] + header: str + string: str + footer: str + + def execute(self, doc: AnnotatedDocument | None = None) -> str: + raise NotImplementedError + + +@dataclass +class MarkdownFormatter: + annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) + header: str = "" + string: str = ( + "{{#tag}}#{{tag}}\n{{/tag}}" + "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n" + "{{#note}} NOTE: {{note}}{{/note}}" + ) + footer: str = "" + + def execute(self, doc: AnnotatedDocument | None = None) -> str: + output = "" + documents = self.annotated_docs if doc is None else [doc] + last = documents[-1] + for entry in documents: + if not entry.annotations: + continue + + title_decoration = ( + f"{'=' * len(entry.document.get('title', ''))} " + f"{'-' * len(entry.document.get('author', ''))}" + ) + output += ( + f"{title_decoration}\n" + f"{entry.document['title']} - {entry.document['author']}\n" + f"{title_decoration}\n\n" + ) + for a in entry.annotations: + output += a.format(self.string) + + if entry != last: + print(f"entry: {entry}, last: {last}") + output += "\n\n\n" + + return output + +@dataclass +class CountFormatter: + annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) + header: str = "" + string: str = "" + footer: str = "" + + def execute(self, doc: AnnotatedDocument | None = None) -> str: + output = "" + documents = self.annotated_docs if doc is None else [doc] + last = documents[-1] + for entry in documents: + if not entry.annotations: + continue + + title_decoration = ( + f"{'=' * len(entry.document.get('title', ''))} " + f"{'-' * len(entry.document.get('author', ''))}" + ) + output += ( + f"{title_decoration}\n" + f"{entry.document['title']} - {entry.document['author']}\n" + f"{title_decoration}\n\n" + ) + for a in entry.annotations: + output += a.format(self.string) + + if entry != last: + print(f"entry: {entry}, last: {last}") + output += "\n\n\n" + + return output + +@dataclass +class CsvFormatter: + header: str = "type, tag, page, quote, note, file" + string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}" + footer: str = "" + + +@dataclass +class CustomFormatter: + def __init__(self, header: str = "", string: str = "", footer: str = "") -> None: + self.header = header + self.string = string + self.footer = footer diff --git a/papis_extract/templating.py b/papis_extract/templating.py deleted file mode 100644 index c8abf7f..0000000 --- a/papis_extract/templating.py +++ /dev/null @@ -1,35 +0,0 @@ -from dataclasses import dataclass -from typing import Protocol - - -@dataclass -class Templating(Protocol): - header: str - string: str - footer: str - - -@dataclass -class Markdown: - header: str = "" - string: str = ( - "{{#tag}}#{{tag}}\n{{/tag}}" - "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n" - "{{#note}} NOTE: {{note}}{{/note}}" - ) - footer: str = "" - - -@dataclass -class Csv: - header: str = "type, tag, page, quote, note, file" - string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}" - footer: str = "" - - -@dataclass -class Custom: - def __init__(self, header: str = "", string: str = "", footer: str = "") -> None: - self.header = header - self.string = string - self.footer = footer