From e56f0141369df0a630015ad8bb2d7a23f2cee689 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 31 Aug 2023 21:32:24 +0200 Subject: [PATCH 01/22] Add formatting style Markdown --- papis_extract/__init__.py | 6 +++--- papis_extract/annotation_data.py | 19 ++++++++++++++++--- papis_extract/exporter.py | 20 +++++++++----------- tests/test_formatting.py | 14 ++++++++++++++ 4 files changed, 42 insertions(+), 17 deletions(-) create mode 100644 tests/test_formatting.py diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index af3a834..e396962 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -8,7 +8,7 @@ import papis.strings from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.annotation_data import AnnotatedDocument +from papis_extract.annotation_data import AnnotatedDocument, Markdown logger = papis.logging.get_logger(__name__) @@ -84,6 +84,6 @@ def run( doc_annotations: list[AnnotatedDocument] = extractor.start(documents) if write: - exporter.to_notes(doc_annotations, edit=edit, git=git) + exporter.to_notes(doc_annotations, Markdown(), edit=edit, git=git) else: - exporter.to_stdout(doc_annotations) + exporter.to_stdout(doc_annotations, Markdown()) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index fa06e0d..486c397 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py @@ -1,5 +1,6 @@ import math from dataclasses import dataclass, field +from typing import Protocol import papis.config from papis.document import Document @@ -55,9 +56,7 @@ class Annotation: Finds the closest named color to the annotation and returns it, using euclidian distance between the two color vectors. 
""" - annot_colors = ( - self.colors or (0.0, 0.0, 0.0) - ) + annot_colors = self.colors or (0.0, 0.0, 0.0) nearest = None minimum_similarity = ( papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0 @@ -85,3 +84,17 @@ class Annotation: class AnnotatedDocument: document: Document annotations: list[Annotation] + + +@dataclass +class Templating(Protocol): + string: str + + +@dataclass +class Markdown: + string: str = ( + "{{#tag}}#{{tag}}\n{{/tag}}" + "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n" + "{{#note}} NOTE: {{note}}{{/note}}" + ) diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py index b1220d0..08bcfbb 100644 --- a/papis_extract/exporter.py +++ b/papis_extract/exporter.py @@ -7,17 +7,12 @@ import papis.git import papis.config import Levenshtein -from papis_extract.annotation_data import AnnotatedDocument, Annotation +from papis_extract.annotation_data import AnnotatedDocument, Annotation, Templating logger = papis.logging.get_logger(__name__) -def _format_annotation(annotation: Annotation) -> str: - note = f"NOTE: {annotation.content}" if annotation.content else "" - return f"> {annotation.text}\n {note}" - - -def to_stdout(annots: list[AnnotatedDocument]) -> None: +def to_stdout(annots: list[AnnotatedDocument], template: Templating) -> None: """Pretty print annotations to stdout. 
Gives a nice human-readable representations of @@ -27,6 +22,7 @@ def to_stdout(annots: list[AnnotatedDocument]) -> None: if not annots: return + last = annots[-1] for entry in annots: if not entry.annotations: continue @@ -39,13 +35,15 @@ def to_stdout(annots: list[AnnotatedDocument]) -> None: f"{title_decoration}\n{papis.document.describe(entry.document)}\n{title_decoration}\n" ) for a in entry.annotations: - print(_format_annotation(a)) + print(a.format(template.string)) - if entry != annots[-1]: + if entry != last: print("\n") -def to_notes(annots: list[AnnotatedDocument], edit: bool, git: bool) -> None: +def to_notes( + annots: list[AnnotatedDocument], template: Templating, edit: bool, git: bool +) -> None: """Write annotations into document notes. Permanently writes the given annotations into notes @@ -61,7 +59,7 @@ def to_notes(annots: list[AnnotatedDocument], edit: bool, git: bool) -> None: formatted_annotations: list[str] = [] for a in entry.annotations: - formatted_annotations.append(_format_annotation(a)) + formatted_annotations.append(a.format(template.string)) _add_annots_to_note(entry.document, formatted_annotations) diff --git a/tests/test_formatting.py b/tests/test_formatting.py new file mode 100644 index 0000000..07f1092 --- /dev/null +++ b/tests/test_formatting.py @@ -0,0 +1,14 @@ +import chevron + +from papis_extract.annotation_data import Markdown + +def test_markdown_default(): + fmt = Markdown() + assert chevron.render(fmt.string, { + "file": "somefile/somewhere.pdf", + "quote": "I am quote", + "note": "and including note.", + "page": 46, + "tag": "important", + "type": "highlight", + }) == "#important\n> I am quote [p. 46]\n NOTE: and including note." 
From 5450776eb2604da52de14f751effdcfc46d8572f Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 31 Aug 2023 21:40:38 +0200 Subject: [PATCH 02/22] refactor: Extract templating to model module --- papis_extract/__init__.py | 3 ++- papis_extract/annotation_data.py | 13 ------------- papis_extract/exporter.py | 3 ++- papis_extract/model/templating.py | 16 ++++++++++++++++ 4 files changed, 20 insertions(+), 15 deletions(-) create mode 100644 papis_extract/model/templating.py diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index e396962..b9b5451 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -8,7 +8,8 @@ import papis.strings from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.annotation_data import AnnotatedDocument, Markdown +from papis_extract.annotation_data import AnnotatedDocument +from papis_extract.model.templating import Markdown logger = papis.logging.get_logger(__name__) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index 486c397..6b641c4 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py @@ -85,16 +85,3 @@ class AnnotatedDocument: document: Document annotations: list[Annotation] - -@dataclass -class Templating(Protocol): - string: str - - -@dataclass -class Markdown: - string: str = ( - "{{#tag}}#{{tag}}\n{{/tag}}" - "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. 
{{page}}]{{/page}}\n" - "{{#note}} NOTE: {{note}}{{/note}}" - ) diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py index 08bcfbb..2b0d4fd 100644 --- a/papis_extract/exporter.py +++ b/papis_extract/exporter.py @@ -7,7 +7,8 @@ import papis.git import papis.config import Levenshtein -from papis_extract.annotation_data import AnnotatedDocument, Annotation, Templating +from papis_extract.annotation_data import AnnotatedDocument +from papis_extract.model.templating import Templating logger = papis.logging.get_logger(__name__) diff --git a/papis_extract/model/templating.py b/papis_extract/model/templating.py new file mode 100644 index 0000000..f1ebb92 --- /dev/null +++ b/papis_extract/model/templating.py @@ -0,0 +1,16 @@ +from dataclasses import dataclass +from typing import Protocol + + +@dataclass +class Templating(Protocol): + string: str + + +@dataclass +class Markdown: + string: str = ( + "{{#tag}}#{{tag}}\n{{/tag}}" + "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n" + "{{#note}} NOTE: {{note}}{{/note}}" + ) From e633c0335e8a0c61a9b72868cac345424d6db716 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 19 Sep 2023 17:34:33 +0200 Subject: [PATCH 03/22] chore: Make whoosh database optional dependency --- pyproject.toml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8ee68a7..8f52f48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,17 +12,19 @@ pymupdf = "^1.23.1" levenshtein = "^0.21.1" papis = "^0.13" click = "^8.1.7" -whoosh = "^2.7.4" +whoosh = { version = "^2.7.4", optional = true } python-magic = "^0.4.27" chevron = "^0.14.0" -[tool.poetry.plugins."papis.command"] -extract = "papis_extract:main" - +[tool.poetry.extras] +whoosh = ["whoosh"] [tool.poetry.group.dev.dependencies] pytest = "^7.4.0" +[tool.poetry.plugins."papis.command"] +extract = "papis_extract:main" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" From 
4eb983d9e38e957f2044df62fa829b364be4842e Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 19 Sep 2023 17:35:39 +0200 Subject: [PATCH 04/22] refactor: Move templating to separate file --- papis_extract/__init__.py | 2 +- papis_extract/annotation_data.py | 7 ++-- papis_extract/exporter.py | 6 ++-- papis_extract/model/templating.py | 16 ---------- papis_extract/templating.py | 35 ++++++++++++++++++++ tests/test_annotation.py | 19 +++++------ tests/test_formatting.py | 53 +++++++++++++++++++++++++------ 7 files changed, 97 insertions(+), 41 deletions(-) delete mode 100644 papis_extract/model/templating.py create mode 100644 papis_extract/templating.py diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index b9b5451..c23ae39 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -9,7 +9,7 @@ from papis.document import Document from papis_extract import extractor, exporter from papis_extract.annotation_data import AnnotatedDocument -from papis_extract.model.templating import Markdown +from papis_extract.templating import Markdown logger = papis.logging.get_logger(__name__) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index 6b641c4..c94244f 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py @@ -1,11 +1,12 @@ import math from dataclasses import dataclass, field -from typing import Protocol import papis.config from papis.document import Document import chevron +from papis_extract.templating import Templating + TEXT_SIMILARITY_MINIMUM = 0.75 COLOR_SIMILARITY_MINIMUM = 0.833 @@ -32,7 +33,7 @@ class Annotation: type: str = "Highlight" minimum_similarity_color: float = 1.0 - def format(self, formatting): + def format(self, template: Templating): """Return a formatted string of the annotation. 
Given a provided formatting pattern, this method returns the annotation @@ -47,7 +48,7 @@ class Annotation: "tag": self.tag, "type": self.type, } - return chevron.render(formatting, data) + return chevron.render(template.string, data) @property def colorname(self): diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py index 2b0d4fd..0fca9d7 100644 --- a/papis_extract/exporter.py +++ b/papis_extract/exporter.py @@ -8,7 +8,7 @@ import papis.config import Levenshtein from papis_extract.annotation_data import AnnotatedDocument -from papis_extract.model.templating import Templating +from papis_extract.templating import Templating logger = papis.logging.get_logger(__name__) @@ -36,7 +36,7 @@ def to_stdout(annots: list[AnnotatedDocument], template: Templating) -> None: f"{title_decoration}\n{papis.document.describe(entry.document)}\n{title_decoration}\n" ) for a in entry.annotations: - print(a.format(template.string)) + print(a.format(template)) if entry != last: print("\n") @@ -60,7 +60,7 @@ def to_notes( formatted_annotations: list[str] = [] for a in entry.annotations: - formatted_annotations.append(a.format(template.string)) + formatted_annotations.append(a.format(template)) _add_annots_to_note(entry.document, formatted_annotations) diff --git a/papis_extract/model/templating.py b/papis_extract/model/templating.py deleted file mode 100644 index f1ebb92..0000000 --- a/papis_extract/model/templating.py +++ /dev/null @@ -1,16 +0,0 @@ -from dataclasses import dataclass -from typing import Protocol - - -@dataclass -class Templating(Protocol): - string: str - - -@dataclass -class Markdown: - string: str = ( - "{{#tag}}#{{tag}}\n{{/tag}}" - "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. 
{{page}}]{{/page}}\n" - "{{#note}} NOTE: {{note}}{{/note}}" - ) diff --git a/papis_extract/templating.py b/papis_extract/templating.py new file mode 100644 index 0000000..c8abf7f --- /dev/null +++ b/papis_extract/templating.py @@ -0,0 +1,35 @@ +from dataclasses import dataclass +from typing import Protocol + + +@dataclass +class Templating(Protocol): + header: str + string: str + footer: str + + +@dataclass +class Markdown: + header: str = "" + string: str = ( + "{{#tag}}#{{tag}}\n{{/tag}}" + "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n" + "{{#note}} NOTE: {{note}}{{/note}}" + ) + footer: str = "" + + +@dataclass +class Csv: + header: str = "type, tag, page, quote, note, file" + string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}" + footer: str = "" + + +@dataclass +class Custom: + def __init__(self, header: str = "", string: str = "", footer: str = "") -> None: + self.header = header + self.string = string + self.footer = footer diff --git a/tests/test_annotation.py b/tests/test_annotation.py index 542c3a8..60b35ed 100644 --- a/tests/test_annotation.py +++ b/tests/test_annotation.py @@ -1,17 +1,20 @@ import pytest from papis_extract.annotation_data import Annotation +from papis_extract.templating import Custom @pytest.mark.parametrize( "fmt_string,expected", [ - ("{{quote}}", "I am the text value"), + (Custom(string="{{quote}}"), "I am the text value"), ( - "> {{quote}}\n{{#note}}Note: {{note}}{{/note}}", + Custom(string="> {{quote}}\n{{#note}}Note: {{note}}{{/note}}"), "> I am the text value\nNote: Whereas I represent the note", ), ( - "{{#note}}Note: {{note}}{{/note}}{{#page}}, p. {{page}}{{/page}}", + Custom( + string="{{#note}}Note: {{note}}{{/note}}{{#page}}, p. 
{{page}}{{/page}}" + ), "Note: Whereas I represent the note", ), ], @@ -25,13 +28,13 @@ def test_formatting(fmt_string, expected): assert sut.format(fmt_string) == expected + def test_colorname_matches_exact(): - sut = Annotation( - "testfile", colors=(1.0,0.0,0.0), minimum_similarity_color=1.0 - ) + sut = Annotation("testfile", colors=(1.0, 0.0, 0.0), minimum_similarity_color=1.0) c_name = sut.colorname assert c_name == "red" + # TODO inject closeness value instead of relying on default @pytest.mark.parametrize( "color_value", @@ -44,8 +47,6 @@ def test_colorname_matches_exact(): ], ) def test_matches_inexact_colorname(color_value): - sut = Annotation( - "testfile", colors=color_value, minimum_similarity_color=0.833 - ) + sut = Annotation("testfile", colors=color_value, minimum_similarity_color=0.833) c_name = sut.colorname assert c_name == "red" diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 07f1092..5ab53c2 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -1,14 +1,49 @@ import chevron -from papis_extract.annotation_data import Markdown +from papis_extract.templating import Markdown, Csv + + +def test_template_markers(): + ... + def test_markdown_default(): fmt = Markdown() - assert chevron.render(fmt.string, { - "file": "somefile/somewhere.pdf", - "quote": "I am quote", - "note": "and including note.", - "page": 46, - "tag": "important", - "type": "highlight", - }) == "#important\n> I am quote [p. 46]\n NOTE: and including note." + assert ( + chevron.render( + fmt.string, + { + "file": "somefile/somewhere.pdf", + "quote": "I am quote", + "note": "and including note.", + "page": 46, + "tag": "important", + "type": "highlight", + }, + ) + == "#important\n> I am quote [p. 46]\n NOTE: and including note." 
+ ) + + +def test_csv_string(): + fmt = Csv() + assert ( + chevron.render( + fmt.string, + { + "file": "somefile/somewhere.pdf", + "quote": "I am quote", + "note": "and including note.", + "page": 46, + "tag": "important", + "type": "highlight", + }, + ) + == "highlight, important, 46, " + "I am quote, and including note., somefile/somewhere.pdf" + ) + + +def test_csv_header(): + fmt = Csv() + assert chevron.render(fmt.header, {}) == "type, tag, page, quote, note, file" From 07d4de9a4663af692a51ddd87483291e6f3c059b Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 19 Sep 2023 17:52:45 +0200 Subject: [PATCH 05/22] docs: Add docstrings --- papis_extract/annotation_data.py | 9 ++++++++- papis_extract/exporter.py | 8 ++++++++ papis_extract/extractor.py | 1 + 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index c94244f..5fd5546 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py @@ -22,7 +22,10 @@ COLORS = { @dataclass class Annotation: - """A PDF annotation object""" + """A PDF annotation object. + + Contains all information necessary for the annotation itself, content and metadata. + """ file: str colors: tuple[float, float, float] = field(default_factory=lambda: (0.0, 0.0, 0.0)) @@ -83,6 +86,10 @@ class Annotation: @dataclass class AnnotatedDocument: + """Contains all annotations belonging to a single papis document. + + Combines a document with a list of annotations which belong to it.""" document: Document annotations: list[Annotation] + # TODO could implement a from_doc() static method to generate annotation list? 
diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py index 0fca9d7..8e05c6c 100644 --- a/papis_extract/exporter.py +++ b/papis_extract/exporter.py @@ -115,6 +115,14 @@ def _add_annots_to_note( def _drop_existing_annotations( formatted_annotations: list[str], file_lines: list[str] ) -> list[str]: + """Returns the input annotations dropping any existing. + + Takes a list of formatted annotations and a list of strings + (most probably existing lines in a file). If anny annotations + match an existing line closely enough, they will be dropped. + + Returns list of annotations without duplicates. + """ minimum_similarity = ( papis.config.getfloat("minimum_similarity", "plugins.extract") or 1.0 ) diff --git a/papis_extract/extractor.py b/papis_extract/extractor.py index a4bc536..f6802f0 100644 --- a/papis_extract/extractor.py +++ b/papis_extract/extractor.py @@ -83,6 +83,7 @@ def extract(filename: Path) -> list[Annotation]: def is_pdf(fname: Path) -> bool: + """Check if file is a pdf, using mime type.""" return magic.from_file(fname, mime=True) == "application/pdf" From 9674592a9f63b9a9680b542955f1a7a0e288af2c Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 19 Sep 2023 17:55:30 +0200 Subject: [PATCH 06/22] docs: Add developer notes to README --- README.md | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index d2d1574..61a6f47 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ in fact you can use the plugin without having to set up any of it if you are hap The full default settings look as follows: -```yaml +```conf [plugins.extract] on_import: False tags = {"important": "red", "toread": "blue"} @@ -107,7 +107,7 @@ minimum_similarity_color = 0.833 # for matching tag to color ### Automatic extraction -```yaml +```conf [plugins.extract] on_import: True ``` @@ -119,7 +119,7 @@ if `False` extraction only happens when you explicitly invoke it. 
Extraction will *not* happen automatically when you add new annotations to an existing document, regardless of this setting. -#### Automatic tagging +### Automatic tagging By supplying the tags option with a valid python dictionary of the form `{"tag": "color", "tag2": "color2"}`, you can enable automatic tagging for your annotations. @@ -128,7 +128,7 @@ You thus ascribe specific meanings to the colors you use in highlighting. For example, if you always highlight the most essential arguments and findings in red and always highlight things you have to follow up on in blue, you can assign the meanings 'important' and 'todo' to them respectively as follows: -```yaml +```conf [plugins.extract] tags = {"red": "important", "blue": "toread"} ``` @@ -140,7 +140,7 @@ no defaults are set here. ### Advanced configuration -```yaml +```conf [plugins.extract] minimum_similarity: 0.75, # for checking against existing annotations minimum_similarity_content: 0.9, # for checking if highlight or note @@ -219,12 +219,32 @@ This plugin makes an effort to find the right combination and extract the writte as well as any additional notes made - but things *will* slip through or extract weirdly every now and again. -The easiest extraction is provided if your program writes the selection itself into the highlight -content, because then we can just use that. It is harder to parse if it does not and will sometimes -get additional words in front or behind (especially if the highlight ends in the middle of a line) -or even cut a few off. +Secondly, a note on the pages: I use the page number that the mupdf library gives me when it +extracts anything from the pdf file. Sometimes that number will be correct for the document, +sometimes it will however be the number of the *pdf document* internally. This can happen if +e.g. an article or a book has frontmatter without numbering scheme or with a different one. 
+Sometimes the correct pages will still be embedded in the pdf and everything will work, +others it won't. So always double check your page numbers! -I am not sure if there is much I can do about this. +I am not sure if there is much I can do about these issues for now. + +## For developers + +and for myself whenever I forget. The basic building blocks currently in here are three: + +- extractors +: extract data from a source file attached to a papis document. + +- exporters +: put the extracted data somewhere like stdout or into your notes. + +- templates +: make sure the exporter saves the data according to your preferred layout, +such as a markdown syntax or csv-structure. + +Splitting it into those three building blocks makes it easier to recombine them in any way, +should someone want to save highlights as csv data in their notes, +or should we ever include more extractors than the one for PDFs. --- From cbe2e7cb03f5c40abf7d1b25bed497366026ae22 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 19 Sep 2023 18:30:18 +0200 Subject: [PATCH 07/22] feat: Allow cli option for template choice --- papis_extract/__init__.py | 44 ++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index c23ae39..6379b27 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -9,7 +9,7 @@ from papis.document import Document from papis_extract import extractor, exporter from papis_extract.annotation_data import AnnotatedDocument -from papis_extract.templating import Markdown +from papis_extract.templating import Csv, Markdown, Templating logger = papis.logging.get_logger(__name__) @@ -29,17 +29,24 @@ papis.config.register_default_settings(DEFAULT_OPTIONS) @click.help_option("-h", "--help") @papis.cli.query_argument() @papis.cli.doc_folder_option() -@papis.cli.git_option(help="Add changes made to the notes files") +@papis.cli.git_option(help="Commit changes made to 
the notes files.") @papis.cli.all_option() -@click.option( - "--manual/--no-manual", - "-m", - help="Open each note in editor for manual editing after extracting its annotations", -) @click.option( "--write/--no-write", "-w", - help="Do not write annotations to notes only print results to stdout", + help="Do not write annotations to notes only print results to stdout.", +) +@click.option( + "--manual/--no-manual", + "-m", + help= + "Open each note in editor for manual editing after extracting its annotations.", +) +@click.option( + "--template", + "-t", + type=click.Choice(["markdown", "csv"], case_sensitive=False), + help="Choose an output template to format annotations with.", ) def main( query: str, @@ -53,17 +60,19 @@ def main( doc_folder: str, manual: bool, write: bool, + template: str, git: bool, ) -> None: - """Extract annotations from any pdf document + """Extract annotations from any pdf document. The extract plugin allows manual or automatic extraction of all annotations - contained in the pdf documents belonging to entries of the pubs library. + contained in the pdf documents belonging to entries of the papis library. It can write those changes to stdout or directly create and update notes for papis documents. It adds a `papis extract` subcommand through which it is invoked, but can - optionally run whenever a new document is imported for a pubs entry. + optionally run whenever a new document is imported for a papis entry, + if set in the plugin configuration. 
""" documents = papis.cli.handle_doc_folder_query_all_sort( query, doc_folder, sort_field=None, sort_reverse=False, _all=_all @@ -72,7 +81,12 @@ def main( logger.warning(papis.strings.no_documents_retrieved_message) return - run(documents, edit=manual, write=write, git=git) + if template == "csv": + template_type = Csv() + else: + template_type = Markdown() + + run(documents, edit=manual, write=write, git=git, template=template_type) def run( @@ -80,11 +94,11 @@ def run( edit: bool = False, write: bool = False, git: bool = False, + template: Templating = Markdown(), ) -> None: - doc_annotations: list[AnnotatedDocument] = extractor.start(documents) if write: - exporter.to_notes(doc_annotations, Markdown(), edit=edit, git=git) + exporter.to_notes(doc_annotations, template, edit=edit, git=git) else: - exporter.to_stdout(doc_annotations, Markdown()) + exporter.to_stdout(doc_annotations, template) From 66f937e2a84dc1826396660a6b5a027eff01ba96 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 19 Sep 2023 18:31:12 +0200 Subject: [PATCH 08/22] test: Add local papis settings for testing --- .papis.config | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .papis.config diff --git a/.papis.config b/.papis.config new file mode 100644 index 0000000..5e61770 --- /dev/null +++ b/.papis.config @@ -0,0 +1,7 @@ +[settings] +default-library = main +picktool = papis +formater = python + +[main] +dir = temp/lib From 5a6d672c76f7ccb058d894f95550c5236f6067a6 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 19 Sep 2023 21:43:19 +0200 Subject: [PATCH 09/22] refactor: Move formatting logic to formatters Formatters (previously templates) were pure data containers before, continating the 'template' for how things should be formatted using mustache. The formatting would be done a) in the exporters and b) in the annotations. 
This spread of formatting has now been consolidated into the Formatter, which fixes the overall spread of formatting code and now can coherently format a whole output instead of just individual annotations. A formatter contains references to all documents and contained annotations and will format everything at once by default, but the formatting function can be invoked with reference to a specific annotated document to only format that. This commit should put more separation into the concerns of exporter and formatter and make formatting a concern purely of the formatters and annotation objects. --- README.md | 20 ++++--- papis_extract/__init__.py | 22 +++---- papis_extract/annotation_data.py | 8 +-- papis_extract/exporter.py | 54 +++++------------ papis_extract/formatter.py | 100 +++++++++++++++++++++++++++++++ papis_extract/templating.py | 35 ----------- 6 files changed, 138 insertions(+), 101 deletions(-) create mode 100644 papis_extract/formatter.py delete mode 100644 papis_extract/templating.py diff --git a/README.md b/README.md index 61a6f47..89052bf 100644 --- a/README.md +++ b/README.md @@ -177,8 +177,8 @@ Known issues to be fixed: - [x] Speed? - should be fine, on my machine (old i5 laptop) it takes around 90s for ~1000 documents with ~4000 annotations - [x] ensure all cmdline options do what they should -- [ ] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals -- [ ] docstrings, docstrings! +- [x] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals +- [x] docstrings, docstrings! - [ ] testing testing testing!! - [ ] refactor into some better abstractions (e.g.
Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation) @@ -188,7 +188,7 @@ features to be implemented: - [x] static analysis (lint, typecheck etc) on pushes - [x] test pipeline on master pushes - [ ] release pipeline to pypi on tags -- [ ] add page number if available +- [x] add page number if available - exists in Annotation, just need to place in output - [ ] show overall amount of extractions at the end - [ ] custom formatting decided by user @@ -233,14 +233,18 @@ I am not sure if there is much I can do about these issues for now. and for myself whenever I forget. The basic building blocks currently in here are three: - extractors -: extract data from a source file attached to a papis document. +: Extract data from a source file attached to a papis document. + +- annotations +: The actual extracted blocks of text, containing some metadata + info as well, such as their color, type, page. - exporters -: put the extracted data somewhere like stdout or into your notes. +: Put the extracted data somewhere. For now stdout or into your notes. -- templates -: make sure the exporter saves the data according to your preferred layout, -such as a markdown syntax or csv-structure. +- formatters +: Make sure the exporter saves the data according to your preferred layout, + such as a markdown syntax or csv-structure. 
Splitting it into those three building blocks makes it easier to recombine them in any way, should someone want to save highlights as csv data in their notes, diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index 6379b27..026e613 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -8,8 +8,7 @@ import papis.strings from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.annotation_data import AnnotatedDocument -from papis_extract.templating import Csv, Markdown, Templating +from papis_extract.formatter import MarkdownFormatter, Formatter logger = papis.logging.get_logger(__name__) @@ -39,8 +38,7 @@ papis.config.register_default_settings(DEFAULT_OPTIONS) @click.option( "--manual/--no-manual", "-m", - help= - "Open each note in editor for manual editing after extracting its annotations.", + help="Open note in editor for manual editing after annotation extraction.", ) @click.option( "--template", @@ -82,23 +80,19 @@ def main( return if template == "csv": - template_type = Csv() - else: - template_type = Markdown() - - run(documents, edit=manual, write=write, git=git, template=template_type) + raise NotImplementedError + run(documents, edit=manual, write=write, git=git, template=MarkdownFormatter()) def run( documents: list[Document], + template: Formatter, edit: bool = False, write: bool = False, git: bool = False, - template: Templating = Markdown(), ) -> None: - doc_annotations: list[AnnotatedDocument] = extractor.start(documents) - + template.annotated_docs = extractor.start(documents) if write: - exporter.to_notes(doc_annotations, template, edit=edit, git=git) + exporter.to_notes(template, edit=edit, git=git) else: - exporter.to_stdout(doc_annotations, template) + exporter.to_stdout(template) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index 5fd5546..f0bad88 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py 
@@ -5,8 +5,6 @@ import papis.config from papis.document import Document import chevron -from papis_extract.templating import Templating - TEXT_SIMILARITY_MINIMUM = 0.75 COLOR_SIMILARITY_MINIMUM = 0.833 @@ -36,7 +34,7 @@ class Annotation: type: str = "Highlight" minimum_similarity_color: float = 1.0 - def format(self, template: Templating): + def format(self, template: str, doc: Document = Document()): """Return a formatted string of the annotation. Given a provided formatting pattern, this method returns the annotation @@ -50,8 +48,9 @@ class Annotation: "page": self.page, "tag": self.tag, "type": self.type, + "doc": doc, } - return chevron.render(template.string, data) + return chevron.render(template, data) @property def colorname(self): @@ -89,6 +88,7 @@ class AnnotatedDocument: """Contains all annotations belonging to a single papis document. Combines a document with a list of annotations which belong to it.""" + document: Document annotations: list[Annotation] diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py index 8e05c6c..14ee617 100644 --- a/papis_extract/exporter.py +++ b/papis_extract/exporter.py @@ -7,62 +7,34 @@ import papis.git import papis.config import Levenshtein -from papis_extract.annotation_data import AnnotatedDocument -from papis_extract.templating import Templating +from papis_extract.formatter import Formatter logger = papis.logging.get_logger(__name__) -def to_stdout(annots: list[AnnotatedDocument], template: Templating) -> None: +def to_stdout(template: Formatter) -> None: """Pretty print annotations to stdout. Gives a nice human-readable representations of the annotations in somewhat of a list form. Not intended for machine-readability. 
""" - if not annots: - return - - last = annots[-1] - for entry in annots: - if not entry.annotations: - continue - - title_decoration = ( - f"{'=' * len(entry.document.get('title', ''))} " - f"{'-' * len(entry.document.get('author', ''))}" - ) - print( - f"{title_decoration}\n{papis.document.describe(entry.document)}\n{title_decoration}\n" - ) - for a in entry.annotations: - print(a.format(template)) - - if entry != last: - print("\n") + output:str = template.execute() + print(output.rstrip('\n')) -def to_notes( - annots: list[AnnotatedDocument], template: Templating, edit: bool, git: bool -) -> None: +def to_notes(template: Formatter, edit: bool, git: bool) -> None: """Write annotations into document notes. Permanently writes the given annotations into notes belonging to papis documents. Creates new notes for documents missing a note field or appends to existing. """ - if not annots: - return - - for entry in annots: - if not entry.annotations: - continue - - formatted_annotations: list[str] = [] - for a in entry.annotations: - formatted_annotations.append(a.format(template)) - - _add_annots_to_note(entry.document, formatted_annotations) + annotated_docs = template.annotated_docs + for entry in annotated_docs: + formatted_annotations = template.execute(entry).split("\n") + if formatted_annotations: + _add_annots_to_note(entry.document, formatted_annotations) if edit: papis.commands.edit.edit_notes(entry.document, git=git) @@ -117,7 +89,7 @@ def _drop_existing_annotations( ) -> list[str]: """Returns the input annotations dropping any existing. - Takes a list of formatted annotations and a list of strings + Takes a list of formatted annotations and a list of strings (most probably existing lines in a file). If anny annotations match an existing line closely enough, they will be dropped. 
@@ -130,7 +102,9 @@ def _drop_existing_annotations( remaining: list[str] = [] for an in formatted_annotations: an_split = an.splitlines() - if not _test_similarity(an_split[0], file_lines, minimum_similarity): + if an_split and not _test_similarity( + an_split[0], file_lines, minimum_similarity + ): remaining.append(an) return remaining diff --git a/papis_extract/formatter.py b/papis_extract/formatter.py new file mode 100644 index 0000000..3b921dc --- /dev/null +++ b/papis_extract/formatter.py @@ -0,0 +1,100 @@ +from dataclasses import dataclass, field +from typing import Protocol + +from papis_extract.annotation_data import AnnotatedDocument + + +@dataclass +class Formatter(Protocol): + annotated_docs: list[AnnotatedDocument] + header: str + string: str + footer: str + + def execute(self, doc: AnnotatedDocument | None = None) -> str: + raise NotImplementedError + + +@dataclass +class MarkdownFormatter: + annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) + header: str = "" + string: str = ( + "{{#tag}}#{{tag}}\n{{/tag}}" + "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. 
{{page}}]{{/page}}\n" + "{{#note}} NOTE: {{note}}{{/note}}" + ) + footer: str = "" + + def execute(self, doc: AnnotatedDocument | None = None) -> str: + output = "" + documents = self.annotated_docs if doc is None else [doc] + last = documents[-1] + for entry in documents: + if not entry.annotations: + continue + + title_decoration = ( + f"{'=' * len(entry.document.get('title', ''))} " + f"{'-' * len(entry.document.get('author', ''))}" + ) + output += ( + f"{title_decoration}\n" + f"{entry.document['title']} - {entry.document['author']}\n" + f"{title_decoration}\n\n" + ) + for a in entry.annotations: + output += a.format(self.string) + + if entry != last: + print(f"entry: {entry}, last: {last}") + output += "\n\n\n" + + return output + +@dataclass +class CountFormatter: + annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) + header: str = "" + string: str = "" + footer: str = "" + + def execute(self, doc: AnnotatedDocument | None = None) -> str: + output = "" + documents = self.annotated_docs if doc is None else [doc] + last = documents[-1] + for entry in documents: + if not entry.annotations: + continue + + title_decoration = ( + f"{'=' * len(entry.document.get('title', ''))} " + f"{'-' * len(entry.document.get('author', ''))}" + ) + output += ( + f"{title_decoration}\n" + f"{entry.document['title']} - {entry.document['author']}\n" + f"{title_decoration}\n\n" + ) + for a in entry.annotations: + output += a.format(self.string) + + if entry != last: + print(f"entry: {entry}, last: {last}") + output += "\n\n\n" + + return output + +@dataclass +class CsvFormatter: + header: str = "type, tag, page, quote, note, file" + string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}" + footer: str = "" + + +@dataclass +class CustomFormatter: + def __init__(self, header: str = "", string: str = "", footer: str = "") -> None: + self.header = header + self.string = string + self.footer = footer diff --git a/papis_extract/templating.py 
b/papis_extract/templating.py deleted file mode 100644 index c8abf7f..0000000 --- a/papis_extract/templating.py +++ /dev/null @@ -1,35 +0,0 @@ -from dataclasses import dataclass -from typing import Protocol - - -@dataclass -class Templating(Protocol): - header: str - string: str - footer: str - - -@dataclass -class Markdown: - header: str = "" - string: str = ( - "{{#tag}}#{{tag}}\n{{/tag}}" - "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n" - "{{#note}} NOTE: {{note}}{{/note}}" - ) - footer: str = "" - - -@dataclass -class Csv: - header: str = "type, tag, page, quote, note, file" - string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}" - footer: str = "" - - -@dataclass -class Custom: - def __init__(self, header: str = "", string: str = "", footer: str = "") -> None: - self.header = header - self.string = string - self.footer = footer From 5f0bc2ffad4c0da067b90395b23e2030298c2c85 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Wed, 20 Sep 2023 08:38:06 +0200 Subject: [PATCH 10/22] feat: Add count formatter Added formatter which counts and outputs the number of annotations in each document. 
--- papis_extract/__init__.py | 11 ++++++++--- papis_extract/formatter.py | 30 +++++++++++++----------------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index 026e613..e09b99d 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -8,7 +8,7 @@ import papis.strings from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.formatter import MarkdownFormatter, Formatter +from papis_extract.formatter import CountFormatter, MarkdownFormatter, Formatter logger = papis.logging.get_logger(__name__) @@ -43,7 +43,7 @@ papis.config.register_default_settings(DEFAULT_OPTIONS) @click.option( "--template", "-t", - type=click.Choice(["markdown", "csv"], case_sensitive=False), + type=click.Choice(["markdown", "count", "csv"], case_sensitive=False), help="Choose an output template to format annotations with.", ) def main( @@ -81,7 +81,12 @@ def main( if template == "csv": raise NotImplementedError - run(documents, edit=manual, write=write, git=git, template=MarkdownFormatter()) + elif template == "count": + formatter = CountFormatter() + else: + formatter = MarkdownFormatter() + + run(documents, edit=manual, write=write, git=git, template=formatter) def run( diff --git a/papis_extract/formatter.py b/papis_extract/formatter.py index 3b921dc..6602a31 100644 --- a/papis_extract/formatter.py +++ b/papis_extract/formatter.py @@ -47,11 +47,11 @@ class MarkdownFormatter: output += a.format(self.string) if entry != last: - print(f"entry: {entry}, last: {last}") output += "\n\n\n" return output + @dataclass class CountFormatter: annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) @@ -60,31 +60,27 @@ class CountFormatter: footer: str = "" def execute(self, doc: AnnotatedDocument | None = None) -> str: - output = "" documents = self.annotated_docs if doc is None else [doc] - last = documents[-1] + output = "" for entry in 
documents: if not entry.annotations: continue - title_decoration = ( - f"{'=' * len(entry.document.get('title', ''))} " - f"{'-' * len(entry.document.get('author', ''))}" - ) - output += ( - f"{title_decoration}\n" - f"{entry.document['title']} - {entry.document['author']}\n" - f"{title_decoration}\n\n" - ) - for a in entry.annotations: - output += a.format(self.string) + count = 0 + for _ in entry.annotations: + count += 1 - if entry != last: - print(f"entry: {entry}, last: {last}") - output += "\n\n\n" + d = entry.document + output += ( + f"{d['author'] if 'author' in d else ''}" + f"{' - ' if 'author' in d else ''}" # only put separator if author + f"{entry.document['title'] if 'title' in d else ''}: " + f"{count}\n" + ) return output + @dataclass class CsvFormatter: header: str = "type, tag, page, quote, note, file" From e511ffa48d8c59e9e8b01a343a3b8d2e97832af6 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Wed, 20 Sep 2023 08:49:55 +0200 Subject: [PATCH 11/22] feat: Add CSV formatter Added formatter for csv-compatible syntax. The formatting is quite basic with no escaping happening should that be necessary. However, for an initial csv output it suffices for me. 
--- papis_extract/__init__.py | 19 ++++++++++++------- papis_extract/annotation_data.py | 4 ++-- papis_extract/exporter.py | 10 +++++----- papis_extract/formatter.py | 24 +++++++++++++++++++++--- 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index e09b99d..6b38f9a 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -8,7 +8,12 @@ import papis.strings from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.formatter import CountFormatter, MarkdownFormatter, Formatter +from papis_extract.formatter import ( + CountFormatter, + CsvFormatter, + MarkdownFormatter, + Formatter, +) logger = papis.logging.get_logger(__name__) @@ -80,24 +85,24 @@ def main( return if template == "csv": - raise NotImplementedError + formatter = CsvFormatter() elif template == "count": formatter = CountFormatter() else: formatter = MarkdownFormatter() - run(documents, edit=manual, write=write, git=git, template=formatter) + run(documents, edit=manual, write=write, git=git, formatter=formatter) def run( documents: list[Document], - template: Formatter, + formatter: Formatter, edit: bool = False, write: bool = False, git: bool = False, ) -> None: - template.annotated_docs = extractor.start(documents) + formatter.annotated_docs = extractor.start(documents) if write: - exporter.to_notes(template, edit=edit, git=git) + exporter.to_notes(formatter, edit=edit, git=git) else: - exporter.to_stdout(template) + exporter.to_stdout(formatter) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index f0bad88..138b755 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py @@ -34,7 +34,7 @@ class Annotation: type: str = "Highlight" minimum_similarity_color: float = 1.0 - def format(self, template: str, doc: Document = Document()): + def format(self, formatting: str, doc: Document = Document()): """Return a formatted 
string of the annotation. Given a provided formatting pattern, this method returns the annotation @@ -50,7 +50,7 @@ class Annotation: "type": self.type, "doc": doc, } - return chevron.render(template, data) + return chevron.render(formatting, data) @property def colorname(self): diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py index 14ee617..8f03ebd 100644 --- a/papis_extract/exporter.py +++ b/papis_extract/exporter.py @@ -12,27 +12,27 @@ from papis_extract.formatter import Formatter logger = papis.logging.get_logger(__name__) -def to_stdout(template: Formatter) -> None: +def to_stdout(formatter: Formatter) -> None: """Pretty print annotations to stdout. Gives a nice human-readable representations of the annotations in somewhat of a list form. Not intended for machine-readability. """ - output:str = template.execute() + output:str = formatter.execute() print(output.rstrip('\n')) -def to_notes(template: Formatter, edit: bool, git: bool) -> None: +def to_notes(formatter: Formatter, edit: bool, git: bool) -> None: """Write annotations into document notes. Permanently writes the given annotations into notes belonging to papis documents. Creates new notes for documents missing a note field or appends to existing. 
""" - annotated_docs = template.annotated_docs + annotated_docs = formatter.annotated_docs for entry in annotated_docs: - formatted_annotations = template.execute(entry).split("\n") + formatted_annotations = formatter.execute(entry).split("\n") if formatted_annotations: _add_annots_to_note(entry.document, formatted_annotations) diff --git a/papis_extract/formatter.py b/papis_extract/formatter.py index 6602a31..d687607 100644 --- a/papis_extract/formatter.py +++ b/papis_extract/formatter.py @@ -73,7 +73,7 @@ class CountFormatter: d = entry.document output += ( f"{d['author'] if 'author' in d else ''}" - f"{' - ' if 'author' in d else ''}" # only put separator if author + f"{' - ' if 'author' in d else ''}" # only put separator if author f"{entry.document['title'] if 'title' in d else ''}: " f"{count}\n" ) @@ -83,10 +83,28 @@ class CountFormatter: @dataclass class CsvFormatter: - header: str = "type, tag, page, quote, note, file" - string: str = "{{type}}, {{tag}}, {{page}}, {{quote}}, {{note}}, {{file}}" + annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) + header: str = "type,tag,page,quote,note,author,title,ref,file" + string: str = ( + '{{type}},{{tag}},{{page}},"{{quote}}","{{note}}",' + '"{{doc.author}}","{{doc.title}}","{{doc.ref}}","{{file}}"' + ) footer: str = "" + def execute(self, doc: AnnotatedDocument | None = None) -> str: + documents = self.annotated_docs if doc is None else [doc] + output = f"{self.header}\n" + for entry in documents: + if not entry.annotations: + continue + + d = entry.document + for a in entry.annotations: + output += a.format(self.string, doc=d) + output += "\n" + + return output + @dataclass class CustomFormatter: From 3670f70319c8d9aa6c14be5982c4ac6572afe09a Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Wed, 20 Sep 2023 09:03:08 +0200 Subject: [PATCH 12/22] docs: Add formatting documentation Added documentation on using output templates and that they will invalidate the 'existing' annotation 
search. --- README.md | 28 +++++++++++++++++++++++++++- papis_extract/__init__.py | 3 --- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 89052bf..95ec9ce 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Easily organize all your highlights and thoughts next to your documents and refe ## Installation: -You can install from pypi with `pip install git+https://git.martyoeh.me/Marty/papis-extract.git`. +You can install through pip with `pip install git+https://git.martyoeh.me/Marty/papis-extract.git`. That's it! If you have papis and papis-extract installed in the same environment (whether virtual or global), everything should now be set up. @@ -87,6 +87,30 @@ On my current laptop, extracting ~4000 annotations from ~1000 library documents though this will vary with the length and size of the PDFs you have. For smaller workloads the process should be almost instant. +You can change the format that you want your annotations in with the `--template` option. +To output annotations in a markdown-compatible syntax (the default), do: + +```bash +papis extract --template markdown +``` + +To instead see them in a csv syntax simply invoke: + +```bash +papis extract --template csv +``` + +And if you only want to know how many annotations exist in the documents, you can invoke: + +```bash +papis extract --template count +``` + +For now, these are the only formatters the plugin knows about. + +Be aware that if you write to your notes using a different template the plugin will *not* detect old annotations and drop them, +so you will be doubling up your annotations. + ## Configuration ### Basic configuration @@ -181,6 +205,8 @@ Known issues to be fixed: - [x] docstrings, docstrings! - [ ] testing testing testing!! - [ ] refactor into some better abstractions (e.g. 
Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation) +- [ ] dependency injection for extractor/exporter/formatter/annotation modules + - [ ] any call to papis.config should start from init and be injected? features to be implemented: diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index 6b38f9a..f8f6941 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -53,12 +53,9 @@ papis.config.register_default_settings(DEFAULT_OPTIONS) ) def main( query: str, - # info: bool, # _papis_id: bool, # _file: bool, - # notes: bool, # _dir: bool, - # _format: str, _all: bool, doc_folder: str, manual: bool, From 31b878c9eb9dac75d9fc860a08d8084c8461bdce Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Wed, 20 Sep 2023 17:22:29 +0200 Subject: [PATCH 13/22] refactor: Move Annotations into annotation module --- papis_extract/{annotation_data.py => annotation.py} | 0 papis_extract/extractor.py | 2 +- papis_extract/formatter.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename papis_extract/{annotation_data.py => annotation.py} (100%) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation.py similarity index 100% rename from papis_extract/annotation_data.py rename to papis_extract/annotation.py diff --git a/papis_extract/extractor.py b/papis_extract/extractor.py index f6802f0..bdb8780 100644 --- a/papis_extract/extractor.py +++ b/papis_extract/extractor.py @@ -10,7 +10,7 @@ import papis.config import papis.document from papis.document import Document -from papis_extract.annotation_data import Annotation, AnnotatedDocument +from papis_extract.annotation import Annotation, AnnotatedDocument logger = papis.logging.get_logger(__name__) diff --git a/papis_extract/formatter.py b/papis_extract/formatter.py index d687607..4ba782d 100644 --- a/papis_extract/formatter.py +++ b/papis_extract/formatter.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, field from typing import Protocol 
-from papis_extract.annotation_data import AnnotatedDocument +from papis_extract.annotation import AnnotatedDocument @dataclass From 929e70d7ac42337220ae7e5b998b5e22eb7002c5 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 21 Sep 2023 19:36:00 +0200 Subject: [PATCH 14/22] chore: Update poetry.lock --- poetry.lock | 305 ++++++++++++++++++++++++++++------------------------ 1 file changed, 162 insertions(+), 143 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1336bfc..963b69b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "arxiv2bib" @@ -582,54 +582,54 @@ plugins = ["importlib-metadata"] [[package]] name = "pymupdf" -version = "1.23.2" +version = "1.23.3" description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents." 
optional = false python-versions = ">=3.8" files = [ - {file = "PyMuPDF-1.23.2-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:701aff64dbf0635c2c875b518979b46b935ed4d3b3d2aee1c449e2960831d766"}, - {file = "PyMuPDF-1.23.2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:949c071b50825cf341f03546e7354cef942c36fcc071a72a0417c035d6ee7e33"}, - {file = "PyMuPDF-1.23.2-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:5209612dfc9038fbbb1a61dc01bd298d5279646d5e1c98cfe80878db3d862a3e"}, - {file = "PyMuPDF-1.23.2-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:f20ba77a61440220bd2e380ceef8a86bf51f97ac9374a8af00aeedea904dad46"}, - {file = "PyMuPDF-1.23.2-cp310-none-win32.whl", hash = "sha256:01c45723fbc389fac2ab8150e5ba80c357706ca69a74c29ec1a83a05921c53d1"}, - {file = "PyMuPDF-1.23.2-cp310-none-win_amd64.whl", hash = "sha256:ff799db717d5b0e423bd81fbae8131cf3463a80a642524a96952f6f3deaf2a95"}, - {file = "PyMuPDF-1.23.2-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:1f372bcc70b888f0c953add8b15627efb9f3cc2c7b8ad0916560b6081093932c"}, - {file = "PyMuPDF-1.23.2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:6389eb4bfc27264a951497847089e5e4485f6609c351ac321071d62881a21982"}, - {file = "PyMuPDF-1.23.2-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:0fc5f600b3a72c29a0944cbcbc1375962ad669023265c50cd1d8f794d7ae95f7"}, - {file = "PyMuPDF-1.23.2-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:4fc4a6183a7a8006b83476fc0b26d6fb849996050e8c3c911b9d6a66fe6ccc1e"}, - {file = "PyMuPDF-1.23.2-cp311-none-win32.whl", hash = "sha256:66f94d35fd48e2b5cbe70a4601f036f76cb826318b893994ab7bd4186a65e78f"}, - {file = "PyMuPDF-1.23.2-cp311-none-win_amd64.whl", hash = "sha256:1ec04285451231c68a024657b75d59a43ce0dcdade582edf3a9cc1d86c75b826"}, - {file = "PyMuPDF-1.23.2-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:40a713ea439548cf3c6bd910dc904cb868eae9d7bc1c2d0aebc04c84431822af"}, - {file = "PyMuPDF-1.23.2-cp38-none-macosx_11_0_arm64.whl", hash = 
"sha256:f0d57e40bdbf6c6deacf94387d3aad918535d8723aa6e3a27e4bef1f3d52158a"}, - {file = "PyMuPDF-1.23.2-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:2de9da709e14a0b32ca1ed7e268615189a8c1e76a26920dd45a92d9f0e207d1f"}, - {file = "PyMuPDF-1.23.2-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:7edc4b4542041a28f5644c09c1e670215ae014adc28a81d32786db73077d4cf3"}, - {file = "PyMuPDF-1.23.2-cp38-none-win32.whl", hash = "sha256:18f19be85f277a36536277f3f4991a2d1d1b9c2d0c3a515925e9bef41780efe0"}, - {file = "PyMuPDF-1.23.2-cp38-none-win_amd64.whl", hash = "sha256:a98cf7bb1ba8d64de78f443005c0f60c0c9644f73b3ebd57cbd20e232e2e5a30"}, - {file = "PyMuPDF-1.23.2-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:ac236156688627bca0a8062bb4153f77108f072dd4a06a80626fd089c2879e04"}, - {file = "PyMuPDF-1.23.2-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:c1a08531194d038e068641be92fdc31276efbee2b718a8dc4281dc593f1a99e7"}, - {file = "PyMuPDF-1.23.2-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:78b6c87fd375d1b017c63a426432be7ee4859f2142108b9c5dc8283599c112eb"}, - {file = "PyMuPDF-1.23.2-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:41d9cd45bb61cda890d446baeeded454fb4404086cf7f7e385e440123e9ecb56"}, - {file = "PyMuPDF-1.23.2-cp39-none-win32.whl", hash = "sha256:d34da29cd4305c4b85ea57528c1a31cfc6abfac7921d27153e633470e9dac104"}, - {file = "PyMuPDF-1.23.2-cp39-none-win_amd64.whl", hash = "sha256:86127075227f868a6b115eb96a74405539dde90168cd1a98781b0f1f6d4f9d7c"}, - {file = "PyMuPDF-1.23.2.tar.gz", hash = "sha256:32302d0eb0e28d60ba305f5d74702fb0fab2ed9d9f6b3a9d853429e5023bc6bb"}, + {file = "PyMuPDF-1.23.3-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:52699939b7482c8c566a181e2a980a6801c91959ee96dae5663070fd2b960c6b"}, + {file = "PyMuPDF-1.23.3-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:95408d57ed77f3c396880a3fc0feae068c4bf577e7e2c761d24a345138062f8d"}, + {file = "PyMuPDF-1.23.3-cp310-none-manylinux2014_aarch64.whl", hash = 
"sha256:5eefd674e338ddd82cd9179ad7d4c2160796efd6c0d4cd1098b5314ff78688d7"}, + {file = "PyMuPDF-1.23.3-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:c7696034f5f5472d1e6d3f3556858cf85e095b66c158a80b527facfa83542aee"}, + {file = "PyMuPDF-1.23.3-cp310-none-win32.whl", hash = "sha256:f3c6d427381f4ef76bec4e862c8969845e90bc842b3c534800be9cb6fe6b0e3b"}, + {file = "PyMuPDF-1.23.3-cp310-none-win_amd64.whl", hash = "sha256:0fd19017d4c7791146e38621d878393136e25a2a4fadd0372a98ab2a9aabc0c5"}, + {file = "PyMuPDF-1.23.3-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:0e88408dea51492431b111a721d88a4f4c2176786734b16374d77a421f410139"}, + {file = "PyMuPDF-1.23.3-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:c4dbf5e851373f4633b57187b0ae3dcde0efad6ef5969c4de14bb9a52a796261"}, + {file = "PyMuPDF-1.23.3-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:7218c1099205edb3357cb5713661d11d7c04aaa910645da64e17c2d050d61352"}, + {file = "PyMuPDF-1.23.3-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:0304d5def03d2bedf951179624ea636470b5ee0a706ea37636f7a3b2b08561a5"}, + {file = "PyMuPDF-1.23.3-cp311-none-win32.whl", hash = "sha256:35fe66d80cdc948ed55ac70c94b2e7f740fc08309c4ce125228ce0042a2fbba8"}, + {file = "PyMuPDF-1.23.3-cp311-none-win_amd64.whl", hash = "sha256:e643e4f30d1a5e358a8f65eab66dd0ea33f8170d61eb7549f0d227086c82d315"}, + {file = "PyMuPDF-1.23.3-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:95065c21c39dc93c4e224a2ac3c903bf31d635cdb569338d79e9befbac9755eb"}, + {file = "PyMuPDF-1.23.3-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:0c06610d78a86fcbfbcea77320c54f561ac4d568666d621afcf1109e8cfc829b"}, + {file = "PyMuPDF-1.23.3-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:6e4ef7e65b3fb7f9248f1f2dc530f10d0e00a8080dd5da52808e6638a9868a10"}, + {file = "PyMuPDF-1.23.3-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:d51b848d45e09e7fedfdeb0880a2a14872e25dd4e0932b9abf6a36a69bf01f6a"}, + {file = "PyMuPDF-1.23.3-cp38-none-win32.whl", hash = 
"sha256:42b879913a07fb251251af20e46747abc3d5d0276a48d2c28e128f5f88ef3dcd"}, + {file = "PyMuPDF-1.23.3-cp38-none-win_amd64.whl", hash = "sha256:a283236e09c056798ecaf6e0872790c63d91edf6d5f72b76504715d6b88da976"}, + {file = "PyMuPDF-1.23.3-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:6329a223ae38641fe4ff081beffd33f5e3be800c0409569b64a33b70f1b544cf"}, + {file = "PyMuPDF-1.23.3-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:640a5ada4479a2c69b811c91f163a7b55f7fe1c323b861373d6068893cc9e9e0"}, + {file = "PyMuPDF-1.23.3-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:2f555d264f08e091eaf9fd27c33ba9bfdc39ac8d09aa12195ab529bcca79229d"}, + {file = "PyMuPDF-1.23.3-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:96dc89254d78bddac8434be7b9f4c354fe57b224b5420614cde9c2f1d2f1355e"}, + {file = "PyMuPDF-1.23.3-cp39-none-win32.whl", hash = "sha256:f9a1d2f7484bde2ec81f3c88641f7a8b7f52450b807408ae7a340ddecb424659"}, + {file = "PyMuPDF-1.23.3-cp39-none-win_amd64.whl", hash = "sha256:7cfceb91048665965d826023c4acfc45f61f5cfcf101391b3c1d22f85cef0470"}, + {file = "PyMuPDF-1.23.3.tar.gz", hash = "sha256:021478ae6c76e8859241dbb970612c9080a8957d8bd697bba0b4531dc1cf4f87"}, ] [package.dependencies] -PyMuPDFb = "1.23.0" +PyMuPDFb = "1.23.3" [[package]] name = "pymupdfb" -version = "1.23.0" -description = "Rebased Python bindings for the PDF toolkit and renderer MuPDF - shared libraries only" +version = "1.23.3" +description = "MuPDF shared libraries for PyMuPDF." 
optional = false python-versions = ">=3.8" files = [ - {file = "PyMuPDFb-1.23.0-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:517b6bf8eb658d15a13b43f95bd06a93ac7020b25df45b35eb43058815ff2de0"}, - {file = "PyMuPDFb-1.23.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:07f55d21e982b3ba567708f8a4d663d933b597d2dcd81e16a6b09fdc3a66e59c"}, - {file = "PyMuPDFb-1.23.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3acfa3b9b4a4021cac771c74621d571b9cbb6ed7dc09833fe85f4126a7d4db97"}, - {file = "PyMuPDFb-1.23.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c58801b097c02910238e378aad23be3ff0f5ef785bad269142b4efb353dc9728"}, - {file = "PyMuPDFb-1.23.0-py3-none-win32.whl", hash = "sha256:89bf1a1085bffa3d5f8db12fce61cc9e0644a296bcc9568637f7ab36509e79ae"}, - {file = "PyMuPDFb-1.23.0-py3-none-win_amd64.whl", hash = "sha256:a5f425f2b2d977f623ba87f869162cb1a1579023c92032f863ea541d3c8997d9"}, + {file = "PyMuPDFb-1.23.3-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:5b05c643210eae8050d552188efab2cd68595ad75b5879a550e11af88e8bff05"}, + {file = "PyMuPDFb-1.23.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:2a2b81ac348ec123bfd72336a590399f8b0035a3052c1cf5cc2401ca7a4905e9"}, + {file = "PyMuPDFb-1.23.3-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:924f3f2229d232c965705d120b3ff38bbc37459af9d0e798b582950f875bee92"}, + {file = "PyMuPDFb-1.23.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6c287b9ce5ed397043c6e13df19640c94a348e9edc8012d9a7b001c69ba30ca9"}, + {file = "PyMuPDFb-1.23.3-py3-none-win32.whl", hash = "sha256:8703e3a8efebd83814e124d0fc3a082de2d2def329b63fca1065001e6a2deb49"}, + {file = "PyMuPDFb-1.23.3-py3-none-win_amd64.whl", hash = "sha256:89d88069cb8deb100ddcf56e1feefc7cff93ff791260325ed84551f96d3abd9f"}, ] [[package]] @@ -648,13 +648,13 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pytest" -version = "7.4.0" +version = "7.4.2" description = 
"pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, - {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, + {file = "pytest-7.4.2-py3-none-any.whl", hash = "sha256:1d881c6124e08ff0a1bb75ba3ec0bfd8b5354a01c194ddd5a0a870a48d99b002"}, + {file = "pytest-7.4.2.tar.gz", hash = "sha256:a766259cfab564a2ad52cb1aae1b881a75c3eb7e34ca3779697c23ed47c47069"}, ] [package.dependencies] @@ -758,103 +758,119 @@ files = [ [[package]] name = "rapidfuzz" -version = "3.2.0" +version = "3.3.0" description = "rapid fuzzy string matching" optional = false python-versions = ">=3.7" files = [ - {file = "rapidfuzz-3.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f5787f1cc456207dee1902804209e1a90df67e88517213aeeb1b248822413b4c"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e8d91137b0b5a6ef06c3979b6302265129dee1741486b6baa241ac63a632bea7"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c130e73e0079f403b7c3dbf6f85816a3773971c3e639f7289f8b4337b8fd70fe"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e18059188bfe3cdbc3462aeec2fa3302b08717e04ca34e2cc6e02fb3c0280d8"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:37bb6bd6a79d5524f121ff2a7d7df4491519b3f43565dccd4596bd75aa73ab7c"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ca0d6aee42effaf2e8883d2181196dd0957b1af5731b0763f10f994c32c823db"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:49fc2cbbf05bfa1af3fe4c0e0c8e5c8ac118d6b6ddfb0081cff48ad53734f7ac"}, - {file = 
"rapidfuzz-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bd4fdee46f6ba7d254dba8e7e8f33012c964fc891a06b036b0fd20cab0db301"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ab2863732eafd1cc58f249f145c20ad13d4c902d3ef3a369b00438c05e5bfb55"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a9658c545de62ac948027092ba7f4e8507ebc5c9aef964eca654409c58f207f0"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:5f3e36cfadaf29f081ad4ca476e320b639d610e930e0557f395780c9b2bdb135"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:239ffc04328e14f5e4097102bd934352a43d5912acf34fb7d3e3fe306de92787"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b56ce39ba0a77501d491bc20a2266989ae0264452758b004950ee5f4c10c641f"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-win32.whl", hash = "sha256:dbebd639579ab113644699fe0c536ae00aba15b224e40a79987684333d1104a5"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:88e99229c4df99a7e5810d4d361033b44e29d8eb4faaddcfb8e4bdcb604cf40a"}, - {file = "rapidfuzz-3.2.0-cp310-cp310-win_arm64.whl", hash = "sha256:8e39c4e2e85828aa6c39cc7f30e2917d991b40190a2a3af1fa02396a3362a54e"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2f2e618389427c5e8304357a78f83df22558e61f11bc21aeb95dd544c274d330"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a2a6babfe4d3ce2eadd0079ee7861cb5f1584845c5a3394edead85457e7d7464"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f223deb06895c9c136b40cd8fd7e96ee745c3bb9ed502d7367f6ad9ab6fdd40e"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0de6962b45f761355fa4b37de635e4df467d57530732a40d82e748a5bc911731"}, - {file = 
"rapidfuzz-3.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76953516cb3b75fb1234c5a90e0b86be4525f055a9e276237adb1ffe40dca536"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a1e04861dddbb477500449dc67fb037656a049b6f78c4c434c6000e64aa42bb4"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ff6e725eec9c769f9d22126c80a6ada90275c0d693eca2b35d5933178bda5a2"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f21ce33242e579ba255c8a8b438782164acaa55bf188d9410298c40cbaa07d5"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:986a7aad18768b920bb710e15ed7629d1da0af31589348c0a51d152820efc05d"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6e98f0a6fac14b7b9893147deceae12131f6ff169ae1c973635ef97617949c8f"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:5dd5c4b9f5cd8a8271a90d1bab643028e7172808c68ed5d8dde661a3e51098e3"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:e336b0a81c5a8e689edf6928136d19e791733a66509026d9acbaa148238186e0"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8fa44afb731535a803c4c15ee846257fef050768af96d1d6c0eadb30285d0f7b"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-win32.whl", hash = "sha256:d04ad155dbecc0c143912f691d38d4790e290c2ce5411b146c0e00d4f4afd26f"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:b9e79e27344af95a71a3bb6cd3562581da5d0780ff847a13ad69ee622d940d3c"}, - {file = "rapidfuzz-3.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:dc53747e73f34e8f3a3c1b0bc5b437b90a2c69d873e97781aa7c06543201409a"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:613c1043332eeba0c0910de71af221ac10d820b4fa9615b0083c733b90a757f9"}, - {file = 
"rapidfuzz-3.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0907f87beca70e44f78e318eede2416ddba19ec43d28af9248617e8a1741ef3"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcfd184e0b5c58497cc3d961f49ac07ae1656d161c6c4d06230d267ae4e11f00"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7a7d53a2f1ccfb169be26fa3824b1b185420592c75853f16c6b7115315ea6784"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2eac585803c4e8132ed5f4a150621db05c418304982c88cf706abdded65e1632"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc859f654b350def5df2ebc6d09f822b04399823e3dad1c3f2e8776c825fcde7"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8a165f64c528edc0bbbd09c76d64efd4dbe4240fd1961710b69586ef40486e79"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:56a392b655597ecf40535b56bfb7c0856c10c0abc0cbc369fd25a1665420710b"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:5863b176da42b1bb450a28375ef1502f81fbecd210a5aae295d7f2221284ad41"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:8f8590c39a3f745b314f2697b140c8f8600fe7ecfb2101e9e4ec6e7716c66827"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:da00990adf1fbc0904f22409b3451473fa465a0ef49f3075703c206080aa31b2"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-win32.whl", hash = "sha256:2504205552bf568ac478f17dd612d0e31c4a82c645c66209a442df7e572b5adc"}, - {file = "rapidfuzz-3.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:af3ac648232c109e36c8b941106d726969972644aa3ef55218c5988aa1daea03"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:04d22f6058ce5d620ec4ecd771e44cfa77d571137d6c6547df57bdfc44ee2a98"}, - 
{file = "rapidfuzz-3.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac7ddcd372ed202d1b59b117506da695b291f135435cfbf3e71490aa8e687173"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fd3fca0224b84350f73eab1fb5728c58fd25ee4f20e512607c7d83f9bc836d3f"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9bdb1f92c4666c7e1d3c21268b931cf3f06f32af98dfdeb37641159b15fa31dd"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:871052405c465a45b53a3dc854a8be62079f42cdbb052651ff0b65e2452131e6"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fb9bb1af5680741cf974f510fb3894907a1b308e819aff3d9ea10b5326e8a5f6"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:84ce2e010677835fa5ba591419e4404f11a1446f33eec3724a2bff557ae5144a"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c13107e0fdca5ccae70659f45646d57453338a9dfc6b152fb7372e4bf73466a0"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:538027685a1a8f1699e329f6443951267f169bfa149298734ea679db8f0e7171"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3557736672115d082979a8a12f884ed5b24268f4471fee85cfb2ec7212b68607"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6bc5e3da74644cf75663f5b438e0ae79b67d1f96d082cda771b0ecfed0528f40"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d2d0fc98d9d7bba44f929d201c2c2c35eb69ea2ffef43d939b297dafef934625"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2bf85a3bf34f27383691e8af0fd148b2a3a89f1444d4640d04ef58030f596ee0"}, - {file = "rapidfuzz-3.2.0-cp38-cp38-win32.whl", hash = "sha256:cf5ea3f1d65a0bee707245a0096c3a6f769b3ad6f1b9afc7176dfb73eb0ac98f"}, - 
{file = "rapidfuzz-3.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:54906095444ea8b0a4013f3799b3f2c380205d7f60b9c55774e7d2264fa8d9c6"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6d44218823533e0d47770feef86c73c90a6f7e8d4923eafabf56a1fa3444eda0"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:87c3d4077e61c66d5dd11198a317f83db8e8cf034239baa16e4384037b611652"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fc0e1142350566349c41173685988d942ebc89578f25ee27750d261e7d79e1ce"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de44a378751fdfb19ddf6af412b3395db4b21ab61f40139f815c82f1a1611b50"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca0983b30c7b289f540b11cdb550e301b3f2e8f0ef9df866aa24a16f6cd96041"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:adfffb79288437006be412d74e28cddd7c5e6cc9f84a34aa9c356b13dc1ad2c9"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a284386652efb3b7d41ed5dd101ab4ce5936f585c52a47fa9838fc0342235700"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c546c83d6bc9006b86f56921b92c3e16d8ddeb4e1663653e755a5d8a3ac258da"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:53b3575fa398a5021192c1592dce98965560ad00690be3ade056eab99288562c"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:366ade5d0067dc6281e2a6c9e5c91bbfe023b09cef86894de8fe480b4696e3bf"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:f946dec03cc2c77bc091d186c007d1e957d1f16a4d68a181f5fa75aea40bdf87"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = 
"sha256:045e5cccb0e792005d5465de0ea4621b9b67778580e558f266984704e68b0087"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:fd80288b9538c87209893f0934563c20b6a43acf30693794bcc111b294447ee9"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-win32.whl", hash = "sha256:a359436754ed5dd10d88706f076caa7f8e5c1469bf5ebba1897dc87aa9ff953e"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:75df3d9b895910ee810b2c96c8626cc2b5b63bb237762db36ff79fb466eccc43"}, - {file = "rapidfuzz-3.2.0-cp39-cp39-win_arm64.whl", hash = "sha256:893833a903875a50acdbcb7ed33b5426ba47412bd18b3eb80d56d982b641dc59"}, - {file = "rapidfuzz-3.2.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3002c3660180747243cccb40c95ade1960e6665b340f211a114f5994b345ab53"}, - {file = "rapidfuzz-3.2.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa50de7e0f95e1400b2bf38cfeb6e40cf87c862537871c2f7b2050b5db0a9dfc"}, - {file = "rapidfuzz-3.2.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54842a578a2a8e5258812a9032ffb55e6f1185490fd160cae64e57b4dc342297"}, - {file = "rapidfuzz-3.2.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:108861623838cd574b0faa3309ce8525c2086159de7f9e23ac263a987c070ebd"}, - {file = "rapidfuzz-3.2.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:d39128415f0b52be08c15eeee5f79288189933a4d6fa5dc5fff11e20614b7989"}, - {file = "rapidfuzz-3.2.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3af2b75635f33ffab84e295773c84a176d4cba75311d836ad79b6795e9da11ac"}, - {file = "rapidfuzz-3.2.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68c678f7f3ca3d83d1e1dd7fb7db3232037d9eef12a47f1d5fe248a76ca47571"}, - {file = "rapidfuzz-3.2.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25d2bd257034e910df0951cdeff337dbd086d7d90af3ed9f6721e7bba9fc388a"}, - {file = 
"rapidfuzz-3.2.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c7f20e68cad26fc140c6f2ac9e8f2632a0cd66e407ba3ea4ace63c669fd4719"}, - {file = "rapidfuzz-3.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:f09fd9dc73180deb9ca1c4fbd9cc27378f0ab6ee74e97318c38c5080708702b6"}, - {file = "rapidfuzz-3.2.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:af7914fc7683f921492f32314cfbe915a5376cc08a982e09084cbd9b866c9fd4"}, - {file = "rapidfuzz-3.2.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08a242c4b909abbcfa44504dc5041d5eeca4cd088ae51afd6a52b4dc61684fa2"}, - {file = "rapidfuzz-3.2.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:71b07afaca28398b93d727a2565491c455896898b66daee4664acde4af94e557"}, - {file = "rapidfuzz-3.2.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24e4c4a031c50e4eeb4787263319a0ac5bed20f4a263d28eac060150e3ba0018"}, - {file = "rapidfuzz-3.2.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d19c2853a464c7b98cc408654412fd875b030f78023ccbefc4ba9eec754e07e7"}, - {file = "rapidfuzz-3.2.0.tar.gz", hash = "sha256:448d031d9960fea7826d42bd4284156fc68d3b55a6946eb34ca5c6acf960577b"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6bec4903d4127d1eaa20a62105a03b38184ddaef40e18393caa1d98ae3de6a0c"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2e6c4580b0de835156671390959efad13741d0fb35cc355bc546d1dbf399db5e"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66b92484cc5ea1b546d2adef50407aa011df8c92fcc22ec9b9803eff2d917dcc"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5103c8f4aca404d1db4ba65c393d85d8a78f2547ce7d4a434921a4a1383aa67"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:3cf3d37e38e7a205758269cd8c8a2ae506214732ef2a82bb1ef01c695963b3f5"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:efb8cc7da41926e4e68773afcdb2fa9bb6a32caefbc297c818526232a58ad5d7"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c8b22e1973009e89ac0e1ad157ff978a15021c2acddfa15371456ef58156aa47"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:307c6b9e1e47afe9dc274e2e5bccb81be0941f90f395a38f77405f1d7216bc0a"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:404b6bf53ac0b2b0b1f901f51953e04b758bf6905e1ee1cc29001b1cdfa55316"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:dfccefebbda76796164f8ec6ec04999d635be2d86d83b09d703b8a1f312234c7"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:4f77055d29ab2af3d3be16d50ecabb3ade6ea61bb1768b578f84cf558be5ef1a"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:7ea934a589a7b3d522cbc358e9f8bdf6fae38c65d35596b12616f78c1c3089ec"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:536238d37c9918b235899cc0e330a45304ab3c25be963912b7a969b61bbb309f"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-win32.whl", hash = "sha256:c7070a163017739bfaf4c8c31d66d347d7ab401c4bdb136b268508c24410aa58"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:e45e5930d9f4f78f8d4b34baf4700f150b845cf8ed31bb2fb9149e29e07c6bb8"}, + {file = "rapidfuzz-3.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:783d082341785a832c65010a5dbba3f0c3d500f919edb25f076ddd5991fc8fba"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:12fd4e7c7d8a58fc43a9fbbe76b577c599403174740160937f852be4e78734e7"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:5a661302b2a93afd3cbeed7a2c43d671d65de1f503c129e745255507c8a91a24"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ba3421dd0f5048403acdad536b451d59bccda7b050144928c07d5830af1fb127"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b047aca009e7978a39b85f36a2ab3dbea2bec773d0cab739caa5c6c3e51fe051"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ed199d15bfac7a9692bba218f63d117b558f5e08d44c678e2bc9bb43931a701b"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d91e0d03dce17d5e80fc3f12c0c1d1b304f1ad7c26e79e9378236772ab5de393"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2c72aafd9f4a83d504c898473e084548ddd3fb2b2eb56121513a13807544a8d6"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:324bc1b508a32972bcf267d1fbf5fddf831da0bbb9c052ffaf733d0be30819f4"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b8fd22c2fc3218614991abff75989a55ca9d99c50f69376457246515ce95e27d"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:8f5793a577570211f5dc9b08a9c53d9b7e649372a6dcb8756f3eb823504778eb"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:1370bb6decb505b7fee362ffd2f111ca0c369e62a35eac35386b87a8c8f29a38"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:906ddf3902ac4537bf21b2140c9b089c5cf4b203fdba72b447d89d6e8137132b"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bb60f7d2bbdabcd41059ccb68a8aea2353f96147a8402fac6581391e7edf809a"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-win32.whl", hash = "sha256:ab7b2c2cb65075d68a9c0f28513ce5154c6e7520fe13b76755971eb135138e74"}, + {file = 
"rapidfuzz-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:cae783ac3830a20fe32e80c53a406654d3a75b9b5d3351e81ac75ce470f24ad2"}, + {file = "rapidfuzz-3.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:f4b8fd0acbbddaf0d96c1f01e949d645073ad54f8fee1a59af6aa914340ae331"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9558411d11dfcf85d4b080e0bc005114868e217c41f0a36cc13dc2c8ec91eacc"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0965b30b84687107658cb6dc0852d1e14e2a80a93036320264128c8940643db7"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5724ba0f4ac93ad43cc80407cbca2c598a36daf7f65c14279deaf3ad159f00ac"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6467e0adfcb43b4d5bdd92a009cf7c8b952189b943c55050a9f1a8cd8180865"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da5d006f329518eb797c9ed12e05bb8332663c3afa5d2a508032f64f7232766c"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:343f863df7000f212156ed030196ed20215f5231ca54749228a5d6a317b626e8"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:113fb1020be6f4727ecbd4fb29a510e14effe85910edf22cafb6c5d1eea75694"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78e0d6d061835b16ad42df98fe826e4a0a3380621568f80c6ee2fa230d8d7020"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:47ddf5d075e09d0baccc3d499c5eba36e2605771da65a6d95fcc72e22c5e36b9"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc354de84a2a56890a68a526b0d689dd010df1003794d24f222ee5ba6405d39d"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = 
"sha256:2a58279ee1595838dfc2b80562e0a89f6cb98b427c738b57ea146318604dba11"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f3b40aa31c7a970696967ff43d6d5bf3be4f6c008c9ad661cf8721af9c7c81fc"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7437bfcef4b2b1417731293b97343cb45f1ca46ced381d511cd601ea41b8ab49"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-win32.whl", hash = "sha256:bd6ec3ad3fa5a490dfce534bb87429c122faf6239c97d6c2763353ef61ddab08"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:3ef0cf46d84bc307100f48212966e58f7a55c6045cb4ce9fb3e386313e0fc3a1"}, + {file = "rapidfuzz-3.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:cfa1345d5007efc1bdad6f40d2bbdc42fc83bb6b9fcb8cd3cc830180ccf360b9"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fbafe4e45086b9245e12d97d0c4232e866a52469221acef05192bdb2a9b96a21"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e053d10f01f06d0e10a8a229f2d82845f6d5ec13d67b6a6c11910f49f6e46b2"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b2ca9583c9e0361144138f4884b59e7165daa56b5983f15bbc1441de3d548a9"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1441d18aa459e72c5aeacf6ec140878c2ec6debdbbabfd68cd5968ac07ac9b2b"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2de666fb4d3558847b229b4a06909557628a6a1fe5ccdc68e522eed90f442e6f"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b632178ac063e8ad5b8d8bf051bd2436170c3822e865eab63e45a3289b80683"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7f3da8741463e0345480e49d8bc89b83713d802c5c2851d590bd7ba1aaeace87"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = 
"sha256:4839f1760c7d5e1f1d01230065a111773f2f78277df5d66a55902bdef77f3f93"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:78641fecf74571d0f693cda9aaeb70db581c1df4f0ce6a9077b05558e7c5b6f5"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5df1d384283aa2491ecdc23d00d3b2c2f1ed745f9ca42813b3e51e39a180f9b2"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:b13dc341f4deb2f599db3fe0ad52ab87f7bb1fc09911b4823ac618925fa400e0"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-win32.whl", hash = "sha256:ca3c582e57df2407c5e07db26edb4ef19c2a7882ce2bf0fc1c5a6394986f84e5"}, + {file = "rapidfuzz-3.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:483299d57306c29a2bb1eb5d978f4d25a5e8d67ffee18b4155847bf9fc422b82"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4671cd2e0d7e861b7c7aea8ed529b93e7ebec9ec4f6858cd72c395a99074826b"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9bb2690f0d760ebfb4a943f51deb7b3d689b1fa7f87f8de1f005b19574f59259"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8a1bba2aa6af58ff04e9fdde40747dfdb3f1bc836bbb86533e317f9a4b6a607d"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a6f8edd755d29ac7a9804382780d4b383f494fed894514819932b9a484fa117"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1eb377265e4d2f9e6c21a0adc865dddd52b1cd90ecd2552e99b386bb1effe38a"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:319e67f8c1d7b9b45386821ace33edd289ec0995d80361775f5d6d15d684c6a6"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9837f4528071e5dc695fb30098d9b49341e62fab32ef5c15094be260df1a48e4"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:29acfc446d091e723caa65e2a7e1b2d7669ac927d02effc5cc636bcd2f41bba5"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:25704f21afa6fc767902e9d0b840623c008d28f58022904c282e26e4f38b770d"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e5051c47a2a00b10c62b51bf1c4aefd2adb34d837b56fe16cfe505db7b7cc2be"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a441e90ec5ac09746d3960221df1268e7a7f0b47978c388e7dcae83e23ae3462"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e7f8056826ba8342f6d9c199750cddce70d1d90254320494115c26480fac44cc"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:78c73efada660cc548d166f08970aeff0ee4dd0a66fa4f27bd24ac6c31551503"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-win32.whl", hash = "sha256:9acd9cd547089b8835ac36223888a43bdfd2492064a8c5ee00cdaf6ef010de4b"}, + {file = "rapidfuzz-3.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:93bdb512798f8226cd4b785a73c70efa582f26a7287d55337b9216b384946494"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5b8cdcf915c7058815321f2c0c30d20097722459bbb1fd2e1cae574bd03a39db"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:376b87d6b7c83b6b2923f0916fea6fb7288ab66b1b1f3b0cc39e601bb09488cf"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3363beb1da090be877e8a22ac6daefe8e0a6f6aeec10cebc4ec39db3abece897"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb1dbf0b3dcb5b9615ea5b90619eb5d7756c2d377770d53c4101ce728de53a8e"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5c4628e165910aeb96099dd42822ad32819511c2a4061dbf62169302d7299f6e"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:f25757cb21df937daed9727b628435390ce86af7734cbb3b5c055a1c57ffb434"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e4aad090c40474f26578a968e4fbcc9418a4292201a3a0f96a2290465f5aaeec"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84ddf7a03740420bfae76ca5d86f934389729cc231285b669fd3d1c913b84005"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:888c096f29f1dbfa6614ffc2780d2c766033e6c2906413d4d6d1f04e5cda05d7"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2a62d1435ef7d897ac37c5975d1f672f5e73857eb183bec821a174ed937f53fb"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ecc437ad773d40217e0a4704ee60002f7e699383dffbf576f41ed7ae6f4a8acc"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59462ea2f94c809fdea2426a7cd2fe219f171cc7d0dfdbc5681176f86884da4"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:68484409517617feccac3092ec879d5630253890e6895ffbe7880f063329d114"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-win32.whl", hash = "sha256:78abf9f6e3e60d4004f66085bf4618cb5480ca6155d39d17277db7d29388e49d"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:56dec09e716fb12c9fa10649b5980e4bad9563b2b7dc74776618b84603740f6c"}, + {file = "rapidfuzz-3.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:3dfe9f4cdd9f9087f7bdd7c9f4e9304557ca8c44d4a1b1eca69230535e9ab2df"}, + {file = "rapidfuzz-3.3.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:c089bf351657a16a31b911ddab3e2f19b04062f7c8244cea1ec5a40f490e0829"}, + {file = "rapidfuzz-3.3.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e737bd4f30917643c50694df1486ce1a5f869bbf523f38b867076a775ca1a00"}, + {file = "rapidfuzz-3.3.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:68800fe792718b5b2a5ff7febbb6e4cc551ef767704873ec04062f642c9f5901"}, + {file = "rapidfuzz-3.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:557a736c8c6e01e2d844211eb4f6f7913f54a912f6578fdf8d72312ae906929c"}, + {file = "rapidfuzz-3.3.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:e809367f75b1d65ea5524a6acfcc4dcced79f4f2d19dbad8f17175ad4864515b"}, + {file = "rapidfuzz-3.3.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a2cc8ea602e030dd5a220e537cc6bbb241ecfe293614415076d8045dd198acd8"}, + {file = "rapidfuzz-3.3.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0861477cb2d21ed3e3a96a98adba6b24ecaaf50021991fdd72794f963a8f8e9"}, + {file = "rapidfuzz-3.3.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7fa079737a22e5b098545476be428a90635bf7c89bf3ea5587fa2d07645b1569"}, + {file = "rapidfuzz-3.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ada0258e3d23518b74af2363bb4062cfe492ec0c7c4c752aff6cd084d6917830"}, + {file = "rapidfuzz-3.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:50b2b2b96b9c7841f6e2ab001153cd0bfcf707c427f20fed2f1f3849a99bd3fc"}, + {file = "rapidfuzz-3.3.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:062a5ed305a6d45798cf5548c780d4a434d1f188cc10b971c5c389d11fa356e7"}, + {file = "rapidfuzz-3.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18b51a2858e7adc950407bdb21382256d499472ba5c5d870eada0fa880d854eb"}, + {file = "rapidfuzz-3.3.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d0f0b3587ee7dd0f8d96078c33ba88e583dab8834dd658b18df29cfced360cc6"}, + {file = "rapidfuzz-3.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f9a40cc64ef814ae60d567c3c9ad01ce92243a9ed6746b31bcddebc1ecc2284"}, + {file = "rapidfuzz-3.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:cd254dc3436347a12e683c9d1984230228270009ff0985a38cbf5cbd25e8bc8c"}, + {file = "rapidfuzz-3.3.0.tar.gz", hash = "sha256:5e71bc5829f41e78b2d009431aedeb308ee3699d2bbbc68b7739db9b40bd1465"}, ] [package.extras] @@ -883,13 +899,13 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "soupsieve" -version = "2.4.1" +version = "2.5" description = "A modern CSS selector implementation for Beautiful Soup." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"}, - {file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"}, + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, ] [[package]] @@ -939,13 +955,13 @@ telegram = ["requests"] [[package]] name = "typing-extensions" -version = "4.7.1" -description = "Backported and Experimental Type Hints for Python 3.7+" +version = "4.8.0" +description = "Backported and Experimental Type Hints for Python 3.8+" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, - {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, + {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"}, + {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"}, ] [[package]] @@ -980,7 +996,7 @@ files = [ name = "whoosh" version = "2.7.4" description = "Fast, pure-Python 
full text indexing, search, and spell checking library." -optional = false +optional = true python-versions = "*" files = [ {file = "Whoosh-2.7.4-py2.py3-none-any.whl", hash = "sha256:aa39c3c3426e3fd107dcb4bde64ca1e276a65a889d9085a6e4b54ba82420a852"}, @@ -988,7 +1004,10 @@ files = [ {file = "Whoosh-2.7.4.zip", hash = "sha256:e0857375f63e9041e03fedd5b7541f97cf78917ac1b6b06c1fcc9b45375dda69"}, ] +[extras] +whoosh = ["whoosh"] + [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "a3af36ed2941235df158c20ba9b66bdf5a0af0554235fd004ec77c3e88def3c3" +content-hash = "9eda99eff02c403365445d0e75fcc21ed7649a59407d358d0cd429a034cbc6c9" From 7ee8d4911e0ee2e323e29218dd4d30dad238ce97 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 21 Sep 2023 21:54:24 +0200 Subject: [PATCH 15/22] refactor: Make formatters functions Formatters have been classes so far which contained some data (the template to use for formatting and the annotations and documents to format) and the actual formatting logic (an execute function). However, we can inject the annotations to be formatted and the templates so far are static only, so they can be simple variables (we can think about how to inject them at another point should it come up, no bikeshedding now). This way, we can simply pass around one function per formatter, which should make the code much lighter, easier to add to and especially less stateful which means fewer areas of broken interactions to worry about. 
--- papis_extract/__init__.py | 21 +++--- papis_extract/exporter.py | 17 +++-- papis_extract/formatter.py | 142 +++++++++++++------------------------ 3 files changed, 70 insertions(+), 110 deletions(-) diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index f8f6941..9f9ad17 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -8,12 +8,7 @@ import papis.strings from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.formatter import ( - CountFormatter, - CsvFormatter, - MarkdownFormatter, - Formatter, -) +from papis_extract.formatter import Formatter, format_count, format_csv, format_markdown logger = papis.logging.get_logger(__name__) @@ -82,11 +77,11 @@ def main( return if template == "csv": - formatter = CsvFormatter() + formatter = format_csv elif template == "count": - formatter = CountFormatter() + formatter = format_count else: - formatter = MarkdownFormatter() + formatter = format_markdown run(documents, edit=manual, write=write, git=git, formatter=formatter) @@ -98,8 +93,10 @@ def run( write: bool = False, git: bool = False, ) -> None: - formatter.annotated_docs = extractor.start(documents) + annotated_docs = extractor.start(documents) if write: - exporter.to_notes(formatter, edit=edit, git=git) + exporter.to_notes( + formatter=formatter, annotated_docs=annotated_docs, edit=edit, git=git + ) else: - exporter.to_stdout(formatter) + exporter.to_stdout(formatter=formatter, annotated_docs=annotated_docs) diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py index 8f03ebd..5d22270 100644 --- a/papis_extract/exporter.py +++ b/papis_extract/exporter.py @@ -6,33 +6,35 @@ import papis.api import papis.git import papis.config import Levenshtein +from papis_extract.annotation import AnnotatedDocument from papis_extract.formatter import Formatter logger = papis.logging.get_logger(__name__) -def to_stdout(formatter: Formatter) -> None: +def to_stdout(formatter: Formatter, 
annotated_docs: list[AnnotatedDocument]) -> None: """Pretty print annotations to stdout. Gives a nice human-readable representations of the annotations in somewhat of a list form. Not intended for machine-readability. """ - output:str = formatter.execute() - print(output.rstrip('\n')) + output: str = formatter(annotated_docs) + print(output.rstrip("\n")) -def to_notes(formatter: Formatter, edit: bool, git: bool) -> None: +def to_notes( + formatter: Formatter, annotated_docs: list[AnnotatedDocument], edit: bool, git: bool +) -> None: """Write annotations into document notes. Permanently writes the given annotations into notes belonging to papis documents. Creates new notes for documents missing a note field or appends to existing. """ - annotated_docs = formatter.annotated_docs for entry in annotated_docs: - formatted_annotations = formatter.execute(entry).split("\n") + formatted_annotations = formatter([entry]).split("\n") if formatted_annotations: _add_annots_to_note(entry.document, formatted_annotations) @@ -67,7 +69,8 @@ def _add_annots_to_note( # add newline if theres no empty space at file end if len(existing) > 0 and existing[-1].strip() != "": f.write("\n") - f.write("\n".join(new_annotations)) + print(new_annotations) + f.write("\n\n".join(new_annotations)) f.write("\n") logger.info( f"Wrote {len(new_annotations)} " diff --git a/papis_extract/formatter.py b/papis_extract/formatter.py index 4ba782d..75056ea 100644 --- a/papis_extract/formatter.py +++ b/papis_extract/formatter.py @@ -1,114 +1,74 @@ -from dataclasses import dataclass, field -from typing import Protocol +from collections.abc import Callable from papis_extract.annotation import AnnotatedDocument - -@dataclass -class Formatter(Protocol): - annotated_docs: list[AnnotatedDocument] - header: str - string: str - footer: str - - def execute(self, doc: AnnotatedDocument | None = None) -> str: - raise NotImplementedError +Formatter = Callable[[list[AnnotatedDocument]], str] -@dataclass -class 
MarkdownFormatter: - annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) - header: str = "" - string: str = ( +def format_markdown(docs: list[AnnotatedDocument] = []) -> str: + template = ( "{{#tag}}#{{tag}}\n{{/tag}}" - "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}\n" - "{{#note}} NOTE: {{note}}{{/note}}" + "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}" + "\n{{#note}} NOTE: {{note}}{{/note}}" ) - footer: str = "" + output = "" + for entry in docs: + if not entry.annotations: + continue - def execute(self, doc: AnnotatedDocument | None = None) -> str: - output = "" - documents = self.annotated_docs if doc is None else [doc] - last = documents[-1] - for entry in documents: - if not entry.annotations: - continue + title_decoration = ( + f"{'=' * len(entry.document.get('title', ''))} " + f"{'-' * len(entry.document.get('author', ''))}" + ) + output += ( + f"{title_decoration}\n" + f"{entry.document['title']} - {entry.document['author']}\n" + f"{title_decoration}\n\n" + ) + for a in entry.annotations: + output += a.format(template) + output += "\n" - title_decoration = ( - f"{'=' * len(entry.document.get('title', ''))} " - f"{'-' * len(entry.document.get('author', ''))}" - ) - output += ( - f"{title_decoration}\n" - f"{entry.document['title']} - {entry.document['author']}\n" - f"{title_decoration}\n\n" - ) - for a in entry.annotations: - output += a.format(self.string) + output += "\n\n\n" - if entry != last: - output += "\n\n\n" - - return output + return output -@dataclass -class CountFormatter: - annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) - header: str = "" - string: str = "" - footer: str = "" +def format_count(docs: list[AnnotatedDocument] = []) -> str: + output = "" + for entry in docs: + if not entry.annotations: + continue - def execute(self, doc: AnnotatedDocument | None = None) -> str: - documents = self.annotated_docs if doc is None else [doc] - output = "" 
- for entry in documents: - if not entry.annotations: - continue + count = 0 + for _ in entry.annotations: + count += 1 - count = 0 - for _ in entry.annotations: - count += 1 + d = entry.document + output += ( + f"{d['author'] if 'author' in d else ''}" + f"{' - ' if 'author' in d else ''}" # only put separator if author + f"{entry.document['title'] if 'title' in d else ''}: " + f"{count}\n" + ) - d = entry.document - output += ( - f"{d['author'] if 'author' in d else ''}" - f"{' - ' if 'author' in d else ''}" # only put separator if author - f"{entry.document['title'] if 'title' in d else ''}: " - f"{count}\n" - ) - - return output + return output -@dataclass -class CsvFormatter: - annotated_docs: list[AnnotatedDocument] = field(default_factory=lambda: list()) +def format_csv(docs: list[AnnotatedDocument] = []) -> str: header: str = "type,tag,page,quote,note,author,title,ref,file" - string: str = ( + template: str = ( '{{type}},{{tag}},{{page}},"{{quote}}","{{note}}",' '"{{doc.author}}","{{doc.title}}","{{doc.ref}}","{{file}}"' ) - footer: str = "" + output = f"{header}\n" + for entry in docs: + if not entry.annotations: + continue - def execute(self, doc: AnnotatedDocument | None = None) -> str: - documents = self.annotated_docs if doc is None else [doc] - output = f"{self.header}\n" - for entry in documents: - if not entry.annotations: - continue + d = entry.document + for a in entry.annotations: + output += a.format(template, doc=d) + output += "\n" - d = entry.document - for a in entry.annotations: - output += a.format(self.string, doc=d) - output += "\n" - - return output - - -@dataclass -class CustomFormatter: - def __init__(self, header: str = "", string: str = "", footer: str = "") -> None: - self.header = header - self.string = string - self.footer = footer + return output From ee4690f52bc33f1ff9e7c13d5f3153abacd82219 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 21 Sep 2023 22:01:51 +0200 Subject: [PATCH 16/22] feat: Add atx-style markdown Added 
markdown with atx style headers, can be chosen as alternative markdown template on the cli. The existing 'markdown' template will still default to setext-style headers. --- README.md | 6 +++++- papis_extract/__init__.py | 17 ++++++++-------- papis_extract/formatter.py | 40 ++++++++++++++++++++++++++++---------- 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 95ec9ce..b65623a 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,9 @@ To output annotations in a markdown-compatible syntax (the default), do: papis extract --template markdown ``` +There are sub-variants of the formatter for atx-style headers, with `--template markdown-atx` (`# Headings`), +or setext-style with `--template markdown-setext` (the default style). + To instead see them in a csv syntax simply invoke: ```bash @@ -227,7 +230,8 @@ features to be implemented: - [ ] allow custom colors -> tag name settings not dependent on color name existing (e.g. {"important": (1.0,0.0,0.0)}) - [ ] `--overwrite` mode where existing annotations are not dropped but overwritten on same line of note - [ ] `--force` mode where we simply do not drop anything -- [ ] `--format` option to choose from default or set up a custom formatter +- [x] `--format` option to choose from default or set up a custom formatter + - called `--template` in current implementation - [ ] on_add hook to extract annotations as files are added - needs upstream help, 'on_add' hook, and pass-through of affected documents diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index 9f9ad17..e679e6a 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -8,7 +8,10 @@ import papis.strings from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.formatter import Formatter, format_count, format_csv, format_markdown +from papis_extract.formatter import ( + Formatter, + formatters +) logger = papis.logging.get_logger(__name__) @@ -43,7 
+46,10 @@ papis.config.register_default_settings(DEFAULT_OPTIONS) @click.option( "--template", "-t", - type=click.Choice(["markdown", "count", "csv"], case_sensitive=False), + type=click.Choice( + ["markdown", "markdown-setext", "markdown-atx", "count", "csv"], + case_sensitive=False, + ), help="Choose an output template to format annotations with.", ) def main( @@ -76,12 +82,7 @@ def main( logger.warning(papis.strings.no_documents_retrieved_message) return - if template == "csv": - formatter = format_csv - elif template == "count": - formatter = format_count - else: - formatter = format_markdown + formatter = formatters[template] run(documents, edit=manual, write=write, git=git, formatter=formatter) diff --git a/papis_extract/formatter.py b/papis_extract/formatter.py index 75056ea..b00a2ee 100644 --- a/papis_extract/formatter.py +++ b/papis_extract/formatter.py @@ -5,7 +5,9 @@ from papis_extract.annotation import AnnotatedDocument Formatter = Callable[[list[AnnotatedDocument]], str] -def format_markdown(docs: list[AnnotatedDocument] = []) -> str: +def format_markdown( + docs: list[AnnotatedDocument] = [], atx_headings: bool = False +) -> str: template = ( "{{#tag}}#{{tag}}\n{{/tag}}" "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. 
{{page}}]{{/page}}" @@ -16,15 +18,16 @@ def format_markdown(docs: list[AnnotatedDocument] = []) -> str: if not entry.annotations: continue - title_decoration = ( - f"{'=' * len(entry.document.get('title', ''))} " - f"{'-' * len(entry.document.get('author', ''))}" - ) - output += ( - f"{title_decoration}\n" - f"{entry.document['title']} - {entry.document['author']}\n" - f"{title_decoration}\n\n" - ) + heading = f"{entry.document['title']} - {entry.document['author']}\n" + if atx_headings: + output += f"# {heading}\n" + else: + title_decoration = ( + f"{'=' * len(entry.document.get('title', ''))} " + f"{'-' * len(entry.document.get('author', ''))}" + ) + output += f"{title_decoration}\n" f"{heading}" f"{title_decoration}\n\n" + for a in entry.annotations: output += a.format(template) output += "\n" @@ -34,6 +37,14 @@ def format_markdown(docs: list[AnnotatedDocument] = []) -> str: return output +def format_markdown_atx(docs: list[AnnotatedDocument] = []) -> str: + return format_markdown(docs, atx_headings=True) + + +def format_markdown_setext(docs: list[AnnotatedDocument] = []) -> str: + return format_markdown(docs, atx_headings=False) + + def format_count(docs: list[AnnotatedDocument] = []) -> str: output = "" for entry in docs: @@ -72,3 +83,12 @@ def format_csv(docs: list[AnnotatedDocument] = []) -> str: output += "\n" return output + + +formatters: dict[str, Formatter] = { + "count": format_count, + "csv": format_csv, + "markdown": format_markdown, + "markdown_atx": format_markdown_atx, + "markdown_setext": format_markdown_setext, +} From 1e29642cba6ca47333bcdf9372cfcb7d910355f7 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Fri, 22 Sep 2023 20:04:39 +0200 Subject: [PATCH 17/22] test: Fix formatting and annotation tests --- papis_extract/formatter.py | 12 +++--- tests/test_annotation.py | 31 ++++++++++---- tests/test_formatting.py | 86 +++++++++++++++++++++----------------- 3 files changed, 77 insertions(+), 52 deletions(-) diff --git a/papis_extract/formatter.py 
b/papis_extract/formatter.py index b00a2ee..75ed556 100644 --- a/papis_extract/formatter.py +++ b/papis_extract/formatter.py @@ -10,8 +10,8 @@ def format_markdown( ) -> str: template = ( "{{#tag}}#{{tag}}\n{{/tag}}" - "{{#quote}}> {{quote}}{{/quote}} {{#page}}[p. {{page}}]{{/page}}" - "\n{{#note}} NOTE: {{note}}{{/note}}" + "{{#quote}}> {{quote}}{{/quote}}{{#page}} [p. {{page}}]{{/page}}" + "{{#note}}\n NOTE: {{note}}{{/note}}" ) output = "" for entry in docs: @@ -30,11 +30,11 @@ def format_markdown( for a in entry.annotations: output += a.format(template) - output += "\n" + output += "\n\n" output += "\n\n\n" - return output + return output.rstrip() def format_markdown_atx(docs: list[AnnotatedDocument] = []) -> str: @@ -63,7 +63,7 @@ def format_count(docs: list[AnnotatedDocument] = []) -> str: f"{count}\n" ) - return output + return output.rstrip() def format_csv(docs: list[AnnotatedDocument] = []) -> str: @@ -82,7 +82,7 @@ def format_csv(docs: list[AnnotatedDocument] = []) -> str: output += a.format(template, doc=d) output += "\n" - return output + return output.rstrip() formatters: dict[str, Formatter] = { diff --git a/tests/test_annotation.py b/tests/test_annotation.py index 60b35ed..c14ee93 100644 --- a/tests/test_annotation.py +++ b/tests/test_annotation.py @@ -1,25 +1,23 @@ +from papis.document import Document import pytest -from papis_extract.annotation_data import Annotation -from papis_extract.templating import Custom +from papis_extract.annotation import Annotation @pytest.mark.parametrize( "fmt_string,expected", [ - (Custom(string="{{quote}}"), "I am the text value"), + ("{{quote}}", "I am the text value"), ( - Custom(string="> {{quote}}\n{{#note}}Note: {{note}}{{/note}}"), + "> {{quote}}\n{{#note}}Note: {{note}}{{/note}}", "> I am the text value\nNote: Whereas I represent the note", ), ( - Custom( - string="{{#note}}Note: {{note}}{{/note}}{{#page}}, p. {{page}}{{/page}}" - ), + "{{#note}}Note: {{note}}{{/note}}{{#page}}, p. 
{{page}}{{/page}}", "Note: Whereas I represent the note", ), ], ) -def test_formatting(fmt_string, expected): +def test_formatting_replacements(fmt_string, expected): sut = Annotation( "myfile", text="I am the text value", @@ -28,6 +26,23 @@ def test_formatting(fmt_string, expected): assert sut.format(fmt_string) == expected +@pytest.mark.parametrize( + "fmt_string,expected", + [ + ("{{doc.title}}", "document-title"), + ("{{doc.title}}-{{doc.author}}", "document-title-document-author"), + ("{{quote}} ({{doc.author}})", "I am the text value (document-author)"), + ] +) +def test_formatting_document_access(fmt_string, expected): + sut = Annotation( + "myfile", + text="I am the text value", + content="Whereas I represent the note", + ) + doc = Document(data= {"title": "document-title", "author": "document-author"}) + + assert sut.format(fmt_string, doc=doc) == expected def test_colorname_matches_exact(): sut = Annotation("testfile", colors=(1.0, 0.0, 0.0), minimum_similarity_color=1.0) diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 5ab53c2..5aaef70 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -1,49 +1,59 @@ -import chevron +from papis.document import Document +from papis_extract.annotation import AnnotatedDocument, Annotation -from papis_extract.templating import Markdown, Csv +from papis_extract.formatter import ( + format_count, + format_csv, + format_markdown, + format_markdown_atx, +) - -def test_template_markers(): - ... 
+an_doc: AnnotatedDocument = AnnotatedDocument( + Document(data={"author": "document-author", "title": "document-title"}), + [ + Annotation("myfile.pdf", text="my lovely text"), + Annotation("myfile.pdf", text="my second text", content="with note"), + ], +) def test_markdown_default(): - fmt = Markdown() - assert ( - chevron.render( - fmt.string, - { - "file": "somefile/somewhere.pdf", - "quote": "I am quote", - "note": "and including note.", - "page": 46, - "tag": "important", - "type": "highlight", - }, - ) - == "#important\n> I am quote [p. 46]\n NOTE: and including note." + fmt = format_markdown + assert fmt([an_doc]) == ( + """============== --------------- +document-title - document-author +============== --------------- + +> my lovely text + +> my second text + NOTE: with note""" ) -def test_csv_string(): - fmt = Csv() - assert ( - chevron.render( - fmt.string, - { - "file": "somefile/somewhere.pdf", - "quote": "I am quote", - "note": "and including note.", - "page": 46, - "tag": "important", - "type": "highlight", - }, - ) - == "highlight, important, 46, " - "I am quote, and including note., somefile/somewhere.pdf" +def test_markdown_atx(): + fmt = format_markdown_atx + assert fmt([an_doc]) == ( + """# document-title - document-author + +> my lovely text + +> my second text + NOTE: with note""" ) -def test_csv_header(): - fmt = Csv() - assert chevron.render(fmt.header, {}) == "type, tag, page, quote, note, file" +def test_count_default(): + fmt = format_count + assert fmt([an_doc]) == ("""document-author - document-title: 2""") + + +def test_csv_default(): + fmt = format_csv + assert fmt([an_doc]) == ( + "type,tag,page,quote,note,author,title,ref,file\n" + 'Highlight,,0,"my lovely text","","document-author",' + '"document-title","","myfile.pdf"\n' + 'Highlight,,0,"my second text","with note","document-author",' + '"document-title","","myfile.pdf"' + ) From 2700e4adc3b73b0ef00e0def87ecb47231092a6a Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Fri, 22 Sep 
2023 21:53:55 +0200 Subject: [PATCH 18/22] test: Add code coverage dev dependency --- poetry.lock | 66 +++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 963b69b..886f483 100644 --- a/poetry.lock +++ b/poetry.lock @@ -183,6 +183,70 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "coverage" +version = "7.3.1" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "coverage-7.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cd0f7429ecfd1ff597389907045ff209c8fdb5b013d38cfa7c60728cb484b6e3"}, + {file = "coverage-7.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:966f10df9b2b2115da87f50f6a248e313c72a668248be1b9060ce935c871f276"}, + {file = "coverage-7.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0575c37e207bb9b98b6cf72fdaaa18ac909fb3d153083400c2d48e2e6d28bd8e"}, + {file = "coverage-7.3.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:245c5a99254e83875c7fed8b8b2536f040997a9b76ac4c1da5bff398c06e860f"}, + {file = "coverage-7.3.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c96dd7798d83b960afc6c1feb9e5af537fc4908852ef025600374ff1a017392"}, + {file = "coverage-7.3.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:de30c1aa80f30af0f6b2058a91505ea6e36d6535d437520067f525f7df123887"}, + {file = "coverage-7.3.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:50dd1e2dd13dbbd856ffef69196781edff26c800a74f070d3b3e3389cab2600d"}, + {file = "coverage-7.3.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b9c0c19f70d30219113b18fe07e372b244fb2a773d4afde29d5a2f7930765136"}, + {file = 
"coverage-7.3.1-cp310-cp310-win32.whl", hash = "sha256:770f143980cc16eb601ccfd571846e89a5fe4c03b4193f2e485268f224ab602f"}, + {file = "coverage-7.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:cdd088c00c39a27cfa5329349cc763a48761fdc785879220d54eb785c8a38520"}, + {file = "coverage-7.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:74bb470399dc1989b535cb41f5ca7ab2af561e40def22d7e188e0a445e7639e3"}, + {file = "coverage-7.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:025ded371f1ca280c035d91b43252adbb04d2aea4c7105252d3cbc227f03b375"}, + {file = "coverage-7.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6191b3a6ad3e09b6cfd75b45c6aeeffe7e3b0ad46b268345d159b8df8d835f9"}, + {file = "coverage-7.3.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7eb0b188f30e41ddd659a529e385470aa6782f3b412f860ce22b2491c89b8593"}, + {file = "coverage-7.3.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c8f0df9dfd8ff745bccff75867d63ef336e57cc22b2908ee725cc552689ec8"}, + {file = "coverage-7.3.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7eb3cd48d54b9bd0e73026dedce44773214064be93611deab0b6a43158c3d5a0"}, + {file = "coverage-7.3.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ac3c5b7e75acac31e490b7851595212ed951889918d398b7afa12736c85e13ce"}, + {file = "coverage-7.3.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5b4ee7080878077af0afa7238df1b967f00dc10763f6e1b66f5cced4abebb0a3"}, + {file = "coverage-7.3.1-cp311-cp311-win32.whl", hash = "sha256:229c0dd2ccf956bf5aeede7e3131ca48b65beacde2029f0361b54bf93d36f45a"}, + {file = "coverage-7.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:c6f55d38818ca9596dc9019eae19a47410d5322408140d9a0076001a3dcb938c"}, + {file = "coverage-7.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5289490dd1c3bb86de4730a92261ae66ea8d44b79ed3cc26464f4c2cde581fbc"}, + {file 
= "coverage-7.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ca833941ec701fda15414be400c3259479bfde7ae6d806b69e63b3dc423b1832"}, + {file = "coverage-7.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd694e19c031733e446c8024dedd12a00cda87e1c10bd7b8539a87963685e969"}, + {file = "coverage-7.3.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aab8e9464c00da5cb9c536150b7fbcd8850d376d1151741dd0d16dfe1ba4fd26"}, + {file = "coverage-7.3.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87d38444efffd5b056fcc026c1e8d862191881143c3aa80bb11fcf9dca9ae204"}, + {file = "coverage-7.3.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:8a07b692129b8a14ad7a37941a3029c291254feb7a4237f245cfae2de78de037"}, + {file = "coverage-7.3.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:2829c65c8faaf55b868ed7af3c7477b76b1c6ebeee99a28f59a2cb5907a45760"}, + {file = "coverage-7.3.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1f111a7d85658ea52ffad7084088277135ec5f368457275fc57f11cebb15607f"}, + {file = "coverage-7.3.1-cp312-cp312-win32.whl", hash = "sha256:c397c70cd20f6df7d2a52283857af622d5f23300c4ca8e5bd8c7a543825baa5a"}, + {file = "coverage-7.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:5ae4c6da8b3d123500f9525b50bf0168023313963e0e2e814badf9000dd6ef92"}, + {file = "coverage-7.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ca70466ca3a17460e8fc9cea7123c8cbef5ada4be3140a1ef8f7b63f2f37108f"}, + {file = "coverage-7.3.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f2781fd3cabc28278dc982a352f50c81c09a1a500cc2086dc4249853ea96b981"}, + {file = "coverage-7.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6407424621f40205bbe6325686417e5e552f6b2dba3535dd1f90afc88a61d465"}, + {file = 
"coverage-7.3.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:04312b036580ec505f2b77cbbdfb15137d5efdfade09156961f5277149f5e344"}, + {file = "coverage-7.3.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac9ad38204887349853d7c313f53a7b1c210ce138c73859e925bc4e5d8fc18e7"}, + {file = "coverage-7.3.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:53669b79f3d599da95a0afbef039ac0fadbb236532feb042c534fbb81b1a4e40"}, + {file = "coverage-7.3.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:614f1f98b84eb256e4f35e726bfe5ca82349f8dfa576faabf8a49ca09e630086"}, + {file = "coverage-7.3.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f1a317fdf5c122ad642db8a97964733ab7c3cf6009e1a8ae8821089993f175ff"}, + {file = "coverage-7.3.1-cp38-cp38-win32.whl", hash = "sha256:defbbb51121189722420a208957e26e49809feafca6afeef325df66c39c4fdb3"}, + {file = "coverage-7.3.1-cp38-cp38-win_amd64.whl", hash = "sha256:f4f456590eefb6e1b3c9ea6328c1e9fa0f1006e7481179d749b3376fc793478e"}, + {file = "coverage-7.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f12d8b11a54f32688b165fd1a788c408f927b0960984b899be7e4c190ae758f1"}, + {file = "coverage-7.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f09195dda68d94a53123883de75bb97b0e35f5f6f9f3aa5bf6e496da718f0cb6"}, + {file = "coverage-7.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6601a60318f9c3945be6ea0f2a80571f4299b6801716f8a6e4846892737ebe4"}, + {file = "coverage-7.3.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07d156269718670d00a3b06db2288b48527fc5f36859425ff7cec07c6b367745"}, + {file = "coverage-7.3.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:636a8ac0b044cfeccae76a36f3b18264edcc810a76a49884b96dd744613ec0b7"}, + {file = 
"coverage-7.3.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5d991e13ad2ed3aced177f524e4d670f304c8233edad3210e02c465351f785a0"}, + {file = "coverage-7.3.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:586649ada7cf139445da386ab6f8ef00e6172f11a939fc3b2b7e7c9082052fa0"}, + {file = "coverage-7.3.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4aba512a15a3e1e4fdbfed2f5392ec221434a614cc68100ca99dcad7af29f3f8"}, + {file = "coverage-7.3.1-cp39-cp39-win32.whl", hash = "sha256:6bc6f3f4692d806831c136c5acad5ccedd0262aa44c087c46b7101c77e139140"}, + {file = "coverage-7.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:553d7094cb27db58ea91332e8b5681bac107e7242c23f7629ab1316ee73c4981"}, + {file = "coverage-7.3.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:220eb51f5fb38dfdb7e5d54284ca4d0cd70ddac047d750111a68ab1798945194"}, + {file = "coverage-7.3.1.tar.gz", hash = "sha256:6cb7fe1581deb67b782c153136541e20901aa312ceedaf1467dcb35255787952"}, +] + +[package.extras] +toml = ["tomli"] + [[package]] name = "dominate" version = "2.8.0" @@ -1010,4 +1074,4 @@ whoosh = ["whoosh"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "9eda99eff02c403365445d0e75fcc21ed7649a59407d358d0cd429a034cbc6c9" +content-hash = "dac37f6a903033ed95f65a69f7ca90a115aa390284835c2a8e82ea3fc0a196ab" diff --git a/pyproject.toml b/pyproject.toml index 8f52f48..8551182 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ whoosh = ["whoosh"] [tool.poetry.group.dev.dependencies] pytest = "^7.4.0" +coverage = "^7.3.1" [tool.poetry.plugins."papis.command"] extract = "papis_extract:main" From f67ac8cdb35e7256f8f0be3738fe9c1289de09ee Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 12 Oct 2023 19:26:41 +0200 Subject: [PATCH 19/22] chore: Fix markdown lint issues --- .markdownlint.yaml | 6 ++++++ README.md | 22 ++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 .markdownlint.yaml diff --git a/.markdownlint.yaml 
b/.markdownlint.yaml new file mode 100644 index 0000000..9e90f00 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,6 @@ +MD013: false + +MD025: false + +MD007: + indent: 4 diff --git a/README.md b/README.md index b65623a..70d7afc 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Quickly extract annotations from your pdf files with the help of the [papis](https://github.com/papis/papis) bibliography manager.\ Easily organize all your highlights and thoughts next to your documents and references.\ -## Installation: +## Installation You can install through pip with `pip install git+https://git.martyoeh.me/Marty/papis-extract.git`. @@ -24,7 +24,8 @@ You will be set up with the default options but if you want to change anything, > **Note** > This plugin is still in fairly early development. It does what I need it to do, but if you have a meticulously organized library *please* make backups before doing any operation which could affect your notes, or make use of the papis-included git options. -## Usage: + +## Usage `papis extract [OPTIONS] [QUERY]` @@ -117,6 +118,7 @@ so you will be doubling up your annotations. ## Configuration ### Basic configuration + Add `extract` plugin settings to your papis `config` file (usually `~/.config/papis/config`): You will rarely have to set everything explained in the next few paragraphs - in fact you can use the plugin without having to set up any of it if you are happy with the defaults. @@ -148,10 +150,10 @@ regardless of this setting. ### Automatic tagging -By supplying the tags option with a valid python dictionary of the form `{"tag": "color", "tag2": "color2"}`, +By supplying the tags option with a valid python dictionary of the form `{"tag": "color", "tag2": "color2"}`, you can enable automatic tagging for your annotations. -You thus ascribe specific meanings to the colors you use in highlighting. +You thus ascribe specific meanings to the colors you use in highlighting. 
For example, if you always highlight the most essential arguments and findings in red and always highlight things you have to follow up on in blue, you can assign the meanings 'important' and 'todo' to them respectively as follows: @@ -174,7 +176,7 @@ minimum_similarity_content: 0.9, # for checking if highlight or note minimum_similarity_color: 0.833, # for matching tag to color ``` -`minimum_similarity` sets the required similarity of an annotation with existing annotations in your notes to be dropped. +`minimum_similarity` sets the required similarity of an annotation with existing annotations in your notes to be dropped. Annotations you have in notes might change if you for example fix small spacing mistakes or a letter/punctuation that has been falsely recognized in the PDF or change similar things. Generally, this should be fine as it is but you should change this value if you either get new annotations dropped though they should be added (decrease the value) or annotations are added duplicating existing ones (increase the value). @@ -230,7 +232,7 @@ features to be implemented: - [ ] allow custom colors -> tag name settings not dependent on color name existing (e.g. {"important": (1.0,0.0,0.0)}) - [ ] `--overwrite` mode where existing annotations are not dropped but overwritten on same line of note - [ ] `--force` mode where we simply do not drop anything -- [x] `--format` option to choose from default or set up a custom formatter +- [x] `--format` option to choose from default or set up a custom formatter - called `--template` in current implementation - [ ] on_add hook to extract annotations as files are added - needs upstream help, 'on_add' hook, and pass-through of affected documents @@ -249,11 +251,11 @@ This plugin makes an effort to find the right combination and extract the writte as well as any additional notes made - but things *will* slip through or extract weirdly every now and again. 
-Secondly, a note on the pages: I use the page number that the mupdf library gives me when it +Secondly, a note on the pages: I use the page number that the mupdf library gives me when it extracts anything from the pdf file. Sometimes that number will be correct for the document, sometimes it will however be the number of the *pdf document* internally. This can happen if e.g. an article or a book has frontmatter without numbering scheme or with a different one. -Sometimes the correct pages will still be embedded in the pdf and everything will work, +Sometimes the correct pages will still be embedded in the pdf and everything will work, others it won't. So always double check your page numbers! I am not sure if there is much I can do about these issues for now. @@ -266,7 +268,7 @@ and for myself whenever I forget. The basic building blocks currently in here ar : Extract data from a source file attached to a papis document. - annotations -: The actual extracted blocks of text, containing some metadata +: The actual extracted blocks of text, containing some metadata info as well, such as their color, type, page. - exporters @@ -277,7 +279,7 @@ and for myself whenever I forget. The basic building blocks currently in here ar such as a markdown syntax or csv-structure. Splitting it into those three building blocks makes it easier to recombine them in any way, -should someone want to save highlights as csv data in their notes, +should someone want to save highlights as csv data in their notes, or should we ever include more extractors than the one for PDFs. 
--- From c9736a5f32734fe0b19333308f850455ec863dab Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 12 Oct 2023 19:27:16 +0200 Subject: [PATCH 20/22] test: Add tests for formatter sad paths --- tests/test_formatting.py | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 5aaef70..d9a0d0f 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -6,6 +6,7 @@ from papis_extract.formatter import ( format_csv, format_markdown, format_markdown_atx, + format_markdown_setext, ) an_doc: AnnotatedDocument = AnnotatedDocument( @@ -15,12 +16,7 @@ an_doc: AnnotatedDocument = AnnotatedDocument( Annotation("myfile.pdf", text="my second text", content="with note"), ], ) - - -def test_markdown_default(): - fmt = format_markdown - assert fmt([an_doc]) == ( - """============== --------------- +md_default_output = """============== --------------- document-title - document-author ============== --------------- @@ -28,7 +24,11 @@ document-title - document-author > my second text NOTE: with note""" - ) + + +def test_markdown_default(): + fmt = format_markdown + assert fmt([an_doc]) == md_default_output def test_markdown_atx(): @@ -43,6 +43,11 @@ def test_markdown_atx(): ) +def test_markdown_setext(): + fmt = format_markdown_setext + assert fmt([an_doc]) == md_default_output + + def test_count_default(): fmt = format_count assert fmt([an_doc]) == ("""document-author - document-title: 2""") @@ -57,3 +62,19 @@ def test_csv_default(): 'Highlight,,0,"my second text","with note","document-author",' '"document-title","","myfile.pdf"' ) + + +# sadpath - no annotations contained for each format +def test_markdown_no_annotations(): + d: AnnotatedDocument = AnnotatedDocument(Document(data={}), []) + assert format_markdown([d]) == "" + + +def test_count_no_annotations(): + d: AnnotatedDocument = AnnotatedDocument(Document(data={}), []) + assert format_count([d]) == "" + + 
+def test_csv_no_annotations(): + d: AnnotatedDocument = AnnotatedDocument(Document(data={}), []) + assert format_csv([d]) == "type,tag,page,quote,note,author,title,ref,file" From 14f1b9e75c51372302b133de365c0ed1f06fa82a Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 17 Oct 2023 21:16:40 +0200 Subject: [PATCH 21/22] test: Add pytest-cov library --- poetry.lock | 20 +++++++++++++++++++- pyproject.toml | 1 + 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 886f483..c405f7c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -730,6 +730,24 @@ pluggy = ">=0.12,<2.0" [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-cov" +version = "4.1.0" +description = "Pytest plugin for measuring coverage." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-cov-4.1.0.tar.gz", hash = "sha256:3904b13dfbfec47f003b8e77fd5b589cd11904a21ddf1ab38a64f204d6a10ef6"}, + {file = "pytest_cov-4.1.0-py3-none-any.whl", hash = "sha256:6ba70b9e97e69fcc3fb45bfeab2d0a138fb65c4d0d6a41ef33983ad114be8c3a"}, +] + +[package.dependencies] +coverage = {version = ">=5.2.1", extras = ["toml"]} +pytest = ">=4.6" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] + [[package]] name = "python-doi" version = "0.2.0" @@ -1074,4 +1092,4 @@ whoosh = ["whoosh"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "dac37f6a903033ed95f65a69f7ca90a115aa390284835c2a8e82ea3fc0a196ab" +content-hash = "02d5ac314a19f14103372c7ceccdaec080df3fdc2fdb1381f2f6343cd6d17db4" diff --git a/pyproject.toml b/pyproject.toml index 8551182..7e252fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ whoosh = ["whoosh"] [tool.poetry.group.dev.dependencies] pytest = "^7.4.0" coverage = "^7.3.1" +pytest-cov = "^4.1.0" 
[tool.poetry.plugins."papis.command"] extract = "papis_extract:main" From 984025b472a84dd641b40231c546194b49c7aeb4 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 17 Oct 2023 22:01:41 +0200 Subject: [PATCH 22/22] feat: Add option to force-add annotations Will turn off looking for duplicate annotations and simply add all. --- papis_extract/__init__.py | 5 +++++ papis_extract/exporter.py | 35 +++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index e679e6a..af0ccd2 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -52,6 +52,11 @@ papis.config.register_default_settings(DEFAULT_OPTIONS) ), help="Choose an output template to format annotations with.", ) +@click.option( + "--force/--no-force", + "-f", + help="Do not drop any annotations because they already exist.", +) def main( query: str, # _papis_id: bool, diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py index 5d22270..af66b13 100644 --- a/papis_extract/exporter.py +++ b/papis_extract/exporter.py @@ -25,7 +25,11 @@ def to_stdout(formatter: Formatter, annotated_docs: list[AnnotatedDocument]) -> def to_notes( - formatter: Formatter, annotated_docs: list[AnnotatedDocument], edit: bool, git: bool + formatter: Formatter, + annotated_docs: list[AnnotatedDocument], + edit: bool, + git: bool, + force: bool, ) -> None: """Write annotations into document notes. 
@@ -36,7 +40,7 @@ def to_notes( for entry in annotated_docs: formatted_annotations = formatter([entry]).split("\n") if formatted_annotations: - _add_annots_to_note(entry.document, formatted_annotations) + _add_annots_to_note(entry.document, formatted_annotations, force=force) if edit: papis.commands.edit.edit_notes(entry.document, git=git) @@ -46,11 +50,26 @@ def _add_annots_to_note( document: papis.document.Document, formatted_annotations: list[str], git: bool = False, + force: bool = False, ) -> None: - """Append new annotations to the end of a note. + """ + Append new annotations to the end of a note. - Looks through note to determine any new annotations which should be - added and adds them to the end of the note file. + This function appends new annotations to the end of a note file. It takes in a + document object containing the note, a list of formatted annotations to be + added, and optional flags git and force. If git is True, the changes will be + committed to git. If force is True, the annotations will be added even if they + already exist in the note. + + :param document: The document object representing the note + :type document: class:`papis.document.Document` + :param formatted_annotations: A list of already formatted annotations to be added + :type formatted_annotations: list[str] + :param git: Flag indicating whether to commit changes to git, defaults to False. + :type git: bool, optional + :param force: Flag indicating whether to force adding annotations even if they + already exist, defaults to False. 
+ :type force: bool, optional """ logger.debug("Adding annotations to note.") notes_path = papis.notes.notes_path_ensured(document) @@ -59,9 +78,9 @@ def _add_annots_to_note( with open(notes_path, "r") as file_read: existing = file_read.readlines() - new_annotations: list[str] = _drop_existing_annotations( - formatted_annotations, existing - ) + new_annotations: list[str] = [] + if not force: + new_annotations = _drop_existing_annotations(formatted_annotations, existing) if not new_annotations: return