From e325b89c9b2699e7d9e1333ce5a7f7a926c3b168 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 29 Aug 2023 12:40:36 +0200 Subject: [PATCH 1/3] Move all extraction logic into extractor module The publicly accessible default interface only contains the command line command interface and a single run function. --- papis_extract/__init__.py | 64 +++++++++----------------------------- papis_extract/extractor.py | 52 +++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 52 deletions(-) diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index 0b44c48..af3a834 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -1,19 +1,14 @@ -from pathlib import Path -import re - import click -import fitz_new as fitz -import magic import papis.cli import papis.config import papis.document -from papis.document import Document import papis.logging import papis.notes import papis.strings +from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.annotation_data import Annotation, AnnotatedDocument +from papis_extract.annotation_data import AnnotatedDocument logger = papis.logging.get_logger(__name__) @@ -76,50 +71,19 @@ def main( logger.warning(papis.strings.no_documents_retrieved_message) return - doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents) + run(documents, edit=manual, write=write, git=git) + + +def run( + documents: list[Document], + edit: bool = False, + write: bool = False, + git: bool = False, +) -> None: + + doc_annotations: list[AnnotatedDocument] = extractor.start(documents) if write: - exporter.to_notes(doc_annotations, edit=manual, git=git) + exporter.to_notes(doc_annotations, edit=edit, git=git) else: exporter.to_stdout(doc_annotations) - - # note_file: Path = Path(papis.notes.notes_path_ensured(documents[0])) - - -def is_pdf(fname: Path) -> bool: - return magic.from_file(fname, mime=True) == "application/pdf" - - -def
_get_annotations_for_documents( - documents: list[Document], -) -> list[AnnotatedDocument]: - output: list[AnnotatedDocument] = [] - for doc in documents: - annotations: list[Annotation] = [] - found_pdf: bool = False - for file in doc.get_files(): - fname = Path(file) - if not _is_file_processable(fname): - break - found_pdf = True - - try: - annotations.extend(extractor.start(fname)) - except fitz.FileDataError as e: - print(f"File structure errors for {file}.\n{e}") - - if not found_pdf: - # have to remove curlys or papis logger gets upset - desc = re.sub("[{}]", "", papis.document.describe(doc)) - logger.warning("Did not find suitable PDF file for document: " f"{desc}") - output.append(AnnotatedDocument(doc, annotations)) - return output - - -def _is_file_processable(fname: Path) -> bool: - if not fname.is_file(): - logger.error(f"File {str(fname)} not readable.") - return False - if not is_pdf(fname): - return False - return True diff --git a/papis_extract/extractor.py b/papis_extract/extractor.py index 88ff012..51d6ade 100644 --- a/papis_extract/extractor.py +++ b/papis_extract/extractor.py @@ -1,17 +1,51 @@ +import re from pathlib import Path from typing import Any, Optional import Levenshtein +import magic import fitz_new as fitz import papis.logging import papis.config +import papis.document +from papis.document import Document -from papis_extract.annotation_data import Annotation +from papis_extract.annotation_data import Annotation, AnnotatedDocument logger = papis.logging.get_logger(__name__) +def start( + documents: list[Document], +) -> list[AnnotatedDocument]: + """Extract all annotations from passed documents. -def start(filename: Path) -> list[Annotation]: + Returns all annotations contained in the papis + documents passed in. 
+ """ + + output: list[AnnotatedDocument] = [] + for doc in documents: + annotations: list[Annotation] = [] + found_pdf: bool = False + for file in doc.get_files(): + fname = Path(file) + if not _is_file_processable(fname): + break + found_pdf = True + + try: + annotations.extend(extract(fname)) + except fitz.FileDataError as e: + print(f"File structure errors for {file}.\n{e}") + + if not found_pdf: + # have to remove curlys or papis logger gets upset + desc = re.sub("[{}]", "", papis.document.describe(doc)) + logger.warning("Did not find suitable PDF file for document: " f"{desc}") + output.append(AnnotatedDocument(doc, annotations)) + return output + +def extract(filename: Path) -> list[Annotation]: """Extract annotations from a file. Returns all readable annotations contained in the file @@ -41,6 +75,20 @@ def start(filename: Path) -> list[Annotation]: return annotations +def is_pdf(fname: Path) -> bool: + return magic.from_file(fname, mime=True) == "application/pdf" + + + + +def _is_file_processable(fname: Path) -> bool: + if not fname.is_file(): + logger.error(f"File {str(fname)} not readable.") + return False + if not is_pdf(fname): + return False + return True + def _tag_from_colorname(colorname: str) -> str: color_mapping: dict[str, str] = getdict("tags", "plugins.extract") if not color_mapping: From 256117d45175ded87c3887ec5ae74c05442f8780 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 29 Aug 2023 13:49:22 +0200 Subject: [PATCH 2/3] Add mustache templating Added mustache templating engine to be able to provide custom formatting strings. 
--- papis_extract/annotation_data.py | 42 +++++++++++----------------- poetry.lock | 13 ++++++++- pyproject.toml | 1 + tests/test_annotation.py | 48 ++++++++++++++++++++++++++++++-- 4 files changed, 75 insertions(+), 29 deletions(-) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index 298f964..26e63cd 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py @@ -1,9 +1,9 @@ -import re import math from dataclasses import dataclass, field import papis.config from papis.document import Document +import chevron TEXT_SIMILARITY_MINIMUM = 0.75 COLOR_SIMILARITY_MINIMUM = 0.833 @@ -23,12 +23,13 @@ class Annotation: """A PDF annotation object""" file: str - type: str = "Highlight" - text: str = "" - content: str = "" - page: int = 1 colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)}) + content: str = "" + page: int = 0 tag: str = "" + text: str = "" + type: str = "Highlight" + minimum_similarity_color: float = 1.0 def format(self, formatting): """Return a formatted string of the annotation. @@ -37,27 +38,15 @@ class Annotation: formatted with the correct marker replacements and removals, ready for display or writing. 
""" - output = formatting - replacements = { - r"{quote}": self.text, - r"{note}": self.content, - r"{page}": str(self.page), - r"{newline}": "\n", - r"{tag}": self.tag, + data = { + "file": self.file, + "quote": self.text, + "note": self.content, + "page": self.page, + "tag": self.tag, + "type": self.type, } - pattern = re.compile( - "|".join( - [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] - ), - flags=re.DOTALL, - ) - patt_quote_container = re.compile(r"{%quote_container(.*?)%}") - patt_note_container = re.compile(r"{%note_container(.*?)%}") - patt_tag_container = re.compile(r"{%tag_container(.*?)%}") - output = patt_quote_container.sub(r"\1" if self.text else "", output) - output = patt_note_container.sub(r"\1" if self.content else "", output) - output = patt_tag_container.sub(r"\1" if self.tag else "", output) - return pattern.sub(lambda x: replacements[x.group(0)], output) + return chevron.render(formatting, data) @property def colorname(self): @@ -73,9 +62,10 @@ class Annotation: minimum_similarity = ( papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0 ) + minimum_similarity = self.minimum_similarity_color for name, values in COLORS.items(): similarity_ratio = self._color_similarity_ratio(values, annot_colors) - if similarity_ratio > minimum_similarity: + if similarity_ratio >= minimum_similarity: minimum_similarity = similarity_ratio nearest = name return nearest diff --git a/poetry.lock b/poetry.lock index 8d6734d..1336bfc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -147,6 +147,17 @@ files = [ {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, ] +[[package]] +name = "chevron" +version = "0.14.0" +description = "Mustache templating language renderer" +optional = false +python-versions = "*" +files = [ + {file = "chevron-0.14.0-py3-none-any.whl", hash = 
"sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443"}, + {file = "chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf"}, +] + [[package]] name = "click" version = "8.1.7" @@ -980,4 +991,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "d519605837788792d06ffc7bca7a92b315612ca6052227c53c558ec49dffec9f" +content-hash = "a3af36ed2941235df158c20ba9b66bdf5a0af0554235fd004ec77c3e88def3c3" diff --git a/pyproject.toml b/pyproject.toml index 4ca3257..8ee68a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ papis = "^0.13" click = "^8.1.7" whoosh = "^2.7.4" python-magic = "^0.4.27" +chevron = "^0.14.0" [tool.poetry.plugins."papis.command"] extract = "papis_extract:main" diff --git a/tests/test_annotation.py b/tests/test_annotation.py index 72c7a75..d9f6188 100644 --- a/tests/test_annotation.py +++ b/tests/test_annotation.py @@ -1,7 +1,51 @@ +import pytest from papis_extract.annotation_data import Annotation -def test_matches_colorname_exact(): - sut = Annotation("testfile", colors={"stroke": (1.0, 0.0, 0.0)}) +@pytest.mark.parametrize( + "fmt_string,expected", + [ + ("{{quote}}", "I am the text value"), + ( + "> {{quote}}\n{{#note}}Note: {{note}}{{/note}}", + "> I am the text value\nNote: Whereas I represent the note", + ), + ( + "{{#note}}Note: {{note}}{{/note}}{{#page}}, p. 
{{page}}{{/page}}", + "Note: Whereas I represent the note", + ), + ], +) +def test_formatting(fmt_string, expected): + sut = Annotation( + "myfile", + text="I am the text value", + content="Whereas I represent the note", + ) + + assert sut.format(fmt_string) == expected + +def test_colorname_matches_exact(): + sut = Annotation( + "testfile", colors={"stroke": (1.0,0.0,0.0)}, minimum_similarity_color=1.0 + ) + c_name = sut.colorname + assert c_name == "red" + +# TODO inject closeness value instead of relying on default +@pytest.mark.parametrize( + "color_value", + [ + (1.0, 0.0, 0.0), + (0.9, 0.0, 0.0), + (0.8, 0.0, 0.0), + (0.7, 0.0, 0.0), + (0.51, 0.0, 0.0), + ], +) +def test_matches_inexact_colorname(color_value): + sut = Annotation( + "testfile", colors={"stroke": color_value}, minimum_similarity_color=0.833 + ) c_name = sut.colorname assert c_name == "red" From 20873e6ef88184c60a25b2fd5d2f2294fad35650 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 29 Aug 2023 22:23:52 +0200 Subject: [PATCH 3/3] Change annotation color to simple rgb tuple --- papis_extract/annotation_data.py | 4 ++-- papis_extract/extractor.py | 14 ++++++++++---- tests/test_annotation.py | 4 ++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index 26e63cd..fa06e0d 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py @@ -23,7 +23,7 @@ class Annotation: """A PDF annotation object""" file: str - colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)}) + colors: tuple[float, float, float] = field(default_factory=lambda: (0.0, 0.0, 0.0)) content: str = "" page: int = 0 tag: str = "" @@ -56,7 +56,7 @@ class Annotation: using euclidian distance between the two color vectors. 
""" annot_colors = ( - self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0) + self.colors or (0.0, 0.0, 0.0) ) nearest = None minimum_similarity = ( diff --git a/papis_extract/extractor.py b/papis_extract/extractor.py index 51d6ade..a4bc536 100644 --- a/papis_extract/extractor.py +++ b/papis_extract/extractor.py @@ -14,12 +14,13 @@ from papis_extract.annotation_data import Annotation, AnnotatedDocument logger = papis.logging.get_logger(__name__) + def start( documents: list[Document], ) -> list[AnnotatedDocument]: """Extract all annotations from passed documents. - Returns all annotations contained in the papis + Returns all annotations contained in the papis documents passed in. """ @@ -45,6 +46,7 @@ def start( output.append(AnnotatedDocument(doc, annotations)) return output + def extract(filename: Path) -> list[Annotation]: """Extract annotations from a file. @@ -58,11 +60,16 @@ def extract(filename: Path) -> list[Annotation]: quote, note = _retrieve_annotation_content(page, annot) if not quote and not note: continue + col = ( + annot.colors.get("fill") + or annot.colors.get("stroke") + or (0.0, 0.0, 0.0) + ) a = Annotation( file=str(filename), text=quote or "", content=note or "", - colors=annot.colors, + colors=col, type=annot.type[1], page=(page.number or 0) + 1, ) @@ -79,8 +86,6 @@ def is_pdf(fname: Path) -> bool: return magic.from_file(fname, mime=True) == "application/pdf" - - def _is_file_processable(fname: Path) -> bool: if not fname.is_file(): logger.error(f"File {str(fname)} not readable.") @@ -89,6 +94,7 @@ def _is_file_processable(fname: Path) -> bool: return False return True + def _tag_from_colorname(colorname: str) -> str: color_mapping: dict[str, str] = getdict("tags", "plugins.extract") if not color_mapping: diff --git a/tests/test_annotation.py b/tests/test_annotation.py index d9f6188..542c3a8 100644 --- a/tests/test_annotation.py +++ b/tests/test_annotation.py @@ -27,7 +27,7 @@ def test_formatting(fmt_string, expected): def 
test_colorname_matches_exact(): sut = Annotation( - "testfile", colors={"stroke": (1.0,0.0,0.0)}, minimum_similarity_color=1.0 + "testfile", colors=(1.0,0.0,0.0), minimum_similarity_color=1.0 ) c_name = sut.colorname assert c_name == "red" @@ -45,7 +45,7 @@ def test_colorname_matches_exact(): ) def test_matches_inexact_colorname(color_value): sut = Annotation( - "testfile", colors={"stroke": color_value}, minimum_similarity_color=0.833 + "testfile", colors=color_value, minimum_similarity_color=0.833 ) c_name = sut.colorname assert c_name == "red"