diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py index af3a834..0b44c48 100644 --- a/papis_extract/__init__.py +++ b/papis_extract/__init__.py @@ -1,14 +1,19 @@ +from pathlib import Path +import re + import click +import fitz_new as fitz +import magic import papis.cli import papis.config import papis.document +from papis.document import Document import papis.logging import papis.notes import papis.strings -from papis.document import Document from papis_extract import extractor, exporter -from papis_extract.annotation_data import AnnotatedDocument +from papis_extract.annotation_data import Annotation, AnnotatedDocument logger = papis.logging.get_logger(__name__) @@ -71,19 +76,50 @@ def main( logger.warning(papis.strings.no_documents_retrieved_message) return - run(documents, edit=manual, write=write, git=git) - - -def run( - documents: list[Document], - edit: bool = False, - write: bool = False, - git: bool = False, -) -> None: - - doc_annotations: list[AnnotatedDocument] = extractor.start(documents) + doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents) if write: - exporter.to_notes(doc_annotations, edit=edit, git=git) + exporter.to_notes(doc_annotations, edit=manual, git=git) else: exporter.to_stdout(doc_annotations) + + # note_file: Path = Path(papis.notes.notes_path_ensured(documents[0])) + + +def is_pdf(fname: Path) -> bool: + return magic.from_file(fname, mime=True) == "application/pdf" + + +def _get_annotations_for_documents( + documents: list[Document], +) -> list[AnnotatedDocument]: + output: list[AnnotatedDocument] = [] + for doc in documents: + annotations: list[Annotation] = [] + found_pdf: bool = False + for file in doc.get_files(): + fname = Path(file) + if not _is_file_processable(fname): + break + found_pdf = True + + try: + annotations.extend(extractor.start(fname)) + except fitz.FileDataError as e: + print(f"File structure errors for {file}.\n{e}") + + if not found_pdf: + # have to remove curlys or papis logger gets upset + desc = re.sub("[{}]", "", papis.document.describe(doc)) + logger.warning("Did not find suitable PDF file for document: " f"{desc}") + output.append(AnnotatedDocument(doc, annotations)) + return output + + +def _is_file_processable(fname: Path) -> bool: + if not fname.is_file(): + logger.error(f"File {str(fname)} not readable.") + return False + if not is_pdf(fname): + return False + return True diff --git a/papis_extract/annotation_data.py b/papis_extract/annotation_data.py index fa06e0d..298f964 100644 --- a/papis_extract/annotation_data.py +++ b/papis_extract/annotation_data.py @@ -1,9 +1,9 @@ +import re import math from dataclasses import dataclass, field import papis.config from papis.document import Document -import chevron TEXT_SIMILARITY_MINIMUM = 0.75 COLOR_SIMILARITY_MINIMUM = 0.833 @@ -23,13 +23,12 @@ class Annotation: """A PDF annotation object""" file: str - colors: tuple[float, float, float] = field(default_factory=lambda: (0.0, 0.0, 0.0)) - content: str = "" - page: int = 0 - tag: str = "" - text: str = "" type: str = "Highlight" - minimum_similarity_color: float = 1.0 + text: str = "" + content: str = "" + page: int = 1 + colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)}) + tag: str = "" def format(self, formatting): """Return a formatted string of the annotation. @@ -38,15 +37,27 @@ class Annotation: formatted with the correct marker replacements and removals, ready for display or writing. """ - data = { - "file": self.file, - "quote": self.text, - "note": self.content, - "page": self.page, - "tag": self.tag, - "type": self.type, + output = formatting + replacements = { + r"{quote}": self.text, + r"{note}": self.content, + r"{page}": str(self.page), + r"{newline}": "\n", + r"{tag}": self.tag, } - return chevron.render(formatting, data) + pattern = re.compile( + "|".join( + [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] + ), + flags=re.DOTALL, + ) + patt_quote_container = re.compile(r"{%quote_container(.*?)%}") + patt_note_container = re.compile(r"{%note_container(.*?)%}") + patt_tag_container = re.compile(r"{%tag_container(.*?)%}") + output = patt_quote_container.sub(r"\1" if self.text else "", output) + output = patt_note_container.sub(r"\1" if self.content else "", output) + output = patt_tag_container.sub(r"\1" if self.tag else "", output) + return pattern.sub(lambda x: replacements[x.group(0)], output) @property def colorname(self): @@ -56,16 +67,15 @@ class Annotation: using euclidian distance between the two color vectors. """ annot_colors = ( - self.colors or (0.0, 0.0, 0.0) + self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0) ) nearest = None minimum_similarity = ( papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0 ) - minimum_similarity = self.minimum_similarity_color for name, values in COLORS.items(): similarity_ratio = self._color_similarity_ratio(values, annot_colors) - if similarity_ratio >= minimum_similarity: + if similarity_ratio > minimum_similarity: minimum_similarity = similarity_ratio nearest = name return nearest diff --git a/papis_extract/extractor.py b/papis_extract/extractor.py index a4bc536..88ff012 100644 --- a/papis_extract/extractor.py +++ b/papis_extract/extractor.py @@ -1,53 +1,17 @@ -import re from pathlib import Path from typing import Any, Optional import Levenshtein -import magic import fitz_new as fitz import papis.logging import papis.config -import papis.document -from papis.document import Document -from papis_extract.annotation_data import Annotation, AnnotatedDocument +from papis_extract.annotation_data import Annotation logger = papis.logging.get_logger(__name__) -def start( - documents: list[Document], -) -> list[AnnotatedDocument]: - """Extract all annotations from passed documents. - - Returns all annotations contained in the papis - documents passed in. - """ - - output: list[AnnotatedDocument] = [] - for doc in documents: - annotations: list[Annotation] = [] - found_pdf: bool = False - for file in doc.get_files(): - fname = Path(file) - if not _is_file_processable(fname): - break - found_pdf = True - - try: - annotations.extend(extract(fname)) - except fitz.FileDataError as e: - print(f"File structure errors for {file}.\n{e}") - - if not found_pdf: - # have to remove curlys or papis logger gets upset - desc = re.sub("[{}]", "", papis.document.describe(doc)) - logger.warning("Did not find suitable PDF file for document: " f"{desc}") - output.append(AnnotatedDocument(doc, annotations)) - return output - - -def extract(filename: Path) -> list[Annotation]: +def start(filename: Path) -> list[Annotation]: """Extract annotations from a file. Returns all readable annotations contained in the file @@ -60,16 +24,11 @@ def extract(filename: Path) -> list[Annotation]: quote, note = _retrieve_annotation_content(page, annot) if not quote and not note: continue - col = ( - annot.colors.get("fill") - or annot.colors.get("stroke") - or (0.0, 0.0, 0.0) - ) a = Annotation( file=str(filename), text=quote or "", content=note or "", - colors=col, + colors=annot.colors, type=annot.type[1], page=(page.number or 0) + 1, ) @@ -82,19 +41,6 @@ def extract(filename: Path) -> list[Annotation]: return annotations -def is_pdf(fname: Path) -> bool: - return magic.from_file(fname, mime=True) == "application/pdf" - - -def _is_file_processable(fname: Path) -> bool: - if not fname.is_file(): - logger.error(f"File {str(fname)} not readable.") - return False - if not is_pdf(fname): - return False - return True - - def _tag_from_colorname(colorname: str) -> str: color_mapping: dict[str, str] = getdict("tags", "plugins.extract") if not color_mapping: diff --git a/poetry.lock b/poetry.lock index 1336bfc..8d6734d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -147,17 +147,6 @@ files = [ {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, ] -[[package]] -name = "chevron" -version = "0.14.0" -description = "Mustache templating language renderer" -optional = false -python-versions = "*" -files = [ - {file = "chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443"}, - {file = "chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf"}, -] - [[package]] name = "click" version = "8.1.7" @@ -991,4 +980,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "a3af36ed2941235df158c20ba9b66bdf5a0af0554235fd004ec77c3e88def3c3" +content-hash = "d519605837788792d06ffc7bca7a92b315612ca6052227c53c558ec49dffec9f" diff --git a/pyproject.toml b/pyproject.toml index 8ee68a7..4ca3257 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ papis = "^0.13" click = "^8.1.7" whoosh = "^2.7.4" python-magic = "^0.4.27" -chevron = "^0.14.0" [tool.poetry.plugins."papis.command"] extract = "papis_extract:main" diff --git a/tests/test_annotation.py b/tests/test_annotation.py index 542c3a8..72c7a75 100644 --- a/tests/test_annotation.py +++ b/tests/test_annotation.py @@ -1,51 +1,7 @@ -import pytest from papis_extract.annotation_data import Annotation -@pytest.mark.parametrize( - "fmt_string,expected", - [ - ("{{quote}}", "I am the text value"), - ( - "> {{quote}}\n{{#note}}Note: {{note}}{{/note}}", - "> I am the text value\nNote: Whereas I represent the note", - ), - ( - "{{#note}}Note: {{note}}{{/note}}{{#page}}, p. {{page}}{{/page}}", - "Note: Whereas I represent the note", - ), - ], -) -def test_formatting(fmt_string, expected): - sut = Annotation( - "myfile", - text="I am the text value", - content="Whereas I represent the note", - ) - - assert sut.format(fmt_string) == expected - -def test_colorname_matches_exact(): - sut = Annotation( - "testfile", colors=(1.0,0.0,0.0), minimum_similarity_color=1.0 - ) - c_name = sut.colorname - assert c_name == "red" - -# TODO inject closeness value instead of relying on default -@pytest.mark.parametrize( - "color_value", - [ - (1.0, 0.0, 0.0), - (0.9, 0.0, 0.0), - (0.8, 0.0, 0.0), - (0.7, 0.0, 0.0), - (0.51, 0.0, 0.0), - ], -) -def test_matches_inexact_colorname(color_value): - sut = Annotation( - "testfile", colors=color_value, minimum_similarity_color=0.833 - ) +def test_matches_colorname_exact(): + sut = Annotation("testfile", colors={"stroke": (1.0, 0.0, 0.0)}) c_name = sut.colorname assert c_name == "red"