Compare commits

..

No commits in common. "20873e6ef88184c60a25b2fd5d2f2294fad35650" and "b564ab479237a6fecc9a4859b2c69161fc524fae" have entirely different histories.

6 changed files with 84 additions and 148 deletions

View file

@ -1,14 +1,19 @@
from pathlib import Path
import re
import click import click
import fitz_new as fitz
import magic
import papis.cli import papis.cli
import papis.config import papis.config
import papis.document import papis.document
from papis.document import Document
import papis.logging import papis.logging
import papis.notes import papis.notes
import papis.strings import papis.strings
from papis.document import Document
from papis_extract import extractor, exporter from papis_extract import extractor, exporter
from papis_extract.annotation_data import AnnotatedDocument from papis_extract.annotation_data import Annotation, AnnotatedDocument
logger = papis.logging.get_logger(__name__) logger = papis.logging.get_logger(__name__)
@ -71,19 +76,50 @@ def main(
logger.warning(papis.strings.no_documents_retrieved_message) logger.warning(papis.strings.no_documents_retrieved_message)
return return
run(documents, edit=manual, write=write, git=git) doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
def run(
documents: list[Document],
edit: bool = False,
write: bool = False,
git: bool = False,
) -> None:
doc_annotations: list[AnnotatedDocument] = extractor.start(documents)
if write: if write:
exporter.to_notes(doc_annotations, edit=edit, git=git) exporter.to_notes(doc_annotations, edit=manual, git=git)
else: else:
exporter.to_stdout(doc_annotations) exporter.to_stdout(doc_annotations)
# note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
def is_pdf(fname: Path) -> bool:
    """Return True when the detected MIME type of *fname* is a PDF.

    The MIME type is determined by ``magic.from_file`` rather than by the
    file name.
    """
    detected_mime = magic.from_file(fname, mime=True)
    return detected_mime == "application/pdf"
def _get_annotations_for_documents(
    documents: list[Document],
) -> list[AnnotatedDocument]:
    """Collect the annotations of every passed document.

    Each file attached to a document is checked with
    ``_is_file_processable``; annotations are extracted from every file
    that passes the check. A document for which no file passes is still
    included in the result (with an empty annotation list) and a warning
    is logged for it.

    Args:
        documents: The papis documents to extract annotations from.

    Returns:
        One ``AnnotatedDocument`` per input document, in input order.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False

        for file in doc.get_files():
            fname = Path(file)
            # Skip only this file: a document may carry non-PDF attachments
            # next to its PDF, so one unsuitable file must not end the scan
            # of the remaining files.
            if not _is_file_processable(fname):
                continue
            found_pdf = True

            try:
                annotations.extend(extractor.start(fname))
            except fitz.FileDataError as e:
                print(f"File structure errors for {file}.\n{e}")

        if not found_pdf:
            # have to remove curlys or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")

        output.append(AnnotatedDocument(doc, annotations))
    return output
def _is_file_processable(fname: Path) -> bool:
    """Tell whether *fname* is an existing regular file that is a PDF.

    A path that is not a regular file is logged as an error; a file that
    exists but is not a PDF is rejected without logging.
    """
    if fname.is_file():
        # Existing file: whether it can be processed depends on its type.
        return is_pdf(fname)

    logger.error(f"File {str(fname)} not readable.")
    return False

View file

@ -1,9 +1,9 @@
import re
import math import math
from dataclasses import dataclass, field from dataclasses import dataclass, field
import papis.config import papis.config
from papis.document import Document from papis.document import Document
import chevron
TEXT_SIMILARITY_MINIMUM = 0.75 TEXT_SIMILARITY_MINIMUM = 0.75
COLOR_SIMILARITY_MINIMUM = 0.833 COLOR_SIMILARITY_MINIMUM = 0.833
@ -23,13 +23,12 @@ class Annotation:
"""A PDF annotation object""" """A PDF annotation object"""
file: str file: str
colors: tuple[float, float, float] = field(default_factory=lambda: (0.0, 0.0, 0.0))
content: str = ""
page: int = 0
tag: str = ""
text: str = ""
type: str = "Highlight" type: str = "Highlight"
minimum_similarity_color: float = 1.0 text: str = ""
content: str = ""
page: int = 1
colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
tag: str = ""
def format(self, formatting): def format(self, formatting):
"""Return a formatted string of the annotation. """Return a formatted string of the annotation.
@ -38,15 +37,27 @@ class Annotation:
formatted with the correct marker replacements and removals, ready formatted with the correct marker replacements and removals, ready
for display or writing. for display or writing.
""" """
data = { output = formatting
"file": self.file, replacements = {
"quote": self.text, r"{quote}": self.text,
"note": self.content, r"{note}": self.content,
"page": self.page, r"{page}": str(self.page),
"tag": self.tag, r"{newline}": "\n",
"type": self.type, r"{tag}": self.tag,
} }
return chevron.render(formatting, data) pattern = re.compile(
"|".join(
[re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
),
flags=re.DOTALL,
)
patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
patt_note_container = re.compile(r"{%note_container(.*?)%}")
patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
output = patt_quote_container.sub(r"\1" if self.text else "", output)
output = patt_note_container.sub(r"\1" if self.content else "", output)
output = patt_tag_container.sub(r"\1" if self.tag else "", output)
return pattern.sub(lambda x: replacements[x.group(0)], output)
@property @property
def colorname(self): def colorname(self):
@ -56,16 +67,15 @@ class Annotation:
using euclidian distance between the two color vectors. using euclidian distance between the two color vectors.
""" """
annot_colors = ( annot_colors = (
self.colors or (0.0, 0.0, 0.0) self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
) )
nearest = None nearest = None
minimum_similarity = ( minimum_similarity = (
papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0 papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0
) )
minimum_similarity = self.minimum_similarity_color
for name, values in COLORS.items(): for name, values in COLORS.items():
similarity_ratio = self._color_similarity_ratio(values, annot_colors) similarity_ratio = self._color_similarity_ratio(values, annot_colors)
if similarity_ratio >= minimum_similarity: if similarity_ratio > minimum_similarity:
minimum_similarity = similarity_ratio minimum_similarity = similarity_ratio
nearest = name nearest = name
return nearest return nearest

View file

@ -1,53 +1,17 @@
import re
from pathlib import Path from pathlib import Path
from typing import Any, Optional from typing import Any, Optional
import Levenshtein import Levenshtein
import magic
import fitz_new as fitz import fitz_new as fitz
import papis.logging import papis.logging
import papis.config import papis.config
import papis.document
from papis.document import Document
from papis_extract.annotation_data import Annotation, AnnotatedDocument from papis_extract.annotation_data import Annotation
logger = papis.logging.get_logger(__name__) logger = papis.logging.get_logger(__name__)
def start( def start(filename: Path) -> list[Annotation]:
documents: list[Document],
) -> list[AnnotatedDocument]:
"""Extract all annotations from passed documents.
Returns all annotations contained in the papis
documents passed in.
"""
output: list[AnnotatedDocument] = []
for doc in documents:
annotations: list[Annotation] = []
found_pdf: bool = False
for file in doc.get_files():
fname = Path(file)
if not _is_file_processable(fname):
break
found_pdf = True
try:
annotations.extend(extract(fname))
except fitz.FileDataError as e:
print(f"File structure errors for {file}.\n{e}")
if not found_pdf:
# have to remove curlys or papis logger gets upset
desc = re.sub("[{}]", "", papis.document.describe(doc))
logger.warning("Did not find suitable PDF file for document: " f"{desc}")
output.append(AnnotatedDocument(doc, annotations))
return output
def extract(filename: Path) -> list[Annotation]:
"""Extract annotations from a file. """Extract annotations from a file.
Returns all readable annotations contained in the file Returns all readable annotations contained in the file
@ -60,16 +24,11 @@ def extract(filename: Path) -> list[Annotation]:
quote, note = _retrieve_annotation_content(page, annot) quote, note = _retrieve_annotation_content(page, annot)
if not quote and not note: if not quote and not note:
continue continue
col = (
annot.colors.get("fill")
or annot.colors.get("stroke")
or (0.0, 0.0, 0.0)
)
a = Annotation( a = Annotation(
file=str(filename), file=str(filename),
text=quote or "", text=quote or "",
content=note or "", content=note or "",
colors=col, colors=annot.colors,
type=annot.type[1], type=annot.type[1],
page=(page.number or 0) + 1, page=(page.number or 0) + 1,
) )
@ -82,19 +41,6 @@ def extract(filename: Path) -> list[Annotation]:
return annotations return annotations
def is_pdf(fname: Path) -> bool:
    """Check via ``magic.from_file`` whether *fname* has the PDF MIME type."""
    mime_type = magic.from_file(fname, mime=True)
    return mime_type == "application/pdf"
def _is_file_processable(fname: Path) -> bool:
    """Return True only for an existing regular file that is a PDF.

    Logs an error when *fname* is not a regular file; a non-PDF file is
    rejected silently.
    """
    readable = fname.is_file()
    if not readable:
        logger.error(f"File {str(fname)} not readable.")
    # is_pdf is only consulted for paths that actually exist as files.
    return readable and is_pdf(fname)
def _tag_from_colorname(colorname: str) -> str: def _tag_from_colorname(colorname: str) -> str:
color_mapping: dict[str, str] = getdict("tags", "plugins.extract") color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
if not color_mapping: if not color_mapping:

13
poetry.lock generated
View file

@ -147,17 +147,6 @@ files = [
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
] ]
[[package]]
name = "chevron"
version = "0.14.0"
description = "Mustache templating language renderer"
optional = false
python-versions = "*"
files = [
{file = "chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443"},
{file = "chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf"},
]
[[package]] [[package]]
name = "click" name = "click"
version = "8.1.7" version = "8.1.7"
@ -991,4 +980,4 @@ files = [
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.11" python-versions = "^3.11"
content-hash = "a3af36ed2941235df158c20ba9b66bdf5a0af0554235fd004ec77c3e88def3c3" content-hash = "d519605837788792d06ffc7bca7a92b315612ca6052227c53c558ec49dffec9f"

View file

@ -14,7 +14,6 @@ papis = "^0.13"
click = "^8.1.7" click = "^8.1.7"
whoosh = "^2.7.4" whoosh = "^2.7.4"
python-magic = "^0.4.27" python-magic = "^0.4.27"
chevron = "^0.14.0"
[tool.poetry.plugins."papis.command"] [tool.poetry.plugins."papis.command"]
extract = "papis_extract:main" extract = "papis_extract:main"

View file

@ -1,51 +1,7 @@
import pytest
from papis_extract.annotation_data import Annotation from papis_extract.annotation_data import Annotation
@pytest.mark.parametrize( def test_matches_colorname_exact():
"fmt_string,expected", sut = Annotation("testfile", colors={"stroke": (1.0, 0.0, 0.0)})
[
("{{quote}}", "I am the text value"),
(
"> {{quote}}\n{{#note}}Note: {{note}}{{/note}}",
"> I am the text value\nNote: Whereas I represent the note",
),
(
"{{#note}}Note: {{note}}{{/note}}{{#page}}, p. {{page}}{{/page}}",
"Note: Whereas I represent the note",
),
],
)
def test_formatting(fmt_string, expected):
sut = Annotation(
"myfile",
text="I am the text value",
content="Whereas I represent the note",
)
assert sut.format(fmt_string) == expected
def test_colorname_matches_exact():
    """An annotation colored (1.0, 0.0, 0.0) is named "red" at similarity 1.0."""
    pure_red = (1.0, 0.0, 0.0)
    sut = Annotation("testfile", colors=pure_red, minimum_similarity_color=1.0)
    assert sut.colorname == "red"
# TODO inject closeness value instead of relying on default
@pytest.mark.parametrize(
"color_value",
[
(1.0, 0.0, 0.0),
(0.9, 0.0, 0.0),
(0.8, 0.0, 0.0),
(0.7, 0.0, 0.0),
(0.51, 0.0, 0.0),
],
)
def test_matches_inexact_colorname(color_value):
sut = Annotation(
"testfile", colors=color_value, minimum_similarity_color=0.833
)
c_name = sut.colorname c_name = sut.colorname
assert c_name == "red" assert c_name == "red"