Compare commits

..

No commits in common. "20873e6ef88184c60a25b2fd5d2f2294fad35650" and "b564ab479237a6fecc9a4859b2c69161fc524fae" have entirely different histories.

6 changed files with 84 additions and 148 deletions

View file

@ -1,14 +1,19 @@
from pathlib import Path
import re
import click
import fitz_new as fitz
import magic
import papis.cli
import papis.config
import papis.document
from papis.document import Document
import papis.logging
import papis.notes
import papis.strings
from papis.document import Document
from papis_extract import extractor, exporter
from papis_extract.annotation_data import AnnotatedDocument
from papis_extract.annotation_data import Annotation, AnnotatedDocument
logger = papis.logging.get_logger(__name__)
@ -71,19 +76,50 @@ def main(
logger.warning(papis.strings.no_documents_retrieved_message)
return
run(documents, edit=manual, write=write, git=git)
def run(
documents: list[Document],
edit: bool = False,
write: bool = False,
git: bool = False,
) -> None:
doc_annotations: list[AnnotatedDocument] = extractor.start(documents)
doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
if write:
exporter.to_notes(doc_annotations, edit=edit, git=git)
exporter.to_notes(doc_annotations, edit=manual, git=git)
else:
exporter.to_stdout(doc_annotations)
# note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
def is_pdf(fname: Path) -> bool:
    """Report whether the MIME type detected for ``fname`` is PDF."""
    detected_mime = magic.from_file(fname, mime=True)
    return detected_mime == "application/pdf"
def _get_annotations_for_documents(
    documents: list[Document],
) -> list[AnnotatedDocument]:
    """Collect the annotations of every processable file of each document.

    Every input document yields exactly one AnnotatedDocument, even when no
    annotations could be extracted for it. Files that fail the
    processability check are skipped individually, so an unsuitable file
    listed first does not hide suitable files that follow it.

    Args:
        documents: The papis documents whose attached files are scanned.

    Returns:
        One AnnotatedDocument per input document, in input order.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                # Skip only this file; later files may still be processable.
                continue
            found_pdf = True
            try:
                annotations.extend(extractor.start(fname))
            except fitz.FileDataError as e:
                # Best effort: report the damaged file and keep going.
                print(f"File structure errors for {file}.\n{e}")
        if not found_pdf:
            # have to remove curly braces or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
        output.append(AnnotatedDocument(doc, annotations))
    return output
def _is_file_processable(fname: Path) -> bool:
    """Return True only for an existing regular file that is a PDF.

    A path that is not a regular file is logged as an error; a file that
    is not a PDF is rejected without logging.
    """
    if fname.is_file():
        return is_pdf(fname)
    logger.error(f"File {str(fname)} not readable.")
    return False

View file

@ -1,9 +1,9 @@
import re
import math
from dataclasses import dataclass, field
import papis.config
from papis.document import Document
import chevron
TEXT_SIMILARITY_MINIMUM = 0.75
COLOR_SIMILARITY_MINIMUM = 0.833
@ -23,13 +23,12 @@ class Annotation:
"""A PDF annotation object"""
file: str
colors: tuple[float, float, float] = field(default_factory=lambda: (0.0, 0.0, 0.0))
content: str = ""
page: int = 0
tag: str = ""
text: str = ""
type: str = "Highlight"
minimum_similarity_color: float = 1.0
text: str = ""
content: str = ""
page: int = 1
colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
tag: str = ""
def format(self, formatting):
"""Return a formatted string of the annotation.
@ -38,15 +37,27 @@ class Annotation:
formatted with the correct marker replacements and removals, ready
for display or writing.
"""
data = {
"file": self.file,
"quote": self.text,
"note": self.content,
"page": self.page,
"tag": self.tag,
"type": self.type,
output = formatting
replacements = {
r"{quote}": self.text,
r"{note}": self.content,
r"{page}": str(self.page),
r"{newline}": "\n",
r"{tag}": self.tag,
}
return chevron.render(formatting, data)
pattern = re.compile(
"|".join(
[re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
),
flags=re.DOTALL,
)
patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
patt_note_container = re.compile(r"{%note_container(.*?)%}")
patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
output = patt_quote_container.sub(r"\1" if self.text else "", output)
output = patt_note_container.sub(r"\1" if self.content else "", output)
output = patt_tag_container.sub(r"\1" if self.tag else "", output)
return pattern.sub(lambda x: replacements[x.group(0)], output)
@property
def colorname(self):
@ -56,16 +67,15 @@ class Annotation:
using Euclidean distance between the two color vectors.
"""
annot_colors = (
self.colors or (0.0, 0.0, 0.0)
self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
)
nearest = None
minimum_similarity = (
papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0
)
minimum_similarity = self.minimum_similarity_color
for name, values in COLORS.items():
similarity_ratio = self._color_similarity_ratio(values, annot_colors)
if similarity_ratio >= minimum_similarity:
if similarity_ratio > minimum_similarity:
minimum_similarity = similarity_ratio
nearest = name
return nearest

View file

@ -1,53 +1,17 @@
import re
from pathlib import Path
from typing import Any, Optional
import Levenshtein
import magic
import fitz_new as fitz
import papis.logging
import papis.config
import papis.document
from papis.document import Document
from papis_extract.annotation_data import Annotation, AnnotatedDocument
from papis_extract.annotation_data import Annotation
logger = papis.logging.get_logger(__name__)
def start(
    documents: list[Document],
) -> list[AnnotatedDocument]:
    """Extract all annotations from passed documents.

    Returns all annotations contained in the papis
    documents passed in. Every document yields exactly one
    AnnotatedDocument, even when nothing could be extracted. Files that
    fail the processability check are skipped individually, so an
    unsuitable file listed first does not hide suitable files after it.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                # Skip only this file; later files may still be processable.
                continue
            found_pdf = True
            try:
                annotations.extend(extract(fname))
            except fitz.FileDataError as e:
                # Best effort: report the damaged file and keep going.
                print(f"File structure errors for {file}.\n{e}")
        if not found_pdf:
            # have to remove curly braces or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
        output.append(AnnotatedDocument(doc, annotations))
    return output
def extract(filename: Path) -> list[Annotation]:
def start(filename: Path) -> list[Annotation]:
"""Extract annotations from a file.
Returns all readable annotations contained in the file
@ -60,16 +24,11 @@ def extract(filename: Path) -> list[Annotation]:
quote, note = _retrieve_annotation_content(page, annot)
if not quote and not note:
continue
col = (
annot.colors.get("fill")
or annot.colors.get("stroke")
or (0.0, 0.0, 0.0)
)
a = Annotation(
file=str(filename),
text=quote or "",
content=note or "",
colors=col,
colors=annot.colors,
type=annot.type[1],
page=(page.number or 0) + 1,
)
@ -82,19 +41,6 @@ def extract(filename: Path) -> list[Annotation]:
return annotations
def is_pdf(fname: Path) -> bool:
    """Report whether the MIME type detected for ``fname`` is PDF."""
    detected_mime = magic.from_file(fname, mime=True)
    return detected_mime == "application/pdf"
def _is_file_processable(fname: Path) -> bool:
    """Return True only for an existing regular file that is a PDF.

    A path that is not a regular file is logged as an error; a file that
    is not a PDF is rejected without logging.
    """
    if fname.is_file():
        return is_pdf(fname)
    logger.error(f"File {str(fname)} not readable.")
    return False
def _tag_from_colorname(colorname: str) -> str:
color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
if not color_mapping:

13
poetry.lock generated
View file

@ -147,17 +147,6 @@ files = [
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
]
[[package]]
name = "chevron"
version = "0.14.0"
description = "Mustache templating language renderer"
optional = false
python-versions = "*"
files = [
{file = "chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443"},
{file = "chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf"},
]
[[package]]
name = "click"
version = "8.1.7"
@ -991,4 +980,4 @@ files = [
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "a3af36ed2941235df158c20ba9b66bdf5a0af0554235fd004ec77c3e88def3c3"
content-hash = "d519605837788792d06ffc7bca7a92b315612ca6052227c53c558ec49dffec9f"

View file

@ -14,7 +14,6 @@ papis = "^0.13"
click = "^8.1.7"
whoosh = "^2.7.4"
python-magic = "^0.4.27"
chevron = "^0.14.0"
[tool.poetry.plugins."papis.command"]
extract = "papis_extract:main"

View file

@ -1,51 +1,7 @@
import pytest
from papis_extract.annotation_data import Annotation
@pytest.mark.parametrize(
    ("fmt_string", "expected"),
    [
        ("{{quote}}", "I am the text value"),
        (
            "> {{quote}}\n{{#note}}Note: {{note}}{{/note}}",
            "> I am the text value\nNote: Whereas I represent the note",
        ),
        (
            "{{#note}}Note: {{note}}{{/note}}{{#page}}, p. {{page}}{{/page}}",
            "Note: Whereas I represent the note",
        ),
    ],
)
def test_formatting(fmt_string, expected):
    """Render each template against a fixed annotation and compare the output."""
    annotation = Annotation(
        "myfile",
        text="I am the text value",
        content="Whereas I represent the note",
    )
    rendered = annotation.format(fmt_string)
    assert rendered == expected
def test_colorname_matches_exact():
    """Pure red resolves to "red" with a minimum color similarity of 1.0."""
    pure_red = (1.0, 0.0, 0.0)
    annotation = Annotation(
        "testfile", colors=pure_red, minimum_similarity_color=1.0
    )
    assert annotation.colorname == "red"
# TODO inject closeness value instead of relying on default
@pytest.mark.parametrize(
"color_value",
[
(1.0, 0.0, 0.0),
(0.9, 0.0, 0.0),
(0.8, 0.0, 0.0),
(0.7, 0.0, 0.0),
(0.51, 0.0, 0.0),
],
)
def test_matches_inexact_colorname(color_value):
sut = Annotation(
"testfile", colors=color_value, minimum_similarity_color=0.833
)
def test_matches_colorname_exact():
sut = Annotation("testfile", colors={"stroke": (1.0, 0.0, 0.0)})
c_name = sut.colorname
assert c_name == "red"