Compare commits
3 commits
b564ab4792
...
20873e6ef8
Author | SHA1 | Date | |
---|---|---|---|
20873e6ef8 | |||
256117d451 | |||
e325b89c9b |
6 changed files with 147 additions and 83 deletions
|
@ -1,19 +1,14 @@
|
||||||
from pathlib import Path
|
|
||||||
import re
|
|
||||||
|
|
||||||
import click
|
import click
|
||||||
import fitz_new as fitz
|
|
||||||
import magic
|
|
||||||
import papis.cli
|
import papis.cli
|
||||||
import papis.config
|
import papis.config
|
||||||
import papis.document
|
import papis.document
|
||||||
from papis.document import Document
|
|
||||||
import papis.logging
|
import papis.logging
|
||||||
import papis.notes
|
import papis.notes
|
||||||
import papis.strings
|
import papis.strings
|
||||||
|
from papis.document import Document
|
||||||
|
|
||||||
from papis_extract import extractor, exporter
|
from papis_extract import extractor, exporter
|
||||||
from papis_extract.annotation_data import Annotation, AnnotatedDocument
|
from papis_extract.annotation_data import AnnotatedDocument
|
||||||
|
|
||||||
logger = papis.logging.get_logger(__name__)
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
@ -76,50 +71,19 @@ def main(
|
||||||
logger.warning(papis.strings.no_documents_retrieved_message)
|
logger.warning(papis.strings.no_documents_retrieved_message)
|
||||||
return
|
return
|
||||||
|
|
||||||
doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
|
run(documents, edit=manual, write=write, git=git)
|
||||||
|
|
||||||
|
|
||||||
|
def run(
|
||||||
|
documents: list[Document],
|
||||||
|
edit: bool = False,
|
||||||
|
write: bool = False,
|
||||||
|
git: bool = False,
|
||||||
|
) -> None:
|
||||||
|
|
||||||
|
doc_annotations: list[AnnotatedDocument] = extractor.start(documents)
|
||||||
|
|
||||||
if write:
|
if write:
|
||||||
exporter.to_notes(doc_annotations, edit=manual, git=git)
|
exporter.to_notes(doc_annotations, edit=edit, git=git)
|
||||||
else:
|
else:
|
||||||
exporter.to_stdout(doc_annotations)
|
exporter.to_stdout(doc_annotations)
|
||||||
|
|
||||||
# note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
|
|
||||||
|
|
||||||
|
|
||||||
def is_pdf(fname: Path) -> bool:
|
|
||||||
return magic.from_file(fname, mime=True) == "application/pdf"
|
|
||||||
|
|
||||||
|
|
||||||
def _get_annotations_for_documents(
|
|
||||||
documents: list[Document],
|
|
||||||
) -> list[AnnotatedDocument]:
|
|
||||||
output: list[AnnotatedDocument] = []
|
|
||||||
for doc in documents:
|
|
||||||
annotations: list[Annotation] = []
|
|
||||||
found_pdf: bool = False
|
|
||||||
for file in doc.get_files():
|
|
||||||
fname = Path(file)
|
|
||||||
if not _is_file_processable(fname):
|
|
||||||
break
|
|
||||||
found_pdf = True
|
|
||||||
|
|
||||||
try:
|
|
||||||
annotations.extend(extractor.start(fname))
|
|
||||||
except fitz.FileDataError as e:
|
|
||||||
print(f"File structure errors for {file}.\n{e}")
|
|
||||||
|
|
||||||
if not found_pdf:
|
|
||||||
# have to remove curly braces or the papis logger gets upset
|
|
||||||
desc = re.sub("[{}]", "", papis.document.describe(doc))
|
|
||||||
logger.warning("Did not find suitable PDF file for document: " f"{desc}")
|
|
||||||
output.append(AnnotatedDocument(doc, annotations))
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def _is_file_processable(fname: Path) -> bool:
|
|
||||||
if not fname.is_file():
|
|
||||||
logger.error(f"File {str(fname)} not readable.")
|
|
||||||
return False
|
|
||||||
if not is_pdf(fname):
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
import re
|
|
||||||
import math
|
import math
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
import papis.config
|
import papis.config
|
||||||
from papis.document import Document
|
from papis.document import Document
|
||||||
|
import chevron
|
||||||
|
|
||||||
TEXT_SIMILARITY_MINIMUM = 0.75
|
TEXT_SIMILARITY_MINIMUM = 0.75
|
||||||
COLOR_SIMILARITY_MINIMUM = 0.833
|
COLOR_SIMILARITY_MINIMUM = 0.833
|
||||||
|
@ -23,12 +23,13 @@ class Annotation:
|
||||||
"""A PDF annotation object"""
|
"""A PDF annotation object"""
|
||||||
|
|
||||||
file: str
|
file: str
|
||||||
type: str = "Highlight"
|
colors: tuple[float, float, float] = field(default_factory=lambda: (0.0, 0.0, 0.0))
|
||||||
text: str = ""
|
|
||||||
content: str = ""
|
content: str = ""
|
||||||
page: int = 1
|
page: int = 0
|
||||||
colors: dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
|
|
||||||
tag: str = ""
|
tag: str = ""
|
||||||
|
text: str = ""
|
||||||
|
type: str = "Highlight"
|
||||||
|
minimum_similarity_color: float = 1.0
|
||||||
|
|
||||||
def format(self, formatting):
|
def format(self, formatting):
|
||||||
"""Return a formatted string of the annotation.
|
"""Return a formatted string of the annotation.
|
||||||
|
@ -37,27 +38,15 @@ class Annotation:
|
||||||
formatted with the correct marker replacements and removals, ready
|
formatted with the correct marker replacements and removals, ready
|
||||||
for display or writing.
|
for display or writing.
|
||||||
"""
|
"""
|
||||||
output = formatting
|
data = {
|
||||||
replacements = {
|
"file": self.file,
|
||||||
r"{quote}": self.text,
|
"quote": self.text,
|
||||||
r"{note}": self.content,
|
"note": self.content,
|
||||||
r"{page}": str(self.page),
|
"page": self.page,
|
||||||
r"{newline}": "\n",
|
"tag": self.tag,
|
||||||
r"{tag}": self.tag,
|
"type": self.type,
|
||||||
}
|
}
|
||||||
pattern = re.compile(
|
return chevron.render(formatting, data)
|
||||||
"|".join(
|
|
||||||
[re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
|
|
||||||
),
|
|
||||||
flags=re.DOTALL,
|
|
||||||
)
|
|
||||||
patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
|
|
||||||
patt_note_container = re.compile(r"{%note_container(.*?)%}")
|
|
||||||
patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
|
|
||||||
output = patt_quote_container.sub(r"\1" if self.text else "", output)
|
|
||||||
output = patt_note_container.sub(r"\1" if self.content else "", output)
|
|
||||||
output = patt_tag_container.sub(r"\1" if self.tag else "", output)
|
|
||||||
return pattern.sub(lambda x: replacements[x.group(0)], output)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def colorname(self):
|
def colorname(self):
|
||||||
|
@ -67,15 +56,16 @@ class Annotation:
|
||||||
using Euclidean distance between the two color vectors.
|
using Euclidean distance between the two color vectors.
|
||||||
"""
|
"""
|
||||||
annot_colors = (
|
annot_colors = (
|
||||||
self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
|
self.colors or (0.0, 0.0, 0.0)
|
||||||
)
|
)
|
||||||
nearest = None
|
nearest = None
|
||||||
minimum_similarity = (
|
minimum_similarity = (
|
||||||
papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0
|
papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0
|
||||||
)
|
)
|
||||||
|
minimum_similarity = self.minimum_similarity_color
|
||||||
for name, values in COLORS.items():
|
for name, values in COLORS.items():
|
||||||
similarity_ratio = self._color_similarity_ratio(values, annot_colors)
|
similarity_ratio = self._color_similarity_ratio(values, annot_colors)
|
||||||
if similarity_ratio > minimum_similarity:
|
if similarity_ratio >= minimum_similarity:
|
||||||
minimum_similarity = similarity_ratio
|
minimum_similarity = similarity_ratio
|
||||||
nearest = name
|
nearest = name
|
||||||
return nearest
|
return nearest
|
||||||
|
|
|
@ -1,17 +1,53 @@
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
import Levenshtein
|
import Levenshtein
|
||||||
|
import magic
|
||||||
import fitz_new as fitz
|
import fitz_new as fitz
|
||||||
import papis.logging
|
import papis.logging
|
||||||
import papis.config
|
import papis.config
|
||||||
|
import papis.document
|
||||||
|
from papis.document import Document
|
||||||
|
|
||||||
from papis_extract.annotation_data import Annotation
|
from papis_extract.annotation_data import Annotation, AnnotatedDocument
|
||||||
|
|
||||||
logger = papis.logging.get_logger(__name__)
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def start(filename: Path) -> list[Annotation]:
|
def start(
|
||||||
|
documents: list[Document],
|
||||||
|
) -> list[AnnotatedDocument]:
|
||||||
|
"""Extract all annotations from passed documents.
|
||||||
|
|
||||||
|
Returns all annotations contained in the papis
|
||||||
|
documents passed in.
|
||||||
|
"""
|
||||||
|
|
||||||
|
output: list[AnnotatedDocument] = []
|
||||||
|
for doc in documents:
|
||||||
|
annotations: list[Annotation] = []
|
||||||
|
found_pdf: bool = False
|
||||||
|
for file in doc.get_files():
|
||||||
|
fname = Path(file)
|
||||||
|
if not _is_file_processable(fname):
|
||||||
|
break
|
||||||
|
found_pdf = True
|
||||||
|
|
||||||
|
try:
|
||||||
|
annotations.extend(extract(fname))
|
||||||
|
except fitz.FileDataError as e:
|
||||||
|
print(f"File structure errors for {file}.\n{e}")
|
||||||
|
|
||||||
|
if not found_pdf:
|
||||||
|
# have to remove curly braces or the papis logger gets upset
|
||||||
|
desc = re.sub("[{}]", "", papis.document.describe(doc))
|
||||||
|
logger.warning("Did not find suitable PDF file for document: " f"{desc}")
|
||||||
|
output.append(AnnotatedDocument(doc, annotations))
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def extract(filename: Path) -> list[Annotation]:
|
||||||
"""Extract annotations from a file.
|
"""Extract annotations from a file.
|
||||||
|
|
||||||
Returns all readable annotations contained in the file
|
Returns all readable annotations contained in the file
|
||||||
|
@ -24,11 +60,16 @@ def start(filename: Path) -> list[Annotation]:
|
||||||
quote, note = _retrieve_annotation_content(page, annot)
|
quote, note = _retrieve_annotation_content(page, annot)
|
||||||
if not quote and not note:
|
if not quote and not note:
|
||||||
continue
|
continue
|
||||||
|
col = (
|
||||||
|
annot.colors.get("fill")
|
||||||
|
or annot.colors.get("stroke")
|
||||||
|
or (0.0, 0.0, 0.0)
|
||||||
|
)
|
||||||
a = Annotation(
|
a = Annotation(
|
||||||
file=str(filename),
|
file=str(filename),
|
||||||
text=quote or "",
|
text=quote or "",
|
||||||
content=note or "",
|
content=note or "",
|
||||||
colors=annot.colors,
|
colors=col,
|
||||||
type=annot.type[1],
|
type=annot.type[1],
|
||||||
page=(page.number or 0) + 1,
|
page=(page.number or 0) + 1,
|
||||||
)
|
)
|
||||||
|
@ -41,6 +82,19 @@ def start(filename: Path) -> list[Annotation]:
|
||||||
return annotations
|
return annotations
|
||||||
|
|
||||||
|
|
||||||
|
def is_pdf(fname: Path) -> bool:
|
||||||
|
return magic.from_file(fname, mime=True) == "application/pdf"
|
||||||
|
|
||||||
|
|
||||||
|
def _is_file_processable(fname: Path) -> bool:
|
||||||
|
if not fname.is_file():
|
||||||
|
logger.error(f"File {str(fname)} not readable.")
|
||||||
|
return False
|
||||||
|
if not is_pdf(fname):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def _tag_from_colorname(colorname: str) -> str:
|
def _tag_from_colorname(colorname: str) -> str:
|
||||||
color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
|
color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
|
||||||
if not color_mapping:
|
if not color_mapping:
|
||||||
|
|
13
poetry.lock
generated
13
poetry.lock
generated
|
@ -147,6 +147,17 @@ files = [
|
||||||
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
|
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chevron"
|
||||||
|
version = "0.14.0"
|
||||||
|
description = "Mustache templating language renderer"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443"},
|
||||||
|
{file = "chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "click"
|
name = "click"
|
||||||
version = "8.1.7"
|
version = "8.1.7"
|
||||||
|
@ -980,4 +991,4 @@ files = [
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.11"
|
python-versions = "^3.11"
|
||||||
content-hash = "d519605837788792d06ffc7bca7a92b315612ca6052227c53c558ec49dffec9f"
|
content-hash = "a3af36ed2941235df158c20ba9b66bdf5a0af0554235fd004ec77c3e88def3c3"
|
||||||
|
|
|
@ -14,6 +14,7 @@ papis = "^0.13"
|
||||||
click = "^8.1.7"
|
click = "^8.1.7"
|
||||||
whoosh = "^2.7.4"
|
whoosh = "^2.7.4"
|
||||||
python-magic = "^0.4.27"
|
python-magic = "^0.4.27"
|
||||||
|
chevron = "^0.14.0"
|
||||||
|
|
||||||
[tool.poetry.plugins."papis.command"]
|
[tool.poetry.plugins."papis.command"]
|
||||||
extract = "papis_extract:main"
|
extract = "papis_extract:main"
|
||||||
|
|
|
@ -1,7 +1,51 @@
|
||||||
|
import pytest
|
||||||
from papis_extract.annotation_data import Annotation
|
from papis_extract.annotation_data import Annotation
|
||||||
|
|
||||||
|
|
||||||
def test_matches_colorname_exact():
|
@pytest.mark.parametrize(
|
||||||
sut = Annotation("testfile", colors={"stroke": (1.0, 0.0, 0.0)})
|
"fmt_string,expected",
|
||||||
|
[
|
||||||
|
("{{quote}}", "I am the text value"),
|
||||||
|
(
|
||||||
|
"> {{quote}}\n{{#note}}Note: {{note}}{{/note}}",
|
||||||
|
"> I am the text value\nNote: Whereas I represent the note",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"{{#note}}Note: {{note}}{{/note}}{{#page}}, p. {{page}}{{/page}}",
|
||||||
|
"Note: Whereas I represent the note",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_formatting(fmt_string, expected):
|
||||||
|
sut = Annotation(
|
||||||
|
"myfile",
|
||||||
|
text="I am the text value",
|
||||||
|
content="Whereas I represent the note",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert sut.format(fmt_string) == expected
|
||||||
|
|
||||||
|
def test_colorname_matches_exact():
|
||||||
|
sut = Annotation(
|
||||||
|
"testfile", colors=(1.0,0.0,0.0), minimum_similarity_color=1.0
|
||||||
|
)
|
||||||
|
c_name = sut.colorname
|
||||||
|
assert c_name == "red"
|
||||||
|
|
||||||
|
# TODO inject closeness value instead of relying on default
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"color_value",
|
||||||
|
[
|
||||||
|
(1.0, 0.0, 0.0),
|
||||||
|
(0.9, 0.0, 0.0),
|
||||||
|
(0.8, 0.0, 0.0),
|
||||||
|
(0.7, 0.0, 0.0),
|
||||||
|
(0.51, 0.0, 0.0),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_matches_inexact_colorname(color_value):
|
||||||
|
sut = Annotation(
|
||||||
|
"testfile", colors=color_value, minimum_similarity_color=0.833
|
||||||
|
)
|
||||||
c_name = sut.colorname
|
c_name = sut.colorname
|
||||||
assert c_name == "red"
|
assert c_name == "red"
|
||||||
|
|
Loading…
Reference in a new issue