Compare commits

..

No commits in common. "2c5d096d08327f030cc898f268682f35c08533c8" and "7ad1c2cf914e52bb4dd7425bce0fa20d8f739600" have entirely different histories.

6 changed files with 114 additions and 192 deletions

View file

@ -1,47 +0,0 @@
# CI pipeline definition: lint, build, and (on version tags) release steps.
# NOTE(review): nesting/indentation is not visible in this view; the comments
# below describe each step by the keys and commands listed under it.
branches: main
pipeline:
# Lint step: installs poetry and black, prints tool versions, then runs black
# over the repository through poetry.
code_lint:
image: python
commands:
- pip install poetry
- poetry install
- pip install black
- echo "----------------- running lint ------------------"
- python --version && poetry --version && black --version
- poetry run black .
# Build step: builds the distribution with poetry; restricted to the main branch.
build_dist:
image: python
commands:
- pip install poetry
- poetry install
- echo "----------------- running analysis ------------------"
- python --version && poetry --version
- poetry build
when:
branch: main
# Gitea release step: uploads dist/* using the gitea_release_token secret;
# runs on tag events matching v*.
gitea_release:
image: plugins/gitea-release
settings:
api_key:
from_secret: gitea_release_token
base_url: https://git.martyoeh.me
files: dist/*
title: NEWEST_VERSION.md
note: NEWEST_CHANGES.md
when:
event: tag
tag: v*
# PyPI release step: publishes with poetry using the PYPI_USERNAME and
# PYPI_PASSWORD environment variables; runs on tag events matching v*.
pypi_release:
image: python
commands:
- pip install poetry
- poetry install
- echo "----------------- publishing to pypi ------------------"
- poetry publish --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD"
when:
event: tag
tag: v*

View file

@ -160,7 +160,6 @@ content, because then we can just use that. It is harder to parse if it does not
- [ ] needs some way to delimit, within a note, where the plugin puts its extracted content and where the user's own content is
- [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between
- [x] another, probably simpler first, is to just append missing annotations to the end of the note
- [ ] use similarity search instead of literal search for existing annotations (Levenshtein distance)
- [x] some highlights (or annotations in general) do not contain text as content
- [x] pymupdf can extract the content of the underlying rectangle (mostly)
- [x] issue is that sometimes the highlight contents are in content, sometimes a user comment instead

View file

@ -1,23 +1,108 @@
import os
import re
import argparse
import math
from dataclasses import dataclass, field
from typing import Dict
import fitz
import Levenshtein
from pubs.plugins import PapersPlugin
from pubs.paper import Paper
from pubs.events import DocAddEvent, NoteEvent
from pubs import repo, pretty
from pubs.utils import resolve_citekey_list
from pubs.content import check_file, read_text_file, write_file
from pubs.query import get_paper_filter
from .annotation import (
PaperAnnotated,
Annotation,
COLOR_SIMILARITY_MINIMUM,
TEXT_SIMILARITY_MINIMUM,
)
# NOTE(review): usage is not visible in this excerpt; presumably the number of
# matched papers above which the user is asked to confirm — TODO confirm.
CONFIRMATION_PAPER_THRESHOLD = 5
# NOTE(review): not referenced in the visible code; presumably the minimum
# similarity ratio for treating two annotation texts as equal — TODO confirm.
TEXT_SIMILARITY_MINIMUM = 0.75
# Similarity ratio a named color has to exceed before Annotation.colorname
# reports it as the annotation's color.
COLOR_SIMILARITY_MINIMUM = 0.833
# Named reference colors as RGB tuples with components between 0 and 1.
COLORS = {
"red": (1, 0, 0),
"green": (0, 1, 0),
"blue": (0, 0, 1),
"yellow": (1, 1, 0),
"purple": (0.5, 0, 0.5),
"orange": (1, 0.65, 0),
}
@dataclass
class Annotation:
    """A PDF annotation object.

    Bundles the data read from one annotation together with the paper it
    belongs to, and knows how to render itself through a formatting pattern.
    """

    # Paper the annotated document belongs to; used to build the headline.
    paper: Paper
    # Document the annotation was read from.
    file: str
    # Annotation type name.
    type: str = "Highlight"
    # Quoted text; substituted for the {quote} marker.
    text: str = ""
    # Note text; substituted for the {note} marker.
    content: str = ""
    # Page number; substituted for the {page} marker.
    page: int = 1
    # Color mapping; colorname reads its "stroke" and "fill" entries.
    colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
    # Tag text; substituted for the {tag} marker.
    tag: str = ""

    # Optional sections of a formatting pattern. DOTALL lets a section span
    # line breaks; without it such a section would be left in the output as-is.
    _QUOTE_CONTAINER = re.compile(r"{%quote_container(.*?)%}", flags=re.DOTALL)
    _NOTE_CONTAINER = re.compile(r"{%note_container(.*?)%}", flags=re.DOTALL)
    _TAG_CONTAINER = re.compile(r"{%tag_container(.*?)%}", flags=re.DOTALL)

    def format(self, formatting):
        """Return a formatted string of the annotation.

        Given a provided formatting pattern, this method returns the annotation
        formatted with the correct marker replacements and removals, ready
        for display or writing.

        A ``{%quote_container ...%}``, ``{%note_container ...%}`` or
        ``{%tag_container ...%}`` section is reduced to its inner part when the
        annotation has a quote, note or tag respectively, and removed entirely
        otherwise. Afterwards the markers ``{quote}``, ``{note}``, ``{page}``,
        ``{newline}`` and ``{tag}`` are replaced by their values.
        """
        output = formatting
        # Containers are resolved before the markers so that annotation text
        # which itself looks like a container is never interpreted.
        for container, value in (
            (self._QUOTE_CONTAINER, self.text),
            (self._NOTE_CONTAINER, self.content),
            (self._TAG_CONTAINER, self.tag),
        ):
            output = container.sub(r"\1" if value else "", output)

        replacements = {
            r"{quote}": self.text,
            r"{note}": self.content,
            r"{page}": str(self.page),
            r"{newline}": "\n",
            r"{tag}": self.tag,
        }
        # All markers are replaced in a single pass, so a marker occurring
        # inside inserted text is not expanded a second time.
        markers = re.compile(
            "|".join(
                re.escape(key)
                for key in sorted(replacements, key=len, reverse=True)
            )
        )
        return markers.sub(lambda match: replacements[match.group(0)], output)

    @property
    def colorname(self):
        """Return the stringified version of the annotation color.

        Finds the closest named color to the annotation and returns it.
        Returns None when no named color is more similar than
        COLOR_SIMILARITY_MINIMUM.
        """
        annot_colors = (
            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
        )
        nearest = None
        # Raised to the best ratio seen so far, so only a strictly closer
        # color can replace the current candidate.
        minimum_similarity = COLOR_SIMILARITY_MINIMUM
        for name, values in COLORS.items():
            similarity_ratio = self._color_similarity_ratio(values, annot_colors)
            if similarity_ratio > minimum_similarity:
                minimum_similarity = similarity_ratio
                nearest = name
        return nearest

    def headline(self, short=False, max_authors=3):
        """Return the one-line description of the annotation's paper.

        Built with pubs' paper_oneliner (citekey only when short is True),
        with any "[pdf]" marker removed and trailing whitespace stripped.
        """
        headline = pretty.paper_oneliner(
            self.paper, citekey_only=short, max_authors=max_authors
        )
        return re.sub(r"\[pdf\]", "", headline).rstrip()

    def _color_similarity_ratio(self, color_one, color_two):
        """Return a similarity score for two colors, 1 meaning identical.

        Takes two rgb color tuples made of floats between 0 and 1, e.g.
        (1, 0.65, 0) for orange, and returns one minus a third of their
        euclidean distance. Identical colors give 1; the most distant pair,
        full black and full white, gives roughly 0.42.
        """
        # A distance is never negative, so no abs() is needed.
        return 1 - (math.dist(list(color_one), list(color_two)) / 3)
class ExtractPlugin(PapersPlugin):
@ -39,7 +124,6 @@ class ExtractPlugin(PapersPlugin):
def __init__(self, conf, ui):
self.ui = ui
self.note_extension = conf["main"]["note_extension"]
self.max_authors = conf["main"]["max_authors"]
self.repository = repo.Repository(conf)
self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
self.broker = self.repository.databroker
@ -129,12 +213,12 @@ class ExtractPlugin(PapersPlugin):
Returns all annotations belonging to the papers that
are described by the citekeys passed in.
"""
papers_annotated = []
papers_annotated = {}
for paper in papers:
file = self._get_file(paper)
try:
annotations = self._get_annotations(file)
papers_annotated.append(PaperAnnotated.from_paper(paper, annotations))
annotations = self._get_annotations(file, paper)
papers_annotated[paper.citekey] = annotations
except fitz.FileDataError as e:
self.ui.error(f"Document {file} is broken: {e}")
return papers_annotated
@ -172,7 +256,7 @@ class ExtractPlugin(PapersPlugin):
self.ui.message(
"\n".join(
pretty.paper_oneliner(
p, citekey_only=False, max_authors=self.max_authors
p, citekey_only=False, max_authors=conf["main"]["max_authors"]
)
for p in papers
)
@ -194,7 +278,7 @@ class ExtractPlugin(PapersPlugin):
self.ui.warning(f"{paper.citekey} has no valid document.")
return path
def _get_annotations(self, filename):
def _get_annotations(self, filename, paper):
"""Extract annotations from a file.
Returns all readable annotations contained in the file
@ -208,6 +292,7 @@ class ExtractPlugin(PapersPlugin):
quote, note = self._retrieve_annotation_content(page, annot)
a = Annotation(
file=filename,
paper=paper,
text=quote,
content=note,
colors=annot.colors,
@ -242,16 +327,16 @@ class ExtractPlugin(PapersPlugin):
# highlight with selection not in note
return (written, "")
def _to_stdout(self, annotated_papers, short_header=False):
def _to_stdout(self, annotated_papers, short_header=True):
"""Write annotations to stdout.
Simply outputs the gathered annotations over stdout
ready to be passed on through pipelines etc.
"""
output = ""
for paper in annotated_papers:
output += f"\n------ {paper.headline(self.short_header, self.max_authors)} ------\n\n"
for annotation in paper.annotations:
for citekey, annotations in annotated_papers.items():
output += f"\n------ {annotations[0].headline(short=short_header)} ------\n\n"
for annotation in annotations:
output += f"{annotation.format(self.formatting)}\n"
output += "\n"
self.ui.message(output.strip())
@ -263,35 +348,31 @@ class ExtractPlugin(PapersPlugin):
in the pubs notes directory. Creates new notes for
citekeys missing a note or appends to existing.
"""
for paper in annotated_papers:
if paper.annotations:
notepath = self.broker.real_notepath(paper.citekey, note_extension)
for citekey, annotations in annotated_papers.items():
if annotations:
notepath = self.broker.real_notepath(citekey, note_extension)
if check_file(notepath, fail=False):
self._append_to_note(notepath, paper)
self._append_to_note(notepath, annotations)
else:
self._write_new_note(
notepath,
paper,
paper.headline(short=True, max_authors=self.max_authors),
)
self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.")
self._write_new_note(notepath, annotations)
self.ui.info(f"Wrote annotations to {citekey} note {notepath}.")
if edit is True:
self.ui.edit_file(notepath, temporary=False)
NoteEvent(paper.citekey).send()
NoteEvent(citekey).send()
def _write_new_note(self, notepath, paper, headline):
def _write_new_note(self, notepath, annotations):
"""Create a new note containing the annotations.
Will create a new note in the notes folder of pubs
and fill it with the annotations extracted from pdf.
"""
output = f"# {headline}\n\n"
for annotation in paper.annotations:
output = f"# {annotations[0].headline(short=short_header)}\n\n"
for annotation in annotations:
output += f"{annotation.format(self.formatting)}\n\n"
write_file(notepath, output, "w")
def _append_to_note(self, notepath, paper):
def _append_to_note(self, notepath, annotations):
"""Append new annotations to the end of a note.
Looks through note to determine any new annotations which should be
@ -300,7 +381,7 @@ class ExtractPlugin(PapersPlugin):
existing = read_text_file(notepath)
# removed annotations already found in the note
existing_dropped = [
x for x in paper.annotations if x.format(self.formatting) not in existing
x for x in annotations if x.format(self.formatting) not in existing
]
if not existing_dropped:
return

View file

@ -1,111 +0,0 @@
import math
import re
from dataclasses import dataclass, field
from typing import Dict
from pubs.paper import Paper
from pubs import pretty
# NOTE(review): not referenced in the visible code; presumably the minimum
# similarity ratio for treating two annotation texts as equal — TODO confirm.
TEXT_SIMILARITY_MINIMUM = 0.75
# Similarity ratio a named color has to exceed before Annotation.colorname
# reports it as the annotation's color.
COLOR_SIMILARITY_MINIMUM = 0.833
# Named reference colors as RGB tuples with components between 0 and 1.
COLORS = {
"red": (1, 0, 0),
"green": (0, 1, 0),
"blue": (0, 0, 1),
"yellow": (1, 1, 0),
"purple": (0.5, 0, 0.5),
"orange": (1, 0.65, 0),
}
class PaperAnnotated(Paper):
    """A paper together with the annotations belonging to it."""

    def __init__(self, citekey, bibdata, metadata=None, annotations=None):
        """Create the paper and attach its annotations.

        citekey, bibdata and metadata are handed to Paper unchanged.
        annotations is stored as given; when omitted, each instance gets its
        own new empty list rather than one list shared between instances.
        """
        super().__init__(citekey, bibdata, metadata)
        self.annotations = [] if annotations is None else annotations

    @classmethod
    def from_paper(cls, paper, annotations=None):
        """Build an annotated paper from an existing paper's data."""
        return cls(paper.citekey, paper.bibdata, paper.metadata, annotations)

    def __repr__(self):
        return "PaperAnnotated(%s, %s, %s)" % (
            self.citekey,
            self.bibdata,
            self.metadata,
        )

    def headline(self, short=False, max_authors=3):
        """Return the one-line description of the paper.

        Built with pubs' paper_oneliner (citekey only when short is True),
        with any "[pdf]" marker removed and trailing whitespace stripped.
        """
        headline = pretty.paper_oneliner(
            self, citekey_only=short, max_authors=max_authors
        )
        return re.sub(r"\[pdf\]", "", headline).rstrip()
@dataclass
class Annotation:
    """A PDF annotation object.

    Holds the data read from one annotation and knows how to render itself
    through a formatting pattern.
    """

    # Document the annotation was read from.
    file: str
    # Annotation type name.
    type: str = "Highlight"
    # Quoted text; substituted for the {quote} marker.
    text: str = ""
    # Note text; substituted for the {note} marker.
    content: str = ""
    # Page number; substituted for the {page} marker.
    page: int = 1
    # Color mapping; colorname reads its "stroke" and "fill" entries.
    colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
    # Tag text; substituted for the {tag} marker.
    tag: str = ""

    # Optional sections of a formatting pattern. DOTALL lets a section span
    # line breaks; without it such a section would be left in the output as-is.
    _QUOTE_CONTAINER = re.compile(r"{%quote_container(.*?)%}", flags=re.DOTALL)
    _NOTE_CONTAINER = re.compile(r"{%note_container(.*?)%}", flags=re.DOTALL)
    _TAG_CONTAINER = re.compile(r"{%tag_container(.*?)%}", flags=re.DOTALL)

    def format(self, formatting):
        """Return a formatted string of the annotation.

        Given a provided formatting pattern, this method returns the annotation
        formatted with the correct marker replacements and removals, ready
        for display or writing.

        A ``{%quote_container ...%}``, ``{%note_container ...%}`` or
        ``{%tag_container ...%}`` section is reduced to its inner part when the
        annotation has a quote, note or tag respectively, and removed entirely
        otherwise. Afterwards the markers ``{quote}``, ``{note}``, ``{page}``,
        ``{newline}`` and ``{tag}`` are replaced by their values.
        """
        output = formatting
        # Containers are resolved before the markers so that annotation text
        # which itself looks like a container is never interpreted.
        for container, value in (
            (self._QUOTE_CONTAINER, self.text),
            (self._NOTE_CONTAINER, self.content),
            (self._TAG_CONTAINER, self.tag),
        ):
            output = container.sub(r"\1" if value else "", output)

        replacements = {
            r"{quote}": self.text,
            r"{note}": self.content,
            r"{page}": str(self.page),
            r"{newline}": "\n",
            r"{tag}": self.tag,
        }
        # All markers are replaced in a single pass, so a marker occurring
        # inside inserted text is not expanded a second time.
        markers = re.compile(
            "|".join(
                re.escape(key)
                for key in sorted(replacements, key=len, reverse=True)
            )
        )
        return markers.sub(lambda match: replacements[match.group(0)], output)

    @property
    def colorname(self):
        """Return the stringified version of the annotation color.

        Finds the closest named color to the annotation and returns it.
        Returns None when no named color is more similar than
        COLOR_SIMILARITY_MINIMUM.
        """
        annot_colors = (
            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
        )
        nearest = None
        # Raised to the best ratio seen so far, so only a strictly closer
        # color can replace the current candidate.
        minimum_similarity = COLOR_SIMILARITY_MINIMUM
        for name, values in COLORS.items():
            similarity_ratio = self._color_similarity_ratio(values, annot_colors)
            if similarity_ratio > minimum_similarity:
                minimum_similarity = similarity_ratio
                nearest = name
        return nearest

    def _color_similarity_ratio(self, color_one, color_two):
        """Return a similarity score for two colors, 1 meaning identical.

        Takes two rgb color tuples made of floats between 0 and 1, e.g.
        (1, 0.65, 0) for orange, and returns one minus a third of their
        euclidean distance. Identical colors give 1; the most distant pair,
        full black and full white, gives roughly 0.42.
        """
        # A distance is never negative, so no abs() is needed.
        return 1 - (math.dist(list(color_one), list(color_two)) / 3)

View file

@ -5,7 +5,7 @@ description = "A pdf annotation extraction plugin for pubs bibliography manager"
authors = ["Marty Oehme <marty.oehme@gmail.com>"]
license = "LGPL-3.0"
readme = "README.md"
packages = [{include = "pubs"}]
packages = [{include = "extract"}]
[tool.poetry.dependencies]
python = "^3.10"