Compare commits

..

5 commits

Author SHA1 Message Date
2c5d096d08
Add continuous integration pipeline
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2023-01-13 19:14:06 +01:00
0634cbb381
Format plugin file with black 2023-01-13 19:06:20 +01:00
e1a7d9f613
Move plugin to nested folder for easy installation
Moved the plugin files to the directory structure they will have in the
python site-packages directory, being placed within the plugs directory
of the pubs application directory.
2023-01-13 18:50:42 +01:00
04c8a8ed0b
Refactor extraction to use PaperAnnotated class 2022-12-25 12:17:05 +01:00
7a415b4d7d
Refactor annotation into separate class 2022-12-25 10:56:56 +01:00
6 changed files with 192 additions and 114 deletions

47
.woodpecker.yml Normal file
View file

@ -0,0 +1,47 @@
# Woodpecker CI pipeline: lint the code, build the distribution and, for
# version tags, publish a Gitea release and upload to PyPI.
branches: main

pipeline:
  code_lint:
    image: python
    commands:
      - pip install poetry
      - poetry install
      - pip install black
      - echo "----------------- running lint ------------------"
      - python --version && poetry --version && black --version
      # --check makes the step fail on unformatted code. A bare `black .`
      # only rewrites files inside the throwaway container and always exits 0,
      # so the lint step could never fail.
      - poetry run black --check .

  build_dist:
    image: python
    commands:
      - pip install poetry
      - poetry install
      - echo "----------------- running analysis ------------------"
      - python --version && poetry --version
      - poetry build
    when:
      branch: main

  gitea_release:
    image: plugins/gitea-release
    settings:
      api_key:
        from_secret: gitea_release_token
      base_url: https://git.martyoeh.me
      files: dist/*
      title: NEWEST_VERSION.md
      note: NEWEST_CHANGES.md
    when:
      event: tag
      tag: v*

  pypi_release:
    image: python
    commands:
      - pip install poetry
      - poetry install
      - echo "----------------- publishing to pypi ------------------"
      # NOTE(review): PYPI_USERNAME / PYPI_PASSWORD are not declared anywhere
      # in this file; confirm they are exposed to this step (e.g. through a
      # `secrets:` list), otherwise the publish runs with empty credentials.
      - poetry publish --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD"
    when:
      event: tag
      tag: v*

View file

@ -160,6 +160,7 @@ content, because then we can just use that. It is harder to parse if it does not
- [ ] needs some way to delimit where it puts stuff and user stuff is in note - [ ] needs some way to delimit where it puts stuff and user stuff is in note
- [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between - [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between
- [x] another, probably simpler first, is to just append missing annotations to the end of the note - [x] another, probably simpler first, is to just append missing annotations to the end of the note
- [ ] use similarity search instead of literal search for existing annotations (e.g. Levenshtein distance)
- [x] some highlights (or annotations in general) do not contain text as content - [x] some highlights (or annotations in general) do not contain text as content
- [x] pymupdf can extract the content of the underlying rectangle (mostly) - [x] pymupdf can extract the content of the underlying rectangle (mostly)
- [x] issue is that sometimes the highlight contents are in content, sometimes a user comment instead - [x] issue is that sometimes the highlight contents are in content, sometimes a user comment instead

View file

@ -0,0 +1,111 @@
import math
import re
from dataclasses import dataclass, field
from typing import Dict
from pubs.paper import Paper
from pubs import pretty
# Minimum ratio for two texts to count as similar.
# NOTE(review): not used in this module; it is imported by the plugin module,
# presumably for Levenshtein-based text comparison. Confirm at the call site.
TEXT_SIMILARITY_MINIMUM = 0.75
# Floor that a color similarity ratio must exceed before Annotation.colorname
# accepts a named color as a match.
COLOR_SIMILARITY_MINIMUM = 0.833
# Named reference colors as RGB tuples with components between 0 and 1;
# Annotation.colorname picks the entry most similar to the annotation color.
COLORS = {
"red": (1, 0, 0),
"green": (0, 1, 0),
"blue": (0, 0, 1),
"yellow": (1, 1, 0),
"purple": (0.5, 0, 0.5),
"orange": (1, 0.65, 0),
}
class PaperAnnotated(Paper):
    """A pubs ``Paper`` that additionally carries a list of annotations."""

    def __init__(self, citekey, bibdata, metadata=None, annotations=None):
        """Create the paper and attach its annotations.

        ``citekey``, ``bibdata`` and ``metadata`` are forwarded unchanged to
        ``Paper.__init__``. ``annotations`` is stored as ``self.annotations``;
        when omitted, a new empty list is used.
        """
        super().__init__(citekey, bibdata, metadata)
        # Build the fallback list per instance: a `[]` default in the
        # signature is created once and would be shared (and mutated) across
        # every PaperAnnotated constructed without explicit annotations.
        self.annotations = [] if annotations is None else annotations

    @classmethod
    def from_paper(cls, paper, annotations=None):
        """Build a ``PaperAnnotated`` from an existing paper object.

        Reuses the paper's ``citekey``, ``bibdata`` and ``metadata`` and
        attaches ``annotations`` (a new empty list when omitted).
        """
        return cls(paper.citekey, paper.bibdata, paper.metadata, annotations)

    def __repr__(self):
        return "PaperAnnotated(%s, %s, %s)" % (
            self.citekey,
            self.bibdata,
            self.metadata,
        )

    def headline(self, short=False, max_authors=3):
        """Return a one-line description of the paper without the pdf marker.

        The line is produced by ``pretty.paper_oneliner`` (``short`` is passed
        as ``citekey_only``); every literal ``[pdf]`` is then removed and
        trailing whitespace stripped.
        """
        headline = pretty.paper_oneliner(
            self, citekey_only=short, max_authors=max_authors
        )
        return re.sub(r"\[pdf\]", "", headline).rstrip()
@dataclass
class Annotation:
    """A PDF annotation object."""

    # Name of the file the annotation was extracted from.
    file: str
    type: str = "Highlight"
    # Quoted document text; substituted for the {quote} marker.
    text: str = ""
    # Note attached to the annotation; substituted for the {note} marker.
    content: str = ""
    page: int = 1
    # Color sequences keyed by "stroke" and/or "fill" (see colorname).
    colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
    tag: str = ""

    def format(self, formatting):
        """Return a formatted string of the annotation.

        Given a provided formatting pattern, this method returns the
        annotation formatted with the correct marker replacements and
        removals, ready for display or writing.

        Two kinds of markers are handled:

        * ``{%quote_container ...%}``, ``{%note_container ...%}`` and
          ``{%tag_container ...%}`` keep their inner text only when the
          annotation's text, content or tag respectively is non-empty;
          otherwise the whole container is removed. The container patterns
          do not match across line breaks.
        * ``{quote}``, ``{note}``, ``{page}``, ``{newline}`` and ``{tag}``
          are replaced by the corresponding values.
        """
        output = formatting
        replacements = {
            r"{quote}": self.text,
            r"{note}": self.content,
            r"{page}": str(self.page),
            r"{newline}": "\n",
            r"{tag}": self.tag,
        }
        # Longest markers first so that no marker can shadow a longer one.
        pattern = re.compile(
            "|".join(
                [re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
            ),
            flags=re.DOTALL,
        )
        patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
        patt_note_container = re.compile(r"{%note_container(.*?)%}")
        patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
        # Resolve containers first: unwrap them when their field has a value,
        # drop them (including their inner markers) when it does not.
        output = patt_quote_container.sub(r"\1" if self.text else "", output)
        output = patt_note_container.sub(r"\1" if self.content else "", output)
        output = patt_tag_container.sub(r"\1" if self.tag else "", output)
        # Single pass: substituted values are not scanned for markers again.
        return pattern.sub(lambda x: replacements[x.group(0)], output)

    @property
    def colorname(self):
        """Return the stringified version of the annotation color.

        Finds the closest named color in ``COLORS`` to the annotation's
        stroke color (falling back to the fill color, then to black) and
        returns its name. Returns ``None`` when no named color is more
        similar than ``COLOR_SIMILARITY_MINIMUM``.
        """
        annot_colors = (
            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
        )
        nearest = None
        minimum_similarity = COLOR_SIMILARITY_MINIMUM
        for name, values in COLORS.items():
            similarity_ratio = self._color_similarity_ratio(values, annot_colors)
            # Strictly greater: ties keep the earlier (first listed) color.
            if similarity_ratio > minimum_similarity:
                minimum_similarity = similarity_ratio
                nearest = name
        return nearest

    def _color_similarity_ratio(self, color_one, color_two):
        """Return the similarity of two colors as a float, 1 meaning identical.

        Takes two color tuples made of floats between 0 and 1, e.g.
        (1, 0.65, 0) for orange, and returns 1 minus their Euclidean
        distance divided by 3. For RGB colors the result therefore lies
        between roughly 0.42 (full black vs. full white) and 1 (same color).
        The divisor is kept as is because ``COLOR_SIMILARITY_MINIMUM`` is
        calibrated against this scale.

        Colors with a different number of components cannot be compared and
        yield 0.0 instead of raising.
        """
        first, second = tuple(color_one), tuple(color_two)
        # math.dist raises ValueError on mismatched dimensions; treat such
        # pairs as "not similar" so colorname degrades to None.
        if len(first) != len(second):
            return 0.0
        return 1 - (math.dist(first, second) / 3)

View file

@ -1,108 +1,23 @@
import os import os
import re
import argparse import argparse
import math
from dataclasses import dataclass, field
from typing import Dict
import fitz import fitz
import Levenshtein import Levenshtein
from pubs.plugins import PapersPlugin from pubs.plugins import PapersPlugin
from pubs.paper import Paper
from pubs.events import DocAddEvent, NoteEvent from pubs.events import DocAddEvent, NoteEvent
from pubs import repo, pretty from pubs import repo, pretty
from pubs.utils import resolve_citekey_list from pubs.utils import resolve_citekey_list
from pubs.content import check_file, read_text_file, write_file from pubs.content import check_file, read_text_file, write_file
from pubs.query import get_paper_filter from pubs.query import get_paper_filter
from .annotation import (
PaperAnnotated,
Annotation,
COLOR_SIMILARITY_MINIMUM,
TEXT_SIMILARITY_MINIMUM,
)
CONFIRMATION_PAPER_THRESHOLD = 5 CONFIRMATION_PAPER_THRESHOLD = 5
TEXT_SIMILARITY_MINIMUM = 0.75
COLOR_SIMILARITY_MINIMUM = 0.833
COLORS = {
"red": (1, 0, 0),
"green": (0, 1, 0),
"blue": (0, 0, 1),
"yellow": (1, 1, 0),
"purple": (0.5, 0, 0.5),
"orange": (1, 0.65, 0),
}
@dataclass
class Annotation:
"""A PDF annotation object"""
paper: Paper
file: str
type: str = "Highlight"
text: str = ""
content: str = ""
page: int = 1
colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
tag: str = ""
def format(self, formatting):
"""Return a formatted string of the annotation.
Given a provided formatting pattern, this method returns the annotation
formatted with the correct marker replacements and removals, ready
for display or writing.
"""
output = formatting
replacements = {
r"{quote}": self.text,
r"{note}": self.content,
r"{page}": str(self.page),
r"{newline}": "\n",
r"{tag}": self.tag,
}
pattern = re.compile(
"|".join(
[re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
),
flags=re.DOTALL,
)
patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
patt_note_container = re.compile(r"{%note_container(.*?)%}")
patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
output = patt_quote_container.sub(r"\1" if self.text else "", output)
output = patt_note_container.sub(r"\1" if self.content else "", output)
output = patt_tag_container.sub(r"\1" if self.tag else "", output)
return pattern.sub(lambda x: replacements[x.group(0)], output)
@property
def colorname(self):
"""Return the stringified version of the annotation color.
Finds the closest named color to the annotation and returns it.
"""
annot_colors = (
self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
)
nearest = None
minimum_similarity = COLOR_SIMILARITY_MINIMUM
for name, values in COLORS.items():
similarity_ratio = self._color_similarity_ratio(values, annot_colors)
if similarity_ratio > minimum_similarity:
minimum_similarity = similarity_ratio
nearest = name
return nearest
def headline(self, short=False, max_authors=3):
headline = pretty.paper_oneliner(self.paper, citekey_only=short, max_authors=max_authors)
return re.sub(r"\[pdf\]", "", headline).rstrip()
def _color_similarity_ratio(self, color_one, color_two):
"""Return the similarity of two colors between 0 and 1.
Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange,
and returns the similarity between them, with 1 being the same color and 0 being the
difference between full black and full white, as a float.
"""
return 1 - (abs(math.dist([*color_one], [*color_two])) / 3)
class ExtractPlugin(PapersPlugin): class ExtractPlugin(PapersPlugin):
@ -124,6 +39,7 @@ class ExtractPlugin(PapersPlugin):
def __init__(self, conf, ui): def __init__(self, conf, ui):
self.ui = ui self.ui = ui
self.note_extension = conf["main"]["note_extension"] self.note_extension = conf["main"]["note_extension"]
self.max_authors = conf["main"]["max_authors"]
self.repository = repo.Repository(conf) self.repository = repo.Repository(conf)
self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"]) self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
self.broker = self.repository.databroker self.broker = self.repository.databroker
@ -213,12 +129,12 @@ class ExtractPlugin(PapersPlugin):
Returns all annotations belonging to the papers that Returns all annotations belonging to the papers that
are described by the citekeys passed in. are described by the citekeys passed in.
""" """
papers_annotated = {} papers_annotated = []
for paper in papers: for paper in papers:
file = self._get_file(paper) file = self._get_file(paper)
try: try:
annotations = self._get_annotations(file, paper) annotations = self._get_annotations(file)
papers_annotated[paper.citekey] = annotations papers_annotated.append(PaperAnnotated.from_paper(paper, annotations))
except fitz.FileDataError as e: except fitz.FileDataError as e:
self.ui.error(f"Document {file} is broken: {e}") self.ui.error(f"Document {file} is broken: {e}")
return papers_annotated return papers_annotated
@ -256,7 +172,7 @@ class ExtractPlugin(PapersPlugin):
self.ui.message( self.ui.message(
"\n".join( "\n".join(
pretty.paper_oneliner( pretty.paper_oneliner(
p, citekey_only=False, max_authors=conf["main"]["max_authors"] p, citekey_only=False, max_authors=self.max_authors
) )
for p in papers for p in papers
) )
@ -278,7 +194,7 @@ class ExtractPlugin(PapersPlugin):
self.ui.warning(f"{paper.citekey} has no valid document.") self.ui.warning(f"{paper.citekey} has no valid document.")
return path return path
def _get_annotations(self, filename, paper): def _get_annotations(self, filename):
"""Extract annotations from a file. """Extract annotations from a file.
Returns all readable annotations contained in the file Returns all readable annotations contained in the file
@ -292,7 +208,6 @@ class ExtractPlugin(PapersPlugin):
quote, note = self._retrieve_annotation_content(page, annot) quote, note = self._retrieve_annotation_content(page, annot)
a = Annotation( a = Annotation(
file=filename, file=filename,
paper=paper,
text=quote, text=quote,
content=note, content=note,
colors=annot.colors, colors=annot.colors,
@ -327,16 +242,16 @@ class ExtractPlugin(PapersPlugin):
# highlight with selection not in note # highlight with selection not in note
return (written, "") return (written, "")
def _to_stdout(self, annotated_papers, short_header=True): def _to_stdout(self, annotated_papers, short_header=False):
"""Write annotations to stdout. """Write annotations to stdout.
Simply outputs the gathered annotations over stdout Simply outputs the gathered annotations over stdout
ready to be passed on through pipelines etc. ready to be passed on through pipelines etc.
""" """
output = "" output = ""
for citekey, annotations in annotated_papers.items(): for paper in annotated_papers:
output += f"\n------ {annotations[0].headline(short=short_header)} ------\n\n" output += f"\n------ {paper.headline(self.short_header, self.max_authors)} ------\n\n"
for annotation in annotations: for annotation in paper.annotations:
output += f"{annotation.format(self.formatting)}\n" output += f"{annotation.format(self.formatting)}\n"
output += "\n" output += "\n"
self.ui.message(output.strip()) self.ui.message(output.strip())
@ -348,31 +263,35 @@ class ExtractPlugin(PapersPlugin):
in the pubs notes directory. Creates new notes for in the pubs notes directory. Creates new notes for
citekeys missing a note or appends to existing. citekeys missing a note or appends to existing.
""" """
for citekey, annotations in annotated_papers.items(): for paper in annotated_papers:
if annotations: if paper.annotations:
notepath = self.broker.real_notepath(citekey, note_extension) notepath = self.broker.real_notepath(paper.citekey, note_extension)
if check_file(notepath, fail=False): if check_file(notepath, fail=False):
self._append_to_note(notepath, annotations) self._append_to_note(notepath, paper)
else: else:
self._write_new_note(notepath, annotations) self._write_new_note(
self.ui.info(f"Wrote annotations to {citekey} note {notepath}.") notepath,
paper,
paper.headline(short=True, max_authors=self.max_authors),
)
self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.")
if edit is True: if edit is True:
self.ui.edit_file(notepath, temporary=False) self.ui.edit_file(notepath, temporary=False)
NoteEvent(citekey).send() NoteEvent(paper.citekey).send()
def _write_new_note(self, notepath, annotations): def _write_new_note(self, notepath, paper, headline):
"""Create a new note containing the annotations. """Create a new note containing the annotations.
Will create a new note in the notes folder of pubs Will create a new note in the notes folder of pubs
and fill it with the annotations extracted from pdf. and fill it with the annotations extracted from pdf.
""" """
output = f"# {annotations[0].headline(short=short_header)}\n\n" output = f"# {headline}\n\n"
for annotation in annotations: for annotation in paper.annotations:
output += f"{annotation.format(self.formatting)}\n\n" output += f"{annotation.format(self.formatting)}\n\n"
write_file(notepath, output, "w") write_file(notepath, output, "w")
def _append_to_note(self, notepath, annotations): def _append_to_note(self, notepath, paper):
"""Append new annotations to the end of a note. """Append new annotations to the end of a note.
Looks through note to determine any new annotations which should be Looks through note to determine any new annotations which should be
@ -381,7 +300,7 @@ class ExtractPlugin(PapersPlugin):
existing = read_text_file(notepath) existing = read_text_file(notepath)
# removed annotations already found in the note # removed annotations already found in the note
existing_dropped = [ existing_dropped = [
x for x in annotations if x.format(self.formatting) not in existing x for x in paper.annotations if x.format(self.formatting) not in existing
] ]
if not existing_dropped: if not existing_dropped:
return return

View file

@ -5,7 +5,7 @@ description = "A pdf annotation extraction plugin for pubs bibliography manager"
authors = ["Marty Oehme <marty.oehme@gmail.com>"] authors = ["Marty Oehme <marty.oehme@gmail.com>"]
license = "LGPL-3.0" license = "LGPL-3.0"
readme = "README.md" readme = "README.md"
packages = [{include = "extract"}] packages = [{include = "pubs"}]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.10" python = "^3.10"