Refactor annotation into separate class

This commit is contained in:
Marty Oehme 2022-12-25 10:56:30 +01:00
parent 7ad1c2cf91
commit 7a415b4d7d
Signed by: Marty
GPG key ID: 73BA40D5AFAF49C9
2 changed files with 101 additions and 94 deletions

95
extract/annotation.py Normal file
View file

@ -0,0 +1,95 @@
import math
import re
from dataclasses import dataclass, field
from typing import Dict
from pubs.paper import Paper
from pubs import pretty
TEXT_SIMILARITY_MINIMUM = 0.75
COLOR_SIMILARITY_MINIMUM = 0.833
COLORS = {
"red": (1, 0, 0),
"green": (0, 1, 0),
"blue": (0, 0, 1),
"yellow": (1, 1, 0),
"purple": (0.5, 0, 0.5),
"orange": (1, 0.65, 0),
}
@dataclass
class Annotation:
"""A PDF annotation object"""
paper: Paper
file: str
type: str = "Highlight"
text: str = ""
content: str = ""
page: int = 1
colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
tag: str = ""
def format(self, formatting):
"""Return a formatted string of the annotation.
Given a provided formatting pattern, this method returns the annotation
formatted with the correct marker replacements and removals, ready
for display or writing.
"""
output = formatting
replacements = {
r"{quote}": self.text,
r"{note}": self.content,
r"{page}": str(self.page),
r"{newline}": "\n",
r"{tag}": self.tag,
}
pattern = re.compile(
"|".join(
[re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
),
flags=re.DOTALL,
)
patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
patt_note_container = re.compile(r"{%note_container(.*?)%}")
patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
output = patt_quote_container.sub(r"\1" if self.text else "", output)
output = patt_note_container.sub(r"\1" if self.content else "", output)
output = patt_tag_container.sub(r"\1" if self.tag else "", output)
return pattern.sub(lambda x: replacements[x.group(0)], output)
@property
def colorname(self):
"""Return the stringified version of the annotation color.
Finds the closest named color to the annotation and returns it.
"""
annot_colors = (
self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
)
nearest = None
minimum_similarity = COLOR_SIMILARITY_MINIMUM
for name, values in COLORS.items():
similarity_ratio = self._color_similarity_ratio(values, annot_colors)
if similarity_ratio > minimum_similarity:
minimum_similarity = similarity_ratio
nearest = name
return nearest
def headline(self, short=False, max_authors=3):
headline = pretty.paper_oneliner(
self.paper, citekey_only=short, max_authors=max_authors
)
return re.sub(r"\[pdf\]", "", headline).rstrip()
def _color_similarity_ratio(self, color_one, color_two):
"""Return the similarity of two colors between 0 and 1.
Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange,
and returns the similarity between them, with 1 being the same color and 0 being the
difference between full black and full white, as a float.
"""
return 1 - (abs(math.dist([*color_one], [*color_two])) / 3)

View file

@ -1,108 +1,18 @@
import os import os
import re
import argparse import argparse
import math
from dataclasses import dataclass, field
from typing import Dict
import fitz import fitz
import Levenshtein import Levenshtein
from pubs.plugins import PapersPlugin from pubs.plugins import PapersPlugin
from pubs.paper import Paper
from pubs.events import DocAddEvent, NoteEvent from pubs.events import DocAddEvent, NoteEvent
from pubs import repo, pretty from pubs import repo, pretty
from pubs.utils import resolve_citekey_list from pubs.utils import resolve_citekey_list
from pubs.content import check_file, read_text_file, write_file from pubs.content import check_file, read_text_file, write_file
from pubs.query import get_paper_filter from pubs.query import get_paper_filter
from .annotation import Annotation, COLOR_SIMILARITY_MINIMUM, TEXT_SIMILARITY_MINIMUM
CONFIRMATION_PAPER_THRESHOLD = 5 CONFIRMATION_PAPER_THRESHOLD = 5
TEXT_SIMILARITY_MINIMUM = 0.75
COLOR_SIMILARITY_MINIMUM = 0.833
COLORS = {
"red": (1, 0, 0),
"green": (0, 1, 0),
"blue": (0, 0, 1),
"yellow": (1, 1, 0),
"purple": (0.5, 0, 0.5),
"orange": (1, 0.65, 0),
}
@dataclass
class Annotation:
"""A PDF annotation object"""
paper: Paper
file: str
type: str = "Highlight"
text: str = ""
content: str = ""
page: int = 1
colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
tag: str = ""
def format(self, formatting):
"""Return a formatted string of the annotation.
Given a provided formatting pattern, this method returns the annotation
formatted with the correct marker replacements and removals, ready
for display or writing.
"""
output = formatting
replacements = {
r"{quote}": self.text,
r"{note}": self.content,
r"{page}": str(self.page),
r"{newline}": "\n",
r"{tag}": self.tag,
}
pattern = re.compile(
"|".join(
[re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
),
flags=re.DOTALL,
)
patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
patt_note_container = re.compile(r"{%note_container(.*?)%}")
patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
output = patt_quote_container.sub(r"\1" if self.text else "", output)
output = patt_note_container.sub(r"\1" if self.content else "", output)
output = patt_tag_container.sub(r"\1" if self.tag else "", output)
return pattern.sub(lambda x: replacements[x.group(0)], output)
@property
def colorname(self):
"""Return the stringified version of the annotation color.
Finds the closest named color to the annotation and returns it.
"""
annot_colors = (
self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
)
nearest = None
minimum_similarity = COLOR_SIMILARITY_MINIMUM
for name, values in COLORS.items():
similarity_ratio = self._color_similarity_ratio(values, annot_colors)
if similarity_ratio > minimum_similarity:
minimum_similarity = similarity_ratio
nearest = name
return nearest
def headline(self, short=False, max_authors=3):
headline = pretty.paper_oneliner(self.paper, citekey_only=short, max_authors=max_authors)
return re.sub(r"\[pdf\]", "", headline).rstrip()
def _color_similarity_ratio(self, color_one, color_two):
"""Return the similarity of two colors between 0 and 1.
Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange,
and returns the similarity between them, with 1 being the same color and 0 being the
difference between full black and full white, as a float.
"""
return 1 - (abs(math.dist([*color_one], [*color_two])) / 3)
class ExtractPlugin(PapersPlugin): class ExtractPlugin(PapersPlugin):
@ -335,7 +245,9 @@ class ExtractPlugin(PapersPlugin):
""" """
output = "" output = ""
for citekey, annotations in annotated_papers.items(): for citekey, annotations in annotated_papers.items():
output += f"\n------ {annotations[0].headline(short=short_header)} ------\n\n" output += (
f"\n------ {annotations[0].headline(short=short_header)} ------\n\n"
)
for annotation in annotations: for annotation in annotations:
output += f"{annotation.format(self.formatting)}\n" output += f"{annotation.format(self.formatting)}\n"
output += "\n" output += "\n"
@ -354,14 +266,14 @@ class ExtractPlugin(PapersPlugin):
if check_file(notepath, fail=False): if check_file(notepath, fail=False):
self._append_to_note(notepath, annotations) self._append_to_note(notepath, annotations)
else: else:
self._write_new_note(notepath, annotations) self._write_new_note(notepath, annotations, self.short_header)
self.ui.info(f"Wrote annotations to {citekey} note {notepath}.") self.ui.info(f"Wrote annotations to {citekey} note {notepath}.")
if edit is True: if edit is True:
self.ui.edit_file(notepath, temporary=False) self.ui.edit_file(notepath, temporary=False)
NoteEvent(citekey).send() NoteEvent(citekey).send()
def _write_new_note(self, notepath, annotations): def _write_new_note(self, notepath, annotations, short_header):
"""Create a new note containing the annotations. """Create a new note containing the annotations.
Will create a new note in the notes folder of pubs Will create a new note in the notes folder of pubs