From 7a415b4d7df1245e5265a61b78c92769ad6cf28d Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Sun, 25 Dec 2022 10:56:30 +0100
Subject: [PATCH] Refactor annotation into separate class

---
Review notes (not part of the commit message):

* This copy of the patch had its line breaks and indentation collapsed.
  They are restored here. The line structure matches every hunk header and
  the diffstat, but the indentation of the context lines in the last two
  extract.py hunks is inferred from the Python block structure. Confirm it
  against extract/extract.py, or apply with `git apply --ignore-whitespace`.
* The From: header carries no e-mail address in this copy; `git am` may
  refuse it, `git apply` is unaffected.
* Annotation._color_similarity_ratio is moved unchanged, including a
  docstring that does not match the code: math.dist((0, 0, 0), (1, 1, 1))
  is sqrt(3), so with the "/ 3" divisor black vs. white scores about 0.42,
  not 0. Fix in a follow-up (divide by sqrt(3) and re-tune
  COLOR_SIMILARITY_MINIMUM, or reword the docstring).
* extract.py stops importing re and math; make sure nothing left in that
  file still uses them.

 extract/annotation.py |  95 +++++++++++++++++++++++++++++++++++++++
 extract/extract.py    | 100 +++---------------------------------------
 2 files changed, 101 insertions(+), 94 deletions(-)
 create mode 100644 extract/annotation.py

diff --git a/extract/annotation.py b/extract/annotation.py
new file mode 100644
index 0000000..3df7743
--- /dev/null
+++ b/extract/annotation.py
@@ -0,0 +1,95 @@
+import math
+import re
+from dataclasses import dataclass, field
+from typing import Dict
+
+from pubs.paper import Paper
+from pubs import pretty
+
+TEXT_SIMILARITY_MINIMUM = 0.75
+COLOR_SIMILARITY_MINIMUM = 0.833
+
+COLORS = {
+    "red": (1, 0, 0),
+    "green": (0, 1, 0),
+    "blue": (0, 0, 1),
+    "yellow": (1, 1, 0),
+    "purple": (0.5, 0, 0.5),
+    "orange": (1, 0.65, 0),
+}
+
+
+@dataclass
+class Annotation:
+    """A PDF annotation object"""
+
+    paper: Paper
+    file: str
+    type: str = "Highlight"
+    text: str = ""
+    content: str = ""
+    page: int = 1
+    colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
+    tag: str = ""
+
+    def format(self, formatting):
+        """Return a formatted string of the annotation.
+
+        Given a provided formatting pattern, this method returns the annotation
+        formatted with the correct marker replacements and removals, ready
+        for display or writing.
+        """
+        output = formatting
+        replacements = {
+            r"{quote}": self.text,
+            r"{note}": self.content,
+            r"{page}": str(self.page),
+            r"{newline}": "\n",
+            r"{tag}": self.tag,
+        }
+        pattern = re.compile(
+            "|".join(
+                [re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
+            ),
+            flags=re.DOTALL,
+        )
+        patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
+        patt_note_container = re.compile(r"{%note_container(.*?)%}")
+        patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
+        output = patt_quote_container.sub(r"\1" if self.text else "", output)
+        output = patt_note_container.sub(r"\1" if self.content else "", output)
+        output = patt_tag_container.sub(r"\1" if self.tag else "", output)
+        return pattern.sub(lambda x: replacements[x.group(0)], output)
+
+    @property
+    def colorname(self):
+        """Return the stringified version of the annotation color.
+
+        Finds the closest named color to the annotation and returns it.
+        """
+        annot_colors = (
+            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
+        )
+        nearest = None
+        minimum_similarity = COLOR_SIMILARITY_MINIMUM
+        for name, values in COLORS.items():
+            similarity_ratio = self._color_similarity_ratio(values, annot_colors)
+            if similarity_ratio > minimum_similarity:
+                minimum_similarity = similarity_ratio
+                nearest = name
+        return nearest
+
+    def headline(self, short=False, max_authors=3):
+        headline = pretty.paper_oneliner(
+            self.paper, citekey_only=short, max_authors=max_authors
+        )
+        return re.sub(r"\[pdf\]", "", headline).rstrip()
+
+    def _color_similarity_ratio(self, color_one, color_two):
+        """Return the similarity of two colors between 0 and 1.
+
+        Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange,
+        and returns the similarity between them, with 1 being the same color and 0 being the
+        difference between full black and full white, as a float.
+        """
+        return 1 - (abs(math.dist([*color_one], [*color_two])) / 3)
diff --git a/extract/extract.py b/extract/extract.py
index 76c1149..cdcfdf9 100644
--- a/extract/extract.py
+++ b/extract/extract.py
@@ -1,108 +1,18 @@
 import os
-import re
 import argparse
-import math
-from dataclasses import dataclass, field
-from typing import Dict
 
 import fitz
 import Levenshtein
 
 from pubs.plugins import PapersPlugin
-from pubs.paper import Paper
 from pubs.events import DocAddEvent, NoteEvent
-
 from pubs import repo, pretty
 from pubs.utils import resolve_citekey_list
 from pubs.content import check_file, read_text_file, write_file
 from pubs.query import get_paper_filter
+from .annotation import Annotation, COLOR_SIMILARITY_MINIMUM, TEXT_SIMILARITY_MINIMUM
 
 CONFIRMATION_PAPER_THRESHOLD = 5
-TEXT_SIMILARITY_MINIMUM = 0.75
-COLOR_SIMILARITY_MINIMUM = 0.833
-
-COLORS = {
-    "red": (1, 0, 0),
-    "green": (0, 1, 0),
-    "blue": (0, 0, 1),
-    "yellow": (1, 1, 0),
-    "purple": (0.5, 0, 0.5),
-    "orange": (1, 0.65, 0),
-}
-
-
-@dataclass
-class Annotation:
-    """A PDF annotation object"""
-
-    paper: Paper
-    file: str
-    type: str = "Highlight"
-    text: str = ""
-    content: str = ""
-    page: int = 1
-    colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
-    tag: str = ""
-
-    def format(self, formatting):
-        """Return a formatted string of the annotation.
-
-        Given a provided formatting pattern, this method returns the annotation
-        formatted with the correct marker replacements and removals, ready
-        for display or writing.
-        """
-        output = formatting
-        replacements = {
-            r"{quote}": self.text,
-            r"{note}": self.content,
-            r"{page}": str(self.page),
-            r"{newline}": "\n",
-            r"{tag}": self.tag,
-        }
-        pattern = re.compile(
-            "|".join(
-                [re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
-            ),
-            flags=re.DOTALL,
-        )
-        patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
-        patt_note_container = re.compile(r"{%note_container(.*?)%}")
-        patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
-        output = patt_quote_container.sub(r"\1" if self.text else "", output)
-        output = patt_note_container.sub(r"\1" if self.content else "", output)
-        output = patt_tag_container.sub(r"\1" if self.tag else "", output)
-        return pattern.sub(lambda x: replacements[x.group(0)], output)
-
-    @property
-    def colorname(self):
-        """Return the stringified version of the annotation color.
-
-        Finds the closest named color to the annotation and returns it.
-        """
-        annot_colors = (
-            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
-        )
-        nearest = None
-        minimum_similarity = COLOR_SIMILARITY_MINIMUM
-        for name, values in COLORS.items():
-            similarity_ratio = self._color_similarity_ratio(values, annot_colors)
-            if similarity_ratio > minimum_similarity:
-                minimum_similarity = similarity_ratio
-                nearest = name
-        return nearest
-
-    def headline(self, short=False, max_authors=3):
-        headline = pretty.paper_oneliner(self.paper, citekey_only=short, max_authors=max_authors)
-        return re.sub(r"\[pdf\]", "", headline).rstrip()
-
-    def _color_similarity_ratio(self, color_one, color_two):
-        """Return the similarity of two colors between 0 and 1.
-
-        Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange,
-        and returns the similarity between them, with 1 being the same color and 0 being the
-        difference between full black and full white, as a float.
-        """
-        return 1 - (abs(math.dist([*color_one], [*color_two])) / 3)
 
 
 class ExtractPlugin(PapersPlugin):
@@ -335,7 +245,9 @@ class ExtractPlugin(PapersPlugin):
         """
         output = ""
         for citekey, annotations in annotated_papers.items():
-            output += f"\n------ {annotations[0].headline(short=short_header)} ------\n\n"
+            output += (
+                f"\n------ {annotations[0].headline(short=short_header)} ------\n\n"
+            )
             for annotation in annotations:
                 output += f"{annotation.format(self.formatting)}\n"
             output += "\n"
@@ -354,14 +266,14 @@ class ExtractPlugin(PapersPlugin):
                 if check_file(notepath, fail=False):
                     self._append_to_note(notepath, annotations)
                 else:
-                    self._write_new_note(notepath, annotations)
+                    self._write_new_note(notepath, annotations, self.short_header)
                 self.ui.info(f"Wrote annotations to {citekey} note {notepath}.")
 
                 if edit is True:
                     self.ui.edit_file(notepath, temporary=False)
                 NoteEvent(citekey).send()
 
-    def _write_new_note(self, notepath, annotations):
+    def _write_new_note(self, notepath, annotations, short_header):
         """Create a new note containing the annotations.
 
         Will create a new note in the notes folder of pubs