From 7a415b4d7df1245e5265a61b78c92769ad6cf28d Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sun, 25 Dec 2022 10:56:30 +0100 Subject: [PATCH 1/5] Refactor annotation into separate class --- extract/annotation.py | 95 +++++++++++++++++++++++++++++++++++++++ extract/extract.py | 100 +++--------------------------------------- 2 files changed, 101 insertions(+), 94 deletions(-) create mode 100644 extract/annotation.py diff --git a/extract/annotation.py b/extract/annotation.py new file mode 100644 index 0000000..3df7743 --- /dev/null +++ b/extract/annotation.py @@ -0,0 +1,95 @@ +import math +import re +from dataclasses import dataclass, field +from typing import Dict + +from pubs.paper import Paper +from pubs import pretty + +TEXT_SIMILARITY_MINIMUM = 0.75 +COLOR_SIMILARITY_MINIMUM = 0.833 + +COLORS = { + "red": (1, 0, 0), + "green": (0, 1, 0), + "blue": (0, 0, 1), + "yellow": (1, 1, 0), + "purple": (0.5, 0, 0.5), + "orange": (1, 0.65, 0), +} + + +@dataclass +class Annotation: + """A PDF annotation object""" + + paper: Paper + file: str + type: str = "Highlight" + text: str = "" + content: str = "" + page: int = 1 + colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)}) + tag: str = "" + + def format(self, formatting): + """Return a formatted string of the annotation. + + Given a provided formatting pattern, this method returns the annotation + formatted with the correct marker replacements and removals, ready + for display or writing. + """ + output = formatting + replacements = { + r"{quote}": self.text, + r"{note}": self.content, + r"{page}": str(self.page), + r"{newline}": "\n", + r"{tag}": self.tag, + } + pattern = re.compile( + "|".join( + [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] + ), + flags=re.DOTALL, + ) + patt_quote_container = re.compile(r"{%quote_container(.*?)%}") + patt_note_container = re.compile(r"{%note_container(.*?)%}") + patt_tag_container = re.compile(r"{%tag_container(.*?)%}") + output = patt_quote_container.sub(r"\1" if self.text else "", output) + output = patt_note_container.sub(r"\1" if self.content else "", output) + output = patt_tag_container.sub(r"\1" if self.tag else "", output) + return pattern.sub(lambda x: replacements[x.group(0)], output) + + @property + def colorname(self): + """Return the stringified version of the annotation color. + + Finds the closest named color to the annotation and returns it. + """ + annot_colors = ( + self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0) + ) + nearest = None + minimum_similarity = COLOR_SIMILARITY_MINIMUM + for name, values in COLORS.items(): + similarity_ratio = self._color_similarity_ratio(values, annot_colors) + if similarity_ratio > minimum_similarity: + minimum_similarity = similarity_ratio + nearest = name + return nearest + + def headline(self, short=False, max_authors=3): + headline = pretty.paper_oneliner( + self.paper, citekey_only=short, max_authors=max_authors + ) + return re.sub(r"\[pdf\]", "", headline).rstrip() + + def _color_similarity_ratio(self, color_one, color_two): + """Return the similarity of two colors between 0 and 1. + + Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange, + and returns the similarity between them, with 1 being the same color and 0 being the + difference between full black and full white, as a float. + """ + return 1 - (abs(math.dist([*color_one], [*color_two])) / 3) diff --git a/extract/extract.py b/extract/extract.py index 76c1149..cdcfdf9 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -1,108 +1,18 @@ import os -import re import argparse -import math -from dataclasses import dataclass, field -from typing import Dict import fitz import Levenshtein from pubs.plugins import PapersPlugin -from pubs.paper import Paper from pubs.events import DocAddEvent, NoteEvent - from pubs import repo, pretty from pubs.utils import resolve_citekey_list from pubs.content import check_file, read_text_file, write_file from pubs.query import get_paper_filter +from .annotation import Annotation, COLOR_SIMILARITY_MINIMUM, TEXT_SIMILARITY_MINIMUM CONFIRMATION_PAPER_THRESHOLD = 5 -TEXT_SIMILARITY_MINIMUM = 0.75 -COLOR_SIMILARITY_MINIMUM = 0.833 - -COLORS = { - "red": (1, 0, 0), - "green": (0, 1, 0), - "blue": (0, 0, 1), - "yellow": (1, 1, 0), - "purple": (0.5, 0, 0.5), - "orange": (1, 0.65, 0), -} - - -@dataclass -class Annotation: - """A PDF annotation object""" - - paper: Paper - file: str - type: str = "Highlight" - text: str = "" - content: str = "" - page: int = 1 - colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)}) - tag: str = "" - - def format(self, formatting): - """Return a formatted string of the annotation. - - Given a provided formatting pattern, this method returns the annotation - formatted with the correct marker replacements and removals, ready - for display or writing. - """ - output = formatting - replacements = { - r"{quote}": self.text, - r"{note}": self.content, - r"{page}": str(self.page), - r"{newline}": "\n", - r"{tag}": self.tag, - } - pattern = re.compile( - "|".join( - [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] - ), - flags=re.DOTALL, - ) - patt_quote_container = re.compile(r"{%quote_container(.*?)%}") - patt_note_container = re.compile(r"{%note_container(.*?)%}") - patt_tag_container = re.compile(r"{%tag_container(.*?)%}") - output = patt_quote_container.sub(r"\1" if self.text else "", output) - output = patt_note_container.sub(r"\1" if self.content else "", output) - output = patt_tag_container.sub(r"\1" if self.tag else "", output) - return pattern.sub(lambda x: replacements[x.group(0)], output) - - @property - def colorname(self): - """Return the stringified version of the annotation color. - - Finds the closest named color to the annotation and returns it. - """ - annot_colors = ( - self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0) - ) - nearest = None - minimum_similarity = COLOR_SIMILARITY_MINIMUM - for name, values in COLORS.items(): - similarity_ratio = self._color_similarity_ratio(values, annot_colors) - if similarity_ratio > minimum_similarity: - minimum_similarity = similarity_ratio - nearest = name - return nearest - - def headline(self, short=False, max_authors=3): - headline = pretty.paper_oneliner(self.paper, citekey_only=short, max_authors=max_authors) - return re.sub(r"\[pdf\]", "", headline).rstrip() - - def _color_similarity_ratio(self, color_one, color_two): - """Return the similarity of two colors between 0 and 1. - - Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange, - and returns the similarity between them, with 1 being the same color and 0 being the - difference between full black and full white, as a float. - """ - return 1 - (abs(math.dist([*color_one], [*color_two])) / 3) class ExtractPlugin(PapersPlugin): @@ -335,7 +245,9 @@ class ExtractPlugin(PapersPlugin): """ output = "" for citekey, annotations in annotated_papers.items(): - output += f"\n------ {annotations[0].headline(short=short_header)} ------\n\n" + output += ( + f"\n------ {annotations[0].headline(short=short_header)} ------\n\n" + ) for annotation in annotations: output += f"{annotation.format(self.formatting)}\n" output += "\n" @@ -354,14 +266,14 @@ class ExtractPlugin(PapersPlugin): if check_file(notepath, fail=False): self._append_to_note(notepath, annotations) else: - self._write_new_note(notepath, annotations) + self._write_new_note(notepath, annotations, self.short_header) self.ui.info(f"Wrote annotations to {citekey} note {notepath}.") if edit is True: self.ui.edit_file(notepath, temporary=False) NoteEvent(citekey).send() - def _write_new_note(self, notepath, annotations): + def _write_new_note(self, notepath, annotations, short_header): """Create a new note containing the annotations. Will create a new note in the notes folder of pubs From 04c8a8ed0baf2eddbc63dff9a0c44a76b5fb350e Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sun, 25 Dec 2022 12:17:05 +0100 Subject: [PATCH 2/5] Refactor extraction to use PaperAnnotated class --- README.md | 1 + extract/annotation.py | 30 +++++++++++++++++++------ extract/extract.py | 51 ++++++++++++++++++++++++------------------- 3 files changed, 52 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 03322d6..ab0f100 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,7 @@ content, because then we can just use that. It is harder to parse if it does not - [ ] needs some way to delimit where it puts stuff and user stuff is in note - [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between - [x] another, probably simpler first, is to just append missing annotations to the end of the note + - [ ] use similarity search instead of literal search for existing annotation (levenshtein) - [x] some highlights (or annotations in general) do not contain text as content - [x] pymupdf can extract the content of the underlying rectangle (mostly) - [x] issue is that sometimes the highlight contents are in content, sometimes a user comment instead diff --git a/extract/annotation.py b/extract/annotation.py index 3df7743..0baefbb 100644 --- a/extract/annotation.py +++ b/extract/annotation.py @@ -19,11 +19,33 @@ COLORS = { } +class PaperAnnotated(Paper): + def __init__(self, citekey, bibdata, metadata=None, annotations=[]): + super(PaperAnnotated, self).__init__(citekey, bibdata, metadata) + self.annotations = annotations + + @classmethod + def from_paper(cls, paper, annotations=[]): + return cls(paper.citekey, paper.bibdata, paper.metadata, annotations) + + def __repr__(self): + return "PaperAnnotated(%s, %s, %s)" % ( + self.citekey, + self.bibdata, + self.metadata, + ) + + def headline(self, short=False, max_authors=3): + headline = pretty.paper_oneliner( + self, citekey_only=short, max_authors=max_authors + ) + return re.sub(r"\[pdf\]", "", headline).rstrip() + + @dataclass class Annotation: """A PDF annotation object""" - paper: Paper file: str type: str = "Highlight" text: str = "" @@ -79,12 +101,6 @@ class Annotation: nearest = name return nearest - def headline(self, short=False, max_authors=3): - headline = pretty.paper_oneliner( - self.paper, citekey_only=short, max_authors=max_authors - ) - return re.sub(r"\[pdf\]", "", headline).rstrip() - def _color_similarity_ratio(self, color_one, color_two): """Return the similarity of two colors between 0 and 1. diff --git a/extract/extract.py b/extract/extract.py index cdcfdf9..d0496f4 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -10,7 +10,12 @@ from pubs import repo, pretty from pubs.utils import resolve_citekey_list from pubs.content import check_file, read_text_file, write_file from pubs.query import get_paper_filter -from .annotation import Annotation, COLOR_SIMILARITY_MINIMUM, TEXT_SIMILARITY_MINIMUM +from .annotation import ( + PaperAnnotated, + Annotation, + COLOR_SIMILARITY_MINIMUM, + TEXT_SIMILARITY_MINIMUM, +) CONFIRMATION_PAPER_THRESHOLD = 5 @@ -34,6 +39,7 @@ class ExtractPlugin(PapersPlugin): def __init__(self, conf, ui): self.ui = ui self.note_extension = conf["main"]["note_extension"] + self.max_authors = conf["main"]["max_authors"] self.repository = repo.Repository(conf) self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"]) self.broker = self.repository.databroker @@ -123,12 +129,12 @@ class ExtractPlugin(PapersPlugin): Returns all annotations belonging to the papers that are described by the citekeys passed in. """ - papers_annotated = {} + papers_annotated = [] for paper in papers: file = self._get_file(paper) try: - annotations = self._get_annotations(file, paper) - papers_annotated[paper.citekey] = annotations + annotations = self._get_annotations(file) + papers_annotated.append(PaperAnnotated.from_paper(paper, annotations)) except fitz.FileDataError as e: self.ui.error(f"Document {file} is broken: {e}") return papers_annotated @@ -166,7 +172,7 @@ class ExtractPlugin(PapersPlugin): self.ui.message( "\n".join( pretty.paper_oneliner( - p, citekey_only=False, max_authors=conf["main"]["max_authors"] + p, citekey_only=False, max_authors=self.max_authors ) for p in papers ) @@ -188,7 +194,7 @@ class ExtractPlugin(PapersPlugin): self.ui.warning(f"{paper.citekey} has no valid document.") return path - def _get_annotations(self, filename, paper): + def _get_annotations(self, filename): """Extract annotations from a file. Returns all readable annotations contained in the file @@ -202,7 +208,6 @@ class ExtractPlugin(PapersPlugin): quote, note = self._retrieve_annotation_content(page, annot) a = Annotation( file=filename, - paper=paper, text=quote, content=note, colors=annot.colors, @@ -237,18 +242,18 @@ class ExtractPlugin(PapersPlugin): # highlight with selection not in note return (written, "") - def _to_stdout(self, annotated_papers, short_header=True): + def _to_stdout(self, annotated_papers, short_header=False): """Write annotations to stdout. Simply outputs the gathered annotations over stdout ready to be passed on through pipelines etc. """ output = "" - for citekey, annotations in annotated_papers.items(): + for paper in annotated_papers: output += ( - f"\n------ {annotations[0].headline(short=short_header)} ------\n\n" + f"\n------ {paper.headline(self.short_header, self.max_authors)} ------\n\n" ) - for annotation in annotations: + for annotation in paper.annotations: output += f"{annotation.format(self.formatting)}\n" output += "\n" self.ui.message(output.strip()) @@ -260,31 +265,31 @@ class ExtractPlugin(PapersPlugin): in the pubs notes directory. Creates new notes for citekeys missing a note or appends to existing. """ - for citekey, annotations in annotated_papers.items(): - if annotations: - notepath = self.broker.real_notepath(citekey, note_extension) + for paper in annotated_papers: + if paper.annotations: + notepath = self.broker.real_notepath(paper.citekey, note_extension) if check_file(notepath, fail=False): - self._append_to_note(notepath, annotations) + self._append_to_note(notepath, paper) else: - self._write_new_note(notepath, annotations, self.short_header) - self.ui.info(f"Wrote annotations to {citekey} note {notepath}.") + self._write_new_note(notepath, paper, paper.headline(short=True, max_authors=self.max_authors)) + self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.") if edit is True: self.ui.edit_file(notepath, temporary=False) - NoteEvent(citekey).send() + NoteEvent(paper.citekey).send() - def _write_new_note(self, notepath, annotations, short_header): + def _write_new_note(self, notepath, paper, headline): """Create a new note containing the annotations. Will create a new note in the notes folder of pubs and fill it with the annotations extracted from pdf. """ - output = f"# {annotations[0].headline(short=short_header)}\n\n" - for annotation in annotations: + output = f"# {headline}\n\n" + for annotation in paper.annotations: output += f"{annotation.format(self.formatting)}\n\n" write_file(notepath, output, "w") - def _append_to_note(self, notepath, annotations): + def _append_to_note(self, notepath, paper): """Append new annotations to the end of a note. Looks through note to determine any new annotations which should be @@ -293,7 +298,7 @@ class ExtractPlugin(PapersPlugin): existing = read_text_file(notepath) # removed annotations already found in the note existing_dropped = [ - x for x in annotations if x.format(self.formatting) not in existing + x for x in paper.annotations if x.format(self.formatting) not in existing ] if not existing_dropped: return From e1a7d9f613267637beb3636b97cbf9591eb412a4 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Fri, 13 Jan 2023 18:42:12 +0100 Subject: [PATCH 3/5] Move plugin to nested folder for easy installation Moved the plugin files to the directory structure they will have in the python site-packages directory, being placed within the plugs directory of the pubs application directory. --- {extract => pubs/plugs/extract}/__init__.py | 0 {extract => pubs/plugs/extract}/annotation.py | 0 {extract => pubs/plugs/extract}/extract.py | 0 pyproject.toml | 2 +- 4 files changed, 1 insertion(+), 1 deletion(-) rename {extract => pubs/plugs/extract}/__init__.py (100%) rename {extract => pubs/plugs/extract}/annotation.py (100%) rename {extract => pubs/plugs/extract}/extract.py (100%) diff --git a/extract/__init__.py b/pubs/plugs/extract/__init__.py similarity index 100% rename from extract/__init__.py rename to pubs/plugs/extract/__init__.py diff --git a/extract/annotation.py b/pubs/plugs/extract/annotation.py similarity index 100% rename from extract/annotation.py rename to pubs/plugs/extract/annotation.py diff --git a/extract/extract.py b/pubs/plugs/extract/extract.py similarity index 100% rename from extract/extract.py rename to pubs/plugs/extract/extract.py diff --git a/pyproject.toml b/pyproject.toml index 7bab3e1..0ee52ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "A pdf annotation extraction plugin for pubs bibliography manager" authors = ["Marty Oehme "] license = "LGPL-3.0" readme = "README.md" -packages = [{include = "extract"}] +packages = [{include = "pubs"}] [tool.poetry.dependencies] python = "^3.10" From 0634cbb381f624f690ea08a3339a37a98d72a95c Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Fri, 13 Jan 2023 19:06:20 +0100 Subject: [PATCH 4/5] Format plugin file with black --- pubs/plugs/extract/extract.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pubs/plugs/extract/extract.py b/pubs/plugs/extract/extract.py index d0496f4..c513813 100644 --- a/pubs/plugs/extract/extract.py +++ b/pubs/plugs/extract/extract.py @@ -250,9 +250,7 @@ class ExtractPlugin(PapersPlugin): """ output = "" for paper in annotated_papers: - output += ( - f"\n------ {paper.headline(self.short_header, self.max_authors)} ------\n\n" - ) + output += f"\n------ {paper.headline(self.short_header, self.max_authors)} ------\n\n" for annotation in paper.annotations: output += f"{annotation.format(self.formatting)}\n" output += "\n" @@ -271,7 +269,11 @@ class ExtractPlugin(PapersPlugin): if check_file(notepath, fail=False): self._append_to_note(notepath, paper) else: - self._write_new_note(notepath, paper, paper.headline(short=True, max_authors=self.max_authors)) + self._write_new_note( + notepath, + paper, + paper.headline(short=True, max_authors=self.max_authors), + ) self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.") if edit is True: From 2c5d096d08327f030cc898f268682f35c08533c8 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Fri, 13 Jan 2023 19:14:06 +0100 Subject: [PATCH 5/5] Add continuous integration pipeline --- .woodpecker.yml | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 .woodpecker.yml diff --git a/.woodpecker.yml b/.woodpecker.yml new file mode 100644 index 0000000..caa86d4 --- /dev/null +++ b/.woodpecker.yml @@ -0,0 +1,47 @@ +branches: main + +pipeline: + code_lint: + image: python + commands: + - pip install poetry + - poetry install + - pip install black + - echo "----------------- running lint ------------------" + - python --version && poetry --version && black --version + - poetry run black . + + build_dist: + image: python + commands: + - pip install poetry + - poetry install + - echo "----------------- running analysis ------------------" + - python --version && poetry --version + - poetry build + when: + branch: main + + gitea_release: + image: plugins/gitea-release + settings: + api_key: + from_secret: gitea_release_token + base_url: https://git.martyoeh.me + files: dist/* + title: NEWEST_VERSION.md + note: NEWEST_CHANGES.md + when: + event: tag + tag: v* + + pypi_release: + image: python + commands: + - pip install poetry + - poetry install + - echo "----------------- publishing to pypi ------------------" + - poetry publish --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD" + when: + event: tag + tag: v*