diff --git a/.woodpecker.yml b/.woodpecker.yml new file mode 100644 index 0000000..caa86d4 --- /dev/null +++ b/.woodpecker.yml @@ -0,0 +1,47 @@ +branches: main + +pipeline: + code_lint: + image: python + commands: + - pip install poetry + - poetry install + - pip install black + - echo "----------------- running lint ------------------" + - python --version && poetry --version && black --version + - poetry run black . + + build_dist: + image: python + commands: + - pip install poetry + - poetry install + - echo "----------------- running analysis ------------------" + - python --version && poetry --version + - poetry build + when: + branch: main + + gitea_release: + image: plugins/gitea-release + settings: + api_key: + from_secret: gitea_release_token + base_url: https://git.martyoeh.me + files: dist/* + title: NEWEST_VERSION.md + note: NEWEST_CHANGES.md + when: + event: tag + tag: v* + + pypi_release: + image: python + commands: + - pip install poetry + - poetry install + - echo "----------------- publishing to pypi ------------------" + - poetry publish --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD" + when: + event: tag + tag: v* diff --git a/README.md b/README.md index 03322d6..ab0f100 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,7 @@ content, because then we can just use that. 
It is harder to parse if it does not
 - [ ] needs some way to delimit where it puts stuff and user stuff is in note
   - [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between
   - [x] another, probably simpler first, is to just append missing annotations to the end of the note
+  - [ ] use similarity search instead of literal search for existing annotation (levenshtein)
 - [x] some highlights (or annotations in general) do not contain text as content
   - [x] pymupdf can extract the content of the underlying rectangle (mostly)
   - [x] issue is that sometimes the highlight contents are in content, sometimes a user comment instead
diff --git a/extract/__init__.py b/pubs/plugs/extract/__init__.py
similarity index 100%
rename from extract/__init__.py
rename to pubs/plugs/extract/__init__.py
diff --git a/pubs/plugs/extract/annotation.py b/pubs/plugs/extract/annotation.py
new file mode 100644
index 0000000..0baefbb
--- /dev/null
+++ b/pubs/plugs/extract/annotation.py
@@ -0,0 +1,124 @@
+import math
+import re
+from dataclasses import dataclass, field
+from typing import Dict
+
+from pubs.paper import Paper
+from pubs import pretty
+
+TEXT_SIMILARITY_MINIMUM = 0.75
+COLOR_SIMILARITY_MINIMUM = 0.833
+
+COLORS = {
+    "red": (1, 0, 0),
+    "green": (0, 1, 0),
+    "blue": (0, 0, 1),
+    "yellow": (1, 1, 0),
+    "purple": (0.5, 0, 0.5),
+    "orange": (1, 0.65, 0),
+}
+
+
+class PaperAnnotated(Paper):
+    """A pubs paper together with the annotations extracted from it."""
+
+    def __init__(self, citekey, bibdata, metadata=None, annotations=None):
+        super(PaperAnnotated, self).__init__(citekey, bibdata, metadata)
+        # Create a fresh list per instance; a mutable `[]` default would be
+        # shared by every paper constructed without explicit annotations.
+        self.annotations = [] if annotations is None else annotations
+
+    @classmethod
+    def from_paper(cls, paper, annotations=None):
+        """Create an annotated paper from an existing paper's data."""
+        return cls(paper.citekey, paper.bibdata, paper.metadata, annotations)
+
+    def __repr__(self):
+        return "PaperAnnotated(%s, %s, %s)" % (
+            self.citekey,
+            self.bibdata,
+            self.metadata,
+        )
+
+    def headline(self, short=False, max_authors=3):
+        """Return the one-line paper description without its "[pdf]" marker."""
+        headline = pretty.paper_oneliner(
+            self, citekey_only=short, max_authors=max_authors
+        )
+        return re.sub(r"\[pdf\]", "", headline).rstrip()
+
+
+@dataclass
+class Annotation:
+    """A PDF annotation object"""
+
+    file: str
+    type: str = "Highlight"
+    text: str = ""
+    content: str = ""
+    page: int = 1
+    colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
+    tag: str = ""
+
+    def format(self, formatting):
+        """Return a formatted string of the annotation.
+
+        Given a provided formatting pattern, this method returns the annotation
+        formatted with the correct marker replacements and removals, ready
+        for display or writing.
+
+        The markers {quote}, {note}, {page}, {newline} and {tag} are replaced
+        with the respective annotation values. A {%quote_container ...%},
+        {%note_container ...%} or {%tag_container ...%} section is kept (minus
+        its wrapper) only if the annotation has a quote, note or tag, and is
+        removed entirely otherwise.
+        """
+        output = formatting
+        replacements = {
+            r"{quote}": self.text,
+            r"{note}": self.content,
+            r"{page}": str(self.page),
+            r"{newline}": "\n",
+            r"{tag}": self.tag,
+        }
+        pattern = re.compile(
+            "|".join(
+                [re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
+            ),
+            flags=re.DOTALL,
+        )
+        patt_quote_container = re.compile(r"{%quote_container(.*?)%}")
+        patt_note_container = re.compile(r"{%note_container(.*?)%}")
+        patt_tag_container = re.compile(r"{%tag_container(.*?)%}")
+        output = patt_quote_container.sub(r"\1" if self.text else "", output)
+        output = patt_note_container.sub(r"\1" if self.content else "", output)
+        output = patt_tag_container.sub(r"\1" if self.tag else "", output)
+        return pattern.sub(lambda x: replacements[x.group(0)], output)
+
+    @property
+    def colorname(self):
+        """Return the stringified version of the annotation color.
+
+        Finds the closest named color to the annotation and returns it, or
+        None if no named color is more similar than COLOR_SIMILARITY_MINIMUM.
+        """
+        annot_colors = (
+            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
+        )
+        nearest = None
+        minimum_similarity = COLOR_SIMILARITY_MINIMUM
+        for name, values in COLORS.items():
+            similarity_ratio = self._color_similarity_ratio(values, annot_colors)
+            if similarity_ratio > minimum_similarity:
+                minimum_similarity = similarity_ratio
+                nearest = name
+        return nearest
+
+    def _color_similarity_ratio(self, color_one, color_two):
+        """Return a similarity score for two colors, 1 meaning identical.
+
+        Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange,
+        and returns 1 minus a third of their Euclidean distance. As that distance is at most
+        sqrt(3) (full black against full white), the score never drops below roughly 0.42.
+        """
+        return 1 - (math.dist([*color_one], [*color_two]) / 3)
diff --git a/extract/extract.py b/pubs/plugs/extract/extract.py
similarity index 69%
rename from extract/extract.py
rename to pubs/plugs/extract/extract.py
index 76c1149..c513813 100644
--- a/extract/extract.py
+++ b/pubs/plugs/extract/extract.py
@@ -1,108 +1,23 @@
 import os
-import re
 import argparse
-import math
-from dataclasses import dataclass, field
-from typing import Dict
 
 import fitz
 import Levenshtein
 
 from pubs.plugins import PapersPlugin
-from pubs.paper import Paper
 from pubs.events import DocAddEvent, NoteEvent
-
 from pubs import repo, pretty
 from pubs.utils import resolve_citekey_list
 from pubs.content import check_file, read_text_file, write_file
 from pubs.query import get_paper_filter
+from .annotation import (
+    PaperAnnotated,
+    Annotation,
+    COLOR_SIMILARITY_MINIMUM,
+    TEXT_SIMILARITY_MINIMUM,
+)
 
 CONFIRMATION_PAPER_THRESHOLD = 5
-TEXT_SIMILARITY_MINIMUM = 0.75
-COLOR_SIMILARITY_MINIMUM = 0.833
-
-COLORS = {
-    "red": (1, 0, 0),
-    "green": (0, 1, 0),
-    "blue": (0, 0, 1),
-    "yellow": (1, 1, 0),
-    "purple": (0.5, 0, 0.5),
-    "orange": (1, 0.65, 0),
-}
-
-
-@dataclass
-class Annotation:
-    """A PDF annotation object"""
-
-    paper: Paper
-    file: str
- type: str = "Highlight" - text: str = "" - content: str = "" - page: int = 1 - colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)}) - tag: str = "" - - def format(self, formatting): - """Return a formatted string of the annotation. - - Given a provided formatting pattern, this method returns the annotation - formatted with the correct marker replacements and removals, ready - for display or writing. - """ - output = formatting - replacements = { - r"{quote}": self.text, - r"{note}": self.content, - r"{page}": str(self.page), - r"{newline}": "\n", - r"{tag}": self.tag, - } - pattern = re.compile( - "|".join( - [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] - ), - flags=re.DOTALL, - ) - patt_quote_container = re.compile(r"{%quote_container(.*?)%}") - patt_note_container = re.compile(r"{%note_container(.*?)%}") - patt_tag_container = re.compile(r"{%tag_container(.*?)%}") - output = patt_quote_container.sub(r"\1" if self.text else "", output) - output = patt_note_container.sub(r"\1" if self.content else "", output) - output = patt_tag_container.sub(r"\1" if self.tag else "", output) - return pattern.sub(lambda x: replacements[x.group(0)], output) - - @property - def colorname(self): - """Return the stringified version of the annotation color. - - Finds the closest named color to the annotation and returns it. 
- """ - annot_colors = ( - self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0) - ) - nearest = None - minimum_similarity = COLOR_SIMILARITY_MINIMUM - for name, values in COLORS.items(): - similarity_ratio = self._color_similarity_ratio(values, annot_colors) - if similarity_ratio > minimum_similarity: - minimum_similarity = similarity_ratio - nearest = name - return nearest - - def headline(self, short=False, max_authors=3): - headline = pretty.paper_oneliner(self.paper, citekey_only=short, max_authors=max_authors) - return re.sub(r"\[pdf\]", "", headline).rstrip() - - def _color_similarity_ratio(self, color_one, color_two): - """Return the similarity of two colors between 0 and 1. - - Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange, - and returns the similarity between them, with 1 being the same color and 0 being the - difference between full black and full white, as a float. - """ - return 1 - (abs(math.dist([*color_one], [*color_two])) / 3) class ExtractPlugin(PapersPlugin): @@ -124,6 +39,7 @@ class ExtractPlugin(PapersPlugin): def __init__(self, conf, ui): self.ui = ui self.note_extension = conf["main"]["note_extension"] + self.max_authors = conf["main"]["max_authors"] self.repository = repo.Repository(conf) self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"]) self.broker = self.repository.databroker @@ -213,12 +129,12 @@ class ExtractPlugin(PapersPlugin): Returns all annotations belonging to the papers that are described by the citekeys passed in. 
""" - papers_annotated = {} + papers_annotated = [] for paper in papers: file = self._get_file(paper) try: - annotations = self._get_annotations(file, paper) - papers_annotated[paper.citekey] = annotations + annotations = self._get_annotations(file) + papers_annotated.append(PaperAnnotated.from_paper(paper, annotations)) except fitz.FileDataError as e: self.ui.error(f"Document {file} is broken: {e}") return papers_annotated @@ -256,7 +172,7 @@ class ExtractPlugin(PapersPlugin): self.ui.message( "\n".join( pretty.paper_oneliner( - p, citekey_only=False, max_authors=conf["main"]["max_authors"] + p, citekey_only=False, max_authors=self.max_authors ) for p in papers ) @@ -278,7 +194,7 @@ class ExtractPlugin(PapersPlugin): self.ui.warning(f"{paper.citekey} has no valid document.") return path - def _get_annotations(self, filename, paper): + def _get_annotations(self, filename): """Extract annotations from a file. Returns all readable annotations contained in the file @@ -292,7 +208,6 @@ class ExtractPlugin(PapersPlugin): quote, note = self._retrieve_annotation_content(page, annot) a = Annotation( file=filename, - paper=paper, text=quote, content=note, colors=annot.colors, @@ -327,16 +242,16 @@ class ExtractPlugin(PapersPlugin): # highlight with selection not in note return (written, "") - def _to_stdout(self, annotated_papers, short_header=True): + def _to_stdout(self, annotated_papers, short_header=False): """Write annotations to stdout. Simply outputs the gathered annotations over stdout ready to be passed on through pipelines etc. 
""" output = "" - for citekey, annotations in annotated_papers.items(): - output += f"\n------ {annotations[0].headline(short=short_header)} ------\n\n" - for annotation in annotations: + for paper in annotated_papers: + output += f"\n------ {paper.headline(self.short_header, self.max_authors)} ------\n\n" + for annotation in paper.annotations: output += f"{annotation.format(self.formatting)}\n" output += "\n" self.ui.message(output.strip()) @@ -348,31 +263,35 @@ class ExtractPlugin(PapersPlugin): in the pubs notes directory. Creates new notes for citekeys missing a note or appends to existing. """ - for citekey, annotations in annotated_papers.items(): - if annotations: - notepath = self.broker.real_notepath(citekey, note_extension) + for paper in annotated_papers: + if paper.annotations: + notepath = self.broker.real_notepath(paper.citekey, note_extension) if check_file(notepath, fail=False): - self._append_to_note(notepath, annotations) + self._append_to_note(notepath, paper) else: - self._write_new_note(notepath, annotations) - self.ui.info(f"Wrote annotations to {citekey} note {notepath}.") + self._write_new_note( + notepath, + paper, + paper.headline(short=True, max_authors=self.max_authors), + ) + self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.") if edit is True: self.ui.edit_file(notepath, temporary=False) - NoteEvent(citekey).send() + NoteEvent(paper.citekey).send() - def _write_new_note(self, notepath, annotations): + def _write_new_note(self, notepath, paper, headline): """Create a new note containing the annotations. Will create a new note in the notes folder of pubs and fill it with the annotations extracted from pdf. 
""" - output = f"# {annotations[0].headline(short=short_header)}\n\n" - for annotation in annotations: + output = f"# {headline}\n\n" + for annotation in paper.annotations: output += f"{annotation.format(self.formatting)}\n\n" write_file(notepath, output, "w") - def _append_to_note(self, notepath, annotations): + def _append_to_note(self, notepath, paper): """Append new annotations to the end of a note. Looks through note to determine any new annotations which should be @@ -381,7 +300,7 @@ class ExtractPlugin(PapersPlugin): existing = read_text_file(notepath) # removed annotations already found in the note existing_dropped = [ - x for x in annotations if x.format(self.formatting) not in existing + x for x in paper.annotations if x.format(self.formatting) not in existing ] if not existing_dropped: return diff --git a/pyproject.toml b/pyproject.toml index 7bab3e1..0ee52ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "A pdf annotation extraction plugin for pubs bibliography manager" authors = ["Marty Oehme "] license = "LGPL-3.0" readme = "README.md" -packages = [{include = "extract"}] +packages = [{include = "pubs"}] [tool.poetry.dependencies] python = "^3.10"