diff --git a/.woodpecker.yml b/.woodpecker.yml deleted file mode 100644 index caa86d4..0000000 --- a/.woodpecker.yml +++ /dev/null @@ -1,47 +0,0 @@ -branches: main - -pipeline: - code_lint: - image: python - commands: - - pip install poetry - - poetry install - - pip install black - - echo "----------------- running lint ------------------" - - python --version && poetry --version && black --version - - poetry run black . - - build_dist: - image: python - commands: - - pip install poetry - - poetry install - - echo "----------------- running analysis ------------------" - - python --version && poetry --version - - poetry build - when: - branch: main - - gitea_release: - image: plugins/gitea-release - settings: - api_key: - from_secret: gitea_release_token - base_url: https://git.martyoeh.me - files: dist/* - title: NEWEST_VERSION.md - note: NEWEST_CHANGES.md - when: - event: tag - tag: v* - - pypi_release: - image: python - commands: - - pip install poetry - - poetry install - - echo "----------------- publishing to pypi ------------------" - - poetry publish --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD" - when: - event: tag - tag: v* diff --git a/README.md b/README.md index ab0f100..03322d6 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,6 @@ content, because then we can just use that. It is harder to parse if it does not - [ ] needs some way to delimit where it puts stuff and user stuff is in note - [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between - [x] another, probably simpler first, is to just append missing annotations to the end of the note - - [ ] use similarity search instead of literal search for existing annotation (levenshtein) - [x] some highlights (or annotations in general) do not contain text as content - [x] pymupdf can extract the content of the underlying rectangle (mostly) - [x] issue is that sometimes the highlight contents are in content, sometimes a user comment instead diff --git a/pubs/plugs/extract/__init__.py b/extract/__init__.py similarity index 100% rename from pubs/plugs/extract/__init__.py rename to extract/__init__.py diff --git a/pubs/plugs/extract/extract.py b/extract/extract.py similarity index 69% rename from pubs/plugs/extract/extract.py rename to extract/extract.py index c513813..76c1149 100644 --- a/pubs/plugs/extract/extract.py +++ b/extract/extract.py @@ -1,23 +1,108 @@ import os +import re import argparse +import math +from dataclasses import dataclass, field +from typing import Dict import fitz import Levenshtein from pubs.plugins import PapersPlugin +from pubs.paper import Paper from pubs.events import DocAddEvent, NoteEvent + from pubs import repo, pretty from pubs.utils import resolve_citekey_list from pubs.content import check_file, read_text_file, write_file from pubs.query import get_paper_filter -from .annotation import ( - PaperAnnotated, - Annotation, - COLOR_SIMILARITY_MINIMUM, - TEXT_SIMILARITY_MINIMUM, -) CONFIRMATION_PAPER_THRESHOLD = 5 +TEXT_SIMILARITY_MINIMUM = 0.75 +COLOR_SIMILARITY_MINIMUM = 0.833 + +COLORS = { + "red": (1, 0, 0), + "green": (0, 1, 0), + "blue": (0, 0, 1), + "yellow": (1, 1, 0), + "purple": (0.5, 0, 0.5), + "orange": (1, 0.65, 0), +} + + +@dataclass +class Annotation: + """A PDF annotation object""" + + paper: Paper + file: str + type: str = "Highlight" + text: str = "" + content: str = "" + page: int = 1 + colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)}) + tag: str = "" + + def format(self, formatting): + """Return a formatted string of the annotation. + + Given a provided formatting pattern, this method returns the annotation + formatted with the correct marker replacements and removals, ready + for display or writing. + """ + output = formatting + replacements = { + r"{quote}": self.text, + r"{note}": self.content, + r"{page}": str(self.page), + r"{newline}": "\n", + r"{tag}": self.tag, + } + pattern = re.compile( + "|".join( + [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] + ), + flags=re.DOTALL, + ) + patt_quote_container = re.compile(r"{%quote_container(.*?)%}") + patt_note_container = re.compile(r"{%note_container(.*?)%}") + patt_tag_container = re.compile(r"{%tag_container(.*?)%}") + output = patt_quote_container.sub(r"\1" if self.text else "", output) + output = patt_note_container.sub(r"\1" if self.content else "", output) + output = patt_tag_container.sub(r"\1" if self.tag else "", output) + return pattern.sub(lambda x: replacements[x.group(0)], output) + + @property + def colorname(self): + """Return the stringified version of the annotation color. + + Finds the closest named color to the annotation and returns it. + """ + annot_colors = ( + self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0) + ) + nearest = None + minimum_similarity = COLOR_SIMILARITY_MINIMUM + for name, values in COLORS.items(): + similarity_ratio = self._color_similarity_ratio(values, annot_colors) + if similarity_ratio > minimum_similarity: + minimum_similarity = similarity_ratio + nearest = name + return nearest + + def headline(self, short=False, max_authors=3): + headline = pretty.paper_oneliner(self.paper, citekey_only=short, max_authors=max_authors) + return re.sub(r"\[pdf\]", "", headline).rstrip() + + def _color_similarity_ratio(self, color_one, color_two): + """Return the similarity of two colors between 0 and 1. + + Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange, + and returns the similarity between them, with 1 being the same color and 0 being the + difference between full black and full white, as a float. + """ + return 1 - (abs(math.dist([*color_one], [*color_two])) / 3) class ExtractPlugin(PapersPlugin): @@ -39,7 +124,6 @@ class ExtractPlugin(PapersPlugin): def __init__(self, conf, ui): self.ui = ui self.note_extension = conf["main"]["note_extension"] - self.max_authors = conf["main"]["max_authors"] self.repository = repo.Repository(conf) self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"]) self.broker = self.repository.databroker @@ -129,12 +213,12 @@ class ExtractPlugin(PapersPlugin): Returns all annotations belonging to the papers that are described by the citekeys passed in. """ - papers_annotated = [] + papers_annotated = {} for paper in papers: file = self._get_file(paper) try: - annotations = self._get_annotations(file) - papers_annotated.append(PaperAnnotated.from_paper(paper, annotations)) + annotations = self._get_annotations(file, paper) + papers_annotated[paper.citekey] = annotations except fitz.FileDataError as e: self.ui.error(f"Document {file} is broken: {e}") return papers_annotated @@ -172,7 +256,7 @@ class ExtractPlugin(PapersPlugin): self.ui.message( "\n".join( pretty.paper_oneliner( - p, citekey_only=False, max_authors=self.max_authors + p, citekey_only=False, max_authors=conf["main"]["max_authors"] ) for p in papers ) @@ -194,7 +278,7 @@ class ExtractPlugin(PapersPlugin): self.ui.warning(f"{paper.citekey} has no valid document.") return path - def _get_annotations(self, filename): + def _get_annotations(self, filename, paper): """Extract annotations from a file. Returns all readable annotations contained in the file @@ -208,6 +292,7 @@ class ExtractPlugin(PapersPlugin): quote, note = self._retrieve_annotation_content(page, annot) a = Annotation( file=filename, + paper=paper, text=quote, content=note, colors=annot.colors, @@ -242,16 +327,16 @@ class ExtractPlugin(PapersPlugin): # highlight with selection not in note return (written, "") - def _to_stdout(self, annotated_papers, short_header=False): + def _to_stdout(self, annotated_papers, short_header=True): """Write annotations to stdout. Simply outputs the gathered annotations over stdout ready to be passed on through pipelines etc. """ output = "" - for paper in annotated_papers: - output += f"\n------ {paper.headline(self.short_header, self.max_authors)} ------\n\n" - for annotation in paper.annotations: + for citekey, annotations in annotated_papers.items(): + output += f"\n------ {annotations[0].headline(short=short_header)} ------\n\n" + for annotation in annotations: output += f"{annotation.format(self.formatting)}\n" output += "\n" self.ui.message(output.strip()) @@ -263,35 +348,31 @@ class ExtractPlugin(PapersPlugin): in the pubs notes directory. Creates new notes for citekeys missing a note or appends to existing. """ - for paper in annotated_papers: - if paper.annotations: - notepath = self.broker.real_notepath(paper.citekey, note_extension) + for citekey, annotations in annotated_papers.items(): + if annotations: + notepath = self.broker.real_notepath(citekey, note_extension) if check_file(notepath, fail=False): - self._append_to_note(notepath, paper) + self._append_to_note(notepath, annotations) else: - self._write_new_note( - notepath, - paper, - paper.headline(short=True, max_authors=self.max_authors), - ) - self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.") + self._write_new_note(notepath, annotations) + self.ui.info(f"Wrote annotations to {citekey} note {notepath}.") if edit is True: self.ui.edit_file(notepath, temporary=False) - NoteEvent(paper.citekey).send() + NoteEvent(citekey).send() - def _write_new_note(self, notepath, paper, headline): + def _write_new_note(self, notepath, annotations): """Create a new note containing the annotations. Will create a new note in the notes folder of pubs and fill it with the annotations extracted from pdf. """ - output = f"# {headline}\n\n" - for annotation in paper.annotations: + output = f"# {annotations[0].headline(short=short_header)}\n\n" + for annotation in annotations: output += f"{annotation.format(self.formatting)}\n\n" write_file(notepath, output, "w") - def _append_to_note(self, notepath, paper): + def _append_to_note(self, notepath, annotations): """Append new annotations to the end of a note. Looks through note to determine any new annotations which should be @@ -300,7 +381,7 @@ class ExtractPlugin(PapersPlugin): existing = read_text_file(notepath) # removed annotations already found in the note existing_dropped = [ - x for x in paper.annotations if x.format(self.formatting) not in existing + x for x in annotations if x.format(self.formatting) not in existing ] if not existing_dropped: return diff --git a/pubs/plugs/extract/annotation.py b/pubs/plugs/extract/annotation.py deleted file mode 100644 index 0baefbb..0000000 --- a/pubs/plugs/extract/annotation.py +++ /dev/null @@ -1,111 +0,0 @@ -import math -import re -from dataclasses import dataclass, field -from typing import Dict - -from pubs.paper import Paper -from pubs import pretty - -TEXT_SIMILARITY_MINIMUM = 0.75 -COLOR_SIMILARITY_MINIMUM = 0.833 - -COLORS = { - "red": (1, 0, 0), - "green": (0, 1, 0), - "blue": (0, 0, 1), - "yellow": (1, 1, 0), - "purple": (0.5, 0, 0.5), - "orange": (1, 0.65, 0), -} - - -class PaperAnnotated(Paper): - def __init__(self, citekey, bibdata, metadata=None, annotations=[]): - super(PaperAnnotated, self).__init__(citekey, bibdata, metadata) - self.annotations = annotations - - @classmethod - def from_paper(cls, paper, annotations=[]): - return cls(paper.citekey, paper.bibdata, paper.metadata, annotations) - - def __repr__(self): - return "PaperAnnotated(%s, %s, %s)" % ( - self.citekey, - self.bibdata, - self.metadata, - ) - - def headline(self, short=False, max_authors=3): - headline = pretty.paper_oneliner( - self, citekey_only=short, max_authors=max_authors - ) - return re.sub(r"\[pdf\]", "", headline).rstrip() - - -@dataclass -class Annotation: - """A PDF annotation object""" - - file: str - type: str = "Highlight" - text: str = "" - content: str = "" - page: int = 1 - colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)}) - tag: str = "" - - def format(self, formatting): - """Return a formatted string of the annotation. - - Given a provided formatting pattern, this method returns the annotation - formatted with the correct marker replacements and removals, ready - for display or writing. - """ - output = formatting - replacements = { - r"{quote}": self.text, - r"{note}": self.content, - r"{page}": str(self.page), - r"{newline}": "\n", - r"{tag}": self.tag, - } - pattern = re.compile( - "|".join( - [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] - ), - flags=re.DOTALL, - ) - patt_quote_container = re.compile(r"{%quote_container(.*?)%}") - patt_note_container = re.compile(r"{%note_container(.*?)%}") - patt_tag_container = re.compile(r"{%tag_container(.*?)%}") - output = patt_quote_container.sub(r"\1" if self.text else "", output) - output = patt_note_container.sub(r"\1" if self.content else "", output) - output = patt_tag_container.sub(r"\1" if self.tag else "", output) - return pattern.sub(lambda x: replacements[x.group(0)], output) - - @property - def colorname(self): - """Return the stringified version of the annotation color. - - Finds the closest named color to the annotation and returns it. - """ - annot_colors = ( - self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0) - ) - nearest = None - minimum_similarity = COLOR_SIMILARITY_MINIMUM - for name, values in COLORS.items(): - similarity_ratio = self._color_similarity_ratio(values, annot_colors) - if similarity_ratio > minimum_similarity: - minimum_similarity = similarity_ratio - nearest = name - return nearest - - def _color_similarity_ratio(self, color_one, color_two): - """Return the similarity of two colors between 0 and 1. - - Takes two rgb color tuples made of floats between 0 and 1, e.g. (1, 0.65, 0) for orange, - and returns the similarity between them, with 1 being the same color and 0 being the - difference between full black and full white, as a float. - """ - return 1 - (abs(math.dist([*color_one], [*color_two])) / 3) diff --git a/pyproject.toml b/pyproject.toml index 0ee52ce..7bab3e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "A pdf annotation extraction plugin for pubs bibliography manager" authors = ["Marty Oehme "] license = "LGPL-3.0" readme = "README.md" -packages = [{include = "pubs"}] +packages = [{include = "extract"}] [tool.poetry.dependencies] python = "^3.10"