Add continuous integration pipeline

Format plugin file with black
Move plugin to nested folder for easy installation
2023-01-13 19:14:06 +01:00 · 2023-01-13 19:06:20 +01:00 · 2023-01-13 18:50:42 +01:00 · 2022-12-25 12:17:05 +01:00 · 2022-12-25 10:56:56 +01:00
6 changed files with 192 additions and 114 deletions
--- a/.woodpecker.yml
+++ b/.woodpecker.yml
@ -0,0 +1,47 @@
 branches: main
 pipeline:
  code_lint:
    image: python
    commands:
      - pip install poetry
      - poetry install
      - pip install black
      - echo "----------------- running lint ------------------"
      - python --version && poetry --version && black --version
      # --check only reports: black exits non-zero when a file would be
      # reformatted, so unformatted code fails this step instead of being
      # silently rewritten inside the throwaway CI workspace.
      - poetry run black --check .
  build_dist:
    image: python
    commands:
      - pip install poetry
      - poetry install
      # Banner names what this step actually does (it runs `poetry build`).
      - echo "----------------- building distribution ------------------"
      - python --version && poetry --version
      - poetry build
    when:
      branch: main
  # Publishes a release on the Gitea instance below; runs only for tag
  # events whose tag matches v*.
  gitea_release:
    image: plugins/gitea-release
    settings:
      # API token is taken from the CI secret store, not from this file.
      api_key:
        from_secret: gitea_release_token
      base_url: https://git.martyoeh.me
      # Everything under dist/ is attached to the release.
      files: dist/*
      # NOTE(review): presumably both values are files the plugin reads for
      # the release title and body - confirm they exist at release time.
      title: NEWEST_VERSION.md
      note: NEWEST_CHANGES.md
    when:
      event: tag
      tag: v*
  # Uploads the package to PyPI; runs only for tag events whose tag
  # matches v*.
  pypi_release:
    image: python
    # Secrets are only exposed to steps that list them; they arrive as
    # upper-cased environment variables (PYPI_USERNAME / PYPI_PASSWORD).
    # Without this list both variables expand to empty strings below.
    secrets: [pypi_username, pypi_password]
    commands:
      - pip install poetry
      - poetry install
      - echo "----------------- publishing to pypi ------------------"
      - poetry publish --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD"
    when:
      event: tag
      tag: v*
--- a/README.md
+++ b/README.md
@ -160,6 +160,7 @@ content, because then we can just use that. It is harder to parse if it does not
 - [ ] needs some way to delimit, within a note, the content the plugin writes from the content the user writes
    - [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between
    - [x] another, probably simpler first, is to just append missing annotations to the end of the note
    - [ ] use similarity search instead of literal search for existing annotation (levenshtein)
 - [x] some highlights (or annotations in general) do not contain text as content
    - [x] pymupdf can extract the content of the underlying rectangle (mostly)
    - [x] issue is that sometimes the highlight contents are in content, sometimes a user comment instead
--- a/pubs/plugs/extract/init.py
+++ b/pubs/plugs/extract/init.py
--- a/pubs/plugs/extract/annotation.py
+++ b/pubs/plugs/extract/annotation.py
@ -0,0 +1,111 @@
 import math
 import re
 from dataclasses import dataclass, field
 from typing import Dict
 from pubs.paper import Paper
 from pubs import pretty
 # Similarity threshold for text comparison.
 # NOTE(review): not used in the visible part of this module; extract.py
 # imports it - confirm how it is applied there.
 TEXT_SIMILARITY_MINIMUM = 0.75
 # Threshold that a named color's similarity ratio must exceed before
 # Annotation.colorname returns that color's name.
 COLOR_SIMILARITY_MINIMUM = 0.833
 # Named reference colors as RGB tuples with components between 0 and 1.
 # Annotation.colorname iterates this mapping in insertion order and only
 # replaces a candidate on a strictly higher ratio, so order decides ties.
 COLORS = {
    "red": (1, 0, 0),
    "green": (0, 1, 0),
    "blue": (0, 0, 1),
    "yellow": (1, 1, 0),
    "purple": (0.5, 0, 0.5),
    "orange": (1, 0.65, 0),
 }
 class PaperAnnotated(Paper):
    """A pubs paper bundled with the annotations extracted for it.

    Behaves like its ``Paper`` base class and additionally exposes the
    attached annotations through the ``annotations`` attribute.
    """

    def __init__(self, citekey, bibdata, metadata=None, annotations=None):
        """Create an annotated paper.

        ``citekey``, ``bibdata`` and ``metadata`` are forwarded unchanged to
        the base class initializer. ``annotations`` is the collection of
        annotations to attach; when omitted, a new empty list is used.
        """
        super().__init__(citekey, bibdata, metadata)
        # Build the fallback list per instance: a literal ``[]`` default in
        # the signature would be one shared object, so annotations appended
        # to one paper would show up on every other paper created without
        # an explicit list.
        self.annotations = [] if annotations is None else annotations

    @classmethod
    def from_paper(cls, paper, annotations=None):
        """Build an annotated paper from an existing paper.

        Copies ``citekey``, ``bibdata`` and ``metadata`` from ``paper`` and
        attaches ``annotations`` (a new empty list when omitted).
        """
        return cls(paper.citekey, paper.bibdata, paper.metadata, annotations)

    def __repr__(self):
        """Return a debug representation; annotations are not included."""
        return "PaperAnnotated(%s, %s, %s)" % (
            self.citekey,
            self.bibdata,
            self.metadata,
        )

    def headline(self, short=False, max_authors=3):
        """Return a one-line description of the paper.

        The line is produced by ``pretty.paper_oneliner`` (``short`` is
        passed as ``citekey_only``); every literal ``[pdf]`` is removed and
        trailing whitespace is stripped.
        """
        headline = pretty.paper_oneliner(
            self, citekey_only=short, max_authors=max_authors
        )
        return re.sub(r"\[pdf\]", "", headline).rstrip()
@dataclass
class Annotation:
    """A PDF annotation object.

    Holds the data of a single annotation, renders it through a
    user-supplied formatting pattern and names its color.
    """

    # File the annotation was read from.
    file: str
    # Annotation kind.
    type: str = "Highlight"
    # Text substituted for the {quote} marker.
    text: str = ""
    # Text substituted for the {note} marker.
    content: str = ""
    # Page number, substituted (as a string) for the {page} marker.
    page: int = 1
    # Color mapping; colorname reads its "stroke" and "fill" entries.
    colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
    # Text substituted for the {tag} marker.
    tag: str = ""

    def format(self, formatting):
        """Return a formatted string of the annotation.

        Given a provided formatting pattern, this method returns the
        annotation formatted with the correct marker replacements and
        removals, ready for display or writing.

        Containers ``{%quote_container ... %}``, ``{%note_container ... %}``
        and ``{%tag_container ... %}`` are resolved first: the wrapper is
        dropped and the inner part is kept only when the matching attribute
        (``text``, ``content``, ``tag``) is non-empty. Afterwards the markers
        ``{quote}``, ``{note}``, ``{page}``, ``{newline}`` and ``{tag}`` are
        replaced in a single pass, so substituted text is never re-scanned.
        """
        replacements = {
            r"{quote}": self.text,
            r"{note}": self.content,
            r"{page}": str(self.page),
            r"{newline}": "\n",
            r"{tag}": self.tag,
        }
        # Longest marker first so no marker can shadow a longer one.
        marker_pattern = re.compile(
            "|".join(
                re.escape(k) for k in sorted(replacements, key=len, reverse=True)
            )
        )
        # DOTALL belongs on the container patterns: they are the ones using
        # ".", and without it a container whose inner part contains a line
        # break never matches and is left verbatim in the output.
        containers = (
            (re.compile(r"{%quote_container(.*?)%}", flags=re.DOTALL), self.text),
            (re.compile(r"{%note_container(.*?)%}", flags=re.DOTALL), self.content),
            (re.compile(r"{%tag_container(.*?)%}", flags=re.DOTALL), self.tag),
        )
        output = formatting
        for container, value in containers:
            output = container.sub(r"\1" if value else "", output)
        return marker_pattern.sub(lambda m: replacements[m.group(0)], output)

    @property
    def colorname(self):
        """Return the stringified version of the annotation color.

        Uses the "stroke" color, falling back to "fill" and then to black
        when an entry is missing or empty. Returns the name from ``COLORS``
        with the highest similarity ratio, or ``None`` when no ratio exceeds
        ``COLOR_SIMILARITY_MINIMUM``.
        """
        annot_colors = (
            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
        )
        nearest = None
        best_similarity = COLOR_SIMILARITY_MINIMUM
        for name, values in COLORS.items():
            similarity = self._color_similarity_ratio(values, annot_colors)
            # Strict comparison: on a tie the earlier entry of COLORS wins.
            if similarity > best_similarity:
                best_similarity = similarity
                nearest = name
        return nearest

    def _color_similarity_ratio(self, color_one, color_two):
        """Return a similarity score for two rgb colors.

        Takes two rgb color tuples made of floats between 0 and 1, e.g.
        (1, 0.65, 0) for orange, and returns ``1 - distance / 3`` where
        ``distance`` is their Euclidean distance. Identical colors score 1.

        NOTE: the largest distance inside the unit rgb cube is sqrt(3), so
        full black against full white scores about 0.42 rather than 0. The
        divisor is kept as is because the color threshold used by
        ``colorname`` is compared against this exact scale.
        """
        return 1 - (math.dist(list(color_one), list(color_two)) / 3)
--- a/pubs/plugs/extract/extract.py
+++ b/pubs/plugs/extract/extract.py
@ -1,108 +1,23 @@
 import os
 import re
 import argparse
 import math
 from dataclasses import dataclass, field
 from typing import Dict
 import fitz
 import Levenshtein
 from pubs.plugins import PapersPlugin
 from pubs.paper import Paper
 from pubs.events import DocAddEvent, NoteEvent
 from pubs import repo, pretty
 from pubs.utils import resolve_citekey_list
 from pubs.content import check_file, read_text_file, write_file
 from pubs.query import get_paper_filter
 from .annotation import (
    PaperAnnotated,
    Annotation,
    COLOR_SIMILARITY_MINIMUM,
    TEXT_SIMILARITY_MINIMUM,
 )
 # NOTE(review): presumably the number of papers above which the plugin asks
 # for confirmation; its use is not visible here - confirm.
 CONFIRMATION_PAPER_THRESHOLD = 5
 # NOTE(review): the two thresholds below re-bind names that are already
 # imported from .annotation directly above, with the same values.
 TEXT_SIMILARITY_MINIMUM = 0.75
 # Threshold that a named color's similarity ratio must exceed before
 # Annotation.colorname returns that color's name.
 COLOR_SIMILARITY_MINIMUM = 0.833
 # Named reference colors as RGB tuples with components between 0 and 1;
 # iterated in insertion order by Annotation.colorname.
 COLORS = {
    "red": (1, 0, 0),
    "green": (0, 1, 0),
    "blue": (0, 0, 1),
    "yellow": (1, 1, 0),
    "purple": (0.5, 0, 0.5),
    "orange": (1, 0.65, 0),
 }
@dataclass
class Annotation:
    """A PDF annotation object.

    Holds the data of a single annotation together with the paper it
    belongs to, renders it through a user-supplied formatting pattern and
    names its color.
    """

    # Paper the annotation belongs to; used by headline().
    paper: Paper
    # File the annotation was read from.
    file: str
    # Annotation kind.
    type: str = "Highlight"
    # Text substituted for the {quote} marker.
    text: str = ""
    # Text substituted for the {note} marker.
    content: str = ""
    # Page number, substituted (as a string) for the {page} marker.
    page: int = 1
    # Color mapping; colorname reads its "stroke" and "fill" entries.
    colors: Dict = field(default_factory=lambda: {"stroke": (0.0, 0.0, 0.0)})
    # Text substituted for the {tag} marker.
    tag: str = ""

    def format(self, formatting):
        """Return a formatted string of the annotation.

        Given a provided formatting pattern, this method returns the
        annotation formatted with the correct marker replacements and
        removals, ready for display or writing.

        Containers ``{%quote_container ... %}``, ``{%note_container ... %}``
        and ``{%tag_container ... %}`` are resolved first: the wrapper is
        dropped and the inner part is kept only when the matching attribute
        (``text``, ``content``, ``tag``) is non-empty. Afterwards the markers
        ``{quote}``, ``{note}``, ``{page}``, ``{newline}`` and ``{tag}`` are
        replaced in a single pass, so substituted text is never re-scanned.
        """
        replacements = {
            r"{quote}": self.text,
            r"{note}": self.content,
            r"{page}": str(self.page),
            r"{newline}": "\n",
            r"{tag}": self.tag,
        }
        # Longest marker first so no marker can shadow a longer one.
        marker_pattern = re.compile(
            "|".join(
                re.escape(k) for k in sorted(replacements, key=len, reverse=True)
            )
        )
        # DOTALL belongs on the container patterns: they are the ones using
        # ".", and without it a container whose inner part contains a line
        # break never matches and is left verbatim in the output.
        containers = (
            (re.compile(r"{%quote_container(.*?)%}", flags=re.DOTALL), self.text),
            (re.compile(r"{%note_container(.*?)%}", flags=re.DOTALL), self.content),
            (re.compile(r"{%tag_container(.*?)%}", flags=re.DOTALL), self.tag),
        )
        output = formatting
        for container, value in containers:
            output = container.sub(r"\1" if value else "", output)
        return marker_pattern.sub(lambda m: replacements[m.group(0)], output)

    @property
    def colorname(self):
        """Return the stringified version of the annotation color.

        Uses the "stroke" color, falling back to "fill" and then to black
        when an entry is missing or empty. Returns the name from ``COLORS``
        with the highest similarity ratio, or ``None`` when no ratio exceeds
        ``COLOR_SIMILARITY_MINIMUM``.
        """
        annot_colors = (
            self.colors.get("stroke") or self.colors.get("fill") or (0.0, 0.0, 0.0)
        )
        nearest = None
        best_similarity = COLOR_SIMILARITY_MINIMUM
        for name, values in COLORS.items():
            similarity = self._color_similarity_ratio(values, annot_colors)
            # Strict comparison: on a tie the earlier entry of COLORS wins.
            if similarity > best_similarity:
                best_similarity = similarity
                nearest = name
        return nearest

    def headline(self, short=False, max_authors=3):
        """Return a one-line description of the annotation's paper.

        The line is produced by ``pretty.paper_oneliner`` (``short`` is
        passed as ``citekey_only``); every literal ``[pdf]`` is removed and
        trailing whitespace is stripped.
        """
        headline = pretty.paper_oneliner(
            self.paper, citekey_only=short, max_authors=max_authors
        )
        return re.sub(r"\[pdf\]", "", headline).rstrip()

    def _color_similarity_ratio(self, color_one, color_two):
        """Return a similarity score for two rgb colors.

        Takes two rgb color tuples made of floats between 0 and 1, e.g.
        (1, 0.65, 0) for orange, and returns ``1 - distance / 3`` where
        ``distance`` is their Euclidean distance. Identical colors score 1.

        NOTE: the largest distance inside the unit rgb cube is sqrt(3), so
        full black against full white scores about 0.42 rather than 0. The
        divisor is kept as is because the color threshold used by
        ``colorname`` is compared against this exact scale.
        """
        return 1 - (math.dist(list(color_one), list(color_two)) / 3)
 class ExtractPlugin(PapersPlugin):
@ -124,6 +39,7 @@ class ExtractPlugin(PapersPlugin):
    def __init__(self, conf, ui):
        self.ui = ui
        self.note_extension = conf["main"]["note_extension"]
        self.max_authors = conf["main"]["max_authors"]
        self.repository = repo.Repository(conf)
        self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
        self.broker = self.repository.databroker
@ -213,12 +129,12 @@ class ExtractPlugin(PapersPlugin):
        Returns all annotations belonging to the papers that
        are described by the citekeys passed in.
        """
-        papers_annotated = {}
+        papers_annotated = []
        for paper in papers:
            file = self._get_file(paper)
            try:
-                annotations = self._get_annotations(file, paper)
+                annotations = self._get_annotations(file)
-                papers_annotated[paper.citekey] = annotations
+                papers_annotated.append(PaperAnnotated.from_paper(paper, annotations))
            except fitz.FileDataError as e:
                self.ui.error(f"Document {file} is broken: {e}")
        return papers_annotated
@ -256,7 +172,7 @@ class ExtractPlugin(PapersPlugin):
            self.ui.message(
                "\n".join(
                    pretty.paper_oneliner(
-                        p, citekey_only=False, max_authors=conf["main"]["max_authors"]
+                        p, citekey_only=False, max_authors=self.max_authors
                    )
                    for p in papers
                )
@ -278,7 +194,7 @@ class ExtractPlugin(PapersPlugin):
            self.ui.warning(f"{paper.citekey} has no valid document.")
        return path
-    def _get_annotations(self, filename, paper):
+    def _get_annotations(self, filename):
        """Extract annotations from a file.
        Returns all readable annotations contained in the file
@ -292,7 +208,6 @@ class ExtractPlugin(PapersPlugin):
                    quote, note = self._retrieve_annotation_content(page, annot)
                    a = Annotation(
                        file=filename,
                        paper=paper,
                        text=quote,
                        content=note,
                        colors=annot.colors,
@ -327,16 +242,16 @@ class ExtractPlugin(PapersPlugin):
        # highlight with selection not in note
        return (written, "")
-    def _to_stdout(self, annotated_papers, short_header=True):
+    def _to_stdout(self, annotated_papers, short_header=False):
        """Write annotations to stdout.
        Simply outputs the gathered annotations over stdout
        ready to be passed on through pipelines etc.
        """
        output = ""
-        for citekey, annotations in annotated_papers.items():
+        for paper in annotated_papers:
-            output += f"\n------ {annotations[0].headline(short=short_header)} ------\n\n"
+            output += f"\n------ {paper.headline(self.short_header, self.max_authors)} ------\n\n"
-            for annotation in annotations:
+            for annotation in paper.annotations:
                output += f"{annotation.format(self.formatting)}\n"
                output += "\n"
        self.ui.message(output.strip())
@ -348,31 +263,35 @@ class ExtractPlugin(PapersPlugin):
        in the pubs notes directory. Creates new notes for
        citekeys missing a note or appends to existing.
        """
-        for citekey, annotations in annotated_papers.items():
+        for paper in annotated_papers:
-            if annotations:
+            if paper.annotations:
-                notepath = self.broker.real_notepath(citekey, note_extension)
+                notepath = self.broker.real_notepath(paper.citekey, note_extension)
                if check_file(notepath, fail=False):
-                    self._append_to_note(notepath, annotations)
+                    self._append_to_note(notepath, paper)
                else:
-                    self._write_new_note(notepath, annotations)
+                    self._write_new_note(
-                self.ui.info(f"Wrote annotations to {citekey} note {notepath}.")
+                        notepath,
                        paper,
                        paper.headline(short=True, max_authors=self.max_authors),
                    )
                self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.")
                if edit is True:
                    self.ui.edit_file(notepath, temporary=False)
-                NoteEvent(citekey).send()
+                NoteEvent(paper.citekey).send()
-    def _write_new_note(self, notepath, annotations):
+    def _write_new_note(self, notepath, paper, headline):
        """Create a new note containing the annotations.
        Will create a new note in the notes folder of pubs
        and fill it with the annotations extracted from pdf.
        """
-        output = f"# {annotations[0].headline(short=short_header)}\n\n"
+        output = f"# {headline}\n\n"
-        for annotation in annotations:
+        for annotation in paper.annotations:
            output += f"{annotation.format(self.formatting)}\n\n"
        write_file(notepath, output, "w")
-    def _append_to_note(self, notepath, annotations):
+    def _append_to_note(self, notepath, paper):
        """Append new annotations to the end of a note.
        Looks through note to determine any new annotations which should be
@ -381,7 +300,7 @@ class ExtractPlugin(PapersPlugin):
        existing = read_text_file(notepath)
        # removed annotations already found in the note
        existing_dropped = [
-            x for x in annotations if x.format(self.formatting) not in existing
+            x for x in paper.annotations if x.format(self.formatting) not in existing
        ]
        if not existing_dropped:
            return
--- a/pyproject.toml
+++ b/pyproject.toml
@ -5,7 +5,7 @@ description = "A pdf annotation extraction plugin for pubs bibliography manager"
 authors = ["Marty Oehme <marty.oehme@gmail.com>"]
 license = "LGPL-3.0"
 readme = "README.md"
-packages = [{include = "extract"}]
+packages = [{include = "pubs"}]
 [tool.poetry.dependencies]
 python = "^3.10"
Author	SHA1	Message	Date
Marty Oehme	2c5d096d08	Add continuous integration pipeline All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2023-01-13 19:14:06 +01:00
Marty Oehme	0634cbb381	Format plugin file with black	2023-01-13 19:06:20 +01:00
Marty Oehme	e1a7d9f613	Move plugin to nested folder for easy installation Moved the plugin files to the directory structure they will have in the python site-packages directory, being placed within the plugs directory of the pubs application directory.	2023-01-13 18:50:42 +01:00
Marty Oehme	04c8a8ed0b	Refactor extraction to use PaperAnnotated class	2022-12-25 12:17:05 +01:00
Marty Oehme	7a415b4d7d	Refactor annotation into separate class	2022-12-25 10:56:56 +01:00