papis-extract/papis_extract/annotation_data.py

import math
from dataclasses import dataclass, field

import papis.config
from papis.document import Document
import chevron

TEXT_SIMILARITY_MINIMUM = 0.75
COLOR_SIMILARITY_MINIMUM = 0.833

COLORS = {
    "red": (1, 0, 0),
    "green": (0, 1, 0),
    "blue": (0, 0, 1),
    "yellow": (1, 1, 0),
    "purple": (0.5, 0, 0.5),
    "orange": (1, 0.65, 0),
}


@dataclass
class Annotation:
    """A PDF annotation object.

    Contains all information necessary for the annotation itself, content and metadata.
    """

    file: str
    colors: tuple[float, float, float] = field(default_factory=lambda: (0.0, 0.0, 0.0))
    content: str = ""
    page: int = 0
    tag: str = ""
    text: str = ""
    type: str = "Highlight"
    minimum_similarity_color: float = 1.0

    def format(self, template: str, doc: Document = Document()):
        """Return a formatted string of the annotation.

        Given a provided formatting pattern, this method returns the annotation
        formatted with the correct marker replacements and removals, ready
        for display or writing.
        """
        data = {
            "file": self.file,
            "quote": self.text,
            "note": self.content,
            "page": self.page,
            "tag": self.tag,
            "type": self.type,
            "doc": doc,
        }
        return chevron.render(template, data)

    @property
    def colorname(self):
        """Return the stringified version of the annotation color.

        Finds the closest named color to the annotation and returns it,
        using euclidian distance between the two color vectors.
        """
        annot_colors = self.colors or (0.0, 0.0, 0.0)
        nearest = None
        minimum_similarity = (
            papis.config.getfloat("minimum_similarity_color", "plugins.extract") or 1.0
        )
        minimum_similarity = self.minimum_similarity_color
        for name, values in COLORS.items():
            similarity_ratio = self._color_similarity_ratio(values, annot_colors)
            if similarity_ratio >= minimum_similarity:
                minimum_similarity = similarity_ratio
                nearest = name
        return nearest

    def _color_similarity_ratio(self, color_one, color_two):
        """Return the similarity of two colors between 0 and 1.

        Takes two rgb color tuples made of floats between 0 and 1,
        e.g. (1, 0.65, 0) for orange, and returns the similarity
        between them, with 1 being the same color and 0 being the
        difference between full black and full white, as a float.
        """
        return 1 - (abs(math.dist([*color_one], [*color_two])) / 3)


@dataclass
class AnnotatedDocument:
    """Contains all annotations belonging to a single papis document.

    Combines a document with a list of annotations which belong to it."""

    document: Document
    annotations: list[Annotation]

    # TODO could implement a from_doc() static method to generate annotation list?