# papis-extract/papis_extract/extractor.py

from pathlib import Path

import Levenshtein
import fitz_new as fitz
import papis.logging
import papis.config

from papis_extract.annotation_data import Annotation

# Lookup table from an annotation's color name to the tag assigned to it.
# It is empty as defined here, so `_tag_from_colorname` returns "" for every
# color unless entries are added to it somewhere else.
COLOR_MAPPING = {}

# Module-level logger obtained through papis' logging helper.
logger = papis.logging.get_logger(__name__)


def start(filename: Path) -> list[Annotation]:
    """Collect the annotations stored in a document.

    Opens ``filename`` with fitz and walks every annotation of every
    page. Each one becomes an ``Annotation`` holding the file name, the
    quoted text and note (as split by ``_retrieve_annotation_content``),
    the annotation's colors, the second element of its ``type`` and the
    page number plus one (a missing page number counts as 0). The tag of
    each result is derived from its ``colorname``.

    No filtering by annotation type happens in this function; every
    annotation yielded by ``page.annots()`` is included in the result.
    """
    found = []
    with fitz.Document(filename) as document:
        for page in document:
            for pdf_annot in page.annots():
                highlighted, remark = _retrieve_annotation_content(page, pdf_annot)
                entry = Annotation(
                    file=str(filename),
                    text=highlighted,
                    content=remark,
                    colors=pdf_annot.colors,
                    type=pdf_annot.type[1],
                    page=(page.number or 0) + 1,
                )
                entry.tag = _tag_from_colorname(entry.colorname)
                found.append(entry)

    # Use the singular noun only for exactly one result.
    noun = "annotation" if len(found) == 1 else "annotations"
    logger.debug(f"Found {len(found)} {noun} for {filename}.")
    return found


def _tag_from_colorname(colorname):
    """Look up the tag registered for a color name.

    Returns the ``COLOR_MAPPING`` entry stored under ``colorname``, or an
    empty string when the mapping has no such key.
    """
    if colorname in COLOR_MAPPING:
        return COLOR_MAPPING[colorname]
    return ""


def _retrieve_annotation_content(page, annotation):
    """Split an annotation into its quoted text and its attached note.

    Two strings are read, with newlines flattened to spaces in both:
    the note stored on the annotation (``annotation.info["content"]``)
    and the page text inside the annotation's rectangle
    (``page.get_textbox(annotation.rect)``). Their Levenshtein ratio is
    compared against the ``minimum_similarity_content`` option of the
    ``plugins.extract`` config section to decide whether the note merely
    repeats the highlighted text.

    Returns a ``(quote, note)`` tuple of strings:

    * ``(content, "")`` if the note is similar enough to the page text
      to be considered the same selection,
    * ``("", content)`` if there is a note but no page text underneath,
    * ``(written, content)`` if both exist and differ,
    * ``(written, "")`` if there is no note at all.
    """
    content = annotation.info["content"].replace("\n", " ")
    written = page.get_textbox(annotation.rect).replace("\n", " ")

    # Only fall back to 1.0 when the option is really unset. A plain
    # `value or 1.0` would also discard an explicitly configured 0.0 and
    # silently turn "always merge" into the opposite extreme.
    minimum_similarity = papis.config.getfloat(
        "minimum_similarity_content", "plugins.extract"
    )
    if minimum_similarity is None:
        minimum_similarity = 1.0
    # NOTE(review): the comparison below is strict, so with the 1.0
    # fallback even identical strings are presumably never merged
    # (assuming the ratio tops out at 1.0) - confirm this is intended.

    # highlight with selection in note
    if Levenshtein.ratio(content, written) > minimum_similarity:
        return (content, "")
    # an independent note, not a highlight
    if content and not written:
        return ("", content)
    # both a highlight and a note
    if content:
        return (written, content)
    # highlight with selection not in note
    return (written, "")