# papis-extract/papis_extract/extraction.py

import re
from pathlib import Path
from typing import Protocol

import papis.config
import papis.document
import papis.logging
from papis.document import Document

from papis_extract.annotation import Annotation
from papis_extract.exceptions import ExtractionError

logger = papis.logging.get_logger(__name__)


class Extractor(Protocol):
    """Structural interface for objects that pull annotations out of a file.

    Any object providing these two methods with compatible signatures
    satisfies the protocol; no explicit inheritance is required.
    """

    def can_process(self, filename: Path) -> bool:
        """Return whether this extractor can handle the file at ``filename``."""

    def run(self, filename: Path) -> list[Annotation]:
        """Return the annotations extracted from the file at ``filename``."""


def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation] | None:
    """Gather annotations from every file of a document the extractor accepts.

    Each file attached to ``document`` is offered to ``extractor`` in turn.
    Files it cannot process are skipped. If the extractor raises an
    ``ExtractionError`` on a file, the error is logged and processing
    continues with the next file.

    Returns the combined list of annotations (empty if the processable
    files contained none), or ``None`` if the extractor could not process
    any of the document's files.
    """
    collected: list[Annotation] = []
    processed_any = False

    for source in document.get_files():
        path = Path(source)
        if extractor.can_process(path):
            processed_any = True
            try:
                # Accumulate inside the try so a failure on one file never
                # discards what earlier files already contributed.
                collected += extractor.run(path)
            except ExtractionError as err:
                logger.error(
                    f"File extraction errors for {source}. File may be damaged.\n{err}"
                )

    # None signals "nothing to extract from", as opposed to "nothing found".
    return collected if processed_any else None