# papis-extract/papis_extract/extractor.py

import re
from pathlib import Path
from typing import Protocol

import fitz
import papis.config
import papis.document
import papis.logging
from papis.document import Document

from papis_extract.annotation import Annotation
from papis_extract.extractors.pdf import PdfExtractor

logger = papis.logging.get_logger(__name__)


class Extractor(Protocol):
    """Structural interface for objects that pull annotations out of a file."""

    def can_process(self, filename: Path) -> bool:
        """Tell whether this extractor is able to handle ``filename``."""

    def run(self, filename: Path) -> list[Annotation]:
        """Return the annotations extracted from ``filename``."""


def start(
    document: Document,
) -> list[Annotation]:
    """Extract all annotations from the files of a papis document.

    Every file attached to ``document`` that the PDF extractor accepts is
    run through it and the resulting annotations are collected. A file that
    raises ``fitz.FileDataError`` is reported through the logger and
    skipped. A warning is logged when the document has no file the
    extractor accepts.

    Args:
        document: The papis document whose files are inspected.

    Returns:
        The annotations gathered from all processable files; the list is
        empty when nothing could be extracted.
    """
    pdf_extractor: Extractor = PdfExtractor()

    annotations: list[Annotation] = []
    file_available = False
    for file in document.get_files():
        fname = Path(file)
        if not pdf_extractor.can_process(fname):
            continue
        file_available = True

        try:
            annotations.extend(pdf_extractor.run(fname))
        except fitz.FileDataError as e:
            # Report through the module logger (like the warning below)
            # instead of print(), so the message honours the configured
            # log level and handlers. Values are passed as lazy %-args so
            # the message template itself never contains braces.
            logger.error("File structure errors for %s.\n%s", file, e)

    if not file_available:
        # have to remove curlys or papis logger gets upset
        desc = re.sub("[{}]", "", papis.document.describe(document))
        logger.warning(f"Did not find suitable file for document: {desc}")

    return annotations


# Extractor instances keyed by a short name.
extractors: dict[str, Extractor] = dict(pdf=PdfExtractor())