import re
from pathlib import Path
from typing import Protocol

import papis.config
import papis.document
import papis.logging
from papis.document import Document

from papis_extract.annotation import Annotation
from papis_extract.extractors import ExtractionError

logger = papis.logging.get_logger(__name__)


class Extractor(Protocol):
    """Structural interface expected of the extractor passed to `start`."""

    def can_process(self, filename: Path) -> bool:
        """Return whether this extractor is able to handle *filename*."""
        ...

    def run(self, filename: Path) -> list[Annotation]:
        """Return the annotations extracted from *filename*.

        `start` expects failures to surface as `ExtractionError`.
        """
        ...


def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation]:
    """Collect the annotations of every processable file of a document.

    Each file reported by ``document.get_files()`` is offered to the
    extractor; files it cannot process are skipped. An `ExtractionError`
    raised for one file is printed and does not stop the remaining files
    from being processed.

    Args:
        extractor: The extractor used to test and read each file.
        document: The papis document whose files are inspected.

    Returns:
        All annotations gathered from the document's files; empty when
        no file could be processed or nothing was extracted.
    """
    collected: list[Annotation] = []
    matched_any = False

    for entry in document.get_files():
        path = Path(entry)
        if extractor.can_process(path):
            matched_any = True
            try:
                collected.extend(extractor.run(path))
            except ExtractionError as err:
                print(f"File extraction errors for {entry}.\n{err}")

    if matched_any:
        return collected

    # Curly braces are stripped from the description because the papis
    # logger gets upset when they appear in a message.
    desc = re.sub("[{}]", "", papis.document.describe(document))
    logger.info(f"No {type(extractor)} file for document: {desc}")
    return collected