papis-extract/papis_extract/extraction.py
Marty Oehme 8093259551
refactor: Remove pymupdf coupling in extraction
The library is only needed for pdf extraction which is taken care of
in its own extractor plugin. In the overall extraction routine we do not
need any knowledge of the existence of pymupdf.
2024-06-14 14:59:39 +02:00

51 lines
1.3 KiB
Python

import re
from pathlib import Path
from typing import Protocol
import papis.config
import papis.document
import papis.logging
from papis.document import Document
from papis_extract.annotation import Annotation
from papis_extract.extractors import ExtractionError
# Module-level logger, obtained through papis' own logging helper.
logger = papis.logging.get_logger(__name__)
class Extractor(Protocol):
    """Structural interface expected of an annotation extractor.

    Any object providing these two methods can be handed to the
    extraction routine in this module.
    """

    def can_process(self, filename: Path) -> bool:
        """Report whether this extractor is able to handle ``filename``."""
        ...

    def run(self, filename: Path) -> list[Annotation]:
        """Return the annotations extracted from ``filename``."""
        ...
def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation]:
    """Extract all annotations from the files of a papis document.

    Every file attached to ``document`` is offered to ``extractor``; files
    the extractor cannot process are skipped. A failure while extracting
    one file is logged and does not stop the remaining files from being
    processed.

    Args:
        extractor: Extractor used to check and process each file.
        document: The papis document whose files are searched.

    Returns:
        The annotations collected from all processed files (possibly empty).
    """
    annotations: list[Annotation] = []
    file_available = False

    for file in document.get_files():
        fname = Path(file)
        if not extractor.can_process(fname):
            continue
        file_available = True

        try:
            annotations.extend(extractor.run(fname))
        except ExtractionError as e:
            # Report through the logger rather than print() so that errors
            # use the same channel as the other messages of this module.
            # Curly braces are removed for the same reason as below: the
            # papis logger gets upset by them.
            message = re.sub("[{}]", "", f"File extraction errors for {file}.\n{e}")
            logger.error(message)

    if not file_available:
        # have to remove curlys or papis logger gets upset
        desc = re.sub("[{}]", "", papis.document.describe(document))
        logger.info(f"No {type(extractor)} file for document: {desc}")

    return annotations