Marty Oehme
8093259551
The library is only needed for pdf extraction which is taken care of in its own extractor plugin. In the overall extraction routine we do not need any knowledge of the existence of pymupdf.
51 lines
1.3 KiB
Python
51 lines
1.3 KiB
Python
import re
|
|
from pathlib import Path
|
|
from typing import Protocol
|
|
|
|
import papis.config
|
|
import papis.document
|
|
import papis.logging
|
|
from papis.document import Document
|
|
|
|
from papis_extract.annotation import Annotation
|
|
from papis_extract.extractors import ExtractionError
|
|
|
|
logger = papis.logging.get_logger(__name__)
|
|
|
|
|
|
class Extractor(Protocol):
    """Structural interface for annotation extractors.

    Any object providing these two methods can be handed to the extraction
    routine; no inheritance from this class is required.
    """

    def can_process(self, filename: Path) -> bool:
        """Return whether this extractor is able to handle ``filename``."""

    def run(self, filename: Path) -> list[Annotation]:
        """Extract the annotations found in ``filename`` and return them."""
|
|
|
|
|
|
def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation]:
    """Extract all annotations from the files of a papis document.

    Each file returned by ``document.get_files()`` is offered to
    ``extractor``. Files it reports it cannot process are skipped. If
    extraction of a file raises ``ExtractionError``, the failure is logged
    and the remaining files are still processed.

    Args:
        extractor: Object used to test (``can_process``) and extract
            (``run``) each file.
        document: The papis document whose files are examined.

    Returns:
        The annotations gathered from every successfully processed file,
        in the order the files were visited. May be empty.
    """
    annotations: list[Annotation] = []
    file_available: bool = False

    for file in document.get_files():
        fname = Path(file)
        if not extractor.can_process(fname):
            continue
        file_available = True

        try:
            annotations.extend(extractor.run(fname))
        except ExtractionError as e:
            # Report through the module logger instead of print() so the
            # message respects the configured log level and handlers.
            # Curly braces are stripped for the same reason as below: the
            # papis logger gets upset by them.
            msg = re.sub("[{}]", "", f"File extraction errors for {file}.\n{e}")
            logger.error(msg)

    if not file_available:
        # have to remove curlys or papis logger gets upset
        desc = re.sub("[{}]", "", papis.document.describe(document))
        logger.info(f"No {type(extractor)} file for document: {desc}")

    return annotations
|