50 lines
1.3 KiB
Python
50 lines
1.3 KiB
Python
import re
|
|
from pathlib import Path
|
|
from typing import Protocol
|
|
|
|
import papis.config
|
|
import papis.document
|
|
import papis.logging
|
|
from papis.document import Document
|
|
|
|
from papis_extract.annotation import Annotation
|
|
from papis_extract.exceptions import ExtractionError
|
|
|
|
logger = papis.logging.get_logger(__name__)
|
|
|
|
|
|
class Extractor(Protocol):
    """Structural interface every annotation extractor has to satisfy.

    Any object providing these two methods can be passed to ``start``.
    """

    def can_process(self, filename: Path) -> bool:
        """Report whether this extractor is able to handle ``filename``."""
        ...

    def run(self, filename: Path) -> list[Annotation]:
        """Extract the annotations contained in ``filename`` and return them.

        ``start`` in this module catches ``ExtractionError`` around this
        call, so implementations presumably raise it when a file cannot
        be read -- confirm against the concrete extractors.
        """
        ...
|
|
|
|
|
|
def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation]:
    """Extract all annotations from the files of a single papis document.

    Every file attached to ``document`` that ``extractor`` reports it can
    process is run through the extractor and the results are collected
    into one list. A file whose extraction raises ``ExtractionError`` is
    logged and skipped, so the remaining files are still processed.

    Args:
        extractor: Extractor used to filter and read the document's files.
        document: The papis document whose files are inspected.

    Returns:
        The annotations gathered from all processable files. The list is
        empty when no file could be processed or none yielded annotations.
    """
    annotations: list[Annotation] = []
    file_available = False

    for file in document.get_files():
        fname = Path(file)
        if not extractor.can_process(fname):
            continue
        file_available = True

        try:
            annotations.extend(extractor.run(fname))
        except ExtractionError as e:
            # Report through the module logger instead of stdout so the
            # message follows the configured log level and handlers.
            # Curly braces are removed because the papis logger gets
            # upset by them (file names and error texts may contain some).
            msg = re.sub("[{}]", "", f"File extraction errors for {file}.\n{e}")
            logger.error(msg)

    if not file_available:
        # have to remove curlys or papis logger gets upset
        desc = re.sub("[{}]", "", papis.document.describe(document))
        logger.info(f"No {type(extractor)} file for document: {desc}")

    return annotations
|