papis-extract/papis_extract/extraction.py

51 lines
1.3 KiB
Python
Raw Normal View History

import re
2023-08-28 08:28:06 +00:00
from pathlib import Path
from typing import Protocol
2023-08-28 08:28:06 +00:00
import papis.config
import papis.document
import papis.logging
from papis.document import Document
2023-08-28 08:28:06 +00:00
from papis_extract.annotation import Annotation
from papis_extract.exceptions import ExtractionError
2023-08-28 08:28:06 +00:00
2023-08-28 10:53:03 +00:00
# Module-level logger obtained through papis' logging helper.
logger = papis.logging.get_logger(__name__)
class Extractor(Protocol):
    """Structural interface that annotation extractors must satisfy.

    Any object providing these two methods can be handed to ``start``;
    no inheritance from this class is required.
    """

    def can_process(self, filename: Path) -> bool:
        """Return whether this extractor is able to handle *filename*."""
        ...

    def run(self, filename: Path) -> list[Annotation]:
        """Extract and return the annotations found in *filename*."""
        ...
def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation]:
    """Extract all annotations from the files of a papis document.

    Each file returned by ``document.get_files()`` is offered to the
    extractor; files it cannot process are skipped. An ``ExtractionError``
    raised while processing one file is reported and does not prevent the
    remaining files from being processed.

    Args:
        extractor: Extractor used to check and process each file.
        document: The papis document whose files are searched.

    Returns:
        The annotations collected from every processable file, in file
        order. The list is empty when no file could be processed or when
        the processed files yielded no annotations.
    """
    annotations: list[Annotation] = []
    file_available: bool = False

    for file in document.get_files():
        fname = Path(file)
        if not extractor.can_process(fname):
            continue
        file_available = True

        try:
            annotations.extend(extractor.run(fname))
        except ExtractionError as e:
            # Report the failure but keep going with the remaining files.
            print(f"File extraction errors for {file}.\n{e}")

    if not file_available:
        # have to remove curlys or papis logger gets upset
        desc = re.sub("[{}]", "", papis.document.describe(document))
        logger.info(f"No {type(extractor)} file for document: {desc}")

    return annotations