2023-08-29 10:40:36 +00:00
|
|
|
import re
|
2023-08-28 08:28:06 +00:00
|
|
|
from pathlib import Path
|
2024-01-20 17:02:18 +00:00
|
|
|
from typing import Protocol
|
2023-08-28 08:28:06 +00:00
|
|
|
|
|
|
|
import papis.config
|
2023-08-29 10:40:36 +00:00
|
|
|
import papis.document
|
2024-01-20 17:02:18 +00:00
|
|
|
import papis.logging
|
2023-08-29 10:40:36 +00:00
|
|
|
from papis.document import Document
|
2023-08-28 08:28:06 +00:00
|
|
|
|
2024-01-20 15:34:10 +00:00
|
|
|
from papis_extract.annotation import Annotation
|
2024-06-14 13:18:02 +00:00
|
|
|
from papis_extract.exceptions import ExtractionError
|
2023-08-28 08:28:06 +00:00
|
|
|
|
2023-08-28 10:53:03 +00:00
|
|
|
logger = papis.logging.get_logger(__name__)
|
|
|
|
|
2023-08-29 20:23:52 +00:00
|
|
|
|
2024-01-20 17:02:18 +00:00
|
|
|
class Extractor(Protocol):
    """Structural interface for an annotation extractor.

    Any object providing these two methods can be handed to ``start``;
    no inheritance from this class is required.
    """

    def can_process(self, filename: Path) -> bool:
        """Tell whether this extractor is able to handle ``filename``.

        ``start`` skips every file for which this returns a falsy value.
        """

    def run(self, filename: Path) -> list[Annotation]:
        """Extract and return the annotations found in ``filename``.

        ``start`` catches an ``ExtractionError`` raised from here, logs it
        and carries on with the next file.
        """
|
2024-01-20 17:02:18 +00:00
|
|
|
|
|
|
|
|
2023-08-29 10:40:36 +00:00
|
|
|
def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation] | None:
    """Collect the annotations of every file attached to a papis document.

    Each file of ``document`` is offered to ``extractor``; files it cannot
    process are skipped. If extraction of a file raises ``ExtractionError``,
    the error is logged and the remaining files are still handled.

    Returns the combined annotations of all processable files (an empty
    list if none were found). If the document contains no file that the
    extractor can process, returns None instead.
    """
    collected: list[Annotation] = []
    saw_processable_file = False

    for file in document.get_files():
        path = Path(file)
        if extractor.can_process(path):
            # Remember that at least one file was eligible, even if its
            # extraction fails below; this separates "nothing to process"
            # (None) from "processed, but found nothing" (empty list).
            saw_processable_file = True
            try:
                collected += extractor.run(path)
            except ExtractionError as e:
                logger.error(f"File extraction errors for {file}. File may be damaged.\n{e}")

    return collected if saw_processable_file else None
|