2023-08-29 10:40:36 +00:00
|
|
|
import re
|
2023-08-28 08:28:06 +00:00
|
|
|
from pathlib import Path
|
2024-01-20 17:02:18 +00:00
|
|
|
from typing import Protocol
|
2023-08-28 08:28:06 +00:00
|
|
|
|
2024-01-18 17:24:19 +00:00
|
|
|
import fitz
|
2023-08-28 08:28:06 +00:00
|
|
|
import papis.config
|
2023-08-29 10:40:36 +00:00
|
|
|
import papis.document
|
2024-01-20 17:02:18 +00:00
|
|
|
import papis.logging
|
2023-08-29 10:40:36 +00:00
|
|
|
from papis.document import Document
|
2023-08-28 08:28:06 +00:00
|
|
|
|
2024-01-20 15:34:10 +00:00
|
|
|
from papis_extract.annotation import Annotation
|
2023-08-28 08:28:06 +00:00
|
|
|
|
2023-08-28 10:53:03 +00:00
|
|
|
# Module-level logger, obtained through papis' logging helper and named after
# this module.
logger = papis.logging.get_logger(__name__)
|
|
|
|
|
2023-08-29 20:23:52 +00:00
|
|
|
|
2024-01-20 17:02:18 +00:00
|
|
|
class Extractor(Protocol):
    """Structural interface for objects that pull annotations out of a file.

    Any object providing these two methods can be handed to ``start``.
    """

    def can_process(self, filename: Path) -> bool:
        """Tell whether this extractor is able to handle ``filename``."""
        ...

    def run(self, filename: Path) -> list[Annotation]:
        """Return the annotations extracted from ``filename``."""
        ...
|
2024-01-20 17:02:18 +00:00
|
|
|
|
|
|
|
|
2023-08-29 10:40:36 +00:00
|
|
|
def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation]:
    """Extract all annotations from the files of a single papis document.

    Every file returned by ``document.get_files()`` is offered to
    ``extractor``; files it reports it cannot process are skipped. If
    extraction of a file raises ``fitz.FileDataError`` the problem is
    logged as an error and the remaining files are still processed.
    When no file of the document could be handled by the extractor at
    all, an informational message is logged.

    Args:
        extractor: Object used to test and process each file.
        document: The papis document whose files are searched.

    Returns:
        The annotations gathered from all processed files, in the order
        the files were returned by the document. Empty when nothing was
        extracted.
    """
    annotations: list[Annotation] = []
    file_available: bool = False

    for file in document.get_files():
        fname = Path(file)
        if not extractor.can_process(fname):
            continue
        file_available = True

        try:
            annotations.extend(extractor.run(fname))
        except fitz.FileDataError as e:
            # Report through the module logger instead of stdout so the
            # message follows the configured log handling.
            # have to remove curlys or papis logger gets upset
            details = re.sub("[{}]", "", f"{file}.\n{e}")
            logger.error(f"File structure errors for {details}")

    if not file_available:
        # have to remove curlys or papis logger gets upset
        desc = re.sub("[{}]", "", papis.document.describe(document))
        logger.info(f"No {type(extractor)} file for document: {desc}")

    return annotations
|