feat: Loop through all chosen extractors

This commit is contained in:
Marty Oehme 2024-01-23 09:10:42 +01:00
parent f477deea7c
commit 629932a5e8
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 37 additions and 27 deletions

View file

@ -0,0 +1,58 @@
import re
from pathlib import Path
from typing import Protocol
import fitz
import papis.config
import papis.document
import papis.logging
from papis.document import Document
from papis_extract.annotation import Annotation
from papis_extract.extractors.pdf import PdfExtractor
logger = papis.logging.get_logger(__name__)
class Extractor(Protocol):
    """Structural interface every annotation extractor has to provide.

    Any object offering these two methods can be handed to ``start`` and
    registered in the ``extractors`` mapping.
    """

    def can_process(self, filename: Path) -> bool:
        """Tell whether this extractor is able to handle ``filename``."""

    def run(self, filename: Path) -> list[Annotation]:
        """Extract and return the annotations found in ``filename``."""
def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation]:
    """Collect the annotations of a single papis document.

    Each file attached to ``document`` that ``extractor`` reports it can
    process is run through the extractor, and the results are gathered in
    the order the files are listed. Files raising ``fitz.FileDataError``
    are reported on stdout and skipped. If none of the document's files
    is suitable for the extractor, a warning is logged.

    Returns the (possibly empty) list of extracted annotations.
    """
    collected: list[Annotation] = []
    found_suitable = False

    for file in document.get_files():
        path = Path(file)
        if extractor.can_process(path):
            # Counts as "suitable" even if the extraction itself fails below.
            found_suitable = True
            try:
                collected += extractor.run(path)
            except fitz.FileDataError as e:
                print(f"File structure errors for {file}.\n{e}")

    if found_suitable:
        return collected

    # Curly braces have to be stripped from the description, otherwise the
    # papis logger chokes on the message.
    desc = re.sub("[{}]", "", papis.document.describe(document))
    logger.warning(f"Did not find suitable file for document: {desc}")
    return collected
# Registry of the available extractors, keyed by the name used to select them.
extractors: dict[str, Extractor] = {"pdf": PdfExtractor()}