feat: Loop through all chosen extractors
This commit is contained in:
parent
f477deea7c
commit
629932a5e8
2 changed files with 37 additions and 27 deletions
58
papis_extract/extraction.py
Normal file
58
papis_extract/extraction.py
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
import re
|
||||
from pathlib import Path
|
||||
from typing import Protocol
|
||||
|
||||
import fitz
|
||||
import papis.config
|
||||
import papis.document
|
||||
import papis.logging
|
||||
from papis.document import Document
|
||||
|
||||
from papis_extract.annotation import Annotation
|
||||
from papis_extract.extractors.pdf import PdfExtractor
|
||||
|
||||
# Module-level logger, obtained through papis' logging helper and named after
# this module.
logger = papis.logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Extractor(Protocol):
    """Structural interface every annotation extractor has to satisfy.

    Any object offering these two methods can be handed to ``start``; no
    inheritance from this class is required.
    """

    def can_process(self, filename: Path) -> bool:
        """Tell whether this extractor is able to handle ``filename``."""

    def run(self, filename: Path) -> list[Annotation]:
        """Return the annotations extracted from ``filename``."""
|
||||
|
||||
|
||||
def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation]:
    """Extract all annotations from the files of a single papis document.

    Every file attached to ``document`` is offered to ``extractor``. Files the
    extractor cannot process are skipped; the annotations of all remaining
    files are collected into one list.

    Args:
        extractor: Object providing ``can_process`` and ``run``.
        document: The papis document whose attached files are scanned.

    Returns:
        The annotations gathered from every processable file, in the order
        the files are returned by ``document.get_files()``. The list is empty
        when no file was suitable or nothing could be extracted.
    """
    annotations: list[Annotation] = []
    file_available: bool = False

    for file in document.get_files():
        fname = Path(file)
        if not extractor.can_process(fname):
            continue
        file_available = True

        try:
            annotations.extend(extractor.run(fname))
        except fitz.FileDataError as e:
            # Report through the module logger (not print) so the message
            # follows the configured log level and handlers, like the warning
            # below. Curly braces are stripped for the same reason as there:
            # the papis logger gets upset by them.
            msg = re.sub("[{}]", "", f"File structure errors for {file}.\n{e}")
            logger.error(msg)

    if not file_available:
        # have to remove curlys or papis logger gets upset
        desc = re.sub("[{}]", "", papis.document.describe(document))
        logger.warning("Did not find suitable file for document: " f"{desc}")

    return annotations
|
||||
|
||||
|
||||
# Available extractor instances, keyed by a short name.
# NOTE(review): presumably looked up by callers to select which extractors to
# run; the lookup itself is not visible in this module.
extractors: dict[str, Extractor] = {
    "pdf": PdfExtractor(),
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue