Marty Oehme
3bdc37b729
Until now, whenever an extractor could not find any valid files for a document, it would inform the user of this. However, this is not very useful: if you have a pdf and an epub extractor running, it would inform you about each document which only had one of the two formats, as well as about those which actually did not have any valid files for *any* of the running extractors. This commit changes the behavior to only inform the user when none of the running extractors finds a valid file, since that is the actual case a user might want to be informed about.
50 lines
1.3 KiB
Python
50 lines
1.3 KiB
Python
import re
|
|
from pathlib import Path
|
|
from typing import Protocol
|
|
|
|
import papis.config
|
|
import papis.document
|
|
import papis.logging
|
|
from papis.document import Document
|
|
|
|
from papis_extract.annotation import Annotation
|
|
from papis_extract.exceptions import ExtractionError
|
|
|
|
logger = papis.logging.get_logger(__name__)
|
|
|
|
|
|
class Extractor(Protocol):
    """Structural interface that every annotation extractor provides.

    Any object offering these two methods can be handed to ``start``;
    no inheritance from this class is required.
    """

    def can_process(self, filename: Path) -> bool:
        """Tell whether this extractor is able to handle ``filename``."""
        ...

    def run(self, filename: Path) -> list[Annotation]:
        """Extract and return the annotations found in ``filename``."""
        ...
|
|
|
|
|
|
def start(
    extractor: Extractor,
    document: Document,
) -> list[Annotation] | None:
    """Collect the annotations of one papis document with one extractor.

    Each file attached to the document is offered to the extractor in
    turn; files it cannot process are skipped. If extraction fails for
    a file with an ``ExtractionError``, the error is logged and the
    remaining files are still processed.

    Returns the combined annotations of all processed files (an empty
    list if none were found). If the extractor could not process any
    of the document's files, returns None instead, so callers can tell
    "nothing to look at" apart from "looked, found nothing".
    """
    collected: list[Annotation] = []
    processed_any = False

    for file in document.get_files():
        path = Path(file)
        if extractor.can_process(path):
            # Mark before running: a file that fails to extract still
            # counts as one the extractor was able to take on.
            processed_any = True
            try:
                collected += extractor.run(path)
            except ExtractionError as e:
                logger.error(f"File extraction errors for {file}. File may be damaged.\n{e}")

    return collected if processed_any else None
|