feat: Add extractor CLI choice

Only the pdf extractor can be chosen for the time being, but this allows additional extractors to be added in the future.
2024-01-23 08:58:32 +01:00 · 2024-01-23 08:58:32 +01:00 · f477deea7c
commit f477deea7c
parent 3b4db7b6b8
2 changed files with 22 additions and 5 deletions
--- a/papis_extract/init.py
+++ b/papis_extract/init.py
@ -50,6 +50,18 @@ papis.config.register_default_settings(DEFAULT_OPTIONS)
    ),
    help="Choose an output template to format annotations with.",
 )
@click.option(
    "--extractor",
    "-e",
    "extractors",
    type=click.Choice(
        list(extractor.extractors.keys()),
        case_sensitive=False,
    ),
    default=list(extractor.extractors.keys()),
    multiple=True,
    help="Choose an extractor to apply to the selected documents.",
 )
@click.option(
    "--force/--no-force",
    "-f",
@ -64,6 +76,7 @@ def main(
    doc_folder: str,
    manual: bool,
    write: bool,
    extractors: str,
    template: str,
    git: bool,
    force: bool,
@ -86,6 +99,7 @@ def main(
        logger.warning(papis.strings.no_documents_retrieved_message)
        return
    print(extractors)
    formatter = formatters.get(template)
    run(documents, edit=manual, write=write, git=git, formatter=formatter, force=force)
--- a/papis_extract/extractor.py
+++ b/papis_extract/extractor.py
@ -34,23 +34,26 @@ def start(
    pdf_extractor: Extractor = PdfExtractor()
    annotations: list[Annotation] = []
-    found_pdf: bool = False
+    file_available: bool = False
    for file in document.get_files():
        fname = Path(file)
        if not pdf_extractor.can_process(fname):
-            break
+            continue
-        found_pdf = True
+        file_available = True
        try:
            annotations.extend(pdf_extractor.run(fname))
        except fitz.FileDataError as e:
            print(f"File structure errors for {file}.\n{e}")
-    if not found_pdf:
+    if not file_available:
        # have to remove curlys or papis logger gets upset
        desc = re.sub("[{}]", "", papis.document.describe(document))
-        logger.warning("Did not find suitable PDF file for document: " f"{desc}")
+        logger.warning("Did not find suitable file for document: " f"{desc}")
    return annotations
 extractors: dict[str, Extractor] = {
    "pdf": PdfExtractor(),
 }