Add an --extractor/-e CLI option and a module-level extractor registry.

papis_extract/__init__.py
  * Adds a repeatable "--extractor" / "-e" click option (bound to the
    "extractors" parameter of main) whose choices and default are the keys
    of extractor.extractors.
  * The "extractors" parameter is annotated tuple[str, ...] because the
    option is declared with multiple=True. The post-image blob hash on the
    first "index" line predates this annotation correction.
  * NOTE(review): the added print(extractors) writes to stdout on every
    invocation and looks like leftover debug output; the value is not yet
    passed on to run(). Confirm before merging.

papis_extract/extractor.py
  * In start(): a file the PDF extractor cannot process is now skipped
    ("continue") instead of ending the loop ("break"), so later files of
    the same document are still examined.
  * Renames found_pdf to file_available and drops "PDF" from the warning.
  * Adds the module-level "extractors" mapping (name -> Extractor instance).

diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py
index 8d7bf8a..6239533 100644
--- a/papis_extract/__init__.py
+++ b/papis_extract/__init__.py
@@ -50,6 +50,18 @@ papis.config.register_default_settings(DEFAULT_OPTIONS)
     ),
     help="Choose an output template to format annotations with.",
 )
+@click.option(
+    "--extractor",
+    "-e",
+    "extractors",
+    type=click.Choice(
+        list(extractor.extractors.keys()),
+        case_sensitive=False,
+    ),
+    default=list(extractor.extractors.keys()),
+    multiple=True,
+    help="Choose an extractor to apply to the selected documents.",
+)
 @click.option(
     "--force/--no-force",
     "-f",
@@ -64,6 +76,7 @@ def main(
     doc_folder: str,
     manual: bool,
     write: bool,
+    extractors: tuple[str, ...],
     template: str,
     git: bool,
     force: bool,
@@ -86,6 +99,7 @@ def main(
         logger.warning(papis.strings.no_documents_retrieved_message)
         return
 
+    print(extractors)
     formatter = formatters.get(template)
 
     run(documents, edit=manual, write=write, git=git, formatter=formatter, force=force)
diff --git a/papis_extract/extractor.py b/papis_extract/extractor.py
index d88d365..b0aa3cf 100644
--- a/papis_extract/extractor.py
+++ b/papis_extract/extractor.py
@@ -34,23 +34,26 @@ def start(
     pdf_extractor: Extractor = PdfExtractor()
 
     annotations: list[Annotation] = []
-    found_pdf: bool = False
+    file_available: bool = False
     for file in document.get_files():
         fname = Path(file)
         if not pdf_extractor.can_process(fname):
-            break
-        found_pdf = True
+            continue
+        file_available = True
 
         try:
             annotations.extend(pdf_extractor.run(fname))
         except fitz.FileDataError as e:
             print(f"File structure errors for {file}.\n{e}")
 
-    if not found_pdf:
+    if not file_available:
         # have to remove curlys or papis logger gets upset
         desc = re.sub("[{}]", "", papis.document.describe(document))
-        logger.warning("Did not find suitable PDF file for document: " f"{desc}")
+        logger.warning("Did not find suitable file for document: " f"{desc}")
 
     return annotations
 
 
+extractors: dict[str, Extractor] = {
+    "pdf": PdfExtractor(),
+}