feat: Add extractor cli choice

Only the PDF extractor can be chosen for the time being, but the option
allows additional extractors to be added in the future.
This commit is contained in:
Marty Oehme 2024-01-23 08:58:32 +01:00
parent 3b4db7b6b8
commit f477deea7c
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 22 additions and 5 deletions

View file

@ -50,6 +50,18 @@ papis.config.register_default_settings(DEFAULT_OPTIONS)
), ),
help="Choose an output template to format annotations with.", help="Choose an output template to format annotations with.",
) )
@click.option(
"--extractor",
"-e",
"extractors",
type=click.Choice(
list(extractor.extractors.keys()),
case_sensitive=False,
),
default=list(extractor.extractors.keys()),
multiple=True,
help="Choose an extractor to apply to the selected documents.",
)
@click.option( @click.option(
"--force/--no-force", "--force/--no-force",
"-f", "-f",
@ -64,6 +76,7 @@ def main(
doc_folder: str, doc_folder: str,
manual: bool, manual: bool,
write: bool, write: bool,
extractors: str,
template: str, template: str,
git: bool, git: bool,
force: bool, force: bool,
@ -86,6 +99,7 @@ def main(
logger.warning(papis.strings.no_documents_retrieved_message) logger.warning(papis.strings.no_documents_retrieved_message)
return return
print(extractors)
formatter = formatters.get(template) formatter = formatters.get(template)
run(documents, edit=manual, write=write, git=git, formatter=formatter, force=force) run(documents, edit=manual, write=write, git=git, formatter=formatter, force=force)

View file

@ -34,23 +34,26 @@ def start(
pdf_extractor: Extractor = PdfExtractor() pdf_extractor: Extractor = PdfExtractor()
annotations: list[Annotation] = [] annotations: list[Annotation] = []
found_pdf: bool = False file_available: bool = False
for file in document.get_files(): for file in document.get_files():
fname = Path(file) fname = Path(file)
if not pdf_extractor.can_process(fname): if not pdf_extractor.can_process(fname):
break continue
found_pdf = True file_available = True
try: try:
annotations.extend(pdf_extractor.run(fname)) annotations.extend(pdf_extractor.run(fname))
except fitz.FileDataError as e: except fitz.FileDataError as e:
print(f"File structure errors for {file}.\n{e}") print(f"File structure errors for {file}.\n{e}")
if not found_pdf: if not file_available:
# have to remove curlys or papis logger gets upset # have to remove curlys or papis logger gets upset
desc = re.sub("[{}]", "", papis.document.describe(document)) desc = re.sub("[{}]", "", papis.document.describe(document))
logger.warning("Did not find suitable PDF file for document: " f"{desc}") logger.warning("Did not find suitable file for document: " f"{desc}")
return annotations return annotations
# Registry of the available extractors, keyed by a short name; currently
# only the PDF extractor is registered.
extractors: dict[str, Extractor] = {"pdf": PdfExtractor()}