feat: Add extractor CLI choice
Only the pdf extractor can be chosen for the time being, but this allows additional extractors to be added in the future.
This commit is contained in:
parent
3b4db7b6b8
commit
f477deea7c
2 changed files with 22 additions and 5 deletions
|
@ -50,6 +50,18 @@ papis.config.register_default_settings(DEFAULT_OPTIONS)
|
||||||
),
|
),
|
||||||
help="Choose an output template to format annotations with.",
|
help="Choose an output template to format annotations with.",
|
||||||
)
|
)
|
||||||
|
@click.option(
|
||||||
|
"--extractor",
|
||||||
|
"-e",
|
||||||
|
"extractors",
|
||||||
|
type=click.Choice(
|
||||||
|
list(extractor.extractors.keys()),
|
||||||
|
case_sensitive=False,
|
||||||
|
),
|
||||||
|
default=list(extractor.extractors.keys()),
|
||||||
|
multiple=True,
|
||||||
|
help="Choose an extractor to apply to the selected documents.",
|
||||||
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--force/--no-force",
|
"--force/--no-force",
|
||||||
"-f",
|
"-f",
|
||||||
|
@ -64,6 +76,7 @@ def main(
|
||||||
doc_folder: str,
|
doc_folder: str,
|
||||||
manual: bool,
|
manual: bool,
|
||||||
write: bool,
|
write: bool,
|
||||||
|
extractors: str,
|
||||||
template: str,
|
template: str,
|
||||||
git: bool,
|
git: bool,
|
||||||
force: bool,
|
force: bool,
|
||||||
|
@ -86,6 +99,7 @@ def main(
|
||||||
logger.warning(papis.strings.no_documents_retrieved_message)
|
logger.warning(papis.strings.no_documents_retrieved_message)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
print(extractors)
|
||||||
formatter = formatters.get(template)
|
formatter = formatters.get(template)
|
||||||
|
|
||||||
run(documents, edit=manual, write=write, git=git, formatter=formatter, force=force)
|
run(documents, edit=manual, write=write, git=git, formatter=formatter, force=force)
|
||||||
|
|
|
@ -34,23 +34,26 @@ def start(
|
||||||
pdf_extractor: Extractor = PdfExtractor()
|
pdf_extractor: Extractor = PdfExtractor()
|
||||||
|
|
||||||
annotations: list[Annotation] = []
|
annotations: list[Annotation] = []
|
||||||
found_pdf: bool = False
|
file_available: bool = False
|
||||||
for file in document.get_files():
|
for file in document.get_files():
|
||||||
fname = Path(file)
|
fname = Path(file)
|
||||||
if not pdf_extractor.can_process(fname):
|
if not pdf_extractor.can_process(fname):
|
||||||
break
|
continue
|
||||||
found_pdf = True
|
file_available = True
|
||||||
|
|
||||||
try:
|
try:
|
||||||
annotations.extend(pdf_extractor.run(fname))
|
annotations.extend(pdf_extractor.run(fname))
|
||||||
except fitz.FileDataError as e:
|
except fitz.FileDataError as e:
|
||||||
print(f"File structure errors for {file}.\n{e}")
|
print(f"File structure errors for {file}.\n{e}")
|
||||||
|
|
||||||
if not found_pdf:
|
if not file_available:
|
||||||
# have to remove curlys or papis logger gets upset
|
# have to remove curlys or papis logger gets upset
|
||||||
desc = re.sub("[{}]", "", papis.document.describe(document))
|
desc = re.sub("[{}]", "", papis.document.describe(document))
|
||||||
logger.warning("Did not find suitable PDF file for document: " f"{desc}")
|
logger.warning("Did not find suitable file for document: " f"{desc}")
|
||||||
|
|
||||||
return annotations
|
return annotations
|
||||||
|
|
||||||
|
|
||||||
|
extractors: dict[str, Extractor] = {
|
||||||
|
"pdf": PdfExtractor(),
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue