2024-06-14 18:02:52 +00:00
|
|
|
import re
|
2024-11-15 10:28:50 +00:00
|
|
|
|
2023-08-28 08:28:06 +00:00
|
|
|
import click
|
|
|
|
import papis.cli
|
|
|
|
import papis.config
|
|
|
|
import papis.document
|
|
|
|
import papis.logging
|
|
|
|
import papis.notes
|
|
|
|
import papis.strings
|
2023-08-29 10:40:36 +00:00
|
|
|
from papis.document import Document
|
2023-08-28 08:28:06 +00:00
|
|
|
|
2024-01-25 20:34:40 +00:00
|
|
|
from papis_extract import extraction
|
2024-06-13 19:20:53 +00:00
|
|
|
from papis_extract.annotation import Annotation
|
2024-01-25 20:34:40 +00:00
|
|
|
from papis_extract.exporter import Exporter
|
|
|
|
from papis_extract.exporters import all_exporters
|
2024-06-13 19:20:53 +00:00
|
|
|
from papis_extract.extractors import all_extractors
|
2023-10-17 20:03:35 +00:00
|
|
|
from papis_extract.formatter import Formatter, formatters
|
2023-08-28 08:28:06 +00:00
|
|
|
|
|
|
|
# Module-wide logger, obtained through papis' own logging helper.
logger = papis.logging.get_logger(__name__)

# Default values for the "plugins.extract" configuration section; they are
# handed to papis' configuration registry right below.
DEFAULT_OPTIONS: dict[str, dict[str, bool | float | dict[str, str]]] = {
    "plugins.extract": {
        "tags": {},
        "on_import": False,
        # for checking against existing annotations
        "minimum_similarity": 0.75,
        # for checking if highlight or note
        "minimum_similarity_content": 0.9,
        # for matching tag to color
        "minimum_similarity_color": 0.833,
    },
}
papis.config.register_default_settings(DEFAULT_OPTIONS)
|
|
|
|
|
|
|
|
|
|
|
|
@click.command("extract")
@click.help_option("-h", "--help")
@papis.cli.query_argument()
@papis.cli.doc_folder_option()
@papis.cli.git_option(help="Commit changes made to the notes files.")
@papis.cli.all_option()
@click.option(
    "--write/--no-write",
    "-w",
    help="Write extracted annotations into papis notes.",
    show_default=True,
)
@click.option(
    "--manual/--no-manual",
    "-m",
    help="Open note in editor for manual editing after annotation extraction.",
    show_default=True,
)
@click.option(
    "--template",
    "-t",
    type=click.Choice(list(formatters.keys()), case_sensitive=False),
    help="Choose an output template to format annotations with.",
    show_default=True,
)
@click.option(
    "--extractor",
    "-e",
    "extractors",
    type=click.Choice(list(all_extractors.keys()), case_sensitive=False),
    default=list(all_extractors.keys()),
    multiple=True,
    help="Choose an extractor to apply to the selected documents. [default: all]",
)
@click.option(
    "--force/--no-force",
    "-f",
    help="Do not drop any annotations because they already exist.",
    show_default=True,
)
def main(
    query: str,
    # Not wired up yet: _papis_id: bool, _file: bool, _dir: bool
    _all: bool,
    doc_folder: str,
    manual: bool,
    write: bool,
    extractors: list[str],
    template: str,
    git: bool,
    force: bool,
) -> None:
    """Extract annotations from any documents.

    The extract plugin allows manual or automatic extraction of all annotations
    contained in the documents belonging to entries of the papis library,
    primarily targeting PDF documents currently.
    It can write those changes to stdout or directly create and update notes
    for papis documents.

    It adds a `papis extract` subcommand through which it is invoked, but can
    optionally run whenever a new document is imported for a papis entry,
    if set in the plugin configuration.
    """
    # NOTE: the docstring above doubles as the command's help text, so it is
    # user-facing output rather than purely internal documentation.
    selected_docs = papis.cli.handle_doc_folder_query_all_sort(
        query, doc_folder, sort_field=None, sort_reverse=False, _all=_all
    )

    if selected_docs:
        # Both lookups use ``.get`` and may therefore yield ``None``; ``run``
        # substitutes a default formatter and skips missing extractors.
        chosen_formatter = formatters.get(template)
        chosen_extractors = [all_extractors.get(name) for name in extractors]
        run(
            selected_docs,
            edit=manual,
            write=write,
            git=git,
            formatter=chosen_formatter,
            extractors=chosen_extractors,
            force=force,
        )
    else:
        logger.warning(papis.strings.no_documents_retrieved_message)
|
2023-08-28 08:28:06 +00:00
|
|
|
|
|
|
|
|
2023-08-29 10:40:36 +00:00
|
|
|
def run(
    documents: list[Document],
    formatter: Formatter | None,
    extractors: list[extraction.Extractor | None],
    edit: bool = False,
    write: bool = False,
    git: bool = False,
    force: bool = False,
) -> None:
    """Collect annotations for each document and pass them to an exporter.

    Args:
        documents: The documents to run the extractors on.
        formatter: Formatter handed to the exporter. A falsy value selects
            ``formatters["markdown-atx"]`` when writing notes and
            ``formatters["markdown"]`` otherwise.
        extractors: Extractors to try on every document; falsy entries
            (such as ``None``) are skipped.
        edit: Forwarded to the "notes" exporter; unused without ``write``.
        write: Use the "notes" exporter when true, else the "stdout" one.
        git: Forwarded to the "notes" exporter; unused without ``write``.
        force: Forwarded to the "notes" exporter; unused without ``write``.
    """
    exporter: Exporter
    if not write:
        exporter = all_exporters["stdout"](
            formatter=formatter or formatters["markdown"]
        )
    else:
        exporter = all_exporters["notes"](
            formatter=formatter or formatters["markdown-atx"],
            edit=edit,
            git=git,
            force=force,
        )

    collected: list[tuple[Document, list[Annotation]]] = []
    for doc in documents:
        found: list[Annotation] = []
        # Number of extractors for which ``extraction.start`` returned
        # something other than ``None`` for this document.
        handled: int = 0
        for extractor in filter(None, extractors):
            result = extraction.start(extractor, doc)
            if result is None:
                continue
            handled += 1
            found.extend(result)
        # The document is recorded even when nothing was extracted for it.
        collected.append((doc, found))

        if handled != 0:
            continue
        # have to remove curlys or papis logger gets upset
        desc = re.sub("[{}]", "", papis.document.describe(doc))
        logger.info(f"Document {desc} has no valid extractors for any of its files.")

    exporter.run(collected)
|