Compare commits
No commits in common. "af885e0083a27eee7205599ecb96c512f5145800" and "999a4c88cc39f6cab12ebb6d8a1064ae38f857af" have entirely different histories.
af885e0083
...
999a4c88cc
1 changed file with 35 additions and 81 deletions
|
@@ -8,12 +8,10 @@ import Levenshtein
|
||||||
from pubs.plugins import PapersPlugin
|
from pubs.plugins import PapersPlugin
|
||||||
from pubs.events import DocAddEvent, NoteEvent
|
from pubs.events import DocAddEvent, NoteEvent
|
||||||
|
|
||||||
from pubs import repo, pretty
|
from pubs import repo
|
||||||
from pubs.utils import resolve_citekey_list
|
from pubs.utils import resolve_citekey_list
|
||||||
from pubs.content import check_file, read_text_file, write_file
|
from pubs.content import check_file, read_text_file, write_file
|
||||||
from pubs.query import get_paper_filter
|
|
||||||
|
|
||||||
CONFIRMATION_PAPER_THRESHOLD=5
|
|
||||||
|
|
||||||
class ExtractPlugin(PapersPlugin):
|
class ExtractPlugin(PapersPlugin):
|
||||||
"""Extract annotations from any pdf document.
|
"""Extract annotations from any pdf document.
|
||||||
|
@@ -38,18 +36,32 @@ class ExtractPlugin(PapersPlugin):
|
||||||
self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
|
self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
|
||||||
self.broker = self.repository.databroker
|
self.broker = self.repository.databroker
|
||||||
|
|
||||||
settings = conf["plugins"].get("extract", {})
|
# TODO implement custom annotation formatting, akin to main config citekey format
|
||||||
self.on_import = settings.get("on_import", False)
|
# e.g. `> [{page}] {annotation}`
|
||||||
self.minimum_similarity = float(settings.get("minimum_similarity", 0.75))
|
# or `:: {annotation} :: {page} ::`
|
||||||
self.formatting = settings.get(
|
# and so on
|
||||||
"formatting",
|
self.on_import = conf["plugins"].get("extract", {}).get("on_import", False)
|
||||||
"{newline}{quote_begin}> {quote} {quote_end}[{page}]{note_begin}{newline}Note: {note}{note_end}",
|
self.minimum_similarity = float(
|
||||||
|
conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)
|
||||||
|
)
|
||||||
|
self.formatting = (
|
||||||
|
conf["plugins"]
|
||||||
|
.get("extract", {})
|
||||||
|
.get(
|
||||||
|
"formatting",
|
||||||
|
"{newline}{quote_begin}> {quote} {quote_end}[{page}]{note_begin}{newline}Note: {note}{note_end}",
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def update_parser(self, subparsers, _):
|
def update_parser(self, subparsers, conf):
|
||||||
"""Allow the usage of the pubs extract subcommand"""
|
"""Allow the usage of the pubs extract subcommand"""
|
||||||
# TODO option for ignoring missing documents or erroring.
|
# TODO option for ignoring missing documents or erroring.
|
||||||
extract_parser = subparsers.add_parser(self.name, help=self.description)
|
extract_parser = subparsers.add_parser(self.name, help=self.description)
|
||||||
|
extract_parser.add_argument(
|
||||||
|
"citekeys",
|
||||||
|
nargs=argparse.REMAINDER,
|
||||||
|
help="citekey(s) of the documents to extract from",
|
||||||
|
)
|
||||||
extract_parser.add_argument(
|
extract_parser.add_argument(
|
||||||
"-w",
|
"-w",
|
||||||
"--write",
|
"--write",
|
||||||
|
@@ -64,58 +76,29 @@ class ExtractPlugin(PapersPlugin):
|
||||||
action="store_true",
|
action="store_true",
|
||||||
default=False,
|
default=False,
|
||||||
)
|
)
|
||||||
extract_parser.add_argument(
|
|
||||||
"-q",
|
|
||||||
"--query",
|
|
||||||
help="Query library instead of providing individual citekeys. For query help see pubs list command.",
|
|
||||||
action="store_true",
|
|
||||||
default=None,
|
|
||||||
dest="is_query",
|
|
||||||
)
|
|
||||||
extract_parser.add_argument(
|
|
||||||
"-i",
|
|
||||||
"--ignore-case",
|
|
||||||
action="store_false",
|
|
||||||
default=None,
|
|
||||||
dest="case_sensitive",
|
|
||||||
help="When using query mode, perform case insensitive search.",
|
|
||||||
)
|
|
||||||
extract_parser.add_argument(
|
|
||||||
"-I",
|
|
||||||
"--force-case",
|
|
||||||
action="store_true",
|
|
||||||
dest="case_sensitive",
|
|
||||||
help="When using query mode, perform case sensitive search.",
|
|
||||||
)
|
|
||||||
extract_parser.add_argument(
|
|
||||||
"--strict",
|
|
||||||
action="store_true",
|
|
||||||
default=False,
|
|
||||||
help="Force strict unicode comparison of query.",
|
|
||||||
)
|
|
||||||
extract_parser.add_argument(
|
|
||||||
"query",
|
|
||||||
nargs=argparse.REMAINDER,
|
|
||||||
help="Citekey(s)/query for the documents to extract from.",
|
|
||||||
)
|
|
||||||
extract_parser.set_defaults(func=self.command)
|
extract_parser.set_defaults(func=self.command)
|
||||||
|
|
||||||
def command(self, conf, args):
|
def command(self, conf, args):
|
||||||
"""Run the annotation extraction command."""
|
"""Run the annotation extraction command."""
|
||||||
papers = self._gather_papers(conf, args)
|
citekeys = resolve_citekey_list(
|
||||||
all_annotations = self.extract(papers)
|
self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True
|
||||||
|
)
|
||||||
|
if not citekeys:
|
||||||
|
return
|
||||||
|
all_annotations = self.extract(citekeys)
|
||||||
if args.write:
|
if args.write:
|
||||||
self._to_notes(all_annotations, self.note_extension, args.edit)
|
self._to_notes(all_annotations, self.note_extension, args.edit)
|
||||||
else:
|
else:
|
||||||
self._to_stdout(all_annotations)
|
self._to_stdout(all_annotations)
|
||||||
self.repository.close()
|
self.repository.close()
|
||||||
|
|
||||||
def extract(self, papers):
|
def extract(self, citekeys):
|
||||||
"""Extracts annotations from citekeys.
|
"""Extracts annotations from citekeys.
|
||||||
|
|
||||||
Returns all annotations belonging to the papers that
|
Returns all annotations belonging to the papers that
|
||||||
are described by the citekeys passed in.
|
are described by the citekeys passed in.
|
||||||
"""
|
"""
|
||||||
|
papers = self._gather_papers(citekeys)
|
||||||
papers_annotated = []
|
papers_annotated = []
|
||||||
for paper in papers:
|
for paper in papers:
|
||||||
file = self._get_file(paper)
|
file = self._get_file(paper)
|
||||||
|
@@ -125,44 +108,15 @@ class ExtractPlugin(PapersPlugin):
|
||||||
self.ui.error(f"Document {file} is broken: {e}")
|
self.ui.error(f"Document {file} is broken: {e}")
|
||||||
return papers_annotated
|
return papers_annotated
|
||||||
|
|
||||||
def _gather_papers(self, conf, args):
|
def _gather_papers(self, citekeys):
|
||||||
"""Get all papers for citekeys.
|
"""Get all papers for citekeys.
|
||||||
|
|
||||||
Returns all Paper objects described by the citekeys
|
Returns all Paper objects described by the citekeys
|
||||||
passed in.
|
passed in.
|
||||||
"""
|
"""
|
||||||
papers = []
|
papers = []
|
||||||
if not args.is_query:
|
for key in citekeys:
|
||||||
keys = resolve_citekey_list(
|
papers.append(self.repository.pull_paper(key))
|
||||||
self.repository, conf, args.query, ui=self.ui, exit_on_fail=True
|
|
||||||
)
|
|
||||||
if not keys:
|
|
||||||
return []
|
|
||||||
for key in keys:
|
|
||||||
papers.append(self.repository.pull_paper(key))
|
|
||||||
else:
|
|
||||||
papers = list(
|
|
||||||
filter(
|
|
||||||
get_paper_filter(
|
|
||||||
args.query,
|
|
||||||
case_sensitive=args.case_sensitive,
|
|
||||||
strict=args.strict,
|
|
||||||
),
|
|
||||||
self.repository.all_papers(),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if len(papers) > CONFIRMATION_PAPER_THRESHOLD:
|
|
||||||
self.ui.message(
|
|
||||||
"\n".join(
|
|
||||||
pretty.paper_oneliner(
|
|
||||||
p, citekey_only=False, max_authors=conf["main"]["max_authors"]
|
|
||||||
)
|
|
||||||
for p in papers
|
|
||||||
)
|
|
||||||
)
|
|
||||||
self.ui.input_yn(
|
|
||||||
question=f"Extract annotations for these papers?", default="y"
|
|
||||||
)
|
|
||||||
return papers
|
return papers
|
||||||
|
|
||||||
def _get_file(self, paper):
|
def _get_file(self, paper):
|
||||||
|
@@ -253,9 +207,9 @@ class ExtractPlugin(PapersPlugin):
|
||||||
paper = contents[0]
|
paper = contents[0]
|
||||||
annotations = contents[1]
|
annotations = contents[1]
|
||||||
if annotations:
|
if annotations:
|
||||||
output += f"------ {paper.citekey} ------\n"
|
output += f"------ {paper.citekey} ------\n\n"
|
||||||
for annot in annotations:
|
for annot in annotations:
|
||||||
output += f"{annot}\n"
|
output += f"{annot}\n\n"
|
||||||
output += "\n"
|
output += "\n"
|
||||||
print(output)
|
print(output)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue