diff --git a/extract/extract.py b/extract/extract.py index f25bd73..88b6b8d 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -8,12 +8,10 @@ import Levenshtein from pubs.plugins import PapersPlugin from pubs.events import DocAddEvent, NoteEvent -from pubs import repo, pretty +from pubs import repo from pubs.utils import resolve_citekey_list from pubs.content import check_file, read_text_file, write_file -from pubs.query import get_paper_filter -CONFIRMATION_PAPER_THRESHOLD=5 class ExtractPlugin(PapersPlugin): """Extract annotations from any pdf document. @@ -38,18 +36,32 @@ class ExtractPlugin(PapersPlugin): self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"]) self.broker = self.repository.databroker - settings = conf["plugins"].get("extract", {}) - self.on_import = settings.get("on_import", False) - self.minimum_similarity = float(settings.get("minimum_similarity", 0.75)) - self.formatting = settings.get( - "formatting", - "{newline}{quote_begin}> {quote} {quote_end}[{page}]{note_begin}{newline}Note: {note}{note_end}", + # TODO implement custom annotation formatting, akin to main config citekey format + # e.g. `> [{page}] {annotation}` + # or `:: {annotation} :: {page} ::` + # and so on + self.on_import = conf["plugins"].get("extract", {}).get("on_import", False) + self.minimum_similarity = float( + conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75) + ) + self.formatting = ( + conf["plugins"] + .get("extract", {}) + .get( + "formatting", + "{newline}{quote_begin}> {quote} {quote_end}[{page}]{note_begin}{newline}Note: {note}{note_end}", + ) ) - def update_parser(self, subparsers, _): + def update_parser(self, subparsers, conf): """Allow the usage of the pubs extract subcommand""" # TODO option for ignoring missing documents or erroring. extract_parser = subparsers.add_parser(self.name, help=self.description) + extract_parser.add_argument( + "citekeys", + nargs=argparse.REMAINDER, + help="citekey(s) of the documents to extract from", + ) extract_parser.add_argument( "-w", "--write", @@ -64,58 +76,29 @@ class ExtractPlugin(PapersPlugin): action="store_true", default=False, ) - extract_parser.add_argument( - "-q", - "--query", - help="Query library instead of providing individual citekeys. For query help see pubs list command.", - action="store_true", - default=None, - dest="is_query", - ) - extract_parser.add_argument( - "-i", - "--ignore-case", - action="store_false", - default=None, - dest="case_sensitive", - help="When using query mode, perform case insensitive search.", - ) - extract_parser.add_argument( - "-I", - "--force-case", - action="store_true", - dest="case_sensitive", - help="When using query mode, perform case sensitive search.", - ) - extract_parser.add_argument( - "--strict", - action="store_true", - default=False, - help="Force strict unicode comparison of query.", - ) - extract_parser.add_argument( - "query", - nargs=argparse.REMAINDER, - help="Citekey(s)/query for the documents to extract from.", - ) extract_parser.set_defaults(func=self.command) def command(self, conf, args): """Run the annotation extraction command.""" - papers = self._gather_papers(conf, args) - all_annotations = self.extract(papers) + citekeys = resolve_citekey_list( + self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True + ) + if not citekeys: + return + all_annotations = self.extract(citekeys) if args.write: self._to_notes(all_annotations, self.note_extension, args.edit) else: self._to_stdout(all_annotations) self.repository.close() - def extract(self, papers): + def extract(self, citekeys): """Extracts annotations from citekeys. Returns all annotations belonging to the papers that are described by the citekeys passed in. """ + papers = self._gather_papers(citekeys) papers_annotated = [] for paper in papers: file = self._get_file(paper) @@ -125,44 +108,15 @@ class ExtractPlugin(PapersPlugin): self.ui.error(f"Document {file} is broken: {e}") return papers_annotated - def _gather_papers(self, conf, args): + def _gather_papers(self, citekeys): """Get all papers for citekeys. Returns all Paper objects described by the citekeys passed in. """ papers = [] - if not args.is_query: - keys = resolve_citekey_list( - self.repository, conf, args.query, ui=self.ui, exit_on_fail=True - ) - if not keys: - return [] - for key in keys: - papers.append(self.repository.pull_paper(key)) - else: - papers = list( - filter( - get_paper_filter( - args.query, - case_sensitive=args.case_sensitive, - strict=args.strict, - ), - self.repository.all_papers(), - ) - ) - if len(papers) > CONFIRMATION_PAPER_THRESHOLD: - self.ui.message( - "\n".join( - pretty.paper_oneliner( - p, citekey_only=False, max_authors=conf["main"]["max_authors"] - ) - for p in papers - ) - ) - self.ui.input_yn( - question=f"Extract annotations for these papers?", default="y" - ) + for key in citekeys: + papers.append(self.repository.pull_paper(key)) return papers def _get_file(self, paper): @@ -253,9 +207,9 @@ class ExtractPlugin(PapersPlugin): paper = contents[0] annotations = contents[1] if annotations: - output += f"------ {paper.citekey} ------\n" + output += f"------ {paper.citekey} ------\n\n" for annot in annotations: - output += f"{annot}\n" + output += f"{annot}\n\n" output += "\n" print(output)