From e201f6cf5f05ad73ca247cd521dce0ca6630e26d Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sat, 24 Dec 2022 13:41:12 +0100 Subject: [PATCH 1/4] Remove redundant newlines in stdout --- extract/extract.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/extract/extract.py b/extract/extract.py index 88b6b8d..1b38ebf 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -12,7 +12,6 @@ from pubs import repo from pubs.utils import resolve_citekey_list from pubs.content import check_file, read_text_file, write_file - class ExtractPlugin(PapersPlugin): """Extract annotations from any pdf document. @@ -207,9 +206,9 @@ class ExtractPlugin(PapersPlugin): paper = contents[0] annotations = contents[1] if annotations: - output += f"------ {paper.citekey} ------\n\n" + output += f"------ {paper.citekey} ------\n" for annot in annotations: - output += f"{annot}\n\n" + output += f"{annot}\n" output += "\n" print(output) From 488dd0eb41a6a78261e7b7a0b62acebd73637406 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sat, 24 Dec 2022 14:23:07 +0100 Subject: [PATCH 2/4] Allow running queries for papers to be extracted from --- extract/extract.py | 75 +++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 20 deletions(-) diff --git a/extract/extract.py b/extract/extract.py index 1b38ebf..a0f1dba 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -11,6 +11,8 @@ from pubs.events import DocAddEvent, NoteEvent from pubs import repo from pubs.utils import resolve_citekey_list from pubs.content import check_file, read_text_file, write_file +from pubs.query import get_paper_filter + class ExtractPlugin(PapersPlugin): """Extract annotations from any pdf document. @@ -35,10 +37,6 @@ class ExtractPlugin(PapersPlugin): self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"]) self.broker = self.repository.databroker - # TODO implement custom annotation formatting, akin to main config citekey format - # e.g. `> [{page}] {annotation}` - # or `:: {annotation} :: {page} ::` - # and so on self.on_import = conf["plugins"].get("extract", {}).get("on_import", False) self.minimum_similarity = float( conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75) @@ -56,11 +54,6 @@ class ExtractPlugin(PapersPlugin): """Allow the usage of the pubs extract subcommand""" # TODO option for ignoring missing documents or erroring. extract_parser = subparsers.add_parser(self.name, help=self.description) - extract_parser.add_argument( - "citekeys", - nargs=argparse.REMAINDER, - help="citekey(s) of the documents to extract from", - ) extract_parser.add_argument( "-w", "--write", @@ -75,29 +68,58 @@ class ExtractPlugin(PapersPlugin): action="store_true", default=False, ) + extract_parser.add_argument( + "-q", + "--query", + help="Query library instead of providing individual citekeys. For query help see pubs list command.", + action="store_true", + default=None, + dest="is_query", + ) + extract_parser.add_argument( + "-i", + "--ignore-case", + action="store_false", + default=None, + dest="case_sensitive", + help="When using query mode, perform case insensitive search.", + ) + extract_parser.add_argument( + "-I", + "--force-case", + action="store_true", + dest="case_sensitive", + help="When using query mode, perform case sensitive search.", + ) + extract_parser.add_argument( + "--strict", + action="store_true", + default=False, + help="Force strict unicode comparison of query.", + ) + extract_parser.add_argument( + "query", + nargs=argparse.REMAINDER, + help="Citekey(s)/query for the documents to extract from.", + ) extract_parser.set_defaults(func=self.command) def command(self, conf, args): """Run the annotation extraction command.""" - citekeys = resolve_citekey_list( - self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True - ) - if not citekeys: - return - all_annotations = self.extract(citekeys) + papers = self._gather_papers(conf, args) + all_annotations = self.extract(papers) if args.write: self._to_notes(all_annotations, self.note_extension, args.edit) else: self._to_stdout(all_annotations) self.repository.close() - def extract(self, citekeys): + def extract(self, papers): """Extracts annotations from citekeys. Returns all annotations belonging to the papers that are described by the citekeys passed in. """ - papers = self._gather_papers(citekeys) papers_annotated = [] for paper in papers: file = self._get_file(paper) @@ -107,15 +129,28 @@ class ExtractPlugin(PapersPlugin): self.ui.error(f"Document {file} is broken: {e}") return papers_annotated - def _gather_papers(self, citekeys): + def _gather_papers(self, conf, args): """Get all papers for citekeys. Returns all Paper objects described by the citekeys passed in. """ papers = [] - for key in citekeys: - papers.append(self.repository.pull_paper(key)) + if not args.is_query: + citekeys = resolve_citekey_list( + self.repository, conf, args.query, ui=self.ui, exit_on_fail=True + ) + for key in citekeys: + papers.append(self.repository.pull_paper(key)) + else: + papers = filter( + get_paper_filter( + args.query, + case_sensitive=args.case_sensitive, + strict=args.strict, + ), + self.repository.all_papers(), + ) return papers def _get_file(self, paper): From ca56bf3cdb288fb3950f34bb3201f56a1b26a320 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sat, 24 Dec 2022 14:34:18 +0100 Subject: [PATCH 3/4] Add confirmation to many-paper extraction --- extract/extract.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/extract/extract.py b/extract/extract.py index a0f1dba..11ec266 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -8,11 +8,12 @@ import Levenshtein from pubs.plugins import PapersPlugin from pubs.events import DocAddEvent, NoteEvent -from pubs import repo +from pubs import repo, pretty from pubs.utils import resolve_citekey_list from pubs.content import check_file, read_text_file, write_file from pubs.query import get_paper_filter +CONFIRMATION_PAPER_THRESHOLD=5 class ExtractPlugin(PapersPlugin): """Extract annotations from any pdf document. @@ -137,20 +138,25 @@ class ExtractPlugin(PapersPlugin): """ papers = [] if not args.is_query: - citekeys = resolve_citekey_list( + keys = resolve_citekey_list( self.repository, conf, args.query, ui=self.ui, exit_on_fail=True ) - for key in citekeys: + for key in keys: papers.append(self.repository.pull_paper(key)) else: - papers = filter( + papers = list(filter( get_paper_filter( args.query, case_sensitive=args.case_sensitive, strict=args.strict, ), self.repository.all_papers(), - ) + )) + if len(papers) > CONFIRMATION_PAPER_THRESHOLD: + self.ui.message('\n'.join( + pretty.paper_oneliner(p, citekey_only=False, max_authors=conf['main']['max_authors']) + for p in papers)) + self.ui.input_yn(question=f"Extract annotations for these papers?", default='y') return papers def _get_file(self, paper): From af885e0083a27eee7205599ecb96c512f5145800 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sat, 24 Dec 2022 17:01:06 +0100 Subject: [PATCH 4/4] Refactor initial configuration ingestion --- extract/extract.py | 54 +++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/extract/extract.py b/extract/extract.py index 11ec266..f25bd73 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -38,20 +38,15 @@ class ExtractPlugin(PapersPlugin): self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"]) self.broker = self.repository.databroker - self.on_import = conf["plugins"].get("extract", {}).get("on_import", False) - self.minimum_similarity = float( - conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75) - ) - self.formatting = ( - conf["plugins"] - .get("extract", {}) - .get( - "formatting", - "{newline}{quote_begin}> {quote} {quote_end}[{page}]{note_begin}{newline}Note: {note}{note_end}", - ) + settings = conf["plugins"].get("extract", {}) + self.on_import = settings.get("on_import", False) + self.minimum_similarity = float(settings.get("minimum_similarity", 0.75)) + self.formatting = settings.get( + "formatting", + "{newline}{quote_begin}> {quote} {quote_end}[{page}]{note_begin}{newline}Note: {note}{note_end}", ) - def update_parser(self, subparsers, conf): + def update_parser(self, subparsers, _): """Allow the usage of the pubs extract subcommand""" # TODO option for ignoring missing documents or erroring. extract_parser = subparsers.add_parser(self.name, help=self.description) @@ -141,22 +136,33 @@ class ExtractPlugin(PapersPlugin): keys = resolve_citekey_list( self.repository, conf, args.query, ui=self.ui, exit_on_fail=True ) + if not keys: + return [] for key in keys: papers.append(self.repository.pull_paper(key)) else: - papers = list(filter( - get_paper_filter( - args.query, - case_sensitive=args.case_sensitive, - strict=args.strict, - ), - self.repository.all_papers(), - )) + papers = list( + filter( + get_paper_filter( + args.query, + case_sensitive=args.case_sensitive, + strict=args.strict, + ), + self.repository.all_papers(), + ) + ) if len(papers) > CONFIRMATION_PAPER_THRESHOLD: - self.ui.message('\n'.join( - pretty.paper_oneliner(p, citekey_only=False, max_authors=conf['main']['max_authors']) - for p in papers)) - self.ui.input_yn(question=f"Extract annotations for these papers?", default='y') + self.ui.message( + "\n".join( + pretty.paper_oneliner( + p, citekey_only=False, max_authors=conf["main"]["max_authors"] + ) + for p in papers + ) + ) + self.ui.input_yn( + question=f"Extract annotations for these papers?", default="y" + ) return papers def _get_file(self, paper):