From 488dd0eb41a6a78261e7b7a0b62acebd73637406 Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Sat, 24 Dec 2022 14:23:07 +0100
Subject: [PATCH] Allow running queries for papers to be extracted from

---
 extract/extract.py | 75 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 55 insertions(+), 20 deletions(-)

diff --git a/extract/extract.py b/extract/extract.py
index 1b38ebf..a0f1dba 100644
--- a/extract/extract.py
+++ b/extract/extract.py
@@ -11,6 +11,8 @@ from pubs.events import DocAddEvent, NoteEvent
 from pubs import repo
 from pubs.utils import resolve_citekey_list
 from pubs.content import check_file, read_text_file, write_file
+from pubs.query import get_paper_filter
+
 
 class ExtractPlugin(PapersPlugin):
     """Extract annotations from any pdf document.
@@ -35,10 +37,6 @@ class ExtractPlugin(PapersPlugin):
         self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
         self.broker = self.repository.databroker
 
-        # TODO implement custom annotation formatting, akin to main config citekey format
-        # e.g. `> [{page}] {annotation}`
-        # or `:: {annotation} :: {page} ::`
-        # and so on
         self.on_import = conf["plugins"].get("extract", {}).get("on_import", False)
         self.minimum_similarity = float(
             conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)
@@ -56,11 +54,6 @@ class ExtractPlugin(PapersPlugin):
         """Allow the usage of the pubs extract subcommand"""
         # TODO option for ignoring missing documents or erroring.
         extract_parser = subparsers.add_parser(self.name, help=self.description)
-        extract_parser.add_argument(
-            "citekeys",
-            nargs=argparse.REMAINDER,
-            help="citekey(s) of the documents to extract from",
-        )
         extract_parser.add_argument(
             "-w",
             "--write",
@@ -75,29 +68,58 @@ class ExtractPlugin(PapersPlugin):
             action="store_true",
             default=False,
         )
+        extract_parser.add_argument(
+            "-q",
+            "--query",
+            help="Query library instead of providing individual citekeys. For query help see pubs list command.",
+            action="store_true",
+            default=None,
+            dest="is_query",
+        )
+        extract_parser.add_argument(
+            "-i",
+            "--ignore-case",
+            action="store_false",
+            default=None,
+            dest="case_sensitive",
+            help="When using query mode, perform case insensitive search.",
+        )
+        extract_parser.add_argument(
+            "-I",
+            "--force-case",
+            action="store_true",
+            dest="case_sensitive",
+            help="When using query mode, perform case sensitive search.",
+        )
+        extract_parser.add_argument(
+            "--strict",
+            action="store_true",
+            default=False,
+            help="Force strict unicode comparison of query.",
+        )
+        extract_parser.add_argument(
+            "query",
+            nargs=argparse.REMAINDER,
+            help="Citekey(s)/query for the documents to extract from.",
+        )
         extract_parser.set_defaults(func=self.command)
 
     def command(self, conf, args):
         """Run the annotation extraction command."""
-        citekeys = resolve_citekey_list(
-            self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True
-        )
-        if not citekeys:
-            return
-        all_annotations = self.extract(citekeys)
+        papers = self._gather_papers(conf, args)
+        all_annotations = self.extract(papers)
         if args.write:
             self._to_notes(all_annotations, self.note_extension, args.edit)
         else:
             self._to_stdout(all_annotations)
         self.repository.close()
 
-    def extract(self, citekeys):
+    def extract(self, papers):
         """Extracts annotations from citekeys.
 
         Returns all annotations belonging to the papers that
         are described by the citekeys passed in.
         """
-        papers = self._gather_papers(citekeys)
         papers_annotated = []
         for paper in papers:
             file = self._get_file(paper)
@@ -107,15 +129,28 @@ class ExtractPlugin(PapersPlugin):
                 self.ui.error(f"Document {file} is broken: {e}")
         return papers_annotated
 
-    def _gather_papers(self, citekeys):
+    def _gather_papers(self, conf, args):
         """Get all papers for citekeys.
 
         Returns all Paper objects described by the citekeys
         passed in.
         """
         papers = []
-        for key in citekeys:
-            papers.append(self.repository.pull_paper(key))
+        if not args.is_query:
+            citekeys = resolve_citekey_list(
+                self.repository, conf, args.query, ui=self.ui, exit_on_fail=True
+            )
+            for key in citekeys:
+                papers.append(self.repository.pull_paper(key))
+        else:
+            matches = get_paper_filter(
+                args.query,
+                case_sensitive=args.case_sensitive,
+                strict=args.strict,
+            )
+            # Build a list so both modes return the same reusable type
+            # instead of a one-shot filter iterator.
+            papers = [p for p in self.repository.all_papers() if matches(p)]
         return papers
 
     def _get_file(self, paper):