From 7b27f3291da18caa0cd95c9f31e1dc85f03fa75d Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Thu, 22 Dec 2022 21:02:01 +0100
Subject: [PATCH 1/5] Add simple docstrings

---
 extract/extract.py | 61 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/extract/extract.py b/extract/extract.py
index 067547d..bcf3dcb 100644
--- a/extract/extract.py
+++ b/extract/extract.py
@@ -1,9 +1,6 @@
 import os
 import argparse
 
-# from subprocess import Popen, PIPE, STDOUT
-# from pipes import quote as shell_quote
-
 import fitz
 
 from pubs.plugins import PapersPlugin
@@ -15,13 +12,16 @@ from pubs.content import check_file, read_text_file, write_file
 
 
 class ExtractPlugin(PapersPlugin):
-    """Make the pubs repository also a git repository.
+    """Extract annotations from any pdf document.
 
-    The git plugin creates a git repository in the pubs directory
-    and commit the changes to the pubs repository.
+    The extract plugin allows manual or automatic extraction of all annotations
+    contained in the pdf documents belonging to entries of the pubs library.
 
-    It also add the `pubs git` subcommand, so git commands can be executed
-    in the git repository from the command line.
+    It can write those changes to stdout or directly create and update notes
+    for the pubs entries.
+
+    It adds a `pubs extract` subcommand through which it is invoked, but can
+    optionally run whenever a new document is imported for a pubs entry.
     """
 
     name = "extract"
@@ -39,12 +39,9 @@ class ExtractPlugin(PapersPlugin):
         # or `:: {annotation} :: {page} ::`
         # and so on
         self.onimport = conf["plugins"].get("extract", {}).get("onimport", False)
-        # self.manual = conf['plugins'].get('git', {}).get('manual', False)
-        # self.force_color = conf['plugins'].get('git', {}).get('force_color', True)
-        # self.list_of_changes = []
 
     def update_parser(self, subparsers, conf):
-        """Allow the usage of the pubs git subcommand"""
+        """Allow the usage of the pubs extract subcommand"""
         # TODO option for ignoring missing documents or erroring.
         extract_parser = subparsers.add_parser(self.name, help=self.description)
         extract_parser.add_argument(
@@ -69,7 +66,7 @@ class ExtractPlugin(PapersPlugin):
         extract_parser.set_defaults(func=self.command)
 
     def command(self, conf, args):
-        """Run the annotation extraction"""
+        """Run the annotation extraction command."""
         citekeys = resolve_citekey_list(
             self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True
         )
@@ -83,6 +80,11 @@ class ExtractPlugin(PapersPlugin):
         self.repository.close()
 
     def extract(self, citekeys):
+        """Extracts annotations from citekeys.
+
+        Returns all annotations belonging to the papers that
+        are described by the citekeys passed in.
+        """
         papers = self._gather_papers(citekeys)
         papers_annotated = []
         for paper in papers:
@@ -94,18 +96,35 @@ class ExtractPlugin(PapersPlugin):
         return papers_annotated
 
     def _gather_papers(self, citekeys):
+        """Get all papers for citekeys.
+
+        Returns all Paper objects described by the citekeys
+        passed in.
+        """
         papers = []
         for key in citekeys:
             papers.append(self.repository.pull_paper(key))
         return papers
 
     def _get_file(self, paper):
+        """Get path of document belonging to paper.
+
+        Returns the real path to the document which belongs
+        to the paper passed in. Emits a warning if no
+        document belongs to paper.
+        """
         path = self.broker.real_docpath(paper.docpath)
         if not path:
             self.ui.warning(f"{paper.citekey} has no valid document.")
         return path
 
     def _get_annotations(self, filename):
+        """Extract annotations from a file.
+
+        Returns all readable annotations contained in the file
+        passed in. Only returns Highlight or Text annotations
+        currently.
+        """
         annotations = []
         with fitz.Document(filename) as doc:
             for page in doc:
@@ -118,6 +137,11 @@ class ExtractPlugin(PapersPlugin):
         return annotations
 
     def _to_stdout(self, annotated_papers):
+        """Write annotations to stdout.
+
+        Simply outputs the gathered annotations over stdout
+        ready to be passed on through pipelines etc.
+        """
         output = ""
         for contents in annotated_papers:
             paper = contents[0]
@@ -130,6 +154,12 @@ class ExtractPlugin(PapersPlugin):
         print(output)
 
     def _to_notes(self, annotated_papers, note_extension="txt", edit=False):
+        """Write annotations into pubs notes.
+
+        Permanently writes the given annotations into notes
+        in the pubs notes directory. Creates new notes for
+        citekeys missing a note or appends to existing.
+        """
         for contents in annotated_papers:
             paper = contents[0]
             annotations = contents[1]
@@ -145,6 +175,11 @@ class ExtractPlugin(PapersPlugin):
                 NoteEvent(paper.citekey).send()
 
     def _write_new_note(self, notepath, annotations):
+        """Create a new note containing the annotations.
+
+        Will create a new note in the notes folder of pubs
+        and fill it with the annotations extracted from pdf.
+        """
         output = "# Annotations\n\n"
         for annotation in annotations:
             output += f"> {annotation}\n\n"

From f0caf3592573663c4a15aa62325f9504863e6ac9 Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Thu, 22 Dec 2022 21:10:42 +0100
Subject: [PATCH 2/5] Refactor plugin to remove carrying config around

---
 extract/extract.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/extract/extract.py b/extract/extract.py
index bcf3dcb..5a3704f 100644
--- a/extract/extract.py
+++ b/extract/extract.py
@@ -29,7 +29,7 @@ class ExtractPlugin(PapersPlugin):
 
     def __init__(self, conf, ui):
         self.ui = ui
-        self.conf = conf
+        self.note_extension = conf["main"]["note_extension"]
         self.repository = repo.Repository(conf)
         self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
         self.broker = self.repository.databroker
@@ -74,7 +74,7 @@ class ExtractPlugin(PapersPlugin):
             return
         all_annotations = self.extract(citekeys)
         if args.write:
-            self._to_notes(all_annotations, conf["main"]["note_extension"], args.edit)
+            self._to_notes(all_annotations, self.note_extension, args.edit)
         else:
             self._to_stdout(all_annotations)
         self.repository.close()
@@ -210,5 +210,5 @@ def modify_event(event):
         if plg.onimport:
             all_annotations = plg.extract([event.citekey])
             if all_annotations[0][1]:
-                plg._to_notes(all_annotations, plg.conf["main"]["note_extension"])
+                plg._to_notes(all_annotations, plg.note_extension)
                 plg.ui.info(f"Imported {event.citekey} annotations.")

From 9496a626c09db1141b0015d2aae7a43cb4e4a9c2 Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Thu, 22 Dec 2022 21:10:56 +0100
Subject: [PATCH 3/5] Add message on successful note writing

---
 extract/extract.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/extract/extract.py b/extract/extract.py
index 5a3704f..973d4eb 100644
--- a/extract/extract.py
+++ b/extract/extract.py
@@ -169,6 +169,7 @@ class ExtractPlugin(PapersPlugin):
                     self._append_to_note(notepath, annotations)
                 else:
                     self._write_new_note(notepath, annotations)
+                self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.")
 
                 if edit is True:
                     self.ui.edit_file(notepath, temporary=False)

From 86d06d7518615dd1dd1cad31c421bc28b997d0e2 Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Thu, 22 Dec 2022 21:42:56 +0100
Subject: [PATCH 4/5] Fix page numbering starting from zero

---
 extract/extract.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extract/extract.py b/extract/extract.py
index 973d4eb..93eca9b 100644
--- a/extract/extract.py
+++ b/extract/extract.py
@@ -133,7 +133,7 @@ class ExtractPlugin(PapersPlugin):
                         "\n", ""
                     )
                     if content:
-                        annotations.append(f"[{page.number}] {content}")
+                        annotations.append(f"[{(page.number or 0) + 1}] {content}")
         return annotations
 
     def _to_stdout(self, annotated_papers):

From d14a95e18bb4785905c6dafdb75fed4c35dd1596 Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Thu, 22 Dec 2022 21:43:10 +0100
Subject: [PATCH 5/5] Add usage instructions to README

---
 README.md | 93 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 70 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index e861ddf..17395fe 100644
--- a/README.md
+++ b/README.md
@@ -2,38 +2,85 @@
 
 Quickly extract annotations from your pdf files with the help of the pubs bibliography manager.
 
-Installation:
+## Installation:
+
+Still a bit painful since I have not set up any package management:
 
 Put `extract` folder in your pubs `plugs` directory.
 
-Add extract to your plugin list in pubs configuration file.
+Then add `extract` to your plugin list in the pubs configuration file.
 
-Usage:
+## Usage:
 
-`pubs extract <citekeys>`
+`pubs extract [-h|-w|-e] <citekeys>`
 
-This readme is a stub so far, feel free to extend it and raise a PR if you have the time.
-What follows is a not-very-sorted train of though on the plugin and pubs in general,
-to keep my thoughts in one place while I work on it.
+For example, to extract annotations from two entries, do:
 
-## extractor plugin:
+```bash
+pubs extract Bayat2015 Peck2004
+```
 
-- extracts highlights and annotations from a doc file (e.g. using PyMuPDF)
-- puts those in the annotation file of a doc in a customizable format
-- option to have it automatically run after a file is updated?
-- needs some way to delimit where it puts stuff and user stuff is in note
-    - one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between
-    - another, probably simpler first, is to just append missing annotations to the end of the note
-- some highlights (or annotations in general) do not contain text as content
-    - pymupdf can extract the content of the underlying rectangle (mostly)
-    - issue is that sometimes the highlight contents are in content, sometimes a user comment instead
-        - we could have a comparison function which estimates how 'close' the two text snippets are and act accordingly
-- config option to map colors in annotations to meaning ('read', 'important', 'extra') in pubs
-    - colors are given in very exact 0.6509979 RGB values, meaning we could once again estimate if a color is 'close enough' in distance to tag it accordingly
-- make invoking the command run a query if `-e` option provided (or whatever) in pubs syntax and use resulting papers
-    - confirm?
+This will print the extracted annotations to the command line through stdout.
 
-# would also be nice in pubs, missing for me
+If you invoke the command with the `-w` option, it will write it into your notes instead:
+
+```bash
+pubs extract -w Bayat2015 Peck2004
+```
+
+This will create notes for the two entries in your pubs note directory and fill them with
+the annotations. If a note already exists for any of the entries, it will instead append
+the annotations to the end of it, dropping all those that it already finds in the note
+(essentially only adding new annotations to the end).
+
+**PLEASE** be aware that so far I have spent a single afternoon coding this plugin; it
+contains no tests and operates on your notes. In my use nothing too bad happened but
+only use it with adequate backup in place, or with your library being version controlled.
+
+You can invoke the command with `-e` to instantly edit the notes:
+
+```bash
+pubs extract -w -e Bayat2015 Peck2004
+```
+
+This will create/append annotations and drop you into the Bayat2015 note and, once you close it,
+directly into the Peck2004 note. Take care that it will be fairly annoying if you use this
+option with hundreds of entries being annotated.
+
+To extract the annotations for all your existing entries in one go, you can use:
+
+```bash
+pubs extract -w $(pubs list -k)
+```
+
+However, the warning for your notes' safety goes doubly for this command since it will touch
+*most* or *all* of your notes, depending on how many entries in your library have pdfs attached.
+
+This readme is still a bit messy, feel free to extend it and raise a PR if you have the time.
+
+What follows is a not-very-sorted train of thought on where the plugin is at and where I
+could see myself taking it one day, provided I find the time.
+Pull requests tackling one of these areas are of course very welcome.
+
+## Roadmap:
+
+- [x] extracts highlights and annotations from a doc file (e.g. using PyMuPDF)
+- [ ] puts those in the annotation file of a doc in a customizable format
+- [x] option to have it automatically run after a file is added?
+    - option to have it run whenever a pdf in the library was updated?
+- [ ] needs some way to delimit where it puts stuff and user stuff is in note
+    - [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between
+    - [x] another, probably simpler first, is to just append missing annotations to the end of the note
+- [ ] some highlights (or annotations in general) do not contain text as content
+    - [ ] pymupdf can extract the content of the underlying rectangle (mostly)
+    - [ ] issue is that sometimes the highlight contents are in content, sometimes a user comment instead
+        - [ ] we could have a comparison function which estimates how 'close' the two text snippets are and act accordingly
+- [ ] config option to map colors in annotations to meaning ('read', 'important', 'extra') in pubs
+    - [ ] colors are given in very exact 0.6509979 RGB values, meaning we could once again estimate if a color is 'close enough' in distance to tag it accordingly
+- [ ] make invoking the command run a query if corresponding option provided (or whatever) in pubs syntax and use resulting papers
+    - [ ] confirm for many papers?
+
+## Things that would also be nice in pubs in general and don't really belong in this repository
 
 - `show` command which simply displays given entry in a nice way
     - could take multiple entries but present them all in the same larger way