From 7b27f3291da18caa0cd95c9f31e1dc85f03fa75d Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 21:02:01 +0100 Subject: [PATCH 1/5] Add simple docstrings --- extract/extract.py | 61 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/extract/extract.py b/extract/extract.py index 067547d..bcf3dcb 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -1,9 +1,6 @@ import os import argparse -# from subprocess import Popen, PIPE, STDOUT -# from pipes import quote as shell_quote - import fitz from pubs.plugins import PapersPlugin @@ -15,13 +12,16 @@ from pubs.content import check_file, read_text_file, write_file class ExtractPlugin(PapersPlugin): - """Make the pubs repository also a git repository. + """Extract annotations from any pdf document. - The git plugin creates a git repository in the pubs directory - and commit the changes to the pubs repository. + The extract plugin allows manual or automatic extraction of all annotations + contained in the pdf documents belonging to entries of the pubs library. - It also add the `pubs git` subcommand, so git commands can be executed - in the git repository from the command line. + It can write those changes to stdout or directly create and update notes + for the pubs entries. + + It adds a `pubs extract` subcommand through which it is invoked, but can + optionally run whenever a new document is imported for a pubs entry. 
""" name = "extract" @@ -39,12 +39,9 @@ class ExtractPlugin(PapersPlugin): # or `:: {annotation} :: {page} ::` # and so on self.onimport = conf["plugins"].get("extract", {}).get("onimport", False) - # self.manual = conf['plugins'].get('git', {}).get('manual', False) - # self.force_color = conf['plugins'].get('git', {}).get('force_color', True) - # self.list_of_changes = [] def update_parser(self, subparsers, conf): - """Allow the usage of the pubs git subcommand""" + """Allow the usage of the pubs extract subcommand""" # TODO option for ignoring missing documents or erroring. extract_parser = subparsers.add_parser(self.name, help=self.description) extract_parser.add_argument( @@ -69,7 +66,7 @@ class ExtractPlugin(PapersPlugin): extract_parser.set_defaults(func=self.command) def command(self, conf, args): - """Run the annotation extraction""" + """Run the annotation extraction command.""" citekeys = resolve_citekey_list( self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True ) @@ -83,6 +80,11 @@ class ExtractPlugin(PapersPlugin): self.repository.close() def extract(self, citekeys): + """Extracts annotations from citekeys. + + Returns all annotations belonging to the papers that + are described by the citekeys passed in. + """ papers = self._gather_papers(citekeys) papers_annotated = [] for paper in papers: @@ -94,18 +96,35 @@ class ExtractPlugin(PapersPlugin): return papers_annotated def _gather_papers(self, citekeys): + """Get all papers for citekeys. + + Returns all Paper objects described by the citekeys + passed in. + """ papers = [] for key in citekeys: papers.append(self.repository.pull_paper(key)) return papers def _get_file(self, paper): + """Get path of document belonging to paper. + + Returns the real path to the document which belongs + to the paper passed in. Emits a warning if no + document belongs to paper. 
+ """ path = self.broker.real_docpath(paper.docpath) if not path: self.ui.warning(f"{paper.citekey} has no valid document.") return path def _get_annotations(self, filename): + """Extract annotations from a file. + + Returns all readable annotations contained in the file + passed in. Only returns Highlight or Text annotations + currently. + """ annotations = [] with fitz.Document(filename) as doc: for page in doc: @@ -118,6 +137,11 @@ class ExtractPlugin(PapersPlugin): return annotations def _to_stdout(self, annotated_papers): + """Write annotations to stdout. + + Simply outputs the gathered annotations over stdout + ready to be passed on through pipelines etc. + """ output = "" for contents in annotated_papers: paper = contents[0] @@ -130,6 +154,12 @@ class ExtractPlugin(PapersPlugin): print(output) def _to_notes(self, annotated_papers, note_extension="txt", edit=False): + """Write annotations into pubs notes. + + Permanently writes the given annotations into notes + in the pubs notes directory. Creates new notes for + citekeys missing a note or appends to existing. + """ for contents in annotated_papers: paper = contents[0] annotations = contents[1] @@ -145,6 +175,11 @@ class ExtractPlugin(PapersPlugin): NoteEvent(paper.citekey).send() def _write_new_note(self, notepath, annotations): + """Create a new note containing the annotations. + + Will create a new note in the notes folder of pubs + and fill it with the annotations extracted from pdf. 
+ """ output = "# Annotations\n\n" for annotation in annotations: output += f"> {annotation}\n\n" From f0caf3592573663c4a15aa62325f9504863e6ac9 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 21:10:42 +0100 Subject: [PATCH 2/5] Refactor plugin to remove carrying config around --- extract/extract.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extract/extract.py b/extract/extract.py index bcf3dcb..5a3704f 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -29,7 +29,7 @@ class ExtractPlugin(PapersPlugin): def __init__(self, conf, ui): self.ui = ui - self.conf = conf + self.note_extension = conf["main"]["note_extension"] self.repository = repo.Repository(conf) self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"]) self.broker = self.repository.databroker @@ -74,7 +74,7 @@ class ExtractPlugin(PapersPlugin): return all_annotations = self.extract(citekeys) if args.write: - self._to_notes(all_annotations, conf["main"]["note_extension"], args.edit) + self._to_notes(all_annotations, self.note_extension, args.edit) else: self._to_stdout(all_annotations) self.repository.close() @@ -210,5 +210,5 @@ def modify_event(event): if plg.onimport: all_annotations = plg.extract([event.citekey]) if all_annotations[0][1]: - plg._to_notes(all_annotations, plg.conf["main"]["note_extension"]) + plg._to_notes(all_annotations, plg.note_extension) plg.ui.info(f"Imported {event.citekey} annotations.") From 9496a626c09db1141b0015d2aae7a43cb4e4a9c2 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 21:10:56 +0100 Subject: [PATCH 3/5] Add message on successful note writing --- extract/extract.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extract/extract.py b/extract/extract.py index 5a3704f..973d4eb 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -169,6 +169,7 @@ class ExtractPlugin(PapersPlugin): self._append_to_note(notepath, annotations) else: self._write_new_note(notepath, annotations) + 
self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.") if edit is True: self.ui.edit_file(notepath, temporary=False) From 86d06d7518615dd1dd1cad31c421bc28b997d0e2 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 21:42:56 +0100 Subject: [PATCH 4/5] Fix page numbering starting from zero --- extract/extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract/extract.py b/extract/extract.py index 973d4eb..93eca9b 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -133,7 +133,7 @@ class ExtractPlugin(PapersPlugin): "\n", "" ) if content: - annotations.append(f"[{page.number}] {content}") + annotations.append(f"[{(page.number or 0) + 1}] {content}") return annotations def _to_stdout(self, annotated_papers): From d14a95e18bb4785905c6dafdb75fed4c35dd1596 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 21:43:10 +0100 Subject: [PATCH 5/5] Add usage instructions to README --- README.md | 93 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 70 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index e861ddf..17395fe 100644 --- a/README.md +++ b/README.md @@ -2,38 +2,85 @@ Quickly extract annotations from your pdf files with the help of the pubs bibliography manager. -Installation: +## Installation: + +Still a bit painful since I have not set up any package management: Put `extract` folder in your pubs `plugs` directory. -Add extract to your plugin list in pubs configuration file. +Then add `extract` to your plugin list in the pubs configuration file. -Usage: +## Usage: -`pubs extract ` +`pubs extract [-h|-w|-e] ` -This readme is a stub so far, feel free to extend it and raise a PR if you have the time. -What follows is a not-very-sorted train of though on the plugin and pubs in general, -to keep my thoughts in one place while I work on it. 
+For example, to extract annotations from two entries, do: -## extractor plugin: +```bash +pubs extract Bayat2015 Peck2004 +``` -- extracts highlights and annotations from a doc file (e.g. using PyMuPDF) -- puts those in the annotation file of a doc in a customizable format -- option to have it automatically run after a file is updated? -- needs some way to delimit where it puts stuff and user stuff is in note - - one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between - - another, probably simpler first, is to just append missing annotations to the end of the note -- some highlights (or annotations in general) do not contain text as content - - pymupdf can extract the content of the underlying rectangle (mostly) - - issue is that sometimes the highlight contents are in content, sometimes a user comment instead - - we could have a comparison function which estimates how 'close' the two text snippets are and act accordingly -- config option to map colors in annotations to meaning ('read', 'important', 'extra') in pubs - - colors are given in very exact 0.6509979 RGB values, meaning we could once again estimate if a color is 'close enough' in distance to tag it accordingly -- make invoking the command run a query if `-e` option provided (or whatever) in pubs syntax and use resulting papers - - confirm? +This will print the extracted annotations to the commandline through stdout. -# would also be nice in pubs, missing for me +If you invoke the command with the `-w` option, it will write it into your notes instead: + +```bash +pubs extract -w Bayat2015 Peck2004 +``` + +Will create notes for the two entries in your pubs note directory and fill them with +the annotations. If a note already exists for any of the entries, it will instead append +the annotations to the end of it, dropping all those that it already finds in the note +(essentially only adding new annotations to the end). 
+ +**PLEASE** Be aware that so far, I spent a single afternoon coding this plugin, it +contains no tests and operates on your notes. In my use nothing too bad happened but +only use it with adequate backup in place, or with your library being version controlled. + +You can invoke the command with `-e` to instantly edit the notes: + +```bash +pubs extract -w -e Bayat2015 Peck2004 +``` + +Will create/append annotations and drop you into the Bayat2015 note and, when you close it, +directly into the Peck2004 note. Take care that it will be fairly annoying if you use this +option with hundreds of entries being annotated. + +To extract the annotations for all your existing entries in one go, you can use: + +```bash +pubs extract -w $(pubs list -k) +``` + +However, the warning for your notes' safety goes doubly for this command since it will touch +*most* or *all* of your notes, depending on how many entries in your library have pdfs attached. + +This readme is still a bit messy, feel free to extend it and raise a PR if you have the time. + +What follows is a not-very-sorted train of thought on where the plugin is at and where I +could see myself taking it one day, provided I find the time. +Pull requests tackling one of these areas are of course very welcome. + +## Roadmap: + +- [x] extracts highlights and annotations from a doc file (e.g. using PyMuPDF) +- [ ] puts those in the annotation file of a doc in a customizable format +- [x] option to have it automatically run after a file is added? + - option to have it run whenever a pdf in the library was updated? 
+- [ ] needs some way to delimit where it puts stuff and user stuff is in note + - [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between + - [x] another, probably simpler first, is to just append missing annotations to the end of the note +- [ ] some highlights (or annotations in general) do not contain text as content + - [ ] pymupdf can extract the content of the underlying rectangle (mostly) + - [ ] issue is that sometimes the highlight contents are in content, sometimes a user comment instead + - [ ] we could have a comparison function which estimates how 'close' the two text snippets are and act accordingly +- [ ] config option to map colors in annotations to meaning ('read', 'important', 'extra') in pubs + - [ ] colors are given in very exact 0.6509979 RGB values, meaning we could once again estimate if a color is 'close enough' in distance to tag it accordingly +- [ ] make invoking the command run a query if corresponding option provided (or whatever) in pubs syntax and use resulting papers + - [ ] confirm for many papers? + +## Things that would also be nice in pubs in general and don't really belong in this repository - `show` command which simply displays given entry in a nice way - could take multiple entries but present them all in the same larger way