From 4281dba9c2940ef7dfe6645fe61b2bf9c4fe3eaf Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 22:48:47 +0100 Subject: [PATCH 1/9] Fix typo in on_import configuration option --- extract/extract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extract/extract.py b/extract/extract.py index a1ed4c8..781267b 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -39,7 +39,7 @@ class ExtractPlugin(PapersPlugin): # e.g. `> [{page}] {annotation}` # or `:: {annotation} :: {page} ::` # and so on - self.onimport = conf["plugins"].get("extract", {}).get("onimport", False) + self.on_import = conf["plugins"].get("extract", {}).get("on_import", False) self.minimum_similarity = float(conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)) def update_parser(self, subparsers, conf): @@ -226,7 +226,7 @@ class ExtractPlugin(PapersPlugin): def modify_event(event): if ExtractPlugin.is_loaded(): plg = ExtractPlugin.get_instance() - if plg.onimport: + if plg.on_import: all_annotations = plg.extract([event.citekey]) if all_annotations[0][1]: plg._to_notes(all_annotations, plg.note_extension) From 82b99e84200d4cf3df64a7036fbcff9c0cd6159e Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 22:55:25 +0100 Subject: [PATCH 2/9] Remove message spew on automatic import --- extract/extract.py | 1 - 1 file changed, 1 deletion(-) diff --git a/extract/extract.py b/extract/extract.py index 781267b..a53dbd9 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -230,4 +230,3 @@ def modify_event(event): all_annotations = plg.extract([event.citekey]) if all_annotations[0][1]: plg._to_notes(all_annotations, plg.note_extension) - plg.ui.info(f"Imported {event.citekey} annotations.") From b65c2a3be6dddf65c6a0257a97b04c5f28475829 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 22:56:07 +0100 Subject: [PATCH 3/9] Add independent note saving --- extract/extract.py | 6 ++++++ 1 file changed, 6 insertions(+) diff 
--git a/extract/extract.py b/extract/extract.py index a53dbd9..080ce2f 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -148,10 +148,16 @@ class ExtractPlugin(PapersPlugin): content = annotation.info["content"].replace("\n", " ") written = page.get_textbox(annotation.rect).replace("\n", " ") + # highlight with selection in note if Levenshtein.ratio(content,written) > self.minimum_similarity: return content + # an independent note, not a highlight + elif content and not written: + return content + # both a highlight and a note elif content: return f"{written}{connector}{content}" + # highlight with selection not in note return written def _to_stdout(self, annotated_papers): From e255405d96e2ab6ce78077f8804710c53b8b77c1 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 23:01:14 +0100 Subject: [PATCH 4/9] Refactor highlight prefix setting --- extract/extract.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/extract/extract.py b/extract/extract.py index 080ce2f..0d127aa 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -136,7 +136,7 @@ class ExtractPlugin(PapersPlugin): annotations.append(f"[{(page.number or 0) + 1}] {content}") return annotations - def _retrieve_annotation_content(self, page, annotation, connector = "\nNote: "): + def _retrieve_annotation_content(self, page, annotation, highlight_prefix = "> ", note_prefix = "Note: "): """Gets the text content of an annotation. Returns the actual content of an annotation. 
Sometimes @@ -150,15 +150,15 @@ class ExtractPlugin(PapersPlugin): # highlight with selection in note if Levenshtein.ratio(content,written) > self.minimum_similarity: - return content + return f"{highlight_prefix}{content}" # an independent note, not a highlight elif content and not written: - return content + return f"{note_prefix}{content}" # both a highlight and a note elif content: - return f"{written}{connector}{content}" + return f"{highlight_prefix}{written}\n{note_prefix}{content}" # highlight with selection not in note - return written + return f"{highlight_prefix}{written}" def _to_stdout(self, annotated_papers): """Write annotations to stdout. @@ -173,7 +173,7 @@ class ExtractPlugin(PapersPlugin): if annotations: output += f"{paper.citekey}\n" for annot in annotations: - output += f'> "{annot}"\n' + output += f'{annot}\n\n' output += "\n" print(output) From f103cc51b5822f77c7ff79c589fb77f05dda577b Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 23:01:43 +0100 Subject: [PATCH 5/9] Add slight visual flair to stdout citekey display --- extract/extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract/extract.py b/extract/extract.py index 0d127aa..044b84d 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -171,7 +171,7 @@ class ExtractPlugin(PapersPlugin): paper = contents[0] annotations = contents[1] if annotations: - output += f"{paper.citekey}\n" + output += f"------ {paper.citekey} ------\n\n" for annot in annotations: output += f'{annot}\n\n' output += "\n" From 8f01b93de2889d2c556a855f46d7db08899dd393 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 23:09:43 +0100 Subject: [PATCH 6/9] Add configurable note and quote prefixes --- README.md | 9 +++++++++ extract/extract.py | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9ca4328..a7b5c96 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,21 @@ active = extract [[extract]] on_import = 
False +quote_prefix = "> " +note_prefix = "Note: " minimum_similarity = 0.75 ``` If `on_import` is `True` extraction is automatically run whenever a new document is added to the library, if false extraction has to be handled manually. +`quote_prefix` and `note_prefix` define what is put in front of the quoted part of an annotation and the annotator's own notes respectively, so that ultimately a note (by default) looks like this: + +```markdown +[4] > came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop +Note: Often illegally connected to network, ‘revolution of the thirsty’ +``` + `minimum_similarity` sets the required similarity of an annotation's note and written words to be viewed as one. Any annotation that has both and is *under* the minimum similarity will be added in the following form: diff --git a/extract/extract.py b/extract/extract.py index 044b84d..221f0b5 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -41,6 +41,8 @@ class ExtractPlugin(PapersPlugin): # and so on self.on_import = conf["plugins"].get("extract", {}).get("on_import", False) self.minimum_similarity = float(conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)) + self.highlight_prefix = conf["plugins"].get("extract", {}).get("quote_prefix", "> ") + self.note_prefix = conf["plugins"].get("extract", {}).get("note_prefix", "Note: ") def update_parser(self, subparsers, conf): """Allow the usage of the pubs extract subcommand""" @@ -131,7 +133,7 @@ class ExtractPlugin(PapersPlugin): with fitz.Document(filename) as doc: for page in doc: for annot in page.annots(): - content = self._retrieve_annotation_content(page, annot) + content = self._retrieve_annotation_content(page, annot, self.highlight_prefix, self.note_prefix) if content: annotations.append(f"[{(page.number or 0) + 1}] 
{content}") return annotations From e3aacc4b15d39e3d0ff620e8eeabc7d4048d260a Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 23:21:13 +0100 Subject: [PATCH 7/9] Format file --- extract/extract.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/extract/extract.py b/extract/extract.py index 221f0b5..2d652ed 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -40,9 +40,15 @@ class ExtractPlugin(PapersPlugin): # or `:: {annotation} :: {page} ::` # and so on self.on_import = conf["plugins"].get("extract", {}).get("on_import", False) - self.minimum_similarity = float(conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)) - self.highlight_prefix = conf["plugins"].get("extract", {}).get("quote_prefix", "> ") - self.note_prefix = conf["plugins"].get("extract", {}).get("note_prefix", "Note: ") + self.minimum_similarity = float( + conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75) + ) + self.highlight_prefix = ( + conf["plugins"].get("extract", {}).get("quote_prefix", "> ") + ) + self.note_prefix = ( + conf["plugins"].get("extract", {}).get("note_prefix", "Note: ") + ) def update_parser(self, subparsers, conf): """Allow the usage of the pubs extract subcommand""" @@ -133,12 +139,16 @@ class ExtractPlugin(PapersPlugin): with fitz.Document(filename) as doc: for page in doc: for annot in page.annots(): - content = self._retrieve_annotation_content(page, annot, self.highlight_prefix, self.note_prefix) + content = self._retrieve_annotation_content( + page, annot, self.highlight_prefix, self.note_prefix + ) if content: annotations.append(f"[{(page.number or 0) + 1}] {content}") return annotations - def _retrieve_annotation_content(self, page, annotation, highlight_prefix = "> ", note_prefix = "Note: "): + def _retrieve_annotation_content( + self, page, annotation, highlight_prefix="> ", note_prefix="Note: " + ): """Gets the text content of an annotation. 
Returns the actual content of an annotation. Sometimes @@ -151,7 +161,7 @@ class ExtractPlugin(PapersPlugin): written = page.get_textbox(annotation.rect).replace("\n", " ") # highlight with selection in note - if Levenshtein.ratio(content,written) > self.minimum_similarity: + if Levenshtein.ratio(content, written) > self.minimum_similarity: return f"{highlight_prefix}{content}" # an independent note, not a highlight elif content and not written: @@ -175,7 +185,7 @@ class ExtractPlugin(PapersPlugin): if annotations: output += f"------ {paper.citekey} ------\n\n" for annot in annotations: - output += f'{annot}\n\n' + output += f"{annot}\n\n" output += "\n" print(output) From a3e2d8693dc51e60ab79da4f4079d949156328e4 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 23:47:50 +0100 Subject: [PATCH 8/9] Add formatting configuration --- extract/extract.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/extract/extract.py b/extract/extract.py index 2d652ed..03456b3 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -43,11 +43,10 @@ class ExtractPlugin(PapersPlugin): self.minimum_similarity = float( conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75) ) - self.highlight_prefix = ( - conf["plugins"].get("extract", {}).get("quote_prefix", "> ") - ) - self.note_prefix = ( - conf["plugins"].get("extract", {}).get("note_prefix", "Note: ") + self.formatting = ( + conf["plugins"] + .get("extract", {}) + .get("formatting", "> {quote} [{page}]\nNote: {note}") ) def update_parser(self, subparsers, conf): @@ -139,16 +138,21 @@ class ExtractPlugin(PapersPlugin): with fitz.Document(filename) as doc: for page in doc: for annot in page.annots(): - content = self._retrieve_annotation_content( - page, annot, self.highlight_prefix, self.note_prefix - ) - if content: - annotations.append(f"[{(page.number or 0) + 1}] {content}") + quote, note = self._retrieve_annotation_content(page, annot) + + 
replacements = [ + ("{quote}", quote), + ("{note}", note), + ("{page}", str(page.number)), + ] + output = self.formatting + for rep in replacements: + output = output.replace(rep[0], rep[1]) + + annotations.append(output) return annotations - def _retrieve_annotation_content( - self, page, annotation, highlight_prefix="> ", note_prefix="Note: " - ): + def _retrieve_annotation_content(self, page, annotation): """Gets the text content of an annotation. Returns the actual content of an annotation. Sometimes @@ -162,15 +166,15 @@ class ExtractPlugin(PapersPlugin): # highlight with selection in note if Levenshtein.ratio(content, written) > self.minimum_similarity: - return f"{highlight_prefix}{content}" + return (content, "") # an independent note, not a highlight elif content and not written: - return f"{note_prefix}{content}" + return ("", content) # both a highlight and a note elif content: - return f"{highlight_prefix}{written}\n{note_prefix}{content}" + return (written, content) # highlight with selection not in note - return f"{highlight_prefix}{written}" + return (written, "") def _to_stdout(self, annotated_papers): """Write annotations to stdout. 
From 59ff74475d272f51d46dfcf0178121b4b25f5ac2 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Fri, 23 Dec 2022 00:27:19 +0100 Subject: [PATCH 9/9] Extend formatting with sections --- README.md | 26 ++++++++++++++++++++++---- extract/extract.py | 44 ++++++++++++++++++++++++++++++++------------ 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index a7b5c96..7fda631 100644 --- a/README.md +++ b/README.md @@ -20,21 +20,39 @@ active = extract [[extract]] on_import = False -quote_prefix = "> " -note_prefix = "Note: " +formatting = "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}" minimum_similarity = 0.75 ``` If `on_import` is `True` extraction is automatically run whenever a new document is added to the library, if false extraction has to be handled manually. -`quote_prefix` and `note_prefix` define what is put in front of the quoted part of an annotation and the annotator's own notes respectively, so that ultimately a note (by default) looks like this: +`formatting` takes a string with a variety of template options. You can use any of the following: + +- `{page}`: The page number the annotation was found on. +- `{quote}`: The actual quoted string (i.e. highlighted). +- `{note}`: The annotation note (i.e. added by the reader). +- `{quote_begin}`: Mark the area that begins a quotation. Useful to get rid of prefixes or suffixes if no quotation exists. Requires a corresponding `{quote_end}` to be set after. +- `{note_begin}`: Same idea for the note area. Requires a corresponding `{note_end}` to be set after. + +For example, the default formatting string will result in this output: ```markdown -[4] > came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop +[4] +> came “urban rights” caused collective outrage. 
Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop Note: Often illegally connected to network, ‘revolution of the thirsty’ ``` +The formatting string `{quote_begin}Quote: {quote}{page}{newline}{quote_end}{note_begin}Note: {note}{note_end}` +will result in the following output: + +```markdown +Quote: came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop [4] +Note: Often illegally connected to network, ‘revolution of the thirsty’ +``` + +Note, however, that in this example the page number is *only* displayed if the annotation contains a quote. + `minimum_similarity` sets the required similarity of an annotation's note and written words to be viewed as one. Any annotation that has both and is *under* the minimum similarity will be added in the following form: diff --git a/extract/extract.py b/extract/extract.py index 03456b3..8ef81e9 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -1,4 +1,5 @@ import os +import re import argparse import fitz @@ -46,7 +47,10 @@ class ExtractPlugin(PapersPlugin): self.formatting = ( conf["plugins"] .get("extract", {}) - .get("formatting", "> {quote} [{page}]\nNote: {note}") + .get( + "formatting", + "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}", + ) ) def update_parser(self, subparsers, conf): @@ -139,19 +143,35 @@ class ExtractPlugin(PapersPlugin): for page in doc: for annot in page.annots(): quote, note = self._retrieve_annotation_content(page, annot) - - replacements = [ - ("{quote}", quote), - ("{note}", note), - ("{page}", str(page.number)), - ] - output = self.formatting - for rep in replacements: - output = output.replace(rep[0], rep[1]) - - 
annotations.append(output) + annotations.append( + self._format_annotation(quote, note, page.number or 0) + ) return annotations + def _format_annotation(self, quote, note, pagenumber=0): + output = self.formatting + replacements = { + "{quote}": quote, + "{note}": note, + "{page}": str(pagenumber), + "{newline}": "\n", + } + if note == "": + output = re.sub(r"{note_begin}.*{note_end}", "", output) + if quote == "": + output = re.sub(r"{quote_begin}.*{quote_end}", "", output) + output = re.sub(r"{note_begin}", "", output) + output = re.sub(r"{note_end}", "", output) + output = re.sub(r"{quote_begin}", "", output) + output = re.sub(r"{quote_end}", "", output) + pattern = re.compile( + "|".join( + [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] + ), + flags=re.DOTALL, + ) + return pattern.sub(lambda x: replacements[x.group(0)], output) + def _retrieve_annotation_content(self, page, annotation): """Gets the text content of an annotation.