From 59ff74475d272f51d46dfcf0178121b4b25f5ac2 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Fri, 23 Dec 2022 00:27:19 +0100 Subject: [PATCH] Extend formatting with sections --- README.md | 26 ++++++++++++++++++++++---- extract/extract.py | 44 ++++++++++++++++++++++++++++++++------------ 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index a7b5c96..7fda631 100644 --- a/README.md +++ b/README.md @@ -20,21 +20,39 @@ active = extract [[extract]] on_import = False -quote_prefix = "> " -note_prefix = "Note: " +formatting = "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}" minimum_similarity = 0.75 ``` If `on_import` is `True` extraction is automatically run whenever a new document is added to the library, if false extraction has to be handled manually. -`quote_prefix` and `note_prefix` define what is put in front of the quoted part of an annotation and the annotator's own notes respectively, so that ultimately a note (by default) looks like this: +`formatting` takes a string with a variety of template options. You can use any of the following: + +- `{page}`: The page number the annotation was found on. +- `{quote}`: The actual quoted string (i.e. highlighted). +- `{note}`: The annotation note (i.e. added by the reader). +- `{quote_begin}`: Mark the area that begins a quotation. Useful to get rid of prefixes or suffixes if no quotation exists. Requires a corresponding `{quote_end}` to be set after. +- `{note_begin}`: Same idea for the note area. Requires a corresponding `{note_end}` to be set after. + +For example, the default formatting string will result in this output: ```markdown -[4] > came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop +[4] +> came “urban rights” caused collective outrage. 
Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop Note: Often illegally connected to network, ‘revolution of the thirsty’ ``` +The formatting string `{quote_begin}Quote: {quote}{page}{newline}{quote_end}{note_begin}Note: {note}{note_end}` +will result in the following output: + +```markdown +Quote: came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop [4] +Note: Often illegally connected to network, ‘revolution of the thirsty’ +``` + +Note, however, that in this example the page number is *only* displayed if the annotation contains a quote. + `minimum_similarity` sets the required similarity of an annotation's note and written words to be viewed as one. Any annotation that has both and is *under* the minimum similarity will be added in the following form: diff --git a/extract/extract.py b/extract/extract.py index 03456b3..8ef81e9 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -1,4 +1,5 @@ import os +import re import argparse import fitz @@ -46,7 +47,10 @@ class ExtractPlugin(PapersPlugin): self.formatting = ( conf["plugins"] .get("extract", {}) - .get("formatting", "> {quote} [{page}]\nNote: {note}") + .get( + "formatting", + "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}", + ) ) def update_parser(self, subparsers, conf): @@ -139,19 +143,35 @@ class ExtractPlugin(PapersPlugin): for page in doc: for annot in page.annots(): quote, note = self._retrieve_annotation_content(page, annot) - - replacements = [ - ("{quote}", quote), - ("{note}", note), - ("{page}", str(page.number)), - ] - output = self.formatting - for rep in replacements: - output = output.replace(rep[0], rep[1]) - - 
annotations.append(output) + annotations.append( + self._format_annotation(quote, note, page.number or 0) + ) return annotations + def _format_annotation(self, quote, note, pagenumber=0): + output = self.formatting + replacements = { + "{quote}": quote, + "{note}": note, + "{page}": str(pagenumber), + "{newline}": "\n", + } + if note == "": + output = re.sub(r"{note_begin}.*{note_end}", "", output) + if quote == "": + output = re.sub(r"{quote_begin}.*{quote_end}", "", output) + output = re.sub(r"{note_begin}", "", output) + output = re.sub(r"{note_end}", "", output) + output = re.sub(r"{quote_begin}", "", output) + output = re.sub(r"{quote_end}", "", output) + pattern = re.compile( + "|".join( + [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] + ), + flags=re.DOTALL, + ) + return pattern.sub(lambda x: replacements[x.group(0)], output) + def _retrieve_annotation_content(self, page, annotation): """Gets the text content of an annotation.