Extend formatting with sections

2022-12-23 00:27:19 +01:00 · 2022-12-23 00:27:19 +01:00 · 59ff74475d
commit 59ff74475d
parent a3e2d8693d
2 changed files with 54 additions and 16 deletions
--- a/README.md
+++ b/README.md
@ -20,21 +20,39 @@ active = extract

 [[extract]]
 on_import = False
-quote_prefix = "> "
-note_prefix = "Note: "
+formatting = "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}"
 minimum_similarity = 0.75
 ```

 If `on_import` is `True` extraction is automatically run whenever a new document is added to the library,
 if false extraction has to be handled manually.

-`quote_prefix` and `note_prefix` define what is put in front of the quoted part of an annotation and the annotator's own notes respectively, so that ultimately a note (by default) looks like this:
+`formatting` takes a string with a variety of template options. You can use any of the following:
+
+- `{page}`: The page number the annotation was found on.
+- `{quote}`: The actual quoted string (i.e. highlighted).
+- `{note}`: The annotation note (i.e. the text added by the reader).
+- `{quote_begin}`: Mark the area that begins a quotation. Useful to get rid of prefixes or suffixes if no quotation exists. Requires a corresponding `{quote_end}` to be set after.
+- `{note_begin}`: Same idea for the note area. Requires a corresponding `{note_end}` to be set after.
+
+For example, the default formatting string will result in this output:

 ```markdown
-[4] > came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop
+[4]
+> came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop
 Note: Often illegally connected to network, ‘revolution of the thirsty’
 ```

+The formatting string `{quote_begin}Quote: {quote} [{page}]{newline}{quote_end}{note_begin}Note: {note}{note_end}`
+will result in the following output:
+
+```markdown
+Quote: came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop [4]
+Note: Often illegally connected to network, ‘revolution of the thirsty’
+```
+
+Note, however, that in this example the page number is *only* displayed if the annotation contains a quote.
+
 `minimum_similarity` sets the required similarity of an annotation's note and written words to be viewed
 as one. Any annotation that has both and is *under* the minimum similarity will be added in the following form:

--- a/extract/extract.py
+++ b/extract/extract.py
@ -1,4 +1,5 @@
 import os
+import re
 import argparse

 import fitz
@ -46,7 +47,10 @@ class ExtractPlugin(PapersPlugin):
        self.formatting = (
            conf["plugins"]
            .get("extract", {})
-            .get("formatting", "> {quote} [{page}]\nNote: {note}")
+            .get(
+                "formatting",
+                "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}",
+            )
        )

    def update_parser(self, subparsers, conf):
@ -139,19 +143,35 @@ class ExtractPlugin(PapersPlugin):
            for page in doc:
                for annot in page.annots():
                    quote, note = self._retrieve_annotation_content(page, annot)
-
-                    replacements = [
-                        ("{quote}", quote),
-                        ("{note}", note),
-                        ("{page}", str(page.number)),
-                    ]
-                    output = self.formatting
-                    for rep in replacements:
-                        output = output.replace(rep[0], rep[1])
-
-                    annotations.append(output)
+                    annotations.append(
+                        self._format_annotation(quote, note, page.number or 0)
+                    )
        return annotations

+    def _format_annotation(self, quote, note, pagenumber=0):
+        """Render one annotation using the ``self.formatting`` template.
+
+        Text enclosed by ``{quote_begin}``/``{quote_end}`` is dropped when
+        ``quote`` is empty, and text enclosed by ``{note_begin}``/``{note_end}``
+        is dropped when ``note`` is empty. Any remaining section markers are
+        then stripped and the ``{quote}``, ``{note}``, ``{page}`` and
+        ``{newline}`` placeholders are substituted.
+
+        Args:
+            quote: The quoted text of the annotation.
+            note: The note text of the annotation.
+            pagenumber: Value rendered in place of ``{page}``.
+
+        Returns:
+            The formatted annotation as a string.
+        """
+        output = self.formatting
+        replacements = {
+            "{quote}": quote,
+            "{note}": note,
+            "{page}": str(pagenumber),
+            "{newline}": "\n",
+        }
+        # DOTALL lets an optional section span literal newlines in the
+        # template; the non-greedy match keeps each begin/end pair separate
+        # so text between two sections of the same kind is not swallowed.
+        if not note:
+            output = re.sub(
+                r"\{note_begin\}.*?\{note_end\}", "", output, flags=re.DOTALL
+            )
+        if not quote:
+            output = re.sub(
+                r"\{quote_begin\}.*?\{quote_end\}", "", output, flags=re.DOTALL
+            )
+        # Whatever markers are left belong to sections that are kept.
+        for marker in ("{note_begin}", "{note_end}", "{quote_begin}", "{quote_end}"):
+            output = output.replace(marker, "")
+        # Substitute all placeholders in a single pass so that text inserted
+        # for one placeholder (e.g. a quote containing "{note}") is never
+        # re-scanned and replaced again.
+        pattern = re.compile(
+            "|".join(
+                re.escape(k) for k in sorted(replacements, key=len, reverse=True)
+            )
+        )
+        return pattern.sub(lambda match: replacements[match.group(0)], output)
+
    def _retrieve_annotation_content(self, page, annotation):
        """Gets the text content of an annotation.