Extend formatting with sections

Add formatting configuration
Format file
2022-12-23 00:37:10 +01:00 · 2022-12-22 23:47:50 +01:00 · 2022-12-22 23:21:13 +01:00 · 2022-12-22 23:09:43 +01:00 · 2022-12-22 23:01:43 +01:00 · 2022-12-22 23:01:14 +01:00
2 changed files with 82 additions and 14 deletions
--- a/README.md
+++ b/README.md
@ -20,12 +20,39 @@ active = extract

 [[extract]]
 on_import = False
+formatting = "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}"
 minimum_similarity = 0.75
 ```

 If `on_import` is `True` extraction is automatically run whenever a new document is added to the library,
 if false extraction has to be handled manually.

+`formatting` takes a string with a variety of template options. You can use any of the following:
+
+- `{page}`: The page number the annotation was found on.
+- `{quote}`: The actual quoted string (i.e. highlighted).
+- `{note}`: The annotation note (i.e. text added by the reader).
+- `{quote_begin}`: Mark the area that begins a quotation. Useful to get rid of prefixes or suffixes if no quotation exists. Requires a corresponding `{quote_end}` to be set after.
+- `{note_begin}`: Same idea for the note area. Requires a corresponding `{note_end}` to be set after.
+
+For example, the default formatting string will result in this output:
+
+```markdown
+[4]
+> came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop
+Note: Often illegally connected to network, ‘revolution of the thirsty’
+```
+
+The formatting string `{quote_begin}Quote: {quote}{page}{newline}{quote_end}{note_begin}Note: {note}{note_end}`
+will result in the following output:
+
+```markdown
+Quote: came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop [4]
+Note: Often illegally connected to network, ‘revolution of the thirsty’
+```
+
+Note, however, that in this example the page number is *only* displayed if the annotation contains a quote.
+
 `minimum_similarity` sets the required similarity of an annotation's note and written words to be viewed
 as one. Any annotation that has both and is *under* the minimum similarity will be added in the following form:

--- a/extract/extract.py
+++ b/extract/extract.py
@ -1,4 +1,5 @@
 import os
+import re
 import argparse

 import fitz
@ -39,8 +40,18 @@ class ExtractPlugin(PapersPlugin):
        # e.g. `> [{page}] {annotation}`
        # or `:: {annotation} :: {page} ::`
        # and so on
-        self.onimport = conf["plugins"].get("extract", {}).get("onimport", False)
-        self.minimum_similarity = float(conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75))
+        self.on_import = conf["plugins"].get("extract", {}).get("on_import", False)
+        self.minimum_similarity = float(
+            conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)
+        )
+        self.formatting = (
+            conf["plugins"]
+            .get("extract", {})
+            .get(
+                "formatting",
+                "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}",
+            )
+        )

    def update_parser(self, subparsers, conf):
        """Allow the usage of the pubs extract subcommand"""
@ -131,12 +142,37 @@ class ExtractPlugin(PapersPlugin):
        with fitz.Document(filename) as doc:
            for page in doc:
                for annot in page.annots():
-                    content = self._retrieve_annotation_content(page, annot)
-                    if content:
-                        annotations.append(f"[{(page.number or 0) + 1}] {content}")
+                    quote, note = self._retrieve_annotation_content(page, annot)
+                    annotations.append(
+                        self._format_annotation(quote, note, page.number or 0)
+                    )
        return annotations

-    def _retrieve_annotation_content(self, page, annotation, connector = "\nNote: "):
+    def _format_annotation(self, quote, note, pagenumber=0):
+        output = self.formatting
+        replacements = {
+            "{quote}": quote,
+            "{note}": note,
+            "{page}": str(pagenumber),
+            "{newline}": "\n",
+        }
+        if note == "":
+            output = re.sub(r"{note_begin}.*{note_end}", "", output)
+        if quote == "":
+            output = re.sub(r"{quote_begin}.*{quote_end}", "", output)
+        output = re.sub(r"{note_begin}", "", output)
+        output = re.sub(r"{note_end}", "", output)
+        output = re.sub(r"{quote_begin}", "", output)
+        output = re.sub(r"{quote_end}", "", output)
+        pattern = re.compile(
+            "|".join(
+                [re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
+            ),
+            flags=re.DOTALL,
+        )
+        return pattern.sub(lambda x: replacements[x.group(0)], output)
+
+    def _retrieve_annotation_content(self, page, annotation):
        """Gets the text content of an annotation.

        Returns the actual content of an annotation. Sometimes
@ -148,11 +184,17 @@ class ExtractPlugin(PapersPlugin):
        content = annotation.info["content"].replace("\n", " ")
        written = page.get_textbox(annotation.rect).replace("\n", " ")

-        if Levenshtein.ratio(content,written) > self.minimum_similarity:
-            return content
+        # highlight with selection in note
+        if Levenshtein.ratio(content, written) > self.minimum_similarity:
+            return (content, "")
+        # an independent note, not a highlight
+        elif content and not written:
+            return ("", content)
+        # both a highlight and a note
        elif content:
-            return f"{written}{connector}{content}"
-        return written
+            return (written, content)
+        # highlight with selection not in note
+        return (written, "")

    def _to_stdout(self, annotated_papers):
        """Write annotations to stdout.
@ -165,9 +207,9 @@ class ExtractPlugin(PapersPlugin):
            paper = contents[0]
            annotations = contents[1]
            if annotations:
-                output += f"{paper.citekey}\n"
+                output += f"------ {paper.citekey} ------\n\n"
                for annot in annotations:
-                    output += f'> "{annot}"\n'
+                    output += f"{annot}\n\n"
                output += "\n"
        print(output)

@ -226,8 +268,7 @@ class ExtractPlugin(PapersPlugin):
 def modify_event(event):
    if ExtractPlugin.is_loaded():
        plg = ExtractPlugin.get_instance()
-        if plg.onimport:
+        if plg.on_import:
            all_annotations = plg.extract([event.citekey])
            if all_annotations[0][1]:
                plg._to_notes(all_annotations, plg.note_extension)
-                plg.ui.info(f"Imported {event.citekey} annotations.")
Author	SHA1	Message	Date
Marty Oehme	59ff74475d	Extend formatting with sections	2022-12-23 00:37:10 +01:00
Marty Oehme	a3e2d8693d	Add formatting configuration	2022-12-22 23:47:50 +01:00
Marty Oehme	e3aacc4b15	Format file	2022-12-22 23:21:13 +01:00
Marty Oehme	8f01b93de2	Add configurable note and quote prefixes	2022-12-22 23:09:43 +01:00
Marty Oehme	f103cc51b5	Add slight visual flair to stdout citekey display	2022-12-22 23:01:43 +01:00
Marty Oehme	e255405d96	Refactor highlight prefix setting	2022-12-22 23:01:14 +01:00
Marty Oehme	b65c2a3be6	Add independent note saving	2022-12-22 22:56:07 +01:00
Marty Oehme	82b99e8420	Remove message spew on automatic import	2022-12-22 22:55:25 +01:00
Marty Oehme	4281dba9c2	Fix typo in on_import configuration option	2022-12-22 22:55:08 +01:00