Add improved annotation selection through similarity

2022-12-22 22:31:21 +01:00 · 2022-12-22 22:31:21 +01:00 · b98d473cc4
commit b98d473cc4
parent c9f286fc33
3 changed files with 233 additions and 4 deletions
--- a/extract/extract.py
+++ b/extract/extract.py
@ -2,6 +2,7 @@ import os
 import argparse

 import fitz
+import Levenshtein

 from pubs.plugins import PapersPlugin
 from pubs.events import DocAddEvent, NoteEvent
@ -39,6 +40,7 @@ class ExtractPlugin(PapersPlugin):
        # or `:: {annotation} :: {page} ::`
        # and so on
        self.onimport = conf["plugins"].get("extract", {}).get("onimport", False)
+        self.minimum_similarity = conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)

    def update_parser(self, subparsers, conf):
        """Allow the usage of the pubs extract subcommand"""
@ -134,13 +136,22 @@ class ExtractPlugin(PapersPlugin):
                        annotations.append(f"[{(page.number or 0) + 1}] {content}")
        return annotations

-    def _retrieve_annotation_content(self, page, annotation):
+    def _retrieve_annotation_content(self, page, annotation, connector = "\nNote: "):
+        """Gets the text content of an annotation.
+
+        Returns the actual content of an annotation. Sometimes
+        that is only the written words, sometimes that is only
+        annotation notes, sometimes it is both. Runs a similarity
+        comparison between strings to find out whether they
+        should both be included or are doubling up.
+        """
        content = annotation.info["content"].replace("\n", " ")
        written = page.get_textbox(annotation.rect).replace("\n", " ")
-        if written in content:
+
+        if Levenshtein.ratio(content,written) > self.minimum_similarity:
            return content
        elif content:
-            return f"{written}\nNote: {content}"
+            return f"{written}{connector}{content}"
        return written

    def _to_stdout(self, annotated_papers):