Add improved annotation selection through similarity

This commit is contained in:
Marty Oehme 2022-12-22 22:31:21 +01:00
parent c9f286fc33
commit b98d473cc4
Signed by: Marty
GPG key ID: 73BA40D5AFAF49C9
3 changed files with 233 additions and 4 deletions

View file

@ -2,6 +2,7 @@ import os
import argparse
import fitz
import Levenshtein
from pubs.plugins import PapersPlugin
from pubs.events import DocAddEvent, NoteEvent
@ -39,6 +40,7 @@ class ExtractPlugin(PapersPlugin):
# or `:: {annotation} :: {page} ::`
# and so on
self.onimport = conf["plugins"].get("extract", {}).get("onimport", False)
self.minimum_similarity = conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)
def update_parser(self, subparsers, conf):
"""Allow the usage of the pubs extract subcommand"""
@ -134,13 +136,22 @@ class ExtractPlugin(PapersPlugin):
annotations.append(f"[{(page.number or 0) + 1}] {content}")
return annotations
# NOTE(review): this span appears to be a diff rendering with the +/- markers
# and indentation stripped -- each pair of near-duplicate lines below looks like
# the old line followed by its replacement. Confirm against the real file.
def _retrieve_annotation_content(self, page, annotation):
def _retrieve_annotation_content(self, page, annotation, connector = "\nNote: "):
"""Gets the text content of an annotation.
Returns the actual content of an annotation. Sometimes
that is only the written words, sometimes that is only
annotation notes, sometimes it is both. Runs a similarity
comparison between strings to find out whether they
should both be included or are doubling up.
"""
# Note text stored on the annotation itself, with line breaks flattened to spaces.
content = annotation.info["content"].replace("\n", " ")
# Text that page.get_textbox() returns for the annotation's rectangle,
# flattened the same way so both strings are compared on one line.
written = page.get_textbox(annotation.rect).replace("\n", " ")
# Substring-containment variant of the duplicate check.
if written in content:
# Similarity variant of the duplicate check: when the Levenshtein ratio of the
# two strings exceeds self.minimum_similarity they are treated as the same
# text and only the note content is returned.
if Levenshtein.ratio(content,written) > self.minimum_similarity:
return content
# Not considered duplicates and a note exists: return the page text followed
# by the note, joined by a separator.
elif content:
# Variant with the separator hard-coded.
return f"{written}\nNote: {content}"
# Variant using the caller-supplied `connector` (default "\nNote: ").
return f"{written}{connector}{content}"
# Empty note: fall back to the page text alone.
return written
def _to_stdout(self, annotated_papers):