From c9f286fc3393e842d31142780d43eec3ff58e2ba Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Thu, 22 Dec 2022 22:06:41 +0100 Subject: [PATCH] Add extraction for no-content and note highlights --- README.md | 13 +++++++++++++ extract/extract.py | 13 ++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 17395fe..7572ffb 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,18 @@ What follows is a not-very-sorted train of though on where the plugin is at and could see myself taking it one day, provided I find the time. Pull requests tackling one of these areas of course very welcome. +## Issues + +A note on the extraction. Highlights in PDFs are somewhat difficult to parse +(as are most things in them). Sometimes they contain the selected text that is written on the +page, sometimes they contain the annotator's thoughts as a note, sometimes they contain nothing. +This plugin makes an effort to find the right combination and extract the written words, +as well as any additional notes made - but things *will* slip through or extract weirdly every now +and again. + +The easiest extraction is provided if your program writes the selection itself into the highlight +content, because then we can just use that. It is harder to parse if it does not. + ## Roadmap: - [x] extracts highlights and annotations from a doc file (e.g. using PyMuPDF) @@ -79,6 +91,7 @@ Pull requests tackling one of these areas of course very welcome. - [ ] colors are given in very exact 0.6509979 RGB values, meaning we could once again estimate if a color is 'close enough' in distance to tag it accordingly - [ ] make invoking the command run a query if corresponding option provided (or whatever) in pubs syntax and use resulting papers - [ ] confirm for many papers? +- [ ] warning when the number of annotations in the file differs from the number extracted? 
## Things that would also be nice in pubs in general and don't really belong in this repository diff --git a/extract/extract.py b/extract/extract.py index 93eca9b..ce2cfe9 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -129,13 +129,20 @@ class ExtractPlugin(PapersPlugin): with fitz.Document(filename) as doc: for page in doc: for annot in page.annots(): - content = annot.get_text() or annot.info["content"].replace( - "\n", "" - ) + content = self._retrieve_annotation_content(page, annot) if content: annotations.append(f"[{(page.number or 0) + 1}] {content}") return annotations + def _retrieve_annotation_content(self, page, annotation): + content = annotation.info["content"].replace("\n", " ") + written = page.get_textbox(annotation.rect).replace("\n", " ") + if written in content: + return content + elif content: + return f"{written}\nNote: {content}" + return written + def _to_stdout(self, annotated_papers): """Write annotations to stdout.