Refactor annotations into dataclass

This commit is contained in:
Marty Oehme 2022-12-24 18:01:55 +01:00
parent af885e0083
commit e4f1ee3591
Signed by: Marty
GPG key ID: 73BA40D5AFAF49C9

View file

@ -1,11 +1,14 @@
import os import os
import re import re
import argparse import argparse
from dataclasses import dataclass
from typing import Tuple
import fitz import fitz
import Levenshtein import Levenshtein
from pubs.plugins import PapersPlugin from pubs.plugins import PapersPlugin
from pubs.paper import Paper
from pubs.events import DocAddEvent, NoteEvent from pubs.events import DocAddEvent, NoteEvent
from pubs import repo, pretty from pubs import repo, pretty
@ -15,6 +18,44 @@ from pubs.query import get_paper_filter
CONFIRMATION_PAPER_THRESHOLD = 5 CONFIRMATION_PAPER_THRESHOLD = 5
@dataclass
class Annotation:
    """A single annotation extracted from a PDF document.

    Bundles where the annotation came from (paper and file) with what it
    contains (highlighted text and attached note) and knows how to render
    itself through a user-supplied template, see :meth:`formatted`.
    """

    # Paper the annotated document belongs to. Quoted so the annotation is
    # only resolved by type checkers, not evaluated at class creation.
    paper: "Paper"
    # File the annotation was read from.
    file: str
    # Kind of annotation; presumably mirrors the PDF library's annotation
    # type -- verify against the code constructing these objects.
    type: str = "Highlight"
    # Text covered by the annotation; substituted for ``{quote}``.
    text: str = ""
    # Free-form note attached to the annotation; substituted for ``{note}``.
    content: str = ""
    # Page number the annotation is on; substituted for ``{page}``.
    page: int = 1
    # Annotation color information (defaults to an all-zero triple).
    colors: Tuple = (0.0, 0.0, 0.0)

    def formatted(self, formatting):
        """Render this annotation through the template *formatting*.

        Recognised placeholders:

        * ``{quote}``   -- the annotated text (``self.text``)
        * ``{note}``    -- the attached note (``self.content``)
        * ``{page}``    -- the page number
        * ``{newline}`` -- a line break

        ``{quote_begin}``/``{quote_end}`` and ``{note_begin}``/``{note_end}``
        delimit optional sections. A section is removed entirely (markers
        and everything between them) when the corresponding value is empty;
        otherwise only the markers themselves are removed.

        Returns the rendered string.
        """
        output = formatting
        sections = (("quote", self.text), ("note", self.content))

        # Drop the sections whose value is empty. The match is non-greedy so
        # that each begin/end pair is removed on its own (text between two
        # separate sections survives), and DOTALL lets a section span
        # literal line breaks in the template.
        for name, value in sections:
            if value == "":
                section = re.compile(
                    re.escape(f"{{{name}_begin}}")
                    + r".*?"
                    + re.escape(f"{{{name}_end}}"),
                    flags=re.DOTALL,
                )
                output = section.sub("", output)

        # Whatever markers are left belong to sections that are kept.
        for name, _ in sections:
            output = output.replace(f"{{{name}_begin}}", "")
            output = output.replace(f"{{{name}_end}}", "")

        replacements = {
            r"{quote}": self.text,
            r"{note}": self.content,
            r"{page}": str(self.page),
            r"{newline}": "\n",
        }
        # Substitute all placeholders in a single pass so that placeholder-like
        # text inside a quote or note is never expanded a second time.
        pattern = re.compile(
            "|".join(
                re.escape(key)
                for key in sorted(replacements, key=len, reverse=True)
            )
        )
        return pattern.sub(lambda match: replacements[match.group(0)], output)
class ExtractPlugin(PapersPlugin): class ExtractPlugin(PapersPlugin):
"""Extract annotations from any pdf document. """Extract annotations from any pdf document.
@ -116,11 +157,12 @@ class ExtractPlugin(PapersPlugin):
Returns all annotations belonging to the papers that Returns all annotations belonging to the papers that
are described by the citekeys passed in. are described by the citekeys passed in.
""" """
papers_annotated = [] papers_annotated = {}
for paper in papers: for paper in papers:
file = self._get_file(paper) file = self._get_file(paper)
try: try:
papers_annotated.append((paper, self._get_annotations(file))) annotations = self._get_annotations(file, paper)
papers_annotated[paper.citekey] = annotations
except fitz.FileDataError as e: except fitz.FileDataError as e:
self.ui.error(f"Document {file} is broken: {e}") self.ui.error(f"Document {file} is broken: {e}")
return papers_annotated return papers_annotated
@ -177,7 +219,7 @@ class ExtractPlugin(PapersPlugin):
self.ui.warning(f"{paper.citekey} has no valid document.") self.ui.warning(f"{paper.citekey} has no valid document.")
return path return path
def _get_annotations(self, filename): def _get_annotations(self, filename, paper):
"""Extract annotations from a file. """Extract annotations from a file.
Returns all readable annotations contained in the file Returns all readable annotations contained in the file
@ -190,34 +232,18 @@ class ExtractPlugin(PapersPlugin):
for annot in page.annots(): for annot in page.annots():
quote, note = self._retrieve_annotation_content(page, annot) quote, note = self._retrieve_annotation_content(page, annot)
annotations.append( annotations.append(
self._format_annotation(quote, note, page.number or 0) Annotation(
file=filename,
paper=paper,
text=quote,
content=note,
colors=annot.colors,
type=annot.type,
page=(page.number or 0) + 1,
)
) )
return annotations return annotations
def _format_annotation(self, quote, note, pagenumber=0):
output = self.formatting
replacements = {
r"{quote}": quote,
r"{note}": note,
r"{page}": str(pagenumber),
r"{newline}": "\n",
}
if note == "":
output = re.sub(r"{note_begin}.*{note_end}", "", output)
if quote == "":
output = re.sub(r"{quote_begin}.*{quote_end}", "", output)
output = re.sub(r"{note_begin}", "", output)
output = re.sub(r"{note_end}", "", output)
output = re.sub(r"{quote_begin}", "", output)
output = re.sub(r"{quote_end}", "", output)
pattern = re.compile(
"|".join(
[re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
),
flags=re.DOTALL,
)
return pattern.sub(lambda x: replacements[x.group(0)], output)
def _retrieve_annotation_content(self, page, annotation): def _retrieve_annotation_content(self, page, annotation):
"""Gets the text content of an annotation. """Gets the text content of an annotation.
@ -249,13 +275,11 @@ class ExtractPlugin(PapersPlugin):
ready to be passed on through pipelines etc. ready to be passed on through pipelines etc.
""" """
output = "" output = ""
for contents in annotated_papers: for citekey, annotations in annotated_papers.items():
paper = contents[0] output += f"------ {citekey} ------\n"
annotations = contents[1] for annotation in annotations:
if annotations: # for annot in annotations:
output += f"------ {paper.citekey} ------\n" output += f"{annotation.formatted(self.formatting)}\n"
for annot in annotations:
output += f"{annot}\n"
output += "\n" output += "\n"
print(output) print(output)
@ -266,20 +290,18 @@ class ExtractPlugin(PapersPlugin):
in the pubs notes directory. Creates new notes for in the pubs notes directory. Creates new notes for
citekeys missing a note or appends to existing. citekeys missing a note or appends to existing.
""" """
for contents in annotated_papers: for citekey, annotations in annotated_papers.items():
paper = contents[0]
annotations = contents[1]
if annotations: if annotations:
notepath = self.broker.real_notepath(paper.citekey, note_extension) notepath = self.broker.real_notepath(citekey, note_extension)
if check_file(notepath, fail=False): if check_file(notepath, fail=False):
self._append_to_note(notepath, annotations) self._append_to_note(notepath, annotations)
else: else:
self._write_new_note(notepath, annotations) self._write_new_note(notepath, annotations)
self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.") self.ui.info(f"Wrote annotations to {citekey} note {notepath}.")
if edit is True: if edit is True:
self.ui.edit_file(notepath, temporary=False) self.ui.edit_file(notepath, temporary=False)
NoteEvent(paper.citekey).send() NoteEvent(citekey).send()
def _write_new_note(self, notepath, annotations): def _write_new_note(self, notepath, annotations):
"""Create a new note containing the annotations. """Create a new note containing the annotations.
@ -289,7 +311,7 @@ class ExtractPlugin(PapersPlugin):
""" """
output = "# Annotations\n\n" output = "# Annotations\n\n"
for annotation in annotations: for annotation in annotations:
output += f"{annotation}\n\n" output += f"{annotation.formatted(self.formatting)}\n\n"
write_file(notepath, output, "w") write_file(notepath, output, "w")
def _append_to_note(self, notepath, annotations): def _append_to_note(self, notepath, annotations):
@ -300,13 +322,13 @@ class ExtractPlugin(PapersPlugin):
""" """
existing = read_text_file(notepath) existing = read_text_file(notepath)
# removed annotations already found in the note # removed annotations already found in the note
existing_dropped = [x for x in annotations if x not in existing] existing_dropped = [x for x in annotations if x.formatted(self.formatting) not in existing]
if not existing_dropped: if not existing_dropped:
return return
output = "" output = ""
for annotation in existing_dropped: for annotation in existing_dropped:
output += f"{annotation}\n\n" output += f"{annotation.formatted(self.formatting)}\n\n"
write_file(notepath, output, "a") write_file(notepath, output, "a")