Refactor extraction to use PaperAnnotated class

This commit is contained in:
Marty Oehme 2022-12-25 12:17:05 +01:00
parent 7a415b4d7d
commit 62f472d9ed
Signed by: Marty
GPG key ID: 73BA40D5AFAF49C9
3 changed files with 52 additions and 30 deletions

View file

@ -160,6 +160,7 @@ content, because then we can just use that. It is harder to parse if it does not
- [ ] needs some way to delimit where it puts stuff and user stuff is in note - [ ] needs some way to delimit where it puts stuff and user stuff is in note
- [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between - [ ] one way is to have it look at `> [17] here be extracted annotation from page seventeen` annotations and put it in between
- [x] another, probably simpler first, is to just append missing annotations to the end of the note - [x] another, probably simpler first, is to just append missing annotations to the end of the note
- [ ] use similarity search (e.g. Levenshtein distance) instead of literal search to detect existing annotations
- [x] some highlights (or annotations in general) do not contain text as content - [x] some highlights (or annotations in general) do not contain text as content
- [x] pymupdf can extract the content of the underlying rectangle (mostly) - [x] pymupdf can extract the content of the underlying rectangle (mostly)
- [x] issue is that sometimes the highlight contents are in content, sometimes a user comment instead - [x] issue is that sometimes the highlight contents are in content, sometimes a user comment instead

View file

@ -19,11 +19,33 @@ COLORS = {
} }
class PaperAnnotated(Paper):
    """A paper together with the annotations extracted from its document.

    Behaves like its ``Paper`` base class and additionally carries an
    ``annotations`` list, so a paper and its annotations can be passed
    around as a single object.
    """

    def __init__(self, citekey, bibdata, metadata=None, annotations=None):
        """Create an annotated paper.

        ``citekey``, ``bibdata`` and ``metadata`` are forwarded unchanged to
        the ``Paper`` initializer. ``annotations`` is the list of annotations
        belonging to the paper; when omitted, a new empty list is created.
        """
        super().__init__(citekey, bibdata, metadata)
        # Build a fresh list per instance. A literal ``[]`` default would be
        # created once and then shared by every paper constructed without an
        # explicit annotation list.
        self.annotations = [] if annotations is None else annotations

    @classmethod
    def from_paper(cls, paper, annotations=None):
        """Build an annotated paper from an existing paper object.

        Copies ``citekey``, ``bibdata`` and ``metadata`` from ``paper`` and
        attaches ``annotations`` (a new empty list when omitted).
        """
        return cls(paper.citekey, paper.bibdata, paper.metadata, annotations)

    def __repr__(self):
        """Return a debug representation; annotations are not included."""
        return (
            f"PaperAnnotated({self.citekey!s}, {self.bibdata!s}, "
            f"{self.metadata!s})"
        )

    def headline(self, short=False, max_authors=3):
        """Return a one-line description of the paper for use as a header.

        The line is produced by ``pretty.paper_oneliner``; ``short`` is passed
        on as ``citekey_only`` and ``max_authors`` is passed through as is.
        Every literal ``[pdf]`` marker is removed from the result and trailing
        whitespace is stripped.
        """
        headline = pretty.paper_oneliner(
            self, citekey_only=short, max_authors=max_authors
        )
        return re.sub(r"\[pdf\]", "", headline).rstrip()
@dataclass @dataclass
class Annotation: class Annotation:
"""A PDF annotation object""" """A PDF annotation object"""
paper: Paper
file: str file: str
type: str = "Highlight" type: str = "Highlight"
text: str = "" text: str = ""
@ -79,12 +101,6 @@ class Annotation:
nearest = name nearest = name
return nearest return nearest
def headline(self, short=False, max_authors=3):
headline = pretty.paper_oneliner(
self.paper, citekey_only=short, max_authors=max_authors
)
return re.sub(r"\[pdf\]", "", headline).rstrip()
def _color_similarity_ratio(self, color_one, color_two): def _color_similarity_ratio(self, color_one, color_two):
"""Return the similarity of two colors between 0 and 1. """Return the similarity of two colors between 0 and 1.

View file

@ -10,7 +10,12 @@ from pubs import repo, pretty
from pubs.utils import resolve_citekey_list from pubs.utils import resolve_citekey_list
from pubs.content import check_file, read_text_file, write_file from pubs.content import check_file, read_text_file, write_file
from pubs.query import get_paper_filter from pubs.query import get_paper_filter
from .annotation import Annotation, COLOR_SIMILARITY_MINIMUM, TEXT_SIMILARITY_MINIMUM from .annotation import (
PaperAnnotated,
Annotation,
COLOR_SIMILARITY_MINIMUM,
TEXT_SIMILARITY_MINIMUM,
)
CONFIRMATION_PAPER_THRESHOLD = 5 CONFIRMATION_PAPER_THRESHOLD = 5
@ -34,6 +39,7 @@ class ExtractPlugin(PapersPlugin):
def __init__(self, conf, ui): def __init__(self, conf, ui):
self.ui = ui self.ui = ui
self.note_extension = conf["main"]["note_extension"] self.note_extension = conf["main"]["note_extension"]
self.max_authors = conf["main"]["max_authors"]
self.repository = repo.Repository(conf) self.repository = repo.Repository(conf)
self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"]) self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
self.broker = self.repository.databroker self.broker = self.repository.databroker
@ -123,12 +129,12 @@ class ExtractPlugin(PapersPlugin):
Returns all annotations belonging to the papers that Returns all annotations belonging to the papers that
are described by the citekeys passed in. are described by the citekeys passed in.
""" """
papers_annotated = {} papers_annotated = []
for paper in papers: for paper in papers:
file = self._get_file(paper) file = self._get_file(paper)
try: try:
annotations = self._get_annotations(file, paper) annotations = self._get_annotations(file)
papers_annotated[paper.citekey] = annotations papers_annotated.append(PaperAnnotated.from_paper(paper, annotations))
except fitz.FileDataError as e: except fitz.FileDataError as e:
self.ui.error(f"Document {file} is broken: {e}") self.ui.error(f"Document {file} is broken: {e}")
return papers_annotated return papers_annotated
@ -166,7 +172,7 @@ class ExtractPlugin(PapersPlugin):
self.ui.message( self.ui.message(
"\n".join( "\n".join(
pretty.paper_oneliner( pretty.paper_oneliner(
p, citekey_only=False, max_authors=conf["main"]["max_authors"] p, citekey_only=False, max_authors=self.max_authors
) )
for p in papers for p in papers
) )
@ -188,7 +194,7 @@ class ExtractPlugin(PapersPlugin):
self.ui.warning(f"{paper.citekey} has no valid document.") self.ui.warning(f"{paper.citekey} has no valid document.")
return path return path
def _get_annotations(self, filename, paper): def _get_annotations(self, filename):
"""Extract annotations from a file. """Extract annotations from a file.
Returns all readable annotations contained in the file Returns all readable annotations contained in the file
@ -202,7 +208,6 @@ class ExtractPlugin(PapersPlugin):
quote, note = self._retrieve_annotation_content(page, annot) quote, note = self._retrieve_annotation_content(page, annot)
a = Annotation( a = Annotation(
file=filename, file=filename,
paper=paper,
text=quote, text=quote,
content=note, content=note,
colors=annot.colors, colors=annot.colors,
@ -237,18 +242,18 @@ class ExtractPlugin(PapersPlugin):
# highlight with selection not in note # highlight with selection not in note
return (written, "") return (written, "")
def _to_stdout(self, annotated_papers, short_header=True): def _to_stdout(self, annotated_papers, short_header=False):
"""Write annotations to stdout. """Write annotations to stdout.
Simply outputs the gathered annotations over stdout Simply outputs the gathered annotations over stdout
ready to be passed on through pipelines etc. ready to be passed on through pipelines etc.
""" """
output = "" output = ""
for citekey, annotations in annotated_papers.items(): for paper in annotated_papers:
output += ( output += (
f"\n------ {annotations[0].headline(short=short_header)} ------\n\n" f"\n------ {paper.headline(self.short_header, self.max_authors)} ------\n\n"
) )
for annotation in annotations: for annotation in paper.annotations:
output += f"{annotation.format(self.formatting)}\n" output += f"{annotation.format(self.formatting)}\n"
output += "\n" output += "\n"
self.ui.message(output.strip()) self.ui.message(output.strip())
@ -260,31 +265,31 @@ class ExtractPlugin(PapersPlugin):
in the pubs notes directory. Creates new notes for in the pubs notes directory. Creates new notes for
citekeys missing a note or appends to existing. citekeys missing a note or appends to existing.
""" """
for citekey, annotations in annotated_papers.items(): for paper in annotated_papers:
if annotations: if paper.annotations:
notepath = self.broker.real_notepath(citekey, note_extension) notepath = self.broker.real_notepath(paper.citekey, note_extension)
if check_file(notepath, fail=False): if check_file(notepath, fail=False):
self._append_to_note(notepath, annotations) self._append_to_note(notepath, paper)
else: else:
self._write_new_note(notepath, annotations, self.short_header) self._write_new_note(notepath, paper, paper.headline(short=True, max_authors=self.max_authors))
self.ui.info(f"Wrote annotations to {citekey} note {notepath}.") self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.")
if edit is True: if edit is True:
self.ui.edit_file(notepath, temporary=False) self.ui.edit_file(notepath, temporary=False)
NoteEvent(citekey).send() NoteEvent(paper.citekey).send()
def _write_new_note(self, notepath, annotations, short_header): def _write_new_note(self, notepath, paper, headline):
"""Create a new note containing the annotations. """Create a new note containing the annotations.
Will create a new note in the notes folder of pubs Will create a new note in the notes folder of pubs
and fill it with the annotations extracted from pdf. and fill it with the annotations extracted from pdf.
""" """
output = f"# {annotations[0].headline(short=short_header)}\n\n" output = f"# {headline}\n\n"
for annotation in annotations: for annotation in paper.annotations:
output += f"{annotation.format(self.formatting)}\n\n" output += f"{annotation.format(self.formatting)}\n\n"
write_file(notepath, output, "w") write_file(notepath, output, "w")
def _append_to_note(self, notepath, annotations): def _append_to_note(self, notepath, paper):
"""Append new annotations to the end of a note. """Append new annotations to the end of a note.
Looks through note to determine any new annotations which should be Looks through note to determine any new annotations which should be
@ -293,7 +298,7 @@ class ExtractPlugin(PapersPlugin):
existing = read_text_file(notepath) existing = read_text_file(notepath)
# removed annotations already found in the note # removed annotations already found in the note
existing_dropped = [ existing_dropped = [
x for x in annotations if x.format(self.formatting) not in existing x for x in paper.annotations if x.format(self.formatting) not in existing
] ]
if not existing_dropped: if not existing_dropped:
return return