diff --git a/extract/extract.py b/extract/extract.py index f25bd73..18f3f0f 100644 --- a/extract/extract.py +++ b/extract/extract.py @@ -1,11 +1,14 @@ import os import re import argparse +from dataclasses import dataclass +from typing import Tuple import fitz import Levenshtein from pubs.plugins import PapersPlugin +from pubs.paper import Paper from pubs.events import DocAddEvent, NoteEvent from pubs import repo, pretty @@ -13,7 +16,45 @@ from pubs.utils import resolve_citekey_list from pubs.content import check_file, read_text_file, write_file from pubs.query import get_paper_filter -CONFIRMATION_PAPER_THRESHOLD=5 +CONFIRMATION_PAPER_THRESHOLD = 5 + + +@dataclass +class Annotation: + """A PDF annotation object""" + + paper: Paper + file: str + type: str = "Highlight" + text: str = "" + content: str = "" + page: int = 1 + colors: Tuple = (0.0, 0.0, 0.0) + + def formatted(self, formatting): + output = formatting + replacements = { + r"{quote}": self.text, + r"{note}": self.content, + r"{page}": str(self.page), + r"{newline}": "\n", + } + if self.text == "": + output = re.sub(r"{quote_begin}.*{quote_end}", "", output) + if self.content == "": + output = re.sub(r"{note_begin}.*{note_end}", "", output) + output = re.sub(r"{note_begin}", "", output) + output = re.sub(r"{note_end}", "", output) + output = re.sub(r"{quote_begin}", "", output) + output = re.sub(r"{quote_end}", "", output) + pattern = re.compile( + "|".join( + [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] + ), + flags=re.DOTALL, + ) + return pattern.sub(lambda x: replacements[x.group(0)], output) + class ExtractPlugin(PapersPlugin): """Extract annotations from any pdf document. @@ -116,11 +157,12 @@ class ExtractPlugin(PapersPlugin): Returns all annotations belonging to the papers that are described by the citekeys passed in. """ - papers_annotated = [] + papers_annotated = {} for paper in papers: file = self._get_file(paper) try: - papers_annotated.append((paper, self._get_annotations(file))) + annotations = self._get_annotations(file, paper) + papers_annotated[paper.citekey] = annotations except fitz.FileDataError as e: self.ui.error(f"Document {file} is broken: {e}") return papers_annotated @@ -177,7 +219,7 @@ class ExtractPlugin(PapersPlugin): self.ui.warning(f"{paper.citekey} has no valid document.") return path - def _get_annotations(self, filename): + def _get_annotations(self, filename, paper): """Extract annotations from a file. Returns all readable annotations contained in the file @@ -190,34 +232,18 @@ class ExtractPlugin(PapersPlugin): for annot in page.annots(): quote, note = self._retrieve_annotation_content(page, annot) annotations.append( - self._format_annotation(quote, note, page.number or 0) + Annotation( + file=filename, + paper=paper, + text=quote, + content=note, + colors=annot.colors, + type=annot.type, + page=(page.number or 0) + 1, + ) ) return annotations - def _format_annotation(self, quote, note, pagenumber=0): - output = self.formatting - replacements = { - r"{quote}": quote, - r"{note}": note, - r"{page}": str(pagenumber), - r"{newline}": "\n", - } - if note == "": - output = re.sub(r"{note_begin}.*{note_end}", "", output) - if quote == "": - output = re.sub(r"{quote_begin}.*{quote_end}", "", output) - output = re.sub(r"{note_begin}", "", output) - output = re.sub(r"{note_end}", "", output) - output = re.sub(r"{quote_begin}", "", output) - output = re.sub(r"{quote_end}", "", output) - pattern = re.compile( - "|".join( - [re.escape(k) for k in sorted(replacements, key=len, reverse=True)] - ), - flags=re.DOTALL, - ) - return pattern.sub(lambda x: replacements[x.group(0)], output) - def _retrieve_annotation_content(self, page, annotation): """Gets the text content of an annotation. @@ -249,13 +275,11 @@ class ExtractPlugin(PapersPlugin): ready to be passed on through pipelines etc. """ output = "" - for contents in annotated_papers: - paper = contents[0] - annotations = contents[1] - if annotations: - output += f"------ {paper.citekey} ------\n" - for annot in annotations: - output += f"{annot}\n" + for citekey, annotations in annotated_papers.items(): + output += f"------ {citekey} ------\n" + for annotation in annotations: + # for annot in annotations: + output += f"{annotation.formatted(self.formatting)}\n" output += "\n" print(output) @@ -266,20 +290,18 @@ class ExtractPlugin(PapersPlugin): in the pubs notes directory. Creates new notes for citekeys missing a note or appends to existing. """ - for contents in annotated_papers: - paper = contents[0] - annotations = contents[1] + for citekey, annotations in annotated_papers.items(): if annotations: - notepath = self.broker.real_notepath(paper.citekey, note_extension) + notepath = self.broker.real_notepath(citekey, note_extension) if check_file(notepath, fail=False): self._append_to_note(notepath, annotations) else: self._write_new_note(notepath, annotations) - self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.") + self.ui.info(f"Wrote annotations to {citekey} note {notepath}.") if edit is True: self.ui.edit_file(notepath, temporary=False) - NoteEvent(paper.citekey).send() + NoteEvent(citekey).send() def _write_new_note(self, notepath, annotations): """Create a new note containing the annotations. @@ -289,7 +311,7 @@ class ExtractPlugin(PapersPlugin): """ output = "# Annotations\n\n" for annotation in annotations: - output += f"{annotation}\n\n" + output += f"{annotation.formatted(self.formatting)}\n\n" write_file(notepath, output, "w") def _append_to_note(self, notepath, annotations): @@ -300,13 +322,13 @@ class ExtractPlugin(PapersPlugin): """ existing = read_text_file(notepath) # removed annotations already found in the note - existing_dropped = [x for x in annotations if x not in existing] + existing_dropped = [x for x in annotations if x.formatted(self.formatting) not in existing] if not existing_dropped: return output = "" for annotation in existing_dropped: - output += f"{annotation}\n\n" + output += f"{annotation.formatted(self.formatting)}\n\n" write_file(notepath, output, "a")