pubs-extract/extract/extract.py

import os
import argparse

import fitz
import Levenshtein

from pubs.plugins import PapersPlugin
from pubs.events import DocAddEvent, NoteEvent

from pubs import repo
from pubs.utils import resolve_citekey_list
from pubs.content import check_file, read_text_file, write_file


class ExtractPlugin(PapersPlugin):
    """Extract annotations from any pdf document.

    The extract plugin allows manual or automatic extraction of all annotations
    contained in the pdf documents belonging to entries of the pubs library.

    It can write those changes to stdout or directly create and update notes
    for the pubs entries.

    It adds a `pubs extract` subcommand through which it is invoked, but can
    optionally run whenever a new document is imported for a pubs entry.
    """

    name = "extract"
    description = "Extract annotations from pubs documents"

    def __init__(self, conf, ui):
        """Set up the plugin from the pubs configuration.

        Args:
            conf: pubs configuration mapping; the `main` section and the
                optional `plugins.extract` section are read.
            ui: pubs user interface object used for messages and editing.
        """
        self.ui = ui
        self.note_extension = conf["main"]["note_extension"]
        self.repository = repo.Repository(conf)
        self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
        self.broker = self.repository.databroker

        # TODO implement custom annotation formatting, akin to main config citekey format
        # e.g. `> [{page}] {annotation}`
        # or `:: {annotation} :: {page} ::`
        # and so on
        settings = conf["plugins"].get("extract", {})
        # Whether annotations get extracted automatically on document import.
        self.onimport = settings.get("onimport", False)
        # Similarity ratio above which an annotation's comment is considered a
        # duplicate of the highlighted text. Coerced to float because the value
        # is compared numerically against `Levenshtein.ratio`.
        # NOTE(review): values coming from the config file presumably arrive
        # as strings -- confirm against pubs' config handling.
        self.minimum_similarity = float(settings.get("minimum_similarity", 0.75))

    def update_parser(self, subparsers, conf):
        """Allow the usage of the pubs extract subcommand.

        Registers the `extract` subparser with its `citekeys` positional
        argument and the `--write` / `--edit` flags, and routes it to
        `command`.
        """
        # TODO option for ignoring missing documents or erroring.
        extract_parser = subparsers.add_parser(self.name, help=self.description)
        extract_parser.add_argument(
            "citekeys",
            nargs=argparse.REMAINDER,
            help="citekey(s) of the documents to extract from",
        )
        extract_parser.add_argument(
            "-w",
            "--write",
            help="Write to individual notes instead of standard out. Appends to existing notes.",
            action="store_true",
            default=None,
        )
        extract_parser.add_argument(
            "-e",
            "--edit",
            help="Open each note in editor for manual editing after extracting annotations to it.",
            action="store_true",
            default=False,
        )
        extract_parser.set_defaults(func=self.command)

    def command(self, conf, args):
        """Run the annotation extraction command.

        Resolves the citekeys given on the command line, extracts their
        annotations and either writes them to notes (`--write`) or prints
        them to stdout.
        """
        citekeys = resolve_citekey_list(
            self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True
        )
        # Close the repository on every way out of the command, including the
        # early return and errors raised while extracting or writing.
        try:
            if not citekeys:
                return
            all_annotations = self.extract(citekeys)
            if args.write:
                self._to_notes(all_annotations, self.note_extension, args.edit)
            else:
                self._to_stdout(all_annotations)
        finally:
            self.repository.close()

    def extract(self, citekeys):
        """Extracts annotations from citekeys.

        Returns a list of `(paper, annotations)` tuples, one for each paper
        described by the citekeys passed in. Papers whose document could not
        be read are reported through the ui and left out of the result;
        papers without a document get an empty annotation list.
        """
        papers_annotated = []
        for paper in self._gather_papers(citekeys):
            file = self._get_file(paper)
            if not file:
                # Nothing to open (a warning was already emitted); keep the
                # paper in the result with no annotations instead of handing
                # an empty path to fitz.
                papers_annotated.append((paper, []))
                continue
            try:
                papers_annotated.append((paper, self._get_annotations(file)))
            except fitz.FileDataError as e:
                self.ui.error(f"Document {file} is broken: {e}")
        return papers_annotated

    def _gather_papers(self, citekeys):
        """Get all papers for citekeys.

        Returns all Paper objects described by the citekeys
        passed in, pulled from the repository in the same order.
        """
        return [self.repository.pull_paper(key) for key in citekeys]

    def _get_file(self, paper):
        """Get path of document belonging to paper.

        Returns the path the data broker resolves for the paper's
        document. Emits a warning if that path is empty, and still
        returns it so the caller can decide what to do.
        """
        path = self.broker.real_docpath(paper.docpath)
        if not path:
            self.ui.warning(f"{paper.citekey} has no valid document.")
        return path

    def _get_annotations(self, filename):
        """Extract annotations from a file.

        Returns a list of strings, one for every annotation in the file
        for which some text could be retrieved, each prefixed with its
        1-based page number in square brackets.
        """
        annotations = []
        with fitz.Document(filename) as doc:
            for page in doc:
                for annot in page.annots():
                    content = self._retrieve_annotation_content(page, annot)
                    if content:
                        # page.number counts from zero; show it 1-based.
                        annotations.append(f"[{(page.number or 0) + 1}] {content}")
        return annotations

    def _retrieve_annotation_content(self, page, annotation, connector = "\nNote: "):
        """Gets the text content of an annotation.

        Returns the actual content of an annotation. Sometimes
        that is only the written words, sometimes that is only
        annotation notes, sometimes it is both. Runs a similarity
        comparison between strings to find out whether they
        should both be included or are doubling up.

        Args:
            page: the page the annotation sits on.
            annotation: the annotation to read.
            connector: text placed between the page text and the
                annotation's own comment when both are returned.
        """
        content = annotation.info["content"].replace("\n", " ")
        written = page.get_textbox(annotation.rect).replace("\n", " ")

        # The comment merely repeats the page text: keep a single copy.
        if Levenshtein.ratio(content, written) > self.minimum_similarity:
            return content
        if content and not written.strip():
            # A comment without any page text underneath: keep the label but
            # do not start the result with the connector's leading whitespace.
            return f"{connector.lstrip()}{content}"
        if content:
            return f"{written}{connector}{content}"
        return written

    def _to_stdout(self, annotated_papers):
        """Write annotations to stdout.

        Simply outputs the gathered annotations over stdout
        ready to be passed on through pipelines etc. Papers
        without annotations are left out.
        """
        chunks = []
        for paper, annotations in annotated_papers:
            if not annotations:
                continue
            chunks.append(f"{paper.citekey}\n")
            chunks.extend(f'> "{annot}"\n' for annot in annotations)
            chunks.append("\n")
        print("".join(chunks))

    def _to_notes(self, annotated_papers, note_extension="txt", edit=False):
        """Write annotations into pubs notes.

        Permanently writes the given annotations into notes
        in the pubs notes directory. Creates new notes for
        citekeys missing a note or appends to existing. Sends
        a NoteEvent for every paper that had annotations, and
        opens the note in the editor first if `edit` is True.
        """
        for paper, annotations in annotated_papers:
            if not annotations:
                continue
            notepath = self.broker.real_notepath(paper.citekey, note_extension)
            if check_file(notepath, fail=False):
                self._append_to_note(notepath, annotations)
            else:
                self._write_new_note(notepath, annotations)
            self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.")

            if edit is True:
                self.ui.edit_file(notepath, temporary=False)
            NoteEvent(paper.citekey).send()

    def _write_new_note(self, notepath, annotations):
        """Create a new note containing the annotations.

        Will create a new note in the notes folder of pubs
        and fill it with the annotations extracted from pdf,
        one quote block per annotation below a heading.
        """
        quotes = "".join(f"> {annotation}\n\n" for annotation in annotations)
        write_file(notepath, "# Annotations\n\n" + quotes, "w")

    def _append_to_note(self, notepath, annotations):
        """Append new annotations to the end of a note.

        Looks through note to determine any new annotations which should be
        added and adds them to the end of the note file. Annotations whose
        text already occurs anywhere in the note are skipped.
        """
        existing = read_text_file(notepath)
        # removed annotations already found in the note
        existing_dropped = [x for x in annotations if x not in existing]
        if not existing_dropped:
            return

        output = "".join(f"> {annotation}\n\n" for annotation in existing_dropped)
        if existing and not existing.endswith("\n"):
            # Start on a fresh line so the first quote is not glued onto the
            # note's last line.
            output = "\n" + output
        write_file(notepath, output, "a")


@DocAddEvent.listen()
def modify_event(event):
    """Extract annotations into a note when a document is added.

    Listener for DocAddEvent. Does nothing unless the extract plugin is
    loaded and its `onimport` setting is enabled. Otherwise extracts the
    annotations of the document belonging to `event.citekey` and, if any
    were found, writes them to the entry's note.
    """
    if not ExtractPlugin.is_loaded():
        return
    plg = ExtractPlugin.get_instance()
    if not plg.onimport:
        return

    all_annotations = plg.extract([event.citekey])
    # extract() leaves out papers whose document could not be read, so the
    # result may be empty; guard before looking at the first entry.
    if all_annotations and all_annotations[0][1]:
        plg._to_notes(all_annotations, plg.note_extension)
        plg.ui.info(f"Imported {event.citekey} annotations.")
initial commit 2022-12-22 16:43:06 +00:00			`import os`
			`import argparse`

			`import fitz`
Add improved annotation selection through similarity 2022-12-22 21:31:21 +00:00			`import Levenshtein`
initial commit 2022-12-22 16:43:06 +00:00
Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`from pubs.plugins import PapersPlugin`
Add NoteEvent as notes get edited 2022-12-22 19:28:27 +00:00			`from pubs.events import DocAddEvent, NoteEvent`
initial commit 2022-12-22 16:43:06 +00:00
Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`from pubs import repo`
			`from pubs.utils import resolve_citekey_list`
Add note annotation appending 2022-12-22 19:52:10 +00:00			`from pubs.content import check_file, read_text_file, write_file`
initial commit 2022-12-22 16:43:06 +00:00

			`class ExtractPlugin(PapersPlugin):`
Add simple docstrings 2022-12-22 20:02:01 +00:00			`"""Extract annotations from any pdf document.`

			`The extract plugin allows manual or automatic extraction of all annotations`
			`contained in the pdf documents belonging to entries of the pubs library.`
initial commit 2022-12-22 16:43:06 +00:00
Add simple docstrings 2022-12-22 20:02:01 +00:00			`It can write those changes to stdout or directly create and update notes`
			`for the pubs entries.`
initial commit 2022-12-22 16:43:06 +00:00
Add simple docstrings 2022-12-22 20:02:01 +00:00			It adds a `pubs extract` subcommand through which it is invoked, but can
			`optionally run whenever a new document is imported for a pubs entry.`
initial commit 2022-12-22 16:43:06 +00:00			`"""`

			`name = "extract"`
			`description = "Extract annotations from pubs documents"`

			`def __init__(self, conf, ui):`
			`self.ui = ui`
Refactor plugin to remove carrying config around 2022-12-22 20:10:42 +00:00			`self.note_extension = conf["main"]["note_extension"]`
initial commit 2022-12-22 16:43:06 +00:00			`self.repository = repo.Repository(conf)`
Refactor annotations to list of paper,annotations tuples 2022-12-22 17:32:11 +00:00			`self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])`
			`self.broker = self.repository.databroker`
Close repository after completion 2022-12-22 17:34:33 +00:00
Update comments 2022-12-22 19:31:28 +00:00			`# TODO implement custom annotation formatting, akin to main config citekey format`
			# e.g. `> [{page}] {annotation}`
			# or `:: {annotation} :: {page} ::`
			`# and so on`
Add extracting annotations on document import 2022-12-22 19:24:20 +00:00			`self.onimport = conf["plugins"].get("extract", {}).get("onimport", False)`
Add improved annotation selection through similarity 2022-12-22 21:31:21 +00:00			`self.minimum_similarity = conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)`
initial commit 2022-12-22 16:43:06 +00:00
			`def update_parser(self, subparsers, conf):`
Add simple docstrings 2022-12-22 20:02:01 +00:00			`"""Allow the usage of the pubs extract subcommand"""`
initial commit 2022-12-22 16:43:06 +00:00			`# TODO option for ignoring missing documents or erroring.`
			`extract_parser = subparsers.add_parser(self.name, help=self.description)`
			`extract_parser.add_argument(`
			`"citekeys",`
			`nargs=argparse.REMAINDER,`
			`help="citekey(s) of the documents to extract from",`
			`)`
Add option to enable writing to notes 2022-12-22 17:33:33 +00:00			`extract_parser.add_argument(`
			`"-w",`
			`"--write",`
Add note annotation appending 2022-12-22 19:52:10 +00:00			`help="Write to individual notes instead of standard out. Appends to existing notes.",`
Format file 2022-12-22 19:32:06 +00:00			`action="store_true",`
Add option to enable writing to notes 2022-12-22 17:33:33 +00:00			`default=None,`
			`)`
Add option to edit notes after writing 2022-12-22 17:34:23 +00:00			`extract_parser.add_argument(`
			`"-e",`
			`"--edit",`
Add note annotation appending 2022-12-22 19:52:10 +00:00			`help="Open each note in editor for manual editing after extracting annotations to it.",`
Format file 2022-12-22 19:32:06 +00:00			`action="store_true",`
Add option to edit notes after writing 2022-12-22 17:34:23 +00:00			`default=False,`
			`)`
initial commit 2022-12-22 16:43:06 +00:00			`extract_parser.set_defaults(func=self.command)`

			`def command(self, conf, args):`
Add simple docstrings 2022-12-22 20:02:01 +00:00			`"""Run the annotation extraction command."""`
initial commit 2022-12-22 16:43:06 +00:00			`citekeys = resolve_citekey_list(`
			`self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True`
			`)`
			`if not citekeys:`
			`return`
Refactor extract function to take citekeys not papers 2022-12-22 18:39:14 +00:00			`all_annotations = self.extract(citekeys)`
Add option to enable writing to notes 2022-12-22 17:33:33 +00:00			`if args.write:`
Refactor plugin to remove carrying config around 2022-12-22 20:10:42 +00:00			`self._to_notes(all_annotations, self.note_extension, args.edit)`
Add option to enable writing to notes 2022-12-22 17:33:33 +00:00			`else:`
Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`self._to_stdout(all_annotations)`
Close repository after completion 2022-12-22 17:34:33 +00:00			`self.repository.close()`
initial commit 2022-12-22 16:43:06 +00:00
Refactor extract function to take citekeys not papers 2022-12-22 18:39:14 +00:00			`def extract(self, citekeys):`
Add simple docstrings 2022-12-22 20:02:01 +00:00			`"""Extracts annotations from citekeys.`

			`Returns all annotations belonging to the papers that`
			`are described by the citekeys passed in.`
			`"""`
Refactor extract function to take citekeys not papers 2022-12-22 18:39:14 +00:00			`papers = self._gather_papers(citekeys)`
Refactor annotations to list of paper,annotations tuples 2022-12-22 17:32:11 +00:00			`papers_annotated = []`
initial commit 2022-12-22 16:43:06 +00:00			`for paper in papers:`
Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`file = self._get_file(paper)`
initial commit 2022-12-22 16:43:06 +00:00			`try:`
Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`papers_annotated.append((paper, self._get_annotations(file)))`
initial commit 2022-12-22 16:43:06 +00:00			`except fitz.FileDataError as e:`
Improve stdout rendering 2022-12-22 18:39:36 +00:00			`self.ui.error(f"Document {file} is broken: {e}")`
initial commit 2022-12-22 16:43:06 +00:00			`return papers_annotated`

Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`def _gather_papers(self, citekeys):`
Add simple docstrings 2022-12-22 20:02:01 +00:00			`"""Get all papers for citekeys.`

			`Returns all Paper objects described by the citekeys`
			`passed in.`
			`"""`
initial commit 2022-12-22 16:43:06 +00:00			`papers = []`
			`for key in citekeys:`
			`papers.append(self.repository.pull_paper(key))`
			`return papers`

Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`def _get_file(self, paper):`
Add simple docstrings 2022-12-22 20:02:01 +00:00			`"""Get path of document belonging to paper.`

			`Returns the real path to the document which belongs`
			`to the paper passed in. Emits a warning if no`
			`document belongs to paper.`
			`"""`
initial commit 2022-12-22 16:43:06 +00:00			`path = self.broker.real_docpath(paper.docpath)`
			`if not path:`
Improve stdout rendering 2022-12-22 18:39:36 +00:00			`self.ui.warning(f"{paper.citekey} has no valid document.")`
initial commit 2022-12-22 16:43:06 +00:00			`return path`

Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`def _get_annotations(self, filename):`
Add simple docstrings 2022-12-22 20:02:01 +00:00			`"""Extract annotations from a file.`

			`Returns all readable annotations contained in the file`
			`passed in. Only returns Highlight or Text annotations`
			`currently.`
			`"""`
initial commit 2022-12-22 16:43:06 +00:00			`annotations = []`
			`with fitz.Document(filename) as doc:`
			`for page in doc:`
			`for annot in page.annots():`
Add extraction for no-content and note highlights 2022-12-22 21:06:41 +00:00			`content = self._retrieve_annotation_content(page, annot)`
initial commit 2022-12-22 16:43:06 +00:00			`if content:`
Fix page numbering starting from zero 2022-12-22 20:42:56 +00:00			`annotations.append(f"[{(page.number or 0) + 1}] {content}")`
initial commit 2022-12-22 16:43:06 +00:00			`return annotations`

Add improved annotation selection through similarity 2022-12-22 21:31:21 +00:00			`def _retrieve_annotation_content(self, page, annotation, connector = "\nNote: "):`
			`"""Gets the text content of an annotation.`

			`Returns the actual content of an annotation. Sometimes`
			`that is only the written words, sometimes that is only`
			`annotation notes, sometimes it is both. Runs a similarity`
			`comparison between strings to find out whether they`
			`should both be included or are doubling up.`
			`"""`
Add extraction for no-content and note highlights 2022-12-22 21:06:41 +00:00			`content = annotation.info["content"].replace("\n", " ")`
			`written = page.get_textbox(annotation.rect).replace("\n", " ")`
Add improved annotation selection through similarity 2022-12-22 21:31:21 +00:00
			`if Levenshtein.ratio(content,written) > self.minimum_similarity:`
Add extraction for no-content and note highlights 2022-12-22 21:06:41 +00:00			`return content`
			`elif content:`
Add improved annotation selection through similarity 2022-12-22 21:31:21 +00:00			`return f"{written}{connector}{content}"`
Add extraction for no-content and note highlights 2022-12-22 21:06:41 +00:00			`return written`

Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`def _to_stdout(self, annotated_papers):`
Add simple docstrings 2022-12-22 20:02:01 +00:00			`"""Write annotations to stdout.`

			`Simply outputs the gathered annotations over stdout`
			`ready to be passed on through pipelines etc.`
			`"""`
Improve stdout rendering 2022-12-22 18:39:36 +00:00			`output = ""`
Refactor annotations to list of paper,annotations tuples 2022-12-22 17:32:11 +00:00			`for contents in annotated_papers:`
			`paper = contents[0]`
			`annotations = contents[1]`
initial commit 2022-12-22 16:43:06 +00:00			`if annotations:`
Format file 2022-12-22 19:32:06 +00:00			`output += f"{paper.citekey}\n"`
initial commit 2022-12-22 16:43:06 +00:00			`for annot in annotations:`
Format file 2022-12-22 19:32:06 +00:00			`output += f'> "{annot}"\n'`
			`output += "\n"`
Improve stdout rendering 2022-12-22 18:39:36 +00:00			`print(output)`
initial commit 2022-12-22 16:43:06 +00:00
Add extracting annotations on document import 2022-12-22 19:24:20 +00:00			`def _to_notes(self, annotated_papers, note_extension="txt", edit=False):`
Add simple docstrings 2022-12-22 20:02:01 +00:00			`"""Write annotations into pubs notes.`

			`Permanently writes the given annotations into notes`
			`in the pubs notes directory. Creates new notes for`
			`citekeys missing a note or appends to existing.`
			`"""`
Add writing annotations to notes 2022-12-22 17:33:00 +00:00			`for contents in annotated_papers:`
			`paper = contents[0]`
			`annotations = contents[1]`
			`if annotations:`
Format file 2022-12-22 19:32:06 +00:00			`notepath = self.broker.real_notepath(paper.citekey, note_extension)`
Add note annotation appending 2022-12-22 19:52:10 +00:00			`if check_file(notepath, fail=False):`
			`self._append_to_note(notepath, annotations)`
			`else:`
			`self._write_new_note(notepath, annotations)`
Add message on successful note writing 2022-12-22 20:10:56 +00:00			`self.ui.info(f"Wrote annotations to {paper.citekey} note {notepath}.")`
Add note annotation appending 2022-12-22 19:52:10 +00:00
Add option to edit notes after writing 2022-12-22 17:34:23 +00:00			`if edit is True:`
			`self.ui.edit_file(notepath, temporary=False)`
Add NoteEvent as notes get edited 2022-12-22 19:28:27 +00:00			`NoteEvent(paper.citekey).send()`
initial commit 2022-12-22 16:43:06 +00:00
Add note annotation appending 2022-12-22 19:52:10 +00:00			`def _write_new_note(self, notepath, annotations):`
Add simple docstrings 2022-12-22 20:02:01 +00:00			`"""Create a new note containing the annotations.`

			`Will create a new note in the notes folder of pubs`
			`and fill it with the annotations extracted from pdf.`
			`"""`
Add note annotation appending 2022-12-22 19:52:10 +00:00			`output = "# Annotations\n\n"`
			`for annotation in annotations:`
			`output += f"> {annotation}\n\n"`
			`write_file(notepath, output, "w")`

			`def _append_to_note(self, notepath, annotations):`
			`"""Append new annotations to the end of a note.`

			`Looks through note to determine any new annotations which should be`
			`added and adds them to the end of the note file.`
			`"""`
			`existing = read_text_file(notepath)`
			`# removed annotations already found in the note`
			`existing_dropped = [x for x in annotations if x not in existing]`
			`if not existing_dropped:`
			`return`

			`output = ""`
			`for annotation in existing_dropped:`
			`output += f"> {annotation}\n\n"`
			`write_file(notepath, output, "a")`

initial commit 2022-12-22 16:43:06 +00:00
Add extracting annotations on document import 2022-12-22 19:24:20 +00:00			`@DocAddEvent.listen()`
			`def modify_event(event):`
			`if ExtractPlugin.is_loaded():`
			`plg = ExtractPlugin.get_instance()`
			`if plg.onimport:`
			`all_annotations = plg.extract([event.citekey])`
			`if all_annotations[0][1]:`
Refactor plugin to remove carrying config around 2022-12-22 20:10:42 +00:00			`plg._to_notes(all_annotations, plg.note_extension)`
Add extracting annotations on document import 2022-12-22 19:24:20 +00:00			`plg.ui.info(f"Imported {event.citekey} annotations.")`