pubs-extract/extract/extract.py

import os
import argparse

# from subprocess import Popen, PIPE, STDOUT
# from pipes import quote as shell_quote

import fitz

from pubs.plugins import PapersPlugin
from pubs.events import DocAddEvent, NoteEvent

from pubs import repo
from pubs.utils import resolve_citekey_list
from pubs.content import write_file


class ExtractPlugin(PapersPlugin):
    """Extract annotations from the documents attached to pubs entries.

    The plugin opens the document of a paper with PyMuPDF (``fitz``), collects
    the text of every annotation it finds and either prints the result to
    standard out or writes it to the note file of the paper.

    It adds the `pubs extract` subcommand. The `onimport` option of the
    `extract` plugin configuration section is read into `self.onimport`.
    """

    name = "extract"
    description = "Extract annotations from pubs documents"

    def __init__(self, conf, ui):
        """Open the repository and read the plugin configuration.

        Args:
            conf: pubs configuration; `conf["main"]["pubsdir"]` and
                `conf["plugins"]` are read here.
            ui: pubs user interface object, used for messages and for
                opening notes in the editor.
        """
        self.ui = ui
        self.conf = conf
        self.repository = repo.Repository(conf)
        self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
        self.broker = self.repository.databroker

        # TODO implement custom annotation formatting, akin to main config citekey format
        # e.g. `> [{page}] {annotation}`
        # or `:: {annotation} :: {page} ::`
        # and so on
        self.onimport = conf["plugins"].get("extract", {}).get("onimport", False)

    def update_parser(self, subparsers, conf):
        """Register the `pubs extract` subcommand and its options."""
        # TODO option for ignoring missing documents or erroring.
        extract_parser = subparsers.add_parser(self.name, help=self.description)
        extract_parser.add_argument(
            "citekeys",
            nargs=argparse.REMAINDER,
            help="citekey(s) of the documents to extract from",
        )
        extract_parser.add_argument(
            "-w",
            "--write",
            help="write to individual notes instead of standard out. CAREFUL: OVERWRITES NOTES CURRENTLY",
            action="store_true",
            # A boolean flag should default to a boolean, like --edit below.
            default=False,
        )
        extract_parser.add_argument(
            "-e",
            "--edit",
            help="open each note in editor for manual editing after extracting annotations to it",
            action="store_true",
            default=False,
        )
        extract_parser.set_defaults(func=self.command)

    def command(self, conf, args):
        """Run the annotation extraction for the citekeys given on the command line.

        Annotations are written to the notes when `args.write` is set and
        printed to standard out otherwise. The repository is closed before
        returning, whichever path is taken.
        """
        citekeys = resolve_citekey_list(
            self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True
        )
        try:
            if not citekeys:
                return
            all_annotations = self.extract(citekeys)
            if args.write:
                self._to_notes(
                    all_annotations, conf["main"]["note_extension"], args.edit
                )
            else:
                self._to_stdout(all_annotations)
        finally:
            # Also reached on the early return above and when extraction fails.
            self.repository.close()

    def extract(self, citekeys):
        """Collect the annotations of every paper named in `citekeys`.

        Returns:
            A list of `(paper, annotations)` tuples, where `annotations` is a
            list of strings. A paper without a document gets an empty list;
            a paper whose document raises `fitz.FileDataError` is reported
            through the ui and left out of the result.
        """
        papers_annotated = []
        for paper in self._gather_papers(citekeys):
            doc_path = self._get_file(paper)
            if not doc_path:
                # _get_file already warned; there is nothing to open, so do
                # not hand the missing path to fitz.
                papers_annotated.append((paper, []))
                continue
            try:
                annotations = self._get_annotations(doc_path)
            except fitz.FileDataError as e:
                self.ui.error(f"Document {doc_path} is broken: {e}")
                continue
            papers_annotated.append((paper, annotations))
        return papers_annotated

    def _gather_papers(self, citekeys):
        """Pull the paper object for each citekey from the repository."""
        return [self.repository.pull_paper(key) for key in citekeys]

    def _get_file(self, paper):
        """Return the real path of the paper's document.

        A warning is shown through the ui when the resolved path is falsy;
        the falsy value is returned unchanged in that case.
        """
        path = self.broker.real_docpath(paper.docpath)
        if not path:
            self.ui.warning(f"{paper.citekey} has no valid document.")
        return path

    def _get_annotations(self, filename):
        """Read all non-empty annotations from the document at `filename`.

        The text covered by an annotation is preferred; the annotation's
        `content` info entry is used when that text is empty. Line breaks are
        turned into spaces in either case.

        Returns:
            A list of strings formatted as `[page] text`, with pages counted
            from 1.
        """
        annotations = []
        with fitz.Document(filename) as doc:
            for page in doc:
                for annot in page.annots():
                    raw = annot.get_text() or annot.info["content"]
                    # Normalise whichever source was used; joining lines with
                    # a space keeps the words on both sides separated.
                    content = raw.replace("\n", " ").strip()
                    if content:
                        # page.number is 0-based; readers expect 1-based pages.
                        annotations.append(f"[{page.number + 1}] {content}")
        return annotations

    def _to_stdout(self, annotated_papers):
        """Print the annotations of all papers, grouped by citekey.

        Papers without annotations are skipped.
        """
        lines = []
        for paper, annotations in annotated_papers:
            if not annotations:
                continue
            lines.append(f"{paper.citekey}\n")
            lines.extend(f'> "{annot}"\n' for annot in annotations)
            lines.append("\n")
        print("".join(lines))

    def _to_notes(self, annotated_papers, note_extension="txt", edit=False):
        """Write the annotations of each paper to its note file.

        The note is overwritten. Papers without annotations are skipped, so
        their notes are left untouched. A `NoteEvent` is sent for every note
        written.

        Args:
            annotated_papers: list of `(paper, annotations)` tuples.
            note_extension: file extension passed on to resolve the note path.
            edit: open each written note in the editor afterwards.
        """
        for paper, annotations in annotated_papers:
            if not annotations:
                continue
            notepath = self.broker.real_notepath(paper.citekey, note_extension)
            quotes = "".join(f"> {annotation}\n\n" for annotation in annotations)
            write_file(notepath, "# Annotations\n\n" + quotes, "w")
            if edit:
                self.ui.edit_file(notepath, temporary=False)
            NoteEvent(paper.citekey).send()


@DocAddEvent.listen()
def modify_event(event):
    """Write a document's annotations to its note when a document is added.

    Does nothing unless the extract plugin is loaded and its `onimport`
    setting is truthy. The note is only written, and the info message only
    shown, when annotations were actually found.
    """
    if not ExtractPlugin.is_loaded():
        return
    plg = ExtractPlugin.get_instance()
    if not plg.onimport:
        return
    all_annotations = plg.extract([event.citekey])
    # extract() leaves out papers whose document cannot be read, so the list
    # may be empty; check that before indexing into it.
    if all_annotations and all_annotations[0][1]:
        plg._to_notes(all_annotations, plg.conf["main"]["note_extension"])
        plg.ui.info(f"Imported {event.citekey} annotations.")
initial commit 2022-12-22 16:43:06 +00:00			`import os`
			`import argparse`

			`# from subprocess import Popen, PIPE, STDOUT`
			`# from pipes import quote as shell_quote`

			`import fitz`

Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`from pubs.plugins import PapersPlugin`
Add NoteEvent as notes get edited 2022-12-22 19:28:27 +00:00			`from pubs.events import DocAddEvent, NoteEvent`
initial commit 2022-12-22 16:43:06 +00:00
Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`from pubs import repo`
			`from pubs.utils import resolve_citekey_list`
			`from pubs.content import write_file`
initial commit 2022-12-22 16:43:06 +00:00

			`class ExtractPlugin(PapersPlugin):`
			`"""Make the pubs repository also a git repository.`

			`The git plugin creates a git repository in the pubs directory`
			`and commit the changes to the pubs repository.`

			It also add the `pubs git` subcommand, so git commands can be executed
			`in the git repository from the command line.`
			`"""`

			`name = "extract"`
			`description = "Extract annotations from pubs documents"`

			`def __init__(self, conf, ui):`
			`self.ui = ui`
Add extracting annotations on document import 2022-12-22 19:24:20 +00:00			`self.conf = conf`
initial commit 2022-12-22 16:43:06 +00:00			`self.repository = repo.Repository(conf)`
Refactor annotations to list of paper,annotations tuples 2022-12-22 17:32:11 +00:00			`self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])`
			`self.broker = self.repository.databroker`
Close repository after completion 2022-12-22 17:34:33 +00:00
Update comments 2022-12-22 19:31:28 +00:00			`# TODO implement custom annotation formatting, akin to main config citekey format`
			# e.g. `> [{page}] {annotation}`
			# or `:: {annotation} :: {page} ::`
			`# and so on`
Add extracting annotations on document import 2022-12-22 19:24:20 +00:00			`self.onimport = conf["plugins"].get("extract", {}).get("onimport", False)`
initial commit 2022-12-22 16:43:06 +00:00			`# self.manual = conf['plugins'].get('git', {}).get('manual', False)`
			`# self.force_color = conf['plugins'].get('git', {}).get('force_color', True)`
			`# self.list_of_changes = []`

			`def update_parser(self, subparsers, conf):`
			`"""Allow the usage of the pubs git subcommand"""`
			`# TODO option for ignoring missing documents or erroring.`
			`extract_parser = subparsers.add_parser(self.name, help=self.description)`
			`extract_parser.add_argument(`
			`"citekeys",`
			`nargs=argparse.REMAINDER,`
			`help="citekey(s) of the documents to extract from",`
			`)`
Add option to enable writing to notes 2022-12-22 17:33:33 +00:00			`# TODO option for writing to stdout or notes`
			`extract_parser.add_argument(`
			`"-w",`
			`"--write",`
			`help="write to individual notes instead of standard out. CAREFUL: OVERWRITES NOTES CURRENTLY",`
Format file 2022-12-22 19:32:06 +00:00			`action="store_true",`
Add option to enable writing to notes 2022-12-22 17:33:33 +00:00			`default=None,`
			`)`
Add option to edit notes after writing 2022-12-22 17:34:23 +00:00			`extract_parser.add_argument(`
			`"-e",`
			`"--edit",`
			`help="open each note in editor for manual editing after extracting annotations to it",`
Format file 2022-12-22 19:32:06 +00:00			`action="store_true",`
Add option to edit notes after writing 2022-12-22 17:34:23 +00:00			`default=False,`
			`)`
initial commit 2022-12-22 16:43:06 +00:00			`extract_parser.set_defaults(func=self.command)`

			`def command(self, conf, args):`
			`"""Run the annotation extraction"""`
			`citekeys = resolve_citekey_list(`
			`self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True`
			`)`
			`if not citekeys:`
			`return`
Refactor extract function to take citekeys not papers 2022-12-22 18:39:14 +00:00			`all_annotations = self.extract(citekeys)`
Add option to enable writing to notes 2022-12-22 17:33:33 +00:00			`if args.write:`
Add extracting annotations on document import 2022-12-22 19:24:20 +00:00			`self._to_notes(all_annotations, conf["main"]["note_extension"], args.edit)`
Add option to enable writing to notes 2022-12-22 17:33:33 +00:00			`else:`
Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`self._to_stdout(all_annotations)`
Close repository after completion 2022-12-22 17:34:33 +00:00			`self.repository.close()`
initial commit 2022-12-22 16:43:06 +00:00
Refactor extract function to take citekeys not papers 2022-12-22 18:39:14 +00:00			`def extract(self, citekeys):`
			`papers = self._gather_papers(citekeys)`
Refactor annotations to list of paper,annotations tuples 2022-12-22 17:32:11 +00:00			`papers_annotated = []`
initial commit 2022-12-22 16:43:06 +00:00			`for paper in papers:`
Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`file = self._get_file(paper)`
initial commit 2022-12-22 16:43:06 +00:00			`try:`
Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`papers_annotated.append((paper, self._get_annotations(file)))`
initial commit 2022-12-22 16:43:06 +00:00			`except fitz.FileDataError as e:`
Improve stdout rendering 2022-12-22 18:39:36 +00:00			`self.ui.error(f"Document {file} is broken: {e}")`
initial commit 2022-12-22 16:43:06 +00:00			`return papers_annotated`

Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`def _gather_papers(self, citekeys):`
initial commit 2022-12-22 16:43:06 +00:00			`papers = []`
			`for key in citekeys:`
			`papers.append(self.repository.pull_paper(key))`
			`return papers`

Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`def _get_file(self, paper):`
initial commit 2022-12-22 16:43:06 +00:00			`path = self.broker.real_docpath(paper.docpath)`
			`if not path:`
Improve stdout rendering 2022-12-22 18:39:36 +00:00			`self.ui.warning(f"{paper.citekey} has no valid document.")`
initial commit 2022-12-22 16:43:06 +00:00			`return path`

Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`def _get_annotations(self, filename):`
initial commit 2022-12-22 16:43:06 +00:00			`annotations = []`
			`with fitz.Document(filename) as doc:`
			`for page in doc:`
			`for annot in page.annots():`
			`content = annot.get_text() or annot.info["content"].replace(`
			`"\n", ""`
			`)`
			`if content:`
			`annotations.append(f"[{page.number}] {content}")`
			`return annotations`

Delineate internal from interface methods 2022-12-22 17:57:41 +00:00			`def _to_stdout(self, annotated_papers):`
Improve stdout rendering 2022-12-22 18:39:36 +00:00			`output = ""`
Refactor annotations to list of paper,annotations tuples 2022-12-22 17:32:11 +00:00			`for contents in annotated_papers:`
			`paper = contents[0]`
			`annotations = contents[1]`
initial commit 2022-12-22 16:43:06 +00:00			`if annotations:`
Format file 2022-12-22 19:32:06 +00:00			`output += f"{paper.citekey}\n"`
initial commit 2022-12-22 16:43:06 +00:00			`for annot in annotations:`
Format file 2022-12-22 19:32:06 +00:00			`output += f'> "{annot}"\n'`
			`output += "\n"`
Improve stdout rendering 2022-12-22 18:39:36 +00:00			`print(output)`
initial commit 2022-12-22 16:43:06 +00:00
Add extracting annotations on document import 2022-12-22 19:24:20 +00:00			`def _to_notes(self, annotated_papers, note_extension="txt", edit=False):`
Add writing annotations to notes 2022-12-22 17:33:00 +00:00			`for contents in annotated_papers:`
			`paper = contents[0]`
			`annotations = contents[1]`
			`if annotations:`
Format file 2022-12-22 19:32:06 +00:00			`notepath = self.broker.real_notepath(paper.citekey, note_extension)`
Add writing annotations to notes 2022-12-22 17:33:00 +00:00			`output = "# Annotations\n\n"`
			`for annotation in annotations:`
Format file 2022-12-22 19:32:06 +00:00			`output += f"> {annotation}\n\n"`
			`write_file(notepath, output, "w")`
Add option to edit notes after writing 2022-12-22 17:34:23 +00:00			`if edit is True:`
			`self.ui.edit_file(notepath, temporary=False)`
Add NoteEvent as notes get edited 2022-12-22 19:28:27 +00:00			`NoteEvent(paper.citekey).send()`
initial commit 2022-12-22 16:43:06 +00:00

Add extracting annotations on document import 2022-12-22 19:24:20 +00:00			`@DocAddEvent.listen()`
			`def modify_event(event):`
			`if ExtractPlugin.is_loaded():`
			`plg = ExtractPlugin.get_instance()`
			`if plg.onimport:`
			`all_annotations = plg.extract([event.citekey])`
			`if all_annotations[0][1]:`
			`plg._to_notes(all_annotations, plg.conf["main"]["note_extension"])`
			`plg.ui.info(f"Imported {event.citekey} annotations.")`