pubs-extract/extract/extract.py

157 lines
5.5 KiB
Python
Raw Normal View History

2022-12-22 16:43:06 +00:00
import os
import argparse
# from subprocess import Popen, PIPE, STDOUT
# from pipes import quote as shell_quote
import fitz
from pubs.plugins import PapersPlugin
2022-12-22 19:28:27 +00:00
from pubs.events import DocAddEvent, NoteEvent
2022-12-22 16:43:06 +00:00
from pubs import repo
from pubs.utils import resolve_citekey_list
from pubs.content import write_file
2022-12-22 16:43:06 +00:00
class ExtractPlugin(PapersPlugin):
    """Extract annotations from the documents attached to pubs entries.

    The plugin adds the `pubs extract` subcommand, which reads the
    annotations of the documents belonging to the given citekeys and
    either prints them to standard out or writes them to the note of
    each entry.

    If the plugin option `onimport` is set in the configuration,
    annotations are also written to the note whenever a document is
    added to the repository.
    """

    name = "extract"
    description = "Extract annotations from pubs documents"

    def __init__(self, conf, ui):
        """Set up repository access and read the plugin configuration."""
        self.ui = ui
        self.conf = conf
        self.repository = repo.Repository(conf)
        self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
        self.broker = self.repository.databroker
        # TODO implement custom annotation formatting, akin to main config citekey format
        # e.g. `> [{page}] {annotation}`
        # or `:: {annotation} :: {page} ::`
        # and so on
        self.onimport = conf["plugins"].get("extract", {}).get("onimport", False)

    def update_parser(self, subparsers, conf):
        """Register the `pubs extract` subcommand and its options."""
        # TODO option for ignoring missing documents or erroring.
        extract_parser = subparsers.add_parser(self.name, help=self.description)
        extract_parser.add_argument(
            "citekeys",
            nargs=argparse.REMAINDER,
            help="citekey(s) of the documents to extract from",
        )
        extract_parser.add_argument(
            "-w",
            "--write",
            help="write to individual notes instead of standard out. CAREFUL: OVERWRITES NOTES CURRENTLY",
            action="store_true",
            default=None,
        )
        extract_parser.add_argument(
            "-e",
            "--edit",
            help="open each note in editor for manual editing after extracting annotations to it",
            action="store_true",
            default=False,
        )
        extract_parser.set_defaults(func=self.command)

    def command(self, conf, args):
        """Run the annotation extraction for the citekeys given in `args`.

        Annotations are written to the notes when `args.write` is set
        (optionally opening each note in the editor with `args.edit`),
        otherwise they are printed to standard out.
        """
        try:
            citekeys = resolve_citekey_list(
                self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True
            )
            if not citekeys:
                return
            all_annotations = self.extract(citekeys)
            if args.write:
                self._to_notes(
                    all_annotations, conf["main"]["note_extension"], args.edit
                )
            else:
                self._to_stdout(all_annotations)
        finally:
            # Close the repository on every exit path, including the
            # early return above and any error raised while extracting.
            self.repository.close()

    def extract(self, citekeys):
        """Return a list of `(paper, annotations)` tuples for `citekeys`.

        Papers without a valid document get an empty annotation list.
        Papers whose document cannot be read are reported through the
        ui and left out of the result.
        """
        papers_annotated = []
        for paper in self._gather_papers(citekeys):
            file = self._get_file(paper)
            if not file:
                # `_get_file` already warned; do not hand an empty path
                # to fitz, but keep one (empty) entry for the paper.
                papers_annotated.append((paper, []))
                continue
            try:
                papers_annotated.append((paper, self._get_annotations(file)))
            except fitz.FileDataError as e:
                self.ui.error(f"Document {file} is broken: {e}")
        return papers_annotated

    def _gather_papers(self, citekeys):
        """Pull the paper belonging to each citekey from the repository."""
        return [self.repository.pull_paper(key) for key in citekeys]

    def _get_file(self, paper):
        """Return the real document path of `paper`.

        Emits a warning if the resolved path is empty; the (falsy)
        path is still returned so callers can decide how to proceed.
        """
        path = self.broker.real_docpath(paper.docpath)
        if not path:
            self.ui.warning(f"{paper.citekey} has no valid document.")
        return path

    def _get_annotations(self, filename):
        """Return the annotations of the document at `filename`.

        Each annotation is rendered as `[page] text`, preferring the
        text covered by the annotation and falling back to the
        annotation's own content with newlines removed. Annotations
        without any text are skipped.
        """
        annotations = []
        with fitz.Document(filename) as doc:
            for page in doc:
                # fitz counts pages from 0; readers expect the first
                # page to be page 1.
                page_label = page.number + 1
                for annot in page.annots():
                    content = annot.get_text() or annot.info["content"].replace(
                        "\n", ""
                    )
                    if content:
                        annotations.append(f"[{page_label}] {content}")
        return annotations

    def _to_stdout(self, annotated_papers):
        """Print the annotations of all papers, grouped by citekey.

        Papers without annotations are not printed.
        """
        parts = []
        for paper, annotations in annotated_papers:
            if not annotations:
                continue
            parts.append(f"{paper.citekey}\n")
            parts.extend(f'> "{annot}"\n' for annot in annotations)
            parts.append("\n")
        print("".join(parts))

    def _to_notes(self, annotated_papers, note_extension="txt", edit=False):
        """Write the annotations of each paper to its note file.

        The note is overwritten. Papers without annotations are left
        untouched. With `edit` set, each written note is opened in the
        editor afterwards. A `NoteEvent` is sent for every written note.
        """
        for paper, annotations in annotated_papers:
            if not annotations:
                continue
            notepath = self.broker.real_notepath(paper.citekey, note_extension)
            output = "# Annotations\n\n" + "".join(
                f"> {annotation}\n\n" for annotation in annotations
            )
            write_file(notepath, output, "w")
            if edit:
                self.ui.edit_file(notepath, temporary=False)
            NoteEvent(paper.citekey).send()
2022-12-22 16:43:06 +00:00
@DocAddEvent.listen()
def modify_event(event):
    """Write a newly added document's annotations to its note.

    Only acts when the extract plugin is loaded and its `onimport`
    option is enabled. Nothing is written if no annotations were found.
    """
    if not ExtractPlugin.is_loaded():
        return
    plg = ExtractPlugin.get_instance()
    if not plg.onimport:
        return
    all_annotations = plg.extract([event.citekey])
    # `extract` can return an empty list (e.g. the document could not be
    # read), so check for an entry before indexing into it.
    if all_annotations and all_annotations[0][1]:
        plg._to_notes(all_annotations, plg.conf["main"]["note_extension"])
        plg.ui.info(f"Imported {event.citekey} annotations.")