commit 4206554950319307b4f01bf0e38ef3e44e8ae30d Author: Marty Oehme Date: Thu Dec 22 17:43:06 2022 +0100 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b015b65 --- /dev/null +++ b/.gitignore @@ -0,0 +1,206 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python,linux,vim +# Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,vim + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + + +### Vim ### +# Swap (remove the "!*.svg" rule below if you don't need vector files) +[._]*.s[a-v][a-z] +!*.svg +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +# End of https://www.toptal.com/developers/gitignore/api/python,linux,vim diff --git a/extract/__init__.py b/extract/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/extract/__pycache__/__init__.cpython-310.pyc b/extract/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..f822506 Binary files /dev/null and b/extract/__pycache__/__init__.cpython-310.pyc differ diff --git a/extract/__pycache__/extract.cpython-310.pyc b/extract/__pycache__/extract.cpython-310.pyc new file mode 100644 index 0000000..7b308c0 Binary files /dev/null and b/extract/__pycache__/extract.cpython-310.pyc differ diff --git
# extract/extract.py
"""Pubs plugin that extracts annotations from the documents of papers."""

import os
import sys
import argparse
from typing import Dict, List, Tuple

# from subprocess import Popen, PIPE, STDOUT
# from pipes import quote as shell_quote

import fitz

# from ... import uis
from ...plugins import PapersPlugin
from ...events import PaperChangeEvent, PostCommandEvent

from ... import repo
from ...utils import resolve_citekey_list
from ...filebroker import DocBroker


class ExtractPlugin(PapersPlugin):
    """Extract annotations from the documents attached to pubs papers.

    The plugin adds the `pubs extract` subcommand. For every citekey given
    on the command line it opens the paper's document with `fitz` and
    collects the text of each annotation found, then prints the result to
    stdout grouped by citekey.
    """

    name = "extract"
    description = "Extract annotations from pubs documents"

    def __init__(self, conf, ui):
        """Store the ui and set up document/repository access from `conf`.

        Reads `conf["main"]["pubsdir"]` and the optional
        `conf["plugins"]["extract"]["quiet"]` flag (default False).
        """
        self.ui = ui
        self.pubsdir = os.path.expanduser(conf["main"]["pubsdir"])
        self.broker = DocBroker(self.pubsdir)
        self.repository = repo.Repository(conf)
        self.quiet = conf["plugins"].get("extract", {}).get("quiet", False)

    def update_parser(self, subparsers, conf):
        """Register the `extract` subcommand and its citekey arguments."""
        # TODO option for quiet/loud mode.
        # TODO option for ignoring missing documents or erroring.
        # TODO option for writing to stdout or notes
        extract_parser = subparsers.add_parser(self.name, help=self.description)
        extract_parser.add_argument(
            "citekeys",
            nargs=argparse.REMAINDER,
            help="citekey(s) of the documents to extract from",
        )
        extract_parser.set_defaults(func=self.command)

    def command(self, conf, args):
        """Run the annotation extraction for the citekeys in `args`."""
        citekeys = resolve_citekey_list(
            self.repository, conf, args.citekeys, ui=self.ui, exit_on_fail=True
        )
        if not citekeys:
            return
        papers = self.gather_papers(citekeys)
        all_annotations = self.extract(papers)
        self.to_stdout(all_annotations)

    def extract(self, papers) -> Dict:
        """Return a mapping of citekey to the list of extracted annotations.

        Papers without a usable document, or whose document cannot be read,
        are reported through the ui and left out of the result.
        """
        papers_annotated: Dict = {}
        for paper in papers:
            file = self.get_file(paper)
            if not file:
                # get_file has already reported the problem; there is
                # nothing to open for this paper.
                continue
            try:
                papers_annotated[paper.citekey] = self.get_annotations(file)
            except fitz.FileDataError as e:
                # Report through the ui (like get_file does) so the message
                # does not get mixed into the annotations written to stdout.
                self.ui.error(f"Document {file} is broken: {e}")
                continue
            if not self.quiet:
                # TODO find pretty print functionality
                self.ui.info(f"Extracted annotations from {paper.citekey}")
        return papers_annotated

    def gather_papers(self, citekeys):
        """Return the papers pulled from the repository for `citekeys`."""
        return [self.repository.pull_paper(key) for key in citekeys]

    def get_file(self, paper):
        """Return the real path of the paper's document, or None.

        None is returned, and an error reported through the ui, when the
        paper has no document path or the resolved path is not a file.
        """
        docpath = paper.docpath
        # Only resolve a path when the paper actually references a document.
        path = self.broker.real_docpath(docpath) if docpath else None
        if not path or not os.path.isfile(path):
            self.ui.error(f"{paper.citekey} has no valid document.")
            return None
        return path

    def get_annotations(self, filename) -> List[str]:
        """Return the annotations of the document as `[page] text` strings.

        For each annotation the text it covers is used; when that is empty
        the annotation's own content is used instead, with newlines removed.
        Annotations without any text are skipped.
        """
        annotations = []
        with fitz.Document(filename) as doc:
            for page in doc:
                for annot in page.annots():
                    content = annot.get_text() or annot.info.get(
                        "content", ""
                    ).replace("\n", "")
                    if content:
                        # NOTE(review): page.number is used as reported by
                        # fitz; confirm whether a 1-based number is wanted.
                        annotations.append(f"[{page.number}] {content}")
        return annotations

    def to_stdout(self, annotated_papers):
        """Print each citekey followed by its quoted annotations."""
        for citekey, annotations in annotated_papers.items():
            if annotations:
                print(f"{citekey}")
                for annot in annotations:
                    print(f"> \"{annot}\"")
                print("")

    def to_notes(self, filename, annotations):
        """Write the annotations to a markdown file next to `filename`.

        The output file has the same path as `filename` with its extension
        replaced by `.md`; an existing file is overwritten.
        """
        notes_path = f"{os.path.splitext(filename)[0]}.md"
        with open(notes_path, "w", encoding="utf-8") as out:
            out.write("# Annotations:\n\n")
            # One annotation per line; writelines() alone adds no separators.
            out.writelines(f"{annotation}\n" for annotation in annotations)
            out.write("\n---")


@PaperChangeEvent.listen()
def paper_change_event(event):
    """Listener for paper changes; currently does nothing."""


@PostCommandEvent.listen()
def git_commit(event):
    """Listener run after each command; currently does nothing."""