diff --git a/.woodpecker/lint.yml b/.woodpecker/lint.yml new file mode 100644 index 0000000..eb67dbe --- /dev/null +++ b/.woodpecker/lint.yml @@ -0,0 +1,16 @@ +pipeline: + lint_ruff: + image: python + commands: + - pip install ruff + - python --version && ruff --version + - echo "----------------- running ruff lint ------------------" + - ruff check . + + lint_black: + image: python + commands: + - pip install black + - python --version && black --version + - echo "----------------- running black lint ----------------" + - black --check . diff --git a/.woodpecker/static_analysis.yml b/.woodpecker/static_analysis.yml new file mode 100644 index 0000000..786af22 --- /dev/null +++ b/.woodpecker/static_analysis.yml @@ -0,0 +1,9 @@ +pipeline: + pyright: + image: nikolaik/python-nodejs + commands: + - npm install --global pyright + - poetry install + - python --version && poetry --version && pyright --version + - echo "------------- running pyright typecheck -------------" + - poetry run pyright diff --git a/.woodpecker/test.yml b/.woodpecker/test.yml new file mode 100644 index 0000000..44e264d --- /dev/null +++ b/.woodpecker/test.yml @@ -0,0 +1,10 @@ +branches: master + +pipeline: + pytest: + image: ghcr.io/withlogicco/poetry:1.5.1 + commands: + - poetry install + - python --version && poetry --version + - echo "------------- running pytest -------------" + - poetry run pytest diff --git a/README.md b/README.md index b403962..d2d1574 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # papis-extract - Quickly extract annotations from your pdf files with the help of the [papis](https://github.com/papis/papis) bibliography manager.\ @@ -11,11 +11,14 @@ Easily organize all your highlights and thoughts next to your documents and refe ## Installation: -You can install from pypi with `pip install git+`. +You can install directly from the git repository with `pip install git+https://git.martyoeh.me/Marty/papis-extract.git`. That's it! 
If you have papis and papis-extract installed in the same environment (whether virtual or global), everything should now be set up. +I am currently working towards the first release for pypi, see the below roadmap; +when that is done you will also be able to install in the usual pypi way. + To check if everything is working you should now see the `extract` command listed when running `papis --help`. You will be set up with the default options but if you want to change anything, read on in configuration below. @@ -166,27 +169,46 @@ The option should generally not take too much tuning, but it is there if you nee This should generally be an alright default but is here to be changed for example if you work with a lot of different annotation colors (where dark purple and light purple may different meanings) and get false positives in automatic tag recognition, or no tags are recognized at all. -## Roadmap +## Roadmap to first release Known issues to be fixed: - [x] if both content and text are empty, do not extract an annotation - [x] Speed? - should be fine, on my machine (old i5 laptop) it takes around 90s for ~1000 documents with ~4000 annotations -- [ ] ensure all cmdline options do what they should +- [x] ensure all cmdline options do what they should +- [ ] annotations carry over color object from fitz, should just be Color object or simple tuple with rgb vals +- [ ] docstrings, docstrings! +- [ ] testing testing testing!! + - [ ] refactor into some better abstractions (e.g. 
Exporter Protocol -> stdout/markdown implementations; Extractor Protocol -> PDF implementation) features to be implemented: -- [ ] on_add hook to extract annotations as files are added +- [ ] CICD + - [x] static analysis (lint, typecheck etc) on pushes + - [x] test pipeline on master pushes + - [ ] release pipeline to pypi on tags - [ ] add page number if available + - exists in Annotation, just need to place in output - [ ] show overall amount of extractions at the end - [ ] custom formatting decided by user + - in config as { "myformatter": ">{tag}\n{quote}\n{note}\n{page} etc"} - [ ] improved default exporters - markdown into notes - pretty display on stdout (rich?) - csv/tsv to stdout - table fmt stdout? -- [ ] arbitrary color -> name settings not dependent on color name existing +- [ ] allow custom colors -> tag name settings not dependent on color name existing (e.g. {"important": (1.0,0.0,0.0)}) +- [ ] `--overwrite` mode where existing annotations are not dropped but overwritten on same line of note +- [ ] `--force` mode where we simply do not drop anything +- [ ] `--format` option to choose from default or set up a custom formatter +- [ ] on_add hook to extract annotations as files are added + - needs upstream help, 'on_add' hook, and pass-through of affected documents + +upstream changes: + +- [ ] need a hook for adding a document/file +- [ ] need hooks to actually pass through information on the thing they worked on (i.e. 
their document) ## Issues diff --git a/papis_extract/extractor.py b/papis_extract/extractor.py index 9467231..88ff012 100644 --- a/papis_extract/extractor.py +++ b/papis_extract/extractor.py @@ -42,7 +42,7 @@ def start(filename: Path) -> list[Annotation]: def _tag_from_colorname(colorname: str) -> str: - color_mapping: dict[str,str] = getdict("tags", "plugins.extract") + color_mapping: dict[str, str] = getdict("tags", "plugins.extract") if not color_mapping: return "" @@ -82,6 +82,7 @@ def _retrieve_annotation_content( # just a highlight without any text return (None, None) + # mimics the functions in papis.config.{getlist,getint,getfloat} etc. def getdict(key: str, section: Optional[str] = None) -> dict[str, str]: """Dict getter @@ -97,13 +98,14 @@ def getdict(key: str, section: Optional[str] = None) -> dict[str, str]: rawvalue = eval(rawvalue) except Exception: raise SyntaxError( - "The key '{}' must be a valid Python object: {}" - .format(key, rawvalue)) + "The key '{}' must be a valid Python object: {}".format(key, rawvalue) + ) else: if not isinstance(rawvalue, dict): raise SyntaxError( - "The key '{}' must be a valid Python dict. Got: {} (type {!r})" - .format(key, rawvalue, type(rawvalue).__name__)) + "The key '{}' must be a valid Python dict. 
Got: {} (type {!r})".format( + key, rawvalue, type(rawvalue).__name__ + ) + ) return rawvalue - diff --git a/poetry.lock b/poetry.lock index e225e20..8d6734d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -224,6 +224,17 @@ files = [ {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, ] +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + [[package]] name = "isbnlib" version = "3.10.14" @@ -456,6 +467,17 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=0.29.35)"] +[[package]] +name = "packaging" +version = "23.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, +] + [[package]] name = "papis" version = "0.13" @@ -504,6 +526,21 @@ files = [ {file = "pbr-5.11.1.tar.gz", hash = "sha256:aefc51675b0b533d56bb5fd1c8c6c0522fe31896679882e1c4c63d5e4a0fccb3"}, ] +[[package]] +name = "pluggy" +version = "1.3.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"}, + {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", 
"pytest-benchmark"] + [[package]] name = "prompt-toolkit" version = "3.0.39" @@ -534,36 +571,36 @@ plugins = ["importlib-metadata"] [[package]] name = "pymupdf" -version = "1.23.1" +version = "1.23.2" description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents." optional = false python-versions = ">=3.8" files = [ - {file = "PyMuPDF-1.23.1-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:eb85e8c8d5ceebe078206f779e5cd918919616e865e081fad3993f90159e61c1"}, - {file = "PyMuPDF-1.23.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:a3a23857d655180f6147dbc9886f109e627b6654e37ba5088d1b038f6d7861c0"}, - {file = "PyMuPDF-1.23.1-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:9216ca0805892b481be9544f3d3f9efbc06258fd26ba95295d540c2fa0ae0fea"}, - {file = "PyMuPDF-1.23.1-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:c8173a1b8c8539ee466f74263a7c6bdf1427b8f5dc32c66ccaca22222a89e339"}, - {file = "PyMuPDF-1.23.1-cp310-none-win32.whl", hash = "sha256:0d709d3d7c9b894ad5400fa5037bf34e82133b3386b5f3f7bb60b16e9666a1bb"}, - {file = "PyMuPDF-1.23.1-cp310-none-win_amd64.whl", hash = "sha256:11af6e9e86e2a4ad1ecf8085bb64a48323614769188fbf5eaa5a198acef5de39"}, - {file = "PyMuPDF-1.23.1-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:08b93208eaccac85a78b7aa2b3020645824fb97dda2ab4f6d3a07290ac99b078"}, - {file = "PyMuPDF-1.23.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:ead3422bc17607e8e174e87b9d66639b10f5649fbb55c3cde13d0ba937e9c9f8"}, - {file = "PyMuPDF-1.23.1-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:322ef02cefad4c705461d64a89b33bf95147958277fd422206c4ea32323dc89e"}, - {file = "PyMuPDF-1.23.1-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:8ce4dcef3df3c959053a5f35df8dd35c9272cfe5fd3df54039ea3e270a9ef69c"}, - {file = "PyMuPDF-1.23.1-cp311-none-win32.whl", hash = "sha256:245ba657c3f8e39a7e2b17ede676f8519031f70f11963cb1f39b76c00aace3cf"}, - {file = 
"PyMuPDF-1.23.1-cp311-none-win_amd64.whl", hash = "sha256:55109e3c49f7854e2a471ca92bef5e5f8324841474a3074fb03bee99537d11cd"}, - {file = "PyMuPDF-1.23.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:9f48302062e8d3d2440d197379a1ac9da201c78b2f80c414bccf7a0a4109ac16"}, - {file = "PyMuPDF-1.23.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:67a442a2235ba82fb4b604e1f2acb023131fcb5be4263d615dfef10319b1d617"}, - {file = "PyMuPDF-1.23.1-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:83db19d9d3b27fc14f66d222623a15b014db88f04682a33f9ed4054c31a65184"}, - {file = "PyMuPDF-1.23.1-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:756f24923bb28a8165f603ebe34ae72f951d9fc29ed7d72d8a3dad48001c19c4"}, - {file = "PyMuPDF-1.23.1-cp38-none-win32.whl", hash = "sha256:a69279f4b85db33733eecb4cc42b1b959775b31f61931ae6de2b0d640ac82ad8"}, - {file = "PyMuPDF-1.23.1-cp38-none-win_amd64.whl", hash = "sha256:89d54dce9ca91f204e7f9cf4f91111cf6feede08f710259682f0f3bd2bc77ba7"}, - {file = "PyMuPDF-1.23.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:0d57716ff547a17365eed5f1485fac33b60d4c15776f041503b718834a9a8fe1"}, - {file = "PyMuPDF-1.23.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:bdbf28e9ce255b8a221d1015f8a86810f70d1db45b6c4f90033fa8d8c2fb2b11"}, - {file = "PyMuPDF-1.23.1-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:6ac7ee501ed13932878d52bb291ef7abf88e9bceb6c04144bd896cd079b193c0"}, - {file = "PyMuPDF-1.23.1-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:74e20395bbaa5fd5bc815e5459f017edeb668ddf1ddf4f848f05587ec71b1326"}, - {file = "PyMuPDF-1.23.1-cp39-none-win32.whl", hash = "sha256:25f554c861039ade6fcdb12cd17c942687e530afdfd337d72820529dc9bd440d"}, - {file = "PyMuPDF-1.23.1-cp39-none-win_amd64.whl", hash = "sha256:a3b2ce649b5bba352eebb4415529c81ec937595f751cc02079be2b314c5972b5"}, - {file = "PyMuPDF-1.23.1.tar.gz", hash = "sha256:34cac956024f1a30f5204a1a987d7b2c8d4b2b5df57806b82c2842e3e108786b"}, + {file = "PyMuPDF-1.23.2-cp310-none-macosx_10_9_x86_64.whl", 
hash = "sha256:701aff64dbf0635c2c875b518979b46b935ed4d3b3d2aee1c449e2960831d766"}, + {file = "PyMuPDF-1.23.2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:949c071b50825cf341f03546e7354cef942c36fcc071a72a0417c035d6ee7e33"}, + {file = "PyMuPDF-1.23.2-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:5209612dfc9038fbbb1a61dc01bd298d5279646d5e1c98cfe80878db3d862a3e"}, + {file = "PyMuPDF-1.23.2-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:f20ba77a61440220bd2e380ceef8a86bf51f97ac9374a8af00aeedea904dad46"}, + {file = "PyMuPDF-1.23.2-cp310-none-win32.whl", hash = "sha256:01c45723fbc389fac2ab8150e5ba80c357706ca69a74c29ec1a83a05921c53d1"}, + {file = "PyMuPDF-1.23.2-cp310-none-win_amd64.whl", hash = "sha256:ff799db717d5b0e423bd81fbae8131cf3463a80a642524a96952f6f3deaf2a95"}, + {file = "PyMuPDF-1.23.2-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:1f372bcc70b888f0c953add8b15627efb9f3cc2c7b8ad0916560b6081093932c"}, + {file = "PyMuPDF-1.23.2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:6389eb4bfc27264a951497847089e5e4485f6609c351ac321071d62881a21982"}, + {file = "PyMuPDF-1.23.2-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:0fc5f600b3a72c29a0944cbcbc1375962ad669023265c50cd1d8f794d7ae95f7"}, + {file = "PyMuPDF-1.23.2-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:4fc4a6183a7a8006b83476fc0b26d6fb849996050e8c3c911b9d6a66fe6ccc1e"}, + {file = "PyMuPDF-1.23.2-cp311-none-win32.whl", hash = "sha256:66f94d35fd48e2b5cbe70a4601f036f76cb826318b893994ab7bd4186a65e78f"}, + {file = "PyMuPDF-1.23.2-cp311-none-win_amd64.whl", hash = "sha256:1ec04285451231c68a024657b75d59a43ce0dcdade582edf3a9cc1d86c75b826"}, + {file = "PyMuPDF-1.23.2-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:40a713ea439548cf3c6bd910dc904cb868eae9d7bc1c2d0aebc04c84431822af"}, + {file = "PyMuPDF-1.23.2-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:f0d57e40bdbf6c6deacf94387d3aad918535d8723aa6e3a27e4bef1f3d52158a"}, + {file = "PyMuPDF-1.23.2-cp38-none-manylinux2014_aarch64.whl", hash = 
"sha256:2de9da709e14a0b32ca1ed7e268615189a8c1e76a26920dd45a92d9f0e207d1f"}, + {file = "PyMuPDF-1.23.2-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:7edc4b4542041a28f5644c09c1e670215ae014adc28a81d32786db73077d4cf3"}, + {file = "PyMuPDF-1.23.2-cp38-none-win32.whl", hash = "sha256:18f19be85f277a36536277f3f4991a2d1d1b9c2d0c3a515925e9bef41780efe0"}, + {file = "PyMuPDF-1.23.2-cp38-none-win_amd64.whl", hash = "sha256:a98cf7bb1ba8d64de78f443005c0f60c0c9644f73b3ebd57cbd20e232e2e5a30"}, + {file = "PyMuPDF-1.23.2-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:ac236156688627bca0a8062bb4153f77108f072dd4a06a80626fd089c2879e04"}, + {file = "PyMuPDF-1.23.2-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:c1a08531194d038e068641be92fdc31276efbee2b718a8dc4281dc593f1a99e7"}, + {file = "PyMuPDF-1.23.2-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:78b6c87fd375d1b017c63a426432be7ee4859f2142108b9c5dc8283599c112eb"}, + {file = "PyMuPDF-1.23.2-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:41d9cd45bb61cda890d446baeeded454fb4404086cf7f7e385e440123e9ecb56"}, + {file = "PyMuPDF-1.23.2-cp39-none-win32.whl", hash = "sha256:d34da29cd4305c4b85ea57528c1a31cfc6abfac7921d27153e633470e9dac104"}, + {file = "PyMuPDF-1.23.2-cp39-none-win_amd64.whl", hash = "sha256:86127075227f868a6b115eb96a74405539dde90168cd1a98781b0f1f6d4f9d7c"}, + {file = "PyMuPDF-1.23.2.tar.gz", hash = "sha256:32302d0eb0e28d60ba305f5d74702fb0fab2ed9d9f6b3a9d853429e5023bc6bb"}, ] [package.dependencies] @@ -598,6 +635,26 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pytest" +version = "7.4.0" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, + {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, +] + 
+[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + [[package]] name = "python-doi" version = "0.2.0" @@ -923,4 +980,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "2e158747df6100e105c98494d8b5f4b23b7076ae76295ce7a28facf02488ebd5" +content-hash = "d519605837788792d06ffc7bca7a92b315612ca6052227c53c558ec49dffec9f" diff --git a/pyproject.toml b/pyproject.toml index 8ee3741..4ca3257 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,10 @@ python-magic = "^0.4.27" [tool.poetry.plugins."papis.command"] extract = "papis_extract:main" + +[tool.poetry.group.dev.dependencies] +pytest = "^7.4.0" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" diff --git a/tests/test_annotation.py b/tests/test_annotation.py new file mode 100644 index 0000000..72c7a75 --- /dev/null +++ b/tests/test_annotation.py @@ -0,0 +1,7 @@ +from papis_extract.annotation_data import Annotation + + +def test_matches_colorname_exact(): + """A pure red stroke color resolves to the color name "red".""" + annotation = Annotation("testfile", colors={"stroke": (1.0, 0.0, 0.0)}) + assert annotation.colorname == "red"