Attempt to calculate annotation offsets
There are clear issues remaining with this approach. The wallabag-given 'start' and 'end' fields do _not_ just point to the n-th paragraph all the time (like I thought) but actually represent a beautifulsoup4 like tree descent. So: `p_start_match = re.match(r"/p\[(\d+)\]", annot["ranges"][0]["start"])` will fail on any annotation not just at the n-th paragraph. Instead we should see how we can move this tree into the beautifulsoup4 parser and make use of wallabag already having done the work for us?
This commit is contained in:
parent
6f79a12d2b
commit
1017b876e9
3 changed files with 101 additions and 19 deletions
|
@ -5,12 +5,12 @@ description = "Add your description here"
|
|||
readme = "README.md"
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"beautifulsoup4>=4.13.3",
|
||||
"lxml>=5.3.1",
|
||||
"requests>=2.32.3",
|
||||
# REQUIRED FOR CURRENT SHAARLI2HOARDER IMPLEMENTATION ONLY
|
||||
# "beautifulsoup4>=4.13.3",
|
||||
# "netscape-bookmarks-file-parser",
|
||||
# "pyjwt>=2.10.1",
|
||||
# "requests>=2.32.3",
|
||||
"requests>=2.32.3",
|
||||
]
|
||||
|
||||
[tool.pyright]
|
||||
|
|
64
uv.lock
64
uv.lock
|
@ -7,11 +7,30 @@ name = "2hoarder"
|
|||
version = "0.1.0"
|
||||
source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "lxml" },
|
||||
{ name = "requests" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [{ name = "requests", specifier = ">=2.32.3" }]
|
||||
requires-dist = [
|
||||
{ name = "beautifulsoup4", specifier = ">=4.13.3" },
|
||||
{ name = "lxml", specifier = ">=5.3.1" },
|
||||
{ name = "requests", specifier = ">=2.32.3" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.13.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "soupsieve" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f0/3c/adaf39ce1fb4afdd21b611e3d530b183bb7759c9b673d60db0e347fd4439/beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b", size = 619516 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f9/49/6abb616eb3cbab6a7cca303dc02fdf3836de2e0b834bf966a7f5271a34d8/beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16", size = 186015 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
|
@ -53,6 +72,31 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lxml"
|
||||
version = "5.3.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ef/f6/c15ca8e5646e937c148e147244817672cf920b56ac0bf2cc1512ae674be8/lxml-5.3.1.tar.gz", hash = "sha256:106b7b5d2977b339f1e97efe2778e2ab20e99994cbb0ec5e55771ed0795920c8", size = 3678591 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/94/1c/724931daa1ace168e0237b929e44062545bf1551974102a5762c349c668d/lxml-5.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c093c7088b40d8266f57ed71d93112bd64c6724d31f0794c1e52cc4857c28e0e", size = 8171881 },
|
||||
{ url = "https://files.pythonhosted.org/packages/67/0c/857b8fb6010c4246e66abeebb8639eaabba60a6d9b7c606554ecc5cbf1ee/lxml-5.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b0884e3f22d87c30694e625b1e62e6f30d39782c806287450d9dc2fdf07692fd", size = 4440394 },
|
||||
{ url = "https://files.pythonhosted.org/packages/61/72/c9e81de6a000f9682ccdd13503db26e973b24c68ac45a7029173237e3eed/lxml-5.3.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1637fa31ec682cd5760092adfabe86d9b718a75d43e65e211d5931809bc111e7", size = 5037860 },
|
||||
{ url = "https://files.pythonhosted.org/packages/24/26/942048c4b14835711b583b48cd7209bd2b5f0b6939ceed2381a494138b14/lxml-5.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a364e8e944d92dcbf33b6b494d4e0fb3499dcc3bd9485beb701aa4b4201fa414", size = 4782513 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e2/65/27792339caf00f610cc5be32b940ba1e3009b7054feb0c4527cebac228d4/lxml-5.3.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:779e851fd0e19795ccc8a9bb4d705d6baa0ef475329fe44a13cf1e962f18ff1e", size = 5305227 },
|
||||
{ url = "https://files.pythonhosted.org/packages/18/e1/25f7aa434a4d0d8e8420580af05ea49c3e12db6d297cf5435ac0a054df56/lxml-5.3.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c4393600915c308e546dc7003d74371744234e8444a28622d76fe19b98fa59d1", size = 4829846 },
|
||||
{ url = "https://files.pythonhosted.org/packages/fe/ed/faf235e0792547d24f61ee1448159325448a7e4f2ab706503049d8e5df19/lxml-5.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:673b9d8e780f455091200bba8534d5f4f465944cbdd61f31dc832d70e29064a5", size = 4949495 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e5/e1/8f572ad9ed6039ba30f26dd4c2c58fb90f79362d2ee35ca3820284767672/lxml-5.3.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2e4a570f6a99e96c457f7bec5ad459c9c420ee80b99eb04cbfcfe3fc18ec6423", size = 4773415 },
|
||||
{ url = "https://files.pythonhosted.org/packages/a3/75/6b57166b9d1983dac8f28f354e38bff8d6bcab013a241989c4d54c72701b/lxml-5.3.1-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:71f31eda4e370f46af42fc9f264fafa1b09f46ba07bdbee98f25689a04b81c20", size = 5337710 },
|
||||
{ url = "https://files.pythonhosted.org/packages/cc/71/4aa56e2daa83bbcc66ca27b5155be2f900d996f5d0c51078eaaac8df9547/lxml-5.3.1-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:42978a68d3825eaac55399eb37a4d52012a205c0c6262199b8b44fcc6fd686e8", size = 4897362 },
|
||||
{ url = "https://files.pythonhosted.org/packages/65/10/3fa2da152cd9b49332fd23356ed7643c9b74cad636ddd5b2400a9730d12b/lxml-5.3.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8b1942b3e4ed9ed551ed3083a2e6e0772de1e5e3aca872d955e2e86385fb7ff9", size = 4977795 },
|
||||
{ url = "https://files.pythonhosted.org/packages/de/d2/e1da0f7b20827e7b0ce934963cb6334c1b02cf1bb4aecd218c4496880cb3/lxml-5.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:85c4f11be9cf08917ac2a5a8b6e1ef63b2f8e3799cec194417e76826e5f1de9c", size = 4858104 },
|
||||
{ url = "https://files.pythonhosted.org/packages/a5/35/063420e1b33d3308f5aa7fcbdd19ef6c036f741c9a7a4bd5dc8032486b27/lxml-5.3.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:231cf4d140b22a923b1d0a0a4e0b4f972e5893efcdec188934cc65888fd0227b", size = 5416531 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c3/83/93a6457d291d1e37adfb54df23498101a4701834258c840381dd2f6a030e/lxml-5.3.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:5865b270b420eda7b68928d70bb517ccbe045e53b1a428129bb44372bf3d7dd5", size = 5273040 },
|
||||
{ url = "https://files.pythonhosted.org/packages/39/25/ad4ac8fac488505a2702656550e63c2a8db3a4fd63db82a20dad5689cecb/lxml-5.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:dbf7bebc2275016cddf3c997bf8a0f7044160714c64a9b83975670a04e6d2252", size = 5050951 },
|
||||
{ url = "https://files.pythonhosted.org/packages/82/74/f7d223c704c87e44b3d27b5e0dde173a2fcf2e89c0524c8015c2b3554876/lxml-5.3.1-cp313-cp313-win32.whl", hash = "sha256:d0751528b97d2b19a388b302be2a0ee05817097bab46ff0ed76feeec24951f78", size = 3485357 },
|
||||
{ url = "https://files.pythonhosted.org/packages/80/83/8c54533b3576f4391eebea88454738978669a6cad0d8e23266224007939d/lxml-5.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:91fb6a43d72b4f8863d21f347a9163eecbf36e76e2f51068d59cd004c506f332", size = 3814484 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.32.3"
|
||||
|
@ -68,6 +112,24 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "soupsieve"
|
||||
version = "2.6"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d7/ce/fbaeed4f9fb8b2daa961f90591662df6a86c1abf25c548329a86920aedfb/soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb", size = 101569 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.12.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.3.0"
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
import json
|
||||
import re
|
||||
|
||||
from base import Wallabag_Converter
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# test_api_key: ak1_d202e69375c111461882_b0271d0e739ce0234f96
|
||||
from requests import Response, request
|
||||
|
@ -44,7 +46,6 @@ class API_Converter(Wallabag_Converter):
|
|||
if entry["annotations"]:
|
||||
self._create_annotations(id, entry)
|
||||
|
||||
break
|
||||
return json.dumps("Done.")
|
||||
|
||||
def _create_bookmark(self, entry) -> Response:
|
||||
|
@ -81,26 +82,45 @@ class API_Converter(Wallabag_Converter):
|
|||
return response
|
||||
|
||||
def _create_annotations(self, entry_id, entry) -> Response:
|
||||
payload = json.dumps(
|
||||
{
|
||||
"bookmarkId": entry_id,
|
||||
"startOffset": 100,
|
||||
"endOffset": 200,
|
||||
"color": "yellow",
|
||||
"text": "mytext",
|
||||
"note": "mynote",
|
||||
}
|
||||
)
|
||||
payload_dict = {
|
||||
"bookmarkId": entry_id,
|
||||
"startOffset": 5000,
|
||||
"endOffset": 6000,
|
||||
"color": "yellow",
|
||||
"text": "MYTEXT",
|
||||
"note": "mynote",
|
||||
}
|
||||
|
||||
annot_url = f"{self.api_url}/highlights"
|
||||
for annot in entry["annotations"]:
|
||||
pl = payload_dict
|
||||
pl["text"] = annot["quote"]
|
||||
pl["note"] = annot["text"] if "text" in annot else None
|
||||
pl["startOffset"], pl["endOffset"] = self._calc_annot_offsets(
|
||||
entry["content"], annot
|
||||
)
|
||||
if pl["startOffset"] == -1:
|
||||
print(f"[WARNING] Annotation not found in bookmark: {annot['quote']}")
|
||||
continue
|
||||
|
||||
response = request(
|
||||
"POST",
|
||||
annot_url,
|
||||
headers=self.headers,
|
||||
data=payload,
|
||||
data=json.dumps(pl),
|
||||
)
|
||||
print(response.json())
|
||||
|
||||
print(f"[DEBUG] New annotation: {response.json()}")
|
||||
response = Response()
|
||||
return response
|
||||
|
||||
def _calc_annot_offsets(self, content): ...
|
||||
def _calc_annot_offsets(self, content, annot) -> tuple[int, int]:
|
||||
print(f"START: {annot['ranges'][0]['start']}")
|
||||
p_start_match = re.match(r"/p\[(\d+)\]", annot["ranges"][0]["start"])
|
||||
if p_start_match and len(p_start_match.groups()) == 1:
|
||||
p_start = int(p_start_match[1])
|
||||
|
||||
bs4 = BeautifulSoup(content, "lxml")
|
||||
relevant_p = bs4.find_all("p")[p_start]
|
||||
start_offset = str(bs4.text).find(str(relevant_p.text))
|
||||
end_offset = start_offset + len(annot["quote"])
|
||||
return (start_offset, end_offset)
|
||||
|
|
Loading…
Reference in a new issue