Attempt to calculate annotation offsets

There are clear issues remaining with this approach.

The wallabag-given 'start' and 'end' fields do _not_ just point to the
n-th paragraph all the time (like I thought) but actually represent a
beautifulsoup4 like tree descent.

So:  `p_start_match = re.match(r"/p\[(\d+)\]", annot["ranges"][0]["start"])`
will fail on any annotation not just at the n-th paragraph.

Instead we should see how we can move this tree into the beautifulsoup4
parser and make use of wallabag already having done the work for us?
This commit is contained in:
Marty Oehme 2025-03-12 18:28:12 +01:00
parent 6f79a12d2b
commit 1017b876e9
Signed by: Marty
GPG key ID: 4E535BC19C61886E
3 changed files with 101 additions and 19 deletions

View file

@ -5,12 +5,12 @@ description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"beautifulsoup4>=4.13.3",
"lxml>=5.3.1",
"requests>=2.32.3",
# REQUIRED FOR CURRENT SHAARLI2HOARDER IMPLEMENTATION ONLY
# "beautifulsoup4>=4.13.3",
# "netscape-bookmarks-file-parser",
# "pyjwt>=2.10.1",
# "requests>=2.32.3",
"requests>=2.32.3",
]
[tool.pyright]

64
uv.lock
View file

@ -7,11 +7,30 @@ name = "2hoarder"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "lxml" },
{ name = "requests" },
]
[package.metadata]
requires-dist = [{ name = "requests", specifier = ">=2.32.3" }]
requires-dist = [
{ name = "beautifulsoup4", specifier = ">=4.13.3" },
{ name = "lxml", specifier = ">=5.3.1" },
{ name = "requests", specifier = ">=2.32.3" },
]
[[package]]
name = "beautifulsoup4"
version = "4.13.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "soupsieve" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f0/3c/adaf39ce1fb4afdd21b611e3d530b183bb7759c9b673d60db0e347fd4439/beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b", size = 619516 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f9/49/6abb616eb3cbab6a7cca303dc02fdf3836de2e0b834bf966a7f5271a34d8/beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16", size = 186015 },
]
[[package]]
name = "certifi"
@ -53,6 +72,31 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
]
[[package]]
name = "lxml"
version = "5.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ef/f6/c15ca8e5646e937c148e147244817672cf920b56ac0bf2cc1512ae674be8/lxml-5.3.1.tar.gz", hash = "sha256:106b7b5d2977b339f1e97efe2778e2ab20e99994cbb0ec5e55771ed0795920c8", size = 3678591 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/94/1c/724931daa1ace168e0237b929e44062545bf1551974102a5762c349c668d/lxml-5.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c093c7088b40d8266f57ed71d93112bd64c6724d31f0794c1e52cc4857c28e0e", size = 8171881 },
{ url = "https://files.pythonhosted.org/packages/67/0c/857b8fb6010c4246e66abeebb8639eaabba60a6d9b7c606554ecc5cbf1ee/lxml-5.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b0884e3f22d87c30694e625b1e62e6f30d39782c806287450d9dc2fdf07692fd", size = 4440394 },
{ url = "https://files.pythonhosted.org/packages/61/72/c9e81de6a000f9682ccdd13503db26e973b24c68ac45a7029173237e3eed/lxml-5.3.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1637fa31ec682cd5760092adfabe86d9b718a75d43e65e211d5931809bc111e7", size = 5037860 },
{ url = "https://files.pythonhosted.org/packages/24/26/942048c4b14835711b583b48cd7209bd2b5f0b6939ceed2381a494138b14/lxml-5.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a364e8e944d92dcbf33b6b494d4e0fb3499dcc3bd9485beb701aa4b4201fa414", size = 4782513 },
{ url = "https://files.pythonhosted.org/packages/e2/65/27792339caf00f610cc5be32b940ba1e3009b7054feb0c4527cebac228d4/lxml-5.3.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:779e851fd0e19795ccc8a9bb4d705d6baa0ef475329fe44a13cf1e962f18ff1e", size = 5305227 },
{ url = "https://files.pythonhosted.org/packages/18/e1/25f7aa434a4d0d8e8420580af05ea49c3e12db6d297cf5435ac0a054df56/lxml-5.3.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c4393600915c308e546dc7003d74371744234e8444a28622d76fe19b98fa59d1", size = 4829846 },
{ url = "https://files.pythonhosted.org/packages/fe/ed/faf235e0792547d24f61ee1448159325448a7e4f2ab706503049d8e5df19/lxml-5.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:673b9d8e780f455091200bba8534d5f4f465944cbdd61f31dc832d70e29064a5", size = 4949495 },
{ url = "https://files.pythonhosted.org/packages/e5/e1/8f572ad9ed6039ba30f26dd4c2c58fb90f79362d2ee35ca3820284767672/lxml-5.3.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2e4a570f6a99e96c457f7bec5ad459c9c420ee80b99eb04cbfcfe3fc18ec6423", size = 4773415 },
{ url = "https://files.pythonhosted.org/packages/a3/75/6b57166b9d1983dac8f28f354e38bff8d6bcab013a241989c4d54c72701b/lxml-5.3.1-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:71f31eda4e370f46af42fc9f264fafa1b09f46ba07bdbee98f25689a04b81c20", size = 5337710 },
{ url = "https://files.pythonhosted.org/packages/cc/71/4aa56e2daa83bbcc66ca27b5155be2f900d996f5d0c51078eaaac8df9547/lxml-5.3.1-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:42978a68d3825eaac55399eb37a4d52012a205c0c6262199b8b44fcc6fd686e8", size = 4897362 },
{ url = "https://files.pythonhosted.org/packages/65/10/3fa2da152cd9b49332fd23356ed7643c9b74cad636ddd5b2400a9730d12b/lxml-5.3.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8b1942b3e4ed9ed551ed3083a2e6e0772de1e5e3aca872d955e2e86385fb7ff9", size = 4977795 },
{ url = "https://files.pythonhosted.org/packages/de/d2/e1da0f7b20827e7b0ce934963cb6334c1b02cf1bb4aecd218c4496880cb3/lxml-5.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:85c4f11be9cf08917ac2a5a8b6e1ef63b2f8e3799cec194417e76826e5f1de9c", size = 4858104 },
{ url = "https://files.pythonhosted.org/packages/a5/35/063420e1b33d3308f5aa7fcbdd19ef6c036f741c9a7a4bd5dc8032486b27/lxml-5.3.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:231cf4d140b22a923b1d0a0a4e0b4f972e5893efcdec188934cc65888fd0227b", size = 5416531 },
{ url = "https://files.pythonhosted.org/packages/c3/83/93a6457d291d1e37adfb54df23498101a4701834258c840381dd2f6a030e/lxml-5.3.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:5865b270b420eda7b68928d70bb517ccbe045e53b1a428129bb44372bf3d7dd5", size = 5273040 },
{ url = "https://files.pythonhosted.org/packages/39/25/ad4ac8fac488505a2702656550e63c2a8db3a4fd63db82a20dad5689cecb/lxml-5.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:dbf7bebc2275016cddf3c997bf8a0f7044160714c64a9b83975670a04e6d2252", size = 5050951 },
{ url = "https://files.pythonhosted.org/packages/82/74/f7d223c704c87e44b3d27b5e0dde173a2fcf2e89c0524c8015c2b3554876/lxml-5.3.1-cp313-cp313-win32.whl", hash = "sha256:d0751528b97d2b19a388b302be2a0ee05817097bab46ff0ed76feeec24951f78", size = 3485357 },
{ url = "https://files.pythonhosted.org/packages/80/83/8c54533b3576f4391eebea88454738978669a6cad0d8e23266224007939d/lxml-5.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:91fb6a43d72b4f8863d21f347a9163eecbf36e76e2f51068d59cd004c506f332", size = 3814484 },
]
[[package]]
name = "requests"
version = "2.32.3"
@ -68,6 +112,24 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 },
]
[[package]]
name = "soupsieve"
version = "2.6"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d7/ce/fbaeed4f9fb8b2daa961f90591662df6a86c1abf25c548329a86920aedfb/soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb", size = 101569 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 },
]
[[package]]
name = "typing-extensions"
version = "4.12.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 },
]
[[package]]
name = "urllib3"
version = "2.3.0"

View file

@ -1,6 +1,8 @@
import json
import re
from base import Wallabag_Converter
from bs4 import BeautifulSoup
# test_api_key: ak1_d202e69375c111461882_b0271d0e739ce0234f96
from requests import Response, request
@ -44,7 +46,6 @@ class API_Converter(Wallabag_Converter):
if entry["annotations"]:
self._create_annotations(id, entry)
break
return json.dumps("Done.")
def _create_bookmark(self, entry) -> Response:
@ -81,26 +82,45 @@ class API_Converter(Wallabag_Converter):
return response
def _create_annotations(self, entry_id, entry) -> Response:
payload = json.dumps(
{
"bookmarkId": entry_id,
"startOffset": 100,
"endOffset": 200,
"color": "yellow",
"text": "mytext",
"note": "mynote",
}
)
payload_dict = {
"bookmarkId": entry_id,
"startOffset": 5000,
"endOffset": 6000,
"color": "yellow",
"text": "MYTEXT",
"note": "mynote",
}
annot_url = f"{self.api_url}/highlights"
for annot in entry["annotations"]:
pl = payload_dict
pl["text"] = annot["quote"]
pl["note"] = annot["text"] if "text" in annot else None
pl["startOffset"], pl["endOffset"] = self._calc_annot_offsets(
entry["content"], annot
)
if pl["startOffset"] == -1:
print(f"[WARNING] Annotation not found in bookmark: {annot['quote']}")
continue
response = request(
"POST",
annot_url,
headers=self.headers,
data=payload,
data=json.dumps(pl),
)
print(response.json())
print(f"[DEBUG] New annotation: {response.json()}")
response = Response()
return response
def _calc_annot_offsets(self, content): ...
def _calc_annot_offsets(self, content, annot) -> tuple[int, int]:
print(f"START: {annot['ranges'][0]['start']}")
p_start_match = re.match(r"/p\[(\d+)\]", annot["ranges"][0]["start"])
if p_start_match and len(p_start_match.groups()) == 1:
p_start = int(p_start_match[1])
bs4 = BeautifulSoup(content, "lxml")
relevant_p = bs4.find_all("p")[p_start]
start_offset = str(bs4.text).find(str(relevant_p.text))
end_offset = start_offset + len(annot["quote"])
return (start_offset, end_offset)