Add shaarli2hoarder

2025-03-12 14:16:59 +01:00 · 2025-03-12 14:16:59 +01:00 · 36252f6f19
commit 36252f6f19
parent bd04d5bbde
5 changed files with 117 additions and 1 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -4,7 +4,16 @@ version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.13"
-dependencies = []
+dependencies = [
+    # REQUIRED FOR CURRENT SHAARLI2HOARDER IMPLEMENTATION ONLY
+    # "beautifulsoup4>=4.13.3",
+    # "netscape-bookmarks-file-parser",
+    # "pyjwt>=2.10.1",
+    # "requests>=2.32.3",
+]

 [tool.pyright]
 typeCheckingMode = "basic"
+
+[tool.uv.sources]
+netscape-bookmarks-file-parser = { git = "https://github.com/FlyingWolFox/Netscape-Bookmarks-File-Parser.git" }
--- a/shaarli2hoarder/.gitignore
+++ b/shaarli2hoarder/.gitignore
@ -0,0 +1,14 @@
+*.json
+*.html
+
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+
--- a/shaarli2hoarder/.python-version
+++ b/shaarli2hoarder/.python-version
@ -0,0 +1 @@
+3.13
--- a/shaarli2hoarder/README.md
+++ b/shaarli2hoarder/README.md
@ -0,0 +1,12 @@
+# Shaarli 2 Hoarder converter
+
+Convert your Shaarli bookmarks to the Hoarder JSON format.
+
+Simply run `uv run python convert.py <shaarli-export-file>`,
+pointing it at the HTML file you exported from Shaarli.
+
+It will print out the JSON representation of those bookmarks, 
+readable by Hoarder.
+
+Run `uv run python convert.py bookmarks.html > out.json`
+to generate a valid JSON file which you can then import through the Hoarder interface.
--- a/shaarli2hoarder/convert.py
+++ b/shaarli2hoarder/convert.py
@ -0,0 +1,80 @@
+import json
+import sys
+
+from bs4 import BeautifulSoup
+
+if len(sys.argv) < 2:
+    print("ERROR: Pass the bookmarks file as argument.")
+    sys.exit(1)
+path = sys.argv[1]
+
+
+def parse_bookmark(html_content):
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    bookmarks = []
+
+    if len(soup.find_all("dl")) != 1:
+        print("WARNING! More than one Bookmark element found. File may be corrupt.")
+
+    first = True
+    last_desc = ""
+    for el in soup.find_all("a"):
+        bm_el = {}
+        url = el["href"]
+        title = el.string.strip() if el.string else url
+
+        # date elements
+        add_date = el.get("add_date", "")
+        last_modified = el.get("last_modified", "")
+        tag_string = el.get("tags", "")
+        tags = tag_string.split(",") if tag_string else []
+
+        # TODO: url contains '/shaare/' == note type
+
+        # desc / note
+        desc_el = el.parent.find_next_sibling("dd")
+        # Have to fix the non-closed <dt> tabs :\
+        # For now, in vim do: `:%s/<DT>.*/\0<\/DT>` to add a closing el to each line
+        description = desc_el.contents[0].strip() if desc_el else ""
+        if description and description == last_desc:
+            description = ""
+        if description:
+            last_desc = description
+
+        # print(f"URL: {url}, TITLE: {title}")
+        # print(f"ADD: {add_date}, MOD: {last_modified}, TAGS: {tags}")
+        # print(f"DESC: {description.strip()}")
+
+        content = {}
+        if "/shaare/" in url:
+            content = {"type": "text", "text": description}
+            # print(f"Detected note-style url ({url}) turning description to content.")
+            description = ""
+        else:
+            content = {"type": "link", "url": url}
+
+        bm_el = {
+            "title": title,
+            "note": description,
+            "createdAt": int(last_modified if last_modified else add_date),
+            "content": content,
+        }
+        if tags:
+            bm_el["tags"] = tags
+        if description:
+            bm_el["note"] = description
+        bookmarks.append(bm_el)
+
+    return bookmarks
+
+
+with open(path) as f:
+    contents = f.readlines()
+    for i, line in enumerate(contents):
+        if "<DT>" in line:
+            contents[i] = f"{line.rstrip()}</DT>"
+
+    bookmarks = parse_bookmark("\n".join(contents))
+
+    print(json.dumps({"bookmarks": bookmarks}, indent=2))