From 59aaa74d76ec3452425c3eecf4e856fbeefa4c8b Mon Sep 17 00:00:00 2001
From: Marty Oehme <contact@martyoeh.me>
Date: Wed, 12 Mar 2025 14:16:59 +0100
Subject: [PATCH] Refactor wallabag conversion to have simple cli

---
 README.md                               | 12 ++++
 uv.lock                                 |  8 +++
 wallabag2hoarder/README.md              | 19 ++++++
 wallabag2hoarder/__init__.py            |  0
 wallabag2hoarder/base.py                |  6 ++
 wallabag2hoarder/convert.py             | 85 ++++++++++++++-----------
 wallabag2hoarder/convert_native_json.py | 37 +++++++++++
 wallabag2hoarder/convert_netscape.py    | 61 ++++++------------
 8 files changed, 149 insertions(+), 79 deletions(-)
 create mode 100644 uv.lock
 create mode 100644 wallabag2hoarder/README.md
 create mode 100644 wallabag2hoarder/__init__.py
 create mode 100644 wallabag2hoarder/base.py
 create mode 100755 wallabag2hoarder/convert_native_json.py

diff --git a/README.md b/README.md
index e69de29..967205d 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,12 @@
+# Hoarder migration scripts
+
+Simple scripts which try to ease the migration to a self-hosted hoarder instance.
+
+## Wallabag
+
+Use one of the flavours of transferring your wallabag entries into hoarder.
+Run it like: `./wallabag2hoarder/convert.py <input-file>`
+
+Where the input file is a wallabag JSON export of all your saved entries.
+There are options to change the style of output (netscape HTML, native JSON) and the file to output to.
+By default it converts to hoarder JSON and prints to stdout.
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 0000000..de32efa
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,8 @@
+version = 1
+revision = 1
+requires-python = ">=3.13"
+
+[[package]]
+name = "2hoarder"
+version = "0.1.0"
+source = { virtual = "." }
diff --git a/wallabag2hoarder/README.md b/wallabag2hoarder/README.md
new file mode 100644
index 0000000..b96935f
--- /dev/null
+++ b/wallabag2hoarder/README.md
@@ -0,0 +1,19 @@
+# wallabag2hoarder
+
+Currently supports 2 conversions:
+
+- ./convert_netscape.py:
+    Converts into the 'netscape bookmark' format which hoarder should understand as 'html' import.
+    It's a very lossy conversion, essentially only retaining url, title and creation time.
+    Not well tested.
+
+- ./convert_native_json.py:
+    Uses the fact that wallabag outputs json and hoarder supports a native json export/import to
+    transform the json into one that hoarder understands well. More tested, and works without a
+    hitch, _however_ does not correctly transfer any annotations made in wallabag. Annotations
+    are added as a simple json object to the 'note' field in hoarder.
+
+- ./convert_api.py:
+    _WIP_: Uses the public hoarder API to move the wallabag articles over, _including_ annotations
+    at a best-effort. Annotation support is a little behind the curve in hoarder -- we can only
+    have highlights, not 'notes' attached to a highlight.
diff --git a/wallabag2hoarder/__init__.py b/wallabag2hoarder/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/wallabag2hoarder/base.py b/wallabag2hoarder/base.py
new file mode 100644
index 0000000..4080378
--- /dev/null
+++ b/wallabag2hoarder/base.py
@@ -0,0 +1,6 @@
+class Wallabag_Converter:
+    def __init__(self, data: list[object]):
+        self.data = data
+
+    def convert(self) -> object:
+        raise NotImplementedError()
diff --git a/wallabag2hoarder/convert.py b/wallabag2hoarder/convert.py
index a44588e..f5c9031 100755
--- a/wallabag2hoarder/convert.py
+++ b/wallabag2hoarder/convert.py
@@ -1,52 +1,61 @@
 #!/usr/bin/env python
 
-from datetime import datetime
+import argparse
 import json
 import sys
 from pathlib import Path
 
-from jinja2 import Template
+from convert_netscape import Netscape_Converter
+from convert_native_json import JSON_Converter
 
-# USAGE: ./convert.py file exportfile
 
-if len(sys.argv) < 2 or not Path(sys.argv[1]).is_file():
-    print("Please provide a file to import as the first argument.")
-    sys.exit(1)
+def main():
+    parser = argparse.ArgumentParser(description="Process input file(s)")
+    parser.add_argument("input", help="Input file")
+    parser.add_argument("--output", help="Output file")
+    parser.add_argument(
+        "--flavour", choices=["html", "json"], default="json", help="Flavour of output"
+    )
+    # TODO implement
+    parser.add_argument(
+        "--num", type=int, default=10, help="Number of items to process"
+    )
 
-INPUT_FILE = Path(sys.argv[1])
-OUTPUT_FILE = (
-    Path(sys.argv[2]) if len(sys.argv) > 2 else Path("exported_bookmarks.json")
-)
-print(f"[DEBUG] inputfile: {INPUT_FILE}")
-print(f"[DEBUG] outputfile: {OUTPUT_FILE}")
+    args = parser.parse_args()
+    if not args.input:
+        print("Please provide a file to import as the first argument.")
+        sys.exit(1)
 
-# Read JSON file
-with open(INPUT_FILE, "r") as f:
-    data_in = json.load(f)
+    INPUT_FILE = Path(args.input)
+    if not INPUT_FILE.exists():
+        print(f"Input file {INPUT_FILE} can not be accessed.")
+        sys.exit(1)
 
-# NOTE: Wallabag annotation format is as follows:
-# [{'text': '', 'quote': "A while back they raised their prices, which lost them a lot of subscribers, because they were losing money per search at the old prices. They were actually still losing money per search on the new prices. They eventually lowered the prices back down a bit (and maybe raised them again? I've completely lost the plot on their pricing at this point) and have claimed that at 25,000 users they would be breaking even.", 'ranges': [{'start': '/p[6]', 'startOffset': '429', 'end': '/p[6]', 'endOffset': '844'}]}]
-# with /p signifying the paragraph? Hoarder only has a concept of offset, so probably have to transform the paragraphs into lengths and then add them up to convert from one format to the other.
+    # Read JSON file
+    with open(INPUT_FILE, "r") as f:
+        data = json.load(f)
 
-print(f"[DEBUG] Found {len(data_in)} wallabag entries.")
+    if args.num:
+        data = data[: args.num]
 
-data_out = {"bookmarks":[]}
-n = 0
-for entry in data_in:
-    bm = {
-        "createdAt": datetime.strptime(entry["created_at"], "%Y-%m-%dT%H:%M:%S%z").timestamp(),
-        "content": {"type": "link", "url": entry["url"]},
-        "title": entry["title"] if entry["title"] else None,
-        "tags": entry["tags"] + ["wallabag"],
-        # FIXME: Need to wait for better hoarder annotation handling to import them in a good format
-        # for now we just turn them _all_ into a single note.
-        # DOABLE WITH API? https://docs.hoarder.app/api/create-a-new-highlight
-        "note": json.dumps(entry["annotations"]) if entry["annotations"] else None,
-    }
-    data_out["bookmarks"].append(bm)
-    n += 1
-    if n > 50:
-        break
+    OUTPUT=""
+    OUTPUT_FILE = args.output
+    print(f"[DEBUG] input: {INPUT_FILE}")
+    print(f"[DEBUG] output: {OUTPUT_FILE}")
+    match args.flavour:
+        case "html":
+            print("[DEBUG] style: html")
+            OUTPUT = Netscape_Converter(data).convert()
+        case "json":
+            print("[DEBUG] style: json")
+            OUTPUT = JSON_Converter(data).convert()
 
-with open(OUTPUT_FILE, "w") as f:
-    json.dump(data_out, f)
+    if OUTPUT_FILE:
+        with open(OUTPUT_FILE, "w") as f:
+            f.write(OUTPUT)
+    else:
+        print(OUTPUT)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/wallabag2hoarder/convert_native_json.py b/wallabag2hoarder/convert_native_json.py
new file mode 100755
index 0000000..0165e87
--- /dev/null
+++ b/wallabag2hoarder/convert_native_json.py
@@ -0,0 +1,37 @@
+import json
+from datetime import datetime
+
+from base import Wallabag_Converter
+
+# NOTE: Wallabag annotation format is as follows:
+# [{'text': '', 'quote': "A while back they raised their prices, which lost them a lot of subscribers, because they were losing money per search at the old prices. They were actually still losing money per search on the new prices. They eventually lowered the prices back down a bit (and maybe raised them again? I've completely lost the plot on their pricing at this point) and have claimed that at 25,000 users they would be breaking even.", 'ranges': [{'start': '/p[6]', 'startOffset': '429', 'end': '/p[6]', 'endOffset': '844'}]}]
+# with /p signifying the paragraph? Hoarder only has a concept of offset, so probably have to transform the paragraphs into lengths and then add them up to convert from one format to the other.
+
+
+class JSON_Converter(Wallabag_Converter):
+    def __init__(self, data: list[dict]):
+        self.data = data
+
+    def convert(self) -> str:
+        print(f"[DEBUG] Found {len(self.data)} wallabag entries.")
+
+        data_out = {"bookmarks": []}
+        n = 0
+        for entry in self.data:
+            bm = {
+                "createdAt": datetime.strptime(
+                    entry["created_at"], "%Y-%m-%dT%H:%M:%S%z"
+                ).timestamp(),
+                "content": {"type": "link", "url": entry["url"]},
+                "title": entry["title"] if entry["title"] else None,
+                "tags": entry["tags"] + ["_wallabag"],
+                # FIXME: Need to wait for better hoarder annotation handling to import them in a good format
+                # for now we just turn them _all_ into a single note.
+                # DOABLE WITH API? https://docs.hoarder.app/api/create-a-new-highlight
+                "note": json.dumps(entry["annotations"])
+                if entry["annotations"]
+                else None,
+            }
+            data_out["bookmarks"].append(bm)
+
+        return json.dumps(data_out)
diff --git a/wallabag2hoarder/convert_netscape.py b/wallabag2hoarder/convert_netscape.py
index 0ded133..8a3d8ac 100755
--- a/wallabag2hoarder/convert_netscape.py
+++ b/wallabag2hoarder/convert_netscape.py
@@ -1,46 +1,25 @@
-#!/usr/bin/env python
-
-import json
-import sys
-from pathlib import Path
-
+from base import Wallabag_Converter
 from jinja2 import Template
 
-# USAGE: ./convert.py file exportfile
 
-if len(sys.argv) < 2 or not Path(sys.argv[1]).is_file():
-    print("Please provide a file to import as the first argument.")
-    sys.exit(1)
+class Netscape_Converter(Wallabag_Converter):
+    def __init__(self, data: list[object]):
+        self.data = data
 
-INPUT_FILE = Path(sys.argv[1])
-OUTPUT_FILE = (
-    Path(sys.argv[2]) if len(sys.argv) > 2 else Path("exported_bookmarks.html")
-)
-print(f"[DEBUG]\ninput: {INPUT_FILE}\noutput: {OUTPUT_FILE}")
+    # TODO: Timestamp does not get recognized and instead becomes 1970-01-01 - maybe needs unix ts?
+    def _generate_html(self, data):
+        return Template("""<!DOCTYPE NETSCAPE-Bookmark-file-1>
+        <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+        <TITLE>Bookmarks</TITLE>
+        <H1>Bookmarks</H1>
+        <DL><p>
+        {% for item in data %}
+            <DT>
+                <A HREF="{{ item.url }}" ADD_DATE="{{ item.created_at }}" TAGS="{{ item.tags }}">{{ item.title }}</A>
+            </DT>
+        {% endfor %}
+        </DL><p>
+        """).render(data=data)
 
-
-# TODO: Timestamp does not get recognized and instead becomes 1970-01-01 - maybe needs unix ts?
-def generate_html(data):
-    return Template("""<!DOCTYPE NETSCAPE-Bookmark-file-1>
-    <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
-    <TITLE>Bookmarks</TITLE>
-    <H1>Bookmarks</H1>
-    <DL><p>
-    {% for item in data %}
-        <DT>
-            <A HREF="{{ item.url }}" ADD_DATE="{{ item.created_at }}" TAGS="{{ item.tags }}">{{ item.title }}</A>
-        </DT>
-    {% endfor %}
-    </DL><p>
-    """).render(data=data)
-
-
-# Read JSON file
-with open(INPUT_FILE, "r") as f:
-    data = json.load(f)
-
-html_content = generate_html(data)
-
-# Save or print HTML content
-with open(OUTPUT_FILE, "w") as f:
-    f.write(html_content)
+    def convert(self) -> str:
+        return self._generate_html(self.data)