Add wallabag2hoarder directory

This commit is contained in:
Marty Oehme 2025-03-12 14:16:59 +01:00
parent b1c427067e
commit bd04d5bbde
Signed by: Marty
GPG key ID: 4E535BC19C61886E
2 changed files with 98 additions and 0 deletions

52
wallabag2hoarder/convert.py Executable file
View file

@ -0,0 +1,52 @@
#!/usr/bin/env python
from datetime import datetime
import json
import sys
from pathlib import Path
from jinja2 import Template
# USAGE: ./convert.py file exportfile
# Continue only when a first argument is present and names an existing file.
if not (len(sys.argv) >= 2 and Path(sys.argv[1]).is_file()):
    print("Please provide a file to import as the first argument.")
    sys.exit(1)

INPUT_FILE = Path(sys.argv[1])

# The second argument is optional; without it a fixed file name in the
# current working directory is used.
if len(sys.argv) > 2:
    OUTPUT_FILE = Path(sys.argv[2])
else:
    OUTPUT_FILE = Path("exported_bookmarks.json")

print(f"[DEBUG] inputfile: {INPUT_FILE}")
print(f"[DEBUG] outputfile: {OUTPUT_FILE}")
# Read JSON file
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which is not UTF-8 everywhere.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data_in = json.load(f)

# NOTE: Wallabag annotation format is as follows:
# [{'text': '', 'quote': "A while back they raised their prices, which lost them a lot of subscribers, because they were losing money per search at the old prices. They were actually still losing money per search on the new prices. They eventually lowered the prices back down a bit (and maybe raised them again? I've completely lost the plot on their pricing at this point) and have claimed that at 25,000 users they would be breaking even.", 'ranges': [{'start': '/p[6]', 'startOffset': '429', 'end': '/p[6]', 'endOffset': '844'}]}]
# with /p signifying the paragraph? Hoarder only has a concept of offset, so
# probably have to transform the paragraphs into lengths and then add them up
# to convert from one format to the other.
print(f"[DEBUG] Found {len(data_in)} wallabag entries.")

# NOTE(review): the export is capped. The previous counter (`n > 50` checked
# after incrementing) let exactly 51 entries through, so that number is kept
# here to leave the output unchanged. This looks like a debugging limit;
# confirm and raise or remove it before converting a full library.
MAX_ENTRIES = 51

data_out = {"bookmarks": []}
for index, entry in enumerate(data_in):
    if index >= MAX_ENTRIES:
        break
    bm = {
        # Seconds since the Unix epoch (float), derived from the
        # offset-aware created_at string.
        "createdAt": datetime.strptime(
            entry["created_at"], "%Y-%m-%dT%H:%M:%S%z"
        ).timestamp(),
        "content": {"type": "link", "url": entry["url"]},
        # Empty titles are exported as null rather than as an empty string.
        "title": entry["title"] or None,
        # Mark every converted bookmark with an extra "wallabag" tag.
        "tags": entry["tags"] + ["wallabag"],
        # FIXME: Need to wait for better hoarder annotation handling to import them in a good format
        # for now we just turn them _all_ into a single note.
        # DOABLE WITH API? https://docs.hoarder.app/api/create-a-new-highlight
        "note": json.dumps(entry["annotations"]) if entry["annotations"] else None,
    }
    data_out["bookmarks"].append(bm)

# Say so when the cap dropped entries, instead of truncating silently.
skipped = len(data_in) - len(data_out["bookmarks"])
if skipped > 0:
    print(
        f"[WARN] Export is capped at {MAX_ENTRIES} entries; "
        f"skipped {skipped} wallabag entries."
    )

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(data_out, f)

View file

@ -0,0 +1,46 @@
#!/usr/bin/env python
import json
import sys
from pathlib import Path
from jinja2 import Template
# USAGE: ./convert.py file exportfile
# Continue only when a first argument is present and names an existing file.
if not (len(sys.argv) >= 2 and Path(sys.argv[1]).is_file()):
    print("Please provide a file to import as the first argument.")
    sys.exit(1)

INPUT_FILE = Path(sys.argv[1])

# The second argument is optional; without it a fixed file name in the
# current working directory is used.
if len(sys.argv) > 2:
    OUTPUT_FILE = Path(sys.argv[2])
else:
    OUTPUT_FILE = Path("exported_bookmarks.html")

print(f"[DEBUG]\ninput: {INPUT_FILE}\noutput: {OUTPUT_FILE}")
# ADD_DATE has to be written as whole Unix seconds: passing the ISO
# created_at string through unchanged is what made imported bookmarks show up
# as 1970-01-01.
def _to_unix_timestamp(created_at):
    """Convert a ``created_at`` string to whole seconds since the Unix epoch.

    Args:
        created_at: Timestamp string in ``%Y-%m-%dT%H:%M:%S%z`` form, or a
            falsy value when the entry has none.

    Returns:
        The timestamp as an ``int``; an empty string for a falsy input; the
        input unchanged when it does not match the expected format, so that
        no information is dropped.
    """
    # Local import: this file's top-level imports do not include datetime.
    from datetime import datetime

    if not created_at:
        return ""
    try:
        parsed = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%S%z")
    except (TypeError, ValueError):
        return created_at
    return int(parsed.timestamp())


def _join_tags(tags):
    """Return *tags* as a single comma-separated string for the TAGS attribute.

    A plain string is passed through unchanged; a falsy value gives ``""``.
    """
    if not tags:
        return ""
    if isinstance(tags, str):
        return tags
    return ",".join(str(tag) for tag in tags)


def generate_html(data):
    """Render bookmark entries as a Netscape bookmark file.

    Args:
        data: Iterable of mappings loaded from the input JSON (presumably a
            wallabag export -- confirm against the caller). The keys ``url``,
            ``title``, ``created_at`` and ``tags`` are read; missing keys
            render as empty values.

    Returns:
        The complete bookmark document as one string.
    """
    bookmarks = [
        {
            "url": entry.get("url") or "",
            # Use the URL as link text when there is no title, so the anchor
            # never renders as the literal text "None".
            "title": entry.get("title") or entry.get("url") or "",
            "add_date": _to_unix_timestamp(entry.get("created_at")),
            "tags": _join_tags(entry.get("tags")),
        }
        for entry in data
    ]
    # autoescape=True keeps quotes, angle brackets and ampersands inside
    # titles, URLs and tags from breaking the generated markup.
    return Template(
        """<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
{% for item in data %}
<DT>
<A HREF="{{ item.url }}" ADD_DATE="{{ item.add_date }}" TAGS="{{ item.tags }}">{{ item.title }}</A>
</DT>
{% endfor %}
</DL><p>
""",
        autoescape=True,
    ).render(data=bookmarks)
# Read JSON file
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which is not UTF-8 everywhere.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

html_content = generate_html(data)

# Save HTML content
# The generated document declares charset=UTF-8 in its META tag, so the bytes
# on disk have to be UTF-8 as well.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.write(html_content)