From 380a50ec330c085bae9e4d7e038432b98677bc0b Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Fri, 21 Nov 2025 17:13:39 +0100 Subject: [PATCH] initial commit --- .gitignore | 252 ++++++++++++++++++++++++++++++ README.md | 42 +++++ grab.nu | 96 ++++++++++++ output/maths-and-physics.csv | 18 +++ output/software-architecture.csv | 21 +++ output/software-architecture.nuon | 1 + 6 files changed, 430 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100755 grab.nu create mode 100644 output/maths-and-physics.csv create mode 100644 output/software-architecture.csv create mode 100644 output/software-architecture.nuon diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..51c13a2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,252 @@ +/test/ + +# Created by https://www.toptal.com/developers/gitignore/api/linux,markdown,ansible,terraform,vim,nushell,python +# Edit at https://www.toptal.com/developers/gitignore?templates=linux,markdown,ansible,terraform,vim,nushell,python + +### Ansible ### +*.retry + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### Terraform ### +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log +crash.*.log + +# Exclude all .tfvars files, which are likely to contain sensitive data, such as +# password, private keys, and other secrets. These should not be part of version +# control as they are data points which are potentially sensitive and subject +# to change depending on the environment. 
+*.tfvars +*.tfvars.json + +# Ignore override files as they are usually used to override resources locally and so +# are not checked in +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Include override files you do wish to add to version control using negated pattern +# !example_override.tf + +# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan +# example: *tfplan* + +# Ignore CLI configuration files +.terraformrc +terraform.rc + +### Vim ### +# Swap (comment out the !*.svg line below if you don't need vector files) +[._]*.s[a-v][a-z] +!*.svg +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +# End of https://www.toptal.com/developers/gitignore/api/linux,markdown,ansible,terraform,vim,nushell,python diff --git a/README.md b/README.md new file mode 100644 index 0000000..ea375ef --- /dev/null +++ b/README.md @@ -0,0 +1,42 @@ +# Create a Bookwyrm-ready import list from a Humble Bundle + +Usage: `./grab.nu <url>` + +For example: `./grab.nu https://www.humblebundle.com/books/data-engineering-science-oreilly-books | save -f data-eng.csv` + +The script downloads book info in parallel, but often does not find enough info for _all_ books, so some rows will lack the Open Library key and ISBNs.
+ +## Internal logic + +- get humble bundle page + `http get https://www.humblebundle.com/books/software-architecture-pearson-books` +- extract correct element for json data + `open out.html | pup "script#webpack-bundle-page-data text{}" | from json` + +- get transposed version + `open books.json | get bundleData.tier_item_data | transpose machine_id item | insert human_name {$in.item.human_name} | insert cover_art {$in.item.resolved_paths.front_page_art_imgix_retina} | insert publisher {$in.item.publishers.0.publisher-name} | reject item` +- get details of books into tabled format + `open books_transposed.json | where machine_id != "code_org" | insert human_name {$in.item.human_name} | insert cover_art {$in.item.resolved_paths.front_page_art_imgix_retina} | insert publisher {$in.item.publishers.0.publisher-name} | insert author {$in.item.developers.0.developer-name} | reject item | save books.csv` + +- search openlibrary for the book + `http get https://openlibrary.org/search.json?author=Oliver+Goldman&title=Effective+Software+Architecture | save search_result.json` +- get isbn from openlibrary (currently first result, primary edition) + `open search_result.json | get docs.0.cover_edition_key | http get $"https://openlibrary.org/books/($in).json" | save work_result.json` +- fill out other information from edition info if available + - publishers + - publish_date + - languages + - authors? 
+  - title
+  - subtitle
+  - isbn_13
+  - isbn_10 if exists
+  - olid (Openlibrary ID) = key above
+
+- save it into csv
+  - title
+  - author_text (First Last, First Last)
+  - remote_id (link, nothing)
+  - openlibrary_key (olid above)
+  - isbn_10
+  - isbn_13
diff --git a/grab.nu b/grab.nu
new file mode 100755
index 0000000..5422178
--- /dev/null
+++ b/grab.nu
@@ -0,0 +1,150 @@
+#!/usr/bin/env nu
+
+# Build a Bookwyrm-ready CSV import list from a Humble Bundle book page.
+# Needs the external `pup` HTML query tool on PATH (used by grab_bundle_data).
+
+# Debug switch: when true, saved files under `intermediate/` and a fixed
+# edition key stand in for most of the network requests.
+# Declared with `const` because constants are resolved at parse time and are
+# therefore visible inside `def` bodies.
+const debugging = false
+
+# Emit the Bookwyrm import CSV for the Humble Bundle page at `url`.
+def main [url: string]: nothing -> any {
+    get_humble_page $url |
+    grab_bundle_data |
+    extract_books |
+    url_sanitize_author_title |
+    fill_details_from_openlibrary |
+    to_bookwyrm_csv |
+    to csv
+}
+
+# Read a previously saved bundle page from `file`.
+# TODO: Use for testing w/o hammering bundle page
+def fake_get_page [file: string]: nothing -> string {
+    open $file
+}
+
+# Return the bundle page HTML as a string (the saved copy when debugging).
+def get_humble_page [url: string]: nothing -> string {
+    if $debugging { fake_get_page intermediate/out.html } else { http get $url | into string }
+}
+
+# Parse the JSON embedded in the page's `script#webpack-bundle-page-data` tag.
+# Shells out to `pup`, which must be installed separately.
+def grab_bundle_data []: string -> record {
+    pup "script#webpack-bundle-page-data text{}" | from json
+}
+
+# Flatten `bundleData.tier_item_data` into one row per bundle item with the
+# columns machine_id, human_name, cover_art, publisher and author.
+# Every lookup is optional so an item lacking a field yields null (or "" for
+# author) instead of aborting the whole run.
+def extract_books []: record -> table {
+    get bundleData.tier_item_data |
+    transpose machine_id item |
+    insert human_name {$in.item?.human_name?} |
+    insert cover_art {$in.item?.resolved_paths?.front_page_art_imgix_retina?} |
+    insert publisher {$in.item?.publishers?.0?.publisher-name?} |
+    insert author {$in.item?.developers?.0?.developer-name? | default ""} |
+    reject item
+}
+
+# Add `human_name_sanitized` and `author_sanitized`: percent-encoded copies of
+# the title and author that are safe to splice into a URL query string.
+# `--all` also encodes reserved characters such as `&`, `+`, `/` and `#`, which
+# occur in real titles and authors and would otherwise corrupt the query.
+def url_sanitize_author_title []: table -> table {
+    insert human_name_sanitized {$in.human_name | default "" | url encode --all} |
+    insert author_sanitized {$in.author | default "" | url encode --all}
+}
+
+# Debug stand-in for get_olid: always returns the same edition key.
+def fake_get_olid [author: string, title: string]: nothing -> string {
+    "OL52558571M"
+}
+
+# Search Open Library and return the `cover_edition_key` of the first hit, or
+# "" when there is none. `author` and `title` must already be URL-encoded;
+# empty values are left out of the query.
+def get_olid [author: string, title: string]: nothing -> string {
+    let params = ([
+        (if $title != "" { $"title=($title)" })
+        (if $author != "" { $"author=($author)" })
+    ] | compact)
+    # Nothing to search for: skip the request instead of sending an empty query.
+    if ($params | is-empty) { return "" }
+    let query = ($params | str join "&")
+    http get $"https://openlibrary.org/search.json?($query)" |
+    get docs.0?.cover_edition_key? |
+    default ""
+}
+
+# Debug stand-in for get_ol_edition: reads a saved edition record.
+def fake_get_ol_edition [olid: string]: nothing -> record {
+    open intermediate/work_result.json
+}
+
+# Fetch the Open Library edition record for the edition key `olid`.
+def get_ol_edition [olid: string]: nothing -> record {
+    http get $"https://openlibrary.org/books/($olid).json"
+}
+
+# Look up every row on Open Library (rows are processed in parallel) and add
+# openlibrary_key, isbn_13, isbn_10, publish_date, subtitle and fulltitle;
+# title and publisher are replaced when the edition record provides them.
+# Rows without a match keep their bundle data and get empty/null details.
+def fill_details_from_openlibrary []: table -> table {
+    par-each { |row|
+        print -e $"Grabbing OLID for ($row.human_name) by ($row.author)"
+        # NOTE(review): when debugging, rows without an author still query the
+        # live search API -- confirm that this is intended.
+        let olid = if $debugging and $row.author_sanitized != "" {
+            fake_get_olid $row.author_sanitized $row.human_name_sanitized
+        } else {
+            get_olid $row.author_sanitized $row.human_name_sanitized
+        }
+        let result = if $olid != "" {
+            print -e $"Grabbing edition \(($olid)\) info for ($row.human_name)"
+            if $debugging {
+                fake_get_ol_edition $olid
+            } else {
+                get_ol_edition $olid
+            }
+        } else {
+            null
+        }
+        # `.0?` keeps an empty isbn/publishers list from raising an error.
+        $row |
+        insert openlibrary_key $olid |
+        insert isbn_13 $result.isbn_13?.0? |
+        insert isbn_10 $result.isbn_10?.0? |
+        insert publish_date $result.publish_date? |
+        upsert publisher ($result.publishers?.0? | default $row.publisher?) |
+        upsert title ($result.title? | default $row.human_name?) |
+        insert subtitle ($result.subtitle? | default "") |
+        insert fulltitle ($result.full_title? | default "")
+    }
+}
+
+# Reduce the table to the columns of the import CSV. The title is the
+# edition's full title when present, otherwise "title subtitle".
+def to_bookwyrm_csv [] {
+    $in |
+    upsert title {|row|
+        let fulltitle = ($row.fulltitle? | default "")
+        if $fulltitle != "" {
+            $fulltitle
+        } else {
+            # Trim so that an empty subtitle leaves no trailing space.
+            $"($row.title?) ($row.subtitle?)" | str trim
+        }
+    } |
+    select author title openlibrary_key isbn_10 isbn_13
+}
+
+# Placeholder command; nothing in this script calls it.
+def sample_func [write?: bool] {
+    echo "also here"
+}
diff --git a/output/maths-and-physics.csv b/output/maths-and-physics.csv
new file mode 100644
index 0000000..5d7a657
--- /dev/null
+++ b/output/maths-and-physics.csv
@@ -0,0 +1,18 @@
+author,title,openlibrary_key,isbn_10,isbn_13
+Deborah J. Rumsey,Statistics All-In-One for Dummies ,OL37984899M,,9781119902560
+Galen C.
Duree Jr.,Optics For Dummies ,,, +Mary Jane Sterling,"Pre-Calculus For Dummies, 3rd Edition ",,, +Mark Ryan,"Calculus For Dummies, 2nd Edition ",,, +Mark Ryan,Geometry Essentials For Dummies ,,, +Mary Jane Sterling,Algebra I essentials for dummies,OL27015895M,0470618345,9780470618349 +Patrick Jones,Calculus: 1001 Practice Problems For Dummies (+ Free Online Practice) ,,, +Mike Pauken,Thermodynamics For Dummies ,,, +Mary Jane Sterling,Algebra II Essentials For Dummies ,,, +Steven Holzner,"Physics I For Dummies, 3rd Edition ",,, +Steven Holzner,Physics II for dummies ,OL25370848M,0470538066,9780470538067 +Steven Holzner,Physics essentials for dummies,OL27169436M,0470618418,9780470618417 +The Experts at Dummies,Physics I: 501 Practice Problems For Dummies (+ Free Online Practice) ,,, +Barry Schoenborn,Math For Real Life For Dummies ,,, +The Experts at Dummies,"Physics I Workbook For Dummies with Online Practice, 3rd Edition ",,, +Mary Jane Sterling,"Trigonometry For Dummies, 3rd Edition ",,, +www.buildon.org,buildOn ,,, diff --git a/output/software-architecture.csv b/output/software-architecture.csv new file mode 100644 index 0000000..0eb285e --- /dev/null +++ b/output/software-architecture.csv @@ -0,0 +1,21 @@ +author,title,openlibrary_key,isbn_10,isbn_13 +Zimmerman et al,Patterns for API Design ,,, +Code.org,Code.org ,,, +Vernon / Jaskula,Strategic Monoliths and Microservices Driving Innovation Using Purposeful Architecture,OL34773512M,,9780137355464 +Wiegers,Software Requirements Essentials Core Practices for Successful Business Analysis,OL48201406M,,9780138190224 +Emison,Serverless as a Game Changer ,,, +Moghe,The Async-First Playbook ,,, +Metz,"Practical Object-Oriented Design, 2/e ",,, +Susanne Kaiser,Architecture for Flow ,,, +Hofer / Schwentner,"Domain Storytelling A Collaborative, Visual, and Agile Way to Build Domain-Driven Software",OL34795902M,,9780137458912 +Higginbotham,Principles of Web API Design Delivering Value with APIs and 
Microservices,OL34773513M,,9780137355631 +Sites,Understanding Software Dynamics ,,, +Oliver Goldman,Effective Software Architecture Building Better Software Faster,OL52558571M,,9780138249311 +Lowy,Righting Software ,OL29450091M,,9780136524038 +Charpentier,Functional and Concurrent Programming ,,, +Farley,Modern Software Engineering Doing What Really Works to Build Better Software Faster,OL34779880M,,9780137314911 +Erder et al,Continuous Architecture in Practice ,,, +Srinath Perera,Software Architecture and Decision-Making ,,, +Bass et al,"Software Architecture in Practice, 4/e ",,, +Vlad Khononov,Balancing Coupling in Software Design ,,, +Vernon,Domain-Driven Design Distilled ,OL26836801M,0134434420,9780134434421 diff --git a/output/software-architecture.nuon b/output/software-architecture.nuon new file mode 100644 index 0000000..35aa67f --- /dev/null +++ b/output/software-architecture.nuon @@ -0,0 +1 @@ +[[author, title, openlibrary_key, "isbn_10", "isbn_13"]; ["Zimmerman et al", "Patterns for API Design ", "", null, null], ["Srinath Perera", "Software Architecture and Decision-Making ", "", null, null], ["Bass et al", "Software Architecture in Practice, 4/e ", "", null, null], ["Vlad Khononov", "Balancing Coupling in Software Design ", "", null, null], [Vernon, "Domain-Driven Design Distilled ", "OL26836801M", "0134434420", "9780134434421"], [Moghe, "The Async-First Playbook ", "", null, null], ["Code.org", "Code.org ", "", null, null], ["Susanne Kaiser", "Architecture for Flow ", "", null, null], ["Hofer / Schwentner", "Domain Storytelling A Collaborative, Visual, and Agile Way to Build Domain-Driven Software", "OL34795902M", null, "9780137458912"], [Charpentier, "Functional and Concurrent Programming ", "", null, null], [Emison, "Serverless as a Game Changer ", "", null, null], ["Erder et al", "Continuous Architecture in Practice ", "", null, null], [Metz, "Practical Object-Oriented Design, 2/e ", "", null, null], ["Vernon / Jaskula", "Strategic Monoliths and 
Microservices Driving Innovation Using Purposeful Architecture", "OL34773512M", null, "9780137355464"], [Wiegers, "Software Requirements Essentials Core Practices for Successful Business Analysis", "OL48201406M", null, "9780138190224"], [Farley, "Modern Software Engineering Doing What Really Works to Build Better Software Faster", "OL34779880M", null, "9780137314911"], [Sites, "Understanding Software Dynamics ", "", null, null], ["Oliver Goldman", "Effective Software Architecture Building Better Software Faster", "OL52558571M", null, "9780138249311"], [Lowy, "Righting Software ", "OL29450091M", null, "9780136524038"], [Higginbotham, "Principles of Web API Design Delivering Value with APIs and Microservices", "OL34773513M", null, "9780137355631"]]