diff --git a/README.md b/README.md index e7c6e20..2ef42d2 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Any action can easily be started using [`just`](https://github.com/casey/just) w ## Dataset structure - All inputs (i.e. building blocks from other sources) are located in `input/`. -- All custom code is located in `code/`. +- All custom code is located in `src/`. - All final output data is located in `output/` ## Output data structure @@ -51,7 +51,7 @@ Contained in `packages.csv`, 4 columns: Represents information about the unique system installations represented in the raw dataset. -Contained in `packages.csv`, 2 columns: +Contained in `unique_installs.csv`, 2 columns: - `date`: the date a specific file is relevant for - `unique`: the amount of unique installations counted on the observation date diff --git a/justfile b/justfile index ba1d05a..dca1e7f 100644 --- a/justfile +++ b/justfile @@ -3,16 +3,16 @@ default: versioned all: files kernels unique packages files: - python code/files.py input output + python src/files.py input output kernels: - python code/kernels.py input output + python src/kernels.py input output unique: - python code/unique.py input output + python src/unique.py input output packages: - python code/packages.py input output + python src/packages.py input output versioned: datalad run -m "Create updated output data" -i input/ -o output/ just all diff --git a/code/.gitattributes b/src/.gitattributes similarity index 100% rename from code/.gitattributes rename to src/.gitattributes diff --git a/code/README.md b/src/README.md similarity index 85% rename from code/README.md rename to src/README.md index b1f9126..24aa885 100644 --- a/code/README.md +++ b/src/README.md @@ -1,6 +1,6 @@ # Popcorn dataset code -Each script can be run stand-alone like `python code/files.py `, +Each script can be run stand-alone like `python src/files.py `, exchanging the script file for the one intended. 
It is suggested, however, to run the scripts using the `just` command runner from the
diff --git a/code/files.py b/src/files.py
similarity index 100%
rename from code/files.py
rename to src/files.py
diff --git a/code/kernels.py b/src/kernels.py
similarity index 100%
rename from code/kernels.py
rename to src/kernels.py
diff --git a/code/packages.py b/src/packages.py
similarity index 100%
rename from code/packages.py
rename to src/packages.py
diff --git a/src/tests/__init__.py b/src/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tests/test_validate_date_col.py b/src/tests/test_validate_date_col.py
new file mode 100644
index 0000000..e36e445
--- /dev/null
+++ b/src/tests/test_validate_date_col.py
@@ -0,0 +1,10 @@
+import dataframely as dy
+import polars as pl
+
+
+class DateSchema(dy.Schema):  # Base schema: one non-nullable "date" column plus a lower-bound rule.
+    date: dy.Date = dy.Date(nullable=False)
+
+    @dy.rule()
+    def minimum_starting_date() -> pl.Expr:  # Rule: "date" must be strictly later than 2018-05-08.
+        return pl.col("date") > pl.date(2018, 5, 8)  # NOTE(review): strict ">" excludes 2018-05-08 itself - confirm intended.
diff --git a/src/tests/test_validate_files.py b/src/tests/test_validate_files.py
new file mode 100644
index 0000000..c232ba3
--- /dev/null
+++ b/src/tests/test_validate_files.py
@@ -0,0 +1,24 @@
+import dataframely as dy
+import polars as pl
+
+from tests.test_validate_date_col import DateSchema
+
+
+class FilesSchema(DateSchema):  # DateSchema plus non-nullable filename (string), mtime (float), filesize (integer).
+    filename: dy.String = dy.String(nullable=False)
+    mtime: dy.Float = dy.Float(nullable=False)
+    filesize: dy.Integer = dy.Integer(nullable=False)
+
+
+def test_files_schema():  # Reads output/files.csv with explicit dtypes and validates it against FilesSchema.
+    _ = FilesSchema.validate(  # The value returned by validate() is deliberately discarded.
+        pl.scan_csv(
+            "output/files.csv",  # Relative path - presumably resolved from the repository root; confirm.
+            schema={
+                "date": pl.Date,
+                "filename": pl.String,
+                "mtime": pl.Float32,  # NOTE(review): if mtime holds epoch seconds, Float32 cannot represent them exactly - confirm.
+                "filesize": pl.UInt32,  # NOTE(review): UInt32 tops out just under 4 GiB - confirm sizes stay below that.
+            },
+        ).collect(engine="streaming")  # The lazy scan is materialised with the streaming engine before validation.
+    )
diff --git a/src/tests/test_validate_kernels.py b/src/tests/test_validate_kernels.py
new file mode 100644
index 0000000..bdbcfa1
--- /dev/null
+++ b/src/tests/test_validate_kernels.py
@@ -0,0 +1,22 @@
+import dataframely as dy
+import polars as pl
+
+from tests.test_validate_date_col import DateSchema
+
+
+class KernelsSchema(DateSchema):  # DateSchema plus non-nullable kernel (string) and downloads (integer).
+    kernel: dy.String = dy.String(nullable=False)
+    downloads: dy.Integer = dy.Integer(nullable=False)
+
+
+def test_kernels_schema():  # Reads output/kernels.csv with explicit dtypes and validates it against KernelsSchema.
+    _ = KernelsSchema.validate(  # The value returned by validate() is deliberately discarded.
+        pl.scan_csv(
+            "output/kernels.csv",  # Relative path - presumably resolved from the repository root; confirm.
+            schema={
+                "date": pl.Date,
+                "kernel": pl.String,
+                "downloads": pl.UInt32,
+            },
+        ).collect(engine="streaming")  # The lazy scan is materialised with the streaming engine before validation.
+    )
diff --git a/src/tests/test_validate_packages.py b/src/tests/test_validate_packages.py
new file mode 100644
index 0000000..d986cd3
--- /dev/null
+++ b/src/tests/test_validate_packages.py
@@ -0,0 +1,24 @@
+import dataframely as dy
+import polars as pl
+
+from tests.test_validate_date_col import DateSchema
+
+
+class PackagesSchema(DateSchema):  # DateSchema plus non-nullable package and version (strings) and count (integer).
+    package: dy.String = dy.String(nullable=False)
+    version: dy.String = dy.String(nullable=False)
+    count: dy.Integer = dy.Integer(nullable=False)
+
+
+def test_packages_schema():  # Reads output/packages.csv with explicit dtypes and validates it against PackagesSchema.
+    _ = PackagesSchema.validate(  # The value returned by validate() is deliberately discarded.
+        pl.scan_csv(
+            "output/packages.csv",  # Relative path - presumably resolved from the repository root; confirm.
+            schema={
+                "date": pl.Date,
+                "package": pl.String,
+                "version": pl.String,
+                "count": pl.UInt16,  # NOTE(review): UInt16 tops out at 65535 - confirm counts cannot exceed it.
+            },
+        ).collect(engine="streaming")  # The lazy scan is materialised with the streaming engine before validation.
+    )
diff --git a/src/tests/test_validate_unique_installs.py b/src/tests/test_validate_unique_installs.py
new file mode 100644
index 0000000..109ccb1
--- /dev/null
+++ b/src/tests/test_validate_unique_installs.py
@@ -0,0 +1,24 @@
+import dataframely as dy
+import polars as pl
+
+from tests.test_validate_date_col import DateSchema
+
+
+class UniquesSchema(DateSchema):  # DateSchema plus a non-nullable integer "unique" column that must be positive.
+    unique: dy.Integer = dy.Integer(nullable=False)
+
+    @dy.rule()
+    def cannot_be_zero() -> pl.Expr:  # Rule: "unique" must be greater than zero.
+        return pl.col("unique") > 0
+
+
+def test_uniques_schema():  # Reads output/unique_installs.csv with explicit dtypes and validates it against UniquesSchema.
+    _ = UniquesSchema.validate(  # The value returned by validate() is deliberately discarded.
+        pl.scan_csv(
+            "output/unique_installs.csv",  # Relative path - presumably resolved from the repository root; confirm.
+            schema={
+                "date": pl.Date,
+                "unique": pl.UInt16,  # NOTE(review): UInt16 tops out at 65535 - confirm install counts cannot exceed it.
+            },
+        ).collect(engine="streaming")  # The lazy scan is materialised with the streaming engine before validation.
+    )
diff --git a/code/unique.py b/src/unique.py
similarity index 100%
rename from code/unique.py
rename to src/unique.py