Validate CSV output schemas
Also moved code dir to src. There are reasons to do standard things in standard ways. While it is possible to get the `code/` directory to work, and recognize it as a package path, this requires wrangling the pyproject.toml file. Additionally, any import from the `code.something` path automatically shadows the Python stdlib `code` module. While it may not be strictly necessary, it is still good practice not to shadow standard-library modules.
This commit is contained in:
parent
de96b67fac
commit
2faeda87c3
14 changed files with 111 additions and 7 deletions
|
|
@ -9,7 +9,7 @@ Any action can easily be started using [`just`](https://github.com/casey/just) w
|
|||
## Dataset structure
|
||||
|
||||
- All inputs (i.e. building blocks from other sources) are located in `input/`.
|
||||
- All custom code is located in `code/`.
|
||||
- All custom code is located in `src/`.
|
||||
- All final output data is located in `output/`
|
||||
|
||||
## Output data structure
|
||||
|
|
@ -51,7 +51,7 @@ Contained in `packages.csv`, 4 columns:
|
|||
|
||||
Represents information about the unique system installations observed in the raw dataset.
|
||||
|
||||
Contained in `packages.csv`, 2 columns:
|
||||
Contained in `unique_installs.csv`, 2 columns:
|
||||
|
||||
- `date`: the date a specific file is relevant for
|
||||
- `unique`: the amount of unique installations counted on the observation date
|
||||
|
|
|
|||
8
justfile
8
justfile
|
|
@ -3,16 +3,16 @@ default: versioned
|
|||
all: files kernels unique packages
|
||||
|
||||
files:
|
||||
python code/files.py input output
|
||||
python src/files.py input output
|
||||
|
||||
kernels:
|
||||
python code/kernels.py input output
|
||||
python src/kernels.py input output
|
||||
|
||||
unique:
|
||||
python code/unique.py input output
|
||||
python src/unique.py input output
|
||||
|
||||
packages:
|
||||
python code/packages.py input output
|
||||
python src/packages.py input output
|
||||
|
||||
versioned:
|
||||
datalad run -m "Create updated output data" -i input/ -o output/ just all
|
||||
|
|
|
|||
0
code/.gitattributes → src/.gitattributes
vendored
0
code/.gitattributes → src/.gitattributes
vendored
|
|
@ -1,6 +1,6 @@
|
|||
# Popcorn dataset code
|
||||
|
||||
Each script can be run stand-alone like `python code/files.py <input-dir> <output-dir>`,
|
||||
Each script can be run stand-alone like `python src/files.py <input-dir> <output-dir>`,
|
||||
exchanging the script file for the one intended.
|
||||
|
||||
It is suggested, however, to run the scripts using the `just` command runner from the
|
||||
0
src/tests/__init__.py
Normal file
0
src/tests/__init__.py
Normal file
10
src/tests/test_validate_date_col.py
Normal file
10
src/tests/test_validate_date_col.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
import dataframely as dy
|
||||
import polars as pl
|
||||
|
||||
|
||||
class DateSchema(dy.Schema):
    """Shared base schema: every output CSV carries a non-null ``date`` column."""

    # The observation date a row of output data refers to.
    date: dy.Date = dy.Date(nullable=False)

    @dy.rule()
    def minimum_starting_date() -> pl.Expr:
        # NOTE(review): strict `>` excludes 2018-05-08 itself; if that date is
        # the first valid observation day, this should be `>=` — confirm intent.
        return pl.col("date") > pl.date(2018, 5, 8)
|
||||
24
src/tests/test_validate_files.py
Normal file
24
src/tests/test_validate_files.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import dataframely as dy
|
||||
import polars as pl
|
||||
|
||||
from tests.test_validate_date_col import DateSchema
|
||||
|
||||
|
||||
class FilesSchema(DateSchema):
    """Schema for ``output/files.csv``: per-date file observations.

    Inherits the non-null ``date`` column and its minimum-date rule
    from ``DateSchema``.
    """

    # Name of the observed file; must always be present.
    filename: dy.String = dy.String(nullable=False)
    # Modification time of the file (fractional seconds).
    mtime: dy.Float = dy.Float(nullable=False)
    # Size of the file; non-null for every row.
    filesize: dy.Integer = dy.Integer(nullable=False)
|
||||
|
||||
|
||||
def test_files_schema():
    """Load output/files.csv with an explicit dtype map and validate it.

    ``FilesSchema.validate`` raises on any schema or rule violation, so
    reaching the end of this test means the CSV conforms.
    """
    lazy_frame = pl.scan_csv(
        "output/files.csv",
        schema={
            "date": pl.Date,
            "filename": pl.String,
            "mtime": pl.Float32,
            "filesize": pl.UInt32,
        },
    )
    _ = FilesSchema.validate(lazy_frame.collect(engine="streaming"))
|
||||
22
src/tests/test_validate_kernels.py
Normal file
22
src/tests/test_validate_kernels.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
import dataframely as dy
|
||||
import polars as pl
|
||||
|
||||
from tests.test_validate_date_col import DateSchema
|
||||
|
||||
|
||||
class KernelsSchema(DateSchema):
    """Schema for ``output/kernels.csv``: per-date kernel download counts.

    Inherits the non-null ``date`` column and its minimum-date rule
    from ``DateSchema``.
    """

    # Kernel identifier string; required for every row.
    kernel: dy.String = dy.String(nullable=False)
    # Download count recorded for that kernel on the observation date.
    downloads: dy.Integer = dy.Integer(nullable=False)
|
||||
|
||||
|
||||
def test_kernels_schema():
    """Load output/kernels.csv with an explicit dtype map and validate it.

    ``KernelsSchema.validate`` raises on any schema or rule violation, so
    reaching the end of this test means the CSV conforms.
    """
    lazy_frame = pl.scan_csv(
        "output/kernels.csv",
        schema={
            "date": pl.Date,
            "kernel": pl.String,
            "downloads": pl.UInt32,
        },
    )
    _ = KernelsSchema.validate(lazy_frame.collect(engine="streaming"))
|
||||
24
src/tests/test_validate_packages.py
Normal file
24
src/tests/test_validate_packages.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import dataframely as dy
|
||||
import polars as pl
|
||||
|
||||
from tests.test_validate_date_col import DateSchema
|
||||
|
||||
|
||||
class PackagesSchema(DateSchema):
    """Schema for ``output/packages.csv``: per-date package/version counts.

    Inherits the non-null ``date`` column and its minimum-date rule
    from ``DateSchema``.
    """

    # Package name; required for every row.
    package: dy.String = dy.String(nullable=False)
    # Package version string; required for every row.
    version: dy.String = dy.String(nullable=False)
    # Count observed for this package/version on the observation date.
    count: dy.Integer = dy.Integer(nullable=False)
|
||||
|
||||
|
||||
def test_packages_schema():
    """Load output/packages.csv with an explicit dtype map and validate it.

    ``PackagesSchema.validate`` raises on any schema or rule violation, so
    reaching the end of this test means the CSV conforms.

    NOTE(review): reading ``count`` as pl.UInt16 caps values at 65535 —
    confirm that bound holds as the dataset grows.
    """
    lazy_frame = pl.scan_csv(
        "output/packages.csv",
        schema={
            "date": pl.Date,
            "package": pl.String,
            "version": pl.String,
            "count": pl.UInt16,
        },
    )
    _ = PackagesSchema.validate(lazy_frame.collect(engine="streaming"))
|
||||
24
src/tests/test_validate_unique_installs.py
Normal file
24
src/tests/test_validate_unique_installs.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import dataframely as dy
|
||||
import polars as pl
|
||||
|
||||
from tests.test_validate_date_col import DateSchema
|
||||
|
||||
|
||||
class UniquesSchema(DateSchema):
    """Schema for ``output/unique_installs.csv``: per-date unique-install counts.

    Inherits the non-null ``date`` column and its minimum-date rule
    from ``DateSchema``.
    """

    # Number of unique installations counted on the observation date.
    unique: dy.Integer = dy.Integer(nullable=False)

    @dy.rule()
    def cannot_be_zero() -> pl.Expr:
        # A row only exists because at least one install was observed.
        return pl.col("unique") > 0
|
||||
|
||||
|
||||
def test_uniques_schema():
    """Load output/unique_installs.csv with an explicit dtype map and validate it.

    ``UniquesSchema.validate`` raises on any schema or rule violation, so
    reaching the end of this test means the CSV conforms.

    NOTE(review): reading ``unique`` as pl.UInt16 caps values at 65535 —
    confirm that bound holds as the dataset grows.
    """
    lazy_frame = pl.scan_csv(
        "output/unique_installs.csv",
        schema={
            "date": pl.Date,
            "unique": pl.UInt16,
        },
    )
    _ = UniquesSchema.validate(lazy_frame.collect(engine="streaming"))
|
||||
Loading…
Add table
Add a link
Reference in a new issue