feat(script): Begin using validities for visualization

2024-02-14 17:30:04 +01:00 · 2024-02-14 17:30:04 +01:00 · 227adb33f8
commit 227adb33f8
parent 41b2d651a6
3 changed files with 8 additions and 25 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -41,7 +41,7 @@ cmd = "nvim"
 [tool.poe.tasks.extract]
 help = "Extract the csv data from raw yaml files"
 shell = """
-python src/load_data.py > 02-data/processed/extracted.csv
+python src/prep_data.py > 02-data/processed/extracted.csv
 """
 [tool.poe.tasks.milestone]
 help = "Extract, render, commit and version a finished artifact"
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@ -608,36 +608,19 @@ which are then descriptively distinguished between for their primary outcome ine
 Each main thematic area will be preceded by a table presenting the overall inequalities reviewed,
 main findings and accompanying channels that could be identified.
 Afterwards, the analytical lens will be inverted for the discussion (Section 5)
-and the reviewed studies discussed from a perspective of their analysed inequalities,
+and the reviewed studies discussed from a perspective of their analysed inequalities and limitations,
 to better identify areas of strong analytical lenses or areas of more limited analyses.

 ```{python}
-vd = by_intervention[(by_intervention['design'] == 'quasi-experimental') | (by_intervention['design'] == 'experimental')]
-vd = vd.assign(valid_ext=0)
+from src import prep_data

-# assign external validities
-vd["representativeness"] = vd["representativeness"].fillna("")
-vd.loc[vd['representativeness'].str.contains("subnational"), 'valid_ext'] = 5.0
-vd.loc[vd['representativeness'].str.contains("national"), 'valid_ext'] = 4.0
-vd.loc[vd['representativeness'].str.contains("regional"), 'valid_ext'] = 3.0
-vd.loc[vd['representativeness'].str.contains("local"), 'valid_ext'] = 2.0
-
-# assign internal validities
-vd = vd.assign(valid_int=0)
-vd["method"] = vd["method"].fillna("")
-vd.loc[vd['method'].str.contains("RCT"), 'valid_int'] = 5.0
-vd.loc[vd['method'].str.contains("|".join(["RD","regression.discontinuity"])), 'valid_int'] = 4.5
-vd.loc[vd['method'].str.contains("|".join(["IV","instrumental.variable"])), 'valid_int'] = 4.0
-vd.loc[vd['method'].str.contains("|".join(["PSM","propensity.score.matching"])), 'valid_int'] = 3.5
-vd.loc[vd['method'].str.contains("|".join(["DM","discontinuity.matching"])), 'valid_int'] = 3.0
-vd.loc[vd['method'].str.contains("|".join(["DID","difference.in.difference", "triple.diff"])), 'valid_int'] = 3.0
-vd.loc[vd['method'].str.contains("|".join(["OLS","ordinary.least.square"])), 'valid_int'] = 2.0
+validities = prep_data.calculate_validities(by_intervention)

 # Melt the dataframe to long format for plotting
-melted_df = vd.melt(value_vars=['valid_int', 'valid_ext'], id_vars
-='intervention', var_name='Validity')
+# melted_validities = validities.melt(value_vars=['valid_int', 'valid_ext'], id_vars
+# ='intervention', var_name='Validity')
 # Create a stacked histplot using Seaborn
-sns.histplot(data=melted_df, y='intervention', hue='Validity', multiple='stack')
+sns.scatterplot(data=validities, x='external_validity', y='internal_validity', hue='intervention')
 ```

 ## Institutional
--- a/src/calculate_validities.py
+++ b/src/calculate_validities.py
@ -8,7 +8,7 @@ def calculate_validities(
 ) -> DataFrame:
    EXT_COL_NAME: str = "external_validity"
    INT_COL_NAME: str = "internal_validity"
-    cols = {EXT_COL_NAME: 0, INT_COL_NAME: 0}
+    cols = {EXT_COL_NAME: 0.0, INT_COL_NAME: 0.0}

    vd = df[
        (df["design"] == "quasi-experimental") | (df["design"] == "experimental")