# Provenance: reconstructed from patch a0794c6d09d9331884bb075aa834122930013082
# (Marty Oehme, Wed, 14 Feb 2024 15:48:47 +0100,
# "feat(code): Add validity calculation"), which creates
# src/calculate_validities.py.
"""Score studies for external and internal validity.

External validity is derived from the free-text representativeness column,
internal validity from the free-text method column. Only rows whose
``design`` is ``"experimental"`` or ``"quasi-experimental"`` are scored.
"""
from typing import cast
from pandas import DataFrame

# (regex, score) pairs, applied in order; a later match overwrites an earlier
# one. "subnational" contains "national", so the subnational rule must come
# after the national rule to win for subnational rows.
_EXTERNAL_RULES = (
    ("national", 5.0),
    ("regional", 4.0),
    ("subnational", 3.0),
    ("local", 2.0),
)

# Ordered from lowest to highest score so that, when several approaches are
# mentioned in one cell, the highest-scoring one is the last to be written.
_INTERNAL_RULES = (
    ("|".join(["OLS", "ordinary.least.square"]), 2.0),
    ("|".join(["DM", "discontinuity.matching"]), 3.0),
    (
        "|".join(["DID", "difference.in.diff", "diff.in.diff", "triple.diff"]),
        3.0,
    ),
    (
        "|".join(["PSM", "propensity.score.matching", "score.matching"]),
        3.5,
    ),
    ("|".join(["IV", "instrumental.variable"]), 4.0),
    ("|".join(["RD", "regression.discontinuity"]), 4.5),
    ("RCT", 5.0),
)


def _apply_rules(frame: DataFrame, text_col: str, score_col: str, rules) -> None:
    """Write ``score`` into ``score_col`` for rows whose ``text_col`` matches.

    Rules are applied in the given order, so the last matching rule wins.
    ``frame`` is modified in place.
    """
    for pattern, score in rules:
        # na=False keeps the mask strictly boolean even if a cell is not a
        # string, so the .loc assignment below cannot fail on NA mask values.
        matches = frame[text_col].str.contains(pattern, na=False)
        frame.loc[matches, score_col] = score


def calculate_validities(
    df: DataFrame, repr_col: str = "representativeness", method_col: str = "method"
) -> DataFrame:
    """Return the (quasi-)experimental rows of ``df`` with validity scores.

    Args:
        df: Input frame. Must contain a ``design`` column as well as the
            columns named by ``repr_col`` and ``method_col``.
        repr_col: Name of the text column describing representativeness.
        method_col: Name of the text column describing the method used.

    Returns:
        A new frame holding only rows whose ``design`` equals
        ``"quasi-experimental"`` or ``"experimental"``, with two float columns
        added: ``external_validity`` and ``internal_validity``. Rows that match
        no rule keep the score ``0.0``. Missing values in ``repr_col`` and
        ``method_col`` are replaced by empty strings in the returned frame.
        ``df`` itself is not modified.

    Raises:
        KeyError: If ``design``, ``repr_col`` or ``method_col`` is missing.
    """
    EXT_COL_NAME: str = "external_validity"
    INT_COL_NAME: str = "internal_validity"

    selected = df["design"].isin(("quasi-experimental", "experimental"))
    # Explicit copy: the columns added below must not be written to a
    # view/slice of the caller's frame.
    vd = cast(DataFrame, df.loc[selected].copy())

    # Float defaults so the columns have one stable dtype regardless of
    # which scores (e.g. 3.5, 4.5) end up being assigned.
    vd[EXT_COL_NAME] = 0.0
    vd[INT_COL_NAME] = 0.0

    vd[repr_col] = vd[repr_col].fillna("")
    _apply_rules(vd, repr_col, EXT_COL_NAME, _EXTERNAL_RULES)

    vd[method_col] = vd[method_col].fillna("")
    _apply_rules(vd, method_col, INT_COL_NAME, _INTERNAL_RULES)

    return vd


def _main() -> None:
    """Load the data set, score it and print the result as CSV to stdout.

    An optional single command-line argument is passed to
    ``load_data.from_yml`` as a path; without it the loader is called with
    no arguments.
    """
    import sys
    from pathlib import Path

    import load_data

    if len(sys.argv) == 2:
        df = load_data.from_yml(Path(sys.argv[1]))
    else:
        df = load_data.from_yml()

    # to_csv() without a target returns the CSV text.
    print(calculate_validities(df).to_csv())


if __name__ == "__main__":
    _main()