refactor(code): Extract internal validity ranking keywords

Extracted the keywords into a dict that can be extended with either keywords or rankings as needed, providing more flexibility.
2024-02-16 17:59:09 +01:00 · 2024-02-16 17:59:09 +01:00 · e032a5c192
commit e032a5c192
parent 4e865ee2b5
1 changed files with 46 additions and 36 deletions
--- a/src/model/validity.py
+++ b/src/model/validity.py
@ -33,17 +33,53 @@ def _combined_validities(
    return r"\-"
 # Maps an internal-validity rank to the method keywords that earn that rank.
 # `calculate` joins each keyword list with "|" and passes the result to
 # `str.contains` on the method column, so entries act as regular-expression
 # fragments (see `randomi(?:s|z)ed`); the "." presumably stands in for any
 # separator character (space, hyphen, underscore) -- confirm against the data.
 # Ranks are listed lowest to highest: `calculate` applies them in dict order,
 # so when a study mentions several methods the highest matching rank wins.
 METHOD_RANKINGS = {
    2.0: ["OLS", "ordinary.least.square", "logistic.regression"],
    3.0: [
        "DM",
        "discontinuity.matching",
        "DID",
        "difference.in.diff",
        "diff.in.diff",
        "triple.diff",
    ],
    3.5: ["PSM", "propensity.score.matching", "score.matching"],
    4.0: ["IV", "instrumental.variable"],
    4.5: ["RD", "regression.discontinuity"],
    5.0: ["RCT", "randomi(?:s|z)ed.control.trial"],
 }
 # TODO do not filter by quasi-/experimental, but analyse the whole df passed in.
 #      This lets callers filter where it is needed, while validity is
 #      otherwise computed for all studies passed in.
 def calculate(
-    df: DataFrame, repr_col: str = "representativeness", method_col: str = "method"
+    df: DataFrame,
    repr_col: str = "representativeness",
    design_col: str = "design",
    method_col: str = "method",
 ) -> DataFrame:
    """Add internal and external validities to a dataframe.
    Requires a dataframe containing a study or observation per row, with a
    single column describing the study design, method and representativeness
    each respectively.
    Takes a combination of study design (simulation/observational/
    quasi-experimental/experimental/..) and its method (OLS/DID/RD/...) to
    calculate an internal validity.
    Takes a study representativeness (local/subnational/national/regional/
    census) to calculate the external validity.
    """
    EXT_COL_NAME: str = "external_validity"
    INT_COL_NAME: str = "internal_validity"
    cols = {EXT_COL_NAME: 0.0, INT_COL_NAME: 0.0}
-    vd = df[
+    # vd = df[
-        (df["design"] == "quasi-experimental") | (df["design"] == "experimental")
+    #     (df[design_col] == "quasi-experimental") | (df[design_col] == "experimental")
-    ].copy()
+    # ].copy()
-    vd.assign(**cols)
+    vd = df.assign(**cols)
    vd = cast(DataFrame, vd)
    vd[repr_col] = vd[repr_col].fillna("")
@ -57,37 +93,11 @@ def calculate(
    vd.loc[vd[repr_col].str.contains("local"), EXT_COL_NAME] = 2.0
    # needs to go lowest to highest in case of multiple mentioned approaches
-    vd.loc[
+    for rank, methods in METHOD_RANKINGS.items():
-        vd[method_col].str.contains(
+        vd.loc[
-            "|".join(["OLS", "ordinary.least.square", "logistic.regression"])
+            vd[method_col].str.contains("|".join(methods)),
-        ),
+            INT_COL_NAME,
-        INT_COL_NAME,
+        ] = rank
    ] = 2.0
    vd.loc[
        vd[method_col].str.contains("|".join(["DM", "discontinuity.matching"])),
        INT_COL_NAME,
    ] = 3.0
    vd.loc[
        vd[method_col].str.contains(
            "|".join(["DID", "difference.in.diff", "diff.in.diff", "triple.diff"])
        ),
        INT_COL_NAME,
    ] = 3.0
    vd.loc[
        vd[method_col].str.contains(
            "|".join(["PSM", "propensity.score.matching", "score.matching"])
        ),
        INT_COL_NAME,
    ] = 3.5
    vd.loc[
        vd[method_col].str.contains("|".join(["IV", "instrumental.variable"])),
        INT_COL_NAME,
    ] = 4.0
    vd.loc[
        vd[method_col].str.contains("|".join(["RD", "regression.discontinuity"])),
        INT_COL_NAME,
    ] = 4.5
    vd.loc[vd[method_col].str.contains("RCT"), INT_COL_NAME] = 5.0
    return vd