refactor(code): Extract internal validity ranking keywords

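Extracted the method keywords used for the internal validity ranking into a module-level dict (`METHOD_RANKINGS`) that maps each ranking to its list of keyword patterns. New keywords or rankings can now be added by extending the dict, and a single loop applies them in place of the repeated per-method assignments.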
2024-02-16 17:59:09 +01:00 · 2024-02-16 17:59:09 +01:00 · e032a5c192
commit e032a5c192
parent 4e865ee2b5
1 changed files with 46 additions and 36 deletions
--- a/src/model/validity.py
+++ b/src/model/validity.py
@ -33,17 +33,53 @@ def _combined_validities(
    return r"\-"


+METHOD_RANKINGS = {
+    2.0: ["OLS", "ordinary.least.square", "logistic.regression"],
+    3.0: [
+        "DM",
+        "discontinuity.matching",
+        "DID",
+        "difference.in.diff",
+        "diff.in.diff",
+        "triple.diff",
+    ],
+    3.5: ["PSM", "propensity.score.matching", "score.matching"],
+    4.0: ["IV", "instrumental.variable"],
+    4.5: ["RD", "regression.discontinuity"],
+    5.0: ["RCT", "randomi(?:s|z)ed.control.trial"],
+}
+
+
+# TODO do not filter by quasi-/experimental, but analyse the whole df passed in
+#      This allows filtering to happen where it's needed but otherwise validity
+#      given for all studies passed in.
 def calculate(
-    df: DataFrame, repr_col: str = "representativeness", method_col: str = "method"
+    df: DataFrame,
+    repr_col: str = "representativeness",
+    design_col: str = "design",
+    method_col: str = "method",
 ) -> DataFrame:
+    """Add internal and external validities to a dataframe.
+
+    Requires a dataframe containing a study or observation per row, with a
+    single column describing the study design, method and representativeness
+    each respectively.
+
+    Takes a combination of study design (simulation/observational/
+    quasi-experimental/experimental/..) and its method (OLS/DID/RD/...) to
+    calculate an internal validity.
+
+    Takes a study representativeness (local/subnational/national/regional/
+    census) to calculate the external validity.
+    """
    EXT_COL_NAME: str = "external_validity"
    INT_COL_NAME: str = "internal_validity"
    cols = {EXT_COL_NAME: 0.0, INT_COL_NAME: 0.0}

-    vd = df[
-        (df["design"] == "quasi-experimental") | (df["design"] == "experimental")
-    ].copy()
-    vd.assign(**cols)
+    # vd = df[
+    #     (df[design_col] == "quasi-experimental") | (df[design_col] == "experimental")
+    # ].copy()
+    vd = df.assign(**cols)
    vd = cast(DataFrame, vd)

    vd[repr_col] = vd[repr_col].fillna("")
@ -57,37 +93,11 @@ def calculate(
    vd.loc[vd[repr_col].str.contains("local"), EXT_COL_NAME] = 2.0

    # needs to go lowest to highest in case of multiple mentioned approaches
+    for rank, methods in METHOD_RANKINGS.items():
        vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["OLS", "ordinary.least.square", "logistic.regression"])
-        ),
+            vd[method_col].str.contains("|".join(methods)),
            INT_COL_NAME,
-    ] = 2.0
-    vd.loc[
-        vd[method_col].str.contains("|".join(["DM", "discontinuity.matching"])),
-        INT_COL_NAME,
-    ] = 3.0
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["DID", "difference.in.diff", "diff.in.diff", "triple.diff"])
-        ),
-        INT_COL_NAME,
-    ] = 3.0
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["PSM", "propensity.score.matching", "score.matching"])
-        ),
-        INT_COL_NAME,
-    ] = 3.5
-    vd.loc[
-        vd[method_col].str.contains("|".join(["IV", "instrumental.variable"])),
-        INT_COL_NAME,
-    ] = 4.0
-    vd.loc[
-        vd[method_col].str.contains("|".join(["RD", "regression.discontinuity"])),
-        INT_COL_NAME,
-    ] = 4.5
-    vd.loc[vd[method_col].str.contains("RCT"), INT_COL_NAME] = 5.0
+        ] = rank

    return vd