From e032a5c19273f290f768236f8c807999fa77af80 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Fri, 16 Feb 2024 17:59:09 +0100
Subject: [PATCH] refactor(code): Extract internal validity ranking keywords

Extracted keywords into dict which can be extended with either keywords
or rankings as needed, providing more flexibility.
---
 src/model/validity.py | 82 ++++++++++++++++++++++++------------------
 1 file changed, 46 insertions(+), 36 deletions(-)

diff --git a/src/model/validity.py b/src/model/validity.py
index f3922d5..c8d4ef5 100644
--- a/src/model/validity.py
+++ b/src/model/validity.py
@@ -33,17 +33,53 @@ def _combined_validities(
     return r"\-"
 
 
+METHOD_RANKINGS = {
+    2.0: ["OLS", "ordinary.least.square", "logistic.regression"],
+    3.0: [
+        "DM",
+        "discontinuity.matching",
+        "DID",
+        "difference.in.diff",
+        "diff.in.diff",
+        "triple.diff",
+    ],
+    3.5: ["PSM", "propensity.score.matching", "score.matching"],
+    4.0: ["IV", "instrumental.variable"],
+    4.5: ["RD", "regression.discontinuity"],
+    5.0: ["RCT", "randomi(?:s|z)ed.control.trial"],
+}
+
+
+# TODO do not filter by quasi-/experimental, but analyse the whole df passed in
+# This allows filtering to happen where it's needed but otherwise validity
+# given for all studies passed in.
 def calculate(
-    df: DataFrame, repr_col: str = "representativeness", method_col: str = "method"
+    df: DataFrame,
+    repr_col: str = "representativeness",
+    design_col: str = "design",
+    method_col: str = "method",
 ) -> DataFrame:
+    """Add internal and external validities to a dataframe.
+
+    Requires a dataframe containing a study or observation per row, with a
+    single column describing the study design, method and representativeness
+    each respectively.
+
+    Takes a combination of study design (simulation/observational/
+    quasi-experimental/experimental/..) and its method (OLS/DID/RD/...) to
+    calculate an internal validity.
+
+    Takes a study representativeness (local/subnational/national/regional/
+    census) to calculate the external validity.
+    """
     EXT_COL_NAME: str = "external_validity"
     INT_COL_NAME: str = "internal_validity"
     cols = {EXT_COL_NAME: 0.0, INT_COL_NAME: 0.0}
 
-    vd = df[
-        (df["design"] == "quasi-experimental") | (df["design"] == "experimental")
-    ].copy()
-    vd.assign(**cols)
+    # vd = df[
+    #     (df[design_col] == "quasi-experimental") | (df[design_col] == "experimental")
+    # ].copy()
+    vd = df.assign(**cols)
     vd = cast(DataFrame, vd)
 
     vd[repr_col] = vd[repr_col].fillna("")
@@ -57,37 +93,11 @@ def calculate(
     vd.loc[vd[repr_col].str.contains("local"), EXT_COL_NAME] = 2.0
 
     # needs to go lowest to highest in case of multiple mentioned approaches
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["OLS", "ordinary.least.square", "logistic.regression"])
-        ),
-        INT_COL_NAME,
-    ] = 2.0
-    vd.loc[
-        vd[method_col].str.contains("|".join(["DM", "discontinuity.matching"])),
-        INT_COL_NAME,
-    ] = 3.0
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["DID", "difference.in.diff", "diff.in.diff", "triple.diff"])
-        ),
-        INT_COL_NAME,
-    ] = 3.0
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["PSM", "propensity.score.matching", "score.matching"])
-        ),
-        INT_COL_NAME,
-    ] = 3.5
-    vd.loc[
-        vd[method_col].str.contains("|".join(["IV", "instrumental.variable"])),
-        INT_COL_NAME,
-    ] = 4.0
-    vd.loc[
-        vd[method_col].str.contains("|".join(["RD", "regression.discontinuity"])),
-        INT_COL_NAME,
-    ] = 4.5
-    vd.loc[vd[method_col].str.contains("RCT"), INT_COL_NAME] = 5.0
+    for rank, methods in METHOD_RANKINGS.items():
+        vd.loc[
+            vd[method_col].str.contains("|".join(methods)),
+            INT_COL_NAME,
+        ] = rank
 
     return vd
 