refactor(code): Split validity calc and matrix extract

Validity calculation belongs to the modelling, so we moved it into the validity module. Extracting our matrix is a processing step, so we gave it its own matrix module and moved it there. This should hopefully provide better separation of concerns going forward.
2024-02-16 11:25:19 +01:00 · 2024-02-16 11:25:19 +01:00 · fac7d4c86a
commit fac7d4c86a
parent 8333bbe9be
5 changed files with 103 additions and 89 deletions
--- a/src/model/validity.py
+++ b/src/model/validity.py
@ -1,9 +1,8 @@
 import math
+from typing import cast

 from pandas import DataFrame

-from src import prep_data
-

 def _binned_strength(strength: float) -> str:
    if strength < 3.0:
@ -28,15 +27,74 @@ def _combined_validities(
    return r"\-"


+def calculate(
+    df: DataFrame, repr_col: str = "representativeness", method_col: str = "method"
+) -> DataFrame:
+    """Score external and internal validity of (quasi-)experimental studies.
+
+    Only rows whose ``design`` column is ``"experimental"`` or
+    ``"quasi-experimental"`` are kept; the scoring is done on a copy, so ``df``
+    itself is not modified.
+
+    Args:
+        df: Studies to score. Must contain a ``design`` column plus the columns
+            named by ``repr_col`` and ``method_col``.
+        repr_col: Column whose text is matched to set ``external_validity``.
+        method_col: Column whose text is matched to set ``internal_validity``.
+
+    Returns:
+        The filtered copy with ``external_validity`` and ``internal_validity``
+        set for every row that matched one of the patterns below.
+    """
+    EXT_COL_NAME: str = "external_validity"
+    INT_COL_NAME: str = "internal_validity"
+    cols = {EXT_COL_NAME: 0.0, INT_COL_NAME: 0.0}
+
+    # Restrict to experimental designs; .copy() keeps later writes off ``df``.
+    vd = df[
+        (df["design"] == "quasi-experimental") | (df["design"] == "experimental")
+    ].copy()
+    # NOTE(review): DataFrame.assign returns a new frame and the result is not
+    # bound here, so this line does not add the 0.0 defaults to ``vd``. The
+    # columns presumably first appear through the ``.loc`` assignments below,
+    # leaving unmatched rows as NaN - TODO confirm which default is intended
+    # (add_to_findings fills missing scores with 1.0).
+    vd.assign(**cols)
+    # cast() only informs the type checker; it does nothing at runtime.
+    vd = cast(DataFrame, vd)
+
+    # Replace missing text with "" so the ``.str.contains`` masks below are
+    # plain booleans.
+    vd[repr_col] = vd[repr_col].fillna("")
+    vd[method_col] = vd[method_col].fillna("")
+    # str.contains treats its pattern as a regular expression by default, so
+    # "|" is alternation and "." matches any single character.
+    #
+    # Each assignment overwrites earlier ones for rows that match again, so the
+    # statement order decides the final score. "national" is also a substring
+    # of "subnational", which is why the subnational score is set afterwards.
+    # needs to check national before subnational and census, subnational before local
+    vd.loc[
+        vd[repr_col].str.contains("|".join(["national", "regional"])), EXT_COL_NAME
+    ] = 4.0
+    vd.loc[vd[repr_col].str.contains("census"), EXT_COL_NAME] = 5.0
+    vd.loc[vd[repr_col].str.contains("subnational"), EXT_COL_NAME] = 3.0
+    vd.loc[vd[repr_col].str.contains("local"), EXT_COL_NAME] = 2.0
+
+    # Scores are assigned in ascending order so that, when several methods are
+    # mentioned, the last (highest-scored) match wins.
+    # NOTE(review): the short abbreviations ("DM", "IV", "RD", ...) are matched
+    # case-sensitively anywhere in the text, so they can also hit inside longer
+    # words - verify against the actual method strings.
+    # needs to go lowest to highest in case of multiple mentioned approaches
+    vd.loc[
+        vd[method_col].str.contains(
+            "|".join(["OLS", "ordinary.least.square", "logistic.regression"])
+        ),
+        INT_COL_NAME,
+    ] = 2.0
+    vd.loc[
+        vd[method_col].str.contains("|".join(["DM", "discontinuity.matching"])),
+        INT_COL_NAME,
+    ] = 3.0
+    vd.loc[
+        vd[method_col].str.contains(
+            "|".join(["DID", "difference.in.diff", "diff.in.diff", "triple.diff"])
+        ),
+        INT_COL_NAME,
+    ] = 3.0
+    vd.loc[
+        vd[method_col].str.contains(
+            "|".join(["PSM", "propensity.score.matching", "score.matching"])
+        ),
+        INT_COL_NAME,
+    ] = 3.5
+    vd.loc[
+        vd[method_col].str.contains("|".join(["IV", "instrumental.variable"])),
+        INT_COL_NAME,
+    ] = 4.0
+    vd.loc[
+        vd[method_col].str.contains("|".join(["RD", "regression.discontinuity"])),
+        INT_COL_NAME,
+    ] = 4.5
+    vd.loc[vd[method_col].str.contains("RCT"), INT_COL_NAME] = 5.0
+
+    return vd
+
+
 def add_to_findings(
    findings_df: DataFrame, studies_by_intervention: DataFrame
 ) -> DataFrame:
    valid_subset = (
-        prep_data.calculate_validities(studies_by_intervention)[
+        calculate(studies_by_intervention)[
            ["internal_validity", "external_validity", "citation"]
        ]
        .fillna(1.0)
-        .drop_duplicates(subset=["citation"]) # type: ignore
+        .drop_duplicates(subset=["citation"])  # type: ignore
        .sort_values("internal_validity")
    )