refactor(code): Split validity calc and matrix extract
Validity calculation belongs to the modelling, so we moved it into the validity module. Extracting the matrix is a processing step, so it now lives in its own matrix module. This should provide better separation of concerns going forward.
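As a rough sketch of the layout this message describes (the file paths and the matrix helper's name and signature are assumptions for illustration, not taken from the diff):

    # Assumed post-refactor layout:
    #   src/validity.py  -- modelling: validity scoring (calculate() in the diff below)
    #   src/matrix.py    -- processing: matrix extraction (hypothetical extract() helper)
    from src import matrix, validity

    scored = validity.calculate(studies_by_intervention)  # was prep_data.calculate_validities(...)
    matrix_df = matrix.extract(scored)  # hypothetical name for the extraction step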
This commit is contained in:
parent 8333bbe9be
commit fac7d4c86a

5 changed files with 103 additions and 89 deletions
@@ -1,9 +1,8 @@
import math
from typing import cast

from pandas import DataFrame

from src import prep_data


def _binned_strength(strength: float) -> str:
    if strength < 3.0:
@@ -28,15 +27,74 @@ def _combined_validities(
    return r"\-"


def calculate(
    df: DataFrame, repr_col: str = "representativeness", method_col: str = "method"
) -> DataFrame:
    EXT_COL_NAME: str = "external_validity"
    INT_COL_NAME: str = "internal_validity"
    cols = {EXT_COL_NAME: 0.0, INT_COL_NAME: 0.0}

    vd = df[
        (df["design"] == "quasi-experimental") | (df["design"] == "experimental")
    ].copy()
    vd.assign(**cols)
    vd = cast(DataFrame, vd)

    vd[repr_col] = vd[repr_col].fillna("")
    vd[method_col] = vd[method_col].fillna("")
    # needs to check national before subnational and census, subnational before local
    vd.loc[
        vd[repr_col].str.contains("|".join(["national", "regional"])), EXT_COL_NAME
    ] = 4.0
    vd.loc[vd[repr_col].str.contains("census"), EXT_COL_NAME] = 5.0
    vd.loc[vd[repr_col].str.contains("subnational"), EXT_COL_NAME] = 3.0
    vd.loc[vd[repr_col].str.contains("local"), EXT_COL_NAME] = 2.0

    # needs to go lowest to highest in case of multiple mentioned approaches
    vd.loc[
        vd[method_col].str.contains(
            "|".join(["OLS", "ordinary.least.square", "logistic.regression"])
        ),
        INT_COL_NAME,
    ] = 2.0
    vd.loc[
        vd[method_col].str.contains("|".join(["DM", "discontinuity.matching"])),
        INT_COL_NAME,
    ] = 3.0
    vd.loc[
        vd[method_col].str.contains(
            "|".join(["DID", "difference.in.diff", "diff.in.diff", "triple.diff"])
        ),
        INT_COL_NAME,
    ] = 3.0
    vd.loc[
        vd[method_col].str.contains(
            "|".join(["PSM", "propensity.score.matching", "score.matching"])
        ),
        INT_COL_NAME,
    ] = 3.5
    vd.loc[
        vd[method_col].str.contains("|".join(["IV", "instrumental.variable"])),
        INT_COL_NAME,
    ] = 4.0
    vd.loc[
        vd[method_col].str.contains("|".join(["RD", "regression.discontinuity"])),
        INT_COL_NAME,
    ] = 4.5
    vd.loc[vd[method_col].str.contains("RCT"), INT_COL_NAME] = 5.0

    return vd


def add_to_findings(
    findings_df: DataFrame, studies_by_intervention: DataFrame
) -> DataFrame:
    valid_subset = (
-        prep_data.calculate_validities(studies_by_intervention)[
+        calculate(studies_by_intervention)[
            ["internal_validity", "external_validity", "citation"]
        ]
        .fillna(1.0)
        .drop_duplicates(subset=["citation"])  # type: ignore
        .sort_values("internal_validity")
    )
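For reference, a minimal usage sketch of calculate() as shown above, called from inside the same module; the toy frame and the expected scores are my reading of the scoring rules, not part of the commit:

    from pandas import DataFrame

    toy = DataFrame(
        {
            "design": ["experimental", "quasi-experimental", "observational"],
            "representativeness": ["national survey", "local sample", "census"],
            "method": ["RCT", "PSM", "OLS"],
        }
    )

    scored = calculate(toy)
    # Row 0: external_validity 4.0 ("national"), internal_validity 5.0 ("RCT")
    # Row 1: external_validity 2.0 ("local"),    internal_validity 3.5 ("PSM")
    # The observational row is dropped before scoring: its design is neither
    # experimental nor quasi-experimental.
    print(scored[["external_validity", "internal_validity"]])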