refactor(code): Extract internal validity ranking keywords

Extracted the internal validity keywords into a METHOD_RANKINGS dict which
can be extended with additional keywords or new rankings as needed,
providing more flexibility.
Marty Oehme 2024-02-16 17:59:09 +01:00
parent 4e865ee2b5
commit e032a5c192
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A


@@ -33,17 +33,53 @@ def _combined_validities(
     return r"\-"
 
+METHOD_RANKINGS = {
+    2.0: ["OLS", "ordinary.least.square", "logistic.regression"],
+    3.0: [
+        "DM",
+        "discontinuity.matching",
+        "DID",
+        "difference.in.diff",
+        "diff.in.diff",
+        "triple.diff",
+    ],
+    3.5: ["PSM", "propensity.score.matching", "score.matching"],
+    4.0: ["IV", "instrumental.variable"],
+    4.5: ["RD", "regression.discontinuity"],
+    5.0: ["RCT", "randomi(?:s|z)ed.control.trial"],
+}
+
+
 # TODO do not filter by quasi-/experimental, but analyse the whole df passed in
 # This allows filtering to happen where it's needed but otherwise validity
 # given for all studies passed in.
 def calculate(
-    df: DataFrame, repr_col: str = "representativeness", method_col: str = "method"
+    df: DataFrame,
+    repr_col: str = "representativeness",
+    design_col: str = "design",
+    method_col: str = "method",
 ) -> DataFrame:
+    """Add internal and external validities to a dataframe.
+
+    Requires a dataframe containing a study or observation per row, with a
+    single column describing the study design, method and representativeness
+    each respectively.
+
+    Takes a combination of study design (simulation/observational/
+    quasi-experimental/experimental/..) and its method (OLS/DID/RD/...) to
+    calculate an internal validity.
+
+    Takes a study representativeness (local/subnational/national/regional/
+    census) to calculate the external validity.
+    """
     EXT_COL_NAME: str = "external_validity"
     INT_COL_NAME: str = "internal_validity"
     cols = {EXT_COL_NAME: 0.0, INT_COL_NAME: 0.0}
-    vd = df[
-        (df["design"] == "quasi-experimental") | (df["design"] == "experimental")
-    ].copy()
-    vd.assign(**cols)
+    # vd = df[
+    #     (df[design_col] == "quasi-experimental") | (df[design_col] == "experimental")
+    # ].copy()
+    vd = df.assign(**cols)
     vd = cast(DataFrame, vd)
     vd[repr_col] = vd[repr_col].fillna("")
@@ -57,37 +93,11 @@ def calculate(
     vd.loc[vd[repr_col].str.contains("local"), EXT_COL_NAME] = 2.0
 
     # needs to go lowest to highest in case of multiple mentioned approaches
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["OLS", "ordinary.least.square", "logistic.regression"])
-        ),
-        INT_COL_NAME,
-    ] = 2.0
-    vd.loc[
-        vd[method_col].str.contains("|".join(["DM", "discontinuity.matching"])),
-        INT_COL_NAME,
-    ] = 3.0
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["DID", "difference.in.diff", "diff.in.diff", "triple.diff"])
-        ),
-        INT_COL_NAME,
-    ] = 3.0
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["PSM", "propensity.score.matching", "score.matching"])
-        ),
-        INT_COL_NAME,
-    ] = 3.5
-    vd.loc[
-        vd[method_col].str.contains("|".join(["IV", "instrumental.variable"])),
-        INT_COL_NAME,
-    ] = 4.0
-    vd.loc[
-        vd[method_col].str.contains("|".join(["RD", "regression.discontinuity"])),
-        INT_COL_NAME,
-    ] = 4.5
-    vd.loc[vd[method_col].str.contains("RCT"), INT_COL_NAME] = 5.0
+    for rank, methods in METHOD_RANKINGS.items():
+        vd.loc[
+            vd[method_col].str.contains("|".join(methods)),
+            INT_COL_NAME,
+        ] = rank
 
     return vd