From e032a5c19273f290f768236f8c807999fa77af80 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Fri, 16 Feb 2024 17:59:09 +0100
Subject: [PATCH] refactor(code): Extract internal validity ranking keywords

Extracted keywords into dict which can be extended with either keywords
or rankings as needed, providing more flexibility.
---
 src/model/validity.py | 82 ++++++++++++++++++++++++------------------
 1 file changed, 46 insertions(+), 36 deletions(-)

diff --git a/src/model/validity.py b/src/model/validity.py
index f3922d5..c8d4ef5 100644
--- a/src/model/validity.py
+++ b/src/model/validity.py
@@ -33,17 +33,53 @@ def _combined_validities(
     return r"\-"
 
 
+METHOD_RANKINGS = {
+    2.0: ["OLS", "ordinary.least.square", "logistic.regression"],
+    3.0: [
+        "DM",
+        "discontinuity.matching",
+        "DID",
+        "difference.in.diff",
+        "diff.in.diff",
+        "triple.diff",
+    ],
+    3.5: ["PSM", "propensity.score.matching", "score.matching"],
+    4.0: ["IV", "instrumental.variable"],
+    4.5: ["RD", "regression.discontinuity"],
+    5.0: ["RCT", "randomi(?:s|z)ed.control.trial"],
+}
+
+
+# TODO do not filter by quasi-/experimental, but analyse the whole df passed in
+# This allows filtering to happen where it's needed but otherwise validity
+# given for all studies passed in.
 def calculate(
-    df: DataFrame, repr_col: str = "representativeness", method_col: str = "method"
+    df: DataFrame,
+    repr_col: str = "representativeness",
+    design_col: str = "design",
+    method_col: str = "method",
 ) -> DataFrame:
+    """Add internal and external validities to a dataframe.
+
+    Requires a dataframe containing a study or observation per row, with a
+    single column describing the study design, method and representativeness
+    each respectively.
+
+    Takes a combination of study design (simulation/observational/
+    quasi-experimental/experimental/..) and its method (OLS/DID/RD/...) to
+    calculate an internal validity.
+
+    Takes a study representativeness (local/subnational/national/regional/
+    census) to calculate the external validity.
+    """
     EXT_COL_NAME: str = "external_validity"
     INT_COL_NAME: str = "internal_validity"
     cols = {EXT_COL_NAME: 0.0, INT_COL_NAME: 0.0}
 
-    vd = df[
-        (df["design"] == "quasi-experimental") | (df["design"] == "experimental")
-    ].copy()
-    vd.assign(**cols)
+    # vd = df[
+    #     (df[design_col] == "quasi-experimental") | (df[design_col] == "experimental")
+    # ].copy()
+    vd = df.assign(**cols)
     vd = cast(DataFrame, vd)
 
     vd[repr_col] = vd[repr_col].fillna("")
@@ -57,37 +93,11 @@ def calculate(
     vd.loc[vd[repr_col].str.contains("local"), EXT_COL_NAME] = 2.0
 
     # needs to go lowest to highest in case of multiple mentioned approaches
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["OLS", "ordinary.least.square", "logistic.regression"])
-        ),
-        INT_COL_NAME,
-    ] = 2.0
-    vd.loc[
-        vd[method_col].str.contains("|".join(["DM", "discontinuity.matching"])),
-        INT_COL_NAME,
-    ] = 3.0
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["DID", "difference.in.diff", "diff.in.diff", "triple.diff"])
-        ),
-        INT_COL_NAME,
-    ] = 3.0
-    vd.loc[
-        vd[method_col].str.contains(
-            "|".join(["PSM", "propensity.score.matching", "score.matching"])
-        ),
-        INT_COL_NAME,
-    ] = 3.5
-    vd.loc[
-        vd[method_col].str.contains("|".join(["IV", "instrumental.variable"])),
-        INT_COL_NAME,
-    ] = 4.0
-    vd.loc[
-        vd[method_col].str.contains("|".join(["RD", "regression.discontinuity"])),
-        INT_COL_NAME,
-    ] = 4.5
-    vd.loc[vd[method_col].str.contains("RCT"), INT_COL_NAME] = 5.0
+    for rank, methods in METHOD_RANKINGS.items():
+        vd.loc[
+            vd[method_col].str.contains("|".join(methods)),
+            INT_COL_NAME,
+        ] = rank
 
     return vd
 