From ad71859ded7a805e99a5315b716674b3e19395ce Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Sun, 14 Jul 2024 21:51:18 +0200
Subject: [PATCH] feat(script): Move all data transformations to single chunk

---
 01-codechunks/_prep-data.py | 38 +++++++++++++--
 article.qmd                 | 97 +++++++++++++++++++------------------
 2 files changed, 84 insertions(+), 51 deletions(-)

diff --git a/01-codechunks/_prep-data.py b/01-codechunks/_prep-data.py
index 8b3f088..416ffde 100644
--- a/01-codechunks/_prep-data.py
+++ b/01-codechunks/_prep-data.py
@@ -27,11 +27,41 @@ bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
 # load relevant studies
 from src import load_data
 
-bib_df = prep_data.observations_with_metadata_df(
+# each observation in a single dataframe
+df = prep_data.observations_with_metadata_df(
     raw_observations = load_data.from_yml(PROCESSED_DATA),
     study_metadata = prep_data.bib_metadata_df(bib_sample),
     country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
 )
-raw_observations = None
-zot_df = None
-df_country_groups = None
+
+# all observations but split per individual intervention
+df_by_intervention = (
+    df
+    .fillna("")
+    .groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
+        ),
+    )
+    .explode("intervention")
+)
+
+# Calc study validities (internal & external separated)
+from src.model import validity
+
+validities = validity.calculate(df_by_intervention)
+validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
+validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
+#validities["external_validity"] = validities["external_validity"].astype('category')
+validities["internal_validity"] = validities["internal_validity"].astype('category')
+validities["External Validity"] = validities["external_validity"]
+validities["Internal Validity"] = validities["internal_validity"]
+
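A minimal sketch of the collapse-and-explode pattern that the `df_by_intervention` block above applies, run on two invented toy rows; the authors, titles and intervention labels below are placeholders, not entries from the review sample:

```python
# Illustrative only: toy rows standing in for coded observations.
import re

import pandas as pd

toy = pd.DataFrame(
    {
        "author": ["Doe, J.", "Doe, J."],
        "year": [2020, 2020],
        "title": ["A toy study", "A toy study"],
        "intervention": ["minimum wage (national)", "training"],
    }
)

# Step 1: collapse all observations of one study into a single row,
# joining its interventions into one "; "-separated string.
collapsed = (
    toy.groupby(["author", "year", "title"])
    .agg({"intervention": lambda _col: "; ".join(_col)})
    .reset_index()
)

# Step 2: strip parenthetical qualifiers, split the joined string back apart
# and explode, giving one row per study-intervention pair.
per_intervention = (
    collapsed.assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: {x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")}
        )
    )
    .explode("intervention")
)

print(sorted(per_intervention["intervention"]))  # ['minimum wage', 'training']
```

The same idea scales to the full observation table: first one grouped row per study, then one exploded row per study-intervention pair.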
diff --git a/article.qmd b/article.qmd
index d70885f..5035f70 100644
--- a/article.qmd
+++ b/article.qmd
@@ -40,7 +40,7 @@ crossref: # to fix the appendix crossrefs being separate from main
 
 ```{python}
-#| label: load-data
+#| label: prep-data
 #| echo: false
 #| output: false
 {{< include 01-codechunks/_prep-data.py >}}
@@ -159,32 +159,15 @@ The entire spread of policies captures interventions aimed primarily at institut
 #| label: fig-intervention-types
 #| fig-cap: Available studies by primary type of intervention
 
-by_intervention = (
-    bib_df
-    .fillna("")
-    .groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
-    .agg(
-        {
-            "intervention": lambda _col: "; ".join(_col),
-        }
-    )
-    .reset_index()
-    .drop_duplicates()
-    .assign(
-        intervention=lambda _df: _df["intervention"].apply(
-            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
-        ),
-    )
-    .explode("intervention")
-)
-sort_order = by_intervention["intervention"].value_counts().index
+sort_order = df_by_intervention["intervention"].value_counts().index
 
 fig = plt.figure()
 fig.set_size_inches(6, 3)
-ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
+ax = sns.countplot(df_by_intervention, x="intervention", order=sort_order)
 
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
 plt.show()
+del sort_order, fig, ax
 ```
 
 # Synthesis of evidence
@@ -218,13 +201,42 @@ study_strength_bins = {
     5.0: r"\+",
     10.0: r"\++",
 }
+
+
 def strength_for(val):
-    return list(study_strength_bins.keys())[list(study_strength_bins.values()).index(val)]
+    return list(study_strength_bins.keys())[
+        list(study_strength_bins.values()).index(val)
+    ]
+
 
 findings_institutional = pd.read_csv("02-data/supplementary/findings-institutional.csv")
-fd_df = validity.add_to_findings(findings_institutional, by_intervention, study_strength_bins)
 
-md(tabulate(fd_df[["area of policy", "internal_validity", "external_validity", "findings", "channels"]].fillna(""), showindex=False, headers=["area of policy", "internal strength", "external strength", "main findings", "channels"], tablefmt="grid"))
+outp = md(
+    tabulate(
+        validity.add_to_findings(
+            findings_institutional, df_by_intervention, study_strength_bins
+        )[
+            [
+                "area of policy",
+                "internal_validity",
+                "external_validity",
+                "findings",
+                "channels",
+            ]
+        ].fillna(""),
+        showindex=False,
+        headers=[
+            "area of policy",
+            "internal strength",
+            "external strength",
+            "main findings",
+            "channels",
+        ],
+        tablefmt="grid",
+    )
+)
+del findings_institutional
+outp
 ```
 
 Note: Each main finding is presented with an internal strength of evidence and an external strength of evidence which describe the combined validities of the evidence base for the respective finding.
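A quick usage note on the helper above: `strength_for` maps a rendered strength symbol back to its numeric rank, not the other way around. A self-contained sketch using only the two bins visible in this hunk (the article's full mapping may define more entries):

```python
# Sketch of the lookup direction: symbol in, numeric rank out.
# Only the two bins shown in the diff hunk above are reproduced here.
study_strength_bins = {
    5.0: r"\+",
    10.0: r"\++",
}


def strength_for(val):
    return list(study_strength_bins.keys())[
        list(study_strength_bins.values()).index(val)
    ]


assert strength_for(r"\+") == 5.0
assert strength_for(r"\++") == 10.0
```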
@@ -462,7 +474,7 @@ the overall output volume strongly increased during this period.
 #| fig-cap: Publications per year
 
 df_study_years = (
-    bib_df.groupby(["author", "year", "title"])
+    df.groupby(["author", "year", "title"])
     .first()
     .reset_index()
     .drop_duplicates()
@@ -476,12 +488,11 @@ ax = sns.barplot(df_study_years, order=years_range)
 ax.set_ylabel("Count")
 ax.set_xlabel("Year")
 plt.tight_layout()
-years_list = np.arange(2000, 2024).tolist()
 ax.tick_params(axis='x', rotation=90)
 ax.set_ylabel("Citations")
 ax.set_xlabel("Year")
 plt.show()
-df_study_years = None
+del df_study_years
 ```
 
 Such anomalies can point to a dispersed or different focus during the time span,
@@ -497,20 +508,21 @@ It also points to a well targeted identification procedure, with more up-to-date
 ```{python}
 #| label: fig-citations-per-year-avg
 #| fig-cap: Average citations per year
-bib_df["zot_cited"] = bib_df["zot_cited"].dropna().astype("int")
-grpd = bib_df.groupby(["year"], as_index=False)["zot_cited"].mean()
+df["zot_cited"] = df["zot_cited"].dropna().astype("int")
+df_avg_citations = df.groupby(["year"], as_index=False)["zot_cited"].mean()
 fig, ax = plt.subplots()
-ax.bar(grpd["year"], grpd["zot_cited"])
-sns.regplot(x=grpd["year"], y=grpd["zot_cited"], ax=ax)
-#ax = sns.lmplot(data=grpd, x="year", y="zot_cited", fit_reg=True)
+ax.bar(df_avg_citations["year"], df_avg_citations["zot_cited"])
+sns.regplot(x=df_avg_citations["year"], y=df_avg_citations["zot_cited"], ax=ax)
+#ax = sns.lmplot(data=df_avg_citations, x="year", y="zot_cited", fit_reg=True)
 
-years_list = np.arange(2000, 2024).tolist()
-ax.set_xticks(years_list)
-ax.tick_params(axis='x', rotation=90)
 ax.set_ylabel("Citations")
 ax.set_xlabel("Year")
 plt.tight_layout()
+years_range = list(range(df_avg_citations["year"].min(), df_avg_citations["year"].max()+1))
+ax.set_xticks(years_range)
+ax.tick_params(axis='x', rotation=90)
 plt.show()
+del df_avg_citations
 ```
 
 From the literature sample, several patterns emerge:
@@ -549,16 +561,6 @@ while studies with higher internal validity in turn do not reach as high on the
 #| label: fig-validity-relation
 #| fig-cap: "Relation between internal and external validity"
 
-from src.model import validity
-
-validities = validity.calculate(by_intervention)
-validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
-validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
-#validities["external_validity"] = validities["external_validity"].astype('category')
-validities["internal_validity"] = validities["internal_validity"].astype('category')
-validities["External Validity"] = validities["external_validity"]
-validities["Internal Validity"] = validities["internal_validity"]
-
 plt.figure().set_figheight(5)
 sns.violinplot(
     data=validities,
@@ -621,7 +623,7 @@ in which fewer studies have been identified.
 #| fig-cap: Studies by regions analysed
 
 by_region = (
-    bib_df[["region"]]
+    df[["region"]]
     .assign(
         region = lambda _df: (_df["region"]
             .str.replace(r" ?; ?", ";", regex=True)
@@ -636,6 +638,7 @@ ax = sns.countplot(by_region, x="region", order=by_region["region"].value_counts
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
 plt.show()
+del by_region
 
 def regions_for_inequality(df, inequality:str):
     df_temp = df.loc[(df["inequality"] == inequality)]
@@ -671,7 +674,7 @@ Another reason could be the actual implementation of different policy programmes
 # Appendices {.appendix .unnumbered}
 
-## Appendix A {.unnumbered}
+## Appendix A - Term clusters {.unnumbered}
 
 ::: {#appatbl-wow-terms}