feat(script): Move all data transformations to single chunk

Marty Oehme 2024-07-14 21:51:18 +02:00
parent ed6c8550b6
commit ad71859ded
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 84 additions and 51 deletions
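The change consolidates every shared transformation into the included `_prep-data.py` chunk, so the document's figure and table chunks only consume the prepared frames (`df`, `df_by_intervention`, `validities`) instead of rebuilding them. A minimal sketch of a consuming chunk under that assumption; the plotting call mirrors the fig-intervention-types hunk below, and nothing in this sketch is part of the commit itself:

```python
# Sketch only: a downstream chunk after this change. `df_by_intervention` is
# assumed to already exist, created once by the included prep-data chunk.
import matplotlib.pyplot as plt
import seaborn as sns

ax = sns.countplot(
    df_by_intervention,  # prepared in 01-codechunks/_prep-data.py
    x="intervention",
    order=df_by_intervention["intervention"].value_counts().index,
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```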

View file

@@ -27,11 +27,41 @@ bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
 # load relevant studies
 from src import load_data
 
-bib_df = prep_data.observations_with_metadata_df(
+# each observation in a single dataframe
+df = prep_data.observations_with_metadata_df(
     raw_observations = load_data.from_yml(PROCESSED_DATA),
     study_metadata = prep_data.bib_metadata_df(bib_sample),
     country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
 )
-raw_observations = None
-zot_df = None
-df_country_groups = None
+
+# all observations but split per individual intervention
+df_by_intervention = (
+    df
+    .fillna("")
+    .groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
+        ),
+    )
+    .explode("intervention")
+)
+
+# Calc study validities (internal & external separated)
+from src.model import validity
+validities = validity.calculate(df_by_intervention)
+validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
+validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
+#validities["external_validity"] = validities["external_validity"].astype('category')
+validities["internal_validity"] = validities["internal_validity"].astype('category')
+validities["External Validity"] = validities["external_validity"]
+validities["Internal Validity"] = validities["internal_validity"]

View file

@@ -40,7 +40,7 @@ crossref: # to fix the appendix crossrefs being separate from main
 ```{python}
-#| label: load-data
+#| label: prep-data
 #| echo: false
 #| output: false
 {{< include 01-codechunks/_prep-data.py >}}
@@ -159,32 +159,15 @@ The entire spread of policies captures interventions aimed primarily at institut
 #| label: fig-intervention-types
 #| fig-cap: Available studies by primary type of intervention
-by_intervention = (
-    bib_df
-    .fillna("")
-    .groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
-    .agg(
-        {
-            "intervention": lambda _col: "; ".join(_col),
-        }
-    )
-    .reset_index()
-    .drop_duplicates()
-    .assign(
-        intervention=lambda _df: _df["intervention"].apply(
-            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
-        ),
-    )
-    .explode("intervention")
-)
-sort_order = by_intervention["intervention"].value_counts().index
+sort_order = df_by_intervention["intervention"].value_counts().index
 
 fig = plt.figure()
 fig.set_size_inches(6, 3)
-ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
+ax = sns.countplot(df_by_intervention, x="intervention", order=df_by_intervention["intervention"].value_counts().index)
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
          rotation_mode="anchor")
 plt.show()
+del sort_order, fig, ax
 ```
 
 # Synthesis of evidence
@@ -218,13 +218,42 @@ study_strength_bins = {
     5.0: r"\+",
     10.0: r"\++",
 }
 def strength_for(val):
-    return list(study_strength_bins.keys())[list(study_strength_bins.values()).index(val)]
+    return list(study_strength_bins.keys())[
+        list(study_strength_bins.values()).index(val)
+    ]
 
 findings_institutional = pd.read_csv("02-data/supplementary/findings-institutional.csv")
-fd_df = validity.add_to_findings(findings_institutional, by_intervention, study_strength_bins)
-md(tabulate(fd_df[["area of policy", "internal_validity", "external_validity", "findings", "channels"]].fillna(""), showindex=False, headers=["area of policy", "internal strength", "external strength", "main findings", "channels"], tablefmt="grid"))
+outp = md(
+    tabulate(
+        validity.add_to_findings(
+            findings_institutional, df_by_intervention, study_strength_bins
+        )[
+            [
+                "area of policy",
+                "internal_validity",
+                "external_validity",
+                "findings",
+                "channels",
+            ]
+        ].fillna(""),
+        showindex=False,
+        headers=[
+            "area of policy",
+            "internal strength",
+            "external strength",
+            "main findings",
+            "channels",
+        ],
+        tablefmt="grid",
+    )
+)
+del findings_institutional
+outp
 ```
 
 Note: Each main finding is presented with an internal strength of evidence and an external strength of evidence which describe the combined validities of the evidence base for the respective finding.
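As context for the reflowed helper above: `strength_for` is a reverse lookup from the rendered symbol back to its numeric bin. A minimal, self-contained sketch using only the two bins visible in this hunk (the full mapping in the document may contain more entries):

```python
# Sketch only: symbol -> bin reverse lookup, mirroring strength_for() above.
study_strength_bins = {
    5.0: r"\+",
    10.0: r"\++",
}

def strength_for(val):
    # position of the symbol among the dict values gives the key at the same index
    return list(study_strength_bins.keys())[
        list(study_strength_bins.values()).index(val)
    ]

assert strength_for(r"\+") == 5.0
assert strength_for(r"\++") == 10.0
```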
@@ -462,7 +474,7 @@ the overall output volume strongly increased during this period.
 #| fig-cap: Publications per year
 df_study_years = (
-    bib_df.groupby(["author", "year", "title"])
+    df.groupby(["author", "year", "title"])
     .first()
     .reset_index()
     .drop_duplicates()
@@ -476,12 +488,11 @@ ax = sns.barplot(df_study_years, order=years_range)
 ax.set_ylabel("Count")
 ax.set_xlabel("Year")
 plt.tight_layout()
-years_list = np.arange(2000, 2024).tolist()
 ax.tick_params(axis='x', rotation=90)
 ax.set_ylabel("Citations")
 ax.set_xlabel("Year")
 plt.show()
-df_study_years = None
+del df_study_years
 ```
 
 Such anomalies can point to a dispersed or different focus during the time span,
@@ -497,20 +508,21 @@ It also points to a well targeted identification procedure, with more up-to-date
 ```{python}
 #| label: fig-citations-per-year-avg
 #| fig-cap: Average citations per year
-bib_df["zot_cited"] = bib_df["zot_cited"].dropna().astype("int")
-grpd = bib_df.groupby(["year"], as_index=False)["zot_cited"].mean()
+df["zot_cited"] = df["zot_cited"].dropna().astype("int")
+df_avg_citations = df.groupby(["year"], as_index=False)["zot_cited"].mean()
 fig, ax = plt.subplots()
-ax.bar(grpd["year"], grpd["zot_cited"])
-sns.regplot(x=grpd["year"], y=grpd["zot_cited"], ax=ax)
-#ax = sns.lmplot(data=grpd, x="year", y="zot_cited", fit_reg=True)
-years_list = np.arange(2000, 2024).tolist()
-ax.set_xticks(years_list)
-ax.tick_params(axis='x', rotation=90)
+ax.bar(df_avg_citations["year"], df_avg_citations["zot_cited"])
+sns.regplot(x=df_avg_citations["year"], y=df_avg_citations["zot_cited"], ax=ax)
+#ax = sns.lmplot(data=df_avg_citations, x="year", y="zot_cited", fit_reg=True)
 ax.set_ylabel("Citations")
 ax.set_xlabel("Year")
 plt.tight_layout()
+years_range = list(range(df_avg_citations["year"].min(), df_avg_citations["year"].max()+1))
+ax.set_xticks(years_range)
+ax.tick_params(axis='x', rotation=90)
 plt.show()
+del df_avg_citations
 ```
 
 From the literature sample, several patterns emerge:
@@ -549,16 +561,6 @@ while studies with higher internal validity in turn do not reach as high on the
 #| label: fig-validity-relation
 #| fig-cap: "Relation between internal and external validity"
 
-from src.model import validity
-validities = validity.calculate(by_intervention)
-validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
-validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
-#validities["external_validity"] = validities["external_validity"].astype('category')
-validities["internal_validity"] = validities["internal_validity"].astype('category')
-validities["External Validity"] = validities["external_validity"]
-validities["Internal Validity"] = validities["internal_validity"]
-
 plt.figure().set_figheight(5)
 sns.violinplot(
     data=validities,
@@ -621,7 +623,7 @@ in which fewer studies have been identified.
 #| fig-cap: Studies by regions analysed
 by_region = (
-    bib_df[["region"]]
+    df[["region"]]
     .assign(
         region = lambda _df: (_df["region"]
             .str.replace(r" ?; ?", ";", regex=True)
@@ -636,6 +638,7 @@ ax = sns.countplot(by_region, x="region", order=by_region["region"].value_counts
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
          rotation_mode="anchor")
 plt.show()
+del by_region
 
 def regions_for_inequality(df, inequality:str):
     df_temp = df.loc[(df["inequality"] == inequality)]
@@ -671,7 +674,7 @@ Another reason could be the actual implementation of different policy programmes
 # Appendices {.appendix .unnumbered}
 
-## Appendix A {.unnumbered}
+## Appendix A - Term clusters {.unnumbered}
 
 ::: {#appatbl-wow-terms}