feat(script): Move all data transformations to single chunk
parent ed6c8550b6
commit ad71859ded

2 changed files with 84 additions and 51 deletions
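The heart of the consolidated chunk is the split of semicolon-joined intervention labels into one row per intervention (the `groupby`/`agg`/`explode` pipeline in the first hunk below). A minimal standalone sketch of that step, using a hypothetical `toy` frame in place of the real observations:

```python
import re

import pandas as pd

# Hypothetical stand-in for the real observations dataframe
toy = pd.DataFrame({
    "author": ["Doe, J.", "Roe, R."],
    "intervention": ["minimum wage (national); training", "minimum wage"],
})

split = (
    toy
    .assign(
        # drop parenthesised notes, split on ";", de-duplicate per study
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set(x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";"))
        )
    )
    .explode("intervention")  # one row per individual intervention
)
print(split)
```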
@@ -27,11 +27,41 @@ bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
 # load relevant studies
 from src import load_data
 
-bib_df = prep_data.observations_with_metadata_df(
+# each observation in a single dataframe
+df = prep_data.observations_with_metadata_df(
     raw_observations = load_data.from_yml(PROCESSED_DATA),
     study_metadata = prep_data.bib_metadata_df(bib_sample),
     country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
 )
-raw_observations = None
-zot_df = None
-df_country_groups = None
+
+# all observations but split per individual intervention
+df_by_intervention = (
+    df
+    .fillna("")
+    .groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
+        ),
+    )
+    .explode("intervention")
+)
+
+# Calc study validities (internal & external separated)
+from src.model import validity
+
+validities = validity.calculate(df_by_intervention)
+validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
+validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
+#validities["external_validity"] = validities["external_validity"].astype('category')
+validities["internal_validity"] = validities["internal_validity"].astype('category')
+validities["External Validity"] = validities["external_validity"]
+validities["Internal Validity"] = validities["internal_validity"]
+
article.qmd (97 lines changed)
@@ -40,7 +40,7 @@ crossref: # to fix the appendix crossrefs being separate from main
 
 
 ```{python}
-#| label: load-data
+#| label: prep-data
 #| echo: false
 #| output: false
 {{< include 01-codechunks/_prep-data.py >}}
@@ -159,32 +159,15 @@ The entire spread of policies captures interventions aimed primarily at institut
 #| label: fig-intervention-types
 #| fig-cap: Available studies by primary type of intervention
 
-by_intervention = (
-    bib_df
-    .fillna("")
-    .groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
-    .agg(
-        {
-            "intervention": lambda _col: "; ".join(_col),
-        }
-    )
-    .reset_index()
-    .drop_duplicates()
-    .assign(
-        intervention=lambda _df: _df["intervention"].apply(
-            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
-        ),
-    )
-    .explode("intervention")
-)
-sort_order = by_intervention["intervention"].value_counts().index
+sort_order = df_by_intervention["intervention"].value_counts().index
 
 fig = plt.figure()
 fig.set_size_inches(6, 3)
-ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
+ax = sns.countplot(df_by_intervention, x="intervention", order=df_by_intervention["intervention"].value_counts().index)
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
          rotation_mode="anchor")
 plt.show()
+del sort_order, fig, ax
 ```
 
 # Synthesis of evidence
@@ -218,13 +201,42 @@ study_strength_bins = {
     5.0: r"\+",
     10.0: r"\++",
 }
 
 
 def strength_for(val):
-    return list(study_strength_bins.keys())[list(study_strength_bins.values()).index(val)]
+    return list(study_strength_bins.keys())[
+        list(study_strength_bins.values()).index(val)
+    ]
 
 
 findings_institutional = pd.read_csv("02-data/supplementary/findings-institutional.csv")
-fd_df = validity.add_to_findings(findings_institutional, by_intervention, study_strength_bins)
 
-md(tabulate(fd_df[["area of policy", "internal_validity", "external_validity", "findings", "channels"]].fillna(""), showindex=False, headers=["area of policy", "internal strength", "external strength", "main findings", "channels"], tablefmt="grid"))
+outp = md(
+    tabulate(
+        validity.add_to_findings(
+            findings_institutional, df_by_intervention, study_strength_bins
+        )[
+            [
+                "area of policy",
+                "internal_validity",
+                "external_validity",
+                "findings",
+                "channels",
+            ]
+        ].fillna(""),
+        showindex=False,
+        headers=[
+            "area of policy",
+            "internal strength",
+            "external strength",
+            "main findings",
+            "channels",
+        ],
+        tablefmt="grid",
+    )
+)
+del findings_institutional
+outp
 ```
 
 Note: Each main finding is presented with an internal strength of evidence and an external strength of evidence which describe the combined validities of the evidence base for the respective finding.
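For context, `strength_for` in the hunk above is a reverse lookup on `study_strength_bins`: it maps a rendered strength label back to its numeric bin. A minimal sketch in isolation, assuming only the two bins visible in the hunk:

```python
study_strength_bins = {
    5.0: r"\+",
    10.0: r"\++",
}

def strength_for(val):
    # find the position of the label among the dict's values,
    # then return the key at that same position
    return list(study_strength_bins.keys())[
        list(study_strength_bins.values()).index(val)
    ]

assert strength_for(r"\+") == 5.0
assert strength_for(r"\++") == 10.0
```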
@@ -462,7 +474,7 @@ the overall output volume strongly increased during this period.
 #| fig-cap: Publications per year
 
 df_study_years = (
-    bib_df.groupby(["author", "year", "title"])
+    df.groupby(["author", "year", "title"])
     .first()
     .reset_index()
     .drop_duplicates()
@@ -476,12 +488,11 @@ ax = sns.barplot(df_study_years, order=years_range)
 ax.set_ylabel("Count")
 ax.set_xlabel("Year")
 plt.tight_layout()
-years_list = np.arange(2000, 2024).tolist()
 ax.tick_params(axis='x', rotation=90)
 ax.set_ylabel("Citations")
 ax.set_xlabel("Year")
 plt.show()
-df_study_years = None
+del df_study_years
 ```
 
 Such anomalies can point to a dispersed or different focus during the time span,
@@ -497,20 +508,21 @@ It also points to a well targeted identification procedure, with more up-to-date
 ```{python}
 #| label: fig-citations-per-year-avg
 #| fig-cap: Average citations per year
-bib_df["zot_cited"] = bib_df["zot_cited"].dropna().astype("int")
-grpd = bib_df.groupby(["year"], as_index=False)["zot_cited"].mean()
+df["zot_cited"] = df["zot_cited"].dropna().astype("int")
+df_avg_citations = df.groupby(["year"], as_index=False)["zot_cited"].mean()
 fig, ax = plt.subplots()
-ax.bar(grpd["year"], grpd["zot_cited"])
-sns.regplot(x=grpd["year"], y=grpd["zot_cited"], ax=ax)
-#ax = sns.lmplot(data=grpd, x="year", y="zot_cited", fit_reg=True)
+ax.bar(df_avg_citations["year"], df_avg_citations["zot_cited"])
+sns.regplot(x=df_avg_citations["year"], y=df_avg_citations["zot_cited"], ax=ax)
+#ax = sns.lmplot(data=df_avg_citations, x="year", y="zot_cited", fit_reg=True)
 
-years_list = np.arange(2000, 2024).tolist()
-ax.set_xticks(years_list)
-ax.tick_params(axis='x', rotation=90)
 ax.set_ylabel("Citations")
 ax.set_xlabel("Year")
 plt.tight_layout()
+years_range = list(range(df_avg_citations["year"].min(), df_avg_citations["year"].max()+1))
+ax.set_xticks(years_range)
+ax.tick_params(axis='x', rotation=90)
 plt.show()
+del df_avg_citations
 ```
 
 From the literature sample, several patterns emerge:
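The rewritten chunk above derives the x-axis ticks from the data instead of the previously hard-coded `np.arange(2000, 2024)`. A small sketch with a hypothetical `toy` frame standing in for `df_avg_citations`:

```python
import pandas as pd

toy = pd.DataFrame({"year": [2001, 2003, 2019], "zot_cited": [4, 7, 2]})

# inclusive range from earliest to latest observed year
years_range = list(range(toy["year"].min(), toy["year"].max() + 1))
assert years_range[0] == 2001 and years_range[-1] == 2019
```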
@@ -549,16 +561,6 @@ while studies with higher internal validity in turn do not reach as high on the
 #| label: fig-validity-relation
 #| fig-cap: "Relation between internal and external validity"
 
-from src.model import validity
-
-validities = validity.calculate(by_intervention)
-validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
-validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
-#validities["external_validity"] = validities["external_validity"].astype('category')
-validities["internal_validity"] = validities["internal_validity"].astype('category')
-validities["External Validity"] = validities["external_validity"]
-validities["Internal Validity"] = validities["internal_validity"]
-
 plt.figure().set_figheight(5)
 sns.violinplot(
     data=validities,
@@ -621,7 +623,7 @@ in which fewer studies have been identified.
 #| fig-cap: Studies by regions analysed
 
 by_region = (
-    bib_df[["region"]]
+    df[["region"]]
     .assign(
         region = lambda _df: (_df["region"]
             .str.replace(r" ?; ?", ";", regex=True)
@@ -636,6 +638,7 @@ ax = sns.countplot(by_region, x="region", order=by_region["region"].value_counts
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
          rotation_mode="anchor")
 plt.show()
+del by_region
 
 def regions_for_inequality(df, inequality:str):
     df_temp = df.loc[(df["inequality"] == inequality)]
@@ -671,7 +674,7 @@ Another reason could be the actual implementation of different policy programmes
 
 # Appendices {.appendix .unnumbered}
 
-## Appendix A {.unnumbered}
+## Appendix A - Term clusters {.unnumbered}
 
 ::: {#appatbl-wow-terms}
 