chore(script): Lowercase all df columns

In preparation for the processed sample, renamed all columns to
their lowercase versions.
This commit is contained in:
Marty Oehme 2023-12-07 20:40:54 +01:00
parent d88c733b6d
commit 76ff71765c
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A

View file

@ -477,13 +477,13 @@ for e in sample_relevant:
]) ])
# FIXME do not just drop missing values # FIXME do not just drop missing values
bib_df = pd.DataFrame(reformatted, columns = ["Year", "Author", "Title", "Type", "Cited", "Usage", "Keywords"]) bib_df = pd.DataFrame(reformatted, columns = ["year", "author", "title", "type", "cited", "usage", "keywords"])
bib_df = bib_df.dropna(how="any") bib_df = bib_df.dropna(how="any")
bib_df["Date"] = pd.to_datetime(bib_df["Year"], format="mixed") bib_df["date"] = pd.to_datetime(bib_df["year"], format="mixed")
bib_df["Year"] = bib_df["Date"].dt.year bib_df["year"] = bib_df["date"].dt.year
# only keep newer entries # only keep newer entries
bib_df = bib_df[bib_df["Year"] >= 2000] bib_df = bib_df[bib_df["year"] >= 2000]
# Add WB country grouping definitions (income group, world region) # Add WB country grouping definitions (income group, world region)
# TODO Re-enable for processed study pool # TODO Re-enable for processed study pool
@ -498,12 +498,12 @@ bib_df = bib_df[bib_df["Year"] >= 2000]
#| fig-cap: Publications per year #| fig-cap: Publications per year
# create dummy category for white or gray lit type (based on 'article' appearing in type) # create dummy category for white or gray lit type (based on 'article' appearing in type)
bib_df["Type"].value_counts() bib_df["type"].value_counts()
bib_df["Literature"] = np.where(bib_df["Type"].str.contains("article", case=False, regex=False), "white", "gray") bib_df["literature"] = np.where(bib_df["type"].str.contains("article", case=False, regex=False), "white", "gray")
bib_df["Literature"] = bib_df["Literature"].astype("category") bib_df["literature"] = bib_df["literature"].astype("category")
# plot by year, distinguished by literature type # plot by year, distinguished by literature type
ax = sns.countplot(bib_df, x="Year", hue="Literature") ax = sns.countplot(bib_df, x="year", hue="literature")
ax.tick_params(axis='x', rotation=45) ax.tick_params(axis='x', rotation=45)
# ax.set_xlabel("") # ax.set_xlabel("")
plt.tight_layout() plt.tight_layout()
@ -525,9 +525,9 @@ First, in general, citation counts are slightly decreasing - as should generally
```{python} ```{python}
#| label: fig-citations-per-year-avg #| label: fig-citations-per-year-avg
#| fig-cap: Average citations per year #| fig-cap: Average citations per year
bib_df["Cited"] = bib_df["Cited"].astype("int") bib_df["cited"] = bib_df["cited"].astype("int")
grpd = bib_df.groupby(["Year"], as_index=False)["Cited"].mean() grpd = bib_df.groupby(["year"], as_index=False)["cited"].mean()
ax = sns.barplot(grpd, x="Year", y="Cited") ax = sns.barplot(grpd, x="year", y="cited")
ax.tick_params(axis='x', rotation=45) ax.tick_params(axis='x', rotation=45)
plt.tight_layout() plt.tight_layout()
plt.show() plt.show()
@ -555,17 +555,17 @@ Should they point towards gaps (or over-optimization) of sepcific areas of inter
#| column: page #| column: page
interv_type_df = ( interv_type_df = (
bib_df["Keywords"] bib_df["keywords"]
.str.replace(r"\_", " ") .str.replace(r"\_", " ")
.str.extractall(r"type::([\w ]+)") .str.extractall(r"type::([\w ]+)")
.reset_index(drop=True) .reset_index(drop=True)
.rename(columns = {0:"Intervention type"}) .rename(columns = {0:"intervention type"})
) )
sort_order = interv_type_df["Intervention type"].value_counts(ascending=False).index sort_order = interv_type_df["intervention type"].value_counts(ascending=False).index
fig = plt.figure() fig = plt.figure()
fig.set_size_inches(12, 4) fig.set_size_inches(12, 4)
ax = sns.countplot(interv_type_df, x="Intervention type", order=sort_order) ax = sns.countplot(interv_type_df, x="intervention type", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor") rotation_mode="anchor")
plt.show() plt.show()
@ -579,17 +579,17 @@ plt.show()
#| column: page #| column: page
inequ_type_df = ( inequ_type_df = (
bib_df["Keywords"] bib_df["keywords"]
.str.replace(r"\_", " ") .str.replace(r"\_", " ")
.str.extractall(r"inequality::([\w ]+)") .str.extractall(r"inequality::([\w ]+)")
.reset_index(drop=True) .reset_index(drop=True)
.rename(columns = {0:"Inequality type"}) .rename(columns = {0:"inequality type"})
) )
sort_order = inequ_type_df["Inequality type"].value_counts(ascending=False).index sort_order = inequ_type_df["inequality type"].value_counts(ascending=False).index
fig = plt.figure() fig = plt.figure()
fig.set_size_inches(12, 4) fig.set_size_inches(12, 4)
ax = sns.countplot(inequ_type_df, x="Inequality type", order=sort_order) ax = sns.countplot(inequ_type_df, x="inequality type", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor") rotation_mode="anchor")
plt.show() plt.show()