chore(script): Lowercase all df columns
In preparation for the processed sample, rename all columns to their lowercase versions.
parent d88c733b6d
commit 76ff71765c
1 changed file with 19 additions and 19 deletions
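For context, a minimal sketch of the programmatic alternative to the per-literal renames in the diff below: lowercasing every column label in one step. The toy DataFrame here is hypothetical, not the script's actual data.

```python
import pandas as pd

# Hypothetical toy frame standing in for bib_df; the commit instead
# rewrites each column-name string literal by hand.
df = pd.DataFrame({"Year": [2001], "Author": ["Doe"], "Cited": [3]})

# Apply str.lower to every column label in one go.
df = df.rename(columns=str.lower)
# equivalently: df.columns = df.columns.str.lower()

print(df.columns.tolist())  # ['year', 'author', 'cited']
```

Renaming the literals by hand, as the commit does, keeps each column name greppable at its point of use; the one-liner above would be the alternative if the renames ever need to happen dynamically.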
@@ -477,13 +477,13 @@ for e in sample_relevant:
 ])
 
 # FIXME do not just drop missing values
-bib_df = pd.DataFrame(reformatted, columns = ["Year", "Author", "Title", "Type", "Cited", "Usage", "Keywords"])
+bib_df = pd.DataFrame(reformatted, columns = ["year", "author", "title", "type", "cited", "usage", "keywords"])
 bib_df = bib_df.dropna(how="any")
-bib_df["Date"] = pd.to_datetime(bib_df["Year"], format="mixed")
-bib_df["Year"] = bib_df["Date"].dt.year
+bib_df["date"] = pd.to_datetime(bib_df["year"], format="mixed")
+bib_df["year"] = bib_df["date"].dt.year
 
 # only keep newer entries
-bib_df = bib_df[bib_df["Year"] >= 2000]
+bib_df = bib_df[bib_df["year"] >= 2000]
 
 # Add WB country grouping definitions (income group, world region)
 # TODO Re-enable for processed study pool
@@ -498,12 +498,12 @@ bib_df = bib_df[bib_df["Year"] >= 2000]
 #| fig-cap: Publications per year
 
 # create dummy category for white or gray lit type (based on 'article' appearing in type)
-bib_df["Type"].value_counts()
-bib_df["Literature"] = np.where(bib_df["Type"].str.contains("article", case=False, regex=False), "white", "gray")
-bib_df["Literature"] = bib_df["Literature"].astype("category")
+bib_df["type"].value_counts()
+bib_df["literature"] = np.where(bib_df["type"].str.contains("article", case=False, regex=False), "white", "gray")
+bib_df["literature"] = bib_df["literature"].astype("category")
 
 # plot by year, distinguished by literature type
-ax = sns.countplot(bib_df, x="Year", hue="Literature")
+ax = sns.countplot(bib_df, x="year", hue="literature")
 ax.tick_params(axis='x', rotation=45)
 # ax.set_xlabel("")
 plt.tight_layout()
@@ -525,9 +525,9 @@ First, in general, citation counts are slightly decreasing - as should generally
 ```{python}
 #| label: fig-citations-per-year-avg
 #| fig-cap: Average citations per year
-bib_df["Cited"] = bib_df["Cited"].astype("int")
-grpd = bib_df.groupby(["Year"], as_index=False)["Cited"].mean()
-ax = sns.barplot(grpd, x="Year", y="Cited")
+bib_df["cited"] = bib_df["cited"].astype("int")
+grpd = bib_df.groupby(["year"], as_index=False)["cited"].mean()
+ax = sns.barplot(grpd, x="year", y="cited")
 ax.tick_params(axis='x', rotation=45)
 plt.tight_layout()
 plt.show()
@@ -555,17 +555,17 @@ Should they point towards gaps (or over-optimization) of sepcific areas of inter
 #| column: page
 
 interv_type_df = (
-    bib_df["Keywords"]
+    bib_df["keywords"]
     .str.replace(r"\_", " ")
     .str.extractall(r"type::([\w ]+)")
     .reset_index(drop=True)
-    .rename(columns = {0:"Intervention type"})
+    .rename(columns = {0:"intervention type"})
 )
 
-sort_order = interv_type_df["Intervention type"].value_counts(ascending=False).index
+sort_order = interv_type_df["intervention type"].value_counts(ascending=False).index
 fig = plt.figure()
 fig.set_size_inches(12, 4)
-ax = sns.countplot(interv_type_df, x="Intervention type", order=sort_order)
+ax = sns.countplot(interv_type_df, x="intervention type", order=sort_order)
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
          rotation_mode="anchor")
 plt.show()
@@ -579,17 +579,17 @@ plt.show()
 #| column: page
 
 inequ_type_df = (
-    bib_df["Keywords"]
+    bib_df["keywords"]
     .str.replace(r"\_", " ")
     .str.extractall(r"inequality::([\w ]+)")
     .reset_index(drop=True)
-    .rename(columns = {0:"Inequality type"})
+    .rename(columns = {0:"inequality type"})
 )
 
-sort_order = inequ_type_df["Inequality type"].value_counts(ascending=False).index
+sort_order = inequ_type_df["inequality type"].value_counts(ascending=False).index
 fig = plt.figure()
 fig.set_size_inches(12, 4)
-ax = sns.countplot(inequ_type_df, x="Inequality type", order=sort_order)
+ax = sns.countplot(inequ_type_df, x="inequality type", order=sort_order)
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
          rotation_mode="anchor")
 plt.show()