feat(script): Use extracted data for manuscript
For the first time, the analysis runs on the final extracted data from the relevant studies instead of only the intermediate Zotero-provided metadata. We still inject that metadata where it is useful (citation counts, usage counts, and keywords), but otherwise switch to the new data.
parent e4e7d9e3e4
commit ed3d09b3f7
2 changed files with 21 additions and 16 deletions
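In essence, the change keys the Zotero metadata by DOI and maps selected columns onto the extracted-data frame. Below is a minimal, self-contained sketch of that pattern; the frame and column names follow the diff, but the sample rows are invented for illustration.

```python
import pandas as pd

# Extracted study data: the new primary source; its "uri" column holds a DOI link.
# (Sample values are invented for illustration.)
bib_df = pd.DataFrame({
    "uri": ["https://doi.org/10.1000/xyz123", "https://dx.doi.org/10.1000/abc456"],
    "year": ["2015", "2021"],
})

# Intermediate Zotero-provided metadata, keyed by DOI.
zot_df = pd.DataFrame({
    "doi": ["10.1000/xyz123", "10.1000/abc456"],
    "cited": [12, 3],
    "usage": [40, 7],
    "keywords": ["type::tax", "type::transfer"],
})

# Reduce the URI to a bare DOI, then inject the Zotero columns by DOI lookup.
bib_df["doi"] = bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)
lookup = zot_df.drop_duplicates("doi").set_index("doi")
for col in ["cited", "usage", "keywords"]:
    bib_df[f"zot_{col}"] = bib_df["doi"].map(lookup[col])

print(bib_df[["doi", "zot_cited", "zot_usage", "zot_keywords"]])
```

Mapping against an index-keyed Series leaves rows without a Zotero match as NaN, which is why the plotting code further down calls `.dropna()` before casting `zot_cited` to int.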
````diff
@@ -464,22 +464,27 @@ Keeping in mind that these results are not yet screened for their full relevance
 
 ```{python}
 # load relevant studies
+from src import data
+bib_df = data.from_yml(f"{PROCESSED_DATA}/relevant")
+
+# load zotero-based metadata
 reformatted = []
 for e in sample_relevant:
     ed = e.fields_dict
-    reformatted.append([ed.get("year", "0000").value,
-        ed.get("author").value,
-        ed.get("title").value,
-        ed.get("type", Field(key="type", value=None)).value,
+    reformatted.append([
+        ed.get("doi", Field(key="doi", value=None)).value,
         ed.get("times-cited", Field(key="times-cited", value=None)).value,
         ed.get("usage-count-since-2013", Field(key="usage-count-since-2013", value=None)).value,
         ed.get("keywords", Field(key="keywords", value=None)).value,
     ])
-# FIXME do not just drop missing values
-bib_df = pd.DataFrame(reformatted, columns = ["year", "author", "title", "type", "cited", "usage", "keywords"])
-bib_df = bib_df.dropna(how="any")
-bib_df["date"] = pd.to_datetime(bib_df["year"], format="mixed")
+zot_df = pd.DataFrame(reformatted, columns = ["doi", "cited", "usage", "keywords"])
+
+bib_df["doi"] = bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)
+bib_df["zot_cited"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["cited"])
+bib_df["zot_usage"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["usage"])
+bib_df["zot_keywords"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["keywords"])
 
+bib_df["date"] = pd.to_datetime(bib_df["year"], format="%Y")
 bib_df["year"] = bib_df["date"].dt.year
 
 # only keep newer entries
@@ -498,8 +503,8 @@ bib_df = bib_df[bib_df["year"] >= 2000]
 #| fig-cap: Publications per year
 
 # create dummy category for white or gray lit type (based on 'article' appearing in type)
-bib_df["type"].value_counts()
-bib_df["literature"] = np.where(bib_df["type"].str.contains("article", case=False, regex=False), "white", "gray")
+bib_df["pubtype"].value_counts()
+bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray")
 bib_df["literature"] = bib_df["literature"].astype("category")
 
 # plot by year, distinguished by literature type
@@ -525,9 +530,9 @@ First, in general, citation counts are slightly decreasing - as should generally
 ```{python}
 #| label: fig-citations-per-year-avg
 #| fig-cap: Average citations per year
-bib_df["cited"] = bib_df["cited"].astype("int")
-grpd = bib_df.groupby(["year"], as_index=False)["cited"].mean()
-ax = sns.barplot(grpd, x="year", y="cited")
+bib_df["zot_cited"] = bib_df["zot_cited"].dropna().astype("int")
+grpd = bib_df.groupby(["year"], as_index=False)["zot_cited"].mean()
+ax = sns.barplot(grpd, x="year", y="zot_cited")
 ax.tick_params(axis='x', rotation=45)
 plt.tight_layout()
 plt.show()
@@ -555,7 +560,7 @@ Should they point towards gaps (or over-optimization) of specific areas of interest
 #| column: page
 
 interv_type_df = (
-    bib_df["keywords"]
+    bib_df["zot_keywords"]
    .str.replace(r"\_", " ")
    .str.extractall(r"type::([\w ]+)")
    .reset_index(drop=True)
@@ -579,7 +584,7 @@ plt.show()
 #| column: page
 
 inequ_type_df = (
-    bib_df["keywords"]
+    bib_df["zot_keywords"]
    .str.replace(r"\_", " ")
    .str.extractall(r"inequality::([\w ]+)")
    .reset_index(drop=True)
````
````diff
@@ -1,7 +1,7 @@
 import io
 from pathlib import Path
 import sys
-import load_yaml
+from src import load_yaml
 from pandas import DataFrame, read_csv
 
 DEFAULT_YAML_PATH = Path("02-data/processed/relevant")
````