feat(script): Use extracted data for manuscript

For the first time the analysis runs on the actual final extracted data from the
relevant studies instead of just the intermediate Zotero-provided metadata.

We still inject the intermediate metadata where it may be useful (things like
citation counts and keywords) but otherwise switch to the new data.
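
Roughly, the Zotero export now only contributes supplementary columns: the
extracted records form the primary frame, and the Zotero fields are mapped onto
it by DOI. A minimal sketch of that join (the two toy frames below are made-up
stand-ins; the real code builds them from data.from_yml() and the
sample_relevant Zotero entries, as the diff shows):

```python
import pandas as pd

# Extracted study data (primary frame); toy stand-in for data.from_yml(...).
bib_df = pd.DataFrame({
    "uri": ["https://dx.doi.org/10.1000/a1", "https://doi.org/10.1000/b2"],
    "year": [2015, 2019],
})

# Intermediate Zotero-provided metadata, keyed by DOI (toy values).
zot_df = pd.DataFrame({
    "doi": ["10.1000/a1", "10.1000/b2"],
    "cited": [12, 3],
    "keywords": ["type::training", "type::subsidy"],
})

# Pull the DOI out of each record's URI so both frames share a key.
bib_df["doi"] = bib_df["uri"].str.extract(
    r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
)

# Map the supplementary Zotero columns onto the extracted records;
# entries without a Zotero match simply end up as NaN.
lookup = zot_df.drop_duplicates("doi").set_index("doi")
bib_df["zot_cited"] = bib_df["doi"].map(lookup["cited"])
bib_df["zot_keywords"] = bib_df["doi"].map(lookup["keywords"])

print(bib_df[["doi", "year", "zot_cited", "zot_keywords"]])
```

Mapping against a DOI-indexed lookup keeps the extracted records as the source
of truth and leaves gaps where no Zotero match exists, instead of dropping rows.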
Marty Oehme 2023-12-07 22:34:18 +01:00
parent e4e7d9e3e4
commit ed3d09b3f7
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 21 additions and 16 deletions

@@ -464,22 +464,27 @@ Keeping in mind that these results are not yet screened for their full relevance
 ```{python}
 # load relevant studies
+from src import data
+bib_df = data.from_yml(f"{PROCESSED_DATA}/relevant")
+# load zotero-based metadata
 reformatted = []
 for e in sample_relevant:
     ed = e.fields_dict
-    reformatted.append([ed.get("year", "0000").value,
-        ed.get("author").value,
-        ed.get("title").value,
-        ed.get("type", Field(key="type", value=None)).value,
+    reformatted.append([
+        ed.get("doi", Field(key="doi", value=None)).value,
         ed.get("times-cited", Field(key="times-cited", value=None)).value,
         ed.get("usage-count-since-2013", Field(key="usage-count-since-2013", value=None)).value,
         ed.get("keywords", Field(key="keywords", value=None)).value,
     ])
-# FIXME do not just drop missing values
-bib_df = pd.DataFrame(reformatted, columns = ["year", "author", "title", "type", "cited", "usage", "keywords"])
-bib_df = bib_df.dropna(how="any")
-bib_df["date"] = pd.to_datetime(bib_df["year"], format="mixed")
+zot_df = pd.DataFrame(reformatted, columns = ["doi", "cited", "usage", "keywords"])
+bib_df["doi"] = bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)
+bib_df["zot_cited"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["cited"])
+bib_df["zot_usage"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["usage"])
+bib_df["zot_keywords"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["keywords"])
+bib_df["date"] = pd.to_datetime(bib_df["year"], format="%Y")
 bib_df["year"] = bib_df["date"].dt.year
 # only keep newer entries
@@ -498,8 +503,8 @@ bib_df = bib_df[bib_df["year"] >= 2000]
 #| fig-cap: Publications per year
 # create dummy category for white or gray lit type (based on 'article' appearing in type)
-bib_df["type"].value_counts()
-bib_df["literature"] = np.where(bib_df["type"].str.contains("article", case=False, regex=False), "white", "gray")
+bib_df["pubtype"].value_counts()
+bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray")
 bib_df["literature"] = bib_df["literature"].astype("category")
 # plot by year, distinguished by literature type
@@ -525,9 +530,9 @@ First, in general, citation counts are slightly decreasing - as should generally
 ```{python}
 #| label: fig-citations-per-year-avg
 #| fig-cap: Average citations per year
-bib_df["cited"] = bib_df["cited"].astype("int")
-grpd = bib_df.groupby(["year"], as_index=False)["cited"].mean()
-ax = sns.barplot(grpd, x="year", y="cited")
+bib_df["zot_cited"] = bib_df["zot_cited"].dropna().astype("int")
+grpd = bib_df.groupby(["year"], as_index=False)["zot_cited"].mean()
+ax = sns.barplot(grpd, x="year", y="zot_cited")
 ax.tick_params(axis='x', rotation=45)
 plt.tight_layout()
 plt.show()
@@ -555,7 +560,7 @@ Should they point towards gaps (or over-optimization) of specific areas of inter
 #| column: page
 interv_type_df = (
-    bib_df["keywords"]
+    bib_df["zot_keywords"]
     .str.replace(r"\_", " ")
     .str.extractall(r"type::([\w ]+)")
     .reset_index(drop=True)
@@ -579,7 +584,7 @@ plt.show()
 #| column: page
 inequ_type_df = (
-    bib_df["keywords"]
+    bib_df["zot_keywords"]
     .str.replace(r"\_", " ")
     .str.extractall(r"inequality::([\w ]+)")
     .reset_index(drop=True)

@@ -1,7 +1,7 @@
 import io
 from pathlib import Path
 import sys
-import load_yaml
+from src import load_yaml
 from pandas import DataFrame, read_csv
 DEFAULT_YAML_PATH = Path("02-data/processed/relevant")