From ed3d09b3f7df4a25511061b31d071be184692858 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Thu, 7 Dec 2023 22:34:18 +0100
Subject: [PATCH] feat(script): Use extracted data for manuscript

For the first time we use the actual final extracted data from relevant
studies to do analysis on instead of just the intermediate
Zotero-provided metadata.

We still inject the intermediate metadata where it may be useful
(things like citation counts and keywords) but otherwise switch to the
new data.
---
 scoping_review.qmd | 35 ++++++++++++++++++++---------------
 src/data.py        |  2 +-
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/scoping_review.qmd b/scoping_review.qmd
index 0e26217..d5a0fd4 100644
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@@ -464,22 +464,27 @@ Keeping in mind that these results are not yet screened for their full relevance

 ```{python}
 # load relevant studies
+from src import data
+bib_df = data.from_yml(f"{PROCESSED_DATA}/relevant")
+
+# load zotero-based metadata
 reformatted = []
 for e in sample_relevant:
     ed = e.fields_dict
-    reformatted.append([ed.get("year", "0000").value,
-                        ed.get("author").value,
-                        ed.get("title").value,
-                        ed.get("type", Field(key="type", value=None)).value,
+    reformatted.append([
+        ed.get("doi", Field(key="doi", value=None)).value,
                         ed.get("times-cited", Field(key="times-cited", value=None)).value,
                         ed.get("usage-count-since-2013", Field(key="usage-count-since-2013", value=None)).value,
                         ed.get("keywords", Field(key="keywords", value=None)).value,
                         ])
+zot_df = pd.DataFrame(reformatted, columns = ["doi", "cited", "usage", "keywords"])

-# FIXME do not just drop missing values
-bib_df = pd.DataFrame(reformatted, columns = ["year", "author", "title", "type", "cited", "usage", "keywords"])
-bib_df = bib_df.dropna(how="any")
-bib_df["date"] = pd.to_datetime(bib_df["year"], format="mixed")
+bib_df["doi"] = bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)
+bib_df["zot_cited"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["cited"])
+bib_df["zot_usage"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["usage"])
+bib_df["zot_keywords"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["keywords"])
+
+bib_df["date"] = pd.to_datetime(bib_df["year"], format="%Y")
 bib_df["year"] = bib_df["date"].dt.year

 # only keep newer entries
@@ -498,8 +503,8 @@ bib_df = bib_df[bib_df["year"] >= 2000]
 #| fig-cap: Publications per year

 # create dummy category for white or gray lit type (based on 'article' appearing in type)
-bib_df["type"].value_counts()
-bib_df["literature"] = np.where(bib_df["type"].str.contains("article", case=False, regex=False), "white", "gray")
+bib_df["pubtype"].value_counts()
+bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray")
 bib_df["literature"] = bib_df["literature"].astype("category")

 # plot by year, distinguished by literature type
@@ -525,9 +530,9 @@ First, in general, citation counts are slightly decreasing - as should generally
 ```{python}
 #| label: fig-citations-per-year-avg
 #| fig-cap: Average citations per year
-bib_df["cited"] = bib_df["cited"].astype("int")
-grpd = bib_df.groupby(["year"], as_index=False)["cited"].mean()
-ax = sns.barplot(grpd, x="year", y="cited")
+bib_df["zot_cited"] = bib_df["zot_cited"].dropna().astype("int")
+grpd = bib_df.groupby(["year"], as_index=False)["zot_cited"].mean()
+ax = sns.barplot(grpd, x="year", y="zot_cited")
 ax.tick_params(axis='x', rotation=45)
 plt.tight_layout()
 plt.show()
@@ -555,7 +560,7 @@ Should they point towards gaps (or over-optimization) of specific areas of inter
 #| column: page

 interv_type_df = (
-    bib_df["keywords"]
+    bib_df["zot_keywords"]
     .str.replace(r"\_", " ")
     .str.extractall(r"type::([\w ]+)")
     .reset_index(drop=True)
@@ -579,7 +584,7 @@ plt.show()
 #| column: page

 inequ_type_df = (
-    bib_df["keywords"]
+    bib_df["zot_keywords"]
     .str.replace(r"\_", " ")
     .str.extractall(r"inequality::([\w ]+)")
     .reset_index(drop=True)
diff --git a/src/data.py b/src/data.py
index 8ba8f99..4ac0fb4 100644
--- a/src/data.py
+++ b/src/data.py
@@ -1,7 +1,7 @@
 import io
 from pathlib import Path
 import sys
-import load_yaml
+from src import load_yaml
 from pandas import DataFrame, read_csv

 DEFAULT_YAML_PATH = Path("02-data/processed/relevant")
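For reference, below is a minimal runnable sketch of the DOI-based join this patch introduces: only the `uri` extraction regex and the `drop_duplicates`/`set_index`/`map` pattern are taken from the diff above; the sample frames, DOIs, and citation counts are hypothetical stand-ins.

```python
import pandas as pd

# extracted study data keyed by a DOI-bearing URI (stands in for bib_df;
# the example rows are invented for illustration)
bib_df = pd.DataFrame({
    "uri": ["https://doi.org/10.1000/xyz123", "https://dx.doi.org/10.1000/abc456"],
    "year": ["2015", "2019"],
})

# intermediate Zotero metadata keyed by bare DOI (stands in for zot_df)
zot_df = pd.DataFrame({
    "doi": ["10.1000/xyz123", "10.1000/abc456"],
    "cited": [12, 3],
})

# normalize the URI down to a bare DOI so both frames share a join key;
# expand=False returns a Series instead of a one-column DataFrame
bib_df["doi"] = bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)

# dropping duplicate DOIs first keeps the lookup index unique, so .map()
# injects exactly one metadata value per study without collisions
bib_df["zot_cited"] = bib_df["doi"].map(
    zot_df.drop_duplicates("doi").set_index("doi")["cited"]
)

print(bib_df[["doi", "zot_cited"]])
```

Mapping against a DOI-indexed Series behaves like a per-column left join: `bib_df` keeps its shape and index, and any study whose DOI has no Zotero counterpart simply receives `NaN` rather than being dropped, which is what makes the later `.dropna()` before `astype("int")` necessary in the citation plot.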