feat(script): Use extracted data for manuscript
For the first time, the analysis uses the actual final extracted data from the relevant studies instead of only the intermediate Zotero-provided metadata. We still inject the intermediate metadata where it is useful (e.g. citation counts and keywords), but otherwise switch to the new data.
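In essence, the manuscript now builds its main table from the extracted records and only maps the Zotero fields onto it by DOI. Below is a minimal, self-contained sketch of that join, following the same regex/`map` pattern used in the diff; the two small DataFrames are made-up stand-ins for the extracted data (`bib_df`) and the Zotero export (`zot_df`):

```python
import pandas as pd

# stand-in for the extracted study data (data.from_yml output)
bib_df = pd.DataFrame({
    "uri": ["https://doi.org/10.1000/a", "https://doi.org/10.1000/b"],
    "year": ["2015", "2021"],
})

# stand-in for the intermediate Zotero metadata, keyed by DOI
zot_df = pd.DataFrame({
    "doi": ["10.1000/a", "10.1000/b"],
    "cited": [12, 3],
    "keywords": ["type::tax", "type::subsidy"],
})

# normalise the DOI out of the URI, then inject the Zotero columns by lookup
bib_df["doi"] = bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)
lookup = zot_df.drop_duplicates("doi").set_index("doi")
bib_df["zot_cited"] = bib_df["doi"].map(lookup["cited"])
bib_df["zot_keywords"] = bib_df["doi"].map(lookup["keywords"])
print(bib_df)
```

Entries whose DOI has no Zotero match simply come back as NaN from `map`, which is why the injected columns are treated as optional extras rather than required fields downstream.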
parent e4e7d9e3e4
commit ed3d09b3f7
2 changed files with 21 additions and 16 deletions
@@ -464,22 +464,27 @@ Keeping in mind that these results are not yet screened for their full relevance
 
 ```{python}
+# load relevant studies
+from src import data
+bib_df = data.from_yml(f"{PROCESSED_DATA}/relevant")
+
+# load zotero-based metadata
 reformatted = []
 for e in sample_relevant:
     ed = e.fields_dict
-    reformatted.append([ed.get("year", "0000").value,
-        ed.get("author").value,
-        ed.get("title").value,
-        ed.get("type", Field(key="type", value=None)).value,
+    reformatted.append([
+        ed.get("doi", Field(key="doi", value=None)).value,
         ed.get("times-cited", Field(key="times-cited", value=None)).value,
         ed.get("usage-count-since-2013", Field(key="usage-count-since-2013", value=None)).value,
         ed.get("keywords", Field(key="keywords", value=None)).value,
     ])
+zot_df = pd.DataFrame(reformatted, columns = ["doi", "cited", "usage", "keywords"])
 
-# FIXME do not just drop missing values
-bib_df = pd.DataFrame(reformatted, columns = ["year", "author", "title", "type", "cited", "usage", "keywords"])
-bib_df = bib_df.dropna(how="any")
+bib_df["date"] = pd.to_datetime(bib_df["year"], format="mixed")
+bib_df["doi"] = bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)
+bib_df["zot_cited"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["cited"])
+bib_df["zot_usage"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["usage"])
+bib_df["zot_keywords"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["keywords"])
 
-bib_df["date"] = pd.to_datetime(bib_df["year"], format="%Y")
 bib_df["year"] = bib_df["date"].dt.year
 
 # only keep newer entries
@@ -498,8 +503,8 @@ bib_df = bib_df[bib_df["year"] >= 2000]
 #| fig-cap: Publications per year
 
 # create dummy category for white or gray lit type (based on 'article' appearing in type)
-bib_df["type"].value_counts()
-bib_df["literature"] = np.where(bib_df["type"].str.contains("article", case=False, regex=False), "white", "gray")
+bib_df["pubtype"].value_counts()
+bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray")
 bib_df["literature"] = bib_df["literature"].astype("category")
 
 # plot by year, distinguished by literature type
@@ -525,9 +530,9 @@ First, in general, citation counts are slightly decreasing - as should generally
 ```{python}
 #| label: fig-citations-per-year-avg
 #| fig-cap: Average citations per year
-bib_df["cited"] = bib_df["cited"].astype("int")
-grpd = bib_df.groupby(["year"], as_index=False)["cited"].mean()
-ax = sns.barplot(grpd, x="year", y="cited")
+bib_df["zot_cited"] = bib_df["zot_cited"].dropna().astype("int")
+grpd = bib_df.groupby(["year"], as_index=False)["zot_cited"].mean()
+ax = sns.barplot(grpd, x="year", y="zot_cited")
 ax.tick_params(axis='x', rotation=45)
 plt.tight_layout()
 plt.show()
@@ -555,7 +560,7 @@ Should they point towards gaps (or over-optimization) of sepcific areas of inter
 #| column: page
 
 interv_type_df = (
-    bib_df["keywords"]
+    bib_df["zot_keywords"]
     .str.replace(r"\_", " ")
     .str.extractall(r"type::([\w ]+)")
     .reset_index(drop=True)
@@ -579,7 +584,7 @@ plt.show()
 #| column: page
 
 inequ_type_df = (
-    bib_df["keywords"]
+    bib_df["zot_keywords"]
     .str.replace(r"\_", " ")
     .str.extractall(r"inequality::([\w ]+)")
     .reset_index(drop=True)
@@ -1,7 +1,7 @@
 import io
 from pathlib import Path
 import sys
-import load_yaml
+from src import load_yaml
 from pandas import DataFrame, read_csv
 
 DEFAULT_YAML_PATH = Path("02-data/processed/relevant")