diff --git a/scoping_review.qmd b/scoping_review.qmd
index 0e26217..d5a0fd4 100644
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@@ -464,22 +464,27 @@ Keeping in mind that these results are not yet screened for their full relevance
 
 ```{python}
 # load relevant studies
+from src import data
+bib_df = data.from_yml(f"{PROCESSED_DATA}/relevant")
+
+# load zotero-based metadata
 reformatted = []
 for e in sample_relevant:
     ed = e.fields_dict
-    reformatted.append([ed.get("year", "0000").value,
-        ed.get("author").value,
-        ed.get("title").value,
-        ed.get("type", Field(key="type", value=None)).value,
+    reformatted.append([
+        ed.get("doi", Field(key="doi", value=None)).value,
         ed.get("times-cited", Field(key="times-cited", value=None)).value,
         ed.get("usage-count-since-2013", Field(key="usage-count-since-2013", value=None)).value,
         ed.get("keywords", Field(key="keywords", value=None)).value,
     ])
+zot_df = pd.DataFrame(reformatted, columns = ["doi", "cited", "usage", "keywords"])
 
-# FIXME do not just drop missing values
-bib_df = pd.DataFrame(reformatted, columns = ["year", "author", "title", "type", "cited", "usage", "keywords"])
-bib_df = bib_df.dropna(how="any")
-bib_df["date"] = pd.to_datetime(bib_df["year"], format="mixed")
+bib_df["doi"] = bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)
+bib_df["zot_cited"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["cited"])
+bib_df["zot_usage"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["usage"])
+bib_df["zot_keywords"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["keywords"])
+
+bib_df["date"] = pd.to_datetime(bib_df["year"], format="%Y")
 bib_df["year"] = bib_df["date"].dt.year
 
 # only keep newer entries
@@ -498,8 +503,8 @@ bib_df = bib_df[bib_df["year"] >= 2000]
 #| fig-cap: Publications per year
 
 # create dummy category for white or gray lit type (based on 'article' appearing in type)
-bib_df["type"].value_counts()
-bib_df["literature"] = np.where(bib_df["type"].str.contains("article", case=False, regex=False), "white", "gray")
+bib_df["pubtype"].value_counts()
+bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray")
 bib_df["literature"] = bib_df["literature"].astype("category")
 
 # plot by year, distinguished by literature type
@@ -525,9 +530,9 @@ First, in general, citation counts are slightly decreasing - as should generally
 ```{python}
 #| label: fig-citations-per-year-avg
 #| fig-cap: Average citations per year
-bib_df["cited"] = bib_df["cited"].astype("int")
-grpd = bib_df.groupby(["year"], as_index=False)["cited"].mean()
-ax = sns.barplot(grpd, x="year", y="cited")
+bib_df["zot_cited"] = bib_df["zot_cited"].dropna().astype("int")
+grpd = bib_df.groupby(["year"], as_index=False)["zot_cited"].mean()
+ax = sns.barplot(grpd, x="year", y="zot_cited")
 ax.tick_params(axis='x', rotation=45)
 plt.tight_layout()
 plt.show()
@@ -555,7 +560,7 @@ Should they point towards gaps (or over-optimization) of sepcific areas of inter
 #| column: page
 
 interv_type_df = (
-    bib_df["keywords"]
+    bib_df["zot_keywords"]
     .str.replace(r"\_", " ")
     .str.extractall(r"type::([\w ]+)")
     .reset_index(drop=True)
@@ -579,7 +584,7 @@ plt.show()
 #| column: page
 
 inequ_type_df = (
-    bib_df["keywords"]
+    bib_df["zot_keywords"]
     .str.replace(r"\_", " ")
     .str.extractall(r"inequality::([\w ]+)")
     .reset_index(drop=True)
diff --git a/src/data.py b/src/data.py
index 8ba8f99..4ac0fb4 100644
--- a/src/data.py
+++ b/src/data.py
@@ -1,7 +1,7 @@
 import io
 from pathlib import Path
 import sys
-import load_yaml
+from src import load_yaml
 from pandas import DataFrame, read_csv
 
 DEFAULT_YAML_PATH = Path("02-data/processed/relevant")
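
The patch calls `data.from_yml(f"{PROCESSED_DATA}/relevant")` but does not show the function body. A minimal sketch of what such a loader might look like, assuming one or more reference records per `.yml` file under the processed-data directory; PyYAML stands in here for the repo's own `src.load_yaml` helper, whose interface the diff does not show:

```python
# Hypothetical sketch only -- the real src.data.from_yml is not part of this diff.
from pathlib import Path

import yaml  # stand-in for src.load_yaml, whose API is not shown
from pandas import DataFrame

DEFAULT_YAML_PATH = Path("02-data/processed/relevant")


def from_yml(path=DEFAULT_YAML_PATH) -> DataFrame:
    """Collect every YAML record found under `path` into one DataFrame."""
    records = []
    for f in sorted(Path(path).glob("*.yml")):
        with f.open() as fh:
            entry = yaml.safe_load(fh)
        # a file may hold a single record (dict) or several (list of dicts)
        records.extend(entry if isinstance(entry, list) else [entry])
    return DataFrame(records)
```

Whatever the real implementation, the downstream qmd code expects the resulting frame to carry at least `uri`, `year`, and `pubtype` columns, since it immediately derives `doi`, `date`, and `literature` from them.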
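One caveat in the citations chunk: `bib_df["zot_cited"] = bib_df["zot_cited"].dropna().astype("int")` assigns a shorter series back to the full-length column, so pandas realigns on the index, the dropped rows come back as NaN, and the dtype silently reverts to float. If the intent is an integer column that tolerates missing values, pandas' nullable `Int64` dtype achieves that without the round trip (a sketch on a toy frame, not part of the patch):

```python
import pandas as pd

# toy frame standing in for bib_df
bib_df = pd.DataFrame({"year": [2019, 2019, 2020], "zot_cited": ["12", None, "3"]})

# coerce to numbers (missing stays NaN), then to pandas' nullable integer dtype;
# unlike dropna().astype("int"), row alignment and an integer dtype both survive
bib_df["zot_cited"] = pd.to_numeric(bib_df["zot_cited"], errors="coerce").astype("Int64")

# the aggregation from the patch works unchanged on Int64 columns
grpd = bib_df.groupby(["year"], as_index=False)["zot_cited"].mean()
```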
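Similarly, the three `.map(zot_df.drop_duplicates("doi").set_index("doi")[...])` lookups rebuild the same DOI index once per column; a single left merge expresses the same join in one pass (an equivalent sketch under the patch's column names, with toy data):

```python
import pandas as pd

# toy stand-ins for the patch's bib_df and zot_df
bib_df = pd.DataFrame({"doi": ["10.1/a", "10.1/b"], "year": [2019, 2021]})
zot_df = pd.DataFrame({
    "doi": ["10.1/a", "10.1/a", "10.1/b"],
    "cited": [4, 4, 7],
    "usage": [1, 1, 2],
    "keywords": ["type::cash_transfer", "type::cash_transfer", "type::tax"],
})

# one left join instead of three per-column .map() lookups
zot_dedup = zot_df.drop_duplicates("doi").rename(
    columns={"cited": "zot_cited", "usage": "zot_usage", "keywords": "zot_keywords"}
)
bib_df = bib_df.merge(zot_dedup, on="doi", how="left")
```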