chore(code): Update explore experiments
This commit is contained in:
parent
259d95693f
commit
4c763fac0f
1 changed files with 109 additions and 9 deletions
|
@ -158,14 +158,114 @@ pd.crosstab(df_income["Intervention"], df_income["Inequality"])
|
|||
#| label: tbl-income-crosstab
|
||||
#| tbl-cap: Interventions targeting income inequality
|
||||
|
||||
def inequality_crosstab(df, inequality: str):
    """Cross-tabulate interventions against one inequality dimension.

    Splits the ';'-separated 'inequality' and 'intervention' columns by
    exploding the dataframe itself so exploded values stay aligned with
    their source rows. (The previous version assigned the exploded Series
    back onto the un-exploded frame; because explode lengthens the Series
    and alignment is by index, rows were silently mis-matched whenever a
    cell held more than one value.) Rows are filtered to the requested
    *inequality* and parenthesised qualifiers are stripped from
    intervention names before tabulating.
    """
    temp_df = df.copy()
    temp_df["Inequality"] = temp_df["inequality"].str.split(";")
    temp_df = temp_df.explode("Inequality", ignore_index=True)
    temp_df["Inequality"] = temp_df["Inequality"].str.strip()
    temp_df = temp_df.loc[temp_df["Inequality"] == inequality].copy()
    temp_df["Intervention"] = temp_df["intervention"].str.split(";")
    temp_df = temp_df.explode("Intervention", ignore_index=True)
    temp_df["Intervention"] = (temp_df["Intervention"]
        .str.replace(r"\(.+\)", "", regex=True)
        .str.strip())
    return pd.crosstab(temp_df["Intervention"], temp_df["Inequality"])
|
||||
# Long-format lookup: one row per (intervention, inequality) pair.
# Explode the *dataframe* (not just the column) so values stay aligned
# with their source rows -- assigning an exploded Series back onto the
# original frame silently mis-aligns rows whenever a cell holds more
# than one ';'-separated value.
temp_df = df[["intervention", "inequality"]].copy().reset_index(drop=True)
temp_df["Inequality"] = temp_df["inequality"].str.split(";")
temp_df = temp_df.explode("Inequality", ignore_index=True)
temp_df["Inequality"] = temp_df["Inequality"].str.strip()
temp_df["Intervention"] = temp_df["intervention"].str.split(";")
temp_df = temp_df.explode("Intervention", ignore_index=True)
temp_df["Intervention"] = (temp_df["Intervention"]
    .str.replace(r"\(.+\)", "", regex=True)
    .str.strip())

inequality_crosstab(df, "income")

# per-dimension subsets of the exploded pairs
gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
|
||||
```
|
||||
|
||||
Prepare the full data set:
|
||||
|
||||
```{python}
|
||||
#| echo: false
|
||||
## standard imports
from pathlib import Path
import re

from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

# Project data layout: raw inputs, intermediate (screened) data,
# processed (final) data, and supplementary lookup tables.
DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR.joinpath("raw")
WORKING_DATA = DATA_DIR.joinpath("intermediate")
PROCESSED_DATA = DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")


def _read_bib_library(directory: Path):
    """Concatenate every .bib fragment under *directory* and parse the result.

    Uses read_text() so file contents are passed through unchanged; the
    previous newline-join over f.readlines() doubled every newline because
    readlines() keeps line terminators.
    """
    bib_string = ""
    for partial_bib in directory.glob("**/*.bib"):
        bib_string += partial_bib.read_text() + "\n"
    return bibtexparser.parse_string(bib_string)


# full raw sample and the screened (intermediate) sample
bib_sample_raw_db = _read_bib_library(RAW_DATA)
bib_sample = _read_bib_library(WORKING_DATA)
|
||||
```
|
||||
|
||||
```{python}
|
||||
# load relevant studies
from src import data

# load zotero-based metadata: citations and uses
# One row per unique DOI from the parsed bib sample; entries that lack a
# given field contribute None for that column.
zot_df = pd.DataFrame([
    [
        entry["doi"] if "doi" in entry.fields_dict else None,
        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
        entry["usage"] if "usage" in entry.fields_dict else None,
        entry["keywords"] if "keywords" in entry.fields_dict else None,
    ]
    for entry in bib_sample.entries
], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")

# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

# Load processed study metadata and enrich it:
#  - doi: extracted from the study's uri (handles http/https and 'dx.' host)
#  - zot_*: citation/usage/keyword metadata joined from zot_df via doi
#  - date/year: parsed from the year column; assign() evaluates keyword
#    arguments in order, so 'year' may rely on the 'date' created just above
#  - region / income_group: WB lookups keyed on the study's country
#    NOTE(review): .map assumes single-country values in 'country' --
#    multi-country strings ("A; B") will not match the lookup; confirm.
# Studies published before 2000 are dropped.
bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")
    .assign(
        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year = lambda _df: _df["date"].dt.year,
        region = lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)
# release the lookup frames; they are not needed past this cell
zot_df = None
df_country_groups = None
|
||||
```
|
||||
|
||||
```{python}
|
||||
# country lookup extended with a synthetic 'global' economy (region 'World')
# so world-spanning studies can still be mapped to a region
df_country_groups = pd.concat([
    pd.read_excel(WB_COUNTRY_GROUPS_FILE),
    pd.DataFrame(data={
        'Economy': ['global'],
        'Code': ['WLD'],
        'Region': ['World'],
        'Income group': [''],
        'Lending category': [''],
    }),
]).set_index("Economy")
|
||||
|
||||
def countries_to_regions(countries: str):
    """Map a ';'-separated country string to its distinct WB regions.

    Looks each country up in the module-level df_country_groups index;
    names not present there are skipped silently. Returns a ';'-joined
    string of the unique regions, sorted so the result is deterministic
    (joining a plain set produced arbitrary, run-to-run ordering).
    """
    regions = set()
    # normalise "; " / " ;" separators before splitting
    for country in countries.replace(" ;", ";").replace("; ", ";").split(";"):
        if country in df_country_groups.index:
            regions.add(df_country_groups.at[country, 'Region'])
    return ";".join(sorted(regions))
|
||||
|
||||
# e.g. countries_to_regions("India; Nicaragua") -> two WB regions joined by ';'
bib_df["region"] = bib_df["country"].map(countries_to_regions)
|
||||
```
|
||||
|
||||
```{python}
|
||||
# De-duplicate each study's multi-region string, then explode so every
# (study, region) pair becomes one row for counting.
bib_df = (bib_df
    .assign(
        region=lambda _df: _df["region"].apply(
            lambda _cell: {part.strip() for part in _cell.split(";")}
        ),
    )
    .explode("region")
)

# bar chart of study counts per region, most frequent region first
ax = sns.countplot(bib_df, x="region", order=bib_df["region"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
|
||||
```
|
||||
|
|
Loading…
Reference in a new issue