wow-inequalities/00-notebooks/explore.qmd

---
bibliography: 02-data/supplementary/lib.bib
title: Grab yml
---

```{python}
import pandas as pd
from src import data

# Load the extracted data through the project's `data.from_yml` helper,
# called without arguments (i.e. with whatever default source it defines).
df = data.from_yml()
```

Get interventions:

```{python}
# Split the ';'-separated cells, flatten them to one value per row,
# trim surrounding whitespace and tally how often each intervention occurs.
(
    df["intervention"]
    .str.split(";")
    .explode()
    .str.strip()
    .value_counts()
)
```

Get inequalities:

```{python}
# Split the ';'-separated cells, flatten them to one value per row,
# trim surrounding whitespace and tally how often each inequality occurs.
(
    df["inequality"]
    .str.split(";")
    .explode()
    .str.strip()
    .value_counts()
)
```

```{python}
# Take the first observation of every study and join the raw `intervention`
# column onto it.
# NOTE(review): `first()` already carries an `intervention` column, and the
# grouped frame is indexed by (author, year, title) while `df['intervention']`
# keeps the original row index -- this join looks likely to fail on the
# overlapping column name or to misalign; confirm the intent.
df.groupby(["author", "year", "title"]).first().join(df['intervention'])
```

Unique values in chain method:

```{python}
# Count in how many studies each inequality appears.
(
    df.groupby(["author", "year", "title"])
    # gather the values of all observations belonging to one study
    .agg({"inequality": lambda _col: "; ".join(_col)})
    # move the study keys into columns *before* de-duplicating, so two different
    # studies that happen to share the same values are not collapsed into one
    .reset_index()
    .drop_duplicates()
    .assign(
        # `explode` only expands list-likes: split the joined string first and
        # keep every inequality once per study
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: sorted({x.strip() for x in _cell.split(";")})
        ),
    )
    .explode("inequality")["inequality"]
    .value_counts()
)
```

Merge the dataset so that it is organised by *STUDY* rather than by *OBSERVATION*.
Any further required columns can be computed in the same way as in the `agg` call here.

```{python}
def _concat_observations(_col):
    """Join the values of all observations of one study with '; '."""
    return "; ".join(_col)


def _unique_parts(_cell):
    """Split a ';'-separated string into the set of its stripped parts."""
    return {part.strip() for part in _cell.split(";")}


# "region" and "income_group" could be aggregated the same way, but they are
# only accessible when merging with WB data.
_joined_columns = ("intervention", "inequality", "date", "findings")

# One row per study: first concatenate the observation-level columns ...
by_study = (
    df.groupby(["author", "year", "title"])
    .agg({column: _concat_observations for column in _joined_columns})
    .reset_index()
    .drop_duplicates()
    # ... then create de-duplicated joins for all observations
    .assign(
        intervention=lambda _df: _df["intervention"].apply(_unique_parts),
        inequality=lambda _df: _df["inequality"].apply(_unique_parts),
    )
)
```

```{python}
def _as_unique_set(_cell):
    """Return the distinct, whitespace-stripped parts of a ';'-separated cell."""
    return {part.strip() for part in _cell.split(";")}


# Study-level frame built from the first observation of every study.
_first_observations = (
    df.groupby(["author", "year", "title"]).first().reset_index().drop_duplicates()
)
# Replace the ';'-separated strings with de-duplicated sets of values.
by_study = _first_observations.assign(
    intervention=_first_observations["intervention"].map(_as_unique_set),
    inequality=_first_observations["inequality"].map(_as_unique_set),
)
```

```{python}
import re
from matplotlib import pyplot as plt
import seaborn as sns
# Count, per study, which interventions are used and plot their frequency.
by_intervention = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            # gather the interventions of all observations of one study
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            # Remove each parenthesised remark on its own (`[^)]*`): a greedy
            # `.*` would span from the first "(" to the last ")" of the joined
            # string and swallow every intervention in between.
            lambda _cell: {
                x.strip() for x in re.sub(r"\([^)]*\)", "", _cell).split(";")
            }
        ),
    )
    .explode("intervention")
)
# most frequent intervention first
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.show()
# drop the reference to the temporary frame
by_intervention = None
```

```{python}
#| label: fig-publications-per-year
#| fig-cap: Publications per year

# Reduce the observations to one row per study so every publication counts once.
df_study_years = (
    df.groupby(["author", "year", "title"]).first().reset_index().drop_duplicates()
)
# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue
# FIXME should be timeseries plot so no years are missing
ax = sns.countplot(data=df_study_years, x="year", native_scale=True)
ax.set_xlabel("")
ax.tick_params(axis="x", rotation=45)
plt.tight_layout()
plt.show()
# drop the reference to the temporary frame
df_study_years = None
```

```{python}
#| label: tbl-income-crosstab
#| tbl-cap: Interventions targeting income inequality

# Build one row per (intervention, inequality) pair of every observation and
# keep the pairs that target income inequality.
# The frame itself is exploded so each value stays on the observation it came
# from; assigning an exploded Series back as a column would re-align it by
# index label and attach values to the wrong rows.
df_income = (
    df.assign(
        Inequality=lambda _df: _df["inequality"].str.split(";"),
        Intervention=lambda _df: _df["intervention"].str.split(";"),
    )
    .explode("Inequality", ignore_index=True)
    .explode("Intervention", ignore_index=True)
    .assign(
        Inequality=lambda _df: _df["Inequality"].str.strip(),
        # drop parenthesised remarks from the intervention names
        Intervention=lambda _df: _df["Intervention"]
        .str.replace(r"\(.+\)", "", regex=True)
        .str.strip(),
    )
    .loc[lambda _df: _df["Inequality"] == "income"]
    .copy()
)
pd.crosstab(df_income["Intervention"], df_income["Inequality"])
```

```{python}
#| label: tbl-income-crosstab
#| tbl-cap: Interventions targeting income inequality

# NOTE(review): this chunk reuses the label `tbl-income-crosstab` of the chunk
# above and produces no table; Quarto labels are presumably meant to be unique
# -- confirm and rename.

# One row per (intervention, inequality) pair of every observation.
# The frame itself is exploded so each value stays on the observation it came
# from; assigning an exploded Series back as a column would re-align it by
# index label and attach values to the wrong rows.
temp_df = (
    df[["intervention", "inequality"]]
    .assign(
        Inequality=lambda _df: _df["inequality"].str.split(";"),
        Intervention=lambda _df: _df["intervention"].str.split(";"),
    )
    .explode("Inequality", ignore_index=True)
    .explode("Intervention", ignore_index=True)
    .assign(
        Inequality=lambda _df: _df["Inequality"].str.strip(),
        # drop parenthesised remarks from the intervention names
        Intervention=lambda _df: _df["Intervention"]
        .str.replace(r"\(.+\)", "", regex=True)
        .str.strip(),
    )
)

# subsets per targeted inequality
gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
```

Prepare the full data set:

```{python}
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

# Directory layout of the project data, relative to the working directory.
DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")


def _read_bib_files(directory):
    """Return the concatenated text of every ``.bib`` file below *directory*.

    Each file is read verbatim. Joining the result of ``readlines()`` with
    "\\n" would double every line break, because the lines already end in one.
    Files are visited in sorted order so the combined string is reproducible,
    and are decoded as UTF-8 rather than with the locale's default encoding.
    """
    return "\n".join(
        partial_bib.read_text(encoding="utf-8")
        for partial_bib in sorted(directory.glob("**/*.bib"))
    )


# raw search results
bib_sample_raw_db = bibtexparser.parse_string(_read_bib_files(RAW_DATA))

# intermediate (working) sample; `bib_string` keeps holding its text as before
bib_string = _read_bib_files(WORKING_DATA)
bib_sample = bibtexparser.parse_string(bib_string)
```

```{python}
# load relevant studies
from src import data

# Zotero-based metadata (citations and uses), one row per DOI.
# Maps each resulting column to the BibTeX field it is read from.
_zot_fields = {
    "doi": "doi",
    "cited": "times-cited",
    "usage": "usage",
    "keywords": "keywords",
}
zot_df = (
    pd.DataFrame(
        [
            [
                entry[field] if field in entry.fields_dict else None
                for field in _zot_fields.values()
            ]
            for entry in bib_sample.entries
        ],
        columns=list(_zot_fields),
    )
    .drop_duplicates("doi")
    .set_index("doi")
)

# WB country grouping definitions (income group, world region), keyed by economy name
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

bib_df = (
    data.from_yml(f"{PROCESSED_DATA}/relevant")
    # the DOI is the part of the URI that follows the doi.org host
    .assign(
        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
    )
    # look up the Zotero metadata through the DOI
    .assign(
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
    )
    # parse the year into a datetime and derive the year back from it
    .assign(
        date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year=lambda _df: _df["date"].dt.year,
    )
    # look up the WB groupings through the country name
    .assign(
        region=lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group=lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)
# drop the references to the lookup tables
zot_df = None
df_country_groups = None
```

```{python}
# WB country groupings extended by a synthetic "global" economy that belongs
# to the region "World".
_global_economy = pd.DataFrame(
    data={
        'Economy': ['global'],
        'Code': ['WLD'],
        'Region': ['World'],
        'Income group': [''],
        'Lending category': [''],
    }
)
df_country_groups = pd.concat(
    [pd.read_excel(WB_COUNTRY_GROUPS_FILE), _global_economy]
).set_index("Economy")

def countries_to_regions(countries: str, groups=None):
    """Translate a ';'-separated list of countries into their regions.

    Args:
        countries: Country names separated by ';'. Any amount of whitespace
            around a name is ignored. A non-string value (e.g. NaN for a
            missing cell) yields an empty result instead of raising.
        groups: DataFrame indexed by country name with a ``Region`` column.
            Defaults to the notebook-level ``df_country_groups``.

    Returns:
        The distinct regions of all countries found in ``groups``, sorted
        alphabetically and joined with ';'. Unknown countries are skipped;
        the result is an empty string if nothing matches.
    """
    if not isinstance(countries, str):
        return ""
    if groups is None:
        groups = df_country_groups
    known = groups.index
    regions = {
        groups.at[name, "Region"]
        for name in (part.strip() for part in countries.split(";"))
        if name in known
    }
    # Sort so the result does not depend on set iteration order; economies
    # without a textual region cannot be joined and are left out.
    return ";".join(sorted(r for r in regions if isinstance(r, str)))

# Example call:
# countries_to_regions("India; Nicaragua")
# Overwrite `region` with the ';'-joined regions of all countries listed in
# each row's `country` cell.
bib_df['region'] = bib_df['country'].map(countries_to_regions)
```

```{python}
# Turn every ';'-separated region cell into the set of its distinct values ...
_region_sets = bib_df["region"].apply(
    lambda _cell: {part.strip() for part in _cell.split(";")}
)
# ... and give each region of an observation its own row.
bib_df = bib_df.assign(region=_region_sets).explode("region")
# bib_df["region"] = bib_df["region"].str.split(";").explode().str.strip()

# Bar chart of observations per region, most frequent region first.
_region_order = bib_df["region"].value_counts().index
ax = sns.countplot(bib_df, x="region", order=_region_order)
plt.setp(
    ax.get_xticklabels(),
    rotation=45,
    ha="right",
    rotation_mode="anchor",
)
plt.show()
```
feat(code): Add quick data querying notebook Added a notebook which has the sole point of quickly allowing me to grab and look at the data of the processed sample I am creating. I.e. quickly list and uniq all interventions/outcomes/inequalities, doing a tiny calculation or similar. 2023-12-09 17:50:43 +00:00			`---`
			`bibliography: 02-data/supplementary/lib.bib`
			`title: Grab yml`
			`---`

			```{python}
			`import pandas as pd`
			`from src import data`

			`df = data.from_yml()`
			```

			`Get interventions:`

			```{python}
			`df['intervention'].str.split(";").explode().str.strip().value_counts()`
			```

			`Get inequalities:`

			```{python}
			`df['inequality'].str.split(";").explode().str.strip().value_counts()`
			```
feat(code): Add examples of list handling notebook Extracts interventions/inequalities and explodes them for value counts. 2023-12-11 16:14:50 +00:00
			```{python}
			`df.groupby(["author", "year", "title"]).first().join(df['intervention'])`
			```

			`Unique values in chain method:`

			```{python}
			`(`
			`df.groupby(["author", "year", "title"])`
			`.agg(`
			`{`
			`"intervention": lambda _col:"; ".join(_col),`
			`"inequality": lambda _col:"; ".join(_col),`
			`}`
			`)`
			`.drop_duplicates()`
			`.explode("inequality")`
			`["inequality"].str.strip()`
			`.value_counts()`
			`)`
			```

			`Merge dataset so it is collected by STUDY not by OBSERVATION.`
			`Any required columns can be calculated similar to the agg function here.`

			```{python}
			`by_study = (`
			`df.groupby(["author", "year", "title"])`
			`.agg(`
			`{`
			`"intervention": lambda _col: "; ".join(_col),`
			`"inequality": lambda _col: "; ".join(_col),`
			`"date": lambda _col: "; ".join(_col),`
			`"findings": lambda _col: "; ".join(_col),`
			`# "region": lambda _col: "; ".join(_col), # only accessible when merging with WB data`
			`# "income_group": lambda _col: "; ".join(_col),`
			`}`
			`)`
			`.reset_index()`
			`.drop_duplicates()`
			`.assign(`
			`# create de-duplicated joins for all observations`
			`intervention=lambda _df: _df["intervention"].apply(`
			`lambda _cell: set([x.strip() for x in _cell.split(";")])`
			`),`
			`inequality=lambda _df: _df["inequality"].apply(`
			`lambda _cell: set([x.strip() for x in _cell.split(";")])`
			`),`
			`)`
			`)`
			```

			```{python}
			`by_study = (`
			`df.groupby(["author", "year", "title"])`
			`.first()`
			`.reset_index()`
			`.drop_duplicates()`
			`.assign(`
			`# create de-duplicated joins for all observations`
			`intervention=lambda _df: _df["intervention"].apply(`
			`lambda _cell: set([x.strip() for x in _cell.split(";")])`
			`),`
			`inequality=lambda _df: _df["inequality"].apply(`
			`lambda _cell: set([x.strip() for x in _cell.split(";")])`
			`),`
			`)`
			`)`
			```

			```{python}
			`import re`
chore(code): Rename ymlgrab to explore notebook 2023-12-21 10:40:05 +00:00			`from matplotlib import pyplot as plt`
			`import seaborn as sns`
feat(code): Add examples of list handling notebook Extracts interventions/inequalities and explodes them for value counts. 2023-12-11 16:14:50 +00:00			`by_intervention = (`
			`df.groupby(["author", "year", "title"])`
			`.agg(`
			`{`
			`"intervention": lambda _col: "; ".join(_col),`
			`}`
			`)`
			`.reset_index()`
			`.drop_duplicates()`
			`.assign(`
			`intervention=lambda _df: _df["intervention"].apply(`
			`lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])`
			`),`
			`)`
			`.explode("intervention")`
			`)`
			`sort_order = by_intervention["intervention"].value_counts().index`

			`fig = plt.figure()`
			`fig.set_size_inches(6, 3)`
			`ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)`
			`plt.setp(ax.get_xticklabels(), rotation=45, ha="right",`
			`rotation_mode="anchor")`
			`plt.show()`
			`by_intervention = None`
			```
chore(code): Rename ymlgrab to explore notebook 2023-12-21 10:40:05 +00:00
			```{python}
			`#\| label: fig-publications-per-year`
			`#\| fig-cap: Publications per year`

			`df_study_years = (`
			`df.groupby(["author", "year", "title"])`
			`.first()`
			`.reset_index()`
			`.drop_duplicates()`
			`)`
			`# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue`
			`# FIXME should be timeseries plot so no years are missing`
			`ax = sns.countplot(df_study_years, x="year", native_scale=True)`
			`ax.tick_params(axis='x', rotation=45)`
			`ax.set_xlabel("")`
			`plt.tight_layout()`
			`plt.show()`
			`df_study_years = None`
			```

			```{python}
			`#\| label: tbl-income-crosstab`
			`#\| tbl-cap: Interventions targeting income inequality`

			`df_income = df.copy()`
			`df_income['Inequality'] = df_income['inequality'].str.split(";").explode(ignore_index=True).str.strip()`
			`df_income = df_income.loc[df_income['Inequality'] == "income"].copy()`
			`df_income['Intervention'] = df_income['intervention'].str.split(";").explode(ignore_index=True).str.replace(r"\(.+\)", "", regex=True).str.strip()`
			`pd.crosstab(df_income["Intervention"], df_income["Inequality"])`
			```

			```{python}
			`#\| label: tbl-income-crosstab`
			`#\| tbl-cap: Interventions targeting income inequality`

chore(code): Update explore experiments 2023-12-21 16:01:51 +00:00			`temp_df = df[["intervention", "inequality"]].copy().reset_index(drop=True)`
			`temp_df['Inequality'] = temp_df['inequality'].str.split(";").explode(ignore_index=True).str.strip()`
			`temp_df['Intervention'] = temp_df['intervention'].str.split(";").explode(ignore_index=True).str.replace(r"\(.+\)", "", regex=True).str.strip()`

			`gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]`
			`income_df = temp_df.loc[temp_df["Inequality"] == "income"]`
			```

			`prep full data set:`

			```{python}
			`#\| echo: false`
			`from pathlib import Path`
			`import re`
			`## standard imports`
			`from IPython.core.display import Markdown as md`
			`import numpy as np`
			`import pandas as pd`
			`from matplotlib import pyplot as plt`
			`import seaborn as sns`
			`from tabulate import tabulate`
			`import bibtexparser`

			`sns.set_style("whitegrid")`

			`DATA_DIR=Path("./02-data")`
			`RAW_DATA=DATA_DIR.joinpath("raw")`
			`WORKING_DATA=DATA_DIR.joinpath("intermediate")`
			`PROCESSED_DATA=DATA_DIR.joinpath("processed")`
			`SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")`

			`bib_string=""`
			`for partial_bib in RAW_DATA.glob("*/.bib"):`
			`with open(partial_bib) as f:`
			`bib_string+="\n".join(f.readlines())`
			`bib_sample_raw_db = bibtexparser.parse_string(bib_string)`

			`bib_string=""`
			`for partial_bib in WORKING_DATA.glob("*/.bib"):`
			`with open(partial_bib) as f:`
			`bib_string+="\n".join(f.readlines())`
			`bib_sample = bibtexparser.parse_string(bib_string)`
			```

			```{python}
			`# load relevant studies`
			`from src import data`

			`# load zotero-based metadata: citations and uses`
			`zot_df = pd.DataFrame([`
			`[`
			`entry["doi"] if "doi" in entry.fields_dict else None,`
			`entry["times-cited"] if "times-cited" in entry.fields_dict else None,`
			`entry["usage"] if "usage" in entry.fields_dict else None,`
			`entry["keywords"] if "keywords" in entry.fields_dict else None,`
			`]`
			`for entry in bib_sample.entries`
			`], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")`

			`# Add WB country grouping definitions (income group, world region)`
			`WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()`
			`df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")`

			`bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")`
			`.assign(`
			`doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),`
			`zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),`
			`zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),`
			`zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),`
			`date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),`
			`year = lambda _df: _df["date"].dt.year,`
			`region = lambda _df: _df["country"].map(df_country_groups["Region"]),`
			`income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),`
			`)`
			`.query("year >= 2000")`
			`)`
			`zot_df = None`
			`df_country_groups = None`
			```

			```{python}
			`df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['World'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")`

			`def countries_to_regions(countries:str):`
			`res = set()`
			`for c in countries.replace(" ;", ";").replace("; ",";").split(";"):`
			`if c in df_country_groups.index:`
			`region = df_country_groups.at[c,'Region']`
			`res.add(region)`
			`return ";".join(res)`

			`# countries_to_regions("India; Nicaragua")`
			`bib_df['region'] = bib_df['country'].map(countries_to_regions)`
			```

			```{python}
			`bib_df = (bib_df`
			`.assign(`
			`# create de-duplicated joins for all observations`
			`region=lambda _df: _df["region"].apply(`
			`lambda _cell: set([x.strip() for x in _cell.split(";")])`
			`),`
			`)`
			`.explode("region")`
			`)`
			`# bib_df["region"] = bib_df["region"].str.split(";").explode().str.strip()`
			`ax = sns.countplot(bib_df, x="region", order=bib_df["region"].value_counts().index)`
			`plt.setp(ax.get_xticklabels(), rotation=45, ha="right",`
			`rotation_mode="anchor")`
			`plt.show()`
chore(code): Rename ymlgrab to explore notebook 2023-12-21 10:40:05 +00:00			```