chore(repo): Simplify directory structure
parent 284a3b9281
commit f384515737
284 changed files with 2 additions and 0 deletions
117 notebooks/bibmanip.qmd Normal file
@@ -0,0 +1,117 @@
---
bibliography: ../02-data/intermediate/zotero-library.bib
csl: /home/marty/documents/library/utilities/styles/APA-7.csl
papersize: A4
linestretch: 1.5
fontfamily: lmodern
fontsize: "12"
geometry:
  - left=2.2cm
  - right=3.5cm
  - top=2.5cm
  - bottom=2.5cm
toc: false
link-citations: true
link-bibliography: true
number-sections: false
lang: en
title: Scoping review on 'what works'
subtitle: Addressing inequalities in the World of Work
---

```{python}
#| echo: false
from pathlib import Path
data_dir=Path("../02-data")

## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
```

```{python}
sns.set_style("whitegrid")
```

```{python}
#| echo: false
# load and parse overall bibtex sample
import bibtexparser

bib_string=""
print(f"path: {data_dir.joinpath('raw/01_wos-sample_2023-11-02').absolute()}")
for partial_bib in data_dir.joinpath("raw/01_wos-sample_2023-11-02").glob("*.bib"):
    with open(partial_bib) as f:
        bib_string+="\n".join(f.readlines())
sample = bibtexparser.parse_string(bib_string)
```
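
A quick sanity check on the parsed library can help before reshaping it: the number of entries and the available fields of one entry (a minimal sketch; the exact field names depend on the Web of Science export).

```python
# illustrative sanity check on the parsed bibtex sample
print(f"parsed entries: {len(sample.entries)}")
if sample.entries:
    # fields_dict maps field names to their Field objects for a single entry
    print(sorted(sample.entries[0].fields_dict.keys()))
```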

## Description of results

```{python}
#| echo: false

sample_size = len(sample.entries)
md(f"""
The exploratory execution of queries results in an initial sample of {sample_size} studies after the identification process.
""")
```

Years of publication:

```{python}
reformatted = []
for e in sample.entries:
    reformatted.append([e["Year"], e["Author"], e["Title"], e["Type"], e["Times-Cited"], e["Usage-Count-Since-2013"]])
bib_df = pd.DataFrame(reformatted, columns = ["Year", "Author", "Title", "Type", "Cited", "Usage"])
bib_df["Date"] = pd.to_datetime(bib_df["Year"], format="%Y")
bib_df["Year"] = bib_df["Date"].dt.year
bib_df
```

```{python}
# RESTRICT FOR NEWER STUDIES
bib_df = bib_df[bib_df["Year"] >= 2000]
```

Publications per year:

```{python}
ax = sns.countplot(bib_df[bib_df["Year"] >= 2000], x="Year")
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
```

By type (white vs. gray literature):

```{python}
bib_df["Type"].value_counts()
bib_df["Literature"] = np.where(bib_df["Type"].str.contains("article", case=False, regex=False), "white", "gray")
bib_df["Literature"] = bib_df["Literature"].astype("category")
```

Publications per year and type:

```{python}
ax = sns.countplot(bib_df[bib_df["Year"] >= 2000], x="Year", hue="Literature")
ax.tick_params(axis='x', rotation=45)
# ax.set_xlabel("")
plt.tight_layout()
plt.show()
```

Average number of citations per year:

```{python}
bib_df["Cited"] = bib_df["Cited"].astype("int")
grpd = bib_df.groupby(["Year"], as_index=False)["Cited"].mean()
ax = sns.barplot(grpd, x="Year", y="Cited")
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
```
322 notebooks/explore.qmd Normal file
@@ -0,0 +1,322 @@
---
bibliography: 02-data/intermediate/zotero-library.bib
title: Grab yml
---

## Separate data acquisition

```{python}
import pandas as pd
from src import load_data

df = load_data.from_yml()
```

Get interventions:

```{python}
df['intervention'].str.split(";").explode().str.strip().value_counts()
```

Get inequalities:

```{python}
df['inequality'].str.split(";").explode().str.strip().value_counts()
```

```{python}
df.groupby(["author", "year", "title"]).first().join(df['intervention'])
```

Unique values via a chained approach:

```{python}
(
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
            "inequality": lambda _col: "; ".join(_col),
        }
    )
    .drop_duplicates()
    .explode("inequality")
    ["inequality"].str.strip()
    .value_counts()
)
```

Merge the dataset so it is collected by *STUDY*, not by *OBSERVATION*.
Any required columns can be calculated similarly to the `agg` function here.

```{python}
by_study = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
            "inequality": lambda _col: "; ".join(_col),
            "date": lambda _col: "; ".join(_col),
            "findings": lambda _col: "; ".join(_col),
            # "region": lambda _col: "; ".join(_col), # only accessible when merging with WB data
            # "income_group": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set([x.strip() for x in _cell.split(";")])
        ),
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: set([x.strip() for x in _cell.split(";")])
        ),
    )
)
```

```{python}
by_study = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set([x.strip() for x in _cell.split(";")])
        ),
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: set([x.strip() for x in _cell.split(";")])
        ),
    )
)
```

```{python}
import re
from matplotlib import pyplot as plt
import seaborn as sns
by_intervention = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
        ),
    )
    .explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.show()
by_intervention = None
```

```{python}
#| label: fig-publications-per-year
#| fig-cap: Publications per year

df_study_years = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()
    .drop_duplicates()
)
# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue
# FIXME should be timeseries plot so no years are missing
ax = sns.countplot(df_study_years, x="year", native_scale=True)
ax.tick_params(axis='x', rotation=45)
ax.set_xlabel("")
plt.tight_layout()
plt.show()
df_study_years = None
```

```{python}
#| label: tbl-income-crosstab
#| tbl-cap: Interventions targeting income inequality

df_income = df.copy()
df_income['Inequality'] = df_income['inequality'].str.split(";").explode(ignore_index=True).str.strip()
df_income = df_income.loc[df_income['Inequality'] == "income"].copy()
df_income['Intervention'] = df_income['intervention'].str.split(";").explode(ignore_index=True).str.replace(r"\(.+\)", "", regex=True).str.strip()
pd.crosstab(df_income["Intervention"], df_income["Inequality"])
```

```{python}
#| label: tbl-inequality-subsets
#| tbl-cap: Observations split by gender and income inequality

temp_df = df[["intervention", "inequality"]].copy().reset_index(drop=True)
temp_df['Inequality'] = temp_df['inequality'].str.split(";").explode(ignore_index=True).str.strip()
temp_df['Intervention'] = temp_df['intervention'].str.split(";").explode(ignore_index=True).str.replace(r"\(.+\)", "", regex=True).str.strip()

gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
```
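
The cell above only prepares the subsets; a crosstab mirroring the income table further up can then be built from them, for example (a sketch reusing the prepared `gender_df`):

```python
# illustrative sketch: interventions among observations tagged with gender inequality
pd.crosstab(gender_df["Intervention"], gender_df["Inequality"])
```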

## Complete data replication from scoping

prep full data set:

```{python}
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")

bib_string=""
for partial_bib in RAW_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string+="\n".join(f.readlines())
bib_sample_raw_db = bibtexparser.parse_string(bib_string)

bib_string=""
for partial_bib in WORKING_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string+="\n".join(f.readlines())
bib_sample = bibtexparser.parse_string(bib_string)
```

```{python}
# load relevant studies
from src import load_data

# load zotero-based metadata: citations and uses
zot_df = pd.DataFrame([
    [
        entry["doi"] if "doi" in entry.fields_dict else None,
        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
        entry["usage"] if "usage" in entry.fields_dict else None,
        entry["keywords"] if "keywords" in entry.fields_dict else None,
    ]
    for entry in bib_sample.entries
], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")

# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
    .assign(
        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year = lambda _df: _df["date"].dt.year,
        region = lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)
zot_df = None
df_country_groups = None
```

```{python}
df_country_groups = pd.concat([
    pd.read_excel(WB_COUNTRY_GROUPS_FILE),
    pd.DataFrame(data={
        'Economy': ['global'],
        'Code': ['WLD'],
        'Region': ['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'],
        'Income group': [''],
        'Lending category': [''],
    }),
]).set_index("Economy")

def countries_to_regions(countries:str):
    res = set()
    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
        if c in df_country_groups.index:
            region = df_country_groups.at[c,'Region']
            res.add(region)
    return ";".join(res)

# countries_to_regions("India; Nicaragua")
bib_df['region'] = bib_df['country'].map(countries_to_regions)
bib_df['region'].value_counts().plot.bar()
```

```{python}
bib_df = (bib_df
    .assign(
        # create de-duplicated joins for all observations
        region=lambda _df: _df["region"].apply(
            lambda _cell: set([x.strip() for x in _cell.split(";")])
        ),
    )
    .explode("region")
)
# bib_df["region"] = bib_df["region"].str.split(";").explode().str.strip()
ax = sns.countplot(bib_df, x="region", order=bib_df["region"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.show()
```

```{python}
df_inequality = (bib_df[["region", "intervention", "inequality"]]
    .assign(
        Intervention = lambda _df: (_df["intervention"]
            .str.replace(r"\(.+\)", "", regex=True)
            .str.replace(r" ?; ?", ";", regex=True)
            .str.strip()
            .str.split(";")
        ),
        Inequality = lambda _df: (_df["inequality"]
            .str.replace(r"\(.+\)", "", regex=True)
            .str.replace(r" ?; ?", ";", regex=True)
            .str.strip()
            .str.split(";")
        )
    )
    .explode("Intervention")
    .explode("Inequality")
    .reset_index(drop=True)
)
```

```{python}
def crosstab_inequality(df, inequality:str, **kwargs):
    df_temp = df.loc[(df["Inequality"] == inequality) | (df["Inequality"] == "income")]
    tab = pd.crosstab(df_temp["Intervention"], df_temp["Inequality"], **kwargs)
    return tab.drop(tab[tab[inequality] == 0].index)
```

## Gender inequality

```{python}
#| label: tbl-gender-crosstab
#| tbl-cap: Interventions targeting gender inequality

crosstab_inequality(df_inequality, "gender", normalize=False).sort_values("gender", ascending=False)
```

```{python}
def region_vis_inequality(df, inequality:str):
    df_temp = df.loc[(df["Inequality"] == inequality)]
    return sns.countplot(df_temp, x="region", order=df_temp["region"].value_counts().index)
region_vis_inequality(df_inequality, "spatial")
```
149 notebooks/main-findings.qmd Normal file
@@ -0,0 +1,149 @@
load data, boilerplate:

```{python}
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")

bib_string=""
for partial_bib in RAW_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string+="\n".join(f.readlines())
bib_sample_raw_db = bibtexparser.parse_string(bib_string)

bib_string=""
for partial_bib in WORKING_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string+="\n".join(f.readlines())
bib_sample = bibtexparser.parse_string(bib_string)
```

```{python}
# load relevant studies
from src import load_data

# load zotero-based metadata: citations and uses
zot_df = pd.DataFrame([
    [
        entry["doi"] if "doi" in entry.fields_dict else None,
        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
        entry["usage"] if "usage" in entry.fields_dict else None,
        entry["keywords"] if "keywords" in entry.fields_dict else None,
    ]
    for entry in bib_sample.entries
], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")

# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
    .assign(
        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year = lambda _df: _df["date"].dt.year,
        region = lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)
zot_df = None
df_country_groups = None
```

```{python}
df_country_groups = pd.concat([
    pd.read_excel(WB_COUNTRY_GROUPS_FILE),
    pd.DataFrame(data={
        'Economy': ['global'],
        'Code': ['WLD'],
        'Region': ['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'],
        'Income group': [''],
        'Lending category': [''],
    }),
]).set_index("Economy")

def countries_to_regions(countries:str):
    res = set()
    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
        if c in df_country_groups.index:
            region = df_country_groups.at[c,'Region']
            res.add(region)
    return ";".join(res)

# countries_to_regions("India; Nicaragua")
bib_df['region'] = bib_df['country'].map(countries_to_regions)
bib_df['region'].value_counts().plot.bar()
```

```{python}
#| label: fig-intervention-types
#| fig-cap: Predominant type of intervention

by_intervention = (
    bib_df.groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
        ),
    )
    .explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.show()
```

Data visualization:

```{python}
findings_institutional = pd.read_csv("02-data/supplementary/findings-institutional.csv")
findings_institutional
from src.model import validity
import math

validities = validity.calculate(by_intervention)
valid_subset = validities[["internal_validity", "external_validity", "citation"]].fillna(1.0).drop_duplicates(subset=["citation"]).sort_values("internal_validity")
def combined_validities(df_in, column: str = "internal_validity"):
    if not isinstance(df_in, str):
        # not a semicolon-separated list of studies (e.g. NaN)
        return
    combined = 0.0
    for study in df_in.split(";"):
        new = valid_subset.loc[valid_subset["citation"] == study, column]
        if len(new) > 0 and not math.isnan(new.iat[0]):
            combined += new.iat[0]
    if combined:
        return combined
    return 0.0
def combined_external(df_in, column: str = "external_validity"):
    return combined_validities(df_in, column)

findings_institutional["internal_validity"] = findings_institutional["studies"].apply(combined_validities)
findings_institutional["external_validity"] = findings_institutional["studies"].apply(combined_external)
findings_institutional[["area of policy", "internal_validity", "external_validity", "findings", "channels"]]
```
234 notebooks/rank_validities.qmd Normal file
@@ -0,0 +1,234 @@
load data, boilerplate:

```{python}
#| label: load-data
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")

from src import prep_data

# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)

# load relevant studies
from src import load_data

bib_df = prep_data.observations_with_metadata_df(
    raw_observations = load_data.from_yml(PROCESSED_DATA),
    study_metadata = prep_data.bib_metadata_df(bib_sample),
    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
raw_observations = None
zot_df = None
df_country_groups = None
```

prep data:

Map a 0-5 external validity score based on 'representativeness' to rows:

```{python}
df=bib_df
vd=df[(df['design'] == 'quasi-experimental') | (df['design'] == 'experimental')]

vd = vd.assign(valid_ext=0)
vd["representativeness"] = vd["representativeness"].fillna("")

mask_subnational = vd['representativeness'].str.contains("subnational")
mask_national = vd['representativeness'].str.contains("national")
mask_regional = vd['representativeness'].str.contains("regional")
mask_local = vd['representativeness'].str.contains("local")

vd.loc[mask_regional, 'valid_ext'] = 5
vd.loc[mask_national, 'valid_ext'] = 4
vd.loc[mask_subnational, 'valid_ext'] = 3
vd.loc[mask_local, 'valid_ext'] = 2

vd[['representativeness', 'valid_ext']]
```

Map an internal validity score based on study design/method:

```{python}
vd = vd.assign(valid_int=0)
vd["method"] = vd["method"].fillna("")

vd.loc[vd['method'].str.contains("RCT"), 'valid_int'] = 5.0
vd.loc[vd['method'].str.contains("|".join(["RD","regression.discontinuity"])), 'valid_int'] = 4.5
vd.loc[vd['method'].str.contains("|".join(["IV","instrumental.variable"])), 'valid_int'] = 4.0
vd.loc[vd['method'].str.contains("|".join(["PSM","propensity.score.matching"])), 'valid_int'] = 3.5
vd.loc[vd['method'].str.contains("|".join(["DM","discontinuity.matching"])), 'valid_int'] = 3.0
vd.loc[vd['method'].str.contains("|".join(["DID","difference.in.difference"])), 'valid_int'] = 3.0
vd.loc[vd['method'].str.contains("|".join(["OLS","ordinary.least.square"])), 'valid_int'] = 2.0
vd[['method', 'valid_int']]
```
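
Several of these patterns can match the same `method` string, so rule order matters: assignments further down overwrite earlier ones. The same mapping can also be written as explicit ordered rules, which makes that precedence easier to see (a sketch with the identical patterns and scores, not a change to the scoring):

```python
# illustrative sketch: the internal validity mapping as ordered (pattern, score) rules;
# rules are applied top to bottom, so the last matching rule wins (as in the cell above)
method_rules = [
    ("RCT", 5.0),
    ("RD|regression.discontinuity", 4.5),
    ("IV|instrumental.variable", 4.0),
    ("PSM|propensity.score.matching", 3.5),
    ("DM|discontinuity.matching", 3.0),
    ("DID|difference.in.difference", 3.0),
    ("OLS|ordinary.least.square", 2.0),
]
for pattern, score in method_rules:
    vd.loc[vd["method"].str.contains(pattern), "valid_int"] = score
```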

## Visualize data

Prep the by_intervention dataframe:

```{python}
#| label: fig-intervention-types
#| fig-cap: Available studies by primary type of intervention

by_intervention = (
    bib_df
    .fillna("")
    .groupby(["author", "year", "title", "design", "method", "region", "representativeness", "citation"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
        ),
    )
    .explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.show()
```

Then visualize:

Validities as a distplot, with external validity as the categorical x-axis and internal validity as the hue facet.
This nicely shows that studies with lower internal validity generally have higher external validity, and that there are two external-validity humps at 3 and 5
(subnational and census-based).

```{python}
#| label: fig-validity-density
from src.model import validity
import seaborn.objects as so

validities = validity.calculate(by_intervention)
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"

# As distplot to show hue-facetted density

sns.displot(
    data=validities,
    x="external_validity", hue="internal_validity",
    kind="kde",
    multiple="fill", clip=(0, None),
    palette="ch:rot=-0.5,hue=1.5,light=0.9",
)
```

As a point plot, which shows the x-y correlation and (roughly) the spread per external validity score:

```{python}
#| label: fig-validity-points
from src.model import validity
import seaborn.objects as so

validities = validity.calculate(by_intervention)
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"

sns.pointplot(
    data=validities,
    x="internal_validity", y="external_validity"
)
```

As a relation chart, which shows the internal-external relation and the deviation of individual points.

```{python}
#| label: fig-validity-relation
#| fig-cap: "Relation between internal and external validity"
#| fig-height: 5
#| code-fold: true

from src.model import validity

validities = validity.calculate(by_intervention)
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
#validities["external_validity"] = validities["external_validity"].astype('category')
validities["internal_validity"] = validities["internal_validity"].astype('category')

sns.pointplot(
    data=validities,
    x="internal_validity", y="external_validity",
)
```

```{python}
#| label: fig-validity-distribution
#| fig-cap: "Distribution of internal validities"
#| fig-height: 5
#| code-fold: true

fig, ax = plt.subplots()

#sns.displot(
#    data=validities,
#    x="external_validity", hue="internal_validity",
#    kind="kde",
#    multiple="fill", clip=(0, None),
#    palette="ch:rot=-0.5,hue=1.5,light=0.9",
#    bw_adjust=.65, cut=0,
#    warn_singular = False
#)
```
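
The cell above currently only produces an empty figure, since the displot call is commented out. A minimal sketch of what the distribution could look like, assuming the `validities` frame from the cells above (a hypothetical plot choice, not the original one):

```python
# illustrative sketch: count of studies per internal validity score, split by design
sns.countplot(data=validities, x="internal_validity", hue="design")
plt.show()
```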

The following plots need at least one axis, preferably the external one, to be set to categorical.

As a heatmap plot for categorical data between x and y:

```{python}
#| label: fig-validity-heatmap
sns.displot(
    data=validities,
    x="internal_validity", y="external_validity", hue="design",
    palette="ch:rot=-0.75,hue=1.5,light=0.9",
)
```

As a violin plot showing the distribution of external validity along the internal validity categories:

```{python}
sns.violinplot(
    data=validities,
    x="internal_validity", y="external_validity", hue="design",
    cut=0, bw_method="scott",
    orient="x"
)
# optional swarmplot showing the actual amount of data points for each rank
sns.swarmplot(
    data=validities,
    x="internal_validity", y="external_validity",
    color="red",
    s=6
)
```
55 notebooks/test-magma.qmd Normal file
@@ -0,0 +1,55 @@
---
bibliography: 02-data/intermediate/zotero-library.bib
csl: /home/marty/documents/library/utilities/styles/APA-7.csl
papersize: A4
linestretch: 1.5
fontfamily: lmodern
fontsize: "12"
geometry:
  - left=2.2cm
  - right=3.5cm
  - top=2.5cm
  - bottom=2.5cm
toc: false
link-citations: true
link-bibliography: true
number-sections: false
lang: en
title: Scoping review on 'what works'
subtitle: Addressing inequalities in the World of Work
filters:
  - src/pandoc-to-zotero-live.lua
zotero:
  library: wow-inequalities
  client: zotero
  csl-style: apa
---

```{python}
#| echo: false
from pathlib import Path
DATA_DIR=Path("./02-data")
BIB_PATH = DATA_DIR.joinpath("raw/01_wos-sample_2023-11-02")

## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate

sns.set_style("whitegrid")
```

```{python}
rng = np.random.RandomState(0)
x = np.linspace(0, 10, 500)
y = np.cumsum(rng.randn(500, 6), 0)
```

```{python}
# simple line plot of the random walks to test the styling
plt.plot(x, y)
plt.legend('ABCDEF', ncol=2, loc='upper left');
```