chore(repo): Simplify directory structure

Marty Oehme 2024-07-15 21:24:35 +02:00
parent 284a3b9281
commit f384515737
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
284 changed files with 2 additions and 0 deletions

notebooks/bibmanip.qmd · new file · 117 additions

@@ -0,0 +1,117 @@
---
bibliography: ../02-data/intermediate/zotero-library.bib
csl: /home/marty/documents/library/utilities/styles/APA-7.csl
papersize: A4
linestretch: 1.5
fontfamily: lmodern
fontsize: "12"
geometry:
- left=2.2cm
- right=3.5cm
- top=2.5cm
- bottom=2.5cm
toc: false
link-citations: true
link-bibliography: true
number-sections: false
lang: en
title: Scoping review on 'what works'
subtitle: Addressing inequalities in the World of Work
---
```{python}
#| echo: false
from pathlib import Path
data_dir=Path("../02-data")
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
```
```{python}
sns.set_style("whitegrid")
```
```{python}
#| echo: false
# load and parse overall bibtex sample
import bibtexparser
bib_string=""
print(f"path: {data_dir.joinpath('raw/01_wos-sample_2023-11-02').absolute()}")
for partial_bib in data_dir.joinpath("raw/01_wos-sample_2023-11-02").glob("*.bib"):
    with open(partial_bib) as f:
        # read() keeps the file intact; joining readlines() with "\n" would double the newlines
        bib_string += f.read() + "\n"
sample = bibtexparser.parse_string(bib_string)
```
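Before reshaping the entries below, it helps to check which fields they actually carry; a small inspection sketch, assuming the sample parsed at least one entry (bibtexparser v2 exposes parsed fields through `fields_dict`, which later blocks also rely on):
```{python}
# peek at the first parsed entry: its type and the field names it provides
first_entry = sample.entries[0]
print(first_entry.entry_type, sorted(first_entry.fields_dict.keys()))
```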
## Description of results
```{python}
#| echo: false
sample_size = len(sample.entries)
md(f"""
The exploratory execution of queries results in an initial sample of {sample_size} studies after the identification process.
""")
```
Reshape the entries into a dataframe keyed by publication year:
```{python}
reformatted = []
for e in sample.entries:
reformatted.append([e["Year"], e["Author"], e["Title"], e["Type"], e["Times-Cited"], e["Usage-Count-Since-2013"]])
bib_df = pd.DataFrame(reformatted, columns = ["Year", "Author", "Title", "Type", "Cited", "Usage"])
bib_df["Date"] = pd.to_datetime(bib_df["Year"], format="%Y")
bib_df["Year"] = bib_df["Date"].dt.year
bib_df
```
```{python}
# RESTRICT FOR NEWER STUDIES
bib_df = bib_df[bib_df["Year"] >= 2000]
```
Publications per year:
```{python}
ax = sns.countplot(bib_df[bib_df["Year"] >= 2000], x="Year")
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
```
By publication type, deriving a white/gray literature flag:
```{python}
bib_df["Type"].value_counts()
bib_df["Literature"] = np.where(bib_df["Type"].str.contains("article", case=False, regex=False), "white", "gray")
bib_df["Literature"] = bib_df["Literature"].astype("category")
```
Publications per year, split by literature type:
```{python}
ax = sns.countplot(bib_df[bib_df["Year"] >= 2000], x="Year", hue="Literature")
ax.tick_params(axis='x', rotation=45)
# ax.set_xlabel("")
plt.tight_layout()
plt.show()
```
Average number of citations per publication year:
```{python}
bib_df["Cited"] = bib_df["Cited"].astype("int")
grpd = bib_df.groupby(["Year"], as_index=False)["Cited"].mean()
ax = sns.barplot(grpd, x="Year", y="Cited")
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
```

notebooks/explore.qmd · new file · 322 additions

@@ -0,0 +1,322 @@
---
bibliography: 02-data/intermediate/zotero-library.bib
title: Grab yml
---
## Separate data acquisition
```{python}
import pandas as pd
from src import load_data
df = load_data.from_yml()
```
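`load_data.from_yml` comes from the project's `src` package and is not part of this diff; a hypothetical sketch of such a loader, assuming one YAML document per study holding a list of observation mappings (the default directory here is an assumption as well, not the actual implementation):
```{python}
# hypothetical sketch of a loader like src.load_data.from_yml
from pathlib import Path
import pandas as pd
import yaml

def from_yml_sketch(dir_path: str = "02-data/processed") -> pd.DataFrame:
    rows = []
    for yml_file in Path(dir_path).glob("*.yml"):
        with open(yml_file) as f:
            rows.extend(yaml.safe_load(f) or [])  # each item is one observation mapping
    return pd.DataFrame(rows)
```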
Get interventions:
```{python}
df['intervention'].str.split(";").explode().str.strip().value_counts()
```
Get inequalities:
```{python}
df['inequality'].str.split(";").explode().str.strip().value_counts()
```
```{python}
df.groupby(["author", "year", "title"]).first().join(df['intervention'])
```
Unique inequality values, computed in a single method chain:
```{python}
(
df.groupby(["author", "year", "title"])
.agg(
{
"intervention": lambda _col:"; ".join(_col),
"inequality": lambda _col:"; ".join(_col),
}
)
.drop_duplicates()
.explode("inequality")
["inequality"].str.strip()
.value_counts()
)
```
Merge the dataset so it is keyed by *STUDY* rather than by *OBSERVATION*.
Any additional columns can be aggregated the same way as in the `agg` call below.
```{python}
by_study = (
df.groupby(["author", "year", "title"])
.agg(
{
"intervention": lambda _col: "; ".join(_col),
"inequality": lambda _col: "; ".join(_col),
"date": lambda _col: "; ".join(_col),
"findings": lambda _col: "; ".join(_col),
# "region": lambda _col: "; ".join(_col), # only accessible when merging with WB data
# "income_group": lambda _col: "; ".join(_col),
}
)
.reset_index()
.drop_duplicates()
.assign(
# create de-duplicated joins for all observations
intervention=lambda _df: _df["intervention"].apply(
lambda _cell: set([x.strip() for x in _cell.split(";")])
),
inequality=lambda _df: _df["inequality"].apply(
lambda _cell: set([x.strip() for x in _cell.split(";")])
),
)
)
```
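As a quick sanity check on the de-duplication, the set-valued columns can be sized per study:
```{python}
# distribution of the number of distinct interventions per study
by_study["intervention"].apply(len).describe()
```
The next block is an alternative construction that keeps only the first observation per study instead of joining all of them before de-duplicating.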
```{python}
by_study = (
df.groupby(["author", "year", "title"])
.first()
.reset_index()
.drop_duplicates()
.assign(
# create de-duplicated joins for all observations
intervention=lambda _df: _df["intervention"].apply(
lambda _cell: set([x.strip() for x in _cell.split(";")])
),
inequality=lambda _df: _df["inequality"].apply(
lambda _cell: set([x.strip() for x in _cell.split(";")])
),
)
)
```
```{python}
import re
from matplotlib import pyplot as plt
import seaborn as sns
by_intervention = (
df.groupby(["author", "year", "title"])
.agg(
{
"intervention": lambda _col: "; ".join(_col),
}
)
.reset_index()
.drop_duplicates()
.assign(
intervention=lambda _df: _df["intervention"].apply(
lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
),
)
.explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
plt.show()
by_intervention = None
```
```{python}
#| label: fig-publications-per-year
#| fig-cap: Publications per year
df_study_years = (
df.groupby(["author", "year", "title"])
.first()
.reset_index()
.drop_duplicates()
)
# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue
# FIXME should be timeseries plot so no years are missing
ax = sns.countplot(df_study_years, x="year", native_scale=True)
ax.tick_params(axis='x', rotation=45)
ax.set_xlabel("")
plt.tight_layout()
plt.show()
df_study_years = None
```
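One way to resolve the FIXME above, as a minimal sketch: reindex the yearly counts over the full year range so missing years show up as zeros (assuming it runs before `df_study_years` is cleared, and that `year` is castable to int):
```{python}
# count studies per year and fill gaps in the year range with zeros
counts = df_study_years["year"].astype(int).value_counts().sort_index()
counts = counts.reindex(range(counts.index.min(), counts.index.max() + 1), fill_value=0)
ax = counts.plot.bar()
ax.tick_params(axis="x", rotation=45)
plt.tight_layout()
plt.show()
```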
```{python}
#| label: tbl-income-crosstab
#| tbl-cap: Interventions targeting income inequality
df_income = df.copy()
# explode the dataframe itself so the rows stay aligned; assigning an exploded
# series back to the shorter frame would silently misalign on the index
df_income["Inequality"] = df_income["inequality"].str.split(";")
df_income = df_income.explode("Inequality", ignore_index=True)
df_income["Inequality"] = df_income["Inequality"].str.strip()
df_income = df_income.loc[df_income["Inequality"] == "income"].copy()
df_income["Intervention"] = df_income["intervention"].str.split(";")
df_income = df_income.explode("Intervention", ignore_index=True)
df_income["Intervention"] = df_income["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
pd.crosstab(df_income["Intervention"], df_income["Inequality"])
```
```{python}
#| label: tbl-gender-income-subsets
#| tbl-cap: Observations split into gender and income inequality subsets
temp_df = df[["intervention", "inequality"]].copy()
temp_df["Inequality"] = temp_df["inequality"].str.split(";")
temp_df = temp_df.explode("Inequality", ignore_index=True)
temp_df["Inequality"] = temp_df["Inequality"].str.strip()
temp_df["Intervention"] = temp_df["intervention"].str.split(";")
temp_df = temp_df.explode("Intervention", ignore_index=True)
temp_df["Intervention"] = temp_df["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
```
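The subsets built above can then be tabulated just like the income table; for example:
```{python}
# cross-tabulate interventions for the gender subset
pd.crosstab(gender_df["Intervention"], gender_df["Inequality"])
```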
## Complete data replication from scoping
prep full data set:
```{python}
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
sns.set_style("whitegrid")
DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
bib_string=""
for partial_bib in RAW_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read() + "\n"
bib_sample_raw_db = bibtexparser.parse_string(bib_string)
bib_string = ""
for partial_bib in WORKING_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read() + "\n"
bib_sample = bibtexparser.parse_string(bib_string)
```
```{python}
# load relevant studies
from src import load_data
# load zotero-based metadata: citations and uses
zot_df = pd.DataFrame([
[
entry["doi"] if "doi" in entry.fields_dict else None,
entry["times-cited"] if "times-cited" in entry.fields_dict else None,
entry["usage"] if "usage" in entry.fields_dict else None,
entry["keywords"] if "keywords" in entry.fields_dict else None,
]
for entry in bib_sample.entries
], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
.assign(
doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
year = lambda _df: _df["date"].dt.year,
region = lambda _df: _df["country"].map(df_country_groups["Region"]),
income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
)
.query("year >= 2000")
)
zot_df = None
df_country_groups = None
```
```{python}
df_country_groups = pd.concat([
    pd.read_excel(WB_COUNTRY_GROUPS_FILE),
    pd.DataFrame(data={
        "Economy": ["global"],
        "Code": ["WLD"],
        "Region": ["Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean"],
        "Income group": [""],
        "Lending category": [""],
    }),
]).set_index("Economy")
def countries_to_regions(countries: str):
    res = set()
    for c in countries.replace(" ;", ";").replace("; ", ";").split(";"):
        if c in df_country_groups.index:
            res.add(df_country_groups.at[c, "Region"])
    return ";".join(res)

# e.g. countries_to_regions("India; Nicaragua") -> "South Asia;Latin America & Caribbean" (set order may vary)
bib_df['region'] = bib_df['country'].map(countries_to_regions)
bib_df['region'].value_counts().plot.bar()
```
```{python}
bib_df = (bib_df
.assign(
# create de-duplicated joins for all observations
region=lambda _df: _df["region"].apply(
lambda _cell: set([x.strip() for x in _cell.split(";")])
),
)
.explode("region")
)
# bib_df["region"] = bib_df["region"].str.split(";").explode().str.strip()
ax = sns.countplot(bib_df, x="region", order=bib_df["region"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
plt.show()
```
```{python}
df_inequality = (bib_df[["region", "intervention", "inequality"]]
.assign(
Intervention = lambda _df: (_df["intervention"]
.str.replace(r"\(.+\)", "", regex=True)
.str.replace(r" ?; ?", ";", regex=True)
.str.strip()
.str.split(";")
),
Inequality = lambda _df: (_df["inequality"]
.str.replace(r"\(.+\)", "", regex=True)
.str.replace(r" ?; ?", ";", regex=True)
.str.strip()
.str.split(";")
)
)
.explode("Intervention")
.explode("Inequality")
.reset_index(drop=True)
)
```
```{python}
def crosstab_inequality(df, inequality: str, **kwargs):
    # keep "income" as a comparison column next to the requested inequality
    df_temp = df.loc[(df["Inequality"] == inequality) | (df["Inequality"] == "income")]
    tab = pd.crosstab(df_temp["Intervention"], df_temp["Inequality"], **kwargs)
    # drop interventions that never target the requested inequality
    return tab.drop(tab[tab[inequality] == 0].index)
```
## Gender inequality
```{python}
#| label: tbl-gender-crosstab
#| tbl-cap: Interventions targeting gender inequality
crosstab_inequality(df_inequality, "gender", normalize=False).sort_values("gender", ascending=False)
```
```{python}
def region_vis_inequality(df, inequality:str):
df_temp = df.loc[(df["Inequality"] == inequality)]
return sns.countplot(df_temp, x="region", order=df_temp["region"].value_counts().index)
region_vis_inequality(df_inequality, "spatial")
```

notebooks/main-findings.qmd · new file · 149 additions

@@ -0,0 +1,149 @@
load data, boilerplate:
```{python}
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
sns.set_style("whitegrid")
DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
bib_string=""
for partial_bib in RAW_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read() + "\n"
bib_sample_raw_db = bibtexparser.parse_string(bib_string)
bib_string = ""
for partial_bib in WORKING_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read() + "\n"
bib_sample = bibtexparser.parse_string(bib_string)
```
```{python}
# load relevant studies
from src import load_data
# load zotero-based metadata: citations and uses
zot_df = pd.DataFrame([
[
entry["doi"] if "doi" in entry.fields_dict else None,
entry["times-cited"] if "times-cited" in entry.fields_dict else None,
entry["usage"] if "usage" in entry.fields_dict else None,
entry["keywords"] if "keywords" in entry.fields_dict else None,
]
for entry in bib_sample.entries
], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
.assign(
doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
year = lambda _df: _df["date"].dt.year,
region = lambda _df: _df["country"].map(df_country_groups["Region"]),
income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
)
.query("year >= 2000")
)
zot_df = None
df_country_groups = None
```
```{python}
df_country_groups = pd.concat([
    pd.read_excel(WB_COUNTRY_GROUPS_FILE),
    pd.DataFrame(data={
        "Economy": ["global"],
        "Code": ["WLD"],
        "Region": ["Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean"],
        "Income group": [""],
        "Lending category": [""],
    }),
]).set_index("Economy")
def countries_to_regions(countries: str):
    res = set()
    for c in countries.replace(" ;", ";").replace("; ", ";").split(";"):
        if c in df_country_groups.index:
            res.add(df_country_groups.at[c, "Region"])
    return ";".join(res)

# e.g. countries_to_regions("India; Nicaragua") -> "South Asia;Latin America & Caribbean" (set order may vary)
bib_df['region'] = bib_df['country'].map(countries_to_regions)
bib_df['region'].value_counts().plot.bar()
```
```{python}
#| label: fig-intervention-types
#| fig-cap: Predominant type of intervention
by_intervention = (
bib_df.groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
.agg(
{
"intervention": lambda _col: "; ".join(_col),
}
)
.reset_index()
.drop_duplicates()
.assign(
intervention=lambda _df: _df["intervention"].apply(
lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
),
)
.explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
plt.show()
```
datavis:
```{python}
findings_institutional = pd.read_csv("02-data/supplementary/findings-institutional.csv")
findings_institutional
from src.model import validity
import math
validities = validity.calculate(by_intervention)
valid_subset = (
    validities[["internal_validity", "external_validity", "citation"]]
    .fillna(1.0)
    .drop_duplicates(subset=["citation"])
    .sort_values("internal_validity")
)

def combined_validities(studies: str, column: str = "internal_validity"):
    # sum the validity ranks over a semicolon-separated list of study citations
    if not isinstance(studies, str):
        return 0.0
    combined = 0.0
    for study in studies.split(";"):
        new = valid_subset.loc[valid_subset["citation"] == study, column]
        if len(new) > 0 and not math.isnan(new.iat[0]):
            combined += new.iat[0]
    return combined

def combined_external(studies: str, column: str = "external_validity"):
    return combined_validities(studies, column)
findings_institutional["internal_validity"] = findings_institutional["studies"].apply(combined_validities)
findings_institutional["external_validity"] = findings_institutional["studies"].apply(combined_external)
findings_institutional[["area of policy", "internal_validity", "external_validity", "findings", "channels"]]
```

@@ -0,0 +1,234 @@
load data, boilerplate:
```{python}
#| label: load-data
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
sns.set_style("whitegrid")
DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
from src import prep_data
# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
# load relevant studies
from src import load_data
bib_df = prep_data.observations_with_metadata_df(
raw_observations = load_data.from_yml(PROCESSED_DATA),
study_metadata = prep_data.bib_metadata_df(bib_sample),
country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
raw_observations = None
zot_df = None
df_country_groups = None
```
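The `src.prep_data` helpers are not shown in this diff, but they consolidate the inline loading code from `explore.qmd` and `main-findings.qmd` above; a sketch of `bib_library_from_dir` under the assumption that it wraps the same glob-and-parse loop:
```{python}
# hypothetical sketch of src.prep_data.bib_library_from_dir, mirroring the
# inline .bib loading loops in the other notebooks of this commit
from pathlib import Path
import bibtexparser

def bib_library_from_dir_sketch(directory: Path):
    bib_string = ""
    for partial_bib in directory.glob("**/*.bib"):
        with open(partial_bib) as f:
            bib_string += f.read() + "\n"
    return bibtexparser.parse_string(bib_string)
```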
Prep data:
Map an external validity score (0 to 5, with 0 as the default) onto each row, based on its 'representativeness':
```{python}
df = bib_df
vd = df[(df['design'] == 'quasi-experimental') | (df['design'] == 'experimental')]
vd = vd.assign(valid_ext=0)
vd["representativeness"] = vd["representativeness"].fillna("")
mask_subnational = vd['representativeness'].str.contains("subnational")
mask_national = vd['representativeness'].str.contains("national")
mask_regional = vd['representativeness'].str.contains("regional")
mask_local = vd['representativeness'].str.contains("local")
# assignment order matters: "national" also matches "subnational" rows,
# so the subnational score has to be set after the national one
vd.loc[mask_regional, 'valid_ext'] = 5
vd.loc[mask_national, 'valid_ext'] = 4
vd.loc[mask_subnational, 'valid_ext'] = 3
vd.loc[mask_local, 'valid_ext'] = 2
vd[['representativeness', 'valid_ext']]
```
Map an internal validity score based on study design/method:
```{python}
vd = vd.assign(valid_int=0)
vd["method"] = vd["method"].fillna("")
vd.loc[vd['method'].str.contains("RCT"), 'valid_int'] = 5.0
vd.loc[vd['method'].str.contains("|".join(["RD","regression.discontinuity"])), 'valid_int'] = 4.5
vd.loc[vd['method'].str.contains("|".join(["IV","instrumental.variable"])), 'valid_int'] = 4.0
vd.loc[vd['method'].str.contains("|".join(["PSM","propensity.score.matching"])), 'valid_int'] = 3.5
vd.loc[vd['method'].str.contains("|".join(["DM","discontinuity.matching"])), 'valid_int'] = 3.0
vd.loc[vd['method'].str.contains("|".join(["DID","difference.in.difference"])), 'valid_int'] = 3.0
vd.loc[vd['method'].str.contains("|".join(["OLS","ordinary.least.square"])), 'valid_int'] = 2.0
vd[['method', 'valid_int']]
```
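`src.model.validity.calculate`, used for the figures below, presumably bundles the two mappings above into one helper; a minimal sketch under that assumption (the `internal_validity`/`external_validity` column names match its usage later in this notebook):
```{python}
# hypothetical sketch of src.model.validity.calculate, combining the
# representativeness and method mappings above (assignment order preserved)
import pandas as pd

def calculate_sketch(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["external_validity"] = 0.0
    rep = out["representativeness"].fillna("")
    for pattern, score in [("regional", 5.0), ("national", 4.0), ("subnational", 3.0), ("local", 2.0)]:
        out.loc[rep.str.contains(pattern), "external_validity"] = score
    out["internal_validity"] = 0.0
    method = out["method"].fillna("")
    for pattern, score in [("RCT", 5.0), ("RD", 4.5), ("IV", 4.0), ("PSM", 3.5), ("DID", 3.0), ("OLS", 2.0)]:
        out.loc[method.str.contains(pattern), "internal_validity"] = score
    return out
```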
## Visualize data
Prep the by_intervention dataframe:
```{python}
#| label: fig-intervention-types
#| fig-cap: Available studies by primary type of intervention
by_intervention = (
bib_df
.fillna("")
.groupby(["author", "year", "title", "design", "method", "region", "representativeness", "citation"])
.agg(
{
"intervention": lambda _col: "; ".join(_col),
}
)
.reset_index()
.drop_duplicates()
.assign(
intervention=lambda _df: _df["intervention"].apply(
lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
),
)
.explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
plt.show()
```
Then visualize:
validities as a distribution plot, with external validity as the categorical x-axis and internal validity as the hue facet.
This nicely shows that studies with lower internal validity generally score higher on external validity,
and that the external scores have two humps, at 3 and 5 (subnational and census-based samples).
```{python}
#| label: fig-validity-density
from src.model import validity
import seaborn.objects as so
validities = validity.calculate(by_intervention)
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
# As distplot to show hue-facetted density
sns.displot(
data=validities,
x="external_validity", hue="internal_validity",
kind="kde",
multiple="fill", clip=(0, None),
palette="ch:rot=-0.5,hue=1.5,light=0.9",
)
```
As a point plot, which shows the correlation between the two scores and (roughly) the spread per external validity rank:
```{python}
#| label: fig-validity-points
from src.model import validity
import seaborn.objects as so
validities = validity.calculate(by_intervention)
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
sns.pointplot(
data=validities,
x="internal_validity", y="external_validity"
)
```
As a relation chart, which shows the internal-external relation and the deviation of individual points:
```{python}
#| label: fig-validity-relation
#| fig-cap: "Relation between internal and external validity"
#| fig-height: 5
#| code-fold: true
from src.model import validity
validities = validity.calculate(by_intervention)
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
#validities["external_validity"] = validities["external_validity"].astype('category')
validities["internal_validity"] = validities["internal_validity"].astype('category')
sns.pointplot(
data=validities,
x="internal_validity", y="external_validity",
)
```
```{python}
#| label: fig-validity-distribution
#| fig-cap: "Distribution of internal validities"
#| fig-height: 5
#| code-fold: true
fig, ax = plt.subplots()
#sns.displot(
# data=validities,
# x="external_validity", hue="internal_validity",
# kind="kde",
# multiple="fill", clip=(0, None),
# palette="ch:rot=-0.5,hue=1.5,light=0.9",
# bw_adjust=.65, cut=0,
# warn_singular = False
#)
```
The following plots need at least one axis, preferably the external one, to be set to categorical.
As a heatmap-style plot of the categorical x-y data:
```{python}
#| label: fig-validity-heatmap
sns.displot(
data=validities,
x="internal_validity", y="external_validity", hue="design",
palette="ch:rot=-0.75,hue=1.5,light=0.9",
)
```
As a violin plot showing the distribution of external validity along the internal validity categories:
```{python}
sns.violinplot(
data=validities,
x="internal_validity", y="external_validity", hue="design",
cut=0, bw_method="scott",
orient="x"
)
# optional swarmplot showing the actual number of data points for each rank
sns.swarmplot(
data=validities,
x="internal_validity", y="external_validity",
color="red",
s=6
)
```

notebooks/test-magma.qmd · new file · 55 additions

@@ -0,0 +1,55 @@
---
bibliography: 02-data/intermediate/zotero-library.bib
csl: /home/marty/documents/library/utilities/styles/APA-7.csl
papersize: A4
linestretch: 1.5
fontfamily: lmodern
fontsize: "12"
geometry:
- left=2.2cm
- right=3.5cm
- top=2.5cm
- bottom=2.5cm
toc: false
link-citations: true
link-bibliography: true
number-sections: false
lang: en
title: Scoping review on 'what works'
subtitle: Addressing inequalities in the World of Work
filters:
- src/pandoc-to-zotero-live.lua
zotero:
library: wow-inequalities
client: zotero
csl-style: apa
---
```{python}
#| echo: false
from pathlib import Path
DATA_DIR=Path("./02-data")
BIB_PATH = DATA_DIR.joinpath("raw/01_wos-sample_2023-11-02")
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
sns.set_style("whitegrid")
```
```{python}
rng = np.random.RandomState(0)
x = np.linspace(0, 10, 500)
y = np.cumsum(rng.randn(500, 6), 0)
```
```{python}
# simple line plot of the cumulative random walks to eyeball the seaborn style
plt.plot(x, y)
plt.legend('ABCDEF', ncol=2, loc='upper left');
```