feat(code): Add experiments for validity visualization

This commit is contained in:
Marty Oehme 2024-02-20 17:58:35 +01:00
parent 3cb96ffef2
commit e50e5cfcbc
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A

View file

@ -1,6 +1,7 @@
load data, boilerplate: load data, boilerplate:
```{python} ```{python}
#| label: load-data
#| echo: false #| echo: false
from pathlib import Path from pathlib import Path
import re import re
@ -21,71 +22,26 @@ WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed") PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary") SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
bib_string="" from src import prep_data
for partial_bib in RAW_DATA.glob("**/*.bib"):
with open(partial_bib) as f:
bib_string+="\n".join(f.readlines())
bib_sample_raw_db = bibtexparser.parse_string(bib_string)
bib_string="" # raw database-search results
for partial_bib in WORKING_DATA.glob("**/*.bib"): bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
with open(partial_bib) as f: # the complete library of sampled (and working) literature
bib_string+="\n".join(f.readlines()) bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
bib_sample = bibtexparser.parse_string(bib_string)
```
```{python}
# load relevant studies # load relevant studies
from src import load_data from src import load_data
# load zotero-based metadata: citations and uses bib_df = prep_data.observations_with_metadata_df(
zot_df = pd.DataFrame([ raw_observations = load_data.from_yml(PROCESSED_DATA),
[ study_metadata = prep_data.bib_metadata_df(bib_sample),
entry["doi"] if "doi" in entry.fields_dict else None, country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
entry["times-cited"] if "times-cited" in entry.fields_dict else None,
entry["usage"] if "usage" in entry.fields_dict else None,
entry["keywords"] if "keywords" in entry.fields_dict else None,
]
for entry in bib_sample.entries
], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
.assign(
doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
year = lambda _df: _df["date"].dt.year,
region = lambda _df: _df["country"].map(df_country_groups["Region"]),
income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
)
.query("year >= 2000")
) )
raw_observations = None
zot_df = None zot_df = None
df_country_groups = None df_country_groups = None
``` ```
```{python}
# World Bank country groupings, extended with a synthetic "global" economy so
# that studies coded with country "global" can be mapped to regions as well.
# The "global" row lists every World Bank region exactly once; the previous
# literal repeated "Europe & Central Asia" and left out
# "Middle East & North Africa".
_global_economy = pd.DataFrame(
    data={
        'Economy': ['global'],
        'Code': ['WLD'],
        'Region': ['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Middle East & North Africa;Latin America & Caribbean'],
        'Income group': [''],
        'Lending category': [''],
    }
)
df_country_groups = pd.concat(
    [pd.read_excel(WB_COUNTRY_GROUPS_FILE), _global_economy]
).set_index("Economy")
def countries_to_regions(countries: str) -> str:
    """Map a ';'-separated list of countries to a ';'-separated list of regions.

    Each country name is looked up in the index of the module-level
    ``df_country_groups`` frame and its ``Region`` value is collected.
    Countries that are not in the index are ignored.

    Args:
        countries: Country names separated by ';', with optional surrounding
            whitespace around each name.

    Returns:
        The distinct regions, sorted alphabetically and joined with ';'.
        An empty string if nothing matched or if ``countries`` is not a
        string (e.g. a missing value).
    """
    # Missing cells arrive as NaN/None when this is used with Series.map.
    if not isinstance(countries, str):
        return ""

    regions = set()
    for country in countries.split(";"):
        # strip() copes with any amount of whitespace around the separator.
        country = country.strip()
        if country not in df_country_groups.index:
            continue
        region = df_country_groups.at[country, "Region"]
        # Rows without a region hold a non-string (NaN); nothing to collect.
        if not isinstance(region, str):
            continue
        # A region value may itself be a ';'-separated list (the synthetic
        # "global" row); split it so every region appears only once.
        regions.update(part.strip() for part in region.split(";") if part.strip())
    # Sort so the result does not depend on set iteration order.
    return ";".join(sorted(regions))
# Usage example: countries_to_regions("India; Nicaragua")

# Replace the region column with the ';'-joined regions derived from each
# row's country list, then chart how often each region combination occurs.
bib_df["region"] = bib_df["country"].map(countries_to_regions)
region_counts = bib_df["region"].value_counts()
region_counts.plot.bar()
```
prep data: prep data:
Map a 0-5 external validity score based on 'representativeness' to rows: Map a 0-5 external validity score based on 'representativeness' to rows:
@ -126,3 +82,81 @@ vd.loc[vd['method'].str.contains("|".join(["OLS","ordinary.least.square"])), 'va
vd[['method', 'valid_int']] vd[['method', 'valid_int']]
``` ```
## visualize data:
Prep the by_intervention dataframe:
```{python}
#| label: fig-intervention-types
#| fig-cap: Available studies by primary type of intervention
def _strip_parentheticals(text):
    """Return *text* with every parenthesised remark removed, nested ones too."""
    previous = None
    # Remove innermost "(...)" groups until nothing changes. A single greedy
    # r"\(.*\)" would delete everything between the first "(" and the last
    # ")", swallowing whole interventions once several are joined by "; ".
    while previous != text:
        previous = text
        text = re.sub(r"\([^()]*\)", "", text)
    return text


def _split_interventions(cell):
    """Return the distinct intervention names of a ';'-separated cell, sorted."""
    # Sorted so the row order after explode() is reproducible between runs.
    return sorted({part.strip() for part in _strip_parentheticals(cell).split(";")})


# One row per (study, intervention): the interventions of all rows sharing the
# same study-level columns are joined, cleaned of parenthesised remarks,
# de-duplicated and exploded into separate rows.
by_intervention = (
    bib_df
    .fillna("")  # str.join below needs strings, not NaN
    .groupby(["author", "year", "title", "design", "method", "region", "representativeness", "citation"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(_split_interventions),
    )
    .explode("intervention")
)

# Most frequent intervention first; reused as the bar order below.
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
# Slanted, right-anchored labels keep long intervention names readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.show()
```
then visualize:
Validities as a `displot` (KDE), with external validity on the x axis and internal validity as the hue facet.
It nicely shows that studies with lower internal validity generally have higher external validity, and that there are two humps in external validity at 3 and 5
(subnational and census-based).
```{python}
#| label: fig-validity-density
from src.model import validity
import seaborn.objects as so

# The frame returned by validity.calculate must carry "author", "year",
# "internal_validity" and "external_validity"; all four are used below.
validities = validity.calculate(by_intervention)

# Short label per study: the author text before the first comma, then the year.
_lead_author = validities["author"].str.replace(r',.*$', '', regex=True)
_year_text = validities["year"].astype(str)
validities["identifier"] = _lead_author + " (" + _year_text + ")"

# Filled KDE: density over external validity, split by internal validity (hue).
_density_options = dict(
    kind="kde",
    multiple="fill",
    clip=(0, None),
    palette="ch:rot=-0.5,hue=1.5,light=0.9",
)
sns.displot(
    data=validities,
    x="external_validity",
    hue="internal_validity",
    **_density_options,
)
```
As a point plot, which shows the relationship between internal validity (x) and external validity (y), and (roughly) the spread of external validity at each internal-validity level:
```{python}
#| label: fig-validity-points
from src.model import validity
import seaborn.objects as so

# Recomputed here so the cell does not depend on an earlier cell's result.
validities = validity.calculate(by_intervention)

# "Author (year)" label: drop everything from the first comma of the author
# field onwards and append the year in parentheses.
author_head = validities["author"].str.replace(r',.*$', '', regex=True)
year_suffix = " (" + validities["year"].astype(str) + ")"
validities["identifier"] = author_head + year_suffix

# One point per internal-validity level, plotted against external validity.
sns.pointplot(data=validities, x="internal_validity", y="external_validity")
```