From e50e5cfcbcfa3215998e5cb3aab2501e1ff210b4 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Tue, 20 Feb 2024 17:58:35 +0100
Subject: [PATCH] feat(code): Add experiments for validity visualization

---
 00-notebooks/rank_validities.qmd | 146 +++++++++++++++++++------------
 1 file changed, 90 insertions(+), 56 deletions(-)

diff --git a/00-notebooks/rank_validities.qmd b/00-notebooks/rank_validities.qmd
index 38e1e5d..e6a5c80 100644
--- a/00-notebooks/rank_validities.qmd
+++ b/00-notebooks/rank_validities.qmd
@@ -1,6 +1,7 @@
 load data, boilerplate:
 
 ```{python}
+#| label: load-data
 #| echo: false
 from pathlib import Path
 import re
@@ -21,72 +22,27 @@ WORKING_DATA=DATA_DIR.joinpath("intermediate")
 PROCESSED_DATA=DATA_DIR.joinpath("processed")
 SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
 
-bib_string=""
-for partial_bib in RAW_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-bib_sample_raw_db = bibtexparser.parse_string(bib_string)
+from src import prep_data
 
-bib_string=""
-for partial_bib in WORKING_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-bib_sample = bibtexparser.parse_string(bib_string)
-```
+# raw database-search results
+bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
+# the complete library of sampled (and working) literature
+bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
 
-```{python}
 # load relevant studies
 from src import load_data
 
-# load zotero-based metadata: citations and uses
-zot_df = pd.DataFrame([
-    [
-        entry["doi"] if "doi" in entry.fields_dict else None,
-        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
-        entry["usage"] if "usage" in entry.fields_dict else None,
-        entry["keywords"] if "keywords" in entry.fields_dict else None,
-    ]
-    for entry in bib_sample.entries
-], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
-
-# Add WB country grouping definitions (income group, world region)
-WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
-df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
-
-bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
-    .assign(
-        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
-        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
-        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
-        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
-        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
-        year = lambda _df: _df["date"].dt.year,
-        region = lambda _df: _df["country"].map(df_country_groups["Region"]),
-        income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
-    )
-    .query("year >= 2000")
+bib_df = prep_data.observations_with_metadata_df(
+    raw_observations = load_data.from_yml(PROCESSED_DATA),
+    study_metadata = prep_data.bib_metadata_df(bib_sample),
+    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
 )
+raw_observations = None
 zot_df = None
 df_country_groups = None
 ```
 
-```{python}
-df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
-
-def countries_to_regions(countries:str):
-    res = set()
-    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
-        if c in df_country_groups.index:
-            region = df_country_groups.at[c,'Region']
-            res.add(region)
-    return ";".join(res)
-
-# countries_to_regions("India; Nicaragua")
-bib_df['region'] = bib_df['country'].map(countries_to_regions)
-bib_df['region'].value_counts().plot.bar()
-```
-
-prep data:
+prep data:
 
 Map a 0-5 external validity score based on 'representativeness' to rows:
 
@@ -126,3 +82,81 @@ vd.loc[vd['method'].str.contains("|".join(["OLS","ordinary.least.square"])), 'va
 
 vd[['method', 'valid_int']]
 ```
+
+## visualize data:
+
+Prep the by_intervention dataframe:
+
+```{python}
+#| label: fig-intervention-types
+#| fig-cap: Available studies by primary type of intervention
+
+by_intervention = (
+    bib_df
+    .fillna("")
+    .groupby(["author", "year", "title", "design", "method", "region", "representativeness", "citation"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
+        ),
+    )
+    .explode("intervention")
+)
+sort_order = by_intervention["intervention"].value_counts().index
+
+fig = plt.figure()
+fig.set_size_inches(6, 3)
+ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
+plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
+         rotation_mode="anchor")
+plt.show()
+```
+
+then visualize:
+
+Validities as a distplot, with external validity as the categorical x-axis and internal validity as the hue facet.
+This nicely shows that studies with lower internal validity generally have higher external validity, and that there
+are two external-validity humps at 3 and 5 (subnational and census-based).
+
+```{python}
+#| label: fig-validity-density
+from src.model import validity
+import seaborn.objects as so
+
+validities = validity.calculate(by_intervention)
+validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
+
+# As a distplot to show hue-faceted density
+
+sns.displot(
+    data=validities,
+    x="external_validity", hue="internal_validity",
+    kind="kde",
+    multiple="fill", clip=(0, None),
+    palette="ch:rot=-0.5,hue=1.5,light=0.9",
+)
+```
+
+As a point plot, which shows the correlation between internal and external validity and (roughly) the spread per
+external validity level:
+
+```{python}
+#| label: fig-validity-points
+from src.model import validity
+import seaborn.objects as so
+
+validities = validity.calculate(by_intervention)
+validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
+
+sns.pointplot(
+    data=validities,
+    x="internal_validity", y="external_validity"
+)
+```
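
The refactored load-data chunk calls four helpers from `src/prep_data.py` (`bib_library_from_dir`, `bib_metadata_df`, `country_groups_df`, `observations_with_metadata_df`) that are not defined in this diff. Below is a minimal sketch of what they might look like, reconstructed from the inline code the patch removes; the parameter names, signatures, and docstrings are assumptions, not the actual module.

```python
# Hypothetical sketch of the src/prep_data.py helpers used above, inferred from
# the inline code removed in the second hunk. Not the actual implementation.
from pathlib import Path

import bibtexparser
import pandas as pd


def bib_library_from_dir(directory: Path):
    """Concatenate all .bib files under `directory` and parse them into one library."""
    bib_string = ""
    for partial_bib in directory.glob("**/*.bib"):
        bib_string += partial_bib.read_text() + "\n"
    return bibtexparser.parse_string(bib_string)


def bib_metadata_df(bib_library) -> pd.DataFrame:
    """Zotero-based metadata (citations, usage, keywords) keyed by DOI."""
    rows = [
        [
            entry["doi"] if "doi" in entry.fields_dict else None,
            entry["times-cited"] if "times-cited" in entry.fields_dict else None,
            entry["usage"] if "usage" in entry.fields_dict else None,
            entry["keywords"] if "keywords" in entry.fields_dict else None,
        ]
        for entry in bib_library.entries
    ]
    return (
        pd.DataFrame(rows, columns=["doi", "cited", "usage", "keywords"])
        .drop_duplicates("doi")
        .set_index("doi")
    )


def country_groups_df(xlsx_path: Path) -> pd.DataFrame:
    """World Bank country groupings (region, income group) indexed by economy."""
    return pd.read_excel(xlsx_path).set_index("Economy")


def observations_with_metadata_df(
    raw_observations: pd.DataFrame,
    study_metadata: pd.DataFrame,
    country_groups: pd.DataFrame,
) -> pd.DataFrame:
    """Attach bibliographic metadata and WB groupings to the raw observations."""
    return (
        raw_observations.assign(
            doi=lambda _df: _df["uri"].str.extract(
                r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
            ),
            zot_cited=lambda _df: _df["doi"].map(study_metadata["cited"]),
            zot_usage=lambda _df: _df["doi"].map(study_metadata["usage"]),
            zot_keywords=lambda _df: _df["doi"].map(study_metadata["keywords"]),
            date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
            year=lambda _df: _df["date"].dt.year,
            region=lambda _df: _df["country"].map(country_groups["Region"]),
            income_group=lambda _df: _df["country"].map(country_groups["Income group"]),
        )
        .query("year >= 2000")
    )
```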