feat(code): Add experiments for validity visualization

parent 3cb96ffef2
commit e50e5cfcbc

1 changed file with 90 additions and 56 deletions
@@ -1,6 +1,7 @@
 load data, boilerplate:
 
 ```{python}
 #| label: load-data
 #| echo: false
 from pathlib import Path
+import re
@@ -21,72 +22,27 @@ WORKING_DATA=DATA_DIR.joinpath("intermediate")
 PROCESSED_DATA=DATA_DIR.joinpath("processed")
 SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
 
-bib_string=""
-for partial_bib in RAW_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-bib_sample_raw_db = bibtexparser.parse_string(bib_string)
-
-bib_string=""
-for partial_bib in WORKING_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-bib_sample = bibtexparser.parse_string(bib_string)
+from src import prep_data
+
+# raw database-search results
+bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
+# the complete library of sampled (and working) literature
+bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
 ```
 
 ```{python}
 # load relevant studies
 from src import load_data
 
-# load zotero-based metadata: citations and uses
-zot_df = pd.DataFrame([
-    [
-        entry["doi"] if "doi" in entry.fields_dict else None,
-        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
-        entry["usage"] if "usage" in entry.fields_dict else None,
-        entry["keywords"] if "keywords" in entry.fields_dict else None,
-    ]
-    for entry in bib_sample.entries
-], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
-
-# Add WB country grouping definitions (income group, world region)
-WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
-df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
-
-bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
-    .assign(
-        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
-        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
-        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
-        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
-        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
-        year = lambda _df: _df["date"].dt.year,
-        region = lambda _df: _df["country"].map(df_country_groups["Region"]),
-        income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
-    )
-    .query("year >= 2000")
-)
+bib_df = prep_data.observations_with_metadata_df(
+    raw_observations = load_data.from_yml(PROCESSED_DATA),
+    study_metadata = prep_data.bib_metadata_df(bib_sample),
+    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
+)
+raw_observations = None
+zot_df = None
+df_country_groups = None
 ```
 
-```{python}
-df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
-
-def countries_to_regions(countries:str):
-    res = set()
-    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
-        if c in df_country_groups.index:
-            region = df_country_groups.at[c,'Region']
-            res.add(region)
-    return ";".join(res)
-
-# countries_to_regions("India; Nicaragua")
-bib_df['region'] = bib_df['country'].map(countries_to_regions)
-bib_df['region'].value_counts().plot.bar()
-```
-
 prep data:
 
 Map a 0-5 external validity score based on 'representativeness' to rows:
 
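The mapping code itself sits in the unchanged lines elided between this hunk and the next, so it does not appear in the diff. A minimal sketch of what such a mapping could look like, assuming the file's `vd` frame, a free-text `representativeness` column, and a `valid_ext` column named by analogy with `valid_int`; apart from subnational (3) and census (5), which the notes further down mention, the categories and scores here are assumptions:

```{python}
# Hypothetical sketch, not the elided code: assign a 0-5 external-validity
# score from the 'representativeness' text column. Only 'subnational' -> 3
# and 'census' -> 5 are grounded in the notes below; the other categories
# and scores are assumptions.
ext_map = {
    "convenience": 1,
    "local": 2,
    "subnational": 3,
    "national": 4,
    "census": 5,
}
vd["valid_ext"] = (
    vd["representativeness"]
    .str.strip()
    .str.lower()
    .map(ext_map)
    .fillna(0)  # unknown or missing representativeness -> 0
    .astype(int)
)
```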
@@ -126,3 +82,81 @@ vd.loc[vd['method'].str.contains("|".join(["OLS","ordinary.least.square"])), 'va
 vd[['method', 'valid_int']]
 ```
 
+## visualize data:
+
+Prep the by_intervention dataframe:
+
+```{python}
+#| label: fig-intervention-types
+#| fig-cap: Available studies by primary type of intervention
+
+by_intervention = (
+    bib_df
+    .fillna("")
+    .groupby(["author", "year", "title", "design", "method", "region", "representativeness", "citation"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
+        ),
+    )
+    .explode("intervention")
+)
+sort_order = by_intervention["intervention"].value_counts().index
+
+fig = plt.figure()
+fig.set_size_inches(6, 3)
+ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
+plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
+         rotation_mode="anchor")
+plt.show()
+```
+
+then visualize:
+
+Validities as a distplot, with external validity on the x axis and internal validity as the hue facet.
+Since multiple="fill" normalizes the stacked densities at each x, the plot shows conditional shares rather than absolute density.
+It nicely shows that lower-internal studies generally have higher external validity, with two external humps at 3 and 5
+(subnational and census-based).
+
+```{python}
+#| label: fig-validity-density
+from src.model import validity
+import seaborn.objects as so
+
+validities = validity.calculate(by_intervention)
+validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
+
+# As distplot to show hue-faceted density
+sns.displot(
+    data=validities,
+    x="external_validity", hue="internal_validity",
+    kind="kde",
+    multiple="fill", clip=(0, None),
+    palette="ch:rot=-0.5,hue=1.5,light=0.9",
+)
+```
+
+As a point plot, which shows the correlation between the two scores and (roughly) the spread of external validity at each internal-validity level:
+
+```{python}
+#| label: fig-validity-points
+from src.model import validity
+import seaborn.objects as so
+
+validities = validity.calculate(by_intervention)
+validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
+
+sns.pointplot(
+    data=validities,
+    x="internal_validity", y="external_validity"
+)
+```
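A closing note on `src.model.validity`, which this commit imports but does not include: the shape of `validity.calculate` has to be inferred from usage. A minimal sketch of what it is assumed to return, based on the columns the two plots consume (`internal_validity`, `external_validity`) and on the `str.contains` method-scoring visible in the last hunk header; everything beyond the OLS pattern and the subnational/census scores is an assumption:

```{python}
# Hypothetical sketch of validity.calculate -- inferred from usage, not
# taken from the actual src.model.validity module.
import pandas as pd

def calculate(df: pd.DataFrame) -> pd.DataFrame:
    """Score each study row; return a copy with the two columns the
    plots above use: internal_validity and external_validity."""
    out = df.copy()
    # Internal validity from the identification method, via pattern
    # matching as in the `vd.loc[... str.contains ...]` line above.
    # Applied in ascending order so stronger designs win when a row
    # matches several patterns; all but the OLS entry are assumed.
    method_scores = {
        r"OLS|ordinary.least.square": 2,
        r"difference.in.difference|matching": 3,
        r"regression.discontinuity|instrument": 4,
        r"RCT|random": 5,
    }
    out["internal_validity"] = 1
    for pattern, score in method_scores.items():
        mask = out["method"].str.contains(pattern, case=False, na=False)
        out.loc[mask, "internal_validity"] = score
    # External validity from representativeness (see the 0-5 mapping
    # sketched earlier); only the two listed scores are grounded.
    repr_scores = {"subnational": 3, "census": 5}
    out["external_validity"] = (
        out["representativeness"].str.lower().map(repr_scores).fillna(1).astype(int)
    )
    return out
```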