feat(code): Add experiments for validity visualization

parent 3cb96ffef2
commit e50e5cfcbc

1 changed file with 90 additions and 56 deletions
@@ -1,6 +1,7 @@
 load data, boilerplate:
 
 ```{python}
 #| label: load-data
 #| echo: false
 from pathlib import Path
+import re
@@ -21,72 +22,27 @@ WORKING_DATA=DATA_DIR.joinpath("intermediate")
 PROCESSED_DATA=DATA_DIR.joinpath("processed")
 SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
 
-bib_string=""
-for partial_bib in RAW_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-bib_sample_raw_db = bibtexparser.parse_string(bib_string)
-
-bib_string=""
-for partial_bib in WORKING_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-bib_sample = bibtexparser.parse_string(bib_string)
+from src import prep_data
+
+# raw database-search results
+bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
+# the complete library of sampled (and working) literature
+bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
 ```
 
 ```{python}
 # load relevant studies
 from src import load_data
 
-# load zotero-based metadata: citations and uses
-zot_df = pd.DataFrame([
-    [
-        entry["doi"] if "doi" in entry.fields_dict else None,
-        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
-        entry["usage"] if "usage" in entry.fields_dict else None,
-        entry["keywords"] if "keywords" in entry.fields_dict else None,
-    ]
-    for entry in bib_sample.entries
-], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
-
-# Add WB country grouping definitions (income group, world region)
-WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
-df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
-
-bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
-    .assign(
-        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
-        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
-        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
-        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
-        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
-        year = lambda _df: _df["date"].dt.year,
-        region = lambda _df: _df["country"].map(df_country_groups["Region"]),
-        income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
-    )
-    .query("year >= 2000")
-)
+bib_df = prep_data.observations_with_metadata_df(
+    raw_observations = load_data.from_yml(PROCESSED_DATA),
+    study_metadata = prep_data.bib_metadata_df(bib_sample),
+    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
+)
+raw_observations = None
+zot_df = None
+df_country_groups = None
 ```
 
-```{python}
-df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
-
-def countries_to_regions(countries:str):
-    res = set()
-    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
-        if c in df_country_groups.index:
-            region = df_country_groups.at[c,'Region']
-            res.add(region)
-    return ";".join(res)
-
-# countries_to_regions("India; Nicaragua")
-bib_df['region'] = bib_df['country'].map(countries_to_regions)
-bib_df['region'].value_counts().plot.bar()
-```
-
 prep data:
 
 Map a 0-5 external validity score based on 'representativeness' to rows:
 
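The mapping code itself sits in the unchanged lines elided between this hunk and the next, so it does not appear in the diff. A minimal sketch of what such a mapping could look like, assuming the file's `vd` frame, a free-text `representativeness` column, and a `valid_ext` column named by analogy with `valid_int`; apart from subnational (3) and census (5), which the notes further down mention, the categories and scores here are assumptions:

```{python}
# Hypothetical sketch, not the elided code: assign a 0-5 external-validity
# score from the 'representativeness' text column. Only 'subnational' -> 3
# and 'census' -> 5 are grounded in the notes below; the other categories
# and scores are assumptions.
ext_map = {
    "convenience": 1,
    "local": 2,
    "subnational": 3,
    "national": 4,
    "census": 5,
}
vd["valid_ext"] = (
    vd["representativeness"]
    .str.strip()
    .str.lower()
    .map(ext_map)
    .fillna(0)  # unknown or missing representativeness -> 0
    .astype(int)
)
```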
@@ -126,3 +82,81 @@ vd.loc[vd['method'].str.contains("|".join(["OLS","ordinary.least.square"])), 'va
 vd[['method', 'valid_int']]
 ```
 
+## visualize data:
+
+Prep the by_intervention dataframe:
+
+```{python}
+#| label: fig-intervention-types
+#| fig-cap: Available studies by primary type of intervention
+
+by_intervention = (
+    bib_df
+    .fillna("")
+    .groupby(["author", "year", "title", "design", "method", "region", "representativeness", "citation"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
+        ),
+    )
+    .explode("intervention")
+)
+sort_order = by_intervention["intervention"].value_counts().index
+
+fig = plt.figure()
+fig.set_size_inches(6, 3)
+ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
+plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
+         rotation_mode="anchor")
+plt.show()
+```
+
+then visualize:
+
+Validities as a distplot, with external validity on the x axis and internal validity as the hue facet.
+Since multiple="fill" normalizes the stacked densities at each x, the plot shows conditional shares rather than absolute density.
+It nicely shows that lower-internal studies generally have higher external validity, with two external humps at 3 and 5
+(subnational and census-based).
+
+```{python}
+#| label: fig-validity-density
+from src.model import validity
+import seaborn.objects as so
+
+validities = validity.calculate(by_intervention)
+validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
+
+# As distplot to show hue-faceted density
+sns.displot(
+    data=validities,
+    x="external_validity", hue="internal_validity",
+    kind="kde",
+    multiple="fill", clip=(0, None),
+    palette="ch:rot=-0.5,hue=1.5,light=0.9",
+)
+```
+
+As a point plot, which shows the correlation between the two scores and (roughly) the spread of external validity at each internal-validity level:
+
+```{python}
+#| label: fig-validity-points
+from src.model import validity
+import seaborn.objects as so
+
+validities = validity.calculate(by_intervention)
+validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
+
+sns.pointplot(
+    data=validities,
+    x="internal_validity", y="external_validity"
+)
+```
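A closing note on `src.model.validity`, which this commit imports but does not include: the shape of `validity.calculate` has to be inferred from usage. A minimal sketch of what it is assumed to return, based on the columns the two plots consume (`internal_validity`, `external_validity`) and on the `str.contains` method-scoring visible in the last hunk header; everything beyond the OLS pattern and the subnational/census scores is an assumption:

```{python}
# Hypothetical sketch of validity.calculate -- inferred from usage, not
# taken from the actual src.model.validity module.
import pandas as pd

def calculate(df: pd.DataFrame) -> pd.DataFrame:
    """Score each study row; return a copy with the two columns the
    plots above use: internal_validity and external_validity."""
    out = df.copy()
    # Internal validity from the identification method, via pattern
    # matching as in the `vd.loc[... str.contains ...]` line above.
    # Applied in ascending order so stronger designs win when a row
    # matches several patterns; all but the OLS entry are assumed.
    method_scores = {
        r"OLS|ordinary.least.square": 2,
        r"difference.in.difference|matching": 3,
        r"regression.discontinuity|instrument": 4,
        r"RCT|random": 5,
    }
    out["internal_validity"] = 1
    for pattern, score in method_scores.items():
        mask = out["method"].str.contains(pattern, case=False, na=False)
        out.loc[mask, "internal_validity"] = score
    # External validity from representativeness (see the 0-5 mapping
    # sketched earlier); only the two listed scores are grounded.
    repr_scores = {"subnational": 3, "census": 5}
    out["external_validity"] = (
        out["representativeness"].str.lower().map(repr_scores).fillna(1).astype(int)
    )
    return out
```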