feat(code): Allow custom strength of evidence bins

For display in findings summaries we can now allow arbitrary strength
of evidence binning. We simply pass in a dict with the strength (as a
float) as the key and the string representation that should appear
in the table as the value.
Author: Marty Oehme, 2024-02-18 16:57:39 +01:00
Parent: 2b0fa5db7c
Commit: b453afd112
Signed by: Marty (GPG key ID: EDBF2ED917B2EF6A)
2 changed files with 74 additions and 39 deletions
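
A usage sketch, built from the import, dataframes, and bin values that appear in the notebook change below:

from src.model import validity

# map each bin's minimum combined validity score to the label
# that should appear in the findings table
study_strength_bins = {
    0.0: r"\-",    # weak evidence base
    5.0: r"\+",    # evidential
    10.0: r"\++",  # strong evidence base
}
fd_df = validity.add_to_findings(
    findings_institutional, by_intervention, study_strength_bins
)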


@@ -613,13 +613,23 @@ g = sns.PairGrid(validities[["internal_validity", "external_validity", "identifi
#| label: tbl-findings-institutional
from src.model import validity
study_strength_bins = {
    0.0: r"\-",
    5.0: r"\+",
    10.0: r"\++",
}
def strength_for(val):
    # reverse lookup: return the score threshold (key) for a given bin label (value)
    return list(study_strength_bins.keys())[list(study_strength_bins.values()).index(val)]
findings_institutional = pd.read_csv("02-data/supplementary/findings-institutional.csv")
fd_df = validity.add_to_findings(findings_institutional, by_intervention)
fd_df = validity.add_to_findings(findings_institutional, by_intervention, study_strength_bins)
md(tabulate(fd_df[["area of policy", "internal_validity", "external_validity", "findings", "channels"]].fillna(""), showindex=False, headers="keys", tablefmt="grid"))
```
Note: Each main finding is presented with an internal strength of evidence and an external strength of evidence which describe the combined validities of the evidence base for the respective finding. Validities are binned to a weak (-) evidence base up to a validity rank of 2.9, evidential (+) between 3.0 and 5.9 and strong evidence base (++) above 6.0.
Note: Each main finding is presented with an internal strength of evidence and an external strength of evidence which describe the combined validities of the evidence base for the respective finding.
Validities are segmented into a weak (-) evidence base below a validity ranking of `{python} strength_for(r"\+")`,
an evidential (+) one from `{python} strength_for(r"\+")` up to `{python} strength_for(r"\++")`,
and a strong (++) evidence base at `{python} strength_for(r"\++")` and above.
Summary of main findings for institutional policies
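
For instance, with the study_strength_bins defined above, the inline lookups resolve as in this short sketch:

strength_for(r"\+")   # == 5.0: weak below this ranking
strength_for(r"\++")  # == 10.0: strong at this ranking and above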
@@ -1199,6 +1209,7 @@ def crosstab_inequality(df, inequality:str, **kwargs):
As can be seen in @fig-region-counts, taken by region for the overall study sample,
the evidence base receives a relatively even split between the World Bank regional country groupings.
Studies tend to base their analyses more in national comparative studies for the North American and Europe and Central Asian regions, while relying more on case studies restricted to a single country context for developing countries in other regions, though this trend does not hold strongly everywhere or over time.
A slight trend towards studies focusing on evidence-based research in developing countries is visible; given the overall rising output, as seen in @fig-publications-per-year,
and the ability to rely on more recent datasets, this is to be expected.

src/model/validity.py

@@ -3,36 +3,6 @@ from typing import cast
from pandas import DataFrame
def _binned_strength(strength: float) -> str:
    if strength < 3.0:
        return r"\-"
    if strength < 6.0:
        return r"\+"
    return r"\++"
def _combined_validities(
    apply_to: DataFrame, by_intervention: DataFrame, column: str = "internal_validity"
):
    if not isinstance(apply_to, str):
        return
    combined = 0.0
    for study in apply_to.split(";"):
        if study not in by_intervention["citation"].unique():
            print(
                f"WARNING: Findings table {study} study did not match any study in interventions dataframe!"
            )
        new = by_intervention.loc[by_intervention["citation"] == study, column]
        if len(new) == 0 or math.isnan(new.iat[0]):
            continue
        combined += new.iat[0]
    if combined:
        return _binned_strength(combined)
    return r"\-"
METHOD_RANKINGS = {
    2.0: ["OLS", "ordinary.least.square", "logistic.regression"],
    3.0: [
@@ -56,7 +26,6 @@ METHOD_RANKINGS = {
def calculate(
    df: DataFrame,
    repr_col: str = "representativeness",
    design_col: str = "design",
    method_col: str = "method",
) -> DataFrame:
    """Add internal and external validities to a dataframe.
@@ -76,9 +45,6 @@ def calculate(
    INT_COL_NAME: str = "internal_validity"
    cols = {EXT_COL_NAME: 0.0, INT_COL_NAME: 0.0}
    # vd = df[
    #     (df[design_col] == "quasi-experimental") | (df[design_col] == "experimental")
    # ].copy()
    vd = df.assign(**cols)
    vd = cast(DataFrame, vd)
@@ -103,8 +69,23 @@ def calculate(
def add_to_findings(
    findings_df: DataFrame, studies_by_intervention: DataFrame
    findings_df: DataFrame,
    studies_by_intervention: DataFrame,
    strength_bins: dict[float, str] | None = None,
) -> DataFrame:
"""Returns summary of findings with validities added.
Requires a 'findings' dataframe with at least a 'citation' column,
containing the bibtex keys of studies containing the findings
(semicolon-separated without spaces).
Then, another dataframe with a row per study is required containing
the study's 'method' and 'representativeness', in columns
named respectively.
Returns the correct bin to put the validity in, using the
optionally passed in bins dictionary or the default 0 (weak) ->
5 (evidence) -> 10 (strong) bins.
"""
    valid_subset = (
        calculate(studies_by_intervention)[
            ["internal_validity", "external_validity", "citation"]
@@ -115,11 +96,54 @@ def add_to_findings(
    )

    def apply_internal(df):
        return _combined_validities(df, valid_subset, "internal_validity")
        return _combined_validities(
            df, valid_subset, "internal_validity", strength_bins
        )

    def apply_external(df):
        return _combined_validities(df, valid_subset, "external_validity")
        return _combined_validities(
            df, valid_subset, "external_validity", strength_bins
        )

    findings_df["internal_validity"] = findings_df["studies"].apply(apply_internal)
    findings_df["external_validity"] = findings_df["studies"].apply(apply_external)
    return findings_df
DEFAULT_BINS = {
    0.0: r"\-",   # weak evidence base
    1.0: r"\+",   # evidential
    1.5: r"\++",  # strong evidence base
}
def _combined_validities(
    apply_to: str,
    by_intervention: DataFrame,
    column: str = "internal_validity",
    strength_bins: dict[float, str] | None = None,
):
    # cells with no studies arrive as NaN (a float), not a string
    if not isinstance(apply_to, str):
        return
    combined = 0.0
    for study in apply_to.split(";"):
        if study not in by_intervention["citation"].unique():
            print(
                f"WARNING: findings table study {study} did not match any study in interventions dataframe!"
            )
        new = by_intervention.loc[by_intervention["citation"] == study, column]
        if len(new) == 0 or math.isnan(new.iat[0]):
            continue
        combined += new.iat[0]
    if combined:
        return _binned_strength(combined, bins=strength_bins or DEFAULT_BINS)
    return r"\-"
def _binned_strength(strength: float, bins: dict[float, str]) -> str:
    # walk the thresholds in ascending order, keeping the label of the
    # highest threshold the strength has reached
    label = ""
    for val, txt in sorted(bins.items()):
        if strength >= val:
            label = txt
    return label
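
A minimal sketch of the resulting lookup behaviour, assuming DEFAULT_BINS and _binned_strength are importable from src.model.validity:

from src.model.validity import DEFAULT_BINS, _binned_strength

# each score takes the label of the highest threshold it has reached
assert _binned_strength(0.4, bins=DEFAULT_BINS) == r"\-"   # below 1.0: weak
assert _binned_strength(1.2, bins=DEFAULT_BINS) == r"\+"   # at least 1.0, below 1.5
assert _binned_strength(7.0, bins=DEFAULT_BINS) == r"\++"  # at or above 1.5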