From b453afd112cc91fd10d2efac4539fac747f8d044 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Sun, 18 Feb 2024 16:57:39 +0100
Subject: [PATCH] feat(code): Allow custom strength of evidence bins

For display in findings summaries we can now use arbitrary strength of
evidence binning. We simply pass in a dict with the strength (as a
float) as the key and the string representation that should appear in
the table as the value.
---
 scoping_review.qmd    | 15 ++++++-
 src/model/validity.py | 98 +++++++++++++++++++++++++++----------------
 2 files changed, 74 insertions(+), 39 deletions(-)

diff --git a/scoping_review.qmd b/scoping_review.qmd
index 6096ae6..cae4013 100644
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@@ -613,13 +613,23 @@ g = sns.PairGrid(validities[["internal_validity", "external_validity", "identifi
 #| label: tbl-findings-institutional
 from src.model import validity
 
+study_strength_bins = {
+    0.0: r"\-",
+    5.0: r"\+",
+    10.0: r"\++",
+}
+def strength_for(val):
+    return list(study_strength_bins.keys())[list(study_strength_bins.values()).index(val)]
+
 findings_institutional = pd.read_csv("02-data/supplementary/findings-institutional.csv")
-fd_df = validity.add_to_findings(findings_institutional, by_intervention)
+fd_df = validity.add_to_findings(findings_institutional, by_intervention, study_strength_bins)
 
 md(tabulate(fd_df[["area of policy", "internal_validity", "external_validity", "findings", "channels"]].fillna(""), showindex=False, headers="keys", tablefmt="grid"))
 ```
 
-Note: Each main finding is presented with an internal strength of evidence and an external strength of evidence which describe the combined validities of the evidence base for the respective finding. Validities are binned to a weak (-) evidence base up to a validity rank of 2.9, evidential (+) between 3.0 and 5.9 and strong evidence base (++) above 6.0.
+Note: Each main finding is presented with an internal strength of evidence and an external strength of evidence, which describe the combined validities of the evidence base for the respective finding.
+Validities are binned into a weak (-) evidence base below a validity ranking of `{python} strength_for(r"\+")`,
+evidential (+) from `{python} strength_for(r"\+")` up to (but not including) `{python} strength_for(r"\++")`, and a strong (++) evidence base at `{python} strength_for(r"\++")` and above.
 
 Summary of main findings for institutional policies
 
@@ -1199,6 +1209,7 @@ def crosstab_inequality(df, inequality:str, **kwargs):
 
 As can be seen in @fig-region-counts, taken by region for the overall study sample, the evidence base receives a relatively even split between the World Bank regional country groupings.
+
 Studies tend to base their analyses more in national comparative studies for the North American and Europe and Central Asian regions, while relying more on case studies restricted to a single country context for developing countries in other regions, though this trend does not hold strongly everywhere or over time.
 A slight trend towards studies focusing on evidence-based research in developing countries is visible, though with an overall rising output, as seen in @fig-publications-per-year, and the ability for reliance on more recent datasets, this is to be expected.
 
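The qmd cell above fixes the contract the new parameter expects: each key in the bins dict is the lowest combined validity score that still earns the corresponding label, and `strength_for` simply inverts that mapping so the table note can quote the thresholds. As a minimal standalone sketch of that contract (names and values taken from the cell above):

```python
# Bins contract: key = lowest combined validity score that still earns
# the label; value = the string rendered in the findings table.
study_strength_bins = {
    0.0: r"\-",    # weak evidence base
    5.0: r"\+",    # evidential
    10.0: r"\++",  # strong evidence base
}

def strength_for(label: str) -> float:
    """Reverse lookup: the threshold at which a label starts to apply."""
    return list(study_strength_bins.keys())[
        list(study_strength_bins.values()).index(label)
    ]

assert strength_for(r"\+") == 5.0    # evidential starts at 5.0
assert strength_for(r"\++") == 10.0  # strong starts at 10.0
```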
diff --git a/src/model/validity.py b/src/model/validity.py
index c8d4ef5..9f5788c 100644
--- a/src/model/validity.py
+++ b/src/model/validity.py
@@ -3,36 +3,6 @@ from typing import cast
 
 from pandas import DataFrame
 
-
-def _binned_strength(strength: float) -> str:
-    if strength < 3.0:
-        return r"\-"
-    if strength < 6.0:
-        return r"\+"
-    return r"\++"
-
-
-def _combined_validities(
-    apply_to: DataFrame, by_intervention: DataFrame, column: str = "internal_validity"
-):
-    if not isinstance(apply_to, str):
-        return
-    combined = 0.0
-    for study in apply_to.split(";"):
-        if study not in by_intervention["citation"].unique():
-            print(
-                f"WARNING: Findings table {study} study did not match any study in interventions dataframe!"
-            )
-        new = by_intervention.loc[by_intervention["citation"] == study, column]
-        if len(new) == 0 or math.isnan(new.iat[0]):
-            continue
-        combined += new.iat[0]
-
-    if combined:
-        return _binned_strength(combined)
-    return r"\-"
-
-
 METHOD_RANKINGS = {
     2.0: ["OLS", "ordinary.least.square", "logistic.regression"],
     3.0: [
@@ -56,7 +26,6 @@ METHOD_RANKINGS = {
 def calculate(
     df: DataFrame,
     repr_col: str = "representativeness",
-    design_col: str = "design",
     method_col: str = "method",
 ) -> DataFrame:
     """Add internal and external validities to a dataframe.
@@ -76,9 +45,6 @@ def calculate(
     INT_COL_NAME: str = "internal_validity"
     cols = {EXT_COL_NAME: 0.0, INT_COL_NAME: 0.0}
 
-    # vd = df[
-    #     (df[design_col] == "quasi-experimental") | (df[design_col] == "experimental")
-    # ].copy()
     vd = df.assign(**cols)
     vd = cast(DataFrame, vd)
 
@@ -103,8 +69,23 @@ def calculate(
 
 
 def add_to_findings(
-    findings_df: DataFrame, studies_by_intervention: DataFrame
+    findings_df: DataFrame,
+    studies_by_intervention: DataFrame,
+    strength_bins: dict[float, str] | None = None,
 ) -> DataFrame:
+    """Return a summary of findings with validities added.
+
+    Requires a findings dataframe with at least a 'studies' column,
+    containing the bibtex keys of the studies supporting each finding
+    (semicolon-separated, without spaces).
+    Then, another dataframe with a row per study is required,
+    containing each study's 'method' and 'representativeness',
+    in columns of those names.
+
+    Validities are put into the correct bin using the optionally
+    passed-in bins dictionary, or the default 0 (weak) ->
+    1 (evidential) -> 1.5 (strong) bins.
+ """ valid_subset = ( calculate(studies_by_intervention)[ ["internal_validity", "external_validity", "citation"] @@ -115,11 +96,54 @@ def add_to_findings( ) def apply_internal(df): - return _combined_validities(df, valid_subset, "internal_validity") + return _combined_validities( + df, valid_subset, "internal_validity", strength_bins + ) def apply_external(df): - return _combined_validities(df, valid_subset, "external_validity") + return _combined_validities( + df, valid_subset, "external_validity", strength_bins + ) findings_df["internal_validity"] = findings_df["studies"].apply(apply_internal) findings_df["external_validity"] = findings_df["studies"].apply(apply_external) return findings_df + + +DEFAULT_BINS = { + 0.0: r"\-", + 1.0: r"\+", + 1.5: r"\++", +} + + +def _combined_validities( + apply_to: DataFrame, + by_intervention: DataFrame, + column: str = "internal_validity", + strength_bins: dict[float, str] | None = None, +): + if not isinstance(apply_to, str): + return + combined = 0.0 + for study in apply_to.split(";"): + if study not in by_intervention["citation"].unique(): + print( + f"WARNING: Findings table {study} study did not match any study in interventions dataframe!" + ) + new = by_intervention.loc[by_intervention["citation"] == study, column] + if len(new) == 0 or math.isnan(new.iat[0]): + continue + combined += new.iat[0] + + if combined: + return _binned_strength(combined, bins=strength_bins or DEFAULT_BINS) + return r"\-" + + +def _binned_strength(strength: float, bins: dict[float, str]) -> str: + bin = "" + for val, txt in sorted(bins.items()): + if strength >= val: + bin = txt + return bin
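For reference, a hedged end-to-end sketch of calling the new API. The toy dataframes are invented for illustration: the citation keys are hypothetical, and the 'representativeness' codings are assumptions that may not match the project's actual coding scheme (the 'method' values do appear in METHOD_RANKINGS above).

```python
import pandas as pd

from src.model import validity

# Hypothetical findings table: 'studies' holds semicolon-separated
# bibtex keys, as the add_to_findings docstring requires.
findings = pd.DataFrame(
    {
        "area of policy": ["minimum wage"],
        "findings": ["An example finding."],
        "studies": ["doe2020;smith2021"],
    }
)

# Hypothetical interventions table: one row per study, with the
# 'method' and 'representativeness' columns that calculate() ranks.
by_intervention = pd.DataFrame(
    {
        "citation": ["doe2020", "smith2021"],
        "method": ["OLS", "logistic.regression"],  # both rank 2.0
        "representativeness": ["national", "local"],  # coding assumed
    }
)

# Custom bins: below 5.0 reads as weak (-), from 5.0 as evidential (+),
# and from 10.0 upwards as strong (++).
bins = {0.0: r"\-", 5.0: r"\+", 10.0: r"\++"}

summary = validity.add_to_findings(findings, by_intervention, bins)
print(summary[["area of policy", "internal_validity", "external_validity"]])
```

With both methods ranked 2.0 by METHOD_RANKINGS, the combined internal score of 4.0 falls below the 5.0 threshold and would render as weak (\-); the point is only to show how the bins dict travels through add_to_findings into _combined_validities and _binned_strength.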