feat(code): Add examples of list handling notebook

Extracts interventions/inequalities and explodes them for value counts.
2023-12-11 17:14:50 +01:00 · 2023-12-11 17:14:50 +01:00 · b5e467e016
commit b5e467e016
parent 85497854c1
1 changed files with 99 additions and 0 deletions
--- a/00-notebooks/yml-grab.qmd
+++ b/00-notebooks/yml-grab.qmd
@ -21,3 +21,102 @@ Get inequalities:
 ```{python}
 df['inequality'].str.split(";").explode().str.strip().value_counts()
 ```
+
+```{python}
+df.groupby(["author", "year", "title"]).first().join(df['intervention'])
+```
+
+Unique values in chain method:
+
+```{python}
+(
+    df.groupby(["author", "year", "title"])
+    .agg(
+        {
+            "intervention": lambda _col:"; ".join(_col),
+            "inequality": lambda _col:"; ".join(_col),
+        }
+    )
+    .drop_duplicates()
+    .explode("inequality")
+    ["inequality"].str.strip()
+    .value_counts()
+)
+```
+
+Merge dataset so it is collected by *STUDY* not by *OBSERVATION*.
+Any further required columns can be aggregated in the same way as in the agg call here.
+
+```{python}
+by_study = (
+    df.groupby(["author", "year", "title"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+            "inequality": lambda _col: "; ".join(_col),
+            "date": lambda _col: "; ".join(_col),
+            "findings": lambda _col: "; ".join(_col),
+            # "region": lambda _col: "; ".join(_col), # only accessible when merging with WB data
+            # "income_group": lambda _col: "; ".join(_col),
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        # create de-duplicated joins for all observations
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in _cell.split(";")])
+        ),
+        inequality=lambda _df: _df["inequality"].apply(
+            lambda _cell: set([x.strip() for x in _cell.split(";")])
+        ),
+    )
+)
+```
+
+```{python}
+by_study = (
+    df.groupby(["author", "year", "title"])
+    .first()
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        # create de-duplicated joins for all observations
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in _cell.split(";")])
+        ),
+        inequality=lambda _df: _df["inequality"].apply(
+            lambda _cell: set([x.strip() for x in _cell.split(";")])
+        ),
+    )
+)
+```
+
+```{python}
+import re
+by_intervention = (
+    df.groupby(["author", "year", "title"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
+        ),
+    )
+    .explode("intervention")
+)
+sort_order = by_intervention["intervention"].value_counts().index
+
+fig = plt.figure()
+fig.set_size_inches(6, 3)
+ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
+plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
+         rotation_mode="anchor")
+plt.show()
+by_intervention = None
+```