refactor(code): Move observation data prep to src

Moved the overall metadata and country-grouping code to the src directory, which somewhat trims the prep code at the beginning of the manuscript itself.
2024-02-17 17:30:58 +01:00 · 2024-02-17 17:30:58 +01:00 · 42222447dc
commit 42222447dc
parent 71d02ff8dd
2 changed files with 168 additions and 52 deletions
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@ -46,70 +46,32 @@ WORKING_DATA=DATA_DIR.joinpath("intermediate")
 PROCESSED_DATA=DATA_DIR.joinpath("processed")
 SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")

-bib_string=""
-for partial_bib in RAW_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-with all_logging_disabled():
-    bib_sample_raw_db = bibtexparser.parse_string(bib_string)
+from src import prep_data

-bib_string=""
-for partial_bib in WORKING_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-with all_logging_disabled():
-    bib_sample = bibtexparser.parse_string(bib_string)
+# raw database-search results
+bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
+# the complete library of sampled (and working) literature
+bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)

 # load relevant studies
 from src import load_data
+raw_observations_df = load_data.from_yml(PROCESSED_DATA)

 # load zotero-based metadata: citations and uses
-zot_df = pd.DataFrame([
-    [
-        entry["doi"] if "doi" in entry.fields_dict else None,
-        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
-        entry["usage"] if "usage" in entry.fields_dict else None,
-        entry["keywords"] if "keywords" in entry.fields_dict else None,
-    ]
-    for entry in bib_sample.entries
-], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
+zot_df = prep_data.bib_metadata_df(bib_sample)

 # Add WB country grouping definitions (income group, world region)
-WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
-df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
+df_country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx"))

-def countries_to_regions(countries:str):
-    res = set()
-    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
-        if c in df_country_groups.index:
-            region = df_country_groups.at[c,'Region']
-            res.add(region)
-    return ";".join(res)
-
-def countries_to_income_groups(countries:str):
-    res = set()
-    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
-        if c in df_country_groups.index:
-            region = df_country_groups.at[c,'Income group']
-            res.add(region)
-    return ";".join(res)
-
-
-bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
-    .assign(
-        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
-        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
-        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
-        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
-        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
-        year = lambda _df: _df["date"].dt.year,
-        region = lambda _df: _df["country"].map(countries_to_regions),
-        income_group = lambda _df: _df["country"].map(countries_to_income_groups),
-    )
-    .query("year >= 2000")
+bib_df = prep_data.observations_with_metadata_df(
+    raw_observations = raw_observations_df,
+    study_metadata = zot_df,
+    country_groups = df_country_groups
 )
+raw_observations = None
 zot_df = None
 df_country_groups = None
+
 ```

 <!-- pagebreak to separate from TOC -->
--- a/src/prep_data.py
+++ b/src/prep_data.py
@ -0,0 +1,154 @@
+import logging
+from contextlib import contextmanager, nullcontext
+from pathlib import Path
+
+import bibtexparser
+import pandas as pd
+
+
+@contextmanager
+def all_logging_disabled(highest_level=logging.CRITICAL):
+    previous_level = logging.root.manager.disable
+    logging.disable(highest_level)
+    try:
+        yield
+    finally:
+        logging.disable(previous_level)
+
+
+def bib_library_from_dir(
+    dir: Path, disable_warnings: bool = True
+) -> bibtexparser.Library:
+    bib_string = ""
+    for partial_bib in dir.glob("**/*.bib"):
+        with open(partial_bib) as f:
+            bib_string += "\n".join(f.readlines())
+    if disable_warnings:
+        cm = all_logging_disabled
+    else:
+        cm = nullcontext
+    with cm():
+        return bibtexparser.parse_string(bib_string)
+
+
+def bib_metadata_df(sample: bibtexparser.Library) -> pd.DataFrame:
+    """Returns dataframe with relevant metadata extracted from a bibtex library.
+
+    DataFrame contains: doi, the number of times it has been cited in the
+    literature (WOS statistic), its overall usage, and all keywords it
+    has been assigned (in the reference management software).
+    """
+    return (
+        pd.DataFrame(
+            [
+                [
+                    entry["doi"] if "doi" in entry.fields_dict else None,
+                    entry["times-cited"]
+                    if "times-cited" in entry.fields_dict
+                    else None,
+                    entry["usage"] if "usage" in entry.fields_dict else None,
+                    entry["keywords"] if "keywords" in entry.fields_dict else None,
+                ]
+                for entry in sample.entries
+            ],
+            columns=["doi", "cited", "usage", "keywords"],
+        )
+        .drop_duplicates("doi")
+        .set_index("doi")
+    )
+
+
+def country_groups_df(wb_xlsx_file: Path) -> pd.DataFrame:
+    """Returns a dataframe of WB country group definitions.
+
+    DataFrame contains the region of a country, the country code,
+    its income group, economy group and lending category.
+
+    Contains an additional entry for 'global' with the country
+    code 'WLD' which can be used for non-country specific
+    observations (e.g. wide comparative studies using 30+
+    developing/developed country datasets).
+    """
+    return pd.concat(
+        [
+            pd.read_excel(wb_xlsx_file),
+            pd.DataFrame(
+                data={
+                    "Economy": ["global"],
+                    "Code": ["WLD"],
+                    "Region": [
+                        "Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean"
+                    ],
+                    "Income group": [""],
+                    "Lending category": [""],
+                }
+            ),
+        ]
+    ).set_index("Economy")
+
+
+def observations_with_metadata_df(
+    raw_observations: pd.DataFrame,
+    study_metadata: pd.DataFrame | None = None,
+    country_groups: pd.DataFrame | None = None,
+) -> pd.DataFrame:
+    """Returns observations with metadata and country information attached.
+
+    Returned DataFrame contains the raw observations data with one
+    observation per row, with correct study and WB country grouping metadata
+    attached for each row as well.
+    """
+    df = raw_observations.assign(
+        doi=lambda _df: _df["uri"].str.extract(
+            r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
+        ),
+        date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
+        year=lambda _df: _df["date"].dt.year,
+    ).query("year >= 2000")
+    if study_metadata is not None:
+        df = df.assign(
+            zot_cited=lambda _df: _df["doi"].map(study_metadata["cited"]),
+            zot_usage=lambda _df: _df["doi"].map(study_metadata["usage"]),
+            zot_keywords=lambda _df: _df["doi"].map(study_metadata["keywords"]),
+        )
+    if country_groups is not None:
+
+        def c_to_region_closure(countries):
+            return countries_to_regions(country_groups, countries)
+
+        def c_to_income_closure(countries):
+            return countries_to_income_groups(country_groups, countries)
+
+        df = df.assign(
+            region=lambda _df: _df["country"].map(c_to_region_closure),
+            income_group=lambda _df: _df["country"].map(c_to_income_closure),
+        )
+    return df
+
+
+def countries_to_regions(country_groups: pd.DataFrame, countries: str) -> str:
+    """Returns the regions for countries passed in.
+
+    Return is in semicolon-concatenated string-list ready to be passed into
+    pandas for an apply or mapping method.
+    """
+    res = set()
+    for c in countries.replace(" ;", ";").replace("; ", ";").split(";"):
+        if c in country_groups.index:
+            region = country_groups.at[c, "Region"]
+            res.add(region)
+    return ";".join(res)
+
+
+def countries_to_income_groups(country_groups: pd.DataFrame, countries: str):
+    """Returns the income groups for countries passed in.
+
+    Return is in semicolon-concatenated string-list ready to be passed into
+    pandas for an apply or mapping method.
+    """
+    res = set()
+    for c in countries.replace(" ;", ";").replace("; ", ";").split(";"):
+        if c in country_groups.index:
+            region = country_groups.at[c, "Income group"]
+            res.add(region)
+    return ";".join(res)