From 42222447dc591eae2158d0e9f66192edff110520 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Sat, 17 Feb 2024 17:30:58 +0100
Subject: [PATCH] refactor(code): Move observation data prep to src

Moved the overall metadata, country grouping code to the src directory
somewhat trimming the prep code in the beginning of the manuscript
itself.
---
 scoping_review.qmd |  66 ++++-----------
 src/prep_data.py   | 202 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 216 insertions(+), 52 deletions(-)
 create mode 100644 src/prep_data.py

diff --git a/scoping_review.qmd b/scoping_review.qmd
index bb14c97..a40864b 100644
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@@ -46,70 +46,32 @@ WORKING_DATA=DATA_DIR.joinpath("intermediate")
 PROCESSED_DATA=DATA_DIR.joinpath("processed")
 SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
 
-bib_string=""
-for partial_bib in RAW_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-with all_logging_disabled():
-    bib_sample_raw_db = bibtexparser.parse_string(bib_string)
+from src import prep_data
 
-bib_string=""
-for partial_bib in WORKING_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-with all_logging_disabled():
-    bib_sample = bibtexparser.parse_string(bib_string)
+# raw database-search results
+bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
+# the complete library of sampled (and working) literature
+bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
 
 # load relevant studies
 from src import load_data
+raw_observations_df = load_data.from_yml(PROCESSED_DATA)
 
 # load zotero-based metadata: citations and uses
-zot_df = pd.DataFrame([
-    [
-        entry["doi"] if "doi" in entry.fields_dict else None,
-        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
-        entry["usage"] if "usage" in entry.fields_dict else None,
-        entry["keywords"] if "keywords" in entry.fields_dict else None,
-    ]
-    for entry in bib_sample.entries
-], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
+zot_df = prep_data.bib_metadata_df(bib_sample)
 
 # Add WB country grouping definitions (income group, world region)
-WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
-df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
+df_country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx"))
 
-def countries_to_regions(countries:str):
-    res = set()
-    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
-        if c in df_country_groups.index:
-            region = df_country_groups.at[c,'Region']
-            res.add(region)
-    return ";".join(res)
-
-def countries_to_income_groups(countries:str):
-    res = set()
-    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
-        if c in df_country_groups.index:
-            region = df_country_groups.at[c,'Income group']
-            res.add(region)
-    return ";".join(res)
-
-
-bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
-    .assign(
-        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
-        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
-        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
-        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
-        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
-        year = lambda _df: _df["date"].dt.year,
-        region = lambda _df: _df["country"].map(countries_to_regions),
-        income_group = lambda _df: _df["country"].map(countries_to_income_groups),
-    )
-    .query("year >= 2000")
+bib_df = prep_data.observations_with_metadata_df(
+    raw_observations = raw_observations_df,
+    study_metadata = zot_df,
+    country_groups = df_country_groups
 )
+raw_observations_df = None
 zot_df = None
 df_country_groups = None
+
 ```
 
 
diff --git a/src/prep_data.py b/src/prep_data.py
new file mode 100644
index 0000000..883404e
--- /dev/null
+++ b/src/prep_data.py
@@ -0,0 +1,202 @@
+"""Data preparation helpers for the scoping review manuscript."""
+
+import logging
+from contextlib import contextmanager, nullcontext
+from pathlib import Path
+
+import bibtexparser
+import pandas as pd
+
+
+@contextmanager
+def all_logging_disabled(highest_level=logging.CRITICAL):
+    """Disables all logging up to `highest_level` within the context.
+
+    The previously set disable level is restored on exit, also when
+    the wrapped code raises.
+    """
+    previous_level = logging.root.manager.disable
+    logging.disable(highest_level)
+    try:
+        yield
+    finally:
+        logging.disable(previous_level)
+
+
+def bib_library_from_dir(
+    dir: Path, disable_warnings: bool = True
+) -> bibtexparser.Library:
+    """Returns one bibtex library parsed from all .bib files in a directory.
+
+    The directory is searched recursively. Files are read in sorted path
+    order so the library does not depend on the order in which the file
+    system lists them. If `disable_warnings` is set, log output emitted
+    while parsing is suppressed.
+    """
+    # Read every file verbatim and separate files by a newline: joining
+    # `readlines()` with newlines doubles each line break and glues a file
+    # without trailing newline directly onto the following one.
+    bib_string = "\n".join(
+        partial_bib.read_text(encoding="utf-8")
+        for partial_bib in sorted(dir.glob("**/*.bib"))
+    )
+    cm = all_logging_disabled if disable_warnings else nullcontext
+    with cm():
+        return bibtexparser.parse_string(bib_string)
+
+
+def bib_metadata_df(sample: bibtexparser.Library) -> pd.DataFrame:
+    """Returns dataframe with relevant metadata extracted from a bibtex library.
+
+    DataFrame contains: doi, the number of times it has been cited in the
+    literature (WOS statistic), its overall usage, and all keywords it
+    has been assigned (in the reference management software).
+
+    The DataFrame is indexed by doi. Of several entries sharing a doi only
+    the first is kept, fields missing from an entry are set to None.
+    """
+    # DataFrame column name -> bibtex field it is read from
+    column_fields = {
+        "doi": "doi",
+        "cited": "times-cited",
+        "usage": "usage",
+        "keywords": "keywords",
+    }
+    return (
+        pd.DataFrame(
+            [
+                [
+                    entry[field] if field in entry.fields_dict else None
+                    for field in column_fields.values()
+                ]
+                for entry in sample.entries
+            ],
+            columns=list(column_fields),
+        )
+        .drop_duplicates("doi")
+        .set_index("doi")
+    )
+
+
+def country_groups_df(wb_xlsx_file: Path) -> pd.DataFrame:
+    """Returns a dataframe of WB country group definitions.
+
+    DataFrame is indexed by economy name and contains the region of a
+    country, the country code, its income group and lending category.
+
+    Contains an additional entry for 'global' with the country
+    code 'WLD' which can be used for non-country specific
+    observations (e.g. wide comparative studies using 30+
+    developing/developed country datasets). It is assigned to all
+    WB regions and has an empty income group and lending category.
+    """
+    # each of the seven WB regions exactly once
+    global_regions = [
+        "Europe & Central Asia",
+        "South Asia",
+        "North America",
+        "East Asia & Pacific",
+        "Sub-Saharan Africa",
+        "Middle East & North Africa",
+        "Latin America & Caribbean",
+    ]
+    return pd.concat(
+        [
+            pd.read_excel(wb_xlsx_file),
+            pd.DataFrame(
+                data={
+                    "Economy": ["global"],
+                    "Code": ["WLD"],
+                    "Region": [";".join(global_regions)],
+                    "Income group": [""],
+                    "Lending category": [""],
+                }
+            ),
+        ]
+    ).set_index("Economy")
+
+
+def observations_with_metadata_df(
+    raw_observations: pd.DataFrame,
+    study_metadata: pd.DataFrame | None = None,
+    country_groups: pd.DataFrame | None = None,
+) -> pd.DataFrame:
+    """Returns observations with metadata and country information attached.
+
+    Returned DataFrame contains the raw observations data with one
+    observation per row, with correct study and WB country grouping metadata
+    attached for each row as well.
+
+    The doi is extracted from the 'uri' column, 'year' is parsed into an
+    additional 'date' column and only observations from the year 2000
+    onwards are kept. Study metadata is matched by doi, country groupings
+    by the 'country' column; either is left out if not passed in.
+    """
+    df = raw_observations.assign(
+        doi=lambda _df: _df["uri"].str.extract(
+            r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
+        ),
+        date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
+        year=lambda _df: _df["date"].dt.year,
+    ).query("year >= 2000")
+    if study_metadata is not None:
+        df = df.assign(
+            zot_cited=lambda _df: _df["doi"].map(study_metadata["cited"]),
+            zot_usage=lambda _df: _df["doi"].map(study_metadata["usage"]),
+            zot_keywords=lambda _df: _df["doi"].map(study_metadata["keywords"]),
+        )
+    if country_groups is not None:
+
+        def c_to_region_closure(countries):
+            return countries_to_regions(country_groups, countries)
+
+        def c_to_income_closure(countries):
+            return countries_to_income_groups(country_groups, countries)
+
+        df = df.assign(
+            region=lambda _df: _df["country"].map(c_to_region_closure),
+            income_group=lambda _df: _df["country"].map(c_to_income_closure),
+        )
+    return df
+
+
+def countries_to_regions(country_groups: pd.DataFrame, countries: str) -> str:
+    """Returns the regions for countries passed in.
+
+    Return is in semicolon-concatenated string-list ready to be passed into
+    pandas for an apply or mapping method.
+    """
+    return _country_group_values(country_groups, countries, "Region")
+
+
+def countries_to_income_groups(country_groups: pd.DataFrame, countries: str) -> str:
+    """Returns the income groups for countries passed in.
+
+    Return is in semicolon-concatenated string-list ready to be passed into
+    pandas for an apply or mapping method.
+    """
+    return _country_group_values(country_groups, countries, "Income group")
+
+
+def _country_group_values(
+    country_groups: pd.DataFrame, countries: str, column: str
+) -> str:
+    """Returns the distinct `column` values of the countries passed in.
+
+    `countries` is a semicolon-separated list of economy names; names not
+    found in the index of `country_groups` are ignored. The values are
+    concatenated in order of first appearance so the result is the same
+    on every run. Missing (non-string) countries or values are skipped.
+    """
+    if not isinstance(countries, str):
+        # e.g. NaN for an observation without country information
+        return ""
+    values = {}  # dict used as an insertion-ordered set
+    for country in countries.split(";"):
+        country = country.strip()
+        if country not in country_groups.index:
+            continue
+        value = country_groups.at[country, column]
+        if isinstance(value, str):
+            values[value] = None
+    return ";".join(values)