fix(script): Correctly map regions to studies

Correctly map studies that list multiple countries (separated by semicolons) to all of their corresponding regions and income groups.
This commit is contained in:
Marty Oehme 2023-12-21 16:13:25 +01:00
parent 8bab7256e1
commit 3f44b0d710
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A

View file

@ -72,7 +72,24 @@ zot_df = pd.DataFrame([
# Add WB country grouping definitions (income group, world region) # Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve() WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy") df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['World'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
def _countries_to_group_values(countries, column, groups=None):
    """Collect the distinct `column` values of all countries in a ';'-separated string.

    Args:
        countries: Semicolon-separated country names, e.g. "Kenya; Germany".
            Whitespace around each name is ignored. Anything that is not a
            string (such as the NaN of a missing cell) yields "".
        column: Column of the grouping table to read, e.g. "Region".
        groups: Grouping table indexed by country name. Defaults to the
            module-level `df_country_groups`.

    Returns:
        The distinct, non-empty values joined with ";" in sorted order, or ""
        when no country could be matched.
    """
    if not isinstance(countries, str):
        return ""
    table = df_country_groups if groups is None else groups
    found = set()
    for name in countries.split(";"):
        name = name.strip()
        if name not in table.index:
            continue
        value = table.at[name, column]
        # Skip NaN cells and the empty placeholders of the synthetic 'global'
        # row: NaN would make str.join raise, '' would add stray separators.
        if isinstance(value, str) and value:
            found.add(value)
    # Sort so the result does not depend on set iteration order, which can
    # differ between interpreter runs for strings.
    return ";".join(sorted(found))


def countries_to_regions(countries: str, *, groups=None) -> str:
    """Map a ';'-separated list of countries to their ';'-joined regions.

    Args:
        countries: Semicolon-separated country names.
        groups: Optional grouping table indexed by country name with a
            "Region" column. Defaults to the module-level `df_country_groups`.

    Returns:
        The distinct regions of all recognised countries, sorted and joined
        with ";". Empty string if none is recognised.
    """
    return _countries_to_group_values(countries, "Region", groups)


def countries_to_income_groups(countries: str, *, groups=None) -> str:
    """Map a ';'-separated list of countries to their ';'-joined income groups.

    Args:
        countries: Semicolon-separated country names.
        groups: Optional grouping table indexed by country name with an
            "Income group" column. Defaults to the module-level
            `df_country_groups`.

    Returns:
        The distinct income groups of all recognised countries, sorted and
        joined with ";". Empty string if none is recognised.
    """
    return _countries_to_group_values(countries, "Income group", groups)
bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant") bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")
.assign( .assign(
@ -82,8 +99,8 @@ bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")
zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]), zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
date = lambda _df: pd.to_datetime(_df["year"], format="%Y"), date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
year = lambda _df: _df["date"].dt.year, year = lambda _df: _df["date"].dt.year,
region = lambda _df: _df["country"].map(df_country_groups["Region"]), region = lambda _df: _df["country"].map(countries_to_regions),
income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]), income_group = lambda _df: _df["country"].map(countries_to_income_groups),
) )
.query("year >= 2000") .query("year >= 2000")
) )