feat(data): Prepare loading WB country group data

2023-12-07 20:11:27 +01:00 · 2023-12-07 20:11:27 +01:00 · d88c733b6d
commit d88c733b6d
parent 8f0f57edcc
4 changed files with 41 additions and 3 deletions
--- a/02-data/supplementary/wb-country-groupings.xlsx
+++ b/02-data/supplementary/wb-country-groupings.xlsx
--- a/poetry.lock
+++ b/poetry.lock
@ -569,6 +569,17 @@ files = [
    {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
 ]

+[[package]]
+name = "et-xmlfile"
+version = "1.1.0"
+description = "An implementation of lxml.xmlfile for the standard library"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
+    {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
+]
+
 [[package]]
 name = "executing"
 version = "2.0.1"
@ -1747,6 +1758,20 @@ files = [
    {file = "numpy-1.26.1.tar.gz", hash = "sha256:c8c6c72d4a9f831f328efb1312642a1cafafaa88981d9ab76368d50d07d93cbe"},
 ]

+[[package]]
+name = "openpyxl"
+version = "3.1.2"
+description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
+    {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
+]
+
+[package.dependencies]
+et-xmlfile = "*"
+
 [[package]]
 name = "overrides"
 version = "7.4.0"
@ -3021,4 +3046,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "<3.13,>=3.11"
-content-hash = "e02d73a52358bb9ed8da5e6d5cc8c55992ee449a429c820105610f6c484066e3"
+content-hash = "f6f60ec28f3f1e61377114f1b58e6117b45fb290f362ad471790611505e95dfc"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -13,6 +13,7 @@ bibtexparser = {version = ">=2.0.0b1", allow-prereleases = true}
 jupyter = "^1.0.0"
 jupyter-cache = "^0.6.1"
 tabulate = "^0.9.0"
+openpyxl = "^3.1.2"

 [tool.poetry.group.dev.dependencies]
 pynvim = "^0.4.3"
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@ -463,8 +463,7 @@ as can be seen in @fig-publications-per-year.
 Keep in mind that these results have not yet been screened for their full relevance to the topic at hand: so far they are only *potentially* relevant in that they match the search pattern, so an increase in the number of results does not necessarily mean a clear rise in the amount of relevant literature.

 ```{python}
-#| label: fig-publications-per-year
-#| fig-cap: Publications per year
+# load relevant studies
 reformatted = []
 for e in sample_relevant:
    ed = e.fields_dict
@ -476,6 +475,7 @@ for e in sample_relevant:
                        ed.get("usage-count-since-2013", Field(key="usage-count-since-2013", value=None)).value,
                        ed.get("keywords", Field(key="keywords", value=None)).value,
                        ])
+
 # FIXME do not just drop missing values
 bib_df = pd.DataFrame(reformatted, columns = ["Year", "Author", "Title", "Type", "Cited", "Usage", "Keywords"])
 bib_df = bib_df.dropna(how="any")
@ -485,6 +485,18 @@ bib_df["Year"] = bib_df["Date"].dt.year
 # only keep newer entries
 bib_df = bib_df[bib_df["Year"] >= 2000]

+# Add WB country grouping definitions (income group, world region)
+# TODO Re-enable for processed study pool
+# WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
+# df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE)
+# bib_df["income group"] = bib_df["country"].map(df_country_groups.set_index("Economy")["Income group"])
+# bib_df["region"] = bib_df["country"].map(df_country_groups.set_index("Economy")["Region"])
+```
+
+```{python}
+#| label: fig-publications-per-year
+#| fig-cap: Publications per year
+
 # create dummy category for white or gray lit type (based on 'article' appearing in type)
 bib_df["Type"].value_counts()
 bib_df["Literature"] = np.where(bib_df["Type"].str.contains("article", case=False, regex=False), "white", "gray")