feat(data): Prepare loading WB country group data

This commit is contained in:
Marty Oehme 2023-12-07 20:11:27 +01:00
parent 8f0f57edcc
commit d88c733b6d
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
4 changed files with 41 additions and 3 deletions

Binary file not shown.

27
poetry.lock generated
View file

@ -569,6 +569,17 @@ files = [
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
] ]
[[package]]
name = "et-xmlfile"
version = "1.1.0"
description = "An implementation of lxml.xmlfile for the standard library"
optional = false
python-versions = ">=3.6"
files = [
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
{file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
]
[[package]] [[package]]
name = "executing" name = "executing"
version = "2.0.1" version = "2.0.1"
@ -1747,6 +1758,20 @@ files = [
{file = "numpy-1.26.1.tar.gz", hash = "sha256:c8c6c72d4a9f831f328efb1312642a1cafafaa88981d9ab76368d50d07d93cbe"}, {file = "numpy-1.26.1.tar.gz", hash = "sha256:c8c6c72d4a9f831f328efb1312642a1cafafaa88981d9ab76368d50d07d93cbe"},
] ]
[[package]]
name = "openpyxl"
version = "3.1.2"
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
optional = false
python-versions = ">=3.6"
files = [
{file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
{file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
]
[package.dependencies]
et-xmlfile = "*"
[[package]] [[package]]
name = "overrides" name = "overrides"
version = "7.4.0" version = "7.4.0"
@ -3021,4 +3046,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "<3.13,>=3.11" python-versions = "<3.13,>=3.11"
content-hash = "e02d73a52358bb9ed8da5e6d5cc8c55992ee449a429c820105610f6c484066e3" content-hash = "f6f60ec28f3f1e61377114f1b58e6117b45fb290f362ad471790611505e95dfc"

View file

@ -13,6 +13,7 @@ bibtexparser = {version = ">=2.0.0b1", allow-prereleases = true}
jupyter = "^1.0.0" jupyter = "^1.0.0"
jupyter-cache = "^0.6.1" jupyter-cache = "^0.6.1"
tabulate = "^0.9.0" tabulate = "^0.9.0"
openpyxl = "^3.1.2"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
pynvim = "^0.4.3" pynvim = "^0.4.3"

View file

@ -463,8 +463,7 @@ as can be seen in @fig-publications-per-year.
Keep in mind that these results have not yet been screened for their full relevance to the topic at hand: so far they are only *potentially* relevant in that they match the requirements of the search pattern, so an increased number of results does not necessarily indicate a clearly rising amount of relevant literature. Keep in mind that these results have not yet been screened for their full relevance to the topic at hand: so far they are only *potentially* relevant in that they match the requirements of the search pattern, so an increased number of results does not necessarily indicate a clearly rising amount of relevant literature.
```{python} ```{python}
#| label: fig-publications-per-year # load relevant studies
#| fig-cap: Publications per year
reformatted = [] reformatted = []
for e in sample_relevant: for e in sample_relevant:
ed = e.fields_dict ed = e.fields_dict
@ -476,6 +475,7 @@ for e in sample_relevant:
ed.get("usage-count-since-2013", Field(key="usage-count-since-2013", value=None)).value, ed.get("usage-count-since-2013", Field(key="usage-count-since-2013", value=None)).value,
ed.get("keywords", Field(key="keywords", value=None)).value, ed.get("keywords", Field(key="keywords", value=None)).value,
]) ])
# FIXME do not just drop missing values # FIXME do not just drop missing values
bib_df = pd.DataFrame(reformatted, columns = ["Year", "Author", "Title", "Type", "Cited", "Usage", "Keywords"]) bib_df = pd.DataFrame(reformatted, columns = ["Year", "Author", "Title", "Type", "Cited", "Usage", "Keywords"])
bib_df = bib_df.dropna(how="any") bib_df = bib_df.dropna(how="any")
@ -485,6 +485,18 @@ bib_df["Year"] = bib_df["Date"].dt.year
# only keep newer entries # only keep newer entries
bib_df = bib_df[bib_df["Year"] >= 2000] bib_df = bib_df[bib_df["Year"] >= 2000]
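# Add World Bank (WB) country grouping definitions (income group, world region)
# TODO Re-enable once the processed study pool is loaded: the mapping below
# needs a "country" column, which the bibliography-only bib_df built above lacks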
# WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
# df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE)
# bib_df["income group"] = bib_df["country"].map(df_country_groups.set_index("Economy")["Income group"])
# bib_df["region"] = bib_df["country"].map(df_country_groups.set_index("Economy")["Region"])
```
```{python}
#| label: fig-publications-per-year
#| fig-cap: Publications per year
# create dummy category for white or gray lit type (based on 'article' appearing in type) # create dummy category for white or gray lit type (based on 'article' appearing in type)
bib_df["Type"].value_counts() bib_df["Type"].value_counts()
bib_df["Literature"] = np.where(bib_df["Type"].str.contains("article", case=False, regex=False), "white", "gray") bib_df["Literature"] = np.where(bib_df["Type"].str.contains("article", case=False, regex=False), "white", "gray")