feat(data): Prepare loading WB country group data
parent 8f0f57edcc
commit d88c733b6d
4 changed files with 41 additions and 3 deletions
BIN  02-data/supplementary/wb-country-groupings.xlsx  (normal file)
Binary file not shown.
27  poetry.lock  (generated)
@@ -569,6 +569,17 @@ files = [
     {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
 ]
 
+[[package]]
+name = "et-xmlfile"
+version = "1.1.0"
+description = "An implementation of lxml.xmlfile for the standard library"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
+    {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
+]
+
 [[package]]
 name = "executing"
 version = "2.0.1"
@@ -1747,6 +1758,20 @@ files = [
     {file = "numpy-1.26.1.tar.gz", hash = "sha256:c8c6c72d4a9f831f328efb1312642a1cafafaa88981d9ab76368d50d07d93cbe"},
 ]
 
+[[package]]
+name = "openpyxl"
+version = "3.1.2"
+description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
+    {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
+]
+
+[package.dependencies]
+et-xmlfile = "*"
+
 [[package]]
 name = "overrides"
 version = "7.4.0"
@@ -3021,4 +3046,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "<3.13,>=3.11"
-content-hash = "e02d73a52358bb9ed8da5e6d5cc8c55992ee449a429c820105610f6c484066e3"
+content-hash = "f6f60ec28f3f1e61377114f1b58e6117b45fb290f362ad471790611505e95dfc"
pyproject.toml
@@ -13,6 +13,7 @@ bibtexparser = {version = ">=2.0.0b1", allow-prereleases = true}
 jupyter = "^1.0.0"
 jupyter-cache = "^0.6.1"
 tabulate = "^0.9.0"
+openpyxl = "^3.1.2"
 
 [tool.poetry.group.dev.dependencies]
 pynvim = "^0.4.3"
@@ -463,8 +463,7 @@ as can be seen in @fig-publications-per-year.
 Keeping in mind that these results are not yet screened for their full relevance to the topic at hand, so far only being *potentially* relevant in falling into the requirements of the search pattern, an increased results output does not necessarily mean a clearly rising amount of relevant literature.
 
 ```{python}
-#| label: fig-publications-per-year
-#| fig-cap: Publications per year
+# load relevant studies
 reformatted = []
 for e in sample_relevant:
     ed = e.fields_dict
@@ -476,6 +475,7 @@ for e in sample_relevant:
         ed.get("usage-count-since-2013", Field(key="usage-count-since-2013", value=None)).value,
         ed.get("keywords", Field(key="keywords", value=None)).value,
     ])
+
 # FIXME do not just drop missing values
 bib_df = pd.DataFrame(reformatted, columns = ["Year", "Author", "Title", "Type", "Cited", "Usage", "Keywords"])
 bib_df = bib_df.dropna(how="any")
@@ -485,6 +485,18 @@ bib_df["Year"] = bib_df["Date"].dt.year
 # only keep newer entries
 bib_df = bib_df[bib_df["Year"] >= 2000]
 
+# Add WB country grouping definitions (income group, world region)
+# TODO Re-enable for processed study pool
+# WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
+# df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE)
+# bib_df["income group"] = bib_df["country"].map(df_country_groups.set_index("Economy")["Income group"])
+# bib_df["region"] = bib_df["country"].map(df_country_groups.set_index("Economy")["Region"])
+```
+
+```{python}
+#| label: fig-publications-per-year
+#| fig-cap: Publications per year
+
 # create dummy category for white or gray lit type (based on 'article' appearing in type)
 bib_df["Type"].value_counts()
 bib_df["Literature"] = np.where(bib_df["Type"].str.contains("article", case=False, regex=False), "white", "gray")
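The country-group mapping prepared in the last hunk is committed commented out. Below is a minimal sketch of how it is presumably meant to work once re-enabled, assuming `SUPPLEMENTARY_DATA` points at `02-data/supplementary`, the new workbook exposes `Economy`, `Income group`, and `Region` columns, and `bib_df` already carries a `country` column; none of this is verified by the diff itself.

```python
# Sketch only: mirrors the commented-out lines in the hunk above.
from pathlib import Path

import pandas as pd

# Assumed values: SUPPLEMENTARY_DATA and bib_df are defined earlier in the
# Quarto document; bib_df here is a stand-in with only a "country" column.
SUPPLEMENTARY_DATA = "02-data/supplementary"
bib_df = pd.DataFrame({"country": ["Germany", "Kenya", "Brazil"]})

WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()

# Reading .xlsx goes through openpyxl, which this commit adds to pyproject.toml.
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE)

# Index the World Bank groupings by country name ("Economy") and map each
# study's country onto its income group and world region.
groups_by_economy = df_country_groups.set_index("Economy")
bib_df["income group"] = bib_df["country"].map(groups_by_economy["Income group"])
bib_df["region"] = bib_df["country"].map(groups_by_economy["Region"])
```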