7.12. Case Study HTML GDP

7.12.1. Case Study - 1

"""
>>> result.loc['Polska']
PKB          6.741270e+11
Ludność      3.842069e+07
PerCapita    1.754594e+04
Name: Polska, dtype: float64
"""

import pandas as pd
import matplotlib.pyplot as plt


# Configure pandas display limits (console width, visible columns/rows and
# sequence items) in a single call using (option, value) pairs.
pd.set_option(
    'display.width', 200,
    'display.max_columns', 15,
    'display.max_rows', 100,
    'display.min_rows', 100,
    'display.max_seq_items', 100,
)


# Factor multiplied into the PKB (GDP) column together with 1_000_000.
# NOTE(review): presumably a currency rate where 1 means "keep USD" - confirm.
USD = 1

# Page containing the GDP (PKB) table. The commented-out Wikipedia address is
# kept for reference; presumably the python3.info page is a static copy - confirm.
# PKB = 'https://pl.wikipedia.org/wiki/Lista_pa%C5%84stw_%C5%9Bwiata_wed%C5%82ug_PKB_nominalnego'
PKB = 'https://python3.info/_static/percapita-pkb.html'

# Page containing the population (ludnosc) table; same arrangement as above.
# LUDNOSC = 'https://pl.wikipedia.org/wiki/Lista_pa%C5%84stw_%C5%9Bwiata_wed%C5%82ug_liczby_ludno%C5%9Bci'
LUDNOSC = 'https://python3.info/_static/percapita-ludnosc.html'

# Value replacements applied to the population table: some country names are
# rewritten to another spelling, others are replaced with pd.NA.
# NOTE(review): the renames presumably match the spelling used in the GDP
# table - confirm against the data.
LUDNOSC_PANSTWA = {
    'Chińska Republika Ludowa': 'Chiny',
    'Korea Północna': pd.NA,
    'Republika Chińska': 'Tajwan',
    'Kuba': pd.NA,
    'Zachodni Brzeg': pd.NA,
    'Strefa Gazy': pd.NA}

# Column renames for the population table: original header -> short name.
LUDNOSC_COLUMNS = {
    'Państwo, obszar lub terytorium zależne': 'Państwo',
    '2018': 'Ludność'}


def clean(column):
    """Return *column* with non-breaking spaces and regular spaces removed.

    *column* must provide the pandas ``.str`` accessor (for example a Series
    of strings); the result is a new object of the same kind.
    """
    cleaned = column
    # Remove each unwanted character in turn: first the non-breaking space,
    # then the ordinary space.
    for unwanted in ('\xa0', ' '):
        cleaned = cleaned.str.replace(unwanted, '')
    return cleaned

# GDP table indexed by country: keep the country and '2021 r.' columns, drop
# rows marked 'b.d.', convert the value to int64 and scale it by
# 1_000_000*USD.
pkb = (pd
    .read_html(PKB)[1]
    .rename(columns={'2021 r.': 'PKB'})
    .loc[:, ['Państwo', 'PKB']]
    .replace('b.d.', pd.NA)
    .dropna(how='any', axis='rows')
    # Strip separators from the numeric column only. Cleaning the whole frame
    # also removed the spaces inside multi-word country names, so those names
    # could no longer match the population table's index.
    .assign(PKB=lambda df: clean(df['PKB']))
    .astype({'PKB': 'int64'})
    .set_index('Państwo')
    .mul(1_000_000*USD))

# Population table indexed by country, restricted to countries that also
# appear in the GDP table (pkb).
ludnosc = (pd
    .read_html(LUDNOSC)[0]
    # Drop the outermost level of the column header.
    .droplevel(level=0, axis='columns')
    .rename(columns=LUDNOSC_COLUMNS)
    .loc[:, ['Państwo', 'Ludność']]
    # Rewrite selected country names / replace others with pd.NA.
    .replace(LUDNOSC_PANSTWA)
    .set_index('Państwo')
    .query('index in @pkb.index')
    # Country names are already in the index at this point, so only the
    # 'Ludność' column is passed through clean().
    .apply(clean)
    .astype({'Ludność': 'int64'}))

# Combine GDP and population on the shared country index, sort the rows by
# index, then add the GDP-per-capita column.
result = (pkb
    .merge(ludnosc, left_index=True, right_index=True)
    .sort_index()
    .assign(PerCapita=lambda frame: frame['PKB'] / frame['Ludność']))

# Bar chart of the 30 largest per-capita values (rounded to one decimal),
# ordered from largest to smallest.
# NOTE(review): the figure caption that follows this snippet says "Top 10"
# while 30 rows are selected here - confirm which is intended.
plot = (result
    .loc[:, ['PerCapita']]
    .round({'PerCapita': 1})
    .sort_values('PerCapita', ascending=False)
    .head(n=30)
    .plot(kind='bar', legend=True, grid=True, figsize=(16,10)))

# Left disabled; uncomment to display the figure.
# plt.show()
../../_images/html-gdp-1-top10.png

Figure 7.12. Top 10 countries with the highest Gross Domestic Product per capita

7.12.2. Case Study - 2

import pandas as pd
import matplotlib.pyplot as plt


# Configure pandas display limits (console width, visible rows/columns and
# maximum column width) in a single call using (option, value) pairs.
pd.set_option(
    'display.width', 80,
    'display.max_rows', 200,
    'display.max_columns', 10,
    'display.max_colwidth', 100,
)

# Factor multiplied into the pkb (GDP) column together with 1_000_000.
# NOTE(review): presumably a currency rate where 1 means "keep USD" - confirm.
USD = 1

# Pages containing the GDP (PKB) and population (LUDNOSC) tables. The
# commented-out Wikipedia addresses are kept for reference; presumably the
# python3.info pages are static copies of them - confirm.
# PKB = 'https://pl.wikipedia.org/wiki/Lista_pa%C5%84stw_%C5%9Bwiata_wed%C5%82ug_PKB_nominalnego'
# LUDNOSC = 'https://pl.wikipedia.org/wiki/Lista_pa%C5%84stw_%C5%9Bwiata_wed%C5%82ug_liczby_ludno%C5%9Bci'
PKB = 'https://python3.info/_static/html-gdp-pkb.html'
LUDNOSC = 'https://python3.info/_static/html-gdp-ludnosc.html'

# GDP table indexed by country ('kraj'): keep the country and '2022 r.'
# columns, blank out 'b.d.' cells, remove spaces from the values, convert
# them to nullable integers, drop incomplete rows and scale by 1_000_000*USD.
pkb = (
    pd
    .read_html(PKB)[2]
    .rename(columns={'Państwo':'kraj', '2022 r.':'pkb'})
    .loc[:, ['kraj', 'pkb']]
    # The keys are regular expressions (regex=True), so the dots in 'b.d.'
    # are escaped to match the literal marker instead of any character.
    .replace({'pkb': {r'b\.d\.':pd.NA, ' ':''}}, regex=True)
    .astype({'kraj': 'string', 'pkb': 'Int64'})
    .convert_dtypes()
    .dropna()
    .set_index('kraj', drop=True)
    .mul(1_000_000*USD))

# Population table indexed by country ('kraj'), restricted to countries that
# also appear in the GDP table (pkb).
ludnosc = (
    pd
    .read_html(LUDNOSC)[0]
    # Drop the outermost level of the column header.
    .droplevel(0, axis='columns')
    .rename(columns={'Państwo, obszar lub terytorium zależne': 'kraj', '2022': 'ludnosc'})
    .loc[:, ['kraj', 'ludnosc']]
    # Regex clean-up of the value column: remove non-breaking spaces, the
    # literal '[3]' marker and plain spaces; cells containing '–' become NA.
    .replace({'ludnosc': {'\xa0':'', r'\[3\]':'', ' ':'', '–':pd.NA}}, regex=True)
    # Rename one country in the 'kraj' column.
    # NOTE(review): presumably to match the GDP table's spelling - confirm.
    .replace({'kraj': {'Chińska Republika Ludowa': 'Chiny'}})
    .dropna()
    .astype({'kraj': 'string', 'ludnosc': 'Int64'})
    .convert_dtypes()
    .set_index('kraj', drop=True)
    .query('index in @pkb.index'))

# Attach population to every GDP row (left join on the country index),
# derive per-capita GDP rounded to two decimals, normalise dtypes, drop
# incomplete rows and order from highest to lowest per-capita value.
data = (
    pkb
    .merge(ludnosc, how='left', left_index=True, right_index=True)
    .assign(per_capita=lambda frame: frame['pkb'].div(frame['ludnosc']))
    .round({'per_capita': 2})
    .astype({'pkb': 'Int64', 'ludnosc': 'Int64', 'per_capita': 'Float64'})
    .convert_dtypes()
    .dropna()
    .sort_values(by='per_capita', ascending=False))

# Bar chart of the first ten rows of `data` (already sorted from highest to
# lowest per-capita value), re-sorted ascending before plotting.
top_10_percapita = (
    data[['per_capita']]
    .head(10)
    .sort_values(by='per_capita')
    .plot(
        kind='bar',
        figsize=(20, 10),
        legend=False,
        title='Top 10 krajów wg PKB na mieszkańca',
        xlabel='Country',
        ylabel='USD Per Capita',
    )
)

plt.tight_layout()
plt.show()
../../_images/html-gdp-2-top10.png

Figure 7.13. Top 10 countries with the highest Gross Domestic Product per capita

7.12.3. Case Study - 3

# %%

import pandas as pd
from matplotlib import pyplot as plt

# Configure pandas display limits (console width, visible columns and rows)
# in a single call using (option, value) pairs.
pd.set_option(
    'display.width', 500,
    'display.max_columns', 10,
    'display.max_rows', 500,
)

# %%

# Wikipedia pages containing the GDP (PKB) and population (LUDNOSC) tables.
# NOTE(review): these are live pages, so the table positions and column
# labels relied on below may change over time - confirm before reuse.
PKB = 'https://pl.wikipedia.org/wiki/Lista_pa%C5%84stw_%C5%9Bwiata_wed%C5%82ug_PKB_nominalnego'
LUDNOSC = 'https://pl.wikipedia.org/wiki/Lista_pa%C5%84stw_%C5%9Bwiata_wed%C5%82ug_liczby_ludno%C5%9Bci'

# Factor multiplied into the pkb (GDP) column together with 1_000_000.
# NOTE(review): presumably a currency rate where 1 means "keep USD" - confirm.
USD = 1

# %%

# GDP table indexed by country ('kraj'): keep the country and '2021 r.'
# columns, remove non-breaking spaces from the values, drop rows marked
# 'b.d.', convert to integers, remove three named entries and scale by
# 1_000_000*USD.
pkb = (
    pd
    .read_html(PKB)[3]
    .rename(columns={'Państwo':'kraj', '2021 r.':'pkb'})
    .loc[:, ['kraj', 'pkb']]
    # The keys are regular expressions (regex=True), so the dots in 'b.d.'
    # are escaped to match the literal marker instead of any character.
    .replace({'pkb': {'\xa0': '', r'b\.d\.': pd.NA}}, regex=True)
    .dropna(how='any', axis='rows')
    .astype({'kraj': 'str', 'pkb': 'int64'})
    .convert_dtypes()
    .set_index('kraj', drop=True)
    # Raises KeyError if any of these labels is absent from the table.
    .drop(['Strefa euro', 'Wyspy Kokosowe', 'Palestyna'], axis='rows')
    .mul(1_000_000*USD)
)

# %%

# Population table indexed by country ('kraj').
ludnosc = (
    pd
    .read_html(LUDNOSC)[0]
    # Drop the outermost level of the column header.
    .droplevel(0, axis='columns')
    .rename(columns={'Państwo, obszar lub terytorium zależne':'kraj', '2022':'ludnosc'})
    .loc[:, ['kraj', 'ludnosc']]
    # Regex clean-up of the value column: remove non-breaking spaces, plain
    # spaces and the literal '[3]' marker; cells containing '–' become NA.
    .replace({'ludnosc': {'\xa0': '', ' ': '', '–': pd.NA, r'\[3\]': ''}}, regex=True)
    # Exact-value rename applied across the whole frame.
    # NOTE(review): presumably to match the GDP table's spelling - confirm.
    .replace({'Chińska Republika Ludowa':'Chiny'})
    .dropna(how='any', axis='rows')
    .astype({'kraj': 'str', 'ludnosc': 'int64'})
    .convert_dtypes()
    .set_index('kraj', drop=True)
)

# %%

# Diagnostic: GDP rows whose country label is missing from the population
# table's index. The result is not stored.
# NOTE(review): presumably meant to be inspected in an interactive cell - confirm.
pkb.query('index not in @ludnosc.index')

# %%

# Place GDP and population side by side for countries present in both
# tables, derive per-capita GDP rounded to two decimals and order the rows
# from highest to lowest per-capita value.
result = (
    pd
    .concat([pkb, ludnosc], axis=1, join='inner')
    .assign(per_capita=lambda frame: frame['pkb'].div(frame['ludnosc']))
    .round({'per_capita': 2})
    .convert_dtypes()
    .sort_values('per_capita', ascending=False)
)

# %%

# Bar chart of the first ten rows of `result`, which is already ordered from
# highest to lowest per-capita value.
plot_top10 = (
    result[['per_capita']]
    .head(10)
    .plot(
        kind='bar',
        color='green',
        grid=False,
        figsize=(15,10),
        title='Top 10 countries by GDP per capita',
        xlabel='Country',
        ylabel='GDP per capita [USD]',
    )
)

plt.tight_layout()
plt.show()
../../_images/html-gdp-3-top10.png

Figure 7.14. Top 10 countries with the highest Gross Domestic Product per capita