7.2. Case Study CSV COVID19

7.2.1. Case Study - 0x01

# %% Imports
from datetime import date
import pandas as pd
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday
from pandas.tseries.holiday import EasterMonday, Easter
from pandas.tseries.offsets import Day
import matplotlib.pyplot as plt
import matplotlib.axes
from pathlib import Path


# %% Settings
# Console rendering limits for DataFrames; set_option accepts
# several (option, value) pairs in a single call.
pd.set_option(
    'display.width', 200,
    'display.max_columns', 15,
    'display.max_rows', 100,
)


# %% Data Sources
# CONFIRMED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# DEATHS = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
# RECOVERED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

# CONFIRMED = 'https://python3.info/_static/covid19-confirmed.csv'
# DEATHS = 'https://python3.info/_static/covid19-deaths.csv'
# RECOVERED = 'https://python3.info/_static/covid19-recovered.csv'

# Local copies of the datasets: a ``data`` directory located one level
# above the directory that contains this file.
DATA = Path(__file__).resolve().parent.joinpath('..', 'data')
CONFIRMED = DATA.joinpath('covid19-confirmed.csv')
DEATHS = DATA.joinpath('covid19-deaths.csv')
RECOVERED = DATA.joinpath('covid19-recovered.csv')


# %% Data Frames
# Read the three CSV files (in this order) into module-level frames
confirmed, deaths, recovered = (
    pd.read_csv(source)
    for source in (CONFIRMED, DEATHS, RECOVERED))


# %% Get Country from DataFrame
def covid19(country: str = None) -> pd.DataFrame:
    """
    Build a date-indexed frame with Confirmed, Deaths and Recovered totals.

    Each of the module-level tables ``confirmed``, ``deaths`` and
    ``recovered`` is reduced to one series by summing all matching rows.
    When ``country`` is None no filtering is done and every row is summed.

    :param country: value to match in the ``Country/Region`` column,
        or None to sum all rows
    :return: DataFrame with columns ``Confirmed``, ``Deaths``, ``Recovered``

    >>> covid19('Poland').loc['2022-01-01']
    Confirmed    4120248
    Deaths         97559
    Recovered          0
    Name: 2022-01-01 00:00:00, dtype: int64

    >>> covid19('France').loc['2022-01-01']
    Confirmed    10296909
    Deaths         124839
    Recovered           0
    Name: 2022-01-01 00:00:00, dtype: int64

    >>> covid19().loc['2022-01-01']
    Confirmed    289931319
    Deaths         5473487
    Recovered            0
    Name: 2022-01-01 00:00:00, dtype: int64
    """
    def _totals(table: pd.DataFrame) -> pd.Series:
        """Sum the selected rows of one table into a per-date int64 series."""
        if country is not None:
            table = table[table['Country/Region'] == country]
        # After transposing, the first four rows are skipped - presumably
        # the non-date columns (province, country, lat, long); TODO confirm
        per_date = table.transpose().iloc[4:]
        totals = per_date.sum(axis='columns').astype('int64')
        # Turn the string labels into timestamps
        return totals.rename(pd.to_datetime, axis='index')

    sources = {
        'Confirmed': confirmed,
        'Deaths': deaths,
        'Recovered': recovered,
    }
    return pd.DataFrame({
        label: _totals(table)
        for label, table in sources.items()})


# %% Calendars
class PLHolidayCalendar(AbstractHolidayCalendar):
    """
    Holiday calendar for Poland.

    Rules follow https://en.wikipedia.org/wiki/Public_holidays_in_Poland
    Movable feasts are expressed as January 1st shifted by the ``Easter``
    offset, plus a fixed number of days where needed.
    """
    rules = [
        # -- January
        Holiday(name='New Years Day', month=1, day=1),
        Holiday(name='Epiphany', month=1, day=6),
        # -- Easter Sunday and Monday
        Holiday(name='Easter', month=1, day=1, offset=[Easter()]),
        EasterMonday,
        # -- May
        Holiday(name='May Day', month=5, day=1),
        Holiday(name='Constitution Day', month=5, day=3),
        # -- counted in days after Easter Sunday
        Holiday(name='Pentecost Sunday', month=1, day=1,
                offset=[Easter(), Day(49)]),
        Holiday(name='Corpus Christi', month=1, day=1,
                offset=[Easter(), Day(60)]),
        # -- fixed dates in the second half of the year
        Holiday(name='Assumption of the Blessed Virgin Mary', month=8, day=15),
        Holiday(name='All Saints Day', month=11, day=1),
        Holiday(name='Independence Day', month=11, day=11),
        Holiday(name='Christmas Day', month=12, day=25),
        Holiday(name='Second Day of Christmastide', month=12, day=26),
    ]


# %% Show trendline
def plot_trendline(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Draw ``Confirmed`` and ``Deaths`` as two line charts, one above the other.

    :param data: frame containing ``Confirmed`` and ``Deaths`` columns
    :return: the result of ``DataFrame.plot`` called with ``subplots=True``
    """
    columns = ['Confirmed', 'Deaths']
    selected = data.loc[:, columns]
    return selected.plot(
        kind='line',
        subplots=True,
        layout=(2, 1),
        figsize=(10, 10))


# %% Show fatalities
def plot_fatalities(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Plot ``Deaths`` as a percentage of ``Confirmed``, rounded to 2 decimals.

    NOTE(review): the chart title mentions "new cases in last two weeks",
    while the ratio is computed for every row of ``data`` as passed in -
    confirm the intended wording.

    :param data: frame containing ``Deaths`` and ``Confirmed`` columns
    :return: the result of ``Series.plot``
    """
    ratio = data['Deaths'] / data['Confirmed']
    percent = (ratio * 100).round(2)
    return percent.plot(
        kind='line',
        title='Percent of deaths vs. new cases in last two weeks',
        xlabel='Day',
        ylabel='Percent',
        ylim=(0.0, 6.0),
        figsize=(10, 10),
        grid=True)


# %% Confirmed cases day-by-day difference
def plot_confirmed_daily(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """Plot the row-to-row difference of the ``Confirmed`` column."""
    daily_change = data['Confirmed'].diff()
    return daily_change.plot()


# %% Covid infection waves
def plot_confirmed_waves(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """Plot the weekly median of the row-to-row change in ``Confirmed``."""
    daily_change = data['Confirmed'].diff()
    weekly_median = daily_change.resample('W').median()
    return weekly_median.plot()


# %% Confirmed cases in last two weeks
def plot_confirmed_last(data: pd.DataFrame, freq='2W') -> matplotlib.axes.Axes:
    """
    Plot the row-to-row change of ``Confirmed`` for the trailing period.

    ``Series.last`` is deprecated since pandas 2.1, so the trailing window
    is selected with an explicit mask that applies the same rule: keep the
    rows dated later than "last index value minus ``freq``".

    :param data: frame with a ``Confirmed`` column and a sorted DatetimeIndex
    :param freq: offset alias (or offset object) giving the window length
    :return: the result of ``Series.plot``
    """
    from pandas.tseries.frequencies import to_offset

    confirmed = data['Confirmed']
    if not confirmed.empty:
        # Same cut-off as Series.last(freq) used to compute
        cutoff = confirmed.index[-1] - to_offset(freq)
        confirmed = confirmed.loc[confirmed.index > cutoff]
    return confirmed.diff().plot()


# %% Confirmed cases every month
def plot_confirmed_monthly(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Plot the number of newly confirmed cases in each calendar month.

    The sibling plots (daily, waves, last) treat ``Confirmed`` as a running
    total and difference it first; summing the raw running total per month
    would add up the same cases many times, so the per-row change is summed
    here instead.

    Month-end bins are requested with an offset object instead of the
    ``'M'`` alias, which pandas 2.2 deprecated in favour of ``'ME'``; the
    offset object works on both older and newer versions.

    :param data: frame with a ``Confirmed`` column and a DatetimeIndex
    :return: the result of ``Series.plot``
    """
    new_cases = data['Confirmed'].diff()
    monthly = new_cases.resample(pd.offsets.MonthEnd()).sum()
    return monthly.plot()


# %%
def plot_confirmed_after_holidays(
        data: pd.DataFrame,
        since: date | str | None = '2021-01-01',
        until: date | str | None = '2022-02-07',
        days: int = 14,
        calendar: AbstractHolidayCalendar = PLHolidayCalendar(),
    ) -> matplotlib.axes.Axes:
    """
    Plot the change in confirmed cases in the days following each holiday.

    For every holiday the calendar reports between ``since`` and ``until``,
    the first ``days`` rows of ``Confirmed`` starting at the holiday are
    taken, re-indexed from zero so that all holidays share one x axis, and
    differenced. Each holiday gets its own subplot.

    The subplot grid is sized from the number of holidays found, so date
    ranges other than the default one (which yields 15 holidays with the
    default calendar) can be plotted as well.

    :param data: frame with a ``Confirmed`` column and a DatetimeIndex
    :param since: start of the range searched for holidays
    :param until: end of the range searched for holidays
    :param days: number of rows taken starting at each holiday
    :param calendar: holiday calendar used to find the holidays
    :return: the result of ``DataFrame.plot`` called with ``subplots=True``
    """
    confirmed = data['Confirmed']

    # One column per holiday, labelled with its ISO date
    windows = {
        holiday.strftime('%Y-%m-%d'): (confirmed
                                       .loc[holiday:]
                                       .iloc[:days]
                                       .reset_index(drop=True))
        for holiday in calendar.holidays(since, until)}
    count = len(windows)

    return pd.DataFrame(windows).diff().plot(
        kind='line',
        subplots=True,
        layout=(count, 1),   # one row per holiday
        sharex=True,
        figsize=(5, count),  # one inch of height per subplot
        grid=True)


# %% Main
if __name__ == '__main__':
    # Per-country frames, plus the total over all rows
    poland, usa, france, china = map(
        covid19, ('Poland', 'US', 'France', 'China'))
    world = covid19()

    # Period under study
    data = poland.loc['2020-01-01':'2022-02-01']

    # Build every figure in turn; enable plt.show() to display each one
    for draw in (
            plot_trendline,
            plot_fatalities,
            plot_confirmed_daily,
            plot_confirmed_waves,
            plot_confirmed_last,
            plot_confirmed_monthly,
            plot_confirmed_after_holidays):
        draw(data)
        # plt.show()


"""TODO:

# Resample
poland['Confirmed'].shift(periods=1, freq='D').plot(kind='line')


# Broken down by quarter (Q)
plot = poland['Confirmed'].resample('Q').plot(kind='line', legend=True)
plot[0].name = '2020-Q1'
plot[1].name = '2020-Q2'
plot[2].name = '2020-Q3'
plot[3].name = '2020-Q4'
plot[4].name = '2021-Q1'
plot[5].name = '2021-Q2'
plot[6].name = '2021-Q3'
plot[7].name = '2021-Q4'
plot[8].name = '2022-Q1'
plt.show()


# Macro trends
poland['Confirmed'].diff().rolling(window=14).median().plot()
plt.show()

poland['Confirmed'].diff().rolling(window=14).median().plot()
plt.show()

poland['Confirmed'].diff().resample('W').median().plot()
plt.show()

poland['Confirmed'].diff().resample('M').median().plot()
plt.show()

poland['Confirmed'].diff().resample('Q').median().plot()
plt.show()
"""
../../_images/covid19-poland-confirmed-daily.png

Figure 7.4. Daily confirmed cases during the COVID-19 pandemic in Poland.

../../_images/covid19-poland-confirmed-holidays.png

Figure 7.5. Confirmed cases after public holidays during the COVID-19 pandemic in Poland.

../../_images/covid19-poland-confirmed-last.png

Figure 7.6. Confirmed cases in the most recent period of the COVID-19 pandemic in Poland.

../../_images/covid19-poland-confirmed-monthly.png

Figure 7.7. Monthly confirmed cases during the COVID-19 pandemic in Poland.

../../_images/covid19-poland-confirmed-waves.png

Figure 7.8. Waves of confirmed cases during the COVID-19 pandemic in Poland.

../../_images/covid19-poland-fatalities.png

Figure 7.9. Fatalities during the COVID-19 pandemic in Poland.

../../_images/covid19-poland-trendline.png

Figure 7.10. Trendline of the COVID-19 pandemic in Poland.

7.2.2. Case Study - 0x02

from datetime import date
from pathlib import Path
import pandas as pd
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday, EasterMonday, Easter
from pandas.tseries.offsets import Day

# "Procent" is Polish for "percent". Unit multiplier applied on top of the
# factor of 100 in procent_smiertelnosci().
PROCENT = 1


# %%

# CONFIRMED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# DEATHS = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
# RECOVERED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

# CONFIRMED = 'https://python3.info/_static/covid19-confirmed.csv'
# DEATHS = 'https://python3.info/_static/covid19-deaths.csv'
# RECOVERED = 'https://python3.info/_static/covid19-recovered.csv'

# Local copies of the datasets: a ``data`` directory located one level
# above the directory that contains this file.
DATA = Path(__file__).resolve().parent.joinpath('..', 'data')
CONFIRMED = DATA.joinpath('covid19-confirmed.csv')
DEATHS = DATA.joinpath('covid19-deaths.csv')
RECOVERED = DATA.joinpath('covid19-recovered.csv')

# %%

# Read each CSV (in this order) and let pandas pick the best-fitting dtypes
confirmed, recovered, deaths = (
    pd.read_csv(source).convert_dtypes()
    for source in (CONFIRMED, RECOVERED, DEATHS))

# %%

class PLHolidayCalendar(AbstractHolidayCalendar):
    """
    Holiday calendar for Poland.

    Rules follow https://en.wikipedia.org/wiki/Public_holidays_in_Poland
    Movable feasts are expressed as January 1st shifted by the ``Easter``
    offset, plus a fixed number of days where needed.

    >>> PLHolidayCalendar().holidays(start='2000-01-01', end='2000-12-31')
    DatetimeIndex(['2000-01-01', '2000-01-06', '2000-04-23', '2000-04-24',
                   '2000-05-01', '2000-05-03', '2000-06-11', '2000-06-22',
                   '2000-08-15', '2000-11-01', '2000-11-11', '2000-12-25',
                   '2000-12-26'],
                  dtype='datetime64[ns]', freq=None)
    """
    rules = [
        # -- January
        Holiday(name='New Years Day', month=1, day=1),
        Holiday(name='Epiphany', month=1, day=6),
        # -- Easter Sunday and Monday
        Holiday(name='Easter', month=1, day=1, offset=[Easter()]),
        EasterMonday,
        # -- May
        Holiday(name='May Day', month=5, day=1),
        Holiday(name='Constitution Day', month=5, day=3),
        # -- counted in days after Easter Sunday
        Holiday(name='Pentecost Sunday', month=1, day=1,
                offset=[Easter(), Day(49)]),
        Holiday(name='Corpus Christi', month=1, day=1,
                offset=[Easter(), Day(60)]),
        # -- fixed dates in the second half of the year
        Holiday(name='Assumption of the Blessed Virgin Mary', month=8, day=15),
        Holiday(name='All Saints Day', month=11, day=1),
        Holiday(name='Independence Day', month=11, day=11),
        Holiday(name='Christmas Day', month=12, day=25),
        Holiday(name='Second Day of Christmastide', month=12, day=26),
    ]


# %%

def _parse(data, country, name):
    """
    >>> _parse(confirmed, 'Poland', name='confirmed').loc['2021-08-04']
    confirmed    2883448
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> _parse(confirmed, 'Poland', name='confirmed').loc['2021-08-05']
    confirmed    2883624
    Name: 2021-08-05 00:00:00, dtype: int64

    >>> _parse(recovered, 'Poland', name='recovered').loc['2021-08-04']
    recovered    2653981
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> _parse(recovered, 'Poland', name='recovered').loc['2021-08-05']
    recovered    0
    Name: 2021-08-05 00:00:00, dtype: int64

    >>> _parse(deaths, 'Poland', name='deaths').loc['2021-08-04']
    deaths    75269
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> _parse(deaths, 'Poland', name='deaths').loc['2021-08-05']
    deaths    75275
    Name: 2021-08-05 00:00:00, dtype: int64
    """
    if country is not None:
        query = data['Country/Region'] == country
        data = data.loc[query]

    return (
        data
        .transpose()
        .iloc[4:]
        .sum(axis='columns')
        .astype('int')
        .to_frame()
        .rename(lambda x: name, axis='columns')
        .rename(pd.to_datetime, axis='index'))


# %%
def get(country=None):
    """
    Combine confirmed, recovered and deaths totals into one frame.

    Each module-level table is passed through ``_parse`` and the three
    resulting one-column frames are joined side by side on the date index.

    :param country: value to match in ``Country/Region``, or None for all rows
    :return: DataFrame with columns ``confirmed``, ``recovered``, ``deaths``

    >>> get('Poland').loc['2021-08-04']
    confirmed    2883448
    recovered    2653981
    deaths         75269
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> get('Poland').loc['2021-08-05']
    confirmed    2883624
    recovered          0
    deaths         75275
    Name: 2021-08-05 00:00:00, dtype: int64

    >>> get('United Kingdom').loc['2021-08-04']
    confirmed    5980830
    recovered      24693
    deaths        157209
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> get('United Kingdom').loc['2021-08-05']
    confirmed    6010860
    recovered          0
    deaths        157314
    Name: 2021-08-05 00:00:00, dtype: int64

    >>> get().loc['2021-08-04']
    confirmed    200758580
    recovered    130899061
    deaths         4283131
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> get().loc['2021-08-05']
    confirmed    201444202
    recovered            0
    deaths         4294122
    Name: 2021-08-05 00:00:00, dtype: int64
    """
    sources = (
        ('confirmed', confirmed),
        ('recovered', recovered),
        ('deaths', deaths),
    )
    parts = [_parse(table, country, name=label) for label, table in sources]
    return pd.concat(parts, axis='columns')


# %%

# Per-country frames
poland, germany, india = map(get, ('Poland', 'Germany', 'India'))
uk, france, china = map(get, ('United Kingdom', 'France', 'China'))

# Sum over every row of the source tables
world = get()


# %%

def liczba_potwierdzonych_oraz_smierci_w_tygodniowych_okresach():
    """
    Return worldwide confirmed cases and deaths per weekly period.

    (Polish name: "number of confirmed and deaths in weekly periods".)

    ``world`` holds running totals, so adding the raw values within a week
    would count the same cases repeatedly. The per-day increments are
    computed first and those are summed for each week. The first row has no
    predecessor and keeps its own running total.

    :return: DataFrame with ``confirmed`` and ``deaths`` columns (int64),
        one row per week
    """
    cumulative = world.loc[:, ['confirmed', 'deaths']]
    daily_new = cumulative.diff().fillna(cumulative)
    return daily_new.resample('W').sum().astype('int64')

def liczba_zachorowan_na_jeden_przypadek_smiertelny():
    """
    Return worldwide confirmed cases divided by deaths, for every date.

    (Polish name: "number of infections per one fatal case".)
    """
    return world['confirmed'].div(world['deaths'])

def procent_smiertelnosci():
    """
    Return worldwide deaths as a percentage of confirmed cases, per date.

    (Polish name: "mortality percentage".)
    """
    share = world['deaths'] / world['confirmed']
    return share * 100 * PROCENT

def get_holidays(year: int, calendar: AbstractHolidayCalendar) -> pd.DatetimeIndex:
    """Return the holidays of ``calendar`` between Jan 1 and Dec 31 of ``year``."""
    first_day, last_day = date(year, 1, 1), date(year, 12, 31)
    return calendar.holidays(start=first_day, end=last_day)

def liczba_zachorowan_po_swietach(year, calendar=PLHolidayCalendar()):
    """
    Return day-over-day confirmed cases in Poland after each past holiday.

    (Polish name: "number of infections after holidays".)

    Holidays of ``year`` that fall before the current moment are taken
    from ``calendar``. For each one, the rows of ``poland['confirmed']``
    starting at the holiday are differenced and re-indexed from zero, so
    every column shares the same "days since holiday" index.

    :param year: calendar year whose holidays are inspected
    :param calendar: holiday calendar used to list the holidays
    :return: DataFrame with one column per holiday, labelled by ISO date

    >>> data = liczba_zachorowan_po_swietach(year=2022)
    >>> plot = data.plot(
    ...    kind='line',
    ...    subplots=True,
    ...    sharey=True,
    ...    sharex=True,
    ...    grid=True,
    ...    figsize=(10,20))
    >>> # plt.show()
    """
    today = pd.Timestamp('today')
    holidays = get_holidays(year, calendar)
    past_holidays = holidays[holidays < today]

    def days_after_holiday(holiday, days=10):
        """Return the daily increments for ``days`` rows from ``holiday``."""
        return (poland
                .loc[holiday:, 'confirmed']
                .iloc[:days]
                .diff()
                .reset_index(drop=True)
                .iloc[1:]            # first difference is always NaN
                .astype('int64'))

    return pd.DataFrame({
        format(holiday, '%Y-%m-%d'): days_after_holiday(holiday)
        for holiday in past_holidays})

7.2.3. Case Study - 0x03

from pathlib import Path
from doctest import testmod as run_tests
import pandas as pd
from matplotlib import pyplot as plt

# %%

# Console rendering limits; set_option accepts several
# (option, value) pairs in a single call.
pd.set_option(
    'display.width', 200,
    'display.max_columns', 8,
    'display.max_rows', 100,
    'display.min_rows', 100,
    'display.max_seq_items', 100,
)

# %%

# CONFIRMED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# DEATHS = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
# RECOVERED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

# CONFIRMED = 'https://python3.info/_static/covid19-confirmed.csv'
# DEATHS = 'https://python3.info/_static/covid19-deaths.csv'
# RECOVERED = 'https://python3.info/_static/covid19-recovered.csv'

# Local copies of the datasets: a ``data`` directory located one level
# above the directory that contains this file.
DATA = Path(__file__).resolve().parent.joinpath('..', 'data')
CONFIRMED = DATA.joinpath('covid19-confirmed.csv')
DEATHS = DATA.joinpath('covid19-deaths.csv')
RECOVERED = DATA.joinpath('covid19-recovered.csv')

# %%

# Shorter names for the two descriptive columns of the source files
COLUMNS = {
    'Province/State': 'region',
    'Country/Region': 'country',
}

# Read the three CSV files (in this order) and apply the renaming to each
confirmed, deaths, recovered = (
    pd.read_csv(source).rename(columns=COLUMNS)
    for source in (CONFIRMED, DEATHS, RECOVERED))

# %%
def _get(df: pd.DataFrame, country: str, name: str) -> pd.Series:
    """
    >>> _get(confirmed, 'Poland', 'confirmed').loc['2021-01-01']
    np.int64(1305774)
    >>> _get(deaths, 'Poland', 'deaths').loc['2021-01-01']
    np.int64(28956)
    >>> _get(recovered, 'Poland', 'recovered').loc['2021-01-01']
    np.int64(1046281)
    """
    if country is not None:
        df = df.query('country == @country')
    return (df
        .transpose()
        .iloc[4:]
        .sum(axis='columns')
        .rename(name)
        .rename(index=pd.to_datetime)
        .astype('int64')
        .convert_dtypes())

def covid19(country: str = None) -> pd.DataFrame:
    """
    Join confirmed, deaths and recovered totals into one frame.

    Each module-level table is reduced by ``_get`` and the three series are
    placed side by side. ``country=None`` sums every row of each table.

    >>> covid19('Poland').loc['2021-01-01']
    confirmed    1305774
    deaths         28956
    recovered    1046281
    Name: 2021-01-01 00:00:00, dtype: Int64

    >>> covid19('US').loc['2021-01-01']
    confirmed    20397400
    deaths         352804
    recovered           0
    Name: 2021-01-01 00:00:00, dtype: Int64

    >>> covid19('China').loc['2021-01-01']
    confirmed    102649
    deaths         4884
    recovered     90031
    Name: 2021-01-01 00:00:00, dtype: Int64
    """
    sources = (
        ('confirmed', confirmed),
        ('deaths', deaths),
        ('recovered', recovered),
    )
    series = [_get(table, country, name=label) for label, table in sources]
    return pd.concat(series, axis='columns')


# %%
# Per-country frames
pl, us, india, china, france = map(
    covid19, ('Poland', 'US', 'India', 'China', 'France'))

# Sum over every row of the source tables
world = covid19()


# %%

# Running total of confirmed cases in Poland
data = pl.loc[:, 'confirmed']
plot_confirmed_total = data.plot(
    title='Total confirmed cases in Poland',
    kind='line',
    label='Confirmed',
    ylabel='Total confirmed cases',
    xlabel='Date',
)

plt.tight_layout()
# plt.show()
# %%

# Day-over-day change of the confirmed total in Poland
data = pl.loc[:, 'confirmed'].diff()
plot_confirmed_daily = data.plot(
    title='Daily confirmed cases in Poland',
    kind='line',
    label='Confirmed',
    ylabel='Daily confirmed cases',
    xlabel='Date',
)

plt.tight_layout()
# plt.show()

# %%

def mortality(df: pd.DataFrame, since='2020-04-01', until=None) -> pd.Series:
    """
    Return deaths as a percentage of confirmed cases for a date range.

    :param df: frame with ``deaths`` and ``confirmed`` columns
    :param since: first index label to include, or None for no lower bound
    :param until: last index label to include, or None for no upper bound
    :return: percentage per date, with missing values removed
    """
    ratio = df['deaths'] / df['confirmed']
    selected = ratio.loc[since:until]
    return (selected * 100).dropna()

# Mortality curve for Poland with its mean drawn as a horizontal line
data = mortality(pl)
plot_mortality = data.plot(
    title='Mortality in Poland',
    kind='line',
    label='Mortality',
    xlabel='date',
    ylabel='mortality [%]',
)
plt.hlines(
    data.mean(),
    xmin=data.index.min(),
    xmax=data.index.max(),
    color='red',
    label='Mean',
)
plt.legend()
plt.tight_layout()
# plt.show()
../../_images/covid19-c-poland-confirmed-total.png
../../_images/covid19-c-poland-confirmed-daily.png
../../_images/covid19-c-poland-mortality.png