7.2. Case Study CSV COVID19
Data Source: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
7.2.1. Case Study - 0x01
# %% Imports
from datetime import date
import pandas as pd
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday
from pandas.tseries.holiday import EasterMonday, Easter
from pandas.tseries.offsets import Day
import matplotlib.pyplot as plt
import matplotlib.axes
from pathlib import Path
# %% Settings
# Console display limits used when frames are printed.
pd.options.display.width = 200
pd.options.display.max_columns = 15
pd.options.display.max_rows = 100
# %% Data Sources
# CONFIRMED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# DEATHS = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
# RECOVERED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
# CONFIRMED = 'https://python3.info/_static/covid19-confirmed.csv'
# DEATHS = 'https://python3.info/_static/covid19-deaths.csv'
# RECOVERED = 'https://python3.info/_static/covid19-recovered.csv'
# Local copies of the CSV files live in ``../data`` relative to this script.
DATA = Path(__file__).resolve().parent.joinpath('..', 'data')
CONFIRMED = DATA.joinpath('covid19-confirmed.csv')
DEATHS = DATA.joinpath('covid19-deaths.csv')
RECOVERED = DATA.joinpath('covid19-recovered.csv')
# %% Data Frames
# Load the three raw tables (one row per region, one column per day).
confirmed, deaths, recovered = (
    pd.read_csv(path) for path in (CONFIRMED, DEATHS, RECOVERED))
# %% Get Country from DataFrame
def covid19(country: str = None) -> pd.DataFrame:
    """
    Build a per-day table of Confirmed, Deaths and Recovered counts.

    The counts come from the module-level ``confirmed``, ``deaths`` and
    ``recovered`` frames. When ``country`` is given, only rows whose
    ``Country/Region`` equals it are used; when it is ``None`` all rows
    are summed.

    >>> covid19('Poland').loc['2022-01-01']
    Confirmed    4120248
    Deaths         97559
    Recovered          0
    Name: 2022-01-01 00:00:00, dtype: int64

    >>> covid19('France').loc['2022-01-01']
    Confirmed    10296909
    Deaths         124839
    Recovered           0
    Name: 2022-01-01 00:00:00, dtype: int64

    >>> covid19().loc['2022-01-01']
    Confirmed    289931319
    Deaths         5473487
    Recovered            0
    Name: 2022-01-01 00:00:00, dtype: int64
    """

    def _get(data: pd.DataFrame, country: str = None) -> pd.Series:
        """
        Collapse one raw table into a date-indexed series of totals.

        After transposing, the first four rows (the non-date columns of
        the raw table) are skipped and the remaining rows are summed
        across all selected regions.
        """
        if country is not None:
            data = data.loc[data['Country/Region'] == country]
        by_day = data.transpose().iloc[4:]
        totals = by_day.sum(axis='columns').astype('int64')
        # Column labels are date strings; parse each into a Timestamp.
        return totals.rename(index=pd.to_datetime)

    sources = {
        'Confirmed': confirmed,
        'Deaths': deaths,
        'Recovered': recovered,
    }
    return pd.DataFrame({
        label: _get(frame, country)
        for label, frame in sources.items()})
# %% Calendars
class PLHolidayCalendar(AbstractHolidayCalendar):
    """
    Public holidays in Poland, expressed as pandas holiday rules.

    Based on https://en.wikipedia.org/wiki/Public_holidays_in_Poland

    Movable feasts are anchored on January 1st and shifted with the
    ``Easter`` offset (plus a fixed number of days where needed).
    """
    rules = [
        Holiday(name='New Years Day', month=1, day=1),
        Holiday(name='Epiphany', month=1, day=6),
        # Movable: Easter Sunday and the Monday after it.
        Holiday(name='Easter', month=1, day=1, offset=[Easter()]),
        EasterMonday,
        Holiday(name='May Day', month=5, day=1),
        Holiday(name='Constitution Day', month=5, day=3),
        # Movable: 49 and 60 days after Easter Sunday.
        Holiday(name='Pentecost Sunday', month=1, day=1,
                offset=[Easter(), Day(49)]),
        Holiday(name='Corpus Christi', month=1, day=1,
                offset=[Easter(), Day(60)]),
        Holiday(name='Assumption of the Blessed Virgin Mary',
                month=8, day=15),
        Holiday(name='All Saints Day', month=11, day=1),
        Holiday(name='Independence Day', month=11, day=11),
        Holiday(name='Christmas Day', month=12, day=25),
        Holiday(name='Second Day of Christmastide', month=12, day=26),
    ]
# %% Show trendline
def plot_trendline(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Draw the Confirmed and Deaths columns as two stacked line charts.

    :param data: frame with ``Confirmed`` and ``Deaths`` columns
    :return: whatever ``DataFrame.plot`` returns for ``subplots=True``
    """
    totals = data[['Confirmed', 'Deaths']]
    return totals.plot(
        kind='line',
        subplots=True,
        layout=(2, 1),
        figsize=(10, 10))
# %% Show fatalities
def plot_fatalities(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Plot Deaths as a percentage of Confirmed for every row of ``data``.

    :param data: frame with ``Deaths`` and ``Confirmed`` columns
    :return: the matplotlib axes the line was drawn on
    """
    ratio = data['Deaths'].div(data['Confirmed'])
    percent = (ratio * 100).round(2)  # fraction -> percent, 2 decimals
    return percent.plot(
        kind='line',
        title='Percent of deaths vs. new cases in last two weeks',
        xlabel='Day',
        ylabel='Percent',
        ylim=(0.0, 6.0),
        figsize=(10, 10),
        grid=True)
# %% Confirmed cases day-by-day difference
def plot_confirmed_daily(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """Plot the row-to-row difference of the ``Confirmed`` column."""
    daily_change = data['Confirmed'].diff()
    return daily_change.plot()
# %% Covid infection waves
def plot_confirmed_waves(data:pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Plot the weekly median of the daily change in ``Confirmed``.

    Taking the median per ``'W'`` bin smooths day-to-day noise so that
    successive waves stand out.
    """
    daily_change = data['Confirmed'].diff()
    weekly_median = daily_change.resample('W').median()
    return weekly_median.plot()
# %% Confirmed cases in last two weeks
def plot_confirmed_last(data: pd.DataFrame, freq='2W') -> matplotlib.axes.Axes:
    """
    Plot the daily change in ``Confirmed`` over the trailing period.

    The trailing period is every row whose timestamp is later than
    ``last timestamp - to_offset(freq)``. This is the selection that
    ``Series.last(freq)`` used to perform; that method is deprecated
    since pandas 2.1, so the window is selected explicitly here.

    :param data: date-indexed frame with a ``Confirmed`` column
    :param freq: offset alias describing the length of the period
    :return: the matplotlib axes the line was drawn on
    """
    confirmed = data['Confirmed']
    if confirmed.empty:
        # Nothing to trim; mirror ``Series.last`` which returned the
        # empty series unchanged.
        recent = confirmed
    else:
        cutoff = confirmed.index[-1] - pd.tseries.frequencies.to_offset(freq)
        recent = confirmed.loc[confirmed.index > cutoff]
    return recent.diff().plot()
# %% Confirmed cases every month
def plot_confirmed_monthly(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Plot the number of newly confirmed cases in every calendar month.

    ``Confirmed`` holds running totals (the daily plots in this file
    take ``.diff()`` of it), so the totals are first turned into daily
    increments; summing the running totals themselves would count the
    same cases once per day. The first row has no predecessor and
    therefore contributes nothing.

    :param data: date-indexed frame with a ``Confirmed`` column
    :return: the matplotlib axes the line was drawn on
    """
    daily_new = data['Confirmed'].diff()
    # An offset object is used instead of the 'M' alias, which newer
    # pandas releases deprecate in favour of 'ME'.
    monthly_new = daily_new.resample(pd.offsets.MonthEnd()).sum()
    return monthly_new.plot()
# %%
def plot_confirmed_after_holidays(
        data: pd.DataFrame,
        since: date | str | None = '2021-01-01',
        until: date | str | None = '2022-02-07',
        days: int = 14,
        calendar: AbstractHolidayCalendar = PLHolidayCalendar(),
) -> matplotlib.axes.Axes:
    """
    Plot the daily change in confirmed cases following each holiday.

    For every holiday the calendar reports between ``since`` and
    ``until``, the first ``days`` rows of ``Confirmed`` starting at the
    holiday are taken, re-indexed from zero so the holidays line up,
    differenced, and drawn in a subplot of their own.

    :param data: date-indexed frame with a ``Confirmed`` column
    :param since: start of the period searched for holidays
    :param until: end of the period searched for holidays
    :param days: number of rows taken from each holiday onwards
    :param calendar: holiday calendar supplying the dates
    :return: the result of ``DataFrame.plot`` with ``subplots=True``
        (one axes per holiday)
    """
    def _window(start) -> pd.Series:
        # Positional index so every holiday shares the same x axis.
        return (data
                .loc[start:, 'Confirmed']
                .iloc[:days]
                .reset_index(drop=True))

    windows = {holiday.strftime('%Y-%m-%d'): _window(holiday)
               for holiday in calendar.holidays(since, until)}
    changes = pd.DataFrame(windows).diff()
    return changes.plot(
        kind='line',
        subplots=True,
        # One row per holiday; a fixed row count only fits one
        # particular since/until range.
        layout=(len(changes.columns), 1),
        sharex=True,
        figsize=(5, 15),
        grid=True)
# %% Main
if __name__ == '__main__':
    # Build every table up front; only Poland is charted below.
    poland, usa, france, china = (
        covid19(name) for name in ('Poland', 'US', 'France', 'China'))
    world = covid19()

    # Restrict the charts to the period covered by the case study.
    data = poland.loc['2020-01-01':'2022-02-01']

    # Uncomment the plt.show() after a call to display that figure.
    plot_trendline(data)
    # plt.show()

    plot_fatalities(data)
    # plt.show()

    plot_confirmed_daily(data)
    # plt.show()

    plot_confirmed_waves(data)
    # plt.show()

    plot_confirmed_last(data)
    # plt.show()

    plot_confirmed_monthly(data)
    # plt.show()

    plot_confirmed_after_holidays(data)
    # plt.show()
"""TODO:
# Resample
poland['Confirmed'].shift(periods=1, freq='D').plot(kind='line')
# Broken down by quarter (Q)
plot = poland['Confirmed'].resample('Q').plot(kind='line', legend=True)
plot[0].name = '2020-Q1'
plot[1].name = '2020-Q2'
plot[2].name = '2020-Q3'
plot[3].name = '2020-Q4'
plot[4].name = '2021-Q1'
plot[5].name = '2021-Q2'
plot[6].name = '2021-Q3'
plot[7].name = '2021-Q4'
plot[8].name = '2022-Q1'
plt.show()
# Macro trends
poland['Confirmed'].diff().rolling(window=14).median().plot()
plt.show()
poland['Confirmed'].diff().rolling(window=14).median().plot()
plt.show()
poland['Confirmed'].diff().resample('W').median().plot()
plt.show()
poland['Confirmed'].diff().resample('M').median().plot()
plt.show()
poland['Confirmed'].diff().resample('Q').median().plot()
plt.show()
"""
7.2.2. Case Study - 0x02
from datetime import date
from pathlib import Path
import pandas as pd
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday, EasterMonday, Easter
from pandas.tseries.offsets import Day
# Unit multiplier: lets ``100*PROCENT`` in procent_smiertelnosci() read as "100 percent".
PROCENT = 1
# %%
# CONFIRMED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# DEATHS = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
# RECOVERED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
# CONFIRMED = 'https://python3.info/_static/covid19-confirmed.csv'
# DEATHS = 'https://python3.info/_static/covid19-deaths.csv'
# RECOVERED = 'https://python3.info/_static/covid19-recovered.csv'
# Local copies of the CSV files live in ``../data`` relative to this script.
DATA = Path(__file__).resolve().parent.joinpath('..', 'data')
CONFIRMED = DATA.joinpath('covid19-confirmed.csv')
DEATHS = DATA.joinpath('covid19-deaths.csv')
RECOVERED = DATA.joinpath('covid19-recovered.csv')
# %%
# Load the raw tables and let pandas pick its best-fitting dtypes.
confirmed, recovered, deaths = (
    pd.read_csv(path).convert_dtypes()
    for path in (CONFIRMED, RECOVERED, DEATHS))
# %%
class PLHolidayCalendar(AbstractHolidayCalendar):
    """
    Public holidays in Poland, expressed as pandas holiday rules.

    Based on https://en.wikipedia.org/wiki/Public_holidays_in_Poland

    Movable feasts are anchored on January 1st and shifted with the
    ``Easter`` offset (plus a fixed number of days where needed).

    >>> PLHolidayCalendar().holidays(start='2000-01-01', end='2000-12-31')
    DatetimeIndex(['2000-01-01', '2000-01-06', '2000-04-23', '2000-04-24',
                   '2000-05-01', '2000-05-03', '2000-06-11', '2000-06-22',
                   '2000-08-15', '2000-11-01', '2000-11-11', '2000-12-25',
                   '2000-12-26'],
                  dtype='datetime64[ns]', freq=None)
    """
    rules = [
        Holiday(name='New Years Day', month=1, day=1),
        Holiday(name='Epiphany', month=1, day=6),
        # Movable: Easter Sunday and the Monday after it.
        Holiday(name='Easter', month=1, day=1, offset=[Easter()]),
        EasterMonday,
        Holiday(name='May Day', month=5, day=1),
        Holiday(name='Constitution Day', month=5, day=3),
        # Movable: 49 and 60 days after Easter Sunday.
        Holiday(name='Pentecost Sunday', month=1, day=1,
                offset=[Easter(), Day(49)]),
        Holiday(name='Corpus Christi', month=1, day=1,
                offset=[Easter(), Day(60)]),
        Holiday(name='Assumption of the Blessed Virgin Mary',
                month=8, day=15),
        Holiday(name='All Saints Day', month=11, day=1),
        Holiday(name='Independence Day', month=11, day=11),
        Holiday(name='Christmas Day', month=12, day=25),
        Holiday(name='Second Day of Christmastide', month=12, day=26),
    ]
# %%
def _parse(data, country, name):
"""
>>> _parse(confirmed, 'Poland', name='confirmed').loc['2021-08-04']
confirmed 2883448
Name: 2021-08-04 00:00:00, dtype: int64
>>> _parse(confirmed, 'Poland', name='confirmed').loc['2021-08-05']
confirmed 2883624
Name: 2021-08-05 00:00:00, dtype: int64
>>> _parse(recovered, 'Poland', name='recovered').loc['2021-08-04']
recovered 2653981
Name: 2021-08-04 00:00:00, dtype: int64
>>> _parse(recovered, 'Poland', name='recovered').loc['2021-08-05']
recovered 0
Name: 2021-08-05 00:00:00, dtype: int64
>>> _parse(deaths, 'Poland', name='deaths').loc['2021-08-04']
deaths 75269
Name: 2021-08-04 00:00:00, dtype: int64
>>> _parse(deaths, 'Poland', name='deaths').loc['2021-08-05']
deaths 75275
Name: 2021-08-05 00:00:00, dtype: int64
"""
if country is not None:
query = data['Country/Region'] == country
data = data.loc[query]
return (
data
.transpose()
.iloc[4:]
.sum(axis='columns')
.astype('int')
.to_frame()
.rename(lambda x: name, axis='columns')
.rename(pd.to_datetime, axis='index'))
# %%
def get(country=None):
    """
    Return confirmed, recovered and deaths side by side for a country.

    Each column is produced by ``_parse`` from the matching module-level
    frame; ``country=None`` aggregates over every row.

    >>> get('Poland').loc['2021-08-04']
    confirmed    2883448
    recovered    2653981
    deaths         75269
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> get('Poland').loc['2021-08-05']
    confirmed    2883624
    recovered          0
    deaths         75275
    Name: 2021-08-05 00:00:00, dtype: int64

    >>> get('United Kingdom').loc['2021-08-04']
    confirmed    5980830
    recovered      24693
    deaths        157209
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> get('United Kingdom').loc['2021-08-05']
    confirmed    6010860
    recovered          0
    deaths        157314
    Name: 2021-08-05 00:00:00, dtype: int64

    >>> get().loc['2021-08-04']
    confirmed    200758580
    recovered    130899061
    deaths         4283131
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> get().loc['2021-08-05']
    confirmed    201444202
    recovered            0
    deaths         4294122
    Name: 2021-08-05 00:00:00, dtype: int64
    """
    sources = (
        ('confirmed', confirmed),
        ('recovered', recovered),
        ('deaths', deaths),
    )
    columns = [_parse(frame, country, name=label)
               for label, frame in sources]
    return pd.concat(columns, axis='columns')
# %%
# Per-country tables, plus the worldwide aggregate.
poland, germany, india, uk, france, china = map(
    get, ('Poland', 'Germany', 'India', 'United Kingdom', 'France', 'China'))
world = get()
# %%
def liczba_potwierdzonych_oraz_smierci_w_tygodniowych_okresach():
    """
    Return worldwide new confirmed cases and deaths per week.

    The ``world`` columns hold running totals, so they are first turned
    into daily increments; summing the running totals themselves would
    count the same cases once per day. The first row has no predecessor
    and therefore contributes nothing.

    :return: frame indexed by week (``'W'`` bins) with integer
        ``confirmed`` and ``deaths`` columns
    """
    daily_new = world.loc[:, ['confirmed', 'deaths']].diff()
    # ``diff`` yields floats; the weekly sums are whole numbers again.
    return daily_new.resample('W').sum().astype('int64')
def liczba_zachorowan_na_jeden_przypadek_smiertelny():
    """Return worldwide confirmed cases per single death, per day."""
    cases = world['confirmed']
    fatalities = world['deaths']
    return cases.div(fatalities)
def procent_smiertelnosci():
    """Return worldwide deaths as a percentage of confirmed cases, per day."""
    fraction = world['deaths'].div(world['confirmed'])
    return fraction.mul(100).mul(PROCENT)
def get_holidays(year: int, calendar: AbstractHolidayCalendar) -> pd.DatetimeIndex:
    """
    Return the holidays of ``calendar`` that fall within ``year``.

    :param year: calendar year to cover, January 1st to December 31st
    :param calendar: holiday calendar to query
    """
    first_day = date(year=year, month=1, day=1)
    last_day = date(year=year, month=12, day=31)
    return calendar.holidays(start=first_day, end=last_day)
def liczba_zachorowan_po_swietach(year, calendar=PLHolidayCalendar()):
    """
    Return the daily change in Poland's confirmed cases after holidays.

    Only holidays of ``year`` that are earlier than the current
    timestamp are included. Each column is named after a holiday date
    and holds the day-to-day differences over the rows that follow it.

    >>> data = liczba_zachorowan_po_swietach(year=2022)
    >>> plot = data.plot(
    ...     kind='line',
    ...     subplots=True,
    ...     sharey=True,
    ...     sharex=True,
    ...     grid=True,
    ...     figsize=(10,20))
    >>> # plt.show()
    """
    def days_after_holiday(holiday, days=10):
        # Take ``days`` rows from the holiday on, difference them, and
        # drop the first entry, which has no predecessor to diff against.
        window = poland.loc[holiday:, 'confirmed'].iloc[:days]
        changes = window.diff().reset_index(drop=True)
        return changes.iloc[1:].astype('int')

    now = pd.Timestamp('today')
    all_holidays = get_holidays(year, calendar)
    past_holidays = all_holidays[all_holidays < now]
    return pd.DataFrame({
        format(holiday, '%Y-%m-%d'): days_after_holiday(holiday)
        for holiday in past_holidays
    })
7.2.3. Case Study - 0x03
from pathlib import Path
from doctest import testmod as run_tests
import pandas as pd
from matplotlib import pyplot as plt
# %%
# Console display limits used when frames and sequences are printed.
pd.options.display.width = 200
pd.options.display.max_columns = 8
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100
pd.options.display.max_seq_items = 100
# %%
# CONFIRMED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# DEATHS = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
# RECOVERED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
# CONFIRMED = 'https://python3.info/_static/covid19-confirmed.csv'
# DEATHS = 'https://python3.info/_static/covid19-deaths.csv'
# RECOVERED = 'https://python3.info/_static/covid19-recovered.csv'
# Local copies of the CSV files live in ``../data`` relative to this script.
DATA = Path(__file__).resolve().parent.joinpath('..', 'data')
CONFIRMED = DATA.joinpath('covid19-confirmed.csv')
DEATHS = DATA.joinpath('covid19-deaths.csv')
RECOVERED = DATA.joinpath('covid19-recovered.csv')
# %%
# Shorter, query-friendly names for the two location columns.
COLUMNS = {'Province/State': 'region', 'Country/Region': 'country'}

# Load the three raw tables with the location columns renamed.
confirmed, deaths, recovered = (
    pd.read_csv(path).rename(columns=COLUMNS)
    for path in (CONFIRMED, DEATHS, RECOVERED))
# %%
def _get(df: pd.DataFrame, country: str, name: str) -> pd.Series:
"""
>>> _get(confirmed, 'Poland', 'confirmed').loc['2021-01-01']
np.int64(1305774)
>>> _get(deaths, 'Poland', 'deaths').loc['2021-01-01']
np.int64(28956)
>>> _get(recovered, 'Poland', 'recovered').loc['2021-01-01']
np.int64(1046281)
"""
if country is not None:
df = df.query('country == @country')
return (df
.transpose()
.iloc[4:]
.sum(axis='columns')
.rename(name)
.rename(index=pd.to_datetime)
.astype('int64')
.convert_dtypes())
def covid19(country: str = None) -> pd.DataFrame:
    """
    Return confirmed, deaths and recovered side by side for a country.

    Each column is produced by ``_get`` from the matching module-level
    frame; ``country=None`` aggregates over every row.

    >>> covid19('Poland').loc['2021-01-01']
    confirmed    1305774
    deaths         28956
    recovered    1046281
    Name: 2021-01-01 00:00:00, dtype: Int64

    >>> covid19('US').loc['2021-01-01']
    confirmed    20397400
    deaths         352804
    recovered           0
    Name: 2021-01-01 00:00:00, dtype: Int64

    >>> covid19('China').loc['2021-01-01']
    confirmed    102649
    deaths         4884
    recovered     90031
    Name: 2021-01-01 00:00:00, dtype: Int64
    """
    sources = (
        ('confirmed', confirmed),
        ('deaths', deaths),
        ('recovered', recovered),
    )
    columns = [_get(frame, country, name=label)
               for label, frame in sources]
    return pd.concat(columns, axis='columns')
# %%
# Per-country tables, plus the worldwide aggregate.
pl, us, india, china, france = map(
    covid19, ('Poland', 'US', 'India', 'China', 'France'))
world = covid19()
# %%
# Running total of confirmed cases in Poland.
data = pl['confirmed']
plot_confirmed_total = data.plot.line(
    title='Total confirmed cases in Poland',
    label='Confirmed',
    xlabel='Date',
    ylabel='Total confirmed cases',
)
plt.tight_layout()
# plt.show()
# %%
# Day-to-day change of the running total, i.e. newly confirmed cases.
data = pl['confirmed'].diff()
plot_confirmed_daily = data.plot.line(
    title='Daily confirmed cases in Poland',
    label='Confirmed',
    xlabel='Date',
    ylabel='Daily confirmed cases',
)
plt.tight_layout()
# plt.show()
# %%
def mortality(df: pd.DataFrame, since='2020-04-01', until=None) -> pd.Series:
    """
    Return deaths as a percentage of confirmed cases for each row.

    :param df: frame with ``deaths`` and ``confirmed`` columns
    :param since: first index label to include
    :param until: last index label to include; ``None`` means no limit
    :return: percentages for the selected rows, missing values dropped
    """
    ratio = df['deaths'].div(df['confirmed'])
    selected = ratio.loc[since:until]
    return (selected * 100).dropna()
# Mortality curve for Poland with its mean drawn as a red reference line.
data = mortality(pl)
plot_mortality = data.plot.line(
    title='Mortality in Poland',
    label='Mortality',
    xlabel='date',
    ylabel='mortality [%]',
)
plt.hlines(
    data.mean(),
    xmin=data.index.min(),
    xmax=data.index.max(),
    color='red',
    label='Mean',
)
plt.legend()
plt.tight_layout()
# plt.show()