2.2. Read CSV
File paths also work with URLs
2.2.1. SetUp
>>> import pandas as pd
>>>
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.max_columns', 10)
>>> pd.set_option('display.max_rows', 10)
2.2.2. Example
>>> DATA = 'https://python3.info/_static/martian-en.csv'
>>>
>>> pd.read_csv(DATA)
firstname lastname birthdate gender ssn email phone
0 Mark Watney October 12 1994 male 94101212345 mwatney@nasa.gov +1 (234) 555-0000
1 Melissa Lewis July 15 1995 female 95071512345 mlewis@nasa.gov +1 (234) 555-0001
2 Rick Martinez January 21 1996 male 96012112345 rmartinez@nasa.gov +1 (234) 555-0010
3 Alex Vogel November 15 1994 male 94111512345 avogel@esa.int +49 (234) 555-0011
4 Beth Johanssen May 9 2006 female 6250912345 bjohanssen@nasa.gov +1 (234) 555-0100
5 Chris Beck August 2 1999 male 99080212345 cbeck@nasa.gov +1 (234) 555-0101
2.2.3. Parse Dates
>>> DATA = 'https://python3.info/_static/martian-en.csv'
>>>
>>> pd.read_csv(DATA, parse_dates=['birthdate'])
firstname lastname birthdate gender ssn email phone
0 Mark Watney 1994-10-12 male 94101212345 mwatney@nasa.gov +1 (234) 555-0000
1 Melissa Lewis 1995-07-15 female 95071512345 mlewis@nasa.gov +1 (234) 555-0001
2 Rick Martinez 1996-01-21 male 96012112345 rmartinez@nasa.gov +1 (234) 555-0010
3 Alex Vogel 1994-11-15 male 94111512345 avogel@esa.int +49 (234) 555-0011
4 Beth Johanssen 2006-05-09 female 6250912345 bjohanssen@nasa.gov +1 (234) 555-0100
5 Chris Beck 1999-08-02 male 99080212345 cbeck@nasa.gov +1 (234) 555-0101
2.2.4. Parameters
- `delimiter` - field separator
- `header` - row number(s) containing column labels and marking the start of the data
- `names` - how to name columns
- `index_col` - which column should be an index
- `usecols` - which columns to use
- `skiprows` - how many rows to skip, from the top
- `skipfooter` - how many rows to skip, from the bottom
- `nrows` - how many rows to read
- `skip_blank_lines` - skip blank lines?
- `parse_dates` - parse dates (convert to dates) values in those columns
- `chunksize` - how many rows to read at once (useful for working with data greater than available RAM)
- `thousands` - thousand separator (comma, period, space or `None`)
- `decimal` - decimal separator (comma or period)
- `encoding` - file encoding, default: `utf-8`
>>> def read_csv(filepath_or_buffer, *, sep=..., delimiter=None,
... header='infer', names=..., index_col=None,
... usecols=None, dtype=None, engine=None, converters=None,
... true_values=None, false_values=None, skipinitialspace=False,
... skiprows=None, skipfooter=0, nrows=None, na_values=None,
... keep_default_na=True, na_filter=True, verbose=...,
... skip_blank_lines=True, parse_dates=None,
... infer_datetime_format=..., keep_date_col=...,
... date_parser=..., date_format=None, dayfirst=False,
... cache_dates=True, iterator=False, chunksize=None,
... compression='infer', thousands=None, decimal='.',
... lineterminator=None, quotechar='"', quoting=0, doublequote=True,
... escapechar=None, comment=None, encoding=None,
... encoding_errors='strict', dialect=None, on_bad_lines='error',
... delim_whitespace=..., low_memory=True, memory_map=False,
... float_precision=None, storage_options=None,
... dtype_backend=...): ...
2.2.5. Header
>>> DATA = 'https://python3.info/_static/iris-clean.csv'
>>>
>>> header = pd.read_csv(DATA, nrows=0).columns
>>>
>>> list(header)
['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
Label Encoder:
>>> DATA = 'https://python3.info/_static/iris-dirty.csv'
>>>
>>> header = pd.read_csv(DATA, nrows=0)
>>> nrows, nvalues, *labels = header.columns
>>> decoder = dict(enumerate(labels))
>>>
>>> decoder
{0: 'setosa', 1: 'versicolor', 2: 'virginica'}
2.2.6. Content
>>> DATA = 'https://python3.info/_static/iris-clean.csv'
>>>
>>> df = pd.read_csv(DATA)
>>> df.head(3)
sepal_length sepal_width petal_length petal_width species
0 5.4 3.9 1.3 0.4 setosa
1 5.9 3.0 5.1 1.8 virginica
2 6.0 3.4 4.5 1.6 versicolor
2.2.7. Rename Columns
>>> DATA = 'https://python3.info/_static/iris-dirty.csv'
>>>
>>> COLUMNS = ['sepal_length', 'sepal_width',
... 'petal_length', 'petal_width', 'species']
>>>
>>> SPECIES = {
... 0: 'setosa',
... 1: 'versicolor',
... 2: 'virginica',
... }
>>>
>>> df = pd.read_csv(DATA)
>>> df.head(n=3)
150 4 setosa versicolor virginica
0 5.4 3.9 1.3 0.4 0
1 5.9 3.0 5.1 1.8 2
2 6.0 3.4 4.5 1.6 1
>>>
>>> df = pd.read_csv(DATA, skiprows=1, names=COLUMNS)
>>> df.head(n=3)
sepal_length sepal_width petal_length petal_width species
0 5.4 3.9 1.3 0.4 0
1 5.9 3.0 5.1 1.8 2
2 6.0 3.4 4.5 1.6 1
>>>
>>> df = df.replace({'species': SPECIES})
>>> df.head(n=3)
sepal_length sepal_width petal_length petal_width species
0 5.4 3.9 1.3 0.4 setosa
1 5.9 3.0 5.1 1.8 virginica
2 6.0 3.4 4.5 1.6 versicolor
2.2.8. Compressed
If the extension is `.gz`, `.bz2`, `.zip`, or `.xz`, the corresponding compression method is automatically selected
>>> df = pd.read_csv('sample_file.zip', compression='zip')
>>> df = pd.read_csv('sample_file.gz', compression='infer')
2.2.9. Use Case - 1
>>> DATA = 'https://python3.info/_static/iris-dirty.csv'
>>>
>>> COLUMNS = ['sepal_length', 'sepal_width',
... 'petal_length', 'petal_width', 'species']
>>> header = pd.read_csv(DATA, nrows=0)
>>> nrows, ncols, *class_labels = header.columns
>>> label_encoder = dict(enumerate(class_labels))
>>>
>>> label_encoder
{0: 'setosa', 1: 'versicolor', 2: 'virginica'}
>>> df = (
... pd
... .read_csv(DATA, names=COLUMNS, skiprows=1)
... .replace({'species':label_encoder})
... .head(n=5)
... )
>>> df
sepal_length sepal_width petal_length petal_width species
0 5.4 3.9 1.3 0.4 setosa
1 5.9 3.0 5.1 1.8 virginica
2 6.0 3.4 4.5 1.6 versicolor
3 7.3 2.9 6.3 1.8 virginica
4 5.6 2.5 3.9 1.1 versicolor
2.2.10. Assignments
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: Pandas ReadCSV Simple
# - Difficulty: easy
# - Lines: 1
# - Minutes: 3
# %% English
# 1. Read data from `DATA` to `result: pd.DataFrame`
# 2. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane z `DATA` do `result: pd.DataFrame`
# 2. Uruchom doctesty - wszystkie muszą się powieść
# %% Hints
# - `pd.read_csv()`
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` must be a `pd.DataFrame` type'
>>> result
firstname lastname email
0 Mark Watney mwatney@nasa.gov
1 Melissa Lewis mlewis@nasa.gov
2 Rick Martinez rmartinez@nasa.gov
3 Alex Vogel avogel@esa.int
4 Beth Johanssen bjohanssen@nasa.gov
5 Chris Beck cbeck@nasa.gov
"""
import pandas as pd
# URL of the CSV file to load
DATA = 'https://python3.info/_static/readcsv-a.csv'
# Read DATA into a DataFrame (no date parsing is needed in this exercise)
# type: pd.DataFrame
# NOTE: `...` is the exercise placeholder; the doctests in the module
# docstring fail until it is replaced with a `pd.DataFrame`
result = ...
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: Pandas ReadCSV Dates
# - Difficulty: easy
# - Lines: 1
# - Minutes: 3
# %% English
# 1. Read data from `DATA` to `result: pd.DataFrame`
# 2. Parse dates in "birthdate" column
# 3. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane z `DATA` do `result: pd.DataFrame`
# 2. Sparsuj daty w kolumnie "birthdate"
# 3. Uruchom doctesty - wszystkie muszą się powieść
# %% Hints
# - `pd.read_csv(parse_dates)`
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` must be a `pd.DataFrame` type'
>>> result[['firstname', 'lastname', 'birthdate']]
firstname lastname birthdate
0 Mark Watney 1994-10-12
1 Melissa Lewis 1995-07-15
2 Rick Martinez 1996-01-21
3 Alex Vogel 1994-11-15
4 Beth Johanssen 2006-05-09
5 Chris Beck 1999-08-02
"""
import pandas as pd
# URL of the CSV file to load
DATA = 'https://python3.info/_static/readcsv-b.csv'
# Read DATA and parse dates in "birthdate" column
# type: pd.DataFrame
# NOTE: `...` is the exercise placeholder; the doctests in the module
# docstring fail until it is replaced with a `pd.DataFrame`
result = ...
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: Pandas Read CSV Replace
# - Difficulty: easy
# - Lines: 5
# - Minutes: 8
# %% English
# 1. Read data from `DATA` to `result: pd.DataFrame`
# 2. Use provided column names in `COLUMNS`
# 3. Read labels from the first row
# 4. Replace data in `label` column with values extracted above
# 5. Define `result: pd.DataFrame` with the first 25 rows
# 6. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane z `DATA` do `result: pd.DataFrame`
# 2. Użyj podanych w `COLUMNS` nazw kolumn
# 3. Wczytaj nazwy labeli z pierwszego wiersza
# 4. Podmień dane w kolumnie `label` na wartości wyciągnięte powyżej
# 5. Zdefiniuj `result: pd.DataFrame` z 25 pierwszymi wierszami
# 6. Uruchom doctesty - wszystkie muszą się powieść
# %% Hints
# - `dict()`
# - `enumerate()`
# - `pd.read_csv(nrows, names, skiprows)`
# - `DataFrame.replace(to_replace={'column': ...})`
# - `DataFrame.head(n)`
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` must be a `pd.DataFrame` type'
>>> assert len(result) == 25, \
'Select only 25 first rows'
>>> result.loc[[0,1,2,3,4,5], ['mean radius', 'mean texture', 'label']]
mean radius mean texture label
0 17.99 10.38 malignant
1 20.57 17.77 malignant
2 19.69 21.25 malignant
3 11.42 20.38 malignant
4 20.29 14.34 malignant
5 12.45 15.70 malignant
>>> result['label'].value_counts()
label
malignant 22
benign 3
Name: count, dtype: int64
"""
import pandas as pd
# URL of the CSV file to load
DATA = 'https://python3.info/_static/breast-cancer.csv'
# Column names to use instead of the file's first row:
# 30 feature columns followed by the `label` column
COLUMNS = [
'mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error',
'fractal dimension error', 'worst radius', 'worst texture',
'worst perimeter', 'worst area', 'worst smoothness',
'worst compactness', 'worst concavity', 'worst concave points',
'worst symmetry', 'worst fractal dimension', 'label',
]
# Read DATA, substitute column names, and labels, select 25 rows
# (per the task description, the label names come from the first row of DATA)
# type: pd.DataFrame
# NOTE: `...` is the exercise placeholder; the doctests in the module
# docstring fail until it is replaced with a `pd.DataFrame` of 25 rows
result = ...