2.2. Read CSV

  • File paths also work with URLs

2.2.1. SetUp

>>> import pandas as pd
>>>
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.max_columns', 10)
>>> pd.set_option('display.max_rows', 10)

2.2.2. Example

>>> DATA = 'https://python3.info/_static/martian-en.csv'
>>>
>>> pd.read_csv(DATA)
  firstname   lastname         birthdate  gender          ssn                email               phone
0      Mark     Watney   October 12 1994    male  94101212345     mwatney@nasa.gov   +1 (234) 555-0000
1   Melissa      Lewis      July 15 1995  female  95071512345      mlewis@nasa.gov   +1 (234) 555-0001
2      Rick   Martinez   January 21 1996    male  96012112345   rmartinez@nasa.gov   +1 (234) 555-0010
3      Alex      Vogel  November 15 1994    male  94111512345       avogel@esa.int  +49 (234) 555-0011
4      Beth  Johanssen        May 9 2006  female   6250912345  bjohanssen@nasa.gov   +1 (234) 555-0100
5     Chris       Beck     August 2 1999    male  99080212345       cbeck@nasa.gov   +1 (234) 555-0101

2.2.3. Parse Dates

>>> DATA = 'https://python3.info/_static/martian-en.csv'
>>>
>>> pd.read_csv(DATA, parse_dates=['birthdate'])
  firstname   lastname  birthdate  gender          ssn                email               phone
0      Mark     Watney 1994-10-12    male  94101212345     mwatney@nasa.gov   +1 (234) 555-0000
1   Melissa      Lewis 1995-07-15  female  95071512345      mlewis@nasa.gov   +1 (234) 555-0001
2      Rick   Martinez 1996-01-21    male  96012112345   rmartinez@nasa.gov   +1 (234) 555-0010
3      Alex      Vogel 1994-11-15    male  94111512345       avogel@esa.int  +49 (234) 555-0011
4      Beth  Johanssen 2006-05-09  female   6250912345  bjohanssen@nasa.gov   +1 (234) 555-0100
5     Chris       Beck 1999-08-02    male  99080212345       cbeck@nasa.gov   +1 (234) 555-0101

2.2.4. Parameters

  • delimiter - field separator

  • header - row number(s) containing column labels and marking the start of the data

  • names - how to name columns

  • index_col - which column should be an index

  • usecols - which columns to use

  • skiprows - how many rows to skip, from the top

  • skipfooter - how many rows to skip, from the bottom

  • nrows - how many rows to read

  • skip_blank_lines - skip blank lines?

  • parse_dates - columns whose values should be parsed (converted) to dates

  • chunksize - how many rows to read at once (useful for working with data larger than the available RAM)

  • thousands - thousand separator (comma, period, space or None)

  • decimal - decimal separator (comma or period)

  • encoding - file encoding, default: utf-8

>>> def read_csv(filepath_or_buffer, *, sep=..., delimiter=None,
...              header='infer', names=..., index_col=None,
...              usecols=None, dtype=None, engine=None, converters=None,
...              true_values=None, false_values=None, skipinitialspace=False,
...              skiprows=None, skipfooter=0, nrows=None, na_values=None,
...              keep_default_na=True, na_filter=True, verbose=...,
...              skip_blank_lines=True, parse_dates=None,
...              infer_datetime_format=..., keep_date_col=...,
...              date_parser=..., date_format=None, dayfirst=False,
...              cache_dates=True, iterator=False, chunksize=None,
...              compression='infer', thousands=None, decimal='.',
...              lineterminator=None, quotechar='"', quoting=0, doublequote=True,
...              escapechar=None, comment=None, encoding=None,
...              encoding_errors='strict', dialect=None, on_bad_lines='error',
...              delim_whitespace=..., low_memory=True, memory_map=False,
...              float_precision=None, storage_options=None,
...              dtype_backend=...): ...

2.2.6. Content

>>> DATA = 'https://python3.info/_static/iris-clean.csv'
>>>
>>> df = pd.read_csv(DATA)
>>> df.head(3)
   sepal_length  sepal_width  petal_length  petal_width     species
0           5.4          3.9           1.3          0.4      setosa
1           5.9          3.0           5.1          1.8   virginica
2           6.0          3.4           4.5          1.6  versicolor

2.2.7. Rename Columns

>>> DATA = 'https://python3.info/_static/iris-dirty.csv'
>>>
>>> COLUMNS =  ['sepal_length', 'sepal_width',
...             'petal_length', 'petal_width', 'species']
>>>
>>> SPECIES = {
...     0: 'setosa',
...     1: 'versicolor',
...     2: 'virginica',
... }
>>>
>>> df = pd.read_csv(DATA)
>>> df.head(n=3)
   150    4  setosa  versicolor  virginica
0  5.4  3.9     1.3         0.4          0
1  5.9  3.0     5.1         1.8          2
2  6.0  3.4     4.5         1.6          1
>>>
>>> df = pd.read_csv(DATA, skiprows=1, names=COLUMNS)
>>> df.head(n=3)
   sepal_length  sepal_width  petal_length  petal_width  species
0           5.4          3.9           1.3          0.4        0
1           5.9          3.0           5.1          1.8        2
2           6.0          3.4           4.5          1.6        1
>>>
>>> df = df.replace({'species': SPECIES})
>>> df.head(n=3)
   sepal_length  sepal_width  petal_length  petal_width     species
0           5.4          3.9           1.3          0.4      setosa
1           5.9          3.0           5.1          1.8   virginica
2           6.0          3.4           4.5          1.6  versicolor

2.2.8. Compressed

  • If the extension is .gz, .bz2, .zip, or .xz, the corresponding compression method is automatically selected

>>> df = pd.read_csv('sample_file.zip', compression='zip')  
>>> df = pd.read_csv('sample_file.gz', compression='infer')  

2.2.9. Use Case - 1

>>> DATA = 'https://python3.info/_static/iris-dirty.csv'
>>>
>>> COLUMNS =  ['sepal_length', 'sepal_width',
...             'petal_length', 'petal_width', 'species']
>>> header = pd.read_csv(DATA, nrows=0)
>>> nrows, ncols, *class_labels = header.columns
>>> label_encoder = dict(enumerate(class_labels))
>>>
>>> label_encoder
{0: 'setosa', 1: 'versicolor', 2: 'virginica'}
>>> df = (
...     pd
...     .read_csv(DATA, names=COLUMNS, skiprows=1)
...     .replace({'species':label_encoder})
...     .head(n=5)
... )
>>> df
   sepal_length  sepal_width  petal_length  petal_width     species
0           5.4          3.9           1.3          0.4      setosa
1           5.9          3.0           5.1          1.8   virginica
2           6.0          3.4           4.5          1.6  versicolor
3           7.3          2.9           6.3          1.8   virginica
4           5.6          2.5           3.9          1.1  versicolor

2.2.10. Assignments

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: Pandas ReadCSV Simple
# - Difficulty: easy
# - Lines: 1
# - Minutes: 3

# %% English
# 1. Read data from `DATA` to `result: pd.DataFrame`
# 2. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` do `result: pd.DataFrame`
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `pd.read_csv()`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` must be a `pd.DataFrame` type'

>>> result
  firstname   lastname                email
0      Mark     Watney     mwatney@nasa.gov
1   Melissa      Lewis      mlewis@nasa.gov
2      Rick   Martinez   rmartinez@nasa.gov
3      Alex      Vogel       avogel@esa.int
4      Beth  Johanssen  bjohanssen@nasa.gov
5     Chris       Beck       cbeck@nasa.gov
"""

import pandas as pd


# URL of the CSV file to load
DATA = 'https://python3.info/_static/readcsv-a.csv'


# Read DATA into a DataFrame (no date parsing is needed in this assignment)
# type: pd.DataFrame
result = ...


# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: Pandas ReadCSV Dates
# - Difficulty: easy
# - Lines: 1
# - Minutes: 3

# %% English
# 1. Read data from `DATA` to `result: pd.DataFrame`
# 2. Parse dates in "birthdate" column
# 3. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` do `result: pd.DataFrame`
# 2. Sparsuj daty w kolumnie "birthdate"
# 3. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `pd.read_csv(parse_dates)`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` must be a `pd.DataFrame` type'

>>> result[['firstname', 'lastname', 'birthdate']]
  firstname   lastname  birthdate
0      Mark     Watney 1994-10-12
1   Melissa      Lewis 1995-07-15
2      Rick   Martinez 1996-01-21
3      Alex      Vogel 1994-11-15
4      Beth  Johanssen 2006-05-09
5     Chris       Beck 1999-08-02
"""

import pandas as pd


# URL of the CSV file to load
DATA = 'https://python3.info/_static/readcsv-b.csv'


# Read DATA into a DataFrame and parse the "birthdate" column as dates
# (see the `parse_dates` parameter of `pd.read_csv`)
# type: pd.DataFrame
result = ...


# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: Pandas Read CSV Replace
# - Difficulty: easy
# - Lines: 5
# - Minutes: 8

# %% English
# 1. Read data from `DATA` to `result: pd.DataFrame`
# 2. Use provided column names in `COLUMNS`
# 3. Read labels from the first row
# 4. Replace data in `label` column with values extracted above
# 5. Define `result: pd.DataFrame` with the first 25 rows
# 6. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` do `result: pd.DataFrame`
# 2. Użyj podanych w `COLUMNS` nazw kolumn
# 3. Wczytaj nazwy labeli z pierwszego wiersza
# 4. Podmień dane w kolumnie `label` na wartości wyciągnięte powyżej
# 5. Zdefiniuj `result: pd.DataFrame` z 25 pierwszymi wierszami
# 6. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `dict()`
# - `enumerate()`
# - `pd.read_csv(nrows, names, skiprows)`
# - `DataFrame.replace(to_replace={'column': ...})`
# - `DataFrame.head(n)`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` must be a `pd.DataFrame` type'
>>> assert len(result) == 25, \
'Select only 25 first rows'

>>> result.loc[[0,1,2,3,4,5], ['mean radius', 'mean texture', 'label']]
   mean radius  mean texture      label
0        17.99         10.38  malignant
1        20.57         17.77  malignant
2        19.69         21.25  malignant
3        11.42         20.38  malignant
4        20.29         14.34  malignant
5        12.45         15.70  malignant

>>> result['label'].value_counts()
label
malignant    22
benign        3
Name: count, dtype: int64
"""

import pandas as pd


# URL of the CSV file to load
DATA = 'https://python3.info/_static/breast-cancer.csv'

# Column names to apply to the data: 30 feature columns followed by `label`
COLUMNS = [
    'mean radius', 'mean texture', 'mean perimeter', 'mean area',
    'mean smoothness', 'mean compactness', 'mean concavity',
    'mean concave points', 'mean symmetry', 'mean fractal dimension',
    'radius error', 'texture error', 'perimeter error', 'area error',
    'smoothness error', 'compactness error', 'concavity error',
    'concave points error', 'symmetry error',
    'fractal dimension error', 'worst radius', 'worst texture',
    'worst perimeter', 'worst area', 'worst smoothness',
    'worst compactness', 'worst concavity', 'worst concave points',
    'worst symmetry', 'worst fractal dimension', 'label',
]


# Read DATA using COLUMNS as the column names, replace the values in the
# `label` column with the label names read from the first row of the file,
# and keep only the first 25 rows
# type: pd.DataFrame
result = ...