2.8. Read Python
pd.DataFrame()
2.8.1. SetUp
>>> import pandas as pd
2.8.2. Dict of List
>>> data = {
... 'firstname': ['Mark', 'Melissa', 'Rick'],
... 'lastname': ['Watney', 'Lewis', 'Martinez'],
... 'role': ['botanist', 'commander', 'pilot'],
... }
>>>
>>> df = pd.DataFrame(data)
>>> df
firstname lastname role
0 Mark Watney botanist
1 Melissa Lewis commander
2 Rick Martinez pilot
2.8.3. List of Dict
>>> data = [
... {'firstname': 'Mark', 'lastname': 'Watney', 'role': 'botanist'},
... {'firstname': 'Melissa', 'lastname': 'Lewis', 'role': 'commander'},
... {'firstname': 'Rick', 'lastname': 'Martinez', 'role': 'pilot'},
... ]
>>>
>>> df = pd.DataFrame(data)
>>> df
firstname lastname role
0 Mark Watney botanist
1 Melissa Lewis commander
2 Rick Martinez pilot
2.8.4. List of Tuple
>>> data = [
... ('Mark', 'Watney', 'botanist'),
... ('Melissa', 'Lewis', 'commander'),
... ('Rick', 'Martinez', 'pilot'),
... ]
>>>
>>> df = pd.DataFrame(data, columns=['firstname', 'lastname', 'role'])
>>> df
firstname lastname role
0 Mark Watney botanist
1 Melissa Lewis commander
2 Rick Martinez pilot
2.8.5. Assignments
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: Pandas ReadPython DictList
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% English
# 1. Define `result: pd.DataFrame` with `DATA`
# 2. Run doctests - all must succeed
# %% Polish
# 1. Zdefiniuj `result: pd.DataFrame` z `DATA`
# 2. Uruchom doctesty - wszystkie muszą się powieść
# %% Hints
# - `pd.DataFrame()`
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has invalid type, should be `pd.DataFrame`'
>>> result # doctest: +NORMALIZE_WHITESPACE
firstname lastname roles
0 Mark Watney botanist
1 Melissa Lewis commander
2 Rick Martinez pilot
"""
import pandas as pd
# Column-oriented input: each key is a column name and each list holds that
# column's values, one entry per row (see the expected output in the doctest).
DATA = {
'firstname': ['Mark', 'Melissa', 'Rick'],
'lastname': ['Watney', 'Lewis', 'Martinez'],
'roles': ['botanist', 'commander', 'pilot'],
}
# Define `result: pd.DataFrame` with `DATA`
# `...` (Ellipsis) is only a placeholder; the doctest asserts that
# `result is not Ellipsis` and that its type is exactly `pd.DataFrame`.
# type: pd.DataFrame
result = ...
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: Pandas ReadPython ListDict
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% English
# 1. Define `result: pd.DataFrame` with `DATA`
# 2. Run doctests - all must succeed
# %% Polish
# 1. Zdefiniuj `result: pd.DataFrame` z `DATA`
# 2. Uruchom doctesty - wszystkie muszą się powieść
# %% Hints
# - `pd.DataFrame()`
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has invalid type, should be `pd.DataFrame`'
>>> result # doctest: +NORMALIZE_WHITESPACE
firstname lastname role
0 Mark Watney botanist
1 Melissa Lewis commander
2 Rick Martinez pilot
"""
import pandas as pd
# Row-oriented input: each dict is one row, its keys are the column names
# (see the expected output in the doctest).
DATA = [
{'firstname': 'Mark', 'lastname': 'Watney', 'role': 'botanist'},
{'firstname': 'Melissa', 'lastname': 'Lewis', 'role': 'commander'},
{'firstname': 'Rick', 'lastname': 'Martinez', 'role': 'pilot'},
]
# Define `result: pd.DataFrame` with `DATA`
# `...` (Ellipsis) is only a placeholder; the doctest asserts that
# `result is not Ellipsis` and that its type is exactly `pd.DataFrame`.
# type: pd.DataFrame
result = ...
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: Pandas ReadPython ListTuple
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% English
# 1. Define `result: pd.DataFrame` with `DATA` and `COLUMNS`
# 2. Run doctests - all must succeed
# %% Polish
# 1. Zdefiniuj `result: pd.DataFrame` z `DATA` i `COLUMNS`
# 2. Uruchom doctesty - wszystkie muszą się powieść
# %% Hints
# - `pd.DataFrame()`
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has invalid type, should be `pd.DataFrame`'
>>> result # doctest: +NORMALIZE_WHITESPACE
firstname lastname role
0 Mark Watney botanist
1 Melissa Lewis commander
2 Rick Martinez pilot
"""
import pandas as pd
# Row-oriented input without column names: each tuple is one row, values are
# positional (firstname, lastname, role).
DATA = [
('Mark', 'Watney', 'botanist'),
('Melissa', 'Lewis', 'commander'),
('Rick', 'Martinez', 'pilot'),
]
# Column names for the positional values in `DATA`; the doctest expects
# exactly this header.
COLUMNS = ['firstname', 'lastname', 'role']
# Define `result: pd.DataFrame` with `DATA`, using `COLUMNS` as column names
# `...` (Ellipsis) is only a placeholder; the doctest asserts that
# `result is not Ellipsis` and that its type is exactly `pd.DataFrame`.
# type: pd.DataFrame
result = ...
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: Pandas Read PythonDict
# - Difficulty: medium
# - Lines: 10
# - Minutes: 8
# %% English
# 1. Convert `DATA` to a format with one column per attribute, for example:
#    - `group1_gid`, `group2_gid`,
#    - `group1_name`, `group2_name`
# 2. Note, that enumeration starts with one
# 3. Convert data to `result: pd.DataFrame`
# 4. Convert data in `group1_gid` and `group2_gid` to `int`
# 5. Run doctests - all must succeed
# %% Polish
# 1. Przekonwertuj `DATA` do formatu z jedną kolumną dla każdego atrybutu, np:
#    - `group1_gid`, `group2_gid`,
#    - `group1_name`, `group2_name`
# 2. Zwróć uwagę, że enumeracja zaczyna się od jeden
# 3. Przekonwertuj dane do `result: pd.DataFrame`
# 4. Przekonwertuj dane w `group1_gid` i `group2_gid` do `int`
# 5. Uruchom doctesty - wszystkie muszą się powieść
# %% Hints
# - `dict.pop()`
# - `enumerate(start=1)`
# - `column_name = f'group{i}_{field}'`
# - `pd.DataFrame()`
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> pd.set_option('display.width', 250)
>>> pd.set_option('display.max_columns', 20)
>>> pd.set_option('display.max_rows', 30)
>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has invalid type, should be `pd.DataFrame`'
>>> result.convert_dtypes() # doctest: +NORMALIZE_WHITESPACE
firstname lastname group1_gid group1_name group2_gid group2_name group3_gid group3_name
0 Mark Watney 1 users <NA> <NA> <NA> <NA>
1 Melissa Lewis 1 users 2 staff 3 admins
2 Rick Martinez 1 users <NA> <NA> <NA> <NA>
3 Alex Vogel <NA> <NA> <NA> <NA> <NA> <NA>
4 Beth Johanssen 1 users 2 staff <NA> <NA>
5 Chris Beck 1 users <NA> <NA> <NA> <NA>
"""
import pandas as pd
# Nested input: one dict per user. `groups` holds zero or more
# `{"gid": ..., "name": ...}` dicts (Alex Vogel has none, Melissa Lewis has
# three), so the flattened rows have a varying number of group columns.
DATA = [
{"firstname": "Mark", "lastname": "Watney", "groups": [
{"gid": 1, "name": "users"}]},
{"firstname": "Melissa", "lastname": "Lewis", "groups": [
{"gid": 1, "name": "users"},
{"gid": 2, "name": "staff"},
{"gid": 3, "name": "admins"}]},
{"firstname": "Rick", "lastname": "Martinez", "groups": [
{"gid": 1, "name": "users"}]},
{"firstname": "Alex", "lastname": "Vogel", "groups": []},
{"firstname": "Beth", "lastname": "Johanssen", "groups": [
{"gid": 1, "name": "users"},
{"gid": 2, "name": "staff"}]},
{"firstname": "Chris", "lastname": "Beck", "groups": [
{"gid": 1, "name": "users"}]},
]
# Define variable `data` with flattened `DATA`
# Each group field is prefixed with `group` and the group's 1-based position,
# e.g. `group1_gid`, `group1_name` (see the expected columns in the doctest)
# type: list[dict]
data = ...
# Convert `data` to DataFrame
# `...` (Ellipsis) is only a placeholder; the doctest asserts that
# `result is not Ellipsis` and that its type is exactly `pd.DataFrame`.
# type: pd.DataFrame
result = ...
# TODO(review): the original note reads "replace User and Group with User and
# Group", which is a no-op as written; this dict-based exercise defines no such
# classes - confirm the intended change with the author.
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: Pandas Read PythonObj
# - Difficulty: medium
# - Lines: 10
# - Minutes: 8
# %% English
# 1. Convert `DATA` to a format with one column per attribute, for example:
#    - `group1_gid`, `group2_gid`,
#    - `group1_name`, `group2_name`
# 2. Note, that enumeration starts with one
# 3. Convert data to `result: pd.DataFrame`
# 4. Convert data in `group1_gid` and `group2_gid` to `int`
# 5. Run doctests - all must succeed
# %% Polish
# 1. Przekonwertuj `DATA` do formatu z jedną kolumną dla każdego atrybutu, np:
#    - `group1_gid`, `group2_gid`,
#    - `group1_name`, `group2_name`
# 2. Zwróć uwagę, że enumeracja zaczyna się od jeden
# 3. Przekonwertuj dane do `result: pd.DataFrame`
# 4. Przekonwertuj dane w `group1_gid` i `group2_gid` do `int`
# 5. Uruchom doctesty - wszystkie muszą się powieść
# %% Hints
# - `vars()`
# - `dict.pop()`
# - `enumerate(start=1)`
# - `column_name = f'group{i}_{field}'`
# - `pd.DataFrame()`
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> pd.set_option('display.width', 250)
>>> pd.set_option('display.max_columns', 20)
>>> pd.set_option('display.max_rows', 30)
>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has invalid type, should be `pd.DataFrame`'
>>> result.convert_dtypes() # doctest: +NORMALIZE_WHITESPACE
firstname lastname group1_gid group1_name group2_gid group2_name group3_gid group3_name
0 Mark Watney 1 users <NA> <NA> <NA> <NA>
1 Melissa Lewis 1 users 2 staff 3 admins
2 Rick Martinez 1 users <NA> <NA> <NA> <NA>
3 Alex Vogel <NA> <NA> <NA> <NA> <NA> <NA>
4 Beth Johanssen 1 users 2 staff <NA> <NA>
5 Chris Beck 1 users <NA> <NA> <NA> <NA>
"""
import pandas as pd
class User:
    """A person with a first name, a last name and a list of groups.

    Args:
        firstname: Stored as-is on ``self.firstname``.
        lastname: Stored as-is on ``self.lastname``.
        groups: Optional iterable of group objects. It is copied into a new
            list; any falsy value (``None``, empty iterable) becomes ``[]``.
    """

    def __init__(self, firstname, lastname, groups=None):
        self.firstname = firstname
        self.lastname = lastname
        # Copy into a fresh list so the instance never shares the caller's
        # container and never relies on a mutable default argument.
        self.groups = list(groups) if groups else []

    def __str__(self):
        """Return ``'<firstname> <lastname> [<groups>]'``.

        ``groups`` is formatted as a list, so its own brackets appear inside
        the literal ones and each element is rendered with its ``repr``.
        """
        return f'{self.firstname} {self.lastname} [{self.groups}]'

    def __repr__(self):
        """Return a constructor-like form, e.g. ``User(firstname='Mark', ...)``."""
        clsname = self.__class__.__name__
        firstname = self.firstname
        lastname = self.lastname
        groups = self.groups
        # The `name=` f-string form emits `name=<repr(value)>`, so the local
        # names above are part of the output and must not be renamed.
        return f'{clsname}({firstname=}, {lastname=}, {groups=})'
class Group:
    """A group identified by ``gid`` and labelled with ``name``.

    Args:
        gid: Stored as-is on ``self.gid``.
        name: Stored as-is on ``self.name``.
    """

    def __init__(self, gid, name):
        self.gid = gid
        self.name = name

    def __str__(self):
        """Return ``'<gid>(<name>)'``, e.g. ``'1(users)'``."""
        return f'{self.gid}({self.name})'

    def __repr__(self):
        """Return a constructor-like form, e.g. ``Group(gid=1, name='users')``."""
        clsname = self.__class__.__name__
        gid = self.gid
        name = self.name
        # The `name=` f-string form emits `name=<repr(value)>`, so the local
        # names above are part of the output and must not be renamed.
        return f'{clsname}({gid=}, {name=})'
# Object-based nested input: each `User` is built with a list of `Group`
# instances (Alex Vogel has none, Melissa Lewis has three), so the flattened
# rows have a varying number of group columns.
DATA = [
User('Mark', 'Watney', groups=[
Group(1, 'users')]),
User('Melissa', 'Lewis', groups=[
Group(1, 'users'),
Group(2, 'staff'),
Group(3, 'admins')]),
User('Rick', 'Martinez', groups=[
Group(1, 'users')]),
User('Alex', 'Vogel', groups=[]),
User('Beth', 'Johanssen', groups=[
Group(1, 'users'),
Group(2, 'staff')]),
User('Chris', 'Beck', groups=[
Group(1, 'users')]),
]
# Convert DATA to list[dict], then flatten
# (the Hints section suggests `vars()` for the object-to-dict step; group
# fields become `group<N>_<field>` columns, numbered from 1, as in the doctest)
# type: list[dict]
data = ...
# Convert `data` to DataFrame
# `...` (Ellipsis) is only a placeholder; the doctest asserts that
# `result is not Ellipsis` and that its type is exactly `pd.DataFrame`.
# type: pd.DataFrame
result = ...