2.8. Read Python

pd.DataFrame()

2.8.1. SetUp

>>> import pandas as pd

2.8.2. Dict of List

>>> data = {
...     'firstname': ['Mark', 'Melissa', 'Rick'],
...     'lastname': ['Watney', 'Lewis', 'Martinez'],
...     'role': ['botanist', 'commander', 'pilot'],
... }
>>>
>>> df = pd.DataFrame(data)
>>> df
  firstname  lastname       role
0      Mark    Watney   botanist
1   Melissa     Lewis  commander
2      Rick  Martinez      pilot

2.8.3. List of Dict

>>> data = [
...     {'firstname': 'Mark', 'lastname': 'Watney', 'role': 'botanist'},
...     {'firstname': 'Melissa', 'lastname': 'Lewis', 'role': 'commander'},
...     {'firstname': 'Rick', 'lastname': 'Martinez', 'role': 'pilot'},
... ]
>>>
>>> df = pd.DataFrame(data)
>>> df
  firstname  lastname       role
0      Mark    Watney   botanist
1   Melissa     Lewis  commander
2      Rick  Martinez      pilot

2.8.4. List of Tuple

>>> data = [
...     ('Mark', 'Watney', 'botanist'),
...     ('Melissa', 'Lewis', 'commander'),
...     ('Rick', 'Martinez', 'pilot'),
... ]
>>>
>>> df = pd.DataFrame(data, columns=['firstname', 'lastname', 'role'])
>>> df
  firstname  lastname       role
0      Mark    Watney   botanist
1   Melissa     Lewis  commander
2      Rick  Martinez      pilot

2.8.5. Assignments

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: Pandas ReadPython DictList
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% English
# 1. Define `result: pd.DataFrame` with `DATA`
# 2. Run doctests - all must succeed

# %% Polish
# 1. Zdefiniuj `result: pd.DataFrame` z `DATA`
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `pd.DataFrame()`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has invalid type, should be `pd.DataFrame`'

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname  lastname      roles
0      Mark    Watney   botanist
1   Melissa     Lewis  commander
2      Rick  Martinez      pilot
"""
import pandas as pd


# Column-oriented input: each key is a column name and each list holds
# that column's values, one entry per row.
DATA = {
    'firstname': ['Mark', 'Melissa', 'Rick'],
    'lastname': ['Watney', 'Lewis', 'Martinez'],
    'roles': ['botanist', 'commander', 'pilot'],
}


# Define `result: pd.DataFrame` with `DATA`
# type: pd.DataFrame
result = ...

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: Pandas ReadPython ListDict
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% English
# 1. Define `result: pd.DataFrame` with `DATA`
# 2. Run doctests - all must succeed

# %% Polish
# 1. Zdefiniuj `result: pd.DataFrame` z `DATA`
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `pd.DataFrame()`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has invalid type, should be `pd.DataFrame`'

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname  lastname       role
0      Mark    Watney   botanist
1   Melissa     Lewis  commander
2      Rick  Martinez      pilot
"""
import pandas as pd


# Row-oriented input: one dict per row, keyed by column name.
DATA = [
    {'firstname': 'Mark', 'lastname': 'Watney', 'role': 'botanist'},
    {'firstname': 'Melissa', 'lastname': 'Lewis', 'role': 'commander'},
    {'firstname': 'Rick', 'lastname': 'Martinez', 'role': 'pilot'},
]


# Define `result: pd.DataFrame` with `DATA`
# type: pd.DataFrame
result = ...

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: Pandas ReadPython ListList
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% English
# 1. Define `result: pd.DataFrame` with `DATA` and `COLUMNS`
# 2. Run doctests - all must succeed

# %% Polish
# 1. Zdefiniuj `result: pd.DataFrame` z `DATA` i `COLUMNS`
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `pd.DataFrame()`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has invalid type, should be `pd.DataFrame`'

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname  lastname       role
0      Mark    Watney   botanist
1   Melissa     Lewis  commander
2      Rick  Martinez      pilot
"""
import pandas as pd


# Row-oriented input: one tuple per row. Tuples carry no column names,
# so the names are supplied separately in `COLUMNS`.
DATA = [
    ('Mark', 'Watney', 'botanist'),
    ('Melissa', 'Lewis', 'commander'),
    ('Rick', 'Martinez', 'pilot'),
]

# Column names, in the same order as the values in each `DATA` tuple
COLUMNS = ['firstname', 'lastname', 'role']


# Define `result: pd.DataFrame` with `DATA` and `COLUMNS`
# (the doctest expects the headers listed in `COLUMNS`)
# type: pd.DataFrame
result = ...

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: Pandas Read PythonDict
# - Difficulty: medium
# - Lines: 10
# - Minutes: 8

# %% English
# 1. Convert `DATA` to format with one column per each attribute, for example:
#    - `group1_gid`, `group2_gid`,
#    - `group1_name`, `group2_name`
# 2. Note, that enumeration starts with one
# 3. Convert data to `result: pd.DataFrame`
# 4. Convert data in `group1_gid` and `group2_gid` to `int`
# 5. Run doctests - all must succeed

# %% Polish
# 1. Przekonwertuj `DATA` do formatu z jedną kolumną dla każdego atrybutu, np:
#    - `group1_gid`, `group2_gid`,
#    - `group1_name`, `group2_name`
# 2. Zwróć uwagę, że enumeracja zaczyna się od jeden
# 3. Przekonwertuj dane do `result: pd.DataFrame`
# 4. Przekonwertuj dane w `group1_gid` i `group2_gid` do `int`
# 5. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `dict.pop()`
# - `enumerate(start=1)`
# - `column_name = f'group{i}_{field}'`
# - `pd.DataFrame()`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> pd.set_option('display.width', 250)
>>> pd.set_option('display.max_columns', 20)
>>> pd.set_option('display.max_rows', 30)

>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has invalid type, should be `pd.DataFrame`'

>>> result.convert_dtypes()  # doctest: +NORMALIZE_WHITESPACE
  firstname   lastname  group1_gid group1_name  group2_gid group2_name  group3_gid group3_name
0      Mark     Watney           1       users        <NA>        <NA>        <NA>        <NA>
1   Melissa      Lewis           1       users           2       staff           3      admins
2      Rick   Martinez           1       users        <NA>        <NA>        <NA>        <NA>
3      Alex      Vogel        <NA>        <NA>        <NA>        <NA>        <NA>        <NA>
4      Beth  Johanssen           1       users           2       staff        <NA>        <NA>
5     Chris       Beck           1       users        <NA>        <NA>        <NA>        <NA>
"""
import pandas as pd


# Nested input: one dict per user. `groups` holds zero or more
# {"gid", "name"} dicts (one user has an empty list).
DATA = [
    {"firstname": "Mark", "lastname": "Watney", "groups": [
        {"gid": 1, "name": "users"}]},

    {"firstname": "Melissa", "lastname": "Lewis", "groups": [
        {"gid": 1, "name": "users"},
        {"gid": 2, "name": "staff"},
        {"gid": 3, "name": "admins"}]},

    {"firstname": "Rick", "lastname": "Martinez", "groups": [
        {"gid": 1, "name": "users"}]},

    {"firstname": "Alex", "lastname": "Vogel", "groups": []},

    {"firstname": "Beth", "lastname": "Johanssen", "groups": [
        {"gid": 1, "name": "users"},
        {"gid": 2, "name": "staff"}]},

    {"firstname": "Chris", "lastname": "Beck", "groups": [
        {"gid": 1, "name": "users"}]},
]


# Define variable `data` with flattened `DATA`
# Each group field becomes its own key, prefixed with `group` and the
# group's position counted from one, e.g. `group1_gid`, `group1_name`
# type: list[dict]
data = ...

# Convert `data` to DataFrame
# type: pd.DataFrame
result = ...

# Zamienić User i Group na User i Group
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: Pandas Read PythonObj
# - Difficulty: medium
# - Lines: 10
# - Minutes: 8

# %% English
# 1. Convert `DATA` to format with one column per each attribute, for example:
#    - `group1_gid`, `group2_gid`,
#    - `group1_name`, `group2_name`
# 2. Note, that enumeration starts with one
# 3. Convert data to `result: pd.DataFrame`
# 4. Convert data in `group1_gid` and `group2_gid` to `int`
# 5. Run doctests - all must succeed

# %% Polish
# 1. Przekonwertuj `DATA` do formatu z jedną kolumną dla każdego atrybutu, np:
#    - `group1_gid`, `group2_gid`,
#    - `group1_name`, `group2_name`
# 2. Zwróć uwagę, że enumeracja zaczyna się od jeden
# 3. Przekonwertuj dane do `result: pd.DataFrame`
# 4. Przekonwertuj dane w `group1_gid` i `group2_gid` do `int`
# 5. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `vars()`
# - `dict.pop()`
# - `enumerate(start=1)`
# - `column_name = f'group{i}_{field}'`
# - `pd.DataFrame()`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> pd.set_option('display.width', 250)
>>> pd.set_option('display.max_columns', 20)
>>> pd.set_option('display.max_rows', 30)

>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has invalid type, should be `pd.DataFrame`'

>>> result.convert_dtypes()  # doctest: +NORMALIZE_WHITESPACE
  firstname   lastname  group1_gid group1_name  group2_gid group2_name  group3_gid group3_name
0      Mark     Watney           1       users        <NA>        <NA>        <NA>        <NA>
1   Melissa      Lewis           1       users           2       staff           3      admins
2      Rick   Martinez           1       users        <NA>        <NA>        <NA>        <NA>
3      Alex      Vogel        <NA>        <NA>        <NA>        <NA>        <NA>        <NA>
4      Beth  Johanssen           1       users           2       staff        <NA>        <NA>
5     Chris       Beck           1       users        <NA>        <NA>        <NA>        <NA>
"""

import pandas as pd


class User:
    """A person with a first name, a last name and a list of groups.

    `groups` may be any iterable; it is copied into a new list. A missing
    or empty value results in an empty list.
    """

    def __init__(self, firstname, lastname, groups=None):
        self.firstname = firstname
        self.lastname = lastname
        # Always store a fresh list so the caller's iterable is not shared
        self.groups = [] if not groups else list(groups)

    def __str__(self):
        # The list's own text form is wrapped in an extra pair of brackets
        return '{} {} [{}]'.format(self.firstname, self.lastname, self.groups)

    def __repr__(self):
        # Constructor-like form: ClassName(firstname=..., lastname=..., groups=...)
        return (
            f'{self.__class__.__name__}('
            f'firstname={self.firstname!r}, '
            f'lastname={self.lastname!r}, '
            f'groups={self.groups!r})'
        )


class Group:
    """A group identified by `gid` and described by `name`."""

    def __init__(self, gid, name):
        self.gid = gid
        self.name = name

    def __str__(self):
        # Short form: gid followed by the name in parentheses
        return '{}({})'.format(self.gid, self.name)

    def __repr__(self):
        # Constructor-like form: ClassName(gid=..., name=...)
        return (
            f'{self.__class__.__name__}('
            f'gid={self.gid!r}, '
            f'name={self.name!r})'
        )


# Object input: one `User` per row, each holding zero or more `Group`
# objects in its `groups` attribute.
DATA = [
    User('Mark', 'Watney', groups=[
        Group(1, 'users')]),

    User('Melissa', 'Lewis', groups=[
        Group(1, 'users'),
        Group(2, 'staff'),
        Group(3, 'admins')]),

    User('Rick', 'Martinez', groups=[
        Group(1, 'users')]),

    User('Alex', 'Vogel', groups=[]),

    User('Beth', 'Johanssen', groups=[
        Group(1, 'users'),
        Group(2, 'staff')]),

    User('Chris', 'Beck', groups=[
        Group(1, 'users')]),
]


# Convert `DATA` to list[dict], then flatten
# Each group field becomes its own key, prefixed with `group` and the
# group's position counted from one, e.g. `group1_gid`, `group1_name`
# type: list[dict]
data = ...

# Convert `data` to DataFrame
# type: pd.DataFrame
result = ...