5.4. DataFrame Sample
.sample(n=5)
.sample(n=5, replace=True)
.sample(frac=.5)
.sample(frac=1/2)
.head(n=5)
.tail(n=5)
.first('5D') - works only on time series
.last('5D') - works only on time series
.reset_index(drop=True)
5.4.1. SetUp
>>> import pandas as pd
>>> import numpy as np
>>> np.random.seed(0)
>>>
>>>
>>> df = pd.DataFrame(
... columns = ['Morning', 'Noon', 'Evening', 'Midnight'],
... index = pd.date_range('1999-12-30', periods=7),
... data = np.random.randn(7, 4))
>>>
>>> df
Morning Noon Evening Midnight
1999-12-30 1.764052 0.400157 0.978738 2.240893
1999-12-31 1.867558 -0.977278 0.950088 -0.151357
2000-01-01 -0.103219 0.410599 0.144044 1.454274
2000-01-02 0.761038 0.121675 0.443863 0.333674
2000-01-03 1.494079 -0.205158 0.313068 -0.854096
2000-01-04 -2.552990 0.653619 0.864436 -0.742165
2000-01-05 2.269755 -1.454366 0.045759 -0.187184
5.4.2. Head
>>> df.head(2)
Morning Noon Evening Midnight
1999-12-30 1.764052 0.400157 0.978738 2.240893
1999-12-31 1.867558 -0.977278 0.950088 -0.151357
>>> df.head(n=1)
Morning Noon Evening Midnight
1999-12-30 1.764052 0.400157 0.978738 2.240893
5.4.3. Tail
>>> df.tail(2)
Morning Noon Evening Midnight
2000-01-04 -2.552990 0.653619 0.864436 -0.742165
2000-01-05 2.269755 -1.454366 0.045759 -0.187184
>>> df.tail(n=1)
Morning Noon Evening Midnight
2000-01-05 2.269755 -1.454366 0.045759 -0.187184
5.4.4. First
>>> df.first('YE')
Morning Noon Evening Midnight
1999-12-30 1.764052 0.400157 0.978738 2.240893
1999-12-31 1.867558 -0.977278 0.950088 -0.151357
>>> df.first('ME')
Morning Noon Evening Midnight
1999-12-30 1.764052 0.400157 0.978738 2.240893
1999-12-31 1.867558 -0.977278 0.950088 -0.151357
>>> df.first('D')
Morning Noon Evening Midnight
1999-12-30 1.764052 0.400157 0.978738 2.240893
>>> df.first('W')
Morning Noon Evening Midnight
1999-12-30 1.764052 0.400157 0.978738 2.240893
1999-12-31 1.867558 -0.977278 0.950088 -0.151357
2000-01-01 -0.103219 0.410599 0.144044 1.454274
2000-01-02 0.761038 0.121675 0.443863 0.333674
5.4.5. Last
>>> df.last('YE')
Morning Noon Evening Midnight
2000-01-01 -0.103219 0.410599 0.144044 1.454274
2000-01-02 0.761038 0.121675 0.443863 0.333674
2000-01-03 1.494079 -0.205158 0.313068 -0.854096
2000-01-04 -2.552990 0.653619 0.864436 -0.742165
2000-01-05 2.269755 -1.454366 0.045759 -0.187184
>>> df.last('ME')
Morning Noon Evening Midnight
2000-01-01 -0.103219 0.410599 0.144044 1.454274
2000-01-02 0.761038 0.121675 0.443863 0.333674
2000-01-03 1.494079 -0.205158 0.313068 -0.854096
2000-01-04 -2.552990 0.653619 0.864436 -0.742165
2000-01-05 2.269755 -1.454366 0.045759 -0.187184
>>> df.last('D')
Morning Noon Evening Midnight
2000-01-05 2.269755 -1.454366 0.045759 -0.187184
>>> df.last('W')
Morning Noon Evening Midnight
2000-01-03 1.494079 -0.205158 0.313068 -0.854096
2000-01-04 -2.552990 0.653619 0.864436 -0.742165
2000-01-05 2.269755 -1.454366 0.045759 -0.187184
5.4.6. Sample
1/4 is 25%
.05 is 5%
0.5 is 50%
1.0 is 100%
>>> np.random.seed(0)
Select `n` rows, or a fraction of all rows, at random - with or without replacement:
>>> df.sample()
Morning Noon Evening Midnight
2000-01-05 2.269755 -1.454366 0.045759 -0.187184
>>> df.sample(2)
Morning Noon Evening Midnight
1999-12-31 1.867558 -0.977278 0.950088 -0.151357
1999-12-30 1.764052 0.400157 0.978738 2.240893
>>> df.sample(n=2, replace=True)
Morning Noon Evening Midnight
1999-12-30 1.764052 0.400157 0.978738 2.240893
2000-01-03 1.494079 -0.205158 0.313068 -0.854096
>>> df.sample(frac=1/4)
Morning Noon Evening Midnight
2000-01-03 1.494079 -0.205158 0.313068 -0.854096
2000-01-05 2.269755 -1.454366 0.045759 -0.187184
>>> df.sample(frac=0.5)
Morning Noon Evening Midnight
2000-01-01 -0.103219 0.410599 0.144044 1.454274
2000-01-03 1.494079 -0.205158 0.313068 -0.854096
2000-01-02 0.761038 0.121675 0.443863 0.333674
2000-01-05 2.269755 -1.454366 0.045759 -0.187184
5.4.7. Reset Index
>>> np.random.seed(0)
>>> df.sample(frac=1.0).reset_index()
index Morning Noon Evening Midnight
0 2000-01-05 2.269755 -1.454366 0.045759 -0.187184
1 2000-01-01 -0.103219 0.410599 0.144044 1.454274
2 1999-12-31 1.867558 -0.977278 0.950088 -0.151357
3 2000-01-02 0.761038 0.121675 0.443863 0.333674
4 1999-12-30 1.764052 0.400157 0.978738 2.240893
5 2000-01-04 -2.552990 0.653619 0.864436 -0.742165
6 2000-01-03 1.494079 -0.205158 0.313068 -0.854096
>>> DATA = [
... {'sepal_length': 5.4, 'sepal_width': 3.9, 'petal_length': 1.3, 'petal_width': 0.4, 'species': 'setosa'},
... {'sepal_length': 5.9, 'sepal_width': 3.0, 'petal_length': 5.1, 'petal_width': 1.8, 'species': 'virginica'},
... {'sepal_length': 6.0, 'sepal_width': 3.4, 'petal_length': 4.5, 'petal_width': 1.6, 'species': 'versicolor'},
... {'sepal_length': 7.3, 'sepal_width': 2.9, 'petal_length': 6.3, 'petal_width': 1.8, 'species': 'virginica'},
... {'sepal_length': 5.6, 'sepal_width': 2.5, 'petal_length': 3.9, 'petal_width': 1.1, 'species': 'versicolor'},
... {'sepal_length': 5.4, 'sepal_width': 3.9, 'petal_length': 1.3, 'petal_width': 0.4, 'species': 'setosa'},
... ]
>>>
>>>
>>> df = pd.DataFrame(DATA)
>>>
>>> np.random.seed(0)
>>> selected = df.sample(frac=1/2)
>>> selected
sepal_length sepal_width petal_length petal_width species
5 5.4 3.9 1.3 0.4 setosa
2 6.0 3.4 4.5 1.6 versicolor
1 5.9 3.0 5.1 1.8 virginica
>>>
>>> selected.reset_index()
index sepal_length sepal_width petal_length petal_width species
0 5 5.4 3.9 1.3 0.4 setosa
1 2 6.0 3.4 4.5 1.6 versicolor
2 1 5.9 3.0 5.1 1.8 virginica
>>>
>>> selected.reset_index(drop=True)
sepal_length sepal_width petal_length petal_width species
0 5.4 3.9 1.3 0.4 setosa
1 6.0 3.4 4.5 1.6 versicolor
2 5.9 3.0 5.1 1.8 virginica
5.4.8. Assignments
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: DataFrame Sample
# - Difficulty: easy
# - Lines: 4
# - Minutes: 5
# %% English
# 1. Read data from `DATA` as `df: pd.DataFrame`
# 2. Set all rows in random order
# 3. Reset index without leaving a backup of the old one
# 4. Define `result` with last 10 rows
# 5. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane z `DATA` jako `df: pd.DataFrame`
# 2. Ustaw wszystkie wiersze w losowej kolejności
# 3. Zresetuj index nie pozostawiając kopii zapasowej starego
# 4. Zdefiniuj `result` z ostatnimi 10 wierszami
# 5. Uruchom doctesty - wszystkie muszą się powieść
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.max_columns', 10)
>>> pd.set_option('display.max_rows', 10)
>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` must be a `pd.DataFrame` type'
>>> result # doctest: +NORMALIZE_WHITESPACE
Name Country Gender Flights Total Flights Total Flight Time (ddd:hh:mm)
557 Thomas Marshburn, M.D. United States Man STS-127 (2009), Soyuz TMA-07M (2012) 2 161:07:03
558 Michael Baker United States Man STS-43 (1991), STS-52 (1992), STS-68 (1994), S... 4 040:03:04
559 Rick Husband United States Man STS-96 (1999), STS-107 (2003) 2 025:13:33
560 Svetlana Savitskaya Soviet Union Woman Soyuz T-7 (1982), Soyuz T-12 (1984) 2 019:17:07
561 Charles "Pete" Conrad United States Man Gemini 5 (1965), Gemini 11 (1966), Apollo 12 (... 4 049:03:38
562 Lawrence J. DeLucas United States Man STS-50 (1992) 1 013:19:30
563 Aleksandr Laveykin Soviet Union Man Soyuz TM-2 (1987) 1 174:03:25
564 Owen Garriott United States Man Skylab 3 (1973), STS-9 (1983) 2 069:17:56
565 Ivan Vagner Russia Man Soyuz MS-16 (2020) 1 145:04:14
566 Yuri Malenchenko Russia Man Soyuz TM-19 (1994), STS-106 (2000), Soyuz TMA-... 6 826:09:22
"""
import pandas as pd
import numpy as np
# Seed NumPy's global random state before any sampling so that the random
# row order is reproducible and can match the expected doctest output above.
np.random.seed(0)
# Location of the CSV file that the solution is expected to load.
DATA = 'https://python3.info/_static/astro-database.csv'
# Placeholder to be replaced with the solution. The doctests reject the
# Ellipsis value and require `result` to be exactly a `pd.DataFrame`.
# type: pd.DataFrame
result = ...
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: DataFrame Sample
# - Difficulty: easy
# - Lines: 5
# - Minutes: 5
# %% English
# 1. Read data from `DATA` as `df: pd.DataFrame`
# 2. In the data, the column "Order":
# - determines the order in which the astronaut/cosmonaut flew to space
# - Sometimes several people flew on the same ship, so their numbers should be the same, but the data contains `NaN` instead.
# - Fill in the missing values using `df.ffill()`
# 3. Set all rows in random order
# 4. Reset index without leaving a backup copy of the old one and assign the outcome to `result`
# 5. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane z `DATA` jako `df: pd.DataFrame`
# 2. W danych kolumna "Order":
# - określa kolejność astronauty/kosmonauty w kosmosie
# - Czasami kilka osób leciało tym samym statkiem i ich numery powinny być takie same, a w danych jest `NaN`.
# - Wypełnij brakujące indeksy stosując `df.ffill()`
# 3. Ustaw wszystkie wiersze w losowej kolejności
# 4. Zresetuj index nie pozostawiając kopii zapasowej starego i przypisz wynik do `result`
# 5. Uruchom doctesty - wszystkie muszą się powieść
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> assert result is not Ellipsis, \
'Assign result to variable: `result`'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` must be a `pd.DataFrame` type'
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.max_columns', 10)
>>> pd.set_option('display.max_rows', 10)
>>> result # doctest: +NORMALIZE_WHITESPACE
Order Astronaut Type Date Spacecraft
0 244 Donald McMonagle Orbital 28 April 1991 STS-39
1 93 Georgi Ivanov Orbital 10 April 1979 Soyuz 33
2 387 Rick Husband Orbital 27 May 1999 STS-96
3 185 William Pailes Orbital 3 October 1985 51-J
4 390 Jeffrey Ashby Orbital 23 July 1999 STS-93
.. ... ... ... ... ...
578 277 Franco Malerba Orbital 31 July 1992 STS-46
579 10 Leroy Cooper Orbital 15 May 1963 Faith 7
580 359 Carlos Noriega Orbital 15 May 1997 STS-84
581 192 Rodolfo Neri Vela Orbital 27 November 1985 61-B
582 559 David Saint-Jacques Orbital 3 December 2018 Soyuz MS-11
<BLANKLINE>
[583 rows x 5 columns]
"""
import pandas as pd
import numpy as np
# Seed NumPy's global random state before any sampling so that the random
# row order is reproducible and can match the expected doctest output above.
np.random.seed(0)
# Location of the CSV file that the solution is expected to load.
DATA = 'https://python3.info/_static/astro-order.csv'
# Placeholder to be replaced with the solution. The doctests reject the
# Ellipsis value and require `result` to be exactly a `pd.DataFrame`
# (the expected output shows 583 rows with a fresh 0..582 index).
# type: pd.DataFrame
result = ...