6.16. Regex RE Search

re.search()
Searches if string contains a match for the pattern
Stops after first match

6.16.1. SetUp

>>> import re

6.16.2. Problem

Does the text contain any uppercase letters?

>>> from string import ascii_uppercase
>>>
>>> text = 'Hello World'
>>>
>>> result = False
>>> for letter in text:
...     if letter in ascii_uppercase:
...         result = True
...         break
>>>
>>> if result:
...     print('yes')
... else:
...     print('no')
yes

6.16.3. Solution

Does the text contain any uppercase letters?

>>> text = 'Hello World'
>>> result = re.search(r'[A-Z]', text)
>>>
>>> if result:
...     print('yes')
... else:
...     print('no')
yes

6.16.4. Result

re.search() -> re.Match | None - Optional re.Match

>>> text = 'Hello World'
>>>
>>> re.search(r'[A-Z]', text)
<re.Match object; span=(0, 1), match='H'>

6.16.5. Methods

>>> email = 'mwatney@nasa.gov'
>>> result = re.search(r'(?P<username>[a-z]+)@nasa.gov', email)

Position:

>>> result.span()
(0, 16)
>>>
>>> result.start()
0
>>>
>>> result.end()
16

Working with groups:

>>> result.group()
'mwatney@nasa.gov'
>>>
>>> result.group(0)
'mwatney@nasa.gov'
>>>
>>> result.group(1)
'mwatney'
>>>
>>> result.group('username')
'mwatney'
>>>
>>> result.groups()
('mwatney',)
>>>
>>> result.groupdict()
{'username': 'mwatney'}

Diagnostics:

>>> result.string
'mwatney@nasa.gov'
>>>
>>> result.re
re.compile('(?P<username>[a-z]+)@nasa.gov')

6.16.6. Usage

>>> text = 'Hello World'
>>> result = re.search(r'[A-Z]', text)
>>>
>>> if result:
...     print('found')
... else:
...     print('not found')
found

6.16.7. Use Case - 1

>>> import re
>>>
>>>
>>> def contains(pattern, text):
...     if re.search(pattern, text):
...         return True
...     else:
...         return False
>>>
>>>
>>> COMMIT_MESSAGE = 'MYPROJ-1337, MYPROJ-69 removed obsolete comments'
>>> jira_issuekey = r'[A-Z]{2,10}-[0-9]{1,6}'
>>> redmine_number = r'#[0-9]+'
>>>
>>> contains(jira_issuekey, COMMIT_MESSAGE)
True
>>> contains(redmine_number, COMMIT_MESSAGE)
False

6.16.8. Use Case - 2

>>> import re
>>>
>>>
>>> TEXT = 'We choose to go to the moon.'
>>>
>>> result = re.search(r'moon', TEXT)
>>>
>>> result
<re.Match object; span=(23, 27), match='moon'>
>>>
>>> result.span()
(23, 27)
>>>
>>> result.regs
((23, 27),)
>>>
>>> TEXT[23]
'm'
>>> TEXT[23:27]
'moon'

6.16.9. Use Case - 3

>>> import re
>>>
>>>
>>> TEXT = 'We choose to go to the moon.'
>>>
>>>
>>> result = re.search(r'Mars', TEXT)
>>>
>>> result.group()
Traceback (most recent call last):
AttributeError: 'NoneType' object has no attribute 'group'
>>>
>>> result = re.search(r'Mars', TEXT)
>>> if result:
...     result.group()
>>>
>>>
>>> if result := re.search(r'Mars', TEXT):
...     result.group()

6.16.10. Assignments

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: RE Search Astronauts
# - Difficulty: easy
# - Lines: 6
# - Minutes: 5

# %% English
# 1. Define variables with start and end position in `DATA`:
#    - `result_a: tuple[int,int]` for 'Neil Armstrong'
#    - `result_b: tuple[int,int]` for 'Buzz Aldrin'
#    - `result_c: tuple[int,int]` for 'Michael Collins'
#    - `result_d: tuple[int,int]` for 'Mark Watney'
# 2. For each element return a tuple, e.g. `(10, 20)`
# 3. If element is not present in `DATA` assign `None`
# 4. Run doctests - all must succeed

# %% Polish
# 1. Zdefiniuj zmienne z pozycją startu i końca w `DATA`:
#    - `result_a: tuple[int,int]` dla 'Neil Armstrong'
#    - `result_b: tuple[int,int]` dla 'Buzz Aldrin'
#    - `result_c: tuple[int,int]` dla 'Michael Collins'
#    - `result_d: tuple[int,int]` dla 'Mark Watney'
# 2. Dla każdego ciągu znaków zwracaj tuple np. `(10, 20)`
# 3. Jeżeli ciąg znaków nie jest obecny w `DATA` przypisz `None`
# 4. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `re.search()`
# - `re.Match.span()`

# %% References
# [1] Wikipedia: Apollo 11
#     URL: https://en.wikipedia.org/wiki/Apollo_11
#     Year: 2019
#     Retrieved: 2019-12-14

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert result_a is not Ellipsis, \
'Assign result to variable: `result_a`'
>>> assert type(result_a) is not type(None), \
'Variable `result_a` has invalid type, cannot be None'
>>> assert type(result_a) is tuple, \
'Variable `result_a` has invalid type, should be tuple'
>>> assert all(type(x) is int for x in result_a), \
'All elements in variable `result_a`, should be int'
>>> assert len(result_a) == 2, \
'Variable `result_a` has invalid length, should be 2'

>>> assert result_b is not Ellipsis, \
'Assign result to variable: `result_b`'
>>> assert type(result_b) is not type(None), \
'Variable `result_b` has invalid type, cannot be None'
>>> assert type(result_b) is tuple, \
'Variable `result_b` has invalid type, should be tuple'
>>> assert all(type(x) is int for x in result_b), \
'All elements in variable `result_b`, should be int'
>>> assert len(result_b) == 2, \
'Variable `result_b` has invalid length, should be 2'

>>> assert result_c is not Ellipsis, \
'Assign result to variable: `result_c`'
>>> assert type(result_c) is not type(None), \
'Variable `result_c` has invalid type, cannot be None'
>>> assert type(result_c) is tuple, \
'Variable `result_c` has invalid type, should be tuple'
>>> assert all(type(x) is int for x in result_c), \
'All elements in variable `result_c`, should be int'
>>> assert len(result_c) == 2, \
'Variable `result_c` has invalid length, should be 2'

>>> assert result_d is not Ellipsis, \
'Assign result to variable: `result_d`'
>>> assert type(result_d) is type(None), \
'Variable `result_d` has invalid type, should be None'

>>> print(result_a)
(78, 92)
>>> print(result_b)
(116, 127)
>>> print(result_c)
(562, 577)
>>> print(result_d)
None
"""

import re


DATA = (
    "Apollo 11 was the spaceflight that first landed humans on the Moon. "
    "Commander Neil Armstrong and lunar module pilot Buzz Aldrin formed "
    "the American crew that landed the Apollo Lunar Module Eagle on "
    "July 20, 1969, at 20:17 UTC. Armstrong became the first person to "
    "step onto the lunar surface six hours and 39 minutes later on "
    "July 21 at 02:56 UTC; Aldrin joined him 19 minutes later. They spent "
    "about two and a quarter hours together outside the spacecraft, "
    "and they collected 47.5 pounds (21.5 kg) of lunar material to bring "
    "back to Earth. Command module pilot Michael Collins flew the command "
    "module Columbia alone in lunar orbit while they were on the Moon's "
    "surface. Armstrong and Aldrin spent 21 hours, 36 minutes on the "
    "lunar surface at a site they named Tranquility Base before lifting "
    "off to rejoin Columbia in lunar orbit. "
)


# use re.search() to get the (start, end) position of 'Neil Armstrong', or None if not found
# type: tuple[int,int] | None
result_a = ...

# use re.search() to get the (start, end) position of 'Buzz Aldrin', or None if not found
# type: tuple[int,int] | None
result_b = ...

# use re.search() to get the (start, end) position of 'Michael Collins', or None if not found
# type: tuple[int,int] | None
result_c = ...

# use re.search() to get the (start, end) position of 'Mark Watney', or None if not found
# type: tuple[int,int] | None
result_d = ...

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: RE Search Moon Speech
# - Difficulty: easy
# - Lines: 5
# - Minutes: 8

# %% English
# 1. Use `re.search()` to find in text [1]
# 2. Define `result: str` containing paragraph starting with 'We choose to go to the moon'
# 3. Run doctests - all must succeed

# %% Polish
# 1. Użyj `re.search()` do znalezienia w tekście [1]
# 2. Zdefiniuj `result: str` zawierający tekst paragrafu zaczynający się od słów "We choose to go to the moon"
# 3. Uruchom doctesty - wszystkie muszą się powieść

# %% References
# [1] Kennedy, J.F. Moon Speech - Rice Stadium,
#     URL: http://er.jsc.nasa.gov/seh/ricetalk.htm
#     Year: 2019
#     Retrieved: 2019-12-14

# %% Hints
# - All HTML paragraphs start with `<p>` and end with `</p>`
# - In real life paragraphs parsing is more complex
# - `re.search()`
# - `re.DOTALL`
# - `re.Match.group()`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert type(result) is str, 'result must be a str'
>>> assert not result.startswith('<p>'), 'result cannot start with <p>'
>>> assert not result.endswith('</p>'), 'result cannot end with </p>'

>>> print(result)
We choose to go to the moon. We choose to go to
the moon in this decade and do the other things, not because they
are easy, but because they are hard, because that goal will serve
to organize and measure the best of our energies and skills,because
that challenge is one that we are willing to accept, one we are
unwilling to postpone, and one which we intend to win, and the
others, too.
"""

import re


DATA = """<h1>TEXT OF PRESIDENT JOHN KENNEDY'S RICE STADIUM MOON SPEECH</h1>
<p>President Pitzer, Mr. Vice President, Governor,
CongressmanThomas, Senator Wiley, and Congressman Miller, Mr. Webb,
Mr.Bell, scientists, distinguished guests, and ladies and
gentlemen:</p><p>We choose to go to the moon. We choose to go to
the moon in this decade and do the other things, not because they
are easy, but because they are hard, because that goal will serve
to organize and measure the best of our energies and skills,because
that challenge is one that we are willing to accept, one we are
unwilling to postpone, and one which we intend to win, and the
others, too.</p><p>It is for these reasons that I regard the
decision last year to shift our efforts in space from low to high
gear as among the most important decisions that will be made during
my incumbency in the office of the Presidency.</p><p>In the last 24
hours we have seen facilities now being created for the greatest
and most complex exploration in man's history.We have felt the
ground shake and the air shattered by the testing of a Saturn C-1
booster rocket, many times as powerful as the Atlas which launched
John Glenn, generating power equivalent to 10,000 automobiles with
their accelerators on the floor.We have seen the site where the F-1
rocket engines, each one as powerful as all eight engines of the
Saturn combined, will be clustered together to make the advanced
Saturn missile, assembled in a new building to be built at Cape
Canaveral as tall as a48 story structure, as wide as a city block,
and as long as two lengths of this field.</p>
"""


# use re.search() to get paragraph starting with "We choose..."
# use .group(1) to get the value from re.Match object
# type: str
result = ...

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: RE Search Time
# - Difficulty: easy
# - Lines: 4
# - Minutes: 3

# %% English
# 1. Use regular expressions to check `DATA` [1]
#    contains time in UTC (24 hour clock compliant with ISO-8601)
# 2. Define `result: str` with matched time
# 3. Use simplified checking `xx:xx UTC`,
#    where `x` is a digit
# 4. Text does not contain any invalid time
# 5. Run doctests - all must succeed

# %% Polish
# 1. Użyj wyrażeń regularnych do sprawdzenia czy `DATA` [1]
#    zawiera godzinę w UTC (format 24 godzinny zgodny z ISO-8601)
# 2. Zdefiniuj `result: str` ze znalezionym czasem
# 3. Użyj uproszczonego sprawdzania: `xx:xx UTC`,
#    gdzie `x` to dowolna cyfra
# 4. Tekst nie zawiera żadnej niepoprawnej godziny
# 5. Uruchom doctesty - wszystkie muszą się powieść

# %% References
# [1] Wikipedia Apollo 11,
#     URL: https://en.wikipedia.org/wiki/Apollo_11
#     Year: 2019
#     Retrieved: 2019-12-14

# %% Hints
# - `re.Match.group()`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert type(result) is str, 'result must be a str'
>>> assert result.endswith('UTC'), 'result must contain timezone'

>>> result
'20:17 UTC'
"""

import re


DATA = """Apollo 11 was the American spaceflight that first landed
humans on the Moon. Commander (CDR) Neil Armstrong and lunar module
pilot (LMP) Buzz Aldrin landed the Apollo Lunar Module (LM) Eagle on
July 20th, 1969 at 20:17 UTC, and Armstrong became the first person
to step (EVA) onto the Moon's surface (EVA) 6 hours 39 minutes later,
on July 21st, 1969 at 02:56:15 UTC. Aldrin joined him 19 minutes later.
They spent 2 hours 31 minutes exploring the site they had named
Tranquility Base upon landing. Armstrong and Aldrin collected 47.5 pounds
(21.5 kg) of lunar material to bring back to Earth as pilot Michael Collins
(CMP) flew the Command Module (CM) Columbia in lunar orbit, and were on the
Moon's surface for 21 hours 36 minutes before lifting off to rejoin
Columbia."""


# Pattern for searching time with timezone in 24-hour format, e.g. '23:59 UTC'
# Text does not contain any invalid time
# type: str
pattern = ...

# use re.search() to find pattern in DATA, get result text
# use .group() to get the value from re.Match object
# type: str
result = ...

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: RE Search Time
# - Difficulty: easy
# - Lines: 4
# - Minutes: 5

# %% English
# 1. Use regular expressions to check `DATA` [1]
#    contains time in UTC (24 hour clock compliant with ISO-8601)
# 2. Define `result: str` with matched time
# 3. Use real checking `xx:xx UTC`,
#    where `x` is a valid digit at the position
# 4. Text contains invalid time `24:17 UTC`
# 5. Run doctests - all must succeed

# %% Polish
# 1. Użyj wyrażeń regularnych do sprawdzenia czy `DATA` [1]
#    zawiera godzinę w UTC (format 24 godzinny zgodny z ISO-8601)
# 2. Zdefiniuj `result: str` ze znalezionym czasem
# 3. Użyj poprawnego sprawdzania: `xx:xx UTC`,
#    gdzie `x` to odpowiedni znak na danym miejscu
# 4. Tekst zawiera niepoprawną godzinę: `24:17 UTC`
# 5. Uruchom doctesty - wszystkie muszą się powieść

# %% References
# [1] Wikipedia Apollo 11,
#     URL: https://en.wikipedia.org/wiki/Apollo_11
#     Year: 2019
#     Retrieved: 2019-12-14

# %% Hints
# - `re.Match.group()`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert type(result) is str, 'result must be a str'
>>> assert result.endswith('UTC'), 'result must contain timezone'

>>> result
'02:56 UTC'
"""

import re


DATA = """Apollo 11 was the American spaceflight that first landed
humans on the Moon. Commander (CDR) Neil Armstrong and lunar module
pilot (LMP) Buzz Aldrin landed the Apollo Lunar Module (LM) Eagle on
July 20th, 1969 at 24:17 UTC, and Armstrong became the first person
to step (EVA) onto the Moon's surface (EVA) 6 hours 39 minutes later,
on July 21st, 1969 at 02:56 UTC. Aldrin joined him 19 minutes later.
They spent 2 hours 31 minutes exploring the site they had named
Tranquility Base upon landing. Armstrong and Aldrin collected 47.5 pounds
(21.5 kg) of lunar material to bring back to Earth as pilot Michael Collins
(CMP) flew the Command Module (CM) Columbia in lunar orbit, and were on the
Moon's surface for 21 hours 36 minutes before lifting off to rejoin
Columbia."""


# Pattern for searching time with timezone in 24-hour format, e.g. '23:59 UTC'
# Text contains invalid time `24:17 UTC`
# type: str
pattern = ...

# use re.search() to find pattern in DATA, get result text
# use .group() to get the value from re.Match object
# type: str
result = ...