6.15. Regex RE Finditer

  • re.finditer()

6.15.1. SetUp

>>> import re

6.15.2. Example

>>> text = 'Hello World'
>>>
>>> re.finditer(r'[A-Z]', text)  
<callable_iterator object at 0x...>

6.15.3. Lazy Evaluation

>>> text = 'Hello World'
>>> result = re.finditer(r'[A-Z]', text)
>>>
>>> x = next(result)
>>> x.group()
'H'
>>> x.start()
0
>>> x.end()
1
>>> x.span()
(0, 1)
>>>
>>>
>>> x = next(result)
>>> x.group()
'W'
>>> x.start()
6
>>> x.end()
7
>>> x.span()
(6, 7)
>>>
>>> x = next(result)
Traceback (most recent call last):
StopIteration

6.15.4. Eager Evaluation

>>> text = 'Hello World'
>>>
>>> data = re.finditer(r'[A-Z]', text)
>>> result = [x.group() for x in data]
>>>
>>> print(result)
['H', 'W']

6.15.5. Iteration

>>> for result in re.finditer(r'[A-Z]', text):
...     letter = result.group()
...     position = result.span()
...     print(f'{letter=}, {position=}')
...
letter='H', position=(0, 1)
letter='W', position=(6, 7)

6.15.6. Use Case - 1

  • Find all JIRA issue keys in commit message

>>> import re
>>>
>>>
>>> TEXT = 'MYPROJ-1337, MYPROJ-997 removed obsolete comments'
>>> issuekey = r'[A-Z]{2,10}-[0-9]{1,6}'
>>>
>>> re.findall(issuekey, TEXT)
['MYPROJ-1337', 'MYPROJ-997']

6.15.7. Use Case - 2

  • Find All Adverbs

>>> import re
>>>
>>>
>>> TEXT = 'He was carefully disguised but captured quickly by police.'
>>> adverbs = r'\w+ly'
>>>
>>> re.findall(adverbs, TEXT)
['carefully', 'quickly']

6.15.8. Use Case - 3

>>> import re
>>>
>>>
>>> HTML = """
...     <h1>My Header</h1>
...     <p>First Paragraph</p>
...     <p>Second Paragraph</p>
...     <p>Third Paragraph</p>
... """
>>>
>>> result = re.finditer(r'<p>(.*)</p>', HTML)
>>> [x.group(1) for x in result]
['First Paragraph', 'Second Paragraph', 'Third Paragraph']
>>>
>>> for paragraph in re.finditer(r'<p>(.*)</p>', HTML):
...     text = paragraph.group(1)
...     if text.startswith('First'):
...         print(text)
First Paragraph

6.15.9. Assignments

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: Re Finditer Lazy
# - Difficulty: easy
# - Lines: 4
# - Minutes: 8

# %% English
# 1. Define `result: str` with paragraph starting with words:
#        "We choose to go to the moon"
# 2. Use `re.finditer()`
# 3. Run doctests - all must succeed

# %% Polish
# 1. Zdefiniuj `result: str` z tekstem paragrafu zaczynającego się od słów:
#        "We choose to go to the moon"
# 2. Użyj `re.finditer()`
# 3. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - All HTML paragraphs start with `<p>` and end with `</p>`
# - In real life, parsing paragraphs is more complex
# - You can iterate over the results of `re.finditer()`
# - `re.finditer()`
# - `str.startswith()`

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> assert type(result) is str, 'result must be a str'
>>> assert not result.startswith('<p>'), 'result cannot start with <p>'
>>> assert not result.endswith('</p>'), 'result cannot end with </p>'

>>> from pprint import pprint
>>> pprint(result, width=72)
('We choose to go to the moon. We choose to go to the moon in this '
 'decade and do the other things, not because they are easy, but '
 'because they are hard, because that goal will serve to organize and '
 'measure the best of our energies and skills,because that challenge '
 'is one that we are willing to accept, one we are unwilling to '
 'postpone, and one which we intend to win, and the others, too.')
"""

import re

# HTML fragment used as the exercise input: one <h1> heading followed by
# four <p> paragraphs. Only the heading ends with a newline; the paragraphs
# sit back-to-back on a single line, so a greedy `.*` between <p> and </p>
# would run from the first <p> to the last </p>.
# NOTE: spacing quirks in the text (e.g. "skills,because") are reproduced
# verbatim in the doctest's expected output above -- do not "fix" them here.
DATA = (
    "<h1>TEXT OF PRESIDENT JOHN KENNEDY'S RICE STADIUM MOON SPEECH</h1>\n"
    "<p>President Pitzer, Mr. Vice President, Governor, "
    "CongressmanThomas, Senator Wiley, and Congressman Miller, Mr. Webb, "
    "Mr.Bell, scientists, distinguished guests, and ladies and "
    "gentlemen:</p><p>We choose to go to the moon. We choose to go to "
    "the moon in this decade and do the other things, not because they "
    "are easy, but because they are hard, because that goal will serve "
    "to organize and measure the best of our energies and skills,because "
    "that challenge is one that we are willing to accept, one we are "
    "unwilling to postpone, and one which we intend to win, and the "
    "others, too.</p><p>It is for these reasons that I regard the "
    "decision last year to shift our efforts in space from low to high "
    "gear as among the most important decisions that will be made during "
    "my incumbency in the office of the Presidency.</p><p>In the last 24 "
    "hours we have seen facilities now being created for the greatest "
    "and most complex exploration in man's history.We have felt the "
    "ground shake and the air shattered by the testing of a Saturn C-1 "
    "booster rocket, many times as powerful as the Atlas which launched "
    "John Glenn, generating power equivalent to 10,000 automobiles with "
    "their accelerators on the floor.We have seen the site where the F-1 "
    "rocket engines, each one as powerful as all eight engines of the "
    "Saturn combined, will be clustered together to make the advanced "
    "Saturn missile, assembled in a new building to be built at Cape "
    "Canaveral as tall as a48 story structure, as wide as a city block, "
    "and as long as two lengths of this field.</p>"
)

# Captures the text between <p> and </p> in group 1; the `?` after `.*`
# makes the match non-greedy, so each match stops at the nearest </p>.
PATTERN = r'<p>(.*?)</p>'

# Exercise placeholder: replace `...` with your solution.
# Use finditer() and the non-greedy quantifier to get the paragraph
# "We choose...". The doctests above require a `str` that neither starts
# with `<p>` nor ends with `</p>`, i.e. the paragraph text without its tags.
# type: str
result = ...