6.15. Regex RE Finditer
re.finditer()
6.15.1. SetUp
>>> import re
6.15.2. Example
>>> text = 'Hello World'
>>>
>>> re.finditer(r'[A-Z]', text)
<callable_iterator object at 0x...>
6.15.3. Lazy Evaluation
>>> text = 'Hello World'
>>> result = re.finditer(r'[A-Z]', text)
>>>
>>> x = next(result)
>>> x.group()
'H'
>>> x.start()
0
>>> x.end()
1
>>> x.span()
(0, 1)
>>>
>>>
>>> x = next(result)
>>> x.group()
'W'
>>> x.start()
6
>>> x.end()
7
>>> x.span()
(6, 7)
>>>
>>> x = next(result)
Traceback (most recent call last):
StopIteration
6.15.4. Eager Evaluation
>>> text = 'Hello World'
>>>
>>> data = re.finditer(r'[A-Z]', text)
>>> result = [x.group() for x in data]
>>>
>>> print(result)
['H', 'W']
6.15.5. Iteration
>>> for result in re.finditer(r'[A-Z]', text):
...     letter = result.group()
...     position = result.span()
...     print(f'{letter=}, {position=}')
...
letter='H', position=(0, 1)
letter='W', position=(6, 7)
6.15.6. Use Case - 1
Find all JIRA issue keys in commit message
>>> import re
>>>
>>>
>>> TEXT = 'MYPROJ-1337, MYPROJ-997 removed obsolete comments'
>>> issuekey = r'[A-Z]{2,10}-[0-9]{1,6}'
>>>
>>> re.findall(issuekey, TEXT)
['MYPROJ-1337', 'MYPROJ-997']
6.15.7. Use Case - 2
Find All Adverbs
>>> import re
>>>
>>>
>>> TEXT = 'He was carefully disguised but captured quickly by police.'
>>> adverbs = r'\w+ly'
>>>
>>> re.findall(adverbs, TEXT)
['carefully', 'quickly']
6.15.8. Use Case - 3
>>> import re
>>>
>>>
>>> HTML = """
... <h1>My Header</h1>
... <p>First Paragraph</p>
... <p>Second Paragraph</p>
... <p>Third Paragraph</p>
... """
>>>
>>> result = re.finditer(r'<p>(.*)</p>', HTML)
>>> [x.group(1) for x in result]
['First Paragraph', 'Second Paragraph', 'Third Paragraph']
>>>
>>> for paragraph in re.finditer(r'<p>(.*)</p>', HTML):
...     text = paragraph.group(1)
...     if text.startswith('First'):
...         print(text)
First Paragraph
6.15.9. Assignments
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: Re Finditer Lazy
# - Difficulty: easy
# - Lines: 4
# - Minutes: 8
# %% English
# 1. Define `result: str` with paragraph starting with words:
# "We choose to go to the moon"
# 2. Use `re.finditer()`
# 3. Run doctests - all must succeed
# %% Polish
# 1. Zdefiniuj `result: str` z tekstem paragrafu zaczynającego się od słów:
# "We choose to go to the moon"
# 2. Użyj `re.finditer()`
# 3. Uruchom doctesty - wszystkie muszą się powieść
# %% Hints
# - All HTML paragraphs start with `<p>` and end with `</p>`
# - In real life paragraphs parsing is more complex
# - You can iterate over the results of `re.finditer()`
# - `re.finditer()`
# - `str.startswith()`
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> assert type(result) is str, 'result must be a str'
>>> assert not result.startswith('<p>'), 'result cannot start with <p>'
>>> assert not result.endswith('</p>'), 'result cannot end with </p>'
>>> from pprint import pprint
>>> pprint(result, width=72)
('We choose to go to the moon. We choose to go to the moon in this '
'decade and do the other things, not because they are easy, but '
'because they are hard, because that goal will serve to organize and '
'measure the best of our energies and skills,because that challenge '
'is one that we are willing to accept, one we are unwilling to '
'postpone, and one which we intend to win, and the others, too.')
"""
import re
# HTML input for the exercise: one <h1> header line followed by four <p>
# paragraphs. The adjacent string literals inside the parentheses are joined
# into a single `str`; the only newline is the one right after the header.
DATA = (
    "<h1>TEXT OF PRESIDENT JOHN KENNEDY'S RICE STADIUM MOON SPEECH</h1>\n"
    "<p>President Pitzer, Mr. Vice President, Governor, "
    "CongressmanThomas, Senator Wiley, and Congressman Miller, Mr. Webb, "
    "Mr.Bell, scientists, distinguished guests, and ladies and "
    "gentlemen:</p><p>We choose to go to the moon. We choose to go to "
    "the moon in this decade and do the other things, not because they "
    "are easy, but because they are hard, because that goal will serve "
    "to organize and measure the best of our energies and skills,because "
    "that challenge is one that we are willing to accept, one we are "
    "unwilling to postpone, and one which we intend to win, and the "
    "others, too.</p><p>It is for these reasons that I regard the "
    "decision last year to shift our efforts in space from low to high "
    "gear as among the most important decisions that will be made during "
    "my incumbency in the office of the Presidency.</p><p>In the last 24 "
    "hours we have seen facilities now being created for the greatest "
    "and most complex exploration in man's history.We have felt the "
    "ground shake and the air shattered by the testing of a Saturn C-1 "
    "booster rocket, many times as powerful as the Atlas which launched "
    "John Glenn, generating power equivalent to 10,000 automobiles with "
    "their accelerators on the floor.We have seen the site where the F-1 "
    "rocket engines, each one as powerful as all eight engines of the "
    "Saturn combined, will be clustered together to make the advanced "
    "Saturn missile, assembled in a new building to be built at Cape "
    "Canaveral as tall as a48 story structure, as wide as a city block, "
    "and as long as two lengths of this field.</p>"
)

# Captures the text between `<p>` and `</p>`. The lazy `*?` quantifier stops
# at the first closing tag, so every paragraph becomes its own match instead
# of one match spanning from the first `<p>` to the last `</p>`.
PATTERN = r'<p>(.*?)</p>'

# Exercise placeholder: replace the Ellipsis with an expression based on
# `re.finditer()` that selects the paragraph starting with "We choose...".
# Expected type of `result`: str
result = ...