6.21. Regex Case Study
6.21.1. National Identification Numbers
>>> def pesel_check_digit(self):
...     weights = (1, 3, 7, 9, 1, 3, 7, 9, 1, 3)
...     check = sum(w * int(n) for w, n in zip(weights, self.pesel))
...     return str((10 - check) % 10)
6.21.2. Dates
ISO Date:
>>> pattern = r'^\d{4}-\d{2}-\d{2}$'
US Long Date:
>>> pattern = r'^\w+ \d{2}, \d{4}$'
US Short Date:
>>> pattern = r'^\d{2}/\d{2}/\d{2}$'
6.21.3. Email
>>> pattern = r'^[a-zA-Z0-9][\w.+-]*@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,20}$'
W3C HTML5 Standard [2] regexp for email field
>>> pattern = r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$"
6.21.4. URL
W3C standard for URL understanding
^(?=[^&])(?:(?<scheme>[^:/?#]+):)?(?://(?<authority>[^/?#]*))?
(?<path>[^?#]*)(?:\?(?<query>[^#]*))?(?:#(?<fragment>.*))?
Python's re module spells named groups as (?P<name>...) instead of (?<name>...), so in Python the pattern above is written as:
>>> scheme = r'(?:(?P<scheme>[^:/?#]+):)?'
>>> authority = r'(?://(?P<authority>[^/?#]*))?'
>>> path = r'(?P<path>[^?#]*)'
>>> query = r'(?:\?(?P<query>[^#]*))?'
>>> fragment = r'(?:#(?P<fragment>.*))?'
>>>
>>> pattern = f'^(?=[^&]){scheme}{authority}{path}{query}{fragment}'
>>>
>>> print(pattern)
^(?=[^&])(?:(?P<scheme>[^:/?#]+):)?(?://(?P<authority>[^/?#]*))?(?P<path>[^?#]*)(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?
W3C standard for URL parsing
/^\s*[a-z](?:[-a-z0-9\+\.])*:(?:\/\/(?:(?:%[0-9a-f][0-9a-f]|[-a-z0-9\._~
\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD\u30000-
\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD\u80000-
\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD\uD0000-
\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:])*@)?(?:\[(?:(?:(?:[0-9a-f]{1,4}:)
{6}(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4]
[0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}
)|::(?:[0-9a-f]{1,4}:){5}(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]
|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2
[0-4][0-9]|25[0-5])){3})|(?:[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){4}
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]
|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){3}(?:[0-9a-f]{1,4}:
[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.
(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]
{1,4}:){0,2}[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){2}(?:[0-9a-f]{1,4}:[0-9a-f]
{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|
[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]{1,4}:){0,3}
[0-9a-f]{1,4})?::[0-9a-f]{1,4}:(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|
[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]
[0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]{1,4}:){0,4}[0-9a-f]{1,4})?::
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|
25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|
(?:(?:[0-9a-f]{1,4}:){0,5}[0-9a-f]{1,4})?::[0-9a-f]{1,4}|(?:(?:[0-9a-f]
{1,4}:){0,6}[0-9a-f]{1,4})?::)|v[0-9a-f]+[-a-z0-9\._~!\$&\'\(\)\*\+,;
=:]+)\]|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.
(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}|
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=@])*)(?::[0-9]*)?(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*|\/(?:(?:(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))+)(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*)?|(?:(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))+)(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*|(?!
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])))(?:\?(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])
|[\uE000-\uF8FF\uF0000-\uFFFFD|\u100000-\u10FFFD\/\?])*)?(?:\#(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])|[\/\?])*)?\s*$/i
6.21.5. Parsing URLs
Source [3]
To parse a URL url into its component parts, the user agent must use the following steps:
Strip leading and trailing space characters from url.
Parse url in the manner defined by RFC 3986, with the following exceptions:
- Add all characters with code points less than or equal to U+0020 or greater than or equal to U+007F to the <unreserved> production.
- Add the characters U+0022, U+003C, U+003E, U+005B ... U+005E, U+0060, and U+007B ... U+007D to the <unreserved> production.
- Add a single U+0025 PERCENT SIGN character as a second alternative way of matching the <pct-encoded> production, except when the <pct-encoded> is used in the <reg-name> production.
- Add the U+0023 NUMBER SIGN character to the characters allowed in the <fragment> production.
If url doesn't match the <URI-reference> production, even after the above changes are made to the ABNF definitions, then parsing the URL fails with an error. [RFC 3986]
Otherwise, parsing url was successful; the components of the URL are substrings of url defined as follows:
- scheme
The substring matched by the <scheme> production, if any.
- host
The substring matched by the <host> production, if any.
- port
The substring matched by the <port> production, if any.
- hostport
If there is a <scheme> component and a <port> component and the port given by the <port> component is different than the default port defined for the protocol given by the <scheme> component, then <hostport> is the substring that starts with the substring matched by the <host> production and ends with the substring matched by the <port> production, and includes the colon in between the two. Otherwise, it is the same as the <host> component.
- path
The substring matched by one of the following productions, if one of them was matched:
- path-abempty
- path-absolute
- path-noscheme
- path-rootless
- path-empty
- query
The substring matched by the <query> production, if any.
- fragment
The substring matched by the <fragment> production, if any.
- host-specific
The substring that follows the substring matched by the <authority> production, or the whole string if the <authority> production wasn't matched.
6.21.6. References
6.21.7. Assignments
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: RE Standards IsValidPesel
# - Difficulty: easy
# - Lines: 4
# - Minutes: 5
# %% English
# 1. Write implementation of `is_pesel_valid`:
# - Pesel validation using regex is too complex
# - Use simplified pattern: r'^[0-9]{11}$'
# - This pattern will help to avoid 80% of accidental mistakes
# 2. Run doctests - all must succeed
# %% Polish
# 1. Napisz implementację `is_pesel_valid`
# - Walidacja Pesel za pomocą regex jest zbyt skomplikowana
# - Użyj uproszczonego wzorca: r'^[0-9]{11}$'
# - Ten wzorzec pozwoli na uniknięcie 80% przypadkowych błędów
# 2. Uruchom doctesty - wszystkie muszą się powieść
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> is_pesel_valid('69072101234')
True
>>> is_pesel_valid('18220812345')
True
"""
import re
PATTERN = r'^[0-9]{11}$'


def is_pesel_valid(pesel: str) -> bool:
    """
    Check whether ``pesel`` has the shape of a PESEL number.

    Intended contract, as stated in the assignment text: the value is
    accepted when it matches the simplified ``PATTERN`` (11 digits).
    Full PESEL validation is deliberately out of scope.

    :param pesel: candidate PESEL, given as a string
    :return: ``True`` when ``pesel`` matches ``PATTERN``
    """
    # Exercise placeholder: the implementation is left to the student.
    ...
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: RE Standards IsPeselWoman
# - Difficulty: easy
# - Lines: 3
# - Minutes: 5
# %% English
# 1. Write implementation of `is_pesel_woman`:
# - Pesel belongs to a woman if second to last digit is even
# - Do not use regex
# 2. Run doctests - all must succeed
# %% Polish
# 1. Napisz implementację `is_pesel_woman`:
# - Pesel należy do kobiety, jeżeli przedostatnia cyfra jest parzysta
# - Nie korzystaj z regex
# 2. Uruchom doctesty - wszystkie muszą się powieść
# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'
>>> is_pesel_woman(69072101234)
False
>>> is_pesel_woman(18220812345)
True
"""
PATTERN = r'^\d{11}$'
WOMAN = {0, 2, 4, 6, 8}
MAN = {1, 3, 5, 7, 9}


def is_pesel_woman(pesel: int) -> bool:
    """
    Check whether a PESEL number belongs to a woman.

    The second-to-last digit encodes the sex: an even digit (see
    ``WOMAN``) means the PESEL is a woman's, otherwise (see ``MAN``)
    it is a man's.

    :param pesel: PESEL number
    :return: ``True`` for a woman's PESEL, ``False`` for a man's
    """
    # Exercise placeholder: the implementation is left to the student.
    ...
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`
# %% About
# - Name: RE Standards PESEL
# - Difficulty: medium
# - Lines: 0
# - Minutes: 5
# %% English
# 1. Discussion only - do not write any code
# 2. Consider only PESELs of people born before the year 2000
# 3. Having PESEL "69072101234":
# - What pattern can be at the first place in PESEL?
# - What pattern can be at the second place in PESEL?
# - What pattern can be at the third place in PESEL?
# - What pattern can be at the fourth place in PESEL?
# - What pattern can be at the fifth place in PESEL?
# - What pattern can be at the sixth place in PESEL?
# 4. What is a control digit or a control sum?
# %% Polish
# 1. Tylko dyskusja - nie pisz żadnego kodu
# 2. Zajmujemy się tylko peselami ludzi urodzonych przed 2000 rokiem
# 3. Mając PESEL "69072101234":
# - Jakie wyrażenie może być na pierwszym miejscu w PESEL?
# - Jakie wyrażenie może być na drugim miejscu w PESEL?
# - Jakie wyrażenie może być na trzecim miejscu w PESEL?
# - Jakie wyrażenie może być na czwartym miejscu w PESEL?
# - Jakie wyrażenie może być na piątym miejscu w PESEL?
# - Jakie wyrażenie może być na szóstym miejscu w PESEL?
# 4. Co to jest cyfra kontrolna lub suma kontrolna?