6.21. Regex Case Study

6.21.1. National Identification Numbers

>>> def pesel_check_digit(self):
...     weights = (1, 3, 7, 9, 1, 3, 7, 9, 1, 3)
...     check = sum(w * int(n) for w, n in zip(weights, self.pesel))
...     return str((10 - check) % 10)
../../_images/regex-dowodosobisty-new-1.jpg
../../_images/regex-dowodosobisty-prl-1.jpg
../../_images/regex-dowodosobisty-prl-2.jpg
../../_images/regex-dowodosobisty-prl-3.jpg
../../_images/regex-dowodosobisty-old-1.jpg
../../_images/regex-dowodosobisty-old-2.jpg
../../_images/regex-dowodosobisty-old-3.jpg

6.21.2. Dates

ISO Date:

>>> pattern = r'^\d{4}-\d{2}-\d{2}$'

US Long Date:

>>> pattern = r'^\w+ \d{2}, \d{4}$'

US Short Date:

>>> pattern = r'^\d{2}/\d{2}/\d{2}$'

6.21.3. Email

>>> pattern = r'^[a-zA-Z0-9][\w.+-]*@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,20}$'

W3C HTML5 Standard [2] regexp for email field

>>> pattern = r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$"

6.21.4. URL

W3C standard for URL understanding

^(?=[^&])(?:(?<scheme>[^:/?#]+):)?(?://(?<authority>[^/?#]*))?
(?<path>[^?#]*)(?:\?(?<query>[^#]*))?(?:#(?<fragment>.*))?
>>> scheme = r'(?:(?P<scheme>[^:/?#]+):)?'
>>> authority = r'(?://(?P<authority>[^/?#]*))?'
>>> path = r'(?P<path>[^?#]*)'
>>> query = r'(?:\?(?P<query>[^#]*))?'
>>> fragment = r'(?:#(?P<fragment>.*))?'
>>>
>>> pattern = f'^(?=[^&]){scheme}{authority}{path}{query}{fragment}'
>>>
>>> print(pattern)
^(?=[^&])(?:(?P<scheme>[^:/?#]+):)?(?://(?P<authority>[^/?#]*))?(?P<path>[^?#]*)(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?

W3C standard for URL parsing

/^\s*[a-z](?:[-a-z0-9\+\.])*:(?:\/\/(?:(?:%[0-9a-f][0-9a-f]|[-a-z0-9\._~
\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD\u30000-
\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD\u80000-
\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD\uD0000-
\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:])*@)?(?:\[(?:(?:(?:[0-9a-f]{1,4}:)
{6}(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4]
[0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}
)|::(?:[0-9a-f]{1,4}:){5}(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]
|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2
[0-4][0-9]|25[0-5])){3})|(?:[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){4}
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]
|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){3}(?:[0-9a-f]{1,4}:
[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.
(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]
{1,4}:){0,2}[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){2}(?:[0-9a-f]{1,4}:[0-9a-f]
{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|
[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]{1,4}:){0,3}
[0-9a-f]{1,4})?::[0-9a-f]{1,4}:(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|
[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]
[0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]{1,4}:){0,4}[0-9a-f]{1,4})?::
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|
25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|
(?:(?:[0-9a-f]{1,4}:){0,5}[0-9a-f]{1,4})?::[0-9a-f]{1,4}|(?:(?:[0-9a-f]
{1,4}:){0,6}[0-9a-f]{1,4})?::)|v[0-9a-f]+[-a-z0-9\._~!\$&\'\(\)\*\+,;
=:]+)\]|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.
(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}|
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=@])*)(?::[0-9]*)?(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*|\/(?:(?:(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))+)(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*)?|(?:(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))+)(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*|(?!
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])))(?:\?(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])
|[\uE000-\uF8FF\uF0000-\uFFFFD|\u100000-\u10FFFD\/\?])*)?(?:\#(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])|[\/\?])*)?\s*$/i

6.21.5. Parsing URLs

To parse a URL url into its component parts, the user agent must use the following steps:

  1. Strip leading and trailing space characters from url.

  2. Parse url in the manner defined by RFC 3986, with the following exceptions:

    • Add all characters with code points less than or equal to U+0020 or greater than or equal to U+007F to the <unreserved> production.

    • Add the characters U+0022, U+003C, U+003E, U+005B ... U+005E, U+0060, and U+007B ... U+007D to the <unreserved> production.

    • Add a single U+0025 PERCENT SIGN character as a second alternative way of matching the <pct-encoded> production, except when the <pct-encoded> is used in the <reg-name> production.

    • Add the U+0023 NUMBER SIGN character to the characters allowed in the <fragment> production.

  3. If url doesn't match the <URI-reference> production, even after the above changes are made to the ABNF definitions, then parsing the URL fails with an error. [RFC 3986] Otherwise, parsing url was successful; the components of the URL are substrings of url defined as follows:

scheme

The substring matched by the <scheme> production, if any.

host

The substring matched by the <host> production, if any.

port

The substring matched by the <port> production, if any.

hostport

If there is a <scheme> component and a <port> component and the port given by the <port> component is different than the default port defined for the protocol given by the <scheme> component, then <hostport> is the substring that starts with the substring matched by the <host> production and ends with the substring matched by the <port> production, and includes the colon in between the two. Otherwise, it is the same as the <host> component.

path

The substring matched by one of the following productions, if one of them was matched:

path-abempty
path-absolute
path-noscheme
path-rootless
path-empty

query

The substring matched by the <query> production, if any.

fragment

The substring matched by the <fragment> production, if any.

host-specific

The substring that follows the substring matched by the <authority> production, or the whole string if the <authority> production wasn't matched.

6.21.6. References

6.21.7. Assignments

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: RE Standards IsValidPesel
# - Difficulty: easy
# - Lines: 4
# - Minutes: 5

# %% English
# 1. Write implementation of `is_pesel_valid`:
#    - Pesel validation using regex is too complex
#    - Use simplified pattern: r'^[0-9]{11}$'
#    - This pattern helps to avoid 80% of accidental mistakes
# 2. Run doctests - all must succeed

# %% Polish
# 1. Napisz implementację `is_pesel_valid`
#    - Walidacja Pesel za pomocą regex jest zbyt skomplikowana
#    - Użyj uproszczonego wzorca: r'^[0-9]{11}$'
#    - Ten wzorzec pozwoli na uniknięcie 80% przypadkowych błędów
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> is_pesel_valid('69072101234')
True
>>> is_pesel_valid('18220812345')
True
"""

import re

PATTERN = r'^[0-9]{11}$'


def is_pesel_valid(pesel):
    """
    Check whether ``pesel`` has the shape of a PESEL number.

    Only the simplified format is verified: exactly eleven ASCII digits.
    Neither the encoded birth date nor the check digit is validated.

    :param pesel: candidate PESEL given as a string
    :return: True if ``pesel`` consists of exactly 11 digits, False otherwise
    """
    # fullmatch also rejects a trailing newline, which `$` alone would accept
    return re.fullmatch(PATTERN, pesel) is not None


# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: RE Standards IsPeselWoman
# - Difficulty: easy
# - Lines: 3
# - Minutes: 5

# %% English
# 1. Write implementation of `is_pesel_woman`:
#    - Pesel belongs to a woman if second to last digit is even
#    - Do not use regex
# 2. Run doctests - all must succeed

# %% Polish
# 1. Napisz implementację `is_pesel_woman`:
#    - Pesel należy do kobiety, jeżeli przedostatnia cyfra jest parzysta
#    - Nie korzystaj z regex
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Tests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python 3.9+ required'

>>> is_pesel_woman(69072101234)
False
>>> is_pesel_woman(18220812345)
True
"""

PATTERN = r'^\d{11}$'
WOMAN = {0, 2, 4, 6, 8}
MAN = {1, 3, 5, 7, 9}


def is_pesel_woman(pesel: int) -> bool:
    """
    Check whether a PESEL number belongs to a woman.

    If the second to last digit is even, the PESEL is a woman's;
    otherwise it is a man's.

    :param pesel: PESEL number (an int; a string of digits is accepted too)
    :return: True if the second to last digit is even, False otherwise
    """
    # Drop the last digit, then read the new last digit - no regex needed.
    sex_digit = int(pesel) // 10 % 10
    return sex_digit in WOMAN


# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -v myfile.py`

# %% About
# - Name: RE Standards PESEL
# - Difficulty: medium
# - Lines: 0
# - Minutes: 5

# %% English
# 1. Discussion only - do not write any code
# 2. Consider only PESEL numbers of people born before the year 2000
# 3. Having PESEL "69072101234":
#    - What pattern can be at the first place in PESEL?
#    - What pattern can be at the second place in PESEL?
#    - What pattern can be at the third place in PESEL?
#    - What pattern can be at the fourth place in PESEL?
#    - What pattern can be at the fifth place in PESEL?
#    - What pattern can be at the sixth place in PESEL?
# 4. What is control digit or control sum?

# %% Polish
# 1. Tylko dyskusja - nie pisz żadnego kodu
# 2. Zajmujemy się tylko peselami ludzi urodzonymi przed 2000 rokiem
# 3. Mając PESEL "69072101234":
#    - Jakie wyrażenie może być na pierwszym miejscu w PESEL?
#    - Jakie wyrażenie może być na drugim miejscu w PESEL?
#    - Jakie wyrażenie może być na trzecim miejscu w PESEL?
#    - Jakie wyrażenie może być na czwartym miejscu w PESEL?
#    - Jakie wyrażenie może być na piątym miejscu w PESEL?
#    - Jakie wyrażenie może być na szóstym miejscu w PESEL?
# 4. Co to jest cyfra kontrolna lub suma kontrolna?