4.11. Regex Syntax Use Cases

4.11.1. National Identification Numbers

>>> def pesel_check_digit(self):
...     # One weight per digit; there are ten weights in total.
...     weights = (1, 3, 7, 9, 1, 3, 7, 9, 1, 3)
...     # zip() stops at the shorter input, so at most the first ten
...     # characters of self.pesel contribute to the weighted sum.
...     check = sum(w * int(n) for w, n in zip(weights, self.pesel))
...     # Python's % is never negative for a positive modulus, so a sum that
...     # is a multiple of ten yields '0' rather than '10'.
...     return str((10 - check) % 10)
../../_images/regex-dowodosobisty-new-1.jpg
../../_images/regex-dowodosobisty-prl-1.jpg
../../_images/regex-dowodosobisty-prl-2.jpg
../../_images/regex-dowodosobisty-prl-3.jpg
../../_images/regex-dowodosobisty-old-1.jpg
../../_images/regex-dowodosobisty-old-2.jpg
../../_images/regex-dowodosobisty-old-3.jpg

4.11.2. Dates

ISO Date:

>>> pattern = r'^\d{4}-\d{2}-\d{2}$'

US Long Date:

>>> pattern = r'^\w+ \d{2}, \d{4}$'

US Short Date:

>>> pattern = r'^\d{2}/\d{2}/\d{2}$'

4.11.3. Email

>>> pattern = r'^[a-zA-Z0-9][\w.+-]*@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,20}$'

W3C HTML5 Standard [2] regexp for email field

>>> pattern = r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$"

4.11.4. URL

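W3C standard for URL understanding. Note: the named groups below use the `(?<name>...)` spelling; Python's `re` module requires `(?P<name>...)`, so the pattern must be adapted before it is passed to `re.compile()`.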

^(?=[^&])(?:(?<scheme>[^:/?#]+):)?(?://(?<authority>[^/?#]*))?
(?<path>[^?#]*)(?:\?(?<query>[^#]*))?(?:#(?<fragment>.*))?
>>> scheme = r'(?:(?<scheme>[^:/?#]+):)?'
>>> authority = r'(?://(?<authority>[^/?#]*))?'
>>> path = r'(?<path>[^?#]*)'
>>> query = r'(?:\?(?<query>[^#]*))?'
>>> fragment = r'(?:#(?<fragment>.*))?'
>>>
>>> pattern = f'^(?=[^&]){scheme}{authority}{path}{query}{fragment}'
>>>
>>> print(pattern)
^(?=[^&])(?:(?<scheme>[^:/?#]+):)?(?://(?<authority>[^/?#]*))?(?<path>[^?#]*)(?:\?(?<query>[^#]*))?(?:#(?<fragment>.*))?

W3C standard for URL parsing

/^\s*[a-z](?:[-a-z0-9\+\.])*:(?:\/\/(?:(?:%[0-9a-f][0-9a-f]|[-a-z0-9\._~
\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD\u30000-
\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD\u80000-
\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD\uD0000-
\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:])*@)?(?:\[(?:(?:(?:[0-9a-f]{1,4}:)
{6}(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4]
[0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}
)|::(?:[0-9a-f]{1,4}:){5}(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]
|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2
[0-4][0-9]|25[0-5])){3})|(?:[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){4}
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]
|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){3}(?:[0-9a-f]{1,4}:
[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.
(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]
{1,4}:){0,2}[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){2}(?:[0-9a-f]{1,4}:[0-9a-f]
{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|
[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]{1,4}:){0,3}
[0-9a-f]{1,4})?::[0-9a-f]{1,4}:(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|
[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]
[0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]{1,4}:){0,4}[0-9a-f]{1,4})?::
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|
25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|
(?:(?:[0-9a-f]{1,4}:){0,5}[0-9a-f]{1,4})?::[0-9a-f]{1,4}|(?:(?:[0-9a-f]
{1,4}:){0,6}[0-9a-f]{1,4})?::)|v[0-9a-f]+[-a-z0-9\._~!\$&\'\(\)\*\+,;
=:]+)\]|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.
(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}|
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=@])*)(?::[0-9]*)?(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*|\/(?:(?:(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))+)(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*)?|(?:(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))+)(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*|(?!
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])))(?:\?(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])
|[\uE000-\uF8FF\uF0000-\uFFFFD|\u100000-\u10FFFD\/\?])*)?(?:\#(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])|[\/\?])*)?\s*$/i

4.11.5. Parsing URLs

To parse a URL url into its component parts, the user agent must use the following steps:

  1. Strip leading and trailing space characters from url.

  2. Parse url in the manner defined by RFC 3986, with the following exceptions:

    • Add all characters with code points less than or equal to U+0020 or greater than or equal to U+007F to the <unreserved> production.

    • Add the characters U+0022, U+003C, U+003E, U+005B ... U+005E, U+0060, and U+007B ... U+007D to the <unreserved> production.

    • Add a single U+0025 PERCENT SIGN character as a second alternative way of matching the <pct-encoded> production, except when the <pct-encoded> is used in the <reg-name> production.

    • Add the U+0023 NUMBER SIGN character to the characters allowed in the <fragment> production.

  3. If url doesn't match the <URI-reference> production, even after the above changes are made to the ABNF definitions, then parsing the URL fails with an error. [RFC 3986] Otherwise, parsing url was successful; the components of the URL are substrings of url defined as follows:

scheme

The substring matched by the <scheme> production, if any.

host

The substring matched by the <host> production, if any.

port

The substring matched by the <port> production, if any.

hostport

If there is a <scheme> component and a <port> component and the port given by the <port> component is different than the default port defined for the protocol given by the <scheme> component, then <hostport> is the substring that starts with the substring matched by the <host> production and ends with the substring matched by the <port> production, and includes the colon in between the two. Otherwise, it is the same as the <host> component.

path

The substring matched by one of the following productions, if one of them was matched:

path-abempty
path-absolute
path-noscheme
path-rootless
path-empty

query

The substring matched by the <query> production, if any.

fragment

The substring matched by the <fragment> production, if any.

host-specific

The substring that follows the substring matched by the <authority> production, or the whole string if the <authority> production wasn't matched.

4.11.6. References

4.11.7. Assignments

Code 4.24. Solution
"""
* Assignment: RE Standards IsValidPesel
* Complexity: easy
* Lines of code: 4 lines
* Time: 5 min

English:
    1. Write implementation of `is_pesel_valid`:
       a. Pesel validation using regex is too complex
       b. Use simplified pattern: r'^\d{11}$'
       c. This pattern helps avoid 80% of accidental mistakes
    2. Run doctests - all must succeed

Polish:
    1. Napisz implementację `is_pesel_valid`
       a. Walidacja Pesel za pomocą regex jest zbyt skomplikowana
       b. Użyj uproszczonego wzorca: r'^\d{11}$'
       c. Ten wzorzec pozwoli na uniknięcie 80% przypadkowych błędów
    2. Uruchom doctesty - wszystkie muszą się powieść

Tests:
    >>> import sys; sys.tracebacklimit = 0

    >>> is_pesel_valid('69072101234')
    True
    >>> is_pesel_valid('18220812345')
    True
"""

import re

# Simplified PESEL shape: a string of eleven digits, anchored at both ends.
PATTERN = r'^\d{11}$'


def is_pesel_valid(pesel: str) -> bool:
    """
    Check whether ``pesel`` has the shape of a PESEL number.

    The assignment text asks for a match against the simplified
    ``PATTERN`` (eleven digits); no check digit is verified.

    :param pesel: candidate PESEL given as a string.
    :return: ``True`` when ``pesel`` matches ``PATTERN``, per the doctests
        in the module docstring.
    """
    # Placeholder body: writing the implementation is the exercise.
    ...


Code 4.25. Solution
"""
* Assignment: RE Standards IsPeselWoman
* Complexity: easy
* Lines of code: 3 lines
* Time: 5 min

English:
    1. Write implementation of `is_pesel_woman`:
       a. Pesel belongs to a woman if second to last digit is even
       b. Do not use regex
    2. Run doctests - all must succeed

Polish:
    1. Napisz implementację `is_pesel_woman`:
       a. Pesel należy do kobiety, jeżeli przedostatnia cyfra jest parzysta
       b. Nie korzystaj z regex
    2. Uruchom doctesty - wszystkie muszą się powieść

Tests:
    >>> import sys; sys.tracebacklimit = 0

    >>> is_pesel_woman(69072101234)
    False
    >>> is_pesel_woman(18220812345)
    True
"""

# Shape of a PESEL: eleven digits, anchored at both ends.
# The assignment text asks for a non-regex solution, so this is informational.
PATTERN = r'^\d{11}$'
# Even digits 0..8; per the assignment text, an even second-to-last digit
# marks a woman's PESEL.
WOMAN = set(range(0, 10, 2))
# Odd digits 1..9, the complementary case.
MAN = set(range(1, 10, 2))


def is_pesel_woman(pesel: int) -> bool:
    """
    Check whether a PESEL number belongs to a woman.

    The second-to-last digit decides: if it is even the PESEL is a
    woman's, otherwise it is a man's.

    :param pesel: PESEL number; the doctests in the module docstring
        pass it as an ``int``.
    :return: ``True`` for a woman's PESEL, ``False`` for a man's.
    """
    # Placeholder body: writing the implementation is the exercise.
    ...


Code 4.26. Solution
"""
* Assignment: RE Standards PESEL
* Complexity: medium
* Lines of code: 0 lines
* Time: 5 min
* Warning: Discussion only - do not write any code

English:
    1. Discussion only - do not write any code
    2. Consider Pesel only for people born before year 2000
    3. Having PESEL "69072101234":
       a. What pattern can be at the first place in PESEL?
       b. What pattern can be at the second place in PESEL?
       c. What pattern can be at the third place in PESEL?
       d. What pattern can be at the fourth place in PESEL?
       e. What pattern can be at the fifth place in PESEL?
       f. What pattern can be at the sixth place in PESEL?
    4. What is control digit or control sum?

Polish:
    1. Tylko dyskusja - nie pisz żadnego kodu
    2. Zajmujemy się tylko peselami ludzi urodzonymi przed 2000 rokiem
    3. Mając PESEL "69072101234":
       a. Jakie wyrażenie może być na pierwszym miejscu w PESEL?
       b. Jakie wyrażenie może być na drugim miejscu w PESEL?
       c. Jakie wyrażenie może być na trzecim miejscu w PESEL?
       d. Jakie wyrażenie może być na czwartym miejscu w PESEL?
       e. Jakie wyrażenie może być na piątym miejscu w PESEL?
       f. Jakie wyrażenie może być na szóstym miejscu w PESEL?
    4. Co to jest cyfra kontrolna lub suma kontrolna?
"""