ruk·si

🐍 Python
Regex

Updated at 2019-01-02 00:34

Python has powerful regular expression tools in its standard library.

match() and search() are different: match() checks whether the pattern matches at the beginning of the input, while search() checks whether the input contains the pattern anywhere.

import re

# Compile once; the same pattern object is used for both calls below.
bar_re = re.compile(r'bar')

# match() only succeeds when the pattern matches at the very start of the
# input, and 'foobar' starts with 'foo'.
anchored = bar_re.match('foobar')
assert anchored is None

# search() scans through the input and finds 'bar' at indices 3..6.
found = bar_re.search('foobar')
assert (found.start(), found.end()) == (3, 6)

The match() method of a compiled pattern takes an optional start position (pos) parameter.

import re

bar_re = re.compile(r'bar')

# Without a start position, matching is attempted at index 0 and fails.
from_start = bar_re.match('foobar')
assert from_start is None

# With a start position of 3, matching is attempted where 'bar' begins.
from_offset = bar_re.match('foobar', 3)
assert (from_offset.start(), from_offset.end()) == (3, 6)

split() is also sometimes useful.

import re

# Any run of commas, semicolons and/or whitespace counts as ONE separator.
separator_re = re.compile(r'[,;\s]+')

# The ',; ' between the two names is consumed as a single delimiter, so no
# empty strings appear in the output.
result = separator_re.split('Ruksi,; Laine')
expected_parts = ['Ruksi', 'Laine']
assert result == expected_parts

re.I (short for re.IGNORECASE) makes matching case-insensitive.

import re

# re.IGNORECASE is the long name of re.I: [a-z] now also accepts A-Z.
letters_re = re.compile(r'[a-z]+', flags=re.IGNORECASE)

# The digits and the space are skipped; the mixed-case word is matched whole.
word = letters_re.search('465 ThiSMatchjes')
assert (word.start(), word.end()) == (4, 16)

Groups are an essential concept.

import re

# Three named groups, separated by a literal '@' and a literal ':'.
id_re = re.compile(r'^(?P<user>.+)@(?P<host>.+):(?P<project>.+)$')
id_match = id_re.match('ruksi@example.com:my-project')

# groupdict() maps each named group to the text it captured.
captured = id_match.groupdict()
expected_groups = dict(
    user='ruksi',
    host='example.com',
    project='my-project',
)
assert captured == expected_groups

Use the re.Scanner class for tokenizers. It is undocumented, but it has existed for ages.

import re
from enum import Enum


class TokenType(Enum):
    """Kinds of tokens that the example tokenizer can emit."""

    # Each member's value is a lowercase, human-readable label.
    INTEGER = 'integer'
    IDENTIFIER = 'identifier'
    PUNCTUATION = 'punctuation'


def _tagged(token_type):
    """Return a Scanner action pairing the matched text with token_type."""
    # Scanner calls each action with (scanner_instance, matched_text).
    return lambda _scanner, text: (token_type, text)


# Lexicon: (pattern, action) pairs handed to the scanner.
scanner = re.Scanner([
    (r'[0-9]+', _tagged(TokenType.INTEGER)),
    (r'[a-z_]+', _tagged(TokenType.IDENTIFIER)),
    (r'[,.]+', _tagged(TokenType.PUNCTUATION)),
    (r'\s+', None),  # a None action drops the matched text
])

# scan() returns the produced tokens plus whatever input it could not consume.
results, remainder = scanner.scan('45 pigeons, 23 cows, 11 spiders.')

expected_tokens = [
    (TokenType.INTEGER, '45'),
    (TokenType.IDENTIFIER, 'pigeons'),
    (TokenType.PUNCTUATION, ','),
    (TokenType.INTEGER, '23'),
    (TokenType.IDENTIFIER, 'cows'),
    (TokenType.PUNCTUATION, ','),
    (TokenType.INTEGER, '11'),
    (TokenType.IDENTIFIER, 'spiders'),
    (TokenType.PUNCTUATION, '.'),
]
assert results == expected_tokens

Sources