🐍 Python - Regex
Updated at 2019-01-01 22:34
Python has powerful regular expression tools in its standard library, the re module.
match() and search() are different: match() checks whether the pattern matches at the beginning of the input, while search() checks whether the input contains the pattern anywhere.
import re
bar_re = re.compile(r'bar')
assert bar_re.match('foobar') is None            # 'foobar' does not start with 'bar'
assert bar_re.search('foobar').span() == (3, 6)  # but it does contain 'bar' at index 3
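For example, search() returns None when the pattern is not found anywhere, and the match object's group() returns the matched text itself:
import re
bar_re = re.compile(r'bar')
assert bar_re.search('bazqux') is None           # pattern not found anywhere
assert bar_re.search('foobar').group() == 'bar'  # the matched text itself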
match() also takes an optional position argument that sets the index where matching starts.
import re
bar_re = re.compile(r'bar')
assert bar_re.match('foobar') is None              # anchored at index 0 by default
assert bar_re.match('foobar', 3).span() == (3, 6)  # anchored at index 3 instead
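Note that the position argument only moves the anchor; match() still requires the pattern to start exactly at that index, it does not search forward from it:
import re
bar_re = re.compile(r'bar')
assert bar_re.match('foobar', 2) is None      # 'bar' does not start at index 2
assert bar_re.match('foobar', 3) is not None  # but it does start at index 3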
split() is also sometimes useful; it splits the input wherever the pattern matches.
import re
separator_re = re.compile(r'[,;\s]+')
result = separator_re.split('Ruksi,; Laine')
assert result == ['Ruksi', 'Laine']
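split() also accepts an optional maxsplit argument to limit the number of splits; a small example continuing with the same separator pattern:
import re
separator_re = re.compile(r'[,;\s]+')
result = separator_re.split('Ruksi,; Laine,; Finland', maxsplit=1)
assert result == ['Ruksi', 'Laine,; Finland']  # only the first separator is split on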
The re.I flag makes matching case-insensitive.
import re
letters_re = re.compile(r'[a-z]+', re.I)
assert letters_re.search('465 ThiSMatchjes').span() == (4, 16)
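The same can be written as the inline flag (?i) inside the pattern itself, if you prefer to keep the flag next to the expression:
import re
letters_re = re.compile(r'(?i)[a-z]+')  # (?i) at the start equals passing re.I
assert letters_re.search('465 ThiSMatchjes').span() == (4, 16)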
Groups are an essential concept; named groups make it easy to extract parts of a match.
import re
id_re = re.compile(r'^(?P<user>.+)@(?P<host>.+):(?P<project>.+)$')
id_match = id_re.match('ruksi@example.com:my-project')
assert id_match.groupdict() == {
'user': 'ruksi',
'host': 'example.com',
'project': 'my-project',
}
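Individual groups can also be read by name or by number with group(), and all at once with groups(); continuing from id_match above:
assert id_match.group('user') == 'ruksi'
assert id_match.group(1) == 'ruksi'  # groups are also numbered, starting from 1
assert id_match.groups() == ('ruksi', 'example.com', 'my-project')
assert id_match.group(0) == 'ruksi@example.com:my-project'  # group 0 is the whole match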
Use the Scanner class to build simple tokenizers. It is undocumented, but it has existed in the re module for ages.
import re
from enum import Enum
class TokenType(Enum):
    INTEGER = 'integer'
    IDENTIFIER = 'identifier'
    PUNCTUATION = 'punctuation'
scanner = re.Scanner([
    (r'[0-9]+', lambda scanner, token: (TokenType.INTEGER, token)),
    (r'[a-z_]+', lambda scanner, token: (TokenType.IDENTIFIER, token)),
    (r'[,.]+', lambda scanner, token: (TokenType.PUNCTUATION, token)),
    (r'\s+', None),  # None skips token
])
results, remainder = scanner.scan('45 pigeons, 23 cows, 11 spiders.')
assert results == [
    (TokenType.INTEGER, '45'),
    (TokenType.IDENTIFIER, 'pigeons'),
    (TokenType.PUNCTUATION, ','),
    (TokenType.INTEGER, '23'),
    (TokenType.IDENTIFIER, 'cows'),
    (TokenType.PUNCTUATION, ','),
    (TokenType.INTEGER, '11'),
    (TokenType.IDENTIFIER, 'spiders'),
    (TokenType.PUNCTUATION, '.'),
]
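In the example above, remainder is an empty string. If the input contains text that none of the lexicon patterns match, scanning stops there and the rest of the string comes back as the remainder (at least with the current CPython implementation); reusing the scanner and TokenType defined above:
results, remainder = scanner.scan('45 pigeons & 23 cows')
assert results == [
    (TokenType.INTEGER, '45'),
    (TokenType.IDENTIFIER, 'pigeons'),
]
assert remainder == '& 23 cows'  # scanning stopped at the unrecognized '&'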