
python - Efficiently matching multiple regular expressions in Python


A lexer is easy to write when you have regular expressions. Today I wanted to write a simple, generic tokenizer in Python, and came up with:

import re
import sys

class Token(object):
    """ A simple Token structure.
        Contains the token type, value and position.
    """
    def __init__(self, type, val, pos):
        self.type = type
        self.val = val
        self.pos = pos

    def __str__(self):
        return '%s(%s) at %s' % (self.type, self.val, self.pos)


class LexerError(Exception):
    """ Lexer error exception.

        pos:
            Position in the input line where the error occurred.
    """
    def __init__(self, pos):
        self.pos = pos


class Lexer(object):
    """ A simple regex-based lexer/tokenizer.

        See below for an example of usage.
    """
    def __init__(self, rules, skip_whitespace=True):
        r""" Create a lexer.

            rules:
                A list of rules. Each rule is a `regex, type`
                pair, where `regex` is the regular expression used
                to recognize the token and `type` is the type
                of the token to return when it's recognized.

            skip_whitespace:
                If True, whitespace (\s+) will be skipped and not
                reported by the lexer. Otherwise, you have to
                specify your rules for whitespace, or it will be
                flagged as an error.
        """
        self.rules = []

        for regex, type in rules:
            self.rules.append((re.compile(regex), type))

        self.skip_whitespace = skip_whitespace
        self.re_ws_skip = re.compile(r'\S')

    def input(self, buf):
        """ Initialize the lexer with a buffer as input.
        """
        self.buf = buf
        self.pos = 0

    def token(self):
        """ Return the next token (a Token object) found in the
            input buffer. None is returned if the end of the
            buffer was reached.
            In case of a lexing error (the current chunk of the
            buffer matches no rule), a LexerError is raised with
            the position of the error.
        """
        if self.pos >= len(self.buf):
            return None
        else:
            if self.skip_whitespace:
                m = self.re_ws_skip.search(self.buf[self.pos:])

                if m:
                    self.pos += m.start()
                else:
                    return None

            for token_regex, token_type in self.rules:
                m = token_regex.match(self.buf[self.pos:])

                if m:
                    value = self.buf[self.pos + m.start():self.pos + m.end()]
                    tok = Token(token_type, value, self.pos)
                    self.pos += m.end()
                    return tok

            # if we're here, no rule matched
            raise LexerError(self.pos)

    def tokens(self):
        """ Returns an iterator to the tokens found in the buffer.
        """
        while 1:
            tok = self.token()
            if tok is None: break
            yield tok


if __name__ == '__main__':
    rules = [
        (r'\d+', 'NUMBER'),
        (r'[a-zA-Z_]\w+', 'IDENTIFIER'),
        (r'\+', 'PLUS'),
        (r'-', 'MINUS'),
        (r'\*', 'MULTIPLY'),
        (r'/', 'DIVIDE'),
        (r'\(', 'LP'),
        (r'\)', 'RP'),
        (r'=', 'EQUALS'),
    ]

    lx = Lexer(rules, skip_whitespace=True)
    lx.input('erw = _abc + 12*(R4-623902) ')

    try:
        for tok in lx.tokens():
            print(tok)
    except LexerError as err:
        print('LexerError at position', err.pos)

It works quite well, but I'm a bit worried that it is too inefficient. Are there any regex tricks that would let me write it in a more efficient/elegant way?

Specifically, is there a way to avoid looping linearly over all the regex rules to find the one that matches?

Best Answer

I suggest using the re.Scanner class. It is not documented in the standard library, but it is well worth using. Here is an example:

import re

scanner = re.Scanner([
    (r"-?[0-9]+\.[0-9]+([eE]-?[0-9]+)?", lambda scanner, token: float(token)),
    (r"-?[0-9]+", lambda scanner, token: int(token)),
    (r" +", lambda scanner, token: None),
])

>>> scanner.scan("0 -1 4.5 7.8e3")[0]
[0, -1, 4.5, 7800.0]
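
For what it's worth, re.Scanner effectively combines all of the patterns into one compiled regular expression, so each position in the input is matched in a single pass instead of by trying every rule in turn. The same idea can be written out by hand with named groups. The sketch below is only an illustration of that technique (it is not part of the original answer); it uses a rule list modelled on the one from the question and a hypothetical tokenize() helper:

import re

# Rule list modelled on the question's rules, as (name, regex) pairs.
rules = [
    ('NUMBER',     r'\d+'),
    ('IDENTIFIER', r'[a-zA-Z_]\w+'),
    ('PLUS',       r'\+'),
    ('MINUS',      r'-'),
    ('MULTIPLY',   r'\*'),
    ('DIVIDE',     r'/'),
    ('LP',         r'\('),
    ('RP',         r'\)'),
    ('EQUALS',     r'='),
    ('SKIP',       r'\s+'),   # whitespace rule, dropped from the output below
]

# Combine every rule into one alternation; each alternative becomes a named
# group, so a single match() call identifies both the lexeme and its type.
master_pattern = re.compile('|'.join(
    '(?P<%s>%s)' % (name, regex) for name, regex in rules))

def tokenize(text):
    pos = 0
    while pos < len(text):
        m = master_pattern.match(text, pos)
        if not m:
            raise ValueError('lexing error at position %d' % pos)
        pos = m.end()
        if m.lastgroup != 'SKIP':
            yield (m.lastgroup, m.group())

print(list(tokenize('erw = _abc + 12*(R4-623902) ')))

Note that Python's re module tries the alternatives from left to right rather than picking the longest match, so the order of the rules in the combined pattern still matters.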

Regarding python - Efficiently matching multiple regular expressions in Python, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/133886/
