gpt4 book ai didi

java - .NET 与 Java 中的正则表达式性能

转载 作者:行者123 更新时间:2023-11-29 03:51:18 25 4
gpt4 key购买 nike

我在解析大型文本文件时遇到正则表达式性能问题。
我使用的是 .NET 4.0，代码如下：

// Regular expression intended to match dates written in several layouts.
// It is an alternation of twelve branches:
//   * four branches with two equal separators (one of '-', ' ', '/', '.', '\'),
//     enforced by a backreference to the separator group (\4, \11, \18, \25);
//   * four branches with no separators at all;
//   * four branches with only two numeric components and one separator,
//     guarded by a negative lookbehind so they do not start inside a longer number.
// The pattern is built for RegexOptions.IgnorePatternWhitespace: the bare space
// before (\D|$) in the first two branches is ignored in that mode, while spaces
// inside character classes stay literal.
private static readonly string pattern =
@"((\D|^)(19|20|)\d\d([- /.\\])(0[1-9]|1[012]|[1-9])\4(0[1-9]|[12][0-9]|3[01]|[0-9]) (\D|$))|" +
@"((\D|^)(19|20|)\d\d([- /.\\])(0[1-9]|[12][0-9]|3[01]|[0-9])\11(0[1-9]|1[012]|[0-9]) (\D|$))|" +
@"((\D|^)(0[1-9]|1[012]|[0-9])([- /.\\])(0[1-9]|[12][0-9]|3[01]|[0-9])\18(19|20|)\d\d(\D|$))|" +
@"((\D|^)(0[1-9]|[12][0-9]|3[01]|[0-9])([- /.\\])(0[1-9]|1[012]|[0-9])\25(19|20|)\d\d(\D|$))|" +
@"((\D|^)(19|20|)\d\d(0[1-9]|1[012])(0[1-9]|[12][0-9]|3[01])(\D|$))|" +
@"((\D|^)(19|20|)\d\d(0[1-9]|[12][0-9]|3[01])(0[1-9]|1[012])(\D|$))|" +
@"((\D|^)(0[1-9]|1[012])(0[1-9]|[12][0-9]|3[01])(19|20|)\d\d(\D|$))|" +
@"((\D|^)(0[1-9]|[12][0-9]|3[01])(0[1-9]|1[012])(19|20|)\d\d(\D|$))|" +
@"((^|(?<!(\d[- /.\\\d])|\d))(19|20|)\d\d([- /.\\])(0[1-9]|1[012]|[1-9])([^- /.\\\d\w]|$|\s))|" +
@"((^|(?<!(\d[- /.\\\d])|\d))(0[1-9]|1[012]|[1-9])([- /.\\])(19|20|)\d\d([^- /.\\\d\w]|$|\s))|" +
@"((^|(?<!(\d[- /.\\\d])|\d))(0[1-9]|1[012]|[1-9])([- /.\\])(0[1-9]|[12][0-9]|3[01])([^- /.\\\d\w]|$|\s))|" +
@"((^|(?<!(\d[- /.\\\d])|\d))(0[1-9]|[12][0-9]|3[01])([- /.\\])(0[1-9]|1[012]|[1-9])([^- /.\\\d\w]|$|\s))";

// Regex instance built once from `pattern` and shared by every call.
// IgnorePatternWhitespace is required because the pattern contains a bare
// space that is only meant as layout; Multiline lets ^ and $ match at line breaks.
private static readonly Regex dateRegex = new Regex(pattern,
    RegexOptions.Compiled | RegexOptions.IgnoreCase |
    RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);

/// <summary>
/// Runs <c>dateRegex</c> over <paramref name="text"/> and writes one line
/// ("Match ...") to the console for every match found.
/// </summary>
/// <param name="text">The input text to scan.</param>
public static void Extract(string text)
{
    foreach (Match match in dateRegex.Matches(text))
    {
        Console.WriteLine("Match {0}", match.Value);
    }
}

包含 200 个匹配项的 1MB 文本文件的处理时间约为 22 秒。
使用 Java 运行相同的正则表达式会产生更快的结果:~13 秒。
我设法通过将正则表达式拆分为多个部分并并行执行来减少 .NET 代码的处理时间。
为什么 Java 处理这个正则表达式要快得多?
我可以做些什么来提高处理此正则表达式的 .NET 性能?

干杯,
多伦

最佳答案

我现在没有时间完成全部分析，先给出目前的进展。下面是重新排版后的正则表达式，这样您才能真正读懂它。我所做的唯一改动，是把其中的一些空格放进字符类（即写成 [ ]），以便可以使用自由间距（忽略空白）模式。整个表达式共有 80 个捕获组（Yipes！——其中大部分似乎都是不必要的）。这个表达式似乎用于匹配各种形式的日期，有很多可以改进的地方：

// Free-spacing (RegexOptions.IgnorePatternWhitespace) layout of the date pattern:
// one alternative per parenthesised block, with the capture-group number noted
// in the in-pattern '#' comments ($1 .. $80).
// NOTE(review): "[ ][ ]" in the first two alternatives requires two literal
// spaces before the trailing (\D|$); in the single-line form the bare space is
// ignored under IgnorePatternWhitespace — confirm which behaviour is intended.
private static readonly string pattern = @"
# Match various forms of a Date.
( # Begin $1:
(\D|^) # $2:
(19|20|)\d\d # $3:
([- /.\\]) # $4:
(0[1-9]|1[012]|[1-9]) # $5:
\4
(0[1-9]|[12][0-9]|3[01]|[0-9]) # $6:
[ ][ ]
(\D|$) # $7:
) # End $1:
| ( # Begin $8:
(\D|^) # $9:
(19|20|)\d\d # $10:
([- /.\\]) # $11:
(0[1-9]|[12][0-9]|3[01]|[0-9]) # $12:
\11
(0[1-9]|1[012]|[0-9]) # $13:
[ ][ ]
(\D|$) # $14:
) # End $8:
| ( # Begin $15:
(\D|^) # $16:
(0[1-9]|1[012]|[0-9]) # $17:
([- /.\\]) # $18:
(0[1-9]|[12][0-9]|3[01]|[0-9]) # $19:
\18
(19|20|)\d\d # $20:
(\D|$) # $21:
) # End $15:
| ( # Begin $22:
(\D|^) # $23:
(0[1-9]|[12][0-9]|3[01]|[0-9]) # $24:
([- /.\\]) # $25:
(0[1-9]|1[012]|[0-9]) # $26:
\25
(19|20|)\d\d # $27:
(\D|$) # $28:
) # End $22:
| ( # Begin $29:
(\D|^) # $30:
(19|20|)\d\d # $31:
(0[1-9]|1[012]) # $32:
(0[1-9]|[12][0-9]|3[01]) # $33:
(\D|$) # $34:
) # End $29:
| ( # Begin $35:
(\D|^) # $36:
(19|20|)\d\d # $37:
(0[1-9]|[12][0-9]|3[01]) # $38:
(0[1-9]|1[012]) # $39:
(\D|$) # $40:
) # End $35:
| ( # Begin $41:
(\D|^) # $42:
(0[1-9]|1[012]) # $43:
(0[1-9]|[12][0-9]|3[01]) # $44:
(19|20|)\d\d # $45:
(\D|$) # $46:
) # End $41:
| ( # Begin $47:
(\D|^) # $48:
(0[1-9]|[12][0-9]|3[01]) # $49:
(0[1-9]|1[012]) # $50:
(19|20|)\d\d # $51:
(\D|$) # $52:
) # End $47:
| ( # Begin $53:
( ^ # Begin $54:
| (?<!
(\d[- /.\\\d]) # $55:
| \d
)
) # End $54:
(19|20|)\d\d # $56:
([- /.\\]) # $57:
(0[1-9]|1[012]|[1-9]) # $58:
([^- /.\\\d\w]|$|\s) # $59:
) # End $53:
| ( # Begin $60:
( ^ # Begin $61:
| (?<!
(\d[- /.\\\d]) # $62:
| \d
)
) # End $61:
(0[1-9]|1[012]|[1-9]) # $63:
([- /.\\]) # $64:
(19|20|)\d\d # $65:
([^- /.\\\d\w]|$|\s) # $66:
) # End $60:
| ( # Begin $67:
( ^ # Begin $68:
| (?<!
(\d[- /.\\\d]) # $69:
|\d
)
) # End $68:
(0[1-9]|1[012]|[1-9]) # $70:
([- /.\\]) # $71:
(0[1-9]|[12][0-9]|3[01]) # $72:
([^- /.\\\d\w]|$|\s)) # $73:
| ( # Begin $74:
( ^ # Begin $75:
| (?<!
(\d[- /.\\\d]) # $76:
| \d
)
) # End $75:
(0[1-9]|[12][0-9]|3[01]) # $77:
([- /.\\]) # $78:
(0[1-9]|1[012]|[1-9]) # $79:
([^- /.\\\d\w]|$|\s) # $80:
) # End $74:
";

当我有更多时间时,我会用一些建议的改进来更新这个答案。与此同时,其他正则表达式专家请随时使用这个改进的部分注释版本并运行它......

关于java - .NET 与 Java 中的正则表达式性能,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/8669896/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com