gpt4 book ai didi

不使用正则表达式的 C++ 多字符串匹配

转载 作者:行者123 更新时间:2023-11-30 01:44:22 31 4
gpt4 key购买 nike

我正在尝试编辑现有的 C++ 程序以搜索多个字符串而不是单个字符串的文本。我是一个彻头彻尾的 C++ 菜鸟,所以我希望有人能给我指出一个可以工作并且快速 的函数。我尝试使用正则表达式 ( see my earlier post ),这很有效,但是正则表达式对于我的目的来说太慢了。采用以下代码(@WiktorStribiżew 提供):

#include <iostream>
#include <regex>
using namespace std;

int main() {
std::string pattern("A|D"); // Regex expression
std::regex rx(pattern); // Getting the regex object

std::string s("ABCDEABCABD"); // Defining the string input
std::ptrdiff_t number_of_matches = std::distance( // Count the number of matches inside the iterator
std::sregex_iterator(s.begin(), s.end(), rx),
std::sregex_iterator());

std::cout << number_of_matches << std::endl; // Displaying results
return 0;
}

这会输出想要的答案 (5),但对我来说太慢了(搜索 ~100+ GB 文件中的每一行)。我怎样才能让它更快?输入主题字符串 s 是从文件中读取的,查询字符串 pattern 是在命令行上提供的,因此它以字符串形式出现(用竖线分隔成 A|D ,例如)。

最佳答案

正则表达式和性能相互影响。
我重写了您的示例以不使用 Regex,并编译了两个版本(在 OS X 上,使用带有 -O3 -fwhole-program 的 g++ 5.3.0):

#include <iostream>
using namespace std;

#define NUM_PATTERNS 2

int main() {
std::string patterns[] = {std::string("A"), std::string("D")};
std::string str("ABCDEABCABD");
unsigned int count = 0;

for(int i = 0; i < NUM_PATTERNS; ++i)
{
for(int pos = 0; ; )
{
pos = str.find(patterns[i], pos);
if(pos == std::string::npos)
{
break;
}
++pos; // in order to not match again
++count;
}
}

std::cout << count << std::endl; // Displaying results
return 0;
}

生成的程序集有 545 行:

    .cstring
.align 3
LC0:
.ascii "basic_string::_M_construct null not valid\0"
.section __TEXT,__text_cold,regular,pure_instructions
.align 1
LCOLDB1:
.text
LHOTB1:
.align 1,0x90
.align 4,0x90
__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18:
LFB1645:
pushq %r13
LCFI0:
pushq %r12
LCFI1:
leaq 16(%rdi), %r12
pushq %rbp
LCFI2:
pushq %rbx
LCFI3:
subq $24, %rsp
LCFI4:
testq %rsi, %rsi
movq %r12, (%rdi)
je L2
movq %rdi, %rbx
movq %rsi, %rdi
movq %rsi, %r13
call _strlen
cmpq $15, %rax
movq %rax, %rbp
movq %rax, 8(%rsp)
ja L13
cmpq $1, %rax
je L14
testq %rax, %rax
jne L15
L6:
movq 8(%rsp), %rax
movq (%rbx), %rdx
movq %rax, 8(%rbx)
movb $0, (%rdx,%rax)
addq $24, %rsp
LCFI5:
popq %rbx
LCFI6:
popq %rbp
LCFI7:
popq %r12
LCFI8:
popq %r13
LCFI9:
ret
L13:
LCFI10:
leaq 8(%rsp), %rsi
xorl %edx, %edx
movq %rbx, %rdi
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm
movq 8(%rsp), %rdx
movq %rax, (%rbx)
movq %rax, %rdi
movq %rdx, 16(%rbx)
L4:
movq %rbp, %rdx
movq %r13, %rsi
call _memcpy
jmp L6
L14:
movzbl 0(%r13), %eax
movb %al, 16(%rbx)
jmp L6
L2:
leaq LC0(%rip), %rdi
call __ZSt19__throw_logic_errorPKc
L15:
movq %r12, %rdi
jmp L4
LFE1645:
.section __TEXT,__text_cold,regular,pure_instructions
LCOLDE1:
.text
LHOTE1:
.cstring
LC2:
.ascii "A\0"
LC3:
.ascii "D\0"
LC4:
.ascii "ABCDEABCABD\0"
.section __TEXT,__text_cold,regular,pure_instructions
LCOLDB5:
.section __TEXT,__text_startup,regular,pure_instructions
LHOTB5:
.align 4
.globl _main
_main:
LFB1425:
pushq %r15
LCFI11:
leaq LC2(%rip), %rsi
pushq %r14
LCFI12:
pushq %r13
LCFI13:
pushq %r12
LCFI14:
pushq %rbp
LCFI15:
pushq %rbx
LCFI16:
subq $104, %rsp
LCFI17:
leaq 32(%rsp), %rbp
movq %rbp, %rdi
LEHB0:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE0:
leaq 32(%rbp), %rdi
leaq LC3(%rip), %rsi
LEHB1:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE1:
leaq LC4(%rip), %rsi
movq %rsp, %rdi
movq %rsp, %r12
LEHB2:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE2:
leaq 8(%rbp), %r14
xorl %r15d, %r15d
xorl %ebx, %ebx
L22:
leaq (%r15,%r14), %r13
xorl %eax, %eax
jmp L21
.align 4
L44:
addl $1, %eax
addl $1, %ebx
L21:
movq 0(%rbp,%r15), %rsi
movslq %eax, %rdx
movq %r12, %rdi
movq 0(%r13), %rcx
call __ZNKSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE4findEPKcmm
cmpl $-1, %eax
jne L44
addq $32, %r15
cmpq $64, %r15
jne L22
movq __ZSt4cout@GOTPCREL(%rip), %rdi
movl %ebx, %esi
LEHB3:
call __ZNSo9_M_insertImEERSoT_
movq %rax, %rdi
call __ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
LEHE3:
movq (%rsp), %rdi
addq $16, %r12
cmpq %r12, %rdi
je L23
call __ZdlPv
L23:
movq 64(%rsp), %rdi
leaq 48(%rbp), %rax
cmpq %rax, %rdi
je L24
call __ZdlPv
L24:
movq 32(%rsp), %rdi
addq $16, %rbp
cmpq %rbp, %rdi
je L39
call __ZdlPv
L39:
addq $104, %rsp
LCFI18:
xorl %eax, %eax
popq %rbx
LCFI19:
popq %rbp
LCFI20:
popq %r12
LCFI21:
popq %r13
LCFI22:
popq %r14
LCFI23:
popq %r15
LCFI24:
ret
L38:
LCFI25:
movq %rax, %r12
movl $1, %eax
L19:
movl $1, %ebx
subq %rax, %rbx
salq $5, %rbx
addq %rbp, %rbx
L29:
cmpq %rbp, %rbx
je L27
subq $32, %rbx
movq (%rbx), %rdi
leaq 16(%rbx), %rax
cmpq %rax, %rdi
je L29
call __ZdlPv
jmp L29
L36:
movq (%rsp), %rdi
addq $16, %r12
movq %rax, %rbx
cmpq %r12, %rdi
je L32
call __ZdlPv
L32:
movq 64(%rsp), %rdi
leaq 48(%rbp), %rax
cmpq %rax, %rdi
je L33
call __ZdlPv
L33:
movq 32(%rsp), %rdi
addq $16, %rbp
cmpq %rbp, %rdi
je L34
call __ZdlPv
L34:
movq %rbx, %rdi
LEHB4:
call __Unwind_Resume
L27:
movq %r12, %rdi
call __Unwind_Resume
LEHE4:
L37:
movq %rax, %rbx
jmp L32
L35:
movq %rax, %r12
xorl %eax, %eax
jmp L19
LFE1425:
.section __DATA,__gcc_except_tab
GCC_except_table0:
LLSDA1425:
.byte 0xff
.byte 0xff
.byte 0x3
.byte 0x41
.set L$set$0,LEHB0-LFB1425
.long L$set$0
.set L$set$1,LEHE0-LEHB0
.long L$set$1
.set L$set$2,L38-LFB1425
.long L$set$2
.byte 0
.set L$set$3,LEHB1-LFB1425
.long L$set$3
.set L$set$4,LEHE1-LEHB1
.long L$set$4
.set L$set$5,L35-LFB1425
.long L$set$5
.byte 0
.set L$set$6,LEHB2-LFB1425
.long L$set$6
.set L$set$7,LEHE2-LEHB2
.long L$set$7
.set L$set$8,L37-LFB1425
.long L$set$8
.byte 0
.set L$set$9,LEHB3-LFB1425
.long L$set$9
.set L$set$10,LEHE3-LEHB3
.long L$set$10
.set L$set$11,L36-LFB1425
.long L$set$11
.byte 0
.set L$set$12,LEHB4-LFB1425
.long L$set$12
.set L$set$13,LEHE4-LEHB4
.long L$set$13
.long 0
.byte 0
.section __TEXT,__text_startup,regular,pure_instructions
.section __TEXT,__text_cold,regular,pure_instructions
LCOLDE5:
.section __TEXT,__text_startup,regular,pure_instructions
LHOTE5:
.section __TEXT,__text_cold,regular,pure_instructions
LCOLDB6:
.section __TEXT,__text_startup,regular,pure_instructions
LHOTB6:
.align 4
__GLOBAL__sub_I_test1.cpp:
LFB1626:
leaq __ZStL8__ioinit(%rip), %rdi
subq $8, %rsp
LCFI26:
call __ZNSt8ios_base4InitC1Ev
movq __ZNSt8ios_base4InitD1Ev@GOTPCREL(%rip), %rdi
leaq ___dso_handle(%rip), %rdx
addq $8, %rsp
LCFI27:
leaq __ZStL8__ioinit(%rip), %rsi
jmp ___cxa_atexit
LFE1626:
.section __TEXT,__text_cold,regular,pure_instructions
LCOLDE6:
.section __TEXT,__text_startup,regular,pure_instructions
LHOTE6:
.static_data
__ZStL8__ioinit:
.space 1
.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame1:
.set L$set$14,LECIE1-LSCIE1
.long L$set$14
LSCIE1:
.long 0
.byte 0x1
.ascii "zPLR\0"
.byte 0x1
.byte 0x78
.byte 0x10
.byte 0x7
.byte 0x9b
.long ___gxx_personality_v0+4@GOTPCREL
.byte 0x10
.byte 0x10
.byte 0xc
.byte 0x7
.byte 0x8
.byte 0x90
.byte 0x1
.align 3
LECIE1:
LSFDE1:
.set L$set$15,LEFDE1-LASFDE1
.long L$set$15
LASFDE1:
.long LASFDE1-EH_frame1
.quad LFB1645-.
.set L$set$16,LFE1645-LFB1645
.quad L$set$16
.byte 0x8
.quad 0
.byte 0x4
.set L$set$17,LCFI0-LFB1645
.long L$set$17
.byte 0xe
.byte 0x10
.byte 0x8d
.byte 0x2
.byte 0x4
.set L$set$18,LCFI1-LCFI0
.long L$set$18
.byte 0xe
.byte 0x18
.byte 0x8c
.byte 0x3
.byte 0x4
.set L$set$19,LCFI2-LCFI1
.long L$set$19
.byte 0xe
.byte 0x20
.byte 0x86
.byte 0x4
.byte 0x4
.set L$set$20,LCFI3-LCFI2
.long L$set$20
.byte 0xe
.byte 0x28
.byte 0x83
.byte 0x5
.byte 0x4
.set L$set$21,LCFI4-LCFI3
.long L$set$21
.byte 0xe
.byte 0x40
.byte 0x4
.set L$set$22,LCFI5-LCFI4
.long L$set$22
.byte 0xa
.byte 0xe
.byte 0x28
.byte 0x4
.set L$set$23,LCFI6-LCFI5
.long L$set$23
.byte 0xe
.byte 0x20
.byte 0x4
.set L$set$24,LCFI7-LCFI6
.long L$set$24
.byte 0xe
.byte 0x18
.byte 0x4
.set L$set$25,LCFI8-LCFI7
.long L$set$25
.byte 0xe
.byte 0x10
.byte 0x4
.set L$set$26,LCFI9-LCFI8
.long L$set$26
.byte 0xe
.byte 0x8
.byte 0x4
.set L$set$27,LCFI10-LCFI9
.long L$set$27
.byte 0xb
.align 3
LEFDE1:
LSFDE3:
.set L$set$28,LEFDE3-LASFDE3
.long L$set$28
LASFDE3:
.long LASFDE3-EH_frame1
.quad LFB1425-.
.set L$set$29,LFE1425-LFB1425
.quad L$set$29
.byte 0x8
.quad LLSDA1425-.
.byte 0x4
.set L$set$30,LCFI11-LFB1425
.long L$set$30
.byte 0xe
.byte 0x10
.byte 0x8f
.byte 0x2
.byte 0x4
.set L$set$31,LCFI12-LCFI11
.long L$set$31
.byte 0xe
.byte 0x18
.byte 0x8e
.byte 0x3
.byte 0x4
.set L$set$32,LCFI13-LCFI12
.long L$set$32
.byte 0xe
.byte 0x20
.byte 0x8d
.byte 0x4
.byte 0x4
.set L$set$33,LCFI14-LCFI13
.long L$set$33
.byte 0xe
.byte 0x28
.byte 0x8c
.byte 0x5
.byte 0x4
.set L$set$34,LCFI15-LCFI14
.long L$set$34
.byte 0xe
.byte 0x30
.byte 0x86
.byte 0x6
.byte 0x4
.set L$set$35,LCFI16-LCFI15
.long L$set$35
.byte 0xe
.byte 0x38
.byte 0x83
.byte 0x7
.byte 0x4
.set L$set$36,LCFI17-LCFI16
.long L$set$36
.byte 0xe
.byte 0xa0,0x1
.byte 0x4
.set L$set$37,LCFI18-LCFI17
.long L$set$37
.byte 0xa
.byte 0xe
.byte 0x38
.byte 0x4
.set L$set$38,LCFI19-LCFI18
.long L$set$38
.byte 0xe
.byte 0x30
.byte 0x4
.set L$set$39,LCFI20-LCFI19
.long L$set$39
.byte 0xe
.byte 0x28
.byte 0x4
.set L$set$40,LCFI21-LCFI20
.long L$set$40
.byte 0xe
.byte 0x20
.byte 0x4
.set L$set$41,LCFI22-LCFI21
.long L$set$41
.byte 0xe
.byte 0x18
.byte 0x4
.set L$set$42,LCFI23-LCFI22
.long L$set$42
.byte 0xe
.byte 0x10
.byte 0x4
.set L$set$43,LCFI24-LCFI23
.long L$set$43
.byte 0xe
.byte 0x8
.byte 0x4
.set L$set$44,LCFI25-LCFI24
.long L$set$44
.byte 0xb
.align 3
LEFDE3:
LSFDE5:
.set L$set$45,LEFDE5-LASFDE5
.long L$set$45
LASFDE5:
.long LASFDE5-EH_frame1
.quad LFB1626-.
.set L$set$46,LFE1626-LFB1626
.quad L$set$46
.byte 0x8
.quad 0
.byte 0x4
.set L$set$47,LCFI26-LFB1626
.long L$set$47
.byte 0xe
.byte 0x10
.byte 0x4
.set L$set$48,LCFI27-LCFI26
.long L$set$48
.byte 0xe
.byte 0x8
.align 3
LEFDE5:
.mod_init_func
.align 3
.quad __GLOBAL__sub_I_test1.cpp
.constructor
.destructor
.align 1
.subsections_via_symbols

您的示例中的程序集有 31'559 行,尝试将其发布到此处或 Pastebin 时我的浏览器选项卡崩溃了。

不过,我还写了一个C版本的上述程序:

#include <stdio.h>
#include <string.h>

#define NUM_PATTERNS 2

int main()
{
const char *patterns[] = {"A", "D"};
char *str = "ABCDEABCABD";
unsigned int count = 0;

for(int i = 0; i < NUM_PATTERNS; ++i)
{
for(char *s = str; ; )
{
s = strstr(s, patterns[i]);
if(s == NULL)
{
break;
}
++s; // in order to not match again
++count;
}
}
printf("%u\n", count);
return 0;
}

编译后只有 95 行汇编代码:

    .cstring
LC0:
.ascii "ABCDEABCABD\0"
LC1:
.ascii "%u\12\0"
.section __TEXT,__text_cold,regular,pure_instructions
LCOLDB2:
.section __TEXT,__text_startup,regular,pure_instructions
LHOTB2:
.align 4
.globl _main
_main:
LFB1:
pushq %rbx
LCFI0:
leaq LC0(%rip), %rdi
xorl %ebx, %ebx
jmp L3
.align 4
L8:
leaq 1(%rax), %rdi
addl $1, %ebx
L3:
movl $65, %esi
call _strchr
testq %rax, %rax
jne L8
leaq LC0(%rip), %rdi
jmp L2
.align 4
L9:
leaq 1(%rax), %rdi
addl $1, %ebx
L2:
movl $68, %esi
call _strchr
testq %rax, %rax
jne L9
leaq LC1(%rip), %rdi
movl %ebx, %esi
xorl %eax, %eax
call _printf
xorl %eax, %eax
popq %rbx
LCFI1:
ret
LFE1:
.section __TEXT,__text_cold,regular,pure_instructions
LCOLDE2:
.section __TEXT,__text_startup,regular,pure_instructions
LHOTE2:
.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame1:
.set L$set$0,LECIE1-LSCIE1
.long L$set$0
LSCIE1:
.long 0
.byte 0x1
.ascii "zR\0"
.byte 0x1
.byte 0x78
.byte 0x10
.byte 0x1
.byte 0x10
.byte 0xc
.byte 0x7
.byte 0x8
.byte 0x90
.byte 0x1
.align 3
LECIE1:
LSFDE1:
.set L$set$1,LEFDE1-LASFDE1
.long L$set$1
LASFDE1:
.long LASFDE1-EH_frame1
.quad LFB1-.
.set L$set$2,LFE1-LFB1
.quad L$set$2
.byte 0
.byte 0x4
.set L$set$3,LCFI0-LFB1
.long L$set$3
.byte 0xe
.byte 0x10
.byte 0x83
.byte 0x2
.byte 0x4
.set L$set$4,LCFI1-LCFI0
.long L$set$4
.byte 0xe
.byte 0x8
.align 3
LEFDE1:
.subsections_via_symbols

我建议您至少将那部分代码转换为 C。如果您有周围的 C++ 代码库,嵌入它应该不是什么大问题。

无论哪种方式,您都可以采用上述示例之一,更改 str要从文件中读取,请更改 patterns根据需要排列尽可能多的模式(确保将 NUM_PATTERNS 设置为相同的数量!),如果这仍然不足以提高性能,请将循环实现为跨多个线程拆分(这个话题太宽泛了尽管在这个答案中涵盖了 C 和 C++)。

当然你也可以将命令行参数解析成一个模式数组:

#include <stdlib.h>
#include <string.h>

int numPatterns(char *str)
{
int num = 1;
for(int i = 0, len = strlen(str); i < len; ++i) // loop through all characters of the string
{
if(str[i] == '|') // increase the counter each time we hit a |
{
++num;
}
}
return num;
}

void splitPatterns(char *str, int num, char** dst)
{
char *raw = (char*)(dst + num * sizeof(char*)); // this is the region after the pointers
strcpy(raw, str); // first copy the string (note, this function is dangerous if used on anything other than strings! (google buffer overflow))
dst[0] = raw;
for(int i = 0, len = strlen(str), n = 1; i < len; ++i)
{
if(raw[i] == '|')
{
raw[i] = '\0'; // replace all | by '\0' in order to split
dst[n++] = &raw[i + 1]; // and let the next string start at the next character
}
}
}

// let's say your program is called as ./program 'input_file' 'list|of|patterns'
int main(int argc, char **argv)
{
/* check that argc >= 3 and blah */

int num = numPatterns(argv[2]);
// In order to malloc() and free() only once, I'm stashing things together here.
// First comes an array of num pointers, hence num * sizeof(char*), followed by
// the strings which take up the same space as argv[2] (the | just gets
// replaced by \0), which equals strlen(argv[2]) + 1 for the terminating null byte.
char **patterns = (char**)malloc(num * sizeof(char*) + strlen(argv[2]) + 1);
splitPatterns(argv[2], num, patterns);

/* work with patterns here */

free(patterns);
}

如果您将此处的 C 示例编译为 C++,请确保更改所有来自 <*.h> 的包含指令至 <c*> ,即 <stdio.h><cstdio>

关于不使用正则表达式的 C++ 多字符串匹配,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36360005/

31 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com