gpt4 book ai didi

c - 实现音节化算法但真的很慢

转载 作者:塔克拉玛干 更新时间:2023-11-03 05:51:38 24 4
gpt4 key购买 nike

我根据改进的 Lansky 算法实现了简单的音节化算法,但是当我需要在超过 200 万个单词的语料库上运行这个算法时,它真的很慢。有人可以指出导致它如此缓慢的原因吗?算法如下:

  1. 最后一个元音(元音组)之后的所有内容都属于最后一个音节

  2. 第一个元音(元音组)之前的所有内容都属于第一个音节

  3. 如果元音之间的辅音个数为偶数(2n),则将它们平分为两半:前半部分属于左边的元音,后半部分属于右边的元音(n/n)。

  4. 如果元音之间的辅音个数是奇数(2n + 1),则将它们分成 n 个和 n + 1 个两部分(n / n + 1)。

  5. 如果元音之间只有一个辅音,则属于左元音。

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define VOWELS "aeiou"

    /*
     * Count the leading run of non-vowel characters in `word`.
     *
     * Examines at most `length` characters starting at `word` and stops at
     * the first one that strchr() finds in VOWELS.
     *
     * Returns the number of characters skipped before stopping: `length`
     * when no such character is met, 0 when `length` is not positive.
     */
    int get_n_consonant_between(char *word, int length) {
        int skipped;

        for (skipped = 0; skipped < length; skipped++) {
            if (strchr(VOWELS, word[skipped]) != NULL) {
                break;
            }
        }

        return skipped;
    }

    /*
     * Split `word` into syllables and print them to stdout.
     *
     * word           - NUL-terminated string to split; it is only read.
     * n_vowel_groups - number of syllables to emit (the caller in this file
     *                  passes the result of count_vowel_groups()). When it is
     *                  less than 2 the word is reported as unsplittable.
     *
     * Every character that strchr() does not find in VOWELS is treated as a
     * consonant. When a vowel group is followed by consonants, a single
     * consonant stays with the syllable on the left; a run of k > 1
     * consonants gives k / 2 (integer division) to the left and leaves the
     * rest for the next syllable.
     *
     * Output: "syllable K: <text>" for each syllable except the last, and
     * "syllable done K: <rest of word>" for syllable number n_vowel_groups.
     *
     * Syllables are printed straight out of `word` with a bounded "%.*s"
     * conversion, so no scratch buffer is needed. The previous version copied
     * each syllable into a variable-length array of `length` bytes, which is
     * undefined for an empty word and could be left without a terminating
     * NUL.
     */
    void syllabification(char *word, int n_vowel_groups) {
        if (n_vowel_groups < 2) {
            printf("CAN'T BE SPLIT INTO SYLLABLES\n\n");
            return;
        }

        const int length = (int)strlen(word);
        const char *syllable = word; /* start of the syllable being built */
        int syl_length = 0;          /* characters collected for it so far */
        int syllables = 0;           /* syllables finished so far */
        int in_vowel_group = 0;      /* previous character was a vowel */
        int i = 0;

        while (i < length) {
            if (strchr(VOWELS, word[i])) {
                syl_length++;
                i++;
                in_vowel_group = 1;
                continue;
            }

            if (!in_vowel_group) {
                /* Consonant before the syllable's vowel group: keep it. */
                syl_length++;
                i++;
                continue;
            }

            /* First consonant after a vowel group: decide where to cut. */
            int consonants = get_n_consonant_between(word + i, length - i);
            int taken = (consonants == 1) ? 1 : consonants / 2;

            syl_length += taken;
            i += taken;
            syllables++;

            if (syllables == n_vowel_groups) {
                /* Last syllable: everything that is left belongs to it. */
                printf("syllable done %d: %s\n", syllables, syllable);
                break;
            }
            printf("syllable %d: %.*s\n", syllables, syl_length, syllable);

            syllable = word + i;
            syl_length = 0;
            in_vowel_group = 0;
        }
    }

    /*
     * Count the vowel groups in `word` that are followed by a non-vowel.
     *
     * A vowel group is a maximal run of characters that strchr() finds in
     * VOWELS. A group is counted when the first non-vowel after it is seen,
     * so a group that runs up to the terminating NUL is not counted. Lines
     * read with getline() normally end in '\n', which is not in VOWELS and
     * therefore closes a final group.
     *
     * The string is walked once up to its terminator instead of calling
     * strlen() in the loop condition on every iteration.
     *
     * Returns the number of groups counted.
     */
    int count_vowel_groups(char *word) {
        int nvowels = 0;
        int vowel_group = 0;

        for (const char *p = word; *p != '\0'; p++) {
            if (strchr(VOWELS, *p)) {
                vowel_group = 1;
            } else {
                if (vowel_group) {
                    nvowels++;
                }
                vowel_group = 0;
            }
        }

        return nvowels;
    }

    /*
     * Read up to 15 lines from ../syllables.txt and syllabify each of them.
     *
     * For every line read, the line itself is printed, then
     * count_vowel_groups() and syllabification() are run on it (the line
     * still carries the trailing newline that getline() stores).
     *
     * If the file cannot be opened an error is reported on stderr and the
     * function returns. Reading stops early when getline() fails or reaches
     * end of file. The line buffer and the file are released before
     * returning.
     */
    void repl() {
        enum { MAX_LINES = 15 };
        char *line = NULL;
        size_t len = 0;
        FILE *file = fopen("../syllables.txt", "r");

        if (file == NULL) {
            perror("../syllables.txt");
            return;
        }

        for (int i = 0; i < MAX_LINES; i++) {
            if (getline(&line, &len, file) == -1) {
                break; /* end of file or read error */
            }
            printf("\n\n%s\n", line);
            syllabification(line, count_vowel_groups(line));
        }

        free(line);
        fclose(file);
    }

    /* Program entry point: runs the syllabification demo in repl(). */
    int main(int argc, char *argv[]) {
        /* Command-line arguments are not used. */
        (void)argc;
        (void)argv;

        repl();
        return 0;
    }

    `

最佳答案

要验证这个实现是否正确,需要通读大量代码,而且我并不熟悉该算法的术语(例如"元音组"究竟指什么)。我查了一下,谷歌返回了许多针对不同语言的音节化研究论文(我只能看到摘要),所以我无法确定这段代码是否正确。

但我有一些建议可能会使您的代码更快:

  1. 将所有 strlen(word) 调用移出 for 循环的条件表达式:先把长度保存到一个变量中,再在循环条件里使用该变量。也就是把下面的第一段代码改成第二段:

    for (i = 0; i < strlen(word); i++)

    size_t len = strlen(word);
    for(i = 0; i < len; i++)
  2. 不要使用 strchr 检查字符是否为元音。我会改用下面这样的查找表:

    // Lookup table declared as a global variable, indexed by character code.
    // A non-zero entry marks that character as a vowel. Objects with static
    // storage duration start out zero-initialized, so only the vowels need
    // to be set.
    char vowels[256];

    int main(void)
    {
    // Mark the five lower-case vowels once, before the table is used.
    vowels['a'] = 1;
    vowels['e'] = 1;
    vowels['i'] = 1;
    vowels['o'] = 1;
    vowels['u'] = 1;
    ...
    }

    当你想检查一个字符是否是元音时:

    // 0x20 | c maps an ASCII upper-case letter to its lower-case form.
    // The cast to unsigned char keeps the index inside the 256-entry table
    // even where plain char is signed and the byte is >= 0x80.
    if (vowels[0x20 | (unsigned char)word[i]]) {
        syl_length++;
        i++;
        if (vowel_group) continue;
        vowel_group = 1;
    }

第一个建议可能只会带来很小的性能提升,因为编译器相当聪明,很可能已经自行完成了这项优化。第二个建议可能带来更大的性能提升,因为它只是一次查表;而在最坏的情况下,strchr 每次调用都必须遍历整个 "aeiou" 数组。(见注释 1)

我还建议您对代码进行性能分析(profiling)。参见 this 和 this。


注释

1 我写了一个非常粗糙的程序来比较这两条建议。我添加了一些额外的代码,希望编译器不会过于激进地把这些函数优化掉。

#include <stdio.h>
#include <string.h>
#include <time.h>


/*
 * Benchmark case 1: sum the characters of a fixed sentence onto `t`, with
 * strlen() deliberately left in the loop condition.
 *
 * Returns the accumulated value converted to int.
 */
int test1(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    size_t pos = 0;

    while (pos < strlen(text)) {
        t += text[pos];
        pos++;
    }

    return (int)t;
}

/*
 * Benchmark case 2: sum the characters of a fixed sentence onto `t`, with
 * the string length computed once before the loop.
 *
 * Returns the accumulated value converted to int.
 */
int test2(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    const size_t len = strlen(text);
    size_t pos = 0;

    while (pos != len) {
        t += text[pos];
        pos++;
    }

    return (int)t;
}

#define VOWELS "aeiou"
char vowels[256];

/*
 * Benchmark case 3: vowel detection with strchr() on VOWELS.
 *
 * Every character of a fixed sentence is added to `t`; a character that
 * strchr() finds in VOWELS is added a second time.
 *
 * Returns the accumulated value converted to int.
 */
int test3(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    const size_t len = strlen(text);

    for (size_t pos = 0; pos < len; ++pos) {
        const char c = text[pos];

        if (strchr(VOWELS, c) != NULL) {
            t += c; /* extra contribution for a vowel */
        }
        t += c;     /* unconditional contribution */
    }

    return (int)t;
}

/*
 * Benchmark case 4: vowel detection with the `vowels` lookup table.
 *
 * Every character of a fixed sentence is added to `t`; a character whose
 * table entry at index (0x20 | character) is non-zero is added a second
 * time.
 *
 * Returns the accumulated value converted to int.
 */
int test4(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    const size_t len = strlen(text);

    for (size_t pos = 0; pos < len; ++pos) {
        const char c = text[pos];

        if (vowels[0x20 | c]) {
            t += c; /* extra contribution for a vowel */
        }
        t += c;     /* unconditional contribution */
    }

    return (int)t;
}

/*
 * Benchmark driver.
 *
 * Fills the vowel lookup table, then calls test1..test4 50,000,000 times
 * each, timing every individual call with clock() and summing the elapsed
 * ticks per test. The return values are accumulated and printed at the end
 * (the accompanying note says the extra code is there in the hope that the
 * compiler will not optimize the functions away).
 *
 * Prints the total seconds per test; test1 is compared against test2 and
 * test3 against test4 to label each as "wins" or "loses".
 */
int main(void)
{
    /* Mark the five lower-case vowels in the lookup table. */
    for (const char *v = "aeiou"; *v != '\0'; ++v) {
        vowels[(unsigned char)*v] = 1;
    }

    const long iterations = 50000000;
    long sink = 0;

    clock_t elapsed1 = 0;
    clock_t elapsed2 = 0;
    clock_t elapsed3 = 0;
    clock_t elapsed4 = 0;

    for (long round = 0; round < iterations; ++round) {
        const time_t seed = time(NULL);
        clock_t begin;

        begin = clock();
        sink += test1(seed);
        elapsed1 += clock() - begin;

        begin = clock();
        sink += test2(seed);
        elapsed2 += clock() - begin;

        begin = clock();
        sink += test3(seed);
        elapsed3 += clock() - begin;

        begin = clock();
        sink += test4(seed);
        elapsed4 += clock() - begin;
    }

    printf("t1: %lf %s\n", ((double) elapsed1) / CLOCKS_PER_SEC,
           elapsed1 < elapsed2 ? "wins" : "loses");
    printf("t2: %lf %s\n", ((double) elapsed2) / CLOCKS_PER_SEC,
           elapsed2 < elapsed1 ? "wins" : "loses");
    printf("t3: %lf %s\n", ((double) elapsed3) / CLOCKS_PER_SEC,
           elapsed3 < elapsed4 ? "wins" : "loses");
    printf("t4: %lf %s\n", ((double) elapsed4) / CLOCKS_PER_SEC,
           elapsed4 < elapsed3 ? "wins" : "loses");
    printf("tmp: %ld\n", sink);

    return 0;
}

结果是:

$ gcc b.c -ob -Wall -O0
$ ./b
t1: 10.866770 loses
t2: 7.588057 wins
t3: 10.801546 loses
t4: 8.366050 wins

$ gcc b.c -ob -Wall -O1
$ ./b
t1: 7.409297 loses
t2: 7.082418 wins
t3: 11.415080 loses
t4: 7.847086 wins

$ gcc b.c -ob -Wall -O2
$ ./b
t1: 6.292438 loses
t2: 5.855348 wins
t3: 9.306874 loses
t4: 6.584076 wins

$ gcc b.c -ob -Wall -O3
$ ./b
t1: 6.317390 loses
t2: 5.922087 wins
t3: 9.436450 loses
t4: 6.722685 wins

关于c - 实现音节化算法但真的很慢,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/49090878/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com