C - 在无限长度的行中读取有限长度的单词-6ren

C - 在无限长度的行中读取有限长度的单词

转载作者：太空宇宙更新时间：2023-11-04 07:53:44

我想从文件中读取单词并知道新行何时开始。

我知道每行可以有三个、四个或零个单词，并且单词不能超过一定长度。但是带空格的行长度没有限制，所以不可能只将一行读入一个字符串，解析并继续。我想知道我阅读时每行中是否有三个或四个单词。

目前我使用 fscanf 和一些特定于问题的内部逻辑来决定我读取的第四个单词是在新行中还是在前一行中。但是这种方式很脆弱，很容易坏掉。

我想我可以一个字符一个字符地读取，忽略空格并查找“\n”。有没有更优雅的方式？

谢谢

编辑:我仅限于使用 C99 和标准库。

最佳答案

这里有一些代码可以完成与您的请求密切相关的工作。有几个主要区别:

它不相信用户清楚自己提供的是什么数据：虽然数据本应遵守某些规则，但它假设用户会违反这些规则。
因此，它记录了每一行找到的所有单词，全长记录单词，因此使用动态内存分配。

在我发布它之前，它已经过一些相当严格的测试。您可以使用 make UFLAGS=-DTEST 进行编译以获得更短的行片段(64 字节，而默认为 4096 字节)，这也会为您提供额外的诊断输出。我把 MAX_LINE_LEN 设为 6 而不是 64 做了很多测试——这对于调试一个单词跨越一行的多个片段的问题很有用。

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Upper bound on the number of words main() prints for each input line;
 * every word is still stored and counted. */
enum { MAX_WORD_CNT = 8 };

/*
 * Build-time configuration.
 * debug:        non-zero enables the diagnostic printf() traces.
 * MAX_LINE_LEN: size of the buffer main() hands to fgets(), i.e. the
 *               maximum size of one line fragment including the NUL.
 * Compiling with -DTEST turns tracing on and uses a much smaller buffer,
 * so that input lines are split across several fragments.
 */
#ifdef TEST
static int debug = 1;
enum { MAX_LINE_LEN = 64 };
#else
static int debug = 0;
enum { MAX_LINE_LEN = 4096 };
#endif /* TEST */

/* One stored word: a heap-allocated, NUL-terminated copy of its text. */
typedef struct Word
{
    /* Number of characters in word, excluding the terminating NUL */
    size_t length;
    /* Allocated by addword_wordlist(), possibly grown by extend_word() */
    char  *word;
} Word;

/* Growable array of the words found on one input line. */
typedef struct WordList
{
    /* Number of entries of words currently in use */
    size_t  num_words;
    /* Number of entries allocated in words */
    size_t  max_words;
    /* Array of max_words entries; null while the list is empty */
    Word   *words;
} WordList;

/* Parsing state for one logical input line, carried across the fragments
 * that parse_fragment() is given for that line. */
typedef struct LineControl
{
    /* Characters consumed so far for this line (the newline is counted) */
    size_t   line_length;
    /* True when the previous fragment ended in the middle of a word */
    bool     part_word;
    /* Running length of that unfinished word; only reported in debug output */
    size_t   part_len;
    /* Words collected so far for this line */
    WordList list;
} LineControl;

/*
 * Put a word list into the empty state: no storage, zero capacity and
 * zero words.  Nothing is freed here; any previous contents are simply
 * forgotten.
 */
static void init_wordlist(WordList *list)
{
    list->words = NULL;
    list->max_words = 0;
    list->num_words = 0;
}

/*
 * Release the text of every stored word and the array that holds them,
 * then reset the list to the empty state so it can be reused.
 */
static void free_wordlist(WordList *list)
{
    assert(list != 0);

    const size_t count = list->num_words;
    size_t idx = 0;
    while (idx < count)
    {
        free(list->words[idx].word);
        idx++;
    }

    free(list->words);
    init_wordlist(list);
}

/*
 * Append ext_len characters starting at extn to an existing word.
 *
 * The word's buffer is reallocated to hold the combined text plus a
 * terminating NUL, and word->length is updated.  On allocation failure a
 * message is written to stderr and the program exits with EXIT_FAILURE.
 */
static void extend_word(const char *extn, size_t ext_len, Word *word)
{
    if (debug)
        printf("old (%zu) = [%s]; extra (%zu) = [%.*s]\n", word->length, word->word,
                ext_len, (int)ext_len, extn);

    const size_t old_len = word->length;
    const size_t new_len = old_len + ext_len;
    const size_t space = new_len + 1;       /* +1 for the terminating NUL */

    char *buffer = realloc(word->word, space);
    if (buffer == NULL)
    {
        fprintf(stderr, "failed to reallocate %zu bytes of memory\n", space);
        exit(EXIT_FAILURE);
    }

    memmove(buffer + old_len, extn, ext_len);
    buffer[new_len] = '\0';
    word->word = buffer;
    word->length = new_len;

    if (debug)
        printf("new (%zu) = [%s]\n", word->length, word->word);
}

/*
 * Append a copy of the word_len characters at word to the list.
 *
 * The array of entries is grown (capacity * 2 + 2) when it is full, and
 * the text is stored as a freshly allocated, NUL-terminated string.  On
 * any allocation failure a message is written to stderr and the program
 * exits with EXIT_FAILURE.
 */
static void addword_wordlist(const char *word, size_t word_len, WordList *list)
{
    if (list->num_words >= list->max_words)
    {
        assert(list->num_words == list->max_words);
        const size_t capacity = 2 * list->max_words + 2;
        const size_t n_bytes = capacity * sizeof(Word);
        Word *grown = realloc(list->words, n_bytes);
        if (grown == NULL)
        {
            fprintf(stderr, "failed to allocate %zu bytes of memory\n", n_bytes);
            exit(EXIT_FAILURE);
        }
        list->words = grown;
        list->max_words = capacity;
    }

    const size_t text_size = word_len + 1;  /* +1 for the terminating NUL */
    Word *slot = &list->words[list->num_words];
    slot->word = malloc(text_size);
    if (slot->word == NULL)
    {
        fprintf(stderr, "failed to allocate %zu bytes of memory\n", text_size);
        exit(EXIT_FAILURE);
    }

    memmove(slot->word, word, word_len);
    slot->word[word_len] = '\0';
    slot->length = word_len;
    list->num_words++;
}

/*
 * Prepare the per-line parsing state for a new input line: an empty word
 * list, zero length consumed, and no partial word pending.
 */
static void init_linectrl(LineControl *ctrl)
{
    init_wordlist(&ctrl->list);
    ctrl->part_len = 0;
    ctrl->part_word = false;
    ctrl->line_length = 0;
}

/*
 * Parse one fragment of an input line, as read by fgets() in main().
 *
 * line: NUL-terminated fragment; per the assertions below, a newline may
 *       appear only as its final character.
 * ctrl: per-line state carried between fragments; line_length, part_word,
 *       part_len and the word list are updated in place.
 *
 * Words are runs of characters other than space and tab.  If the previous
 * fragment ended in the middle of a word (ctrl->part_word) and this
 * fragment starts with non-blank characters, those characters are appended
 * to the last word in the list instead of starting a new word.
 *
 * Returns 0 when the fragment ended with a newline (the line is complete)
 * and 1 when no newline was seen (the line continues in the next fragment).
 */
static int parse_fragment(const char *line, LineControl *ctrl)
{
    /* Word separators: space and tab only */
    char   whisp[] = " \t";
    size_t offset = 0;
    bool   got_eol = false;

    /* The only newline in the string is at the end, if it is there at all */
    assert(strchr(line, '\n') == strrchr(line, '\n'));
    assert(strchr(line, '\n') == 0 || *(strchr(line, '\n') + 1) == '\0');
    if (debug && ctrl->part_word)
    {
        assert(ctrl->list.num_words > 0);
        printf("Dealing with partial word on entry (%zu: [%s])\n",
               ctrl->part_len, ctrl->list.words[ctrl->list.num_words - 1].word);
    }

    /* Length of the most recent non-blank run; used after the loop to
     * decide whether the fragment ended in the middle of a word */
    size_t o_nonsp = 0;
    while (line[offset] != '\0')
    {
        /* Measure the next run of blanks and the non-blank run after it */
        size_t n_whisp = strspn(line + offset, whisp);
        size_t n_nonsp = strcspn(line + offset + n_whisp, whisp);
        if (debug)
            printf("offset %zu, whisp %zu, nonsp %zu\n", offset, n_whisp, n_nonsp);
        got_eol = false;
        /* Counted before the newline is stripped, so line_length includes it */
        ctrl->line_length += n_whisp + n_nonsp;
        /* Newline is not in whisp, so it arrives as the last character of
         * the non-blank run; drop it from the word */
        if (line[offset + n_whisp + n_nonsp - 1] == '\n')
        {
            assert(n_nonsp > 0);
            got_eol = true;
            n_nonsp--;
        }
        /* Nothing but the newline remained at this offset */
        if (n_whisp + n_nonsp == 0)
        {
            o_nonsp = 0;
            break;
        }

        /* Leading blanks end any word carried over from the previous fragment */
        if (n_whisp != 0)
        {
            ctrl->part_word = false;
            ctrl->part_len = 0;
        }

        /* Extend the word left unfinished by the previous fragment, or
         * append a new word (the list grows as needed) */
        if (n_nonsp > 0)
        {
            const char *word = line + offset + n_whisp;
            if (ctrl->part_word)
            {
                assert(ctrl->list.num_words > 0);
                extend_word(word, n_nonsp,
                            &ctrl->list.words[ctrl->list.num_words - 1]);
            }
            else
            {
                addword_wordlist(word, n_nonsp, &ctrl->list);
            }
        }

        offset += n_whisp + n_nonsp;
        /* More characters follow in this fragment, so the word just handled
         * is complete and nothing is pending */
        if (line[offset] != '\0')
        {
            ctrl->part_word = false;
            ctrl->part_len = 0;
        }
        o_nonsp = n_nonsp;
        if (got_eol)
            break;
    }

    /* Partial word detection: the fragment ended on non-blank characters
     * without a newline, so the last word may continue in the next fragment */
    if (o_nonsp > 0 && !got_eol)
    {
        ctrl->part_word = true;
        ctrl->part_len += o_nonsp;
    }
    else
    {
        ctrl->part_word = false;
        ctrl->part_len = 0;
    }

    /* If seen newline; line complete */
    /* If No newline; line incomplete */
    return !got_eol;
}

int main(void)
{
    char line[MAX_LINE_LEN];
    size_t lineno = 0;

    while (fgets(line, sizeof(line), stdin) != 0)
    {
        LineControl ctrl;
        init_linectrl(&ctrl);
        lineno++;
        if (debug)
            printf("Line %zu: (%zu) [[%s]]\n", lineno, strlen(line), line);

        int extra = 0;
        while (parse_fragment(line, &ctrl) != 0 &&
               fgets(line, sizeof(line), stdin) != 0)
        {
            if (debug)
                printf("Extra %d for line %zu: (%zu) [[%s]]\n",
                       ++extra, lineno, strlen(line), line);
        }

        WordList *list = &ctrl.list;
        printf("Line %zu: length %zu, words = %zu\n",
               lineno, ctrl.line_length, list->num_words);
        size_t num_words = list->num_words;
        if (num_words > MAX_WORD_CNT)
            num_words = MAX_WORD_CNT;
        for (size_t i = 0; i < num_words; i++)
        {
            printf("  %zu: (%zu) %s\n",
                   i + 1, list->words[i].length, list->words[i].word);
        }
        putchar('\n');
        free_wordlist(&ctrl.list);
    }

    return 0;
}

我有一个没有动态内存分配的更简单的版本，但是当一个词被分成一行的两个片段时它不能正常工作(例如，如果行片段的大小是 6(5 个字符加上空字节)，而一个单词的最大长度为 16，那么代码在组装片段时就会遇到困难)。因此，我采用了一种更简单的方法——存储每个单词的所有内容。从问题中不清楚最大单词长度是多少。如果代码应该拒绝除 0、3 或 4 个单词之外的任何行，则可以使用这些数据来报告此类错误。如果代码应该拒绝长度超过某个值(例如 32)的单词，也可以使用这些数据来报告此类错误。

test-data.1 是一个比较简单的测试文件:

    a b   
    a b      c         d                                                        

1123xxsdfdsfsfdsfdssa          1234ddfxxyff            frrrdds
1123dfdffdfdxxxxxxxxxas                        1234ydfyyyzm   knsaaass      1234asdafxxfrrrfrrrsaa    
               1123werwetrretttrretertre       aaaa     bbbbbb      ccccc        
k
                                                apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                              apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                      apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                                  apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper

其中包含各种制表符，如同一数据的此版本所示，其中制表符显示为 \t:

    a b   
    a b      c         d                                                        
\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
1123xxsdfdsfsfdsfdssa          1234ddfxxyff            frrrdds
1123dfdffdfdxxxxxxxxxas                        1234ydfyyyzm   knsaaass      1234asdafxxfrrrfrrrsaa    
               1123werwetrretttrretertre       aaaa     bbbbbb      ccccc        
k
  \t\t \t \t\t\t \t \t \t\t\t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t\t    \t\t\t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper  \t \t \t \t\t\t\t \t \tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t           \t\t\t\t \t \t \t \t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t\t\t\t\t    \t \t \t \t      \t \t \t

运行这个 awk 脚本分析数据:

$ awk '{ printf "%3d %d [%s]\n", length($0) + 1, NF, $0 }' test-data.1
  1 0 []
  5 0 [    ]
 11 2 [    a b   ]
 81 4 [    a b      c         d                                                        ]
 20 0 [                                                     ]
 63 3 [1123xxsdfdsfsfdsfdssa          1234ddfxxyff            frrrdds]
103 4 [1123dfdffdfdxxxxxxxxxas                        1234ydfyyyzm   knsaaass      1234asdafxxfrrrfrrrsaa    ]
 82 4 [               1123werwetrretttrretertre       aaaa     bbbbbb      ccccc        ]
  2 1 [k]
494 4 [                                                 apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                              apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                      apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                      apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                           ]
$

程序对该数据文件的输出是:

Line 1: length 1, words = 0

Line 2: length 5, words = 0

Line 3: length 11, words = 2
  1: (1) a
  2: (1) b

Line 4: length 81, words = 4
  1: (1) a
  2: (1) b
  3: (1) c
  4: (1) d

Line 5: length 20, words = 0

Line 6: length 63, words = 3
  1: (21) 1123xxsdfdsfsfdsfdssa
  2: (12) 1234ddfxxyff
  3: (7) frrrdds

Line 7: length 103, words = 4
  1: (23) 1123dfdffdfdxxxxxxxxxas
  2: (12) 1234ydfyyyzm
  3: (8) knsaaass
  4: (22) 1234asdafxxfrrrfrrrsaa

Line 8: length 82, words = 4
  1: (25) 1123werwetrretttrretertre
  2: (4) aaaa
  3: (6) bbbbbb
  4: (5) ccccc

Line 9: length 2, words = 1
  1: (1) k

Line 10: length 494, words = 4
  1: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
  2: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
  3: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
  4: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper

您可以在输出中看到来自 awk 脚本的数据。

此代码可在我在 GitHub 上的 SOQ(Stack Overflow Questions)存储库中找到，文件为 /Users/jleffler/soq/src/so-5201-4002 子目录中的 scan59.c、test-data.1、test-data.2 和 test-data.3。特别是 test-data.3 文件包含一行 9955 个字符和 693 个单词的数据 — 以及其他不太严格的测试行。

代码在运行 macOS 10.13.6 High Sierra 的 Mac 上使用 GCC 8.2.0 和 Valgrind 3.14.0.GIT 可以干净地编译并运行。(虽然 makefile 规定了 C11，但此代码中没有任何特定于 C11 的内容；它与 C99 完全兼容。它还可以使用 make SFLAGS='-std=c99 -pedantic' 干净地编译。)

关于C - 在无限长度的行中读取有限长度的单词，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/52014002/

文章推荐： c - 指向c中匿名结构数组的指针

文章推荐： c - 我尝试打印列表时出现问题

文章推荐： c - 检查函数返回值A，但在main中，发现值为-1

Python:Pandas 读取 csv:读取 csv 时向下转换
我遇到以下问题。我想读取一个包含数百万行和数百列的大型 csv。我想向下转换列的数据类型。我的方法是读取 csv，然后使用 pd.to_numeric() 对其进行向下转换。我不知道列数及其类型。在读
c# - 从 SQL Server 读取 - 需要从 CSV 读取
目前，我从 SQL server (2008) 数据库获取数据。 cyurrent的方法是使用DataTable，然后将其传递并使用。 if (parameters != null)
python - Dask 读取 csv 与 pandas 读取 csv
我有以下问题。我有一个巨大的 csv 文件，想用多处理加载它。对于一个包含 500000 行和 130 列不同数据类型的示例文件，Pandas 需要 19 秒。我试过 dask 因为我想多处理阅读。但
java - 读取 VC++ CArchive 二进制格式(或 Java 读取 (CObArray))
是否有关于用于序列化各种 MFC 数据结构的二进制格式的明确文档？我已经能够在十六进制编辑器中查看我自己的一些类，并使用 Java 的 ByteBuffer 类读取它们(使用自动字节顺序转换等)。但
javascript - 是否可以使用 javascript 读取 something.properties ？ (在 .hta 文件中)>> 读取 Selenium
我正在使用 Selenium 进行测试，我们用 HTML 文件编写测试用例，并用它们制作测试套件，我们的要求是编写足够健壮的测试用例，以根据测试环境改变自身。为此，我不希望在 HTML 脚本本身中包
读取.txt文件的Javascript代码
我需要一个 JavaScript 代码来读取存储为 .txt 文件的字典(或者也可以保存为任何其他类型的文件。它也可以在线获得)并将其内容存储在一个变量中。我不能找到一种让 JavaScript 像
java - 读取/过滤文本文件的最快方法是什么
我正在尝试遍历包含 SSH 登录和其他日志的日志文本文件。程序正在返回 SSH 登录的总数。我的解决方案确实有效，但似乎有点慢(在 200mo 文件上大约需要 3.5 秒)。我想知道是否有任何方法
vba - 读取/写入大量数据
我正在将大量数据从一个电子表格复制到工作簿中的其他 160 个电子表格。目前，Excel (2013) 遇到错误，因为它没有足够的资源来完成操作。我的目标是将工作表 4 中 V13:XI1150 范
VBA 读取/搜索文本文件
我正在尝试读取一个有 1147 行的文本文件。下面的代码仅读取第 1050-1147 行。我的目标是读取整个文件并提取位于不同行的特定值以在脚本中使用。一个示例是包含“BlockList: 2”的行中
Prolog 用户输入的时间限制(读取)
我正在为游戏编写解释器。用户将其移动输入解释器，程序执行该移动。现在我想为每个决定实现一个时间限制。玩家不应该能够思考超过 30 秒来写一个移动并按下回车。 call_with_time_limit
读取 HITRAN 文件格式
以this file例如，我正在尝试读取 data.frame 中的数据。来自 the doc (pdf 文件，表 1)，它遵循一些 fortran 约定。我尝试了以下但收效甚微: dir 0' 将
读取 R 的电子邮件附件
我正在使用 R 阅读 Outlook 附件。我的引用在这里:Download attachment from an outlook email using R 这是我的电子邮件的截图: 这每天都会发送
php xmlrpc 读取
我不会从表格中读取行来将主题放在列表中 php脚本 $url_obj='http://'.$host.':8069/xmlrpc/object'; $sock=new xmlrpc_client($u
读取 csv 但跳过字符串中的转义逗号
我有一个这样的 csv 文件: id,name,value 1,peter,5 2,peter\,paul,3 我如何读取此文件并告诉 R "\," 不表示新列，仅表示 ","。我必须添加该文件
macos - 读取 plist
我正在尝试读取 ~/Library/Preferences/com.apple.mail.plist (在 Snow Leopard 上)以获取电子邮件地址和其他信息以进入“关于”对话框。我使用以下代
bash - 读取 float
This question already has answers here: How do I use floating-point division in bash? (19个回答) 5个月前关闭
COBOL 读取/存储在表中
本练习的目标是读取输入文件并将其存储到表中，然后验证输入中的某些字段并输出任何错误记录。我需要读取并存储每个策略组，以便表中一次仅存储 5 条记录，而不是整个文件。所以我需要读取一个包含 5 条记录
Cassandra LWT 读取
据我了解，LWT 插入始终以 SERIAL 一致性级别完成。如果为 true，这是否意味着读取作为 LWT 插入的行可以安全地以 ANY 的一致性级别读取？换句话说，我假设 LWT 插入是完全一致的
JavaScript:读取 cookie
我看到很多很多通过java脚本读取cookie的函数，但我只想在变量中使用它一次，我是JS新手。这是我的代码 var TheNumber = (Math.random() + '') * 10000
c# - 读取.net中的程序集
我正在使用 asp.net 和 C#。我在服务器上部署了一个应用程序[已发布]，现在我想查看该网站的代码，据我所知，我可以阅读程序集来查看代码。请告诉我如何实现它。提前致谢。最佳答案您可以使用

太空宇宙

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

C - 在无限长度的行中读取有限长度的单词