c - 打印给定文本文件中最常出现的单词，无法在 C 中按频率排序-6ren

c - 打印给定文本文件中最常出现的单词，无法在 C 中按频率排序

转载作者：行者123 更新时间：2023-11-30 16:32:37

我正在做一项作业，要求我打印给定文本文件中出现次数最多的 10 个单词。我的代码正在打印文件中的单词，但它没有根据它们的频率对它们进行排序。

我的代码如下。我使用哈希表来存储每个不重复的单词及其出现频率。目前我用自己编写的 wordcmp 函数作为比较函数，在 main 中把它传给标准库的 qsort 函数来对单词排序。

如果有人可以指导我纠正我的错误，我将非常感激。

我当前的输出:

排名前 10 的单词(共 10 个)是:

1 即时

1 个

又是1

3快乐

2你好

1 如何实现

1 让

1 个你

1 次尝试

1 这个

预期输出(我想要的):

排名前 10 的单词(共 10 个)是:

3快乐

2你好

1 个你

1 次尝试

1 这个

1 让

1 即时

1 如何实现

1 个

又是1

这是我的一些代码:

/* One distinct word with its occurrence count; `next` links word nodes
 * into a singly linked list. */
typedef struct word
{ 
  char *s;          /* the word */
  int count;        /* number of times word occurs */
  struct word* next;  /* following node in the list, NULL for the last one */
}word;

/* Hash table of word nodes, indexed by hashcode(key) % tablesize. */
struct hashtable
{
  word **table;     /* array of bucket pointers; NULL marks an empty bucket */
  int tablesize;    /* number of buckets allocated in `table` */
  int currentsize;  /* number of word nodes created so far */
};
typedef struct hashtable hashtable;
/* Entry point of the question's program: loads every word of a text file
 * into the hash table, tries to order the table by frequency with qsort and
 * prints up to `top_words` entries.
 *
 * NOTE(review): this is an excerpt. getWord, freetable, total_unique_words
 * and the loop index `i` are used here but not declared in the visible
 * code; presumably they live in the omitted part of the file.
 */
int main(int argc, char *argv[])
{

    int top_words = 10;
    /* inside main the name `word` now refers to this pointer variable and
     * hides the typedef of the same name */
    word *word = NULL;
    hashtable *hash = ht_create(5000);
    /* NOTE(review): file_name is never assigned in the visible code before
     * it is passed to fopen; presumably it was meant to come from argv. */
    char *file_name;
    char *file_word;
    FILE *fp;
    struct word *present = NULL;

    fp = fopen (file_name, "r");
    if (fp == NULL)
    {
        fprintf (stderr,"%s: No such file or directory\n", file_name);
        fprintf(stderr,"The top %d words (out of 0) are:\n", top_words); 
        exit(-1);
    }

    /* label is not targeted by any goto in the visible code */
    continue_program:
    /* insert (or count) every word returned by getWord until it returns NULL */
    while ((file_word = getWord(fp)))
    {
        word = add(hash, file_word, 1);
    }
    fclose(fp);

    /* NOTE(review): this call is the subject of the question.
     *  - hash->table is an array of `word *`, so qsort hands the comparator
     *    pointers to those elements (effectively `word **`), while wordcmp
     *    dereferences its arguments as `word *`.
     *  - sizeof(word) measures the local pointer variable declared above,
     *    not the struct.
     *  - only the first `currentsize` slots are sorted, although the nodes
     *    are scattered over all `tablesize` buckets, and nodes reachable
     *    only through `next` are never visited.
     */
    qsort((void*)hash->table, hash->currentsize, sizeof(word),(int (*)(const void *, const void *)) wordcmp);

    /* never announce more words than were actually stored */
    if(top_words > total_unique_words)
          top_words = total_unique_words;

    printf("the top %d words (out of %d) are:\n", top_words, total_unique_words);

    int iterations =0;
    /* NOTE(review): `i <= hash->tablesize` also reads table[tablesize], one
     * element past the `tablesize` buckets allocated by ht_create. */
    for(i =0; i <= hash->tablesize && iterations< top_words; i++)
    {
          present = hash->table[i];
          if(present != NULL)
          {
              printf("     %4d %s\n", present->count, present->s);
              /* has no visible effect: `present` is overwritten at the top
               * of the next iteration, so only bucket heads are printed */
              present = present->next;
              iterations++;
          }
    }
    freetable(hash);

 return 0;
}

/* Orders two words by occurrence count, highest count first.
 *
 * Returns +1 when a occurs less often than b, -1 when it occurs more often,
 * and 0 when the counts are equal or when either pointer is NULL.
 *
 * NOTE(review): qsort calls its comparator with pointers to the array
 * elements; if the sorted array stores `word *` values, the arguments
 * arriving here are really `word **` - confirm at the call site.
 */
int wordcmp (word *a, word *b)
{
    if (a == NULL || b == NULL)
        return 0;

    return (a->count < b->count) - (a->count > b->count);
}

/* Create a new hashtable.
 *
 * size: number of buckets to allocate; must be at least 1.
 *
 * Returns a table whose `size` buckets are all NULL and whose currentsize
 * is 0, or NULL when size < 1 or when either allocation fails (nothing is
 * leaked in that case).
 */
struct hashtable *ht_create( int size )
{
  int i;
  hashtable *table;

  if( size < 1 )
    return NULL;

  table = malloc(sizeof *table);
  if( table == NULL )
    return NULL;

  table->table = malloc(sizeof *table->table * (size_t) size);
  if( table->table == NULL )
  {
    /* do not hand out a half-built table */
    free(table);
    return NULL;
  }

  table->currentsize = 0;
  table->tablesize = size;

  /* every bucket starts out empty */
  for( i = 0; i < size; i++ )
  {
    table->table[i] = NULL;
  }

  return table;
}

/* Adds a word to the hash table, or counts it again if it is already there.
 *
 * h:    table to update.
 * key:  NUL-terminated word; a private copy is stored, the caller keeps
 *       ownership of `key`.
 * freq: initial count given to a newly created node (unused when the word
 *       already exists, in which case its count is incremented by one).
 *
 * Returns the node holding the word, or NULL if memory for a new node could
 * not be allocated (the table and the counters are left untouched then).
 *
 * NOTE(review): if hashcode() can return a negative value, `index` becomes
 * negative - confirm hashcode's return type.
 */
word * add(hashtable *h, char *key, int freq)
{
    int index = hashcode(key) % h->tablesize;
    word *current;
    word *newnode;

    /* Search this bucket's chain for the word itself */
    for (current = h->table[index]; current != NULL; current = current->next) {
        if (strcmp(current->s, key) == 0) {
            current->count++;
            return current;
        }
    }

    /* Create new node if no duplicate is found */
    newnode = malloc(sizeof *newnode);
    if (newnode == NULL)
        return NULL;

    newnode->s = strdup(key);
    if (newnode->s == NULL) {
        free(newnode);
        return NULL;
    }
    newnode->count = freq;

    /* push on the front of the chain so words that collided earlier stay
     * reachable instead of being overwritten */
    newnode->next = h->table[index];
    h->table[index] = newnode;

    h->currentsize = h->currentsize + 1;
    total_unique_words++;
    return newnode;
}

最佳答案

您面临的主要问题是：您试图直接对一个用链表(链式存储桶)处理冲突的哈希表进行排序。发生哈希冲突时，您的表不会调整大小，而只是用链表把发生冲突的单词挂在同一个 table[index] 上，与已经存放在那里的单词链接在一起。这正是 add 所做的事情。

这很容易导致哈希表的内容如下所示:

table[ 0] = NULL
table[ 1] = foo
table[ 2] = NULL
table[ 3] = |some|->|words|->|that|->|collided|  /* chained bucket */
table[ 4] = other
table[ 5] = words
table[ 6] = NULL
table[ 7] = NULL
...

您不能简单地对这张表调用 qsort 并指望得到正确的词频排序。qsort 无从知道 "some" 只是链表中的第一个单词；qsort 拿到的只有指向 "some" 的指针和 sizeof(word)。

为了让事情更简单，可以干脆放弃哈希表，改用动态分配的 word** 数组。您可以使用类似的 add 逻辑来增加重复单词的出现次数，同时避开链式存储桶带来的所有问题。(如果为每个单词使用自动存储，即把单词保存在结构体内的定长数组中，那么最后只需对数组指针调用一次 free() 即可完成清理)

以下示例接受 2 个参数。第一个是要从中读取单词的文件名，第二个(可选)是一个整数值，用于把排序后的输出限制为出现次数最多的前若干个单词。words_t 结构使用自动存储保存单词，word 的长度上限为 32 个字符(未删节词典中最长的单词为 28 个字符)。您可以更改单词的存储方式或读取方式，按自己的需要解析输入并忽略标点符号和复数。下面的代码在所有标点符号(连字符除外)处分隔单词，并丢弃单词的所有格后缀(例如，遇到 "Mike's" 时，它会存储 "Mike"，并丢弃 "'s")

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>

#define MAXC   32   /* bytes per stored word incl. the terminating NUL (longest word assumed to be 28 chars, so 29 would suffice) */
#define MAXW  128   /* initial maximum number of words to allocate */

/* one distinct word together with its occurrence count */
typedef struct {
    char word[MAXC];    /* the word itself, stored inline as a NUL-terminated string */
    size_t ninst;       /* and the number of times it occurs */
} words_t;

/*  function prototypes (both functions are defined after main);
 *  each returns the possibly moved block, which the caller must assign back
 */
void *addword (words_t *words, const char *word, size_t *wc, size_t *maxw);
void *xrealloc (void *ptr, size_t psz, size_t *nelem);

/* qsort compare function for words_t (alphabetical) */
int cmpwrds (const void *a, const void *b)
{
    return strcmp (((words_t *)a)->word, ((words_t *)b)->word);
}

/* qsort compare function for words_t (by occurrence - descending)
 * and alphabetical (ascending) if occurrences are equal)
 */
int cmpinst (const void *a, const void *b)
{
    int ndiff =  (((words_t *)a)->ninst < ((words_t *)b)->ninst) - 
                (((words_t *)a)->ninst > ((words_t *)b)->ninst);

    if (ndiff)
        return ndiff;

    return strcmp (((words_t *)a)->word, ((words_t *)b)->word);
}

/** store the word pending in buf[0..nc) and return the (possibly moved)
 *  words array. trailing hyphens are trimmed first; nothing is stored when
 *  the word is then empty or is the lone "s" left behind by a possessive
 *  "'s". 'buf' must have room for the terminating NUL at index 'nc'.
 */
static words_t *flush_word (words_t *words, char *buf, size_t nc,
                            size_t *wc, size_t *maxw)
{
    while (nc > 0 && buf[nc - 1] == '-')    /* "foo-" -> "foo", "-" -> "" */
        nc--;

    if (nc == 0 || (nc == 1 && buf[0] == 's'))
        return words;

    buf[nc] = 0;                            /* nul-terminate */
    return addword (words, buf, wc, maxw);
}

/** count the distinct words of the file named by argv[1] (stdin when no
 *  filename is given) and print them by descending frequency, ties in
 *  alphabetical order. an optional argv[2] limits the output to that many
 *  words; 0, a missing or an invalid value prints all of them.
 *  returns 0 on success, 1 on any failure.
 */
int main (int argc, char **argv) {

    int c = 0, rc = 1;
    size_t nc = 0, maxw = MAXW, wc = 0, top = 0, total = 0;
    char buf[MAXC] = "";
    words_t *words = NULL;
    /* argv[1] may only be touched when it exists; otherwise read stdin */
    FILE *fp = argc > 1 ? fopen (argv[1], "r") : stdin;

    if (!fp) {  /* validate file open for reading (only fopen can fail) */
        fprintf (stderr, "error: file open failed '%s'.\n", argv[1]);
        return 1;
    }

    if (argc > 2) { /* if 2 args, convert argv[2] to number of top words */
        char *p = argv[2];
        size_t tmp;

        errno = 0;  /* strtoul only sets errno on failure, never clears it */
        tmp = strtoul (argv[2], &p, 0);
        if (p != argv[2] && !errno)
            top = tmp;
    }

    /* allocate/validate initial words */
    if (!(words = calloc (maxw, sizeof *words))) {
        perror ("calloc-words");
        goto done;
    }

    while ((c = fgetc(fp)) != EOF) {        /* read each character in file */
        if (c != '-' && (isspace (c) || ispunct (c))) { /* word-end found */
            words = flush_word (words, buf, nc, &wc, &maxw);
            nc = 0;     /* always restart, even when nothing was stored, so
                         * skipped characters never leak into the next word */
        }
        else if (nc < MAXC - 1) {   /* add char to buf */
            buf[nc++] = (char)c;
        }
        else {  /* chars exceed MAXC - 1; storage capability of struct */
            fprintf (stderr, "error: characters exceed %d.\n", MAXC);
            goto done;
        }
    }
    /* handle non-POSIX end: a last word not followed by any delimiter */
    words = flush_word (words, buf, nc, &wc, &maxw);

    qsort (words, wc, sizeof *words, cmpinst);  /* sort words by frequency */

    printf ("'%s' contained '%zu' words.\n\n",  /* output total No. words */
            fp == stdin ? "stdin" : argv[1], wc);

    /* output top words (or all words in descending order if top not given);
     * never index past the words actually stored
     */
    if (top == 0 || top > wc)
        top = wc;

    for (size_t i = 0; i < top; i++) {
        printf ("  %-28s    %5zu\n", words[i].word, words[i].ninst);
        total += words[i].ninst;
    }
    printf ("%33s------\n%34s%5zu\n", " ", "Total: ", total);

    rc = 0;

done:
    free (words);
    if (fp != stdin) fclose (fp);   /* close file if not stdin */

    return rc;
}

/** add word to words, updating pointer to word-count 'wc' and
 *  the maximum words allocated 'maxw' as needed. returns pointer
 *  to words (which must be assigned back in the caller).
 *
 *  if 'word' is already stored its count is incremented, otherwise it
 *  is appended with a count of 1 (the array is grown through xrealloc
 *  when full). a word longer than the struct can hold is truncated to
 *  fit, and lookups compare only that many characters, so an over-long
 *  word is still recognised as a duplicate of its truncated copy.
 */
void *addword (words_t *words, const char *word, size_t *wc, size_t *maxw)
{
    const size_t maxlen = sizeof words->word - 1;   /* chars that fit */
    size_t i, len;

    for (i = 0; i < *wc; i++)
        if (strncmp (words[i].word, word, maxlen) == 0) {
            words[i].ninst++;
            return words;
        }

    if (*wc == *maxw)
        words = xrealloc (words, sizeof *words, maxw);

    len = strlen (word);
    if (len > maxlen)       /* never write past the fixed-size field */
        len = maxlen;
    memcpy (words[*wc].word, word, len);
    words[*wc].word[len] = 0;

    words[*wc].ninst = 1;   /* do not depend on the slot being zero-filled */
    (*wc)++;

    return words;
}

/** realloc 'ptr' of 'nelem' of 'psz' to 'nelem * 2' of 'psz' (to a
 *  single element when '*nelem' is 0, so an empty block can grow too).
 *  returns pointer to reallocated block of memory with new
 *  memory initialized to 0/NULL. return must be assigned to
 *  original pointer in caller. '*nelem' is updated to the new count.
 *  on allocation failure, or when the new size would overflow size_t,
 *  an error is printed and the program exits with EXIT_FAILURE.
 */
void *xrealloc (void *ptr, size_t psz, size_t *nelem)
{
    const size_t szmax = (size_t)-1;        /* largest size_t value */
    size_t oldn = *nelem,
           newn = oldn ? oldn * 2 : 1;
    void *memptr = NULL;

    /* refuse requests whose element count or byte count would wrap */
    if (oldn > szmax / 2 || (psz && newn > szmax / psz))
        errno = ENOMEM;
    else
        memptr = realloc (ptr, newn * psz);

    if (!memptr) {
        perror ("realloc(): virtual memory exhausted.");
        exit (EXIT_FAILURE);
    }

    /* zero new memory (optional) */
    memset ((char *)memptr + oldn * psz, 0, (newn - oldn) * psz);
    *nelem = newn;
    return memptr;
}

(注意:输出按出现次数降序排序，如果单词出现次数相同，则按字母顺序排序)

示例使用/输出

$ ./bin/getchar_wordcnt_top dat/damages.txt 10
'dat/damages.txt' contained '109' words.

  the                                12
  a                                  10
  in                                  7
  of                                  7
  and                                 5
  anguish                             4
  injury                              4
  jury                                4
  mental                              4
  that                                4
                                 ------
                           Total:    61

注意:要使用哈希表作为存储基础，您至少必须创建一个指向哈希表中每个单词的指针数组，然后对指针数组进行排序。否则，您将需要复制存储并将单词复制到新数组中进行排序。 (这在某种程度上是一种内存效率低下的方法)。创建一个单独的指向哈希表中要排序的每个单词的指针数组是调用 qsort 并避免链式存储桶问题的唯一方法。

关于c - 打印给定文本文件中最常出现的单词，无法在 C 中按频率排序，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/50070148/

文章推荐： c - 是什么导致了这些默认数组值？

文章推荐： c - 如何在linux中引发因EINTR而失败的semop调用？

文章推荐： c - 如何获取SSDT地址

文章推荐： c - ARM编码器计数失败

r - 按月计算的观察次数/频率
我有一个包含 34 个变量和大约 25,000 个观测值的数据集。每个观察都涉及一个特定事件。它的格式如下: no id date .... 1 363 006
r - 按月绘制单词的推文计数/频率
我已将 R 连接到 Twitter 并使用 R 中的 searchTwitter 函数进行抓取，并清除标点符号、小写字母等结果数据。现在我正在尝试执行以下操作: 计算自 2015 年 1 月至 20
c++ - 频率-线性仓到对数屏幕空间
我正在研究项目，需要可视化频谱分析以设置一些精确参数。现在，我将垃圾箱转换为屏幕空间，因为在线性空间中，较低频率的幅度被压缩在一起。这是我在C++中的代码: float windowSize = 64
matlab - 频率:在MATLAB中使用FFT的分贝图
我正在尝试使用MATLAB导入WAV文件并创建如下所示的图表类型。我基本上是在尝试获取频率信息并根据分贝对其进行绘制。这是我正在使用的代码，但似乎无法正确提取频率信息: [x fs]=wavread(
java - 计算字符串中字符的出现次数(频率)
我有一个 GUI，可以计算字符串中第一个字母的出现次数。我希望它以列格式计算所有字母，例如: 这是我到目前为止所拥有的: import java.awt.BorderLayout; import ja
r - 计算调查对象中某个值的百分比/频率
我有一个由许多变量组成的全国调查，就像这个(为了简单起见，我省略了一些变量): year id y.b sex income married pens weight 2002
事件的 SQL 频率
我被要求报告我们客户的联系频率，即每周、每月、每季度或每年看到多少客户。当在论坛中讨论“频率”时，它们通常是指某个值在表中存在的次数。我可以获得客户的联系人数量:- select A.cl
c# - 读取麦克风分贝和音调/频率
我正在尝试制作一款游戏，当麦克风发出足够响亮的声音时，我的角色会射击(在 Unity 中)。但是我不知道如何开始。感谢您的帮助! 最佳答案您可以通过使用 AudioSource.GetOutput
使用函数(频率)计算字符数
尝试计算字符数并改进我的代码，我做了一些更改，而不是使用 while 循环。好奇是否有人对我如何改进我的代码以使其更专业且更便宜有任何建议？ #include int countingCharact
php - 如何分析带有时间戳的条目的事件/频率？
我正在创建一个 MySQL 数据库，其中包含大量带有时间戳的条目。这些条目将附加到特定用户和另一个索引(例如博客作者和他的几个网站)。计算用户/全局每日条目图表的最佳方法是什么。我的两种方法是使用
MySQL 频率/分布
我创建了一项调查并将其发送出去。该调查要求用户提供电子邮件，然后要求他们从包含 8 个不同选项的下拉菜单中选择要吃哪顿饭。有些人使用同一封电子邮件多次填写调查，但食物选择不同。我有一个如下所示的 M
日期范围内的 MySQL 频率
我有一个 MySQL 数据库: Date Customer_ID 我怎样才能把它变成: Customer_ID | Count_Visits_Past_Week | Count_Visits_
bash - Gnuplot 频率
对于非常大的数据集，如何使用 gnuplot 仅在第一个和最后一个数据点的 x 轴上放置标记/标签？最佳答案在 gnuplot 4.6 及更高版本中，您可以使用命令 stats 'data.dat
python - 相关词的概率计数/频率？
我正在寻找一种方法来为具有共同词根/含义的单个词生成数值概率值。用户将使用“舞者”、“跳舞”、“跳舞”等词生成内容。如果“dancer”被提交了 30 次，跳舞了 5 次，我只需要一个值“danc
python - 时间序列数据的运行平均值/频率？
给定一个包含如下内容的数据集: [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 65, 75, 85, 86, 87,
java - 绘制声音的音高(频率)
我想将声音的音高绘制成图表。目前我可以绘制幅度。下图是由 getUnscaledAmplitude() 返回的数据创建的: AudioInputStream audioInputStream = A
javascript - 计算数组元素的出现/频率
在 Javascript 中，我试图获取一个初始的数值数组并计算其中的元素。理想情况下，结果将是两个新数组，第一个指定每个唯一元素，第二个包含每个元素出现的次数。不过，我愿意接受有关输出格式的建议。
multithreading - InfiniBand:传输速率取决于MPI_Test *频率
我正在编写一个多线程OpenMPI应用程序，使用来自多个线程的MPI_Isend和MPI_Irecv在InfiniBand RDMA的各个列之间每秒交换数百条消息。传输量约为400-800KByte
.net - 创建包含多个音符(频率)的声音
这个站点上有很多问题，询问如何在给定频率下创建简单的正弦波。我想做的是获取阵列或列表或任何频率，然后连续连续播放它们(而不是和弦)，听起来有点像旧PC扬声器。我尝试使用Console.Beep，但是它
iphone - iPhone-识别波形/频率
我使用我的App捕获声音。假设此声音是正弦1 KHz声音，并且存在背景声音。如何识别此1 KHz声音出现在声音上？我的意思是，我可以想象如何在图像中找到元素，例如，如果您要在图像上寻找黄色正方形，那

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c - 打印给定文本文件中最常出现的单词，无法在 C 中按频率排序