c - HashTable 如何将重复的单词只打印一次？-6ren

c - HashTable 如何将重复的单词只打印一次？

转载作者：太空宇宙更新时间：2023-11-04 03:21:16

我正在实现一个哈希表，但我遇到了一些问题，我总是在添加一个词时打印更新的哈希表，问题是，当这个词再次出现时，我只需要增加它的频率，但我的程序正在以更新的频率再次打印它:如何只打印一次重复的单词？并显示它们的频率。

我遇到的另一个问题是函数 print_freq。它收到一个 int freq，我应该用这个频率打印单词，但问题是 auxTable 没有保存 htable 中的单词，我不知道为什么它不起作用，因为 auxtable 保存频率正常，但是当它是要保存单词，它会保存一个空字符“”。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>

#define HTABLE_SIZE 1001
#define MAX_LINE_SIZ 1024

/* Hash Table */
typedef struct node* HASH_TABLE; /* estrutura de dados utilizadas para formar a hastable*/
struct node {
    char *palavra; /*word*/
    int freq;
};

/*Declaracao das funcoes*/
void inserirHashTable(char *s);
void print_ht();
void print_test();

HASH_TABLE htable[HTABLE_SIZE] = { NULL }; /* Hash table que armazenará as palavras lidas do arquivos */
unsigned int chaves[HTABLE_SIZE]; /* Vetor que armazenará as chaves das palavras da tabela*/
int tamanhoChaves=-1; /*Variavel responsavel por contar a quantidade de chaves do vetor de chaves*/
int size = 0; /* variavel responsavel por armazenar o numero de elementos da tabela*/

/*Função responsavel por processar o arquivo, a mesma recebe o arquivo como parametro,
 * pega cada palavra presente no arquivo separa dos simbolos ?!+.-... e chama a  função
 * inserirHT para inserir a palavra separada na tabela hash*/
void processarArquivo(FILE *fp)
{
    const char *seperators = " ?!'\";,.:+-*&%(){}[]<>\\\t\n"; // caractertes que deveram ser separados

    char line[MAX_LINE_SIZ];
    char *s;
    while((fgets(line,MAX_LINE_SIZ, fp)) != NULL) //pegando a linha do arquivo
    {
        for (s=strtok(line,seperators); s; s=strtok(NULL,seperators)){ // separando a palavra
            /*printf("Palavra a ser inserida %s \n",s); printf utilizado para mostrar
             * a palavra que foi seperada e será inserida*/
            inserirHashTable(s);//Chamando a função inserir
        }
    }
}

/* Função responsavel por criar a chave de cada palavra que vai para tabela,
  recebe como parametro um ponteiro para string, logo em seguida pega cada
  caractere da string e gera um unsigned int para ele, retorna por fim o
  modulo desse unsigned int pelo tamanho da tabela*/
unsigned int hash(char *tok)
{
    unsigned int hv = 0;
    while (*tok)
        hv = (hv << 4) | toupper(*tok++);
    /*printf("conversao: %d \n",hv); Printf utilizado para mostrar o valor de hv antes de ser retorna como modulo*/
    return hv % HTABLE_SIZE;
}

/* funcao responsavel por isenrir a palavra lida do arquivo na hash_table,
 * a funçãp recebe como parametro um ponteiro para estra palavra*/
void inserirHashTable(char *palavra) {
    /*printf("Inserindo a palavra %s \n",palavra); Printf utilzado para mostrar a palavra a ser inserida na tabela*/
    tamanhoChaves++; /*Assim que uma palavra é inserida o numero de chaves é incrementado*/
    chaves[tamanhoChaves] = hash(palavra);/*A palavra é convertida na função hash e sua armazenada no vetor de chaves*/
    unsigned int hashval = chaves[tamanhoChaves]; /*Chave da apalvra*/

    if (htable[hashval]==NULL){
      /*printf("indice %u de %s \n",hashval,palavra);Printf utilizado para mostrar a chave e a palavra a ser inserida*/
        htable[hashval] = malloc(sizeof(palavra)); /*Alocando memoria para palavrra*/
        htable[hashval]->palavra = palavra ; /*Inserindo a palavra*/
        htable[hashval]->freq = 1; /*Incrementado sua frequencia*/
        size++;

    }else {
        /*If a words already exists in the table, i just incremente her frequency and the size. I guess the problem for repeated word is in here*/
        htable[hashval]->freq++;
        size++;
    }
    /*A tabela é impressa a cada instante que uma palavra é inserida*/
    printf("\nAtualização da tabela\n");
    print_ht();/*Impressao das palavras já recebidas, a cada instante, com a quantidade de ocorrências*/

}


/* Function responsible to print the words that were addedd to the hash table*/
void print_ht() {
    int i=0;
    /*Tabela auxiliar que servira para impressao das palavras e suas chaves*/
    HASH_TABLE *auxTable = (HASH_TABLE*) malloc(sizeof(HASH_TABLE)*size);
    unsigned int hashval; /* variavel utilizada para pegar a chave das palavras no vetor de chaves */

    for(i; i < size; i++){
        hashval = chaves[i]; /*Pegando a chave*/
        /*printf("indice %u de %s \n",hashval,htable[hashval]->token);Printf utilizado para ver a chave e a palavra*/
        auxTable[i] = htable[hashval]; /*Atribuindo a palavra e a freq para tabela auxiliar*/
    }

    /*qsort(auxTable,size,sizeof(link),compare);*/
    /*Imprimindo a tabela*/
    printf("Palavra  |  Frequencia\n");
    for (i=0; i < size; i++)
        printf("%s  \t        %d\n",auxTable[i]->palavra,auxTable[i]->freq);
    free(auxTable);
}

/*Funcion responsible to print the words with the frequency received in the paramater*/
void print_freq(int freq){
    printf("Palavras com a frequencia: %d\n", freq);
    int i, j =0;
    HASH_TABLE *auxTable = (HASH_TABLE*) malloc(sizeof(HASH_TABLE)*size);
    unsigned int hashval;

    for(i; i < size; i++){
        hashval = chaves[i];
        /*printf("indice %u de %s \n",hashval,htable[hashval]->palavra);*/
        auxTable[i] = htable[hashval]; /*Problem is in here, when I do this, the auxTable[i]->palavra(word) = "", but the freq has been saved normally n*/
    }

    printf("Palavra  |  Frequencia\n");
    for (i=0; i < size; i++) {
        if(auxTable[i]->freq == freq) {
            printf("%s  \t           %d\n",auxTable[i]->palavra,auxTable[i]->freq); /*When I print, only the frequency is showed*/
        }
    }
    free(auxTable);

}
int main(int argc, char *argv[])
{
    int i;
    FILE *fp;

    fp = fopen("input.txt","r");
    if (NULL == fp)
    {
        fprintf(stderr,"Error ao abrir o arquivo: %s\n",fp);
    }
    printf("Imprimindo processo \n");
    processarArquivo(fp); /* debuuga aqui pra tu entender o q rola*/

    fclose(fp);
    print_freq(3); //should print the word with freq equal to 3
    //print_ht();
    /*clear_ht();*/
    return 0;
}

输出:

最佳答案

这是我解决冲突的方法，并且还允许在必要时调整哈希表的大小，而无需重新哈希所有单词。

首先，我将使用一个单链表来包含具有相同散列的所有不同单词。我还会保存哈希——完整哈希，而不是模哈希表大小——以便轻松调整哈希表的大小。最后，我喜欢对单词本身使用 C99 灵活数组成员:

struct hash_entry {
    struct hash_entry  *next;   /* Pointer to next word in this hash table entry */
    size_t              hash;   /* Any unsigned type works; I just like size_t */
    size_t              freq;   /* Frequency */
    char                word[]; /* C99 flexible array member */
};

哈希表只是一个size指针的数组，散列到hash的每个单词都在卡在entry[hash上的单链表中% 尺寸]:

struct hash_table {
    size_t              size;
    struct hash_entry **entry;
};

然后是初始化、调整大小和释放哈希表的基本函数

int hash_table_create(struct hash_table *ht, size_t size)
{
    size_t  i;

    if (ht == NULL || size < 1)
        return -1; /* Invalid parameters */

    ht->size = size;
    ht->entry = malloc(size * sizeof ht->entry[0]);
    if (ht->entry == NULL)
        return -2; /* Cannot allocate memory */

    /* Clear all entries: no hashes/chains yet! */
    for (i = 0; i < size; i++)        
        ht->entry[i] = NULL;

    return 0; /* Success, no errors. */
}

void hash_entry_free(struct hash_entry *entry)
{
    while (entry) {
        struct hash_entry *next = entry->next;

        /* I like to "poison" the freed entries;
           this makes debugging easier, if you try
           to access an already freed entry. */
        entry->hash = 0;
        entry->freq = 0;
        entry->next = NULL;

        free(entry);

        entry = next;
    }
}

void hash_table_free(struct hash_table *ht)
{
    if (ht != NULL) {
        size_t  i;

        for (i = 0; i < ht->size; i++)
            if (ht->entry[i] != NULL)
                hash_entry_free(ht->entry[i]);

        free(ht->entry);

        ht->size = 0;
        ht->entry = NULL;
    }
}

int hash_table_resize(struct hash_table *ht, size_t new_size)
{
    struct hash_entry **entry;
    struct hash_entry  *curr, *next; 
    size_t  i, k;

    if (!ht || new_size < 1)
        return -1; /* Invalid parameters */

    if (ht->size < 1 || !ht->entry)
        return -2; /* Hash table is already freed */

    entry = malloc(new_size * sizeof entry[0]);
    if (!entry)
        return -3; /* Not enough memory */

    for (i = 0; i < new_size; i++)
        entry[i] = NULL;

    for (i = 0; i < ht->size; i++) {

        /* Chain in the old hash table entry */
        curr = ht->entry[i];            

        /* We are paranoid, and clear the old entry. */
        ht->entry[i] = NULL;

        while (curr) {
            /* Remember next hash in this chain */
            next = curr->next;

            /* Index to the new hash table */
            k = curr->hash % new_size;

            /* Prepend in front of the new hash table entry chain */
            curr->next = entry[k];
            entry[k] = curr;

            /* Advance to the next entry in the old chain */
            curr = next;
        }

    /* The old table is now useless. Free it, and use the new one. */
    free(ht->entry);
    ht->entry = entry;
    ht->size = new_size;

    return 0; /* Success; no errors. */
}

至于哈希函数，我喜欢 djb2 xor hash :

size_t hash(const char *s)
{
    if (s != NULL) {
        size_t  result = 5381;

        while (*s != '\0')
            result = (result * 33) ^ (*(s++));

        return result;
    } else
        return 0;
}

size_t hash_len(const char *s, const size_t len)
{
    if (s != NULL) {
        const char *z = s + len;
        size_t  result = 5381;

        while (s < z)
            result = (result * 33) ^ (*(s++));

        return result;
    } else
        return 0;
}

我还将向哈希表中添加一个字符串/单词分成两个函数:第一个函数创建 struct hash_entry 并将源单词复制到其中，第二个函数使用第一个函数创建条目，然后将其添加到哈希表中:

struct hash_entry *hash_entry_new_len(const char *src, size_t len)
{
    struct hash_entry *h;

    if (len > 0 && !src)
       return NULL; /* NULL src, but positive len! */

    /* To accommodate the flexible array member, we need
       to add its size to the structure size. Since it
       is a string, we need to include room for the '\0'. */
    h = malloc(sizeof (struct hash_entry) + len + 1);
    if (!h)
        return NULL; /* Out of memory */

    /* Copy the string to the entry, */
    if (len > 0)
        memcpy(h->word, src, len);

    /* Add the string-terminating nul char, */
    h->word[len] = '\0';

    /* clear the next pointer, */
    h->next = NULL;

    /* set the frequency count to 1, */
    h->freq = 1;

    /* and compute the hash. */
    h->hash = hash_len(src, len);

    /* Done! */
    return h;
}

struct hash_entry *hash_entry_new(const char *src)
{
    const size_t  len = (src) ? strlen(src) : 0;
    return hash_entry_new_len(src, len);
}

struct hash_entry *hash_table_add_part(struct hash_table *ht, const char *src, const size_t len)
{
    struct hash_entry *h;
    size_t             k;

    if (!ht || ht->size < 1)
        return NULL;  /* No hash table! */

    /* We'll check src and len, so we report the right error. */
    if (!src && len > 0)
        return NULL;  /* Invalid src (NULL)! */

    h = hash_entry_new(src, len);
    if (!h)
        return NULL;  /* Must be out of memory! */

    /* Index into the hash table */
    k = h->hash % ht->size;

    /* Prepend new hash table entry to the beginning of the chain. */
    h->next = ht->entry[k];
    ht->entry[k] = h;

    /* Success! */
    return h;
}

/* Helper function, so you don't need to specify the length
   if you wish to add a copy of entire source string. */
struct hash_entry *hash_table_add(struct hash_table *ht, const char *src)
{
    const size_t  len = (src) ? strlen(src) : 0;
    return hash_table_add_part(ht, src, len);
}

len = (src) ? strlen(src) : 0 表达式是 if (src != NULL) len = strlen(src) 的简写；否则 len = 0;。我经常使用它，作为检查字符串长度的安全方法，或者如果字符串为空或 NULL，则使用 0。

另请注意，NULL 字符串将收到散列 0，而空字符串将散列为 5381。这并不重要，我只是喜欢这样点我(或者像他们说的那样完全过分挑剔并且充满空话)。

请注意，我的“普通”函数以 _len() 结尾，具有相同名称但没有 _len() 后缀的函数是使用整个字符串。这很有用，如果你不使用 strtok() 来分割字符串，但是例如strspn()/strcspn() 甚至正则表达式来查找每个字符串中有趣的单词。

要在哈希表中查找特定单词，我们需要与原始字符串进行比较；哈希本身是不够的:

struct hash_entry *hash_table_find_len(struct hash_table *ht, const char *src, const size_t len)
{
    const size_t  hashval = hash_len(src, len);
    struct hash_entry *curr = ht->entry[hashval % ht->size];

    /* No matches for sure? */
    if (!curr)
        return NULL;

    /* We have a chain (singly-linked list).
       Check each one in turn. */
    while (curr) {

       /* Since we kept the entire hash value,
          and not just the index to the hash table,
          we can use the extra bits to exclude
          words that have the same hash modulus (index)
          but different complete hash value! */
       if (curr->hash == hash) {

           /* We cannot use strncmp() if len == 0,
              so we check that case separately. */
           if (len == 0) {
               if (curr->word[0] == '\0')
                   return curr; /* Match found! */
           } else {
               if (!strncmp(curr->word, src, len) &&
                   curr->word[len] == '\0')
                   return curr; /* Match found! */
           }
       }

       /* No match. Check next one in chain. */
       curr = curr->next;
    }

    /* Nope, no match. */
    return NULL;
}

struct hash_entry *hash_table_find(struct hash_table *ht, const char *src)
{
    const size_t  len = (src) ? strlen(src) : 0;
    return hash_table_find_len(ht, src, len);
}

计算词频现在很简单:

int hash_table_seen_len(struct hash_table *ht, const char *src, const size_t len)
{
    struct hash_entry *h;

    /* Sanity checks first. */
    if (!ht || (!src && len > 0))
        return -1; /* Invalid parameters! */

    h = hash_table_find_len(ht, src, len);
    if (h) {
        /* Found match; increment freq counter. */
        h->freq++;
        /* All done. */
        return 0;
    }

    /* Not found. Add to hash table. */
    h = hash_table_add_len(ht, src, len);
    if (!h) {
        /* An error occurred; should be "out of memory",
           since we checked the other causes earlier
           in this function. */
        return -1;
    }

    /* The word was added to the hash table.
       Since its freq count is 1, we do not need
       to increment it; we're done. */
    return 0;
}

int hash_table_seen(struct hash_table *ht, const char *src)
{
    const size_t  len = (src) ? strlen(src) : 0;
    return hash_table_seen_len(ht, src, len);
}

我很确定要按频率顺序打印哈希表条目，我会使用两个辅助函数:一个用于查找最大频率，另一个用于查找小于给定频率的最大频率:

size_t hash_table_max_freq(struct hash_table *ht)
{
    size_t  result = 0;
    size_t  i;

    if (!ht || ht->size < 1)
        return 0;  /* No hash table. */

    for (i = 0; i < ht->size; i++) {
        struct hash_entry *curr = ht->entry[i];

        while (curr) {
            if (curr->freq > result)
                result = curr->freq;
            curr = curr->next;
        }
    }

    return result;
}

size_t hash_table_next_freq(struct hash_table *ht, const size_t max_freq)
{
    size_t  result = 0;
    size_t  i;

    if (!ht || ht->size < 1)
        return 0;  /* No hash table. */

    for (i = 0; i < ht->size; i++) {
        struct hash_entry *curr = ht->entry[i];

        while (curr) {
            if (curr->freq > result && curr->freq < max_freq)
                result = curr->freq;
            curr = curr->next;
        }
    }

    return result;
}

最后，我们可以“窃取”qsort()的接口(interface)来查找所有单词，或者所有具有特定频率的单词:

int hash_table_for_all(struct hash_table *ht,
                       int (*func)(struct hash_entry *, void *),
                       void *user_data)
{
    int     retval;
    size_t  i;

    if (!ht || !func)
        return -1; /* NULL hash table or function. */

    for (i = 0; i < ht->size; i++) {
        struct hash_entry *curr = ht->entry[i];
        while (curr) {
            retval = func(curr, user_data);
            if (retval)
                return retval;
            curr = curr->next;
        }
    }

    return 0;
}

int hash_table_for_freq(struct hash_table *ht, const size_t freq,
                       int (*func)(struct hash_entry *, void *),
                       void *user_data)
{
    int     retval;
    size_t  i;

    if (!ht || !func)
        return -1; /* NULL hash table or function. */

    for (i = 0; i < ht->size; i++) {
        struct hash_entry *curr = ht->entry[i];
        while (curr) {
            if (curr->freq == freq) {
                retval = func(curr, user_data);
                if (retval)
                    return retval;
            }
            curr = curr->next;
        }
    }

    return 0;
}

以上代码甚至都没有经过编译测试，所以如果您发现任何拼写错误(或遇到编译时错误)，请在评论中告诉我，以便我进行验证和修复。

关于c - HashTable 如何将重复的单词只打印一次？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/46027594/

文章推荐： html - Z-索引不工作

文章推荐： Javascript嵌套数组单元素修改转换为整个数组修改

文章推荐： css - 将 Bootstrap 与 Less/Grunt/NOde 结合使用

数据框列中的Python计数字符串(单词)
我有以下数据框 (df_hvl)，列名“FzListe”和以下数据: FzListe 7MA1, 7OS1 7MA1, 7ZJB 7MA2, 7MA3, 7OS1 76G1, 7MA1, 7OS1 7
在写入文件之前检查文件中是否存在字符串/单词
我有点小问题。仅当尝试写入的相同字符串/单词不存在时，我才想写入文件。在我的例子中，它是一个 IP 地址和端口，用“:”分隔。如果我手动写入文件，例如 193...:80 和 193...:22，它指
正则表达式从逗号之间的最后一个单词中提取(完全匹配)单词？
如何返回结果列中的单词示例？我得到的最接近的是 [\W]{2,}[^,\W].+[?=,] ID 文本我的结果(完全匹配) 预期(完全匹配) 1 词A，世界B，词C , 世界 B, 字B 2 wo
c - 正则表达式以获取英文引号之间的字符串/单词
我想在引号之间得到一个字符串我知道一个解决方案是: /'.*?'/ 但问题是它不适用于英语中的所有格或收缩格例如: What is the name of Mario's brother in t
字符串中出现次数最多的 PHP 单词
我应该在句子中找到出现最多的单词。这是我尝试过的，但不起作用。 '); $max = -1; $resultWords = array(); $resultCount = array(); $i =
Vim高亮显示用于复制到剪贴板的字符/单词/行
我是vim的新手。我正在尝试练习(最近阅读了一些教程)，但是我发现我不能不突出显示“复制粘贴”中的字符/单词/行。在Textmate中，我通常使用SHIFT + CTRL + LeftArrowKe
json - 单词，单词类型和定义的英语JSON词典
有谁知道一个JSON格式的英语词典，该词典具有(单词，定义和单词类型，例如名词/形容词/动词/副词) 这种格式: [ {"Word" : "Chair", "Definition" : "A
Javascript - 无法替换字符串中的 "<"单词
我正在做一些 javascript，同时我注意到我无法替换 html 标记内的“ document.getElementById('label').innerHTML = document.get
groovy - 如何在groovy中替换文本文件中的字符串/单词
您好，我正在使用 groovy 2.1.5，我必须编写一个代码来显示具有给定路径的目录的内容/文件，然后它会备份文件并替换文件中的单词/字符串。这是我用来尝试替换所选文件中的单词的代码 String
python - 用python机器学习识别声音(单词)
我正在准备一个实验，我想使用python编写程序以识别参与者说出的某些单词。我在python中搜索了很多有关语音识别的内容，但结果却很复杂(例如CMUSphinx)。我要实现的是一个程序，该程序接
powershell - 如何忽略字母/单词？
假设我有以下代码: $size = 23.9 $size = "$size GB" write $size 我想在其他事情上使用相同的变量，即 if ($size -lt 20) {...} 这显然是
regex - 单词/字符的选择性正则表达式
我想替换字符串中单词 Date 的所有情况，除非它是 Date()(即 Date 后跟括号)。这是一个字符串示例以及我最初尝试的内容: x gsub("Date", paste("Date:", S
java - 寻找最小的字符串/单词
我对 Java 和编程都很陌生，请记住这一点，请不要对我严厉 ^^。接下来，我最近用 Java 进行了一些培训，我喜欢这个挑战，但现在我只是陷入困境。我做了一些示例来查找用户输入的最大字符串，一切都很
c - 找到出现次数较多的字符串(单词)
我必须给一个数字x，并写x个字符串(单词)。我必须找到写得最多的那一篇。它可以工作，但是当我尝试从文件中读取它时，却没有。例如，如果我执行 a.out'' #include #include in
javascript - 在java脚本的字符串中引用单个字符(单词)
这里是学习者，如果这个问题看起来很荒谬，请多多包涵。假设我试图引用字符串中的字符而不是字符串本身，我该怎么做呢？我的意思是; 给定:var str = "我想知道一个大脑分散的计算机如何保持理智" 我
java - 如何解析此输出并分隔每个字段/单词
这是阿克沙塔。我一直在解析以下数据。我想单独获取每个单词。我可以有一个示例代码以便我可以继续吗 RTRV-HDR RH01 SIMULATOR 09-11-18 16 13 19 M R
java - 如何拆分字符串以仅获取字符/单词
我有一个任意字符串，它总是包含至少一个英文单词后跟一系列数字:"Hello World 1234" 我如何只提取 "Hello World" 来自字符串？最佳答案在我看来你需要反正则表达式: St
java - 如何从字符串中提取子字符串(单词)？
我正在尝试输入一个四个单词的句子，然后能够使用indexOf和子字符串单独打印出每个单词。有什么想法我做错了吗？已编辑那么这就是它应该的样子吗？我已经运行了两次，得到了两个不同的答案，所以我不确定
php - 如何在文本开头查找短语(单词)
如何在文本开头查找短语(单词) 我需要非常快速的解决方案来查明文本是否以某些已知短语开头我在 Mysql (innodb) 表中的短语如下: CREATE TABLE IF NOT EXISTS `
mysql - 选择包含某些字符的MYSQL数据(单词)
我在 MYSQL 表中有一本字典，该表由 240 000 个单词组成。例如，如果我有字母 G、I、G、S、N> 和 O 我想选择表中包含所有或部分这些字母(并且没有其他字母)的所有单词。可接受的词语

太空宇宙

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c - HashTable 如何将重复的单词只打印一次？