gpt4 book ai didi

c - HashTable 如何将重复的单词只打印一次?

转载 作者:太空宇宙 更新时间:2023-11-04 03:21:16 27 4
gpt4 key购买 nike

我正在实现一个哈希表,但我遇到了一些问题,我总是在添加一个词时打印更新的哈希表,问题是,当这个词再次出现时,我只需要增加它的频率,但我的程序正在以更新的频率再次打印它:如何只打印一次重复的单词?并显示它们的频率。

我遇到的另一个问题是函数 print_freq。它收到一个 int freq,我应该用这个频率打印单词,但问题是 auxTable 没有保存 htable 中的单词,我不知道为什么它不起作用,因为 auxtable 保存频率正常,但是当它是要保存单词,它会保存一个空字符“”。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>

#define HTABLE_SIZE 1001
#define MAX_LINE_SIZ 1024

/* Hash Table */
typedef struct node* HASH_TABLE; /* estrutura de dados utilizadas para formar a hastable*/
struct node {
char *palavra; /*word*/
int freq;
};

/*Declaracao das funcoes*/
void inserirHashTable(char *s);
void print_ht();
void print_test();

HASH_TABLE htable[HTABLE_SIZE] = { NULL }; /* Hash table que armazenará as palavras lidas do arquivos */
unsigned int chaves[HTABLE_SIZE]; /* Vetor que armazenará as chaves das palavras da tabela*/
int tamanhoChaves=-1; /*Variavel responsavel por contar a quantidade de chaves do vetor de chaves*/
int size = 0; /* variavel responsavel por armazenar o numero de elementos da tabela*/

/*Função responsavel por processar o arquivo, a mesma recebe o arquivo como parametro,
* pega cada palavra presente no arquivo separa dos simbolos ?!+.-... e chama a função
* inserirHT para inserir a palavra separada na tabela hash*/
void processarArquivo(FILE *fp)
{
const char *seperators = " ?!'\";,.:+-*&%(){}[]<>\\\t\n"; // caractertes que deveram ser separados

char line[MAX_LINE_SIZ];
char *s;
while((fgets(line,MAX_LINE_SIZ, fp)) != NULL) //pegando a linha do arquivo
{
for (s=strtok(line,seperators); s; s=strtok(NULL,seperators)){ // separando a palavra
/*printf("Palavra a ser inserida %s \n",s); printf utilizado para mostrar
* a palavra que foi seperada e será inserida*/
inserirHashTable(s);//Chamando a função inserir
}
}
}

/* Função responsavel por criar a chave de cada palavra que vai para tabela,
recebe como parametro um ponteiro para string, logo em seguida pega cada
caractere da string e gera um unsigned int para ele, retorna por fim o
modulo desse unsigned int pelo tamanho da tabela*/
unsigned int hash(char *tok)
{
unsigned int hv = 0;
while (*tok)
hv = (hv << 4) | toupper(*tok++);
/*printf("conversao: %d \n",hv); Printf utilizado para mostrar o valor de hv antes de ser retorna como modulo*/
return hv % HTABLE_SIZE;
}

/* funcao responsavel por isenrir a palavra lida do arquivo na hash_table,
* a funçãp recebe como parametro um ponteiro para estra palavra*/
void inserirHashTable(char *palavra) {
/*printf("Inserindo a palavra %s \n",palavra); Printf utilzado para mostrar a palavra a ser inserida na tabela*/
tamanhoChaves++; /*Assim que uma palavra é inserida o numero de chaves é incrementado*/
chaves[tamanhoChaves] = hash(palavra);/*A palavra é convertida na função hash e sua armazenada no vetor de chaves*/
unsigned int hashval = chaves[tamanhoChaves]; /*Chave da apalvra*/

if (htable[hashval]==NULL){
/*printf("indice %u de %s \n",hashval,palavra);Printf utilizado para mostrar a chave e a palavra a ser inserida*/
htable[hashval] = malloc(sizeof(palavra)); /*Alocando memoria para palavrra*/
htable[hashval]->palavra = palavra ; /*Inserindo a palavra*/
htable[hashval]->freq = 1; /*Incrementado sua frequencia*/
size++;

}else {
/*If a words already exists in the table, i just incremente her frequency and the size. I guess the problem for repeated word is in here*/
htable[hashval]->freq++;
size++;
}
/*A tabela é impressa a cada instante que uma palavra é inserida*/
printf("\nAtualização da tabela\n");
print_ht();/*Impressao das palavras já recebidas, a cada instante, com a quantidade de ocorrências*/

}


/* Function responsible to print the words that were addedd to the hash table*/
void print_ht() {
int i=0;
/*Tabela auxiliar que servira para impressao das palavras e suas chaves*/
HASH_TABLE *auxTable = (HASH_TABLE*) malloc(sizeof(HASH_TABLE)*size);
unsigned int hashval; /* variavel utilizada para pegar a chave das palavras no vetor de chaves */

for(i; i < size; i++){
hashval = chaves[i]; /*Pegando a chave*/
/*printf("indice %u de %s \n",hashval,htable[hashval]->token);Printf utilizado para ver a chave e a palavra*/
auxTable[i] = htable[hashval]; /*Atribuindo a palavra e a freq para tabela auxiliar*/
}

/*qsort(auxTable,size,sizeof(link),compare);*/
/*Imprimindo a tabela*/
printf("Palavra | Frequencia\n");
for (i=0; i < size; i++)
printf("%s \t %d\n",auxTable[i]->palavra,auxTable[i]->freq);
free(auxTable);
}

/*Funcion responsible to print the words with the frequency received in the paramater*/
void print_freq(int freq){
printf("Palavras com a frequencia: %d\n", freq);
int i, j =0;
HASH_TABLE *auxTable = (HASH_TABLE*) malloc(sizeof(HASH_TABLE)*size);
unsigned int hashval;

for(i; i < size; i++){
hashval = chaves[i];
/*printf("indice %u de %s \n",hashval,htable[hashval]->palavra);*/
auxTable[i] = htable[hashval]; /*Problem is in here, when I do this, the auxTable[i]->palavra(word) = "", but the freq has been saved normally n*/
}

printf("Palavra | Frequencia\n");
for (i=0; i < size; i++) {
if(auxTable[i]->freq == freq) {
printf("%s \t %d\n",auxTable[i]->palavra,auxTable[i]->freq); /*When I print, only the frequency is showed*/
}
}
free(auxTable);

}
int main(int argc, char *argv[])
{
int i;
FILE *fp;

fp = fopen("input.txt","r");
if (NULL == fp)
{
fprintf(stderr,"Error ao abrir o arquivo: %s\n",fp);
}
printf("Imprimindo processo \n");
processarArquivo(fp); /* debuuga aqui pra tu entender o q rola*/

fclose(fp);
print_freq(3); //should print the word with freq equal to 3
//print_ht();
/*clear_ht();*/
return 0;
}

输出:

https://imgur.com/a/PlRPp

最佳答案

这是我解决冲突的方法,并且还允许在必要时调整哈希表的大小,而无需重新哈希所有单词。

首先,我将使用一个单链表来包含具有相同散列的所有不同单词。我还会保存哈希——完整哈希,而不是模哈希表大小——以便轻松调整哈希表的大小。最后,我喜欢对单词本身使用 C99 灵活数组成员:

struct hash_entry {
struct hash_entry *next; /* Pointer to next word in this hash table entry */
size_t hash; /* Any unsigned type works; I just like size_t */
size_t freq; /* Frequency */
char word[]; /* C99 flexible array member */
};

哈希表只是一个size指针的数组,散列到hash的每个单词都在卡在entry[hash上的单链表中% 尺寸]:

struct hash_table {
size_t size;
struct hash_entry **entry;
};

然后是初始化、调整大小和释放哈希表的基本函数

int hash_table_create(struct hash_table *ht, size_t size)
{
size_t i;

if (ht == NULL || size < 1)
return -1; /* Invalid parameters */

ht->size = size;
ht->entry = malloc(size * sizeof ht->entry[0]);
if (ht->entry == NULL)
return -2; /* Cannot allocate memory */

/* Clear all entries: no hashes/chains yet! */
for (i = 0; i < size; i++)
ht->entry[i] = NULL;

return 0; /* Success, no errors. */
}

void hash_entry_free(struct hash_entry *entry)
{
while (entry) {
struct hash_entry *next = entry->next;

/* I like to "poison" the freed entries;
this makes debugging easier, if you try
to access an already freed entry. */
entry->hash = 0;
entry->freq = 0;
entry->next = NULL;

free(entry);

entry = next;
}
}

void hash_table_free(struct hash_table *ht)
{
if (ht != NULL) {
size_t i;

for (i = 0; i < ht->size; i++)
if (ht->entry[i] != NULL)
hash_entry_free(ht->entry[i]);

free(ht->entry);

ht->size = 0;
ht->entry = NULL;
}
}

int hash_table_resize(struct hash_table *ht, size_t new_size)
{
struct hash_entry **entry;
struct hash_entry *curr, *next;
size_t i, k;

if (!ht || new_size < 1)
return -1; /* Invalid parameters */

if (ht->size < 1 || !ht->entry)
return -2; /* Hash table is already freed */

entry = malloc(new_size * sizeof entry[0]);
if (!entry)
return -3; /* Not enough memory */

for (i = 0; i < new_size; i++)
entry[i] = NULL;

for (i = 0; i < ht->size; i++) {

/* Chain in the old hash table entry */
curr = ht->entry[i];

/* We are paranoid, and clear the old entry. */
ht->entry[i] = NULL;

while (curr) {
/* Remember next hash in this chain */
next = curr->next;

/* Index to the new hash table */
k = curr->hash % new_size;

/* Prepend in front of the new hash table entry chain */
curr->next = entry[k];
entry[k] = curr;

/* Advance to the next entry in the old chain */
curr = next;
}

/* The old table is now useless. Free it, and use the new one. */
free(ht->entry);
ht->entry = entry;
ht->size = new_size;

return 0; /* Success; no errors. */
}

至于哈希函数,我喜欢 djb2 xor hash :

size_t hash(const char *s)
{
if (s != NULL) {
size_t result = 5381;

while (*s != '\0')
result = (result * 33) ^ (*(s++));

return result;
} else
return 0;
}

size_t hash_len(const char *s, const size_t len)
{
if (s != NULL) {
const char *z = s + len;
size_t result = 5381;

while (s < z)
result = (result * 33) ^ (*(s++));

return result;
} else
return 0;
}

我还将向哈希表中添加一个字符串/单词分成两个函数:第一个函数创建 struct hash_entry 并将源单词复制到其中,第二个函数使用第一个函数创建条目,然后将其添加到哈希表中:

struct hash_entry *hash_entry_new_len(const char *src, size_t len)
{
struct hash_entry *h;

if (len > 0 && !src)
return NULL; /* NULL src, but positive len! */

/* To accommodate the flexible array member, we need
to add its size to the structure size. Since it
is a string, we need to include room for the '\0'. */
h = malloc(sizeof (struct hash_entry) + len + 1);
if (!h)
return NULL; /* Out of memory */

/* Copy the string to the entry, */
if (len > 0)
memcpy(h->word, src, len);

/* Add the string-terminating nul char, */
h->word[len] = '\0';

/* clear the next pointer, */
h->next = NULL;

/* set the frequency count to 1, */
h->freq = 1;

/* and compute the hash. */
h->hash = hash_len(src, len);

/* Done! */
return h;
}

struct hash_entry *hash_entry_new(const char *src)
{
const size_t len = (src) ? strlen(src) : 0;
return hash_entry_new_len(src, len);
}

struct hash_entry *hash_table_add_part(struct hash_table *ht, const char *src, const size_t len)
{
struct hash_entry *h;
size_t k;

if (!ht || ht->size < 1)
return NULL; /* No hash table! */

/* We'll check src and len, so we report the right error. */
if (!src && len > 0)
return NULL; /* Invalid src (NULL)! */

h = hash_entry_new(src, len);
if (!h)
return NULL; /* Must be out of memory! */

/* Index into the hash table */
k = h->hash % ht->size;

/* Prepend new hash table entry to the beginning of the chain. */
h->next = ht->entry[k];
ht->entry[k] = h;

/* Success! */
return h;
}

/* Helper function, so you don't need to specify the length
if you wish to add a copy of entire source string. */
struct hash_entry *hash_table_add(struct hash_table *ht, const char *src)
{
const size_t len = (src) ? strlen(src) : 0;
return hash_table_add_part(ht, src, len);
}

len = (src) ? strlen(src) : 0 表达式是 if (src != NULL) len = strlen(src) 的简写;否则 len = 0;。我经常使用它,作为检查字符串长度的安全方法,或者如果字符串为空或 NULL,则使用 0。

另请注意,NULL 字符串将收到散列 0,而空字符串将散列为 5381。这并不重要,我只是喜欢这样点我(或者像他们说的那样完全过分挑剔并且充满空话)。

请注意,我的“普通”函数以 _len() 结尾,具有相同名称但没有 _len() 后缀的函数是使用整个字符串。这很有用,如果你不使用 strtok() 来分割字符串,但是例如strspn()/strcspn() 甚至正则表达式来查找每个字符串中有趣的单词。

要在哈希表中查找特定单词,我们需要与原始字符串进行比较;哈希本身是不够的:

struct hash_entry *hash_table_find_len(struct hash_table *ht, const char *src, const size_t len)
{
const size_t hashval = hash_len(src, len);
struct hash_entry *curr = ht->entry[hashval % ht->size];

/* No matches for sure? */
if (!curr)
return NULL;

/* We have a chain (singly-linked list).
Check each one in turn. */
while (curr) {

/* Since we kept the entire hash value,
and not just the index to the hash table,
we can use the extra bits to exclude
words that have the same hash modulus (index)
but different complete hash value! */
if (curr->hash == hash) {

/* We cannot use strncmp() if len == 0,
so we check that case separately. */
if (len == 0) {
if (curr->word[0] == '\0')
return curr; /* Match found! */
} else {
if (!strncmp(curr->word, src, len) &&
curr->word[len] == '\0')
return curr; /* Match found! */
}
}

/* No match. Check next one in chain. */
curr = curr->next;
}

/* Nope, no match. */
return NULL;
}

struct hash_entry *hash_table_find(struct hash_table *ht, const char *src)
{
const size_t len = (src) ? strlen(src) : 0;
return hash_table_find_len(ht, src, len);
}

计算词频现在很简单:

int hash_table_seen_len(struct hash_table *ht, const char *src, const size_t len)
{
struct hash_entry *h;

/* Sanity checks first. */
if (!ht || (!src && len > 0))
return -1; /* Invalid parameters! */

h = hash_table_find_len(ht, src, len);
if (h) {
/* Found match; increment freq counter. */
h->freq++;
/* All done. */
return 0;
}

/* Not found. Add to hash table. */
h = hash_table_add_len(ht, src, len);
if (!h) {
/* An error occurred; should be "out of memory",
since we checked the other causes earlier
in this function. */
return -1;
}

/* The word was added to the hash table.
Since its freq count is 1, we do not need
to increment it; we're done. */
return 0;
}

int hash_table_seen(struct hash_table *ht, const char *src)
{
const size_t len = (src) ? strlen(src) : 0;
return hash_table_seen_len(ht, src, len);
}

我很确定要按频率顺序打印哈希表条目,我会使用两个辅助函数:一个用于查找最大频率,另一个用于查找小于给定频率的最大频率:

size_t hash_table_max_freq(struct hash_table *ht)
{
size_t result = 0;
size_t i;

if (!ht || ht->size < 1)
return 0; /* No hash table. */

for (i = 0; i < ht->size; i++) {
struct hash_entry *curr = ht->entry[i];

while (curr) {
if (curr->freq > result)
result = curr->freq;
curr = curr->next;
}
}

return result;
}

size_t hash_table_next_freq(struct hash_table *ht, const size_t max_freq)
{
size_t result = 0;
size_t i;

if (!ht || ht->size < 1)
return 0; /* No hash table. */

for (i = 0; i < ht->size; i++) {
struct hash_entry *curr = ht->entry[i];

while (curr) {
if (curr->freq > result && curr->freq < max_freq)
result = curr->freq;
curr = curr->next;
}
}

return result;
}

最后,我们可以“窃取”qsort()的接口(interface)来查找所有单词,或者所有具有特定频率的单词:

int hash_table_for_all(struct hash_table *ht,
int (*func)(struct hash_entry *, void *),
void *user_data)
{
int retval;
size_t i;

if (!ht || !func)
return -1; /* NULL hash table or function. */

for (i = 0; i < ht->size; i++) {
struct hash_entry *curr = ht->entry[i];
while (curr) {
retval = func(curr, user_data);
if (retval)
return retval;
curr = curr->next;
}
}

return 0;
}

int hash_table_for_freq(struct hash_table *ht, const size_t freq,
int (*func)(struct hash_entry *, void *),
void *user_data)
{
int retval;
size_t i;

if (!ht || !func)
return -1; /* NULL hash table or function. */

for (i = 0; i < ht->size; i++) {
struct hash_entry *curr = ht->entry[i];
while (curr) {
if (curr->freq == freq) {
retval = func(curr, user_data);
if (retval)
return retval;
}
curr = curr->next;
}
}

return 0;
}

以上代码甚至都没有经过编译测试,所以如果您发现任何拼写错误(或遇到编译时错误),请在评论中告诉我,以便我进行验证和修复。

关于c - HashTable 如何将重复的单词只打印一次?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/46027594/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com