gpt4 book ai didi

c - 数组中唯一值的记录

转载 作者:太空宇宙 更新时间:2023-11-04 04:18:29 24 4
gpt4 key购买 nike

我需要添加代码,在拼写检查时去除重复的单词(例如,文本中多次出现单词“book”时,程序输出中会把它重复显示,如 book book)。词典大约有 140 000 个单词。请告诉我如何进行唯一性检查,以便只把不重复的单词写入 'uniq' 数组。

#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <string.h>

#include "dictionary.h"
#undef calculate
#undef getrusage

// default dictionary path, used by main() when no dictionary argument is given
#define DICTIONARY "dictionaries/large"

// prototype: elapsed time between two rusage snapshots, in seconds
// (b is the earlier snapshot, a the later one)
double calculate(const struct rusage* b, const struct rusage* a);

/**
 * Spell-checker driver.
 *
 * Usage: speller [dictionary] text
 *   - with two arguments, argv[1] is the dictionary and argv[2] the text;
 *   - with one argument, DICTIONARY is used and argv[1] is the text.
 *
 * Loads the dictionary, reads the text character by character, passes every
 * completed word to check(), stores each misspelled word once in `uniq`,
 * prints the contents of `uniq`, then prints the counters and the time spent
 * in load(), check(), size() and unload().
 *
 * Returns 0 on success and 1 on a usage error, a dictionary load failure,
 * a text open/read failure, or a dictionary unload failure.
 */
int main(int argc, char* argv[])
{
// check for correct number of args
if (argc != 2 && argc != 3)
{
printf("Usage: speller [dictionary] text\n");
return 1;
}

// structs for timing data
struct rusage before, after;

// benchmarks
double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;

// determine dictionary to use
char* dictionary = (argc == 3) ? argv[1] : DICTIONARY;

// load dictionary
getrusage(RUSAGE_SELF, &before);
bool loaded = load(dictionary);
getrusage(RUSAGE_SELF, &after);

// abort if dictionary not loaded
if (!loaded)
{
printf("Could not load %s.\n", dictionary);
return 1;
}

// calculate time to load dictionary
time_load = calculate(&before, &after);

// try to open text
char* text = (argc == 3) ? argv[2] : argv[1];

// fp is the stream the text to be checked is read from
FILE* fp = fopen(text, "r");

if (fp == NULL)
{
printf("Could not open %s.\n", text);
unload();
return 1;
}

// prepare to report misspellings
printf("\nMISSPELLED WORDS\n\n");

// prepare to spell-check
int index = 0, misspellings = 0, words = 0;
char word[LENGTH+1];

// table of distinct misspelled words: 300 rows of up to 49 characters + '\0'
// NOTE(review): this automatic array has no initializer, so its contents are
// indeterminate. The strcmp() calls below read rows that were never written,
// which is the likely source of the stray symbols in the printed list;
// zero-initializing it (e.g. `= {{0}}`) should be confirmed as the fix.
// NOTE(review): assumes LENGTH + 1 <= 50 so that strcpy() below cannot
// overflow a row; LENGTH comes from dictionary.h -- TODO confirm.
char uniq[300][50];

// per-word "already stored" flag: set when the current misspelled word is
// found in uniq, reset to 0 after each misspelled word is handled
int countsUniq = 0;

// spell-check each word in text
for (int c = fgetc(fp); c != EOF; c = fgetc(fp))
{
// allow only alphabetical characters and apostrophes
// (an apostrophe is accepted only after the first character of a word)
if (isalpha(c) || (c == '\'' && index > 0))
{
// append character to word
word[index] = c;
index++;

// ignore alphabetical strings too long to be words
if (index > LENGTH)
{
// consume remainder of alphabetical string (loop body is intentionally empty)
while ((c = fgetc(fp)) != EOF && isalpha(c));

// prepare for new word
index = 0;
}
}

// ignore words with numbers (like MS Word can)
else if (isdigit(c))
{
// consume remainder of alphanumeric string (loop body is intentionally empty)
while ((c = fgetc(fp)) != EOF && isalnum(c));

// prepare for new word
index = 0;
}

// we must have found a whole word
else if (index > 0)
{
// terminate current word
word[index] = '\0';

// update counter
words++;

// check word's spelling; only the check() call itself is timed
getrusage(RUSAGE_SELF, &before);
bool misspelled = !check(word);
getrusage(RUSAGE_SELF, &after);

// update benchmark
time_check += calculate(&before, &after);


// prepare for next word
index = 0;
// record word if misspelled (printing happens after the loop)
if (misspelled)
{
// misspellings counts every occurrence, not only distinct words
misspellings++;
// linear search: is this word already stored in uniq?
for(int j = 0; j < 300; j++){
if(strcmp(uniq[j], word) == 0){
countsUniq++;
break;
}
}
// not stored yet: copy it into the first row that holds an empty string
// NOTE(review): if no row compares equal to "", the word is silently dropped
if(countsUniq == 0){
for(int i = 0; i < 300; i++){
if(strcmp(uniq[i], "") == 0){
strcpy(uniq[i], word);
break;
}
}
}
// reset the flag for the next misspelled word
countsUniq = 0;
}
}
}

// print every row of uniq, one per line
// NOTE(review): all 300 rows are printed, including rows that were never
// filled, so unused rows show up as blank (or garbage) lines in the output
for(int i = 0; i < 300; i++){
printf("%s\n", uniq[i]);
}

// check whether there was an error
if (ferror(fp))
{
fclose(fp);
printf("Error reading %s.\n", text);
unload();
return 1;
}

// close text
fclose(fp);

// determine dictionary's size
getrusage(RUSAGE_SELF, &before);
unsigned int n = size();
getrusage(RUSAGE_SELF, &after);

// calculate time to determine dictionary's size
time_size = calculate(&before, &after);

// unload dictionary
getrusage(RUSAGE_SELF, &before);
bool unloaded = unload();
getrusage(RUSAGE_SELF, &after);

// abort if dictionary not unloaded
if (!unloaded)
{
printf("Could not unload %s.\n", dictionary);
return 1;
}

// calculate time to unload dictionary
time_unload = calculate(&before, &after);

// report benchmarks
printf("\nWORDS MISSPELLED: %d\n", misspellings);
printf("WORDS IN DICTIONARY: %d\n", n);
printf("WORDS IN TEXT: %d\n", words);
printf("TIME IN load: %.2f\n", time_load);
printf("TIME IN check: %.2f\n", time_check);
printf("TIME IN size: %.2f\n", time_size);
printf("TIME IN unload: %.2f\n", time_unload);
printf("TIME IN TOTAL: %.2f\n\n",
time_load + time_check + time_size + time_unload);

return 0;
}

/**
 * Computes the CPU time elapsed between two resource-usage snapshots.
 *
 * b is the earlier snapshot and a the later one. The result is the sum of
 * the user-time and system-time differences, expressed in seconds.
 * Returns 0.0 when either pointer is NULL.
 */
double calculate(const struct rusage* b, const struct rusage* a)
{
    // nothing to measure without both snapshots
    if (a == NULL || b == NULL)
    {
        return 0.0;
    }

    // user-mode CPU time difference, in microseconds
    const long long user_us =
        (a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
        (b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec);

    // system-mode CPU time difference, in microseconds
    const long long system_us =
        (a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
        (b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec);

    // convert the combined microsecond count to seconds
    return (user_us + system_us) / 1000000.0;
}

在输出中,列表里出现了一些不该出现的单词和符号,还有一些行是空的,我不知道原因:

nonproprietary
s
F
IS'
MERCHANTIBILITY
unenforceability




Q@
<
=
@

预先感谢您的帮助。

最佳答案

比较字符串通常是通过库函数 strcmp 完成的。无法通过 == 运算符比较字符串。

此外,不能用 = 运算符给字符串(字符数组)赋值。

uniq[misspellings][j] = word[j]; // it will not work

使用strcpy 复制字符串。

这是一个简单的程序来说明上述概念。可以帮助您了解您的问题。

#include <stdio.h>
#include <string.h>

#define LENGTH 30
#define ROWS 5

/*
 * Demonstrates comparing C strings with strcmp() and copying them with
 * strcpy(): every row of `uniq` equal to `word` is replaced with "hello",
 * then all rows are printed on one line separated by spaces.
 */
int main(void) {

    char word[LENGTH+1] = "word";
    char uniq[ROWS][LENGTH+1] = { "eva", "buba", "word" , "1235", "stop"};

    for (int j = 0; j < ROWS; j++) {

        if (strcmp(uniq[j], word) == 0) // find word
        {
            printf("We have found: <%s>\n", word);
            // replacing with "hello":
            strcpy(uniq[j], "hello");
        }
    }

    // the format string has no conversion specifier, so no argument is passed
    printf("Strings in uniq:\n");
    for (int j = 0; j < ROWS; j++) {
        printf("%s ", uniq[j]);
    }

    return 0;
}

输出:

We have found: <word>                                                                                                                         
Strings in uniq:
eva buba hello 1235 stop

关于c - 数组中唯一值的记录,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/49349035/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com