gpt4 book ai didi

c - 使用 c 检测文件中的重复行

转载 作者:太空宇宙 更新时间:2023-11-04 07:34:32 25 4
gpt4 key购买 nike

我有一个 csv 文件,大约有 15000 到 25000 行(每行长度固定),我想知道如何用 C 语言检测出其中重复的行。

输出示例如下:

0123456789;CUST098WZAX;35

我没有内存或时间限制,所以我想要最简单的解决方案。

感谢您的帮助。

最佳答案

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One remembered input line, stored as a node in a singly linked bucket chain. */
struct somehash {
struct somehash *next; /* following node in the same bucket, or NULL at the end of the chain */
unsigned hash; /* full some_hash() value of mem, compared before falling back to strcmp() */
char *mem; /* malloc'ed, NUL-terminated copy of the line */
};

/* Number of buckets in the hash table. */
#define THE_SIZE 100000
/* Bucket heads; a line lands in bucket (hash % THE_SIZE). All entries start out NULL. */
struct somehash *table[THE_SIZE] = { NULL,};

struct somehash **some_find(char *str, unsigned len);
static unsigned some_hash(char *str, unsigned len);

/*
 * Filter repeated lines out of stdin.
 *
 * The first occurrence of each line is echoed to stdout and remembered in the
 * hash table; every later occurrence is reported on stderr instead. Input is
 * read in chunks of at most sizeof buffer - 1 characters, so a longer line is
 * handled as several independent pieces.
 *
 * Returns 0 on success, or EXIT_FAILURE when memory for a new entry cannot
 * be allocated.
 */
int main (void)
{
    char buffer[100];

    while (fgets(buffer, sizeof buffer, stdin)) {
        size_t len = strlen(buffer);
        struct somehash **pp = some_find(buffer, len);
        struct somehash *node;

        if (*pp) { /* found */
            fprintf(stderr, "Duplicate:%s\n", buffer);
            continue;
        }

        /* not found: echo the line, then remember it */
        fprintf(stdout, "%s", buffer);

        node = malloc(sizeof *node);
        if (node == NULL) {
            fprintf(stderr, "out of memory\n");
            return EXIT_FAILURE;
        }
        node->mem = malloc(len + 1);
        if (node->mem == NULL) {
            fprintf(stderr, "out of memory\n");
            free(node);
            return EXIT_FAILURE;
        }
        memcpy(node->mem, buffer, len + 1); /* +1 copies the terminating NUL too */
        node->next = NULL;
        node->hash = some_hash(buffer, len);

        /* Publish the node only once it is fully built, so the table never
         * holds a half-initialised entry. */
        *pp = node;
    }
    return 0;
}
/*
 * Search the global table for str.
 *
 * str is hashed with some_hash(str, len) and the bucket (hash % THE_SIZE) is
 * walked. A node matches when its stored hash equals the computed one and its
 * text compares equal under strcmp().
 *
 * Returns the address of the link that points at the matching node. When no
 * node matches, the returned link is the NULL pointer terminating the chain,
 * which is exactly where a caller can store a new node.
 */
struct somehash **some_find(char *str, unsigned len)
{
    unsigned wanted = some_hash(str, len);
    struct somehash **link = &table[wanted % THE_SIZE];

    while (*link != NULL) {
        struct somehash *node = *link;

        /* Cheap integer comparison first; strcmp() only on a hash match. */
        if (node->hash == wanted && strcmp(node->mem, str) == 0) {
            break;
        }
        link = &node->next;
    }
    return link;
}

/*
 * Compute a shift/xor hash over the first len characters of str.
 *
 * A len of 0 makes the function measure the string itself with strlen().
 * Each character is combined as a plain char, so it undergoes the usual
 * integer promotion before being xor-ed in.
 *
 * Returns the accumulated hash value (0 for an empty string).
 */
static unsigned some_hash(char *str, unsigned len)
{
    unsigned acc = 0;
    const char *cur;
    const char *end;

    if (len == 0) {
        len = strlen(str);
    }

    end = str + len;
    for (cur = str; cur != end; cur++) {
        /* Stir the running value with three shifted copies of itself. */
        unsigned mix = (acc >> 2) ^ (acc << 5) ^ (acc << 13);

        acc ^= mix ^ *cur ^ 0x80001801;
    }
    return acc;
}

关于c - 使用 c 检测文件中的重复行,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/10189594/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com