gpt4 book ai didi

c++ - 从 C++ 中的文本文件中读取数百万行分隔的整数的最有效方法是什么

转载 作者:塔克拉玛干 更新时间:2023-11-03 01:07:32 24 4
gpt4 key购买 nike

我的文本文件中有大约 2500 万个由行分隔的整数。我的第一个任务是获取这些整数并对它们进行排序。我实际上已经实现了读取整数并将它们放入数组中(因为我的排序函数将未排序的数组作为参数)。然而,从文件中读取整数是一个非常漫长且昂贵的过程。我已经搜索了许多其他解决方案以获得更便宜和有效的方法来执行此操作,但我无法找到能够处理如此大小的解决方案。因此,我想请教:从巨大的(大约 260MB)文本文件中读取整数,您有什么建议?另外,如何高效地获取该文件的行数?

// Open the input file for formatted (text) extraction.
ifstream myFile("input.txt");

int currentNumber;
// Element count the destination array is sized for.
int nItems = 25000000;
int *arr = (int*) malloc(nItems*sizeof(*arr));
int i = 0;
// Extract one integer per iteration; the loop ends when extraction fails
// (end of file or a token that is not an integer).
// NOTE(review): i is never compared against nItems, so an input with more
// values than nItems would write past the end of arr - confirm the input size.
while (myFile >> currentNumber)
{
arr[i++] = currentNumber;
}

这就是我从文本文件中获取整数的方法。没那么复杂。我假设行数是固定的(实际上是固定的)

顺便说一句,当然不会太慢。在配备 2.2GHz i7 处理器的 OS X 中,它在大约 9 秒内完成读取。但我觉得它可能会好得多。

最佳答案

最有可能的是,对此进行的任何优化都可能收效甚微。在我的机器上,读取大文件的限制因素是磁盘传输速度。是的,提高读取速度可以提高一点点,但很可能,你不会从中得到太多。

我在之前的测试中发现 [我会看看能否找到那个答案——我在我的“SO 实验代码”目录中找不到源代码],最快的方法是使用 mmap 加载文件。但它只比使用 ifstream 快一点点。

编辑:我自制的以几种不同方式读取文件的基准。 getline while reading a file vs reading whole file and then splitting based on newline character

按照惯例,基准测试衡量的是基准衡量的内容,对环境或代码编写方式的微小改变有时会产生很大的不同。

编辑:以下是“从文件中读取一个数字并将其存储在 vector 中”的几个实现:

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <fcntl.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>


using namespace std;

// Path of the input file that each funcN benchmark in this file opens.
const char *file_name = "lots_of_numbers.txt";

void func1()
{
vector<int> v;
int num;
ifstream fin(file_name);
while( fin >> num )
{
v.push_back(num);
}
cout << "Number of values read " << v.size() << endl;
}


void func2()
{
vector<int> v;
v.reserve(42336000);
int num;

ifstream fin(file_name);
while( fin >> num )
{
v.push_back(num);
}
cout << "Number of values read " << v.size() << endl;
}

void func3()
{
int *v = new int[42336000];
int num;

ifstream fin(file_name);
int i = 0;
while( fin >> num )
{
v[i++] = num;
}
cout << "Number of values read " << i << endl;
delete [] v;
}


// Benchmark variant 4: read integers with fscanf("%d") into a fixed-size
// heap array. Prints the number of values read.
//
// If the file cannot be opened, the reason is reported with perror() and the
// function returns without reading (previously a NULL FILE* was handed to
// fscanf). Values beyond `capacity` are counted but not stored, so the array
// is never overrun.
void func4()
{
    FILE *f = fopen(file_name, "r");
    if (f == NULL)
    {
        perror(file_name);
        return;
    }

    const int capacity = 42336000;
    int *v = new int[capacity];
    int num;
    int i = 0;
    while (fscanf(f, "%d", &num) == 1)
    {
        if (i < capacity)   // never write past the array
            v[i] = num;
        ++i;
    }
    std::cout << "Number of values read " << i << std::endl;
    fclose(f);
    delete [] v;
}

void func5()
{
int *v = new int[42336000];
int num = 0;

ifstream fin(file_name);
char buffer[8192];
int i = 0;
int bytes = 0;
char *p;
int hasnum = 0;
int eof = 0;
while(!eof)
{
fin.read(buffer, sizeof(buffer));
p = buffer;
bytes = 8192;
while(bytes > 0)
{
if (*p == 26) // End of file marker...
{
eof = 1;
break;
}
if (*p == '\n' || *p == ' ')
{
if (hasnum)
v[i++] = num;
num = 0;
p++;
bytes--;
hasnum = 0;
}
else if (*p >= '0' && *p <= '9')
{
hasnum = 1;
num *= 10;
num += *p-'0';
p++;
bytes--;
}
else
{
cout << "Error..." << endl;
exit(1);
}
}
memset(buffer, 26, sizeof(buffer)); // To detect end of files.
}
cout << "Number of values read " << i << endl;
delete [] v;
}

// Benchmark variant 6: read the file in 8192-byte chunks with fread and parse
// unsigned decimal integers by hand. Values are separated by '\n' or ' ';
// any other character prints "Error..." and exits with status 1.
// Prints the number of values read.
//
// The loop is driven by fread's return value, so only bytes that were really
// read are parsed. The earlier version ignored that count and scanned the
// whole buffer, which on a short first read contained uninitialised bytes.
// A failed fopen is reported with perror() instead of being dereferenced, a
// number ending exactly at end of file is counted, and values beyond
// `capacity` are counted but not stored.
void func6()
{
    FILE *f = fopen(file_name, "r");
    if (f == NULL)
    {
        perror(file_name);
        return;
    }

    const int capacity = 42336000;
    int *v = new int[capacity];
    int num = 0;
    char buffer[8192];
    int i = 0;
    int hasnum = 0;
    size_t bytes;
    while ((bytes = fread(buffer, 1, sizeof(buffer), f)) > 0)
    {
        const char *const end = buffer + bytes;
        for (const char *p = buffer; p != end; ++p)
        {
            const char c = *p;
            if (c >= '0' && c <= '9')
            {
                hasnum = 1;
                num = num * 10 + (c - '0');
            }
            else if (c == '\n' || c == ' ')
            {
                if (hasnum)
                {
                    if (i < capacity)   // never write past the array
                        v[i] = num;
                    ++i;
                }
                num = 0;
                hasnum = 0;
            }
            else
            {
                std::cout << "Error..." << std::endl;
                exit(1);
            }
        }
    }
    // Flush a final number that was not followed by a delimiter.
    if (hasnum)
    {
        if (i < capacity)
            v[i] = num;
        ++i;
    }
    fclose(f);
    std::cout << "Number of values read " << i << std::endl;
    delete [] v;
}


// Benchmark variant 7: read the file one character at a time with fgetc and
// parse unsigned decimal integers by hand. Values are separated by '\n' or
// ' '; any other character prints "Error..." and exits with status 1.
// Prints the number of values read.
//
// A failed fopen is reported with perror() instead of passing NULL to fgetc.
// A number that ends exactly at end of file (no trailing delimiter) is
// counted, and values beyond `capacity` are counted but not stored.
void func7()
{
    FILE *f = fopen(file_name, "r");
    if (f == NULL)
    {
        perror(file_name);
        return;
    }

    const int capacity = 42336000;
    int *v = new int[capacity];
    int num = 0;
    int i = 0;
    int hasnum = 0;
    int ch;
    while ((ch = fgetc(f)) != EOF)
    {
        if (ch >= '0' && ch <= '9')
        {
            hasnum = 1;
            num = num * 10 + (ch - '0');
        }
        else if (ch == '\n' || ch == ' ')
        {
            if (hasnum)
            {
                if (i < capacity)   // never write past the array
                    v[i] = num;
                ++i;
            }
            num = 0;
            hasnum = 0;
        }
        else
        {
            std::cout << "Error..." << std::endl;
            exit(1);
        }
    }
    // Flush a final number that was not followed by a delimiter.
    if (hasnum)
    {
        if (i < capacity)
            v[i] = num;
        ++i;
    }
    fclose(f);
    std::cout << "Number of values read " << i << std::endl;
    delete [] v;
}


// Benchmark variant 8: map the whole file read-only with mmap and parse
// unsigned decimal integers by hand directly from the mapping. Values are
// separated by '\n' or ' '; any other character prints "Error..." and exits
// with status 1. Prints the number of values read.
//
// Failures of open, lseek and mmap are reported with perror() instead of
// being used blindly (mmap signals failure with MAP_FAILED, not NULL). The
// byte range is tracked with pointers rather than an int, so files of 2 GiB
// or more are not truncated. An empty file is not mapped at all, because
// zero-length mappings are rejected. A number that ends exactly at end of
// file is counted, and values beyond `capacity` are counted but not stored.
void func8()
{
    int f = open(file_name, O_RDONLY);
    if (f < 0)
    {
        perror(file_name);
        return;
    }

    const off_t size = lseek(f, 0, SEEK_END);
    if (size < 0)
    {
        perror("lseek");
        close(f);
        return;
    }

    const int capacity = 42336000;
    int *v = new int[capacity];
    int num = 0;
    int i = 0;
    int hasnum = 0;

    if (size > 0)
    {
        const size_t length = static_cast<size_t>(size);
        void *mapping = mmap(NULL, length, PROT_READ, MAP_PRIVATE, f, 0);
        if (mapping == MAP_FAILED)
        {
            perror("mmap");
            close(f);
            delete [] v;
            return;
        }

        const char *p = static_cast<const char *>(mapping);
        const char *const end = p + length;
        for (; p != end; ++p)
        {
            const char c = *p;
            if (c >= '0' && c <= '9')
            {
                hasnum = 1;
                num = num * 10 + (c - '0');
            }
            else if (c == '\n' || c == ' ')
            {
                if (hasnum)
                {
                    if (i < capacity)   // never write past the array
                        v[i] = num;
                    ++i;
                }
                num = 0;
                hasnum = 0;
            }
            else
            {
                std::cout << "Error..." << std::endl;
                exit(1);
            }
        }
        // Flush a final number that was not followed by a delimiter.
        if (hasnum)
        {
            if (i < capacity)
                v[i] = num;
            ++i;
        }
        munmap(mapping, length);
    }

    close(f);
    std::cout << "Number of values read " << i << std::endl;
    delete [] v;
}






// One benchmark entry: a function to run and the label printed with its timing.
struct bm
{
void (*f)();        // function to time; takes no arguments and returns nothing
const char *name;   // text printed in front of the measured time
};

// Expands to a brace initializer holding the function `f` and its identifier
// turned into a string literal by the # operator, e.g. BM(func1) -> { func1, "func1" }.
#define BM(f) { f, #f }

// Table of benchmark entries; main() walks it first to last and then last to first.
bm b[] =
{
BM(func1),
BM(func2),
BM(func3),
BM(func4),
BM(func5),
BM(func6),
BM(func7),
BM(func8),
};


// Converts a timeval (whole seconds plus microseconds) into one floating-point
// value expressed in milliseconds.
double time_to_double(timeval *t)
{
    const double seconds = t->tv_sec + (t->tv_usec / 1000000.0);
    return seconds * 1000.0;
}

// Returns the time elapsed from *t1 to *t2 in milliseconds (positive when t2
// is the later timestamp).
double time_diff(timeval *t1, timeval *t2)
{
    const double start_ms = time_to_double(t1);
    const double end_ms = time_to_double(t2);
    return end_ms - start_ms;
}



// Runs one benchmark function between two gettimeofday() samples and prints
// "<label>: <elapsed>ms".
static void run_and_report(void (*fn)(), const char *label)
{
    timeval started, finished;
    gettimeofday(&started, NULL);
    fn();
    gettimeofday(&finished, NULL);
    std::cout << label << ": " << time_diff(&started, &finished) << "ms" << std::endl;
}

// Runs every entry of the benchmark table in order, then once more in
// reverse order, printing the timing of each run.
int main()
{
    const int count = sizeof(b) / sizeof(b[0]);

    // Forward pass: first entry to last.
    for (int k = 0; k != count; ++k)
    {
        run_and_report(b[k].f, b[k].name);
    }
    // Backward pass: last entry to first.
    for (int k = count; k-- > 0; )
    {
        run_and_report(b[k].f, b[k].name);
    }
}

结果(连续运行两遍,一遍正序、一遍逆序,以避免文件缓存对结果的影响):

Number of values read 42336000
func1: 6068.53ms
Number of values read 42336000
func2: 6421.47ms
Number of values read 42336000
func3: 5756.63ms
Number of values read 42336000
func4: 6947.56ms
Number of values read 42336000
func5: 941.081ms
Number of values read 42336000
func6: 962.831ms
Number of values read 42336000
func7: 2572.4ms
Number of values read 42336000
func8: 816.59ms
Number of values read 42336000
func8: 815.528ms
Number of values read 42336000
func7: 2578.6ms
Number of values read 42336000
func6: 948.185ms
Number of values read 42336000
func5: 932.139ms
Number of values read 42336000
func4: 6988.8ms
Number of values read 42336000
func3: 5750.03ms
Number of values read 42336000
func2: 6380.36ms
Number of values read 42336000
func1: 6050.45ms

总而言之,正如有人在评论中指出的那样,整数的实际解析占了整个时间的相当大一部分,因此读取文件并不像我最初认为的那么重要。即使是一种非常简单的读取文件的方式(使用 fgetc())也胜过使用 ifstream operator>> 读取整数。

可以看出,使用mmap 加载文件比通过fstream 读取文件稍微快一些,但也只是稍微快一点。

关于c++ - 从 C++ 中的文本文件中读取数百万行分隔的整数的最有效方法是什么,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/15115943/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com