gpt4 book ai didi

c++ - 如何在 C++ 中快速安全地从文本文件中读取超长行?

转载 作者:可可西里 更新时间:2023-11-01 17:44:24 33 4
gpt4 key购买 nike

有一个 6.53 GiB 的大文本文件。它的每一行都可以是数据行或注释行。注释行通常很短,少于 80 个字符,而数据行包含超过 200 万个字符并且是可变长度的。

考虑到每个数据行都需要作为一个单元来处理,有没有一种简单的方法可以在 C++ 中安全快速地读取行?

安全(对可变长度的数据行安全):该解决方案应与 std::getline() 一样易于使用。由于行的长度不固定,希望避免额外的内存管理。

快速:该解决方案可以达到 Python 3.6.0 中 readline() 的速度,甚至与 stdio.h 中的 fgets() 一样快。

欢迎使用纯 C 解决方案。在 C 和 C++ 中都提供了用于进一步处理的接口(interface)。


更新 1:感谢来自 Basile Starynkevitch 的简短但宝贵的评论,完美的解决方案来了: POSIX getline() .由于进一步的处理只涉及从字符到数字的转换,并没有使用字符串类的许多特性,因此在这个应用程序中一个字符数组就足够了。


更新 2:感谢 Zulan 和 Galik 的评论,他们都报告了 std::getline()、fgets() 和 POSIX getline() 的性能相当。另一种可能的解决方案是使用更好的标准库实现,例如 libstdc++。此外,这里有一份 report 声称 Visual C++ 和 libc++ 对 std::getline 的实现没有得到很好的优化。

从 libc++ 换成 libstdc++ 后,结果变化很大。在另一个平台上使用 libstdc++ 3.4.13/Linux 2.6.32 时,POSIX getline()、std::getline() 和 fgets() 表现出相近的性能。最初,代码是在 Xcode 8.3.2 (8E2002) 的默认 clang 设置下运行的,因此使用的是 libc++。


更多细节和一些努力(很长):

<string> 的 getline() 可以处理任意长的行,但有点慢。C++ 中是否有可以替代 Python 中 readline() 的方案?

// benchmark on Mac OS X with libc++ and SSD:
readline() of python ~550 MiB/s

fgets() of stdio.h, -O0 / -O2 ~1100 MiB/s

getline() of string, -O0 ~27 MiB/s
getline() of string, -O2 ~150 MiB/s
getline() of string + stack buffer, -O2 ~150 MiB/s

getline() of ifstream, -O0 / -O2 ~240 MiB/s
read() of ifstream, -O2 ~340 MiB/s

wc -l ~670 MiB/s

cat data.txt | ./read-cin-unsync ~20 MiB/s

getline() of stdio.h (POSIX.1-2008), -O0 ~1300 MiB/s
  • 速度非常粗略地四舍五入,只是为了显示幅度,所有代码块都运行了几次以确保这些值具有代表性。

  • '-O0/-O2' 表示两种优化级别的速度非常相似

  • 代码如下所示。


readline() of python

# readline.py

import time
import os

# Benchmark: read 'data.txt' line by line with readline(), count the data
# lines (lines longer than 80 characters), and print the throughput.

# Start timing before the file is opened so the open() cost is included.
t_start = time.perf_counter()

fname = 'data.txt'
count = 0  # number of lines longer than 80 characters

# The context manager closes the file even if reading raises.
with open(fname, 'rt') as fin:
    while True:
        # readline() is called explicitly because it is the API being timed.
        line = fin.readline()
        if not line:  # an empty string signals EOF
            break
        if len(line) > 80:  # data line
            count += 1

t_end = time.perf_counter()
# Named `elapsed` rather than `time` so the imported `time` module is not
# shadowed by a float.
elapsed = t_end - t_start

fsize = os.path.getsize(fname)/1024/1024 # file size in MiB
print("speed: %d MiB/s" %(fsize/elapsed))
print("reads %d data lines" %count)

# run as `python readline.py` with python 3.6.0

fgets() of stdio.h

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

/*
 * Benchmark driver for fgets(): reads the file named by argv[1] line by
 * line, counts the lines longer than 80 characters, and prints the time
 * measured with clock() together with the resulting throughput.
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE on a usage error, when
 * the file cannot be opened, or when the line buffer cannot be allocated.
 */
int main(int argc, char* argv[]){
    clock_t t_start = clock();

    if(argc != 2) {
        fprintf(stderr, "needs one input argument\n");
        return EXIT_FAILURE;
    }

    FILE* fp = fopen(argv[1], "r");
    if(fp == NULL) {
        perror("Failed to open file");
        return EXIT_FAILURE;
    }

    // maximum length of lines, determined previously by python
    const int SIZE = 1024*1024*3;
    // Allocated on the heap: a 3 MiB automatic array can overflow platforms
    // whose default stack is smaller than that.
    char* line = (char*)malloc((size_t)SIZE);
    if(line == NULL) {
        perror("Failed to allocate line buffer");
        fclose(fp);
        return EXIT_FAILURE;
    }

    int count = 0;
    size_t line_len = 0; // length of the current line accumulated so far
    while(fgets(line, SIZE, fp) != NULL) {
        const size_t chunk = strlen(line);
        line_len += chunk;
        // fgets() filled the whole buffer without reaching '\n': the line
        // continues in the next chunk, so do not count it yet.
        if(chunk == (size_t)SIZE - 1 && line[chunk - 1] != '\n') {
            continue;
        }
        if(line_len > 80) {
            count += 1;
        }
        line_len = 0;
    }
    // A final over-long line that ended at EOF without '\n' is still pending.
    if(line_len > 80) {
        count += 1;
    }

    clock_t t_end = clock();

    free(line);
    fclose(fp);

    const double fsize = 6685; // file size in MiB

    double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

    fprintf(stdout, "takes %.2f s\n", time);
    fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
    fprintf(stdout, "reads %d data lines\n", count);

    return EXIT_SUCCESS;
}

getline() of <string>

// readline-string-getline.cpp
#include <string>
#include <fstream>
#include <iostream>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark driver for std::getline() on std::string: reads the file named
// by argv[1] through an ifstream that uses a caller-supplied stack buffer,
// counts the lines longer than 80 characters, and prints the time measured
// with clock() together with the resulting throughput.
int main(int argc, char* argv[]) {
    const clock_t started = clock();

    if (argc != 2) {
        fprintf(stderr, "needs one input argument\n");
        return EXIT_FAILURE;
    }

    // Give the stream a large buffer that lives on the stack; it has to be
    // installed before the file is opened.
    const int BUFFERSIZE = 1024*1024*3; // stack on my platform is 8 MiB
    char stream_buffer[BUFFERSIZE];
    std::ifstream input;
    input.rdbuf()->pubsetbuf(stream_buffer, BUFFERSIZE);
    input.open(argv[1]);

    // To benchmark the default buffer instead, construct the stream
    // directly from argv[1] and drop the pubsetbuf() call.

    if (input.fail()) {
        perror("Failed to open file");
        return EXIT_FAILURE;
    }

    // Pre-size the string to the longest line length (found earlier with
    // python).
    const int LONGEST_LINE = 1024*1024*3;
    std::string current;
    current.reserve(LONGEST_LINE);

    int data_lines = 0;
    for (;;) {
        std::getline(input, current);
        if (input.fail()) {
            break;
        }
        if (current.size() > 80) {
            ++data_lines;
        }
    }

    const clock_t finished = clock();

    const double size_mib = 6685; // file size in MiB
    const double elapsed = (finished - started) / static_cast<double>(CLOCKS_PER_SEC);

    fprintf(stdout, "takes %.2f s\n", elapsed);
    fprintf(stdout, "speed: %d MiB/s\n", static_cast<int>(size_mib / elapsed));
    fprintf(stdout, "reads %d data lines\n", data_lines);

    return EXIT_SUCCESS;
}

getline() of ifstream

// readline-ifstream-getline.cpp
#include <fstream>
#include <iostream>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark driver for istream::getline(): reads the file named by argv[1]
// line by line into a fixed buffer, counts the lines longer than 80
// characters, and prints the time measured with clock() together with the
// resulting throughput.
//
// Returns EXIT_SUCCESS on success and EXIT_FAILURE on a usage error, when
// the file cannot be opened, or when reading stops before end of file.
int main(int argc, char* argv[]) {
    clock_t t_start = clock();

    if(argc != 2) {
        fprintf(stderr, "needs one input argument\n");
        return EXIT_FAILURE;
    }

    std::ifstream fin(argv[1]);
    if(!fin) {
        perror("Failed to open file");
        return EXIT_FAILURE;
    }

    // maximum length of lines, determined previously by python
    const int SIZE = 1024*1024*3;
    // Static storage: a 3 MiB automatic array can overflow platforms whose
    // default stack is smaller than that.
    static char line[SIZE];

    int count = 0;
    while(fin.getline(line, SIZE)) {
        // gcount() includes the extracted '\n'. When the line ended at EOF
        // instead, no delimiter was extracted and eofbit is set. This gives
        // the line length without rescanning the buffer.
        const std::streamsize length = fin.gcount() - (fin.eof() ? 0 : 1);
        if(length > 80) {
            count += 1;
        }
    }

    // The loop normally ends with eofbit set. Failing without it means a
    // line did not fit into the buffer or the read itself failed, so report
    // it instead of printing a partial count.
    if(!fin.eof()) {
        fprintf(stderr, "stopped before end of file: line longer than %d bytes or read error\n", SIZE - 1);
        return EXIT_FAILURE;
    }

    clock_t t_end = clock();

    const double fsize = 6685; // file size in MiB

    double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

    fprintf(stdout, "takes %.2f s\n", time);
    fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
    fprintf(stdout, "reads %d data lines\n", count);

    return EXIT_SUCCESS;
}

read() of ifstream

// seq-read-bin.cpp
// sequentially read the file to see the speed upper bound of
// ifstream

#include <iostream>
#include <fstream>
#include <ctime>

using namespace std;


// Benchmark driver for istream::read(): reads the file named by argv[1]
// sequentially in 3 MiB binary chunks and prints the time measured with
// clock() together with the resulting throughput.
//
// Returns EXIT_SUCCESS on success and EXIT_FAILURE on a usage error or when
// the file cannot be opened.
int main(int argc, char* argv[]) {
    clock_t t_start = clock();

    if(argc != 2) {
        fprintf(stderr, "needs one input argument\n");
        return EXIT_FAILURE;
    }

    std::ifstream fin(argv[1], std::ios::binary);
    // Without this check a missing file skips the loop and reports a
    // meaningless speed.
    if(!fin) {
        perror("Failed to open file");
        return EXIT_FAILURE;
    }

    const int SIZE = 1024*1024*3;
    // Static storage: a 3 MiB automatic array can overflow platforms whose
    // default stack is smaller than that.
    static char str[SIZE];

    // read() sets failbit on the final short read, which ends the loop.
    while(fin) {
        fin.read(str,SIZE);
    }

    clock_t t_end = clock();
    double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

    const double fsize = 6685; // file size in MiB

    fprintf(stdout, "takes %.2f s\n", time);
    fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));

    return EXIT_SUCCESS;
}

use cat, then read from cin with cin.sync_with_stdio(false)

#include <iostream>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark driver: reads standard input line by line with std::getline()
// after turning off synchronisation with C stdio, discards every line, and
// prints the time measured with clock() together with the resulting
// throughput.
int main(void) {
    const clock_t started = clock();

    // Must be switched off before the first read from std::cin.
    std::ios_base::sync_with_stdio(false);

    std::string current;
    while (std::getline(std::cin, current)) {
        // Lines are read and dropped; only the reading itself is timed.
    }

    const double elapsed = static_cast<double>(clock() - started) / CLOCKS_PER_SEC;

    const double size_mib = 6685; // file size in MiB

    fprintf(stdout, "takes %.2f s\n", elapsed);
    fprintf(stdout, "speed: %d MiB/s\n", static_cast<int>(size_mib / elapsed));

    return EXIT_SUCCESS;
}

POSIX getline()

// readline-c-getline.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Benchmark driver for POSIX getline(): reads the file named by argv[1]
 * line by line, counts the lines longer than 80 characters, and prints the
 * time measured with clock() together with the resulting throughput.
 *
 * Exits with EXIT_SUCCESS on success and EXIT_FAILURE on a usage error or
 * when the file cannot be opened.
 */
int main(int argc, char *argv[]) {

    clock_t t_start = clock();

    char *line = NULL;   /* buffer owned and grown by getline() */
    size_t len = 0;      /* current capacity of `line` */
    ssize_t nread;

    if (argc != 2) {
        /* Print the program name, not argv[1], which is NULL when no
         * argument is given. */
        fprintf(stderr, "Usage: %s <file>\n",
                argc > 0 ? argv[0] : "readline-c-getline");
        exit(EXIT_FAILURE);
    }

    FILE *stream = fopen(argv[1], "r");
    if (stream == NULL) {
        perror("fopen");
        exit(EXIT_FAILURE);
    }

    int count = 0;
    /* getline() returns the number of characters read, including the
     * newline, or -1 on EOF or error. */
    while ((nread = getline(&line, &len, stream)) != -1) {
        if (nread > 80) {
            count += 1;
        }
    }

    free(line);
    fclose(stream);

    double time = (clock() - t_start) / (double)CLOCKS_PER_SEC;
    const double fsize = 6685; // file size in MiB
    fprintf(stdout, "takes %.2f s\n", time);
    fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
    fprintf(stdout, "reads %d data lines.\n", count);

    exit(EXIT_SUCCESS);
}

最佳答案

嗯,C 标准库是 C++ 标准库的一个子集。来自 C++ 2014 标准的 n4296 草案:

17.2 The C standard library [library.c]

The C++ standard library also makes available the facilities of the C standard library, suitably adjusted to ensure static type safety.

因此,如果您在评论中解释性能瓶颈需要它,那么在 C++ 程序中使用 fgets 是完全没问题的 - 只是您应该小心地将它封装在实用程序类中,以便保留面向对象的高级结构。

关于c++ - 如何在 C++ 中快速安全地从文本文件中读取超长行?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/44241289/

33 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com