C++读取大数据，解析，然后写入数据-6ren

C++读取大数据，解析，然后写入数据

转载作者：塔克拉玛干更新时间：2023-11-03 07:58:08

我正在尝试读取一个大型数据集，按照我需要的方式对其进行格式化，然后将其写入另一个文件。我选择使用 C++ 而不是 SAS 或 STATA，以获得速度优势。数据文件通常在 10GB 左右。我当前的代码需要一个多小时才能运行(然后我将其终止，因为我确信我的代码有些地方效率很低)。

有没有更有效的方法来做到这一点？也许将文件读入内存，然后使用 switch 语句对其进行分析？ (我有 32gb ram linux 64bit)。是否有可能在循环中读取然后写入会减慢它的速度，因为它一直在读取，然后写入？我尝试从一个驱动器读取它，然后写入另一个驱动器以加快速度。

switch 语句的各个 case 分支会减慢速度吗？

我现在的进程使用 getline 读取数据，使用 switch 语句正确解析它，然后将它写入我的输出文件。并重复 3 亿行。 switch 语句中还有大约 10 种情况，但为了简洁起见，我没有复制。

代码可能非常难看，都在 main 函数中，但我想先让它能正常工作，再去考虑代码是否美观。

我试过使用 read() 但没有成功。如果我需要澄清任何事情，请告诉我。

谢谢你的帮助!

 #include <iostream>
 #include <fstream>
 #include <string>
 #include <sstream>
 #include <stdio.h>
 //#include <cstring>
 //#include <boost/algorithm/string.hpp>

 #include <vector>

  using namespace std;
 //using namespace boost;


 struct dataline
{
char type[0];
double second;
short mill;
char event[1];
char ticker[6];
char marketCategory[1];
char financialStatus[1];
int roundLotSize;
short roundLotOnly;
char tradingState[1];
char reserved[1];
char reason[4];
char mpid[4];
char primaryMarketMaker[1];
char primaryMarketMode[1];
char marketParticipantState[1];
unsigned long orderNumber;
char buySell[0];
double shares;
float price;
int executedShares;
double matchNumber;
char printable[1];
double executionPrice;
int canceledShares;
double sharesBig;
double crossPrice;
char crossType[0];
double pairedShares;
double imbalanceShares;
char imbalanceDirection[1];
double fairPrice;
double nearPrice;
double currentReferencePrice;
char priceVariationIndicator[1];
};

  int main () 
{
string a; 
string b;
string c;
string d;
string e;
string f;
string g;
string h;
string k;
string l;
string times;
string smalltimes;

short time;     //counter to keep second filled
short smalltime;    //counter to keep millisecond filled
double N;
double NN;
double NNN;
int length;
char M; 
//vector<> fout;
string line;

ofstream fout ("/media/3tb/test.txt");
ifstream myfile;
myfile.open("S050508-v3.txt");

dataline oneline;

if (myfile.is_open())
    {
    while ( myfile.good() )
        {
        getline (myfile,line);
//      cout << line<<endl;;

        a=line.substr(0,1);
        stringstream ss(a);
        char type;
        ss>>type;


        switch (type)
            { 
            case 'T':
                {
                if (type == 'T')
                    {
                    times=line.substr(1,5);
                    stringstream s(times);
                    s>>time;
                    //oneline.second=time;
                    //oneline.second;
                    //cout<<time<<endl;
                    }
                else
                    {
                    time=time;
                    }
                break;
                }
            case 'M':
                {
                if (type == 'M')
                    {
                    smalltimes=line.substr(1,3);
                    stringstream ss(smalltimes);
                    ss>>smalltime;      //oneline.mill;
                //  cout<<smalltime<<endl;                            //smalltime=oneline.mill;
                    }
                else
                    {
                    smalltime=smalltime;
                    }
                break;
                }


            case 'R':
                {
                oneline.second=time;
                oneline.mill=smalltime;

                a=line.substr(0,1);
                stringstream ss(a);
                ss>>oneline.type;

                b=line.substr(1,6);
                stringstream sss(b);
                sss>>oneline.ticker;

                c=line.substr(7,1);
                stringstream ssss(c);
                ssss>>oneline.marketCategory;

                d=line.substr(8,1);
                stringstream sssss(d);
                sssss>>oneline.financialStatus;

                e=line.substr(9,6);
                stringstream ssssss(e);
                ssssss>>oneline.roundLotSize;

                f=line.substr(15,1);
                stringstream sssssss(f);
                sssssss>>oneline.roundLotOnly;

                *oneline.tradingState=0;
                *oneline.reserved=0;
                *oneline.reason=0;
                *oneline.mpid=0;
                *oneline.primaryMarketMaker=0;
                *oneline.primaryMarketMode=0;
                *oneline.marketParticipantState=0;
                oneline.orderNumber=0;
                *oneline.buySell=0;
                oneline.shares=0;
                oneline.price=0;
                oneline.executedShares=0;
                oneline.matchNumber=0;
                *oneline.printable=0;
                oneline.executionPrice=0;
                oneline.canceledShares=0;
                oneline.sharesBig=0;
                oneline.crossPrice=0;
                *oneline.crossType=0;
                oneline.pairedShares=0;
                oneline.imbalanceShares=0;
                *oneline.imbalanceDirection=0;
                oneline.fairPrice=0;
                oneline.nearPrice=0;
                oneline.currentReferencePrice=0;
                *oneline.priceVariationIndicator=0;

                break;
                }//End Case 
            }//End Switch
            }//end While
    myfile.close();

     }//End If
else cout << "Unable to open file"; 
cout<<"Junk"<<endl;

return 0;
}

更新所以我一直在尝试使用内存映射，但现在我遇到了段错误。我一直在尝试按照不同的示例来拼凑出适合我的东西。为什么我会出现段错误？我采用了代码的第一部分，如下所示:

int main (int argc, char** path) 
 {
 long i;
 int fd;
 char *map;
 char *FILEPATH = path;
 unsigned long FILESIZE;
 FILE* fp = fopen(FILEPATH, "/home/brian/Desktop/S050508-v3.txt");
 fseek(fp, 0, SEEK_END);
 FILESIZE = ftell(fp);
 fseek(fp, 0, SEEK_SET);
 fclose(fp);
 fd = open(FILEPATH, O_RDONLY);

 map = (char *) mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);

 char z;
 stringstream ss;

 for (long i = 0; i <= FILESIZE; ++i) 
    {
    z = map[i];
    if (z != '\n') 
        {
        ss << z;
            }
    else 
        {
            // c style tokenizing
            ss.str("");
            }
        }
 if (munmap(map, FILESIZE) == -1) perror("Error un-mmapping the file");
 close(fd);

最佳答案

The data file are usually around 10gigabytes. ... Are the switch cases slowing it down?

几乎可以肯定不是，看起来你是受 I/O 限制。但是你应该考虑实际测量一下。现代 CPU 具有性能计数器，使用正确的工具可以很容易地利用这些计数器。但是让我们先把问题划分为几个主要领域:设备的 I/O、内存的加载/存储、CPU。您可以在代码中放置一些读取时钟的标记，以便了解每个操作需要多长时间。在 linux 上，您可以使用 clock_gettime() 或 rdtsc 指令来访问精度高于操作系统节拍的时钟。

考虑 mmap/CreateFileMapping，其中任何一个都可以为您正在访问的页面提供更好的效率/吞吐量。

如果流式传输大量已经分页的数据，请考虑使用大页面(large/huge pages)。

来自manual for mmap() :

Description

mmap() creates a new mapping in the virtual address space of the calling process. The starting address for the new mapping is specified in addr. The length argument specifies the length of the mapping.

这是一个 mmap() example :

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

#define FILEPATH "/tmp/mmapped.bin"
#define NUMINTS  (1000)
#define FILESIZE (NUMINTS * sizeof(int))

int main(int argc, char *argv[])
{
    int i;
    int fd;
    int *map;  /* mmapped array of int's */

    fd = open(FILEPATH, O_RDONLY);
    if (fd == -1) {
    perror("Error opening file for reading");
    exit(EXIT_FAILURE);
    }

    map = mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED) {
    close(fd);
    perror("Error mmapping the file");
    exit(EXIT_FAILURE);
    }

    /* Read the file int-by-int from the mmap
     */
    for (i = 1; i <=NUMINTS; ++i) {
    printf("%d: %d\n", i, map[i]);
    }

    if (munmap(map, FILESIZE) == -1) {
    perror("Error un-mmapping the file");
    }
    close(fd);
    return 0;
}

关于C++读取大数据，解析，然后写入数据，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/14784447/

文章推荐： c++ - QGLWidget 拒绝绘制任何东西

文章推荐： C++程序在OpenCV应用中不区分大小写字母

文章推荐： c++ - 使用 c/c++ 以编程方式从 html 文件中提取表格

php - 将 PHP 写入 Javascript 或将 Javascript 写入 PHP
我有这个代码 var myChart = new FusionCharts("../themes/clean/charts/hbullet.swf", "myChartId", "400", "75
Linux 异步 (io_submit) 写入 v/s 正常(缓冲)写入
既然写入是立即进行的(复制到内核缓冲区并返回)，那么使用 io_submit 进行写入有什么好处？事实上，它 (aio/io_submit) 看起来更糟，因为您必须在堆上分配写入缓冲区并且不能使用基
javascript - 当从网络服务器提供服务时，写入 .innerHTML 不起作用，但当作为文件浏览时，写入 .innerHTML 不起作用，这可能是什么原因造成的？
我正在使用 mootool 的 Request.JSON 从 Twitter 检索推文。收到它后，我将写入目标 div 的 .innerHTML 属性。当我在本地将其作为文件进行测试时，即 file:
python - 为什么从 Spark 写入 Vertica DB 比从 Spark 写入 MySQL 需要更长的时间？
最终，我想将 Vertica DB 中的数据抓取到 Spark 中，训练机器学习模型，进行预测，并将这些预测存储到另一个 Vertica DB 中。当前的问题是确定流程最后部分的瓶颈:将 Spark
java - 更改将 double 写入 CSV 的 Java 代码以将 double[] 写入 CSV(用例 = WEKA 库)
我使用 WEKA 库编写了一个 Java 程序，训练分类算法使用经过训练的算法对未标记的数据集运行预测将结果写入 .csv 文件问题在于它当前写出离散分类结果(即算法猜测一行属于哪个类别)。我
clickonce - 写入/读取数据目录是否需要管理员权限？
背景 - 我正在考虑使用 clickonce 通过 clickonce(通过网站)部署 WinForms 应用程序。相对简单的应用程序的要素是: - 它是一个可执行文件和一个数据库文件(sqlite)
arrays - 快速初始化C数组(写入)
是否有更好的解决方案来快速初始化 C 数组(在堆上创建)？就像我们使用大括号一样 double** matrix_multiply(const double **l_matrix, const dou
java - 写入 JSONArray
我正在读取 JSON 文件，取出值并进行一些更改。基本上我向数组添加了一些值。之后我想将其写回到文件中。当我将 JSONArray 写回文件时，会被写入字符串而不是 JSONArray 对象。怎样才
c# - 从页面文件读取/写入
我为两个应用程序使用嵌入式数据库，其中一个是服务器，另一个是客户端。客户端应用程序。可以向服务器端发送获取数据请求以检索数据并显示在表格(或其他)中。问题是这样的:如何将获取的数据保存(写入)到页面文
arrays - 快速初始化C数组(写入)
是否有更好的解决方案来快速初始化 C 数组(在堆上创建)？就像我们使用大括号一样 double** matrix_multiply(const double **l_matrix, const dou
java - 如何在Java中从内存中逐位读取/写入
从问题得出问题:找到所有 result = new ArrayList(); for (int i = 2; i >(i%8) & 0x1) == 0) { result.add(i
python - 写入 CSV
由于某种原因，它没有写入 CSV。谁能明白为什么它不写吗？ def main(): list_of_emails = read_email_csv() #read input file, cr
javascript - 写入\: on a URL
关闭。这个问题是 not reproducible or was caused by typos 。它目前不接受答案。这个问题是由于错别字或无法再重现的问题引起的。虽然类似的问题可能在这里出现，
c - 写入/读取二进制保存游戏
我目前正在开发一个保存和加载程序，但我无法获得正确的结果。编写程序: #include #include #define FILENAME "Save" #define COUNT 6 type
java - 从二进制文件读取/写入
import java.io.*; public class Main2 { public static void main(String[] args) throws Exception {
iphone - 写入 UITextView
我需要使用预定义位置字符串“Office”从所有日历中检索所有 iOS 事件，然后将结果写入 NSLog 和 UITextView。到目前为止，这是我的代码: #import "ViewCo
ios - 写入 PFInstallation
我正在尝试将 BOOL 值写入 PFInstallation 中的列，但会不停地崩溃: - (IBAction)pushSwitch:(id)sender { NSUserDefaults *push
c# - 写入 MySQL
我以前在学校学过一些简单的数据库编程，但现在我正在尝试学习最佳实践，因为我正在编写更复杂的应用程序。写入 MySQL 数据库并不难，但我想知道让分布式应用程序写入 Amazon EC2 上的远程数据库
Java 写入 ResourceBundle
是否可以写回到ResourceBundle？目前我正在使用 ResourceBundle 来存储信息，在运行时使用以下内容读取信息 while(ResourceBundle.getBundle("bu
c - 写入 - 读取二进制文件中的结构
关闭。这个问题是not reproducible or was caused by typos .它目前不接受答案。这个问题是由于错别字或无法再重现的问题引起的。虽然类似的问题可能是on-topi

塔克拉玛干

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

C++读取大数据，解析，然后写入数据