gpt4 book ai didi

node.js - Node 速度慢且对大数据文件无响应

转载 作者:搜寻专家 更新时间:2023-10-31 23:54:51 24 4
gpt4 key购买 nike

我编写了一个简单的 Node 程序来解析从公司 ERP 返回的 excel 格式的 HTML 表,提取数据并将其保存为 JSON。

这使用 FS 打开文件并使用 Cheerio 提取数据。

该程序对于小文件 (<10MB) 运行良好,但对于大文件 (>30MB) 需要花费很多时间

我遇到问题的数据文件大小为 38MB,大约有 300,000 行数据。

问题 1:这不应该更快吗?问题 2:我只能得到一条 console.log 语句的输出。我可以把这条语句放在任何位置,它都能正常输出;但如果添加多条,只有第一条会输出内容。

var fs = require('fs');                             // for file system streaming

function oracleParse(file, callback) {

var headers = []; // array to store the data table column headers
var myError; // module error holder
var XMLdata = []; // array to store the parsed XML data to be returned
var cheerio = require('cheerio');

// open relevant file
var reader = fs.readFile(file, function (err, data) {

if (err) {

myError = err; // catch errors returned from file open
} else {
$ = cheerio.load(data); // load data returned from fs into cheerio for parsing

// the data retruned from Oracle consists of a variable number of tables however the last one is
// always the one that contains the data. We can select this with cheerio and reset the cherrio $ object
var dataTable = $('table').last();
$ = cheerio.load(dataTable);

// table column headers in the table of data returned from Oracle include headers under 'tr td b' elements
// We extract these headers and load these into the 'headers' array for future use as keys in the JSON
// data array to be constucted
$('tr td b').each(function (i, elem) {
headers.push($(this).text());
});

// remove the headers from the cheerio data object so that they don't interfere with the data
$('tr td b').remove();

// for the actual data, each row of data (this corresponds to a customer, account, transation record etc) is
// extracted using cheerio and stored in a key/value object. These objects are then stored in an array
var dataElements = [];
var dataObj = {};
var headersLength = headers.length;
var headerNum;
// the actual data is returned from Oracle in 'tr td nobr' elements. Using cheerio, we can extract all of
// these elements although they are not separated into individual rows. It is possible to return individual
// rows using cheeris (e.g. 'tr') but this is very slow as cheerio needs to requery each subsequent row.
// In our case, we simply select all data elements using the 'tr td nobr' selector and then iterate through
// them, aligning them with the relevant key and grouping them into relevant rows by taking the modulus of
// the element number returned and the number of headers there are.
$('tr td nobr').each(function (i, elem) {

headerNum = i % headersLength; // pick which column is associated with each element

dataObj[headers[headerNum]] = $(this).text(); // build the row object

// if we find the header number is equal to the header length less one, we have reached the end of
// elements for the row and push the row object onto the array in which we store the final result
if (headerNum === headersLength - 1) {
XMLdata.push(dataObj);
dataObj = {};
}
});
console.log(headersLength);

// once all the data in the file has been parsed, run the call back function passed in
callback(JSON.stringify(XMLdata));
}
});

return myError;
}

// Driver script: parse the promo dates export and save the rows as JSON.
// `file` is the HTML export to read; `output` is the JSON file to write.
var file = './data/Oracle/signups_01.html';
var output = './data/Oracle/signups_01.JSON';
// Alternative data set:
//var file = './data/Oracle/detailed_data.html';
//var output = './data/Oracle/detailed_data.JSON';

// Completion handler for the JSON write: rethrows a write error, otherwise
// announces that the output file is ready.
function onWriteFinished(err) {
    if (err) {
        throw err;
    }
    console.log('File write complete: ' + output);
}

// Receives the serialized rows from oracleParse and persists them to `output`.
function saveParsedRows(data) {
    fs.writeFile(output, data, onWriteFinished);
}

var test = oracleParse(file, saveParsedRows);

// Prints the synchronous return value of oracleParse.
console.log(test);

最佳答案

您可能想看看流式解析方案,例如 substack 的 trumpet,或者(厚着脸皮自荐一下)cornet。否则,您将多次遍历整个文档,这总是需要一些时间。

我的猜测是 Chrome 会智能地推迟繁重的工作——您可能只关心前几行,所以这就是您得到的结果。尝试包含 jQuery 并运行您的代码,它仍然需要一些时间。公平地说,Chrome 的 DOM 不会被垃圾回收,因此其性能始终优于 cheerio。

关于node.js - Node 速度慢且对大数据文件无响应,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/23889297/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com