gpt4 book ai didi

javascript - Node.js GET 请求 ETIMEDOUT & ESOCKETTIMEDOUT

转载 作者:IT老高 更新时间:2023-10-28 23:07:54 30 4
gpt4 key购买 nike

我正在使用 Node.js 的 async 和 request 模块抓取 1 亿多个网站,但运行几分钟后就不断遇到 ESOCKETTIMEDOUT 和 ETIMEDOUT 错误。

我重新启动脚本后它再次工作。这似乎不是连接限制问题,因为我仍然可以毫无延迟地执行 resolve4、resolveNs、resolveMx 和 curl

您是否发现代码有任何问题?或任何建议?我想将 async.queue() 的并发量提高到至少 1000。谢谢。

var request = require('request'),
async = require('async'),
mysql = require('mysql'),
dns = require('dns'),
url = require('url'),
cheerio = require('cheerio'),
iconv = require('iconv-lite'),
charset = require('charset'),
config = require('./spy.config'),
pool = mysql.createPool(config.db);

// Set iconv-lite's skipDecodeWarning flag (presumably silences the warning it
// prints when asked to decode non-Buffer input — confirm against the iconv-lite docs).
iconv.skipDecodeWarning = true;

// Crawl work queue. Each task is an object carrying a `domain` property; at
// most 200 tasks are processed concurrently. A task resolves the IPv4 address
// of "www.<domain>", fetches the home page over HTTP and, when the fetch
// fails, looks up the domain's NS/A/MX records before reporting completion.
var queue = async.queue(function (task, cb) {
    // Report completion to the queue on a later event-loop turn.
    function finish() {
        setImmediate(function () {
            cb();
        });
    }

    dns.resolve4('www.' + task.domain, function (err, addresses) {
        if (err) {
            //
            // Do something
            //
            finish();
            return;
        }

        request({
            url: 'http://www.' + task.domain,
            method: 'GET',
            encoding: 'binary',
            followRedirect: true,
            // Exactly one `pool` entry: an object literal with two `pool` keys
            // keeps only the last one, so a preceding `pool: false` has no effect.
            pool: { maxSockets: 1000 },
            timeout: 15000 // 15 sec
        }, function (error, response, body) {

            //console.info(task);

            if (!error) {
                // If ok, do something

                finish();
                return;
            }

            // If not ok, do these

            console.log(error);

            // It keeps erroring here after few minutes, resolve4, resolveNs, resolveMx still work here.

            // { [Error: ETIMEDOUT] code: 'ETIMEDOUT' }
            // { [Error: ESOCKETTIMEDOUT] code: 'ESOCKETTIMEDOUT' }

            var ns = [],
                ip = [],
                mx = [];
            async.parallel([
                function (callback) {
                    // Resolves the domain's name server records
                    dns.resolveNs(task.domain, function (err, addresses) {
                        if (!err) {
                            ns = addresses;
                        }
                        callback();
                    });
                }, function (callback) {
                    // Resolves the domain's IPV4 addresses
                    dns.resolve4(task.domain, function (err, addresses) {
                        if (!err) {
                            ip = addresses;
                        }
                        callback();
                    });
                }, function (callback) {
                    // Resolves the domain's MX records
                    dns.resolveMx(task.domain, function (err, addresses) {
                        if (!err) {
                            addresses.forEach(function (a) {
                                mx.push(a.exchange);
                            });
                        }
                        callback();
                    });
                }
            ], function (err) {
                // The task is only finished once the diagnostic lookups are done,
                // so the queue's concurrency limit also bounds these DNS queries.
                if (err) {
                    // No `next` exists in this scope; log the failure instead of
                    // calling an undefined function.
                    console.log(err);
                    finish();
                    return;
                }

                // do something

                finish();
            });
        });
    });
}, 200);

// Drain hook: once the queue reports it has been emptied, schedule checkDone
// on a later event-loop turn to decide whether another batch should be loaded.
queue.drain = function () {
    setImmediate(checkDone);
};
/**
 * Logging hook that currently does nothing: its console.info call is
 * commented out, so every message is discarded.
 *
 * @param {*} msg - message that would be logged if the call were enabled
 * @returns {undefined}
 */
function consoleLog(msg) {
    // console.info(msg);
}
/**
 * Decides what to do after the queue drains: when no tasks are waiting,
 * schedules crawlQueue on a later event-loop turn; otherwise only logs a note.
 */
function checkDone() {
    var waiting = queue.length();

    if (waiting != 0) {
        console.log("checkDone() not zero");
        return;
    }

    setImmediate(function () {
        crawlQueue();
    });
}

/**
 * Fire-and-forget SQL helper: borrows a connection from the pool, runs the
 * statement and hands the connection back. Connection and query errors are
 * ignored and the results are discarded.
 *
 * @param {string} sql - statement passed to connection.query
 */
function query(sql) {
    pool.getConnection(function (connectErr, conn) {
        if (connectErr) {
            return;
        }
        //console.log(sql);
        conn.query(sql, function () {
            // Release regardless of the query outcome.
            conn.release();
        });
    });
}

/**
 * Loads the next batch of up to 500 rows from the `domain` table whose
 * last_update is more than 2592000 seconds (30 days) old and pushes one
 * { id, domain } task per row onto the queue.
 *
 * If a connection cannot be obtained or the query fails, the function retries
 * itself on a later event-loop turn. When the query returns no rows the
 * process exits.
 */
function crawlQueue() {
    // Schedule another attempt without growing the call stack.
    function retry() {
        setImmediate(function () {
            crawlQueue();
        });
    }

    pool.getConnection(function (err, connection) {
        if (err) {
            retry();
            return;
        }

        // The WHERE keyword is required: without it the statement is a SQL
        // syntax error, every query fails and this function retries forever.
        var sql = "SELECT * FROM domain WHERE last_update < (UNIX_TIMESTAMP() - 2592000) LIMIT 500";
        connection.query(sql, function (err, results) {
            // Hand the connection back before anything else, including exit.
            connection.release();

            if (err) {
                retry();
                return;
            }

            if (!results.length) {
                // Nothing left to crawl.
                process.exit();
                return;
            }

            for (var i = 0, len = results.length; i < len; ++i) {
                queue.push({"id": results[i]['id'], "domain": results[i]['domain'] });
            }
        });
    });
}
// Entry point: start loading the first batch on a later event-loop turn.
setImmediate(function startCrawl() {
    crawlQueue();
});

而且系统限制相当高。

    Limit                     Soft Limit           Hard Limit           Units
Max cpu time unlimited unlimited seconds
Max file size unlimited unlimited bytes
Max data size unlimited unlimited bytes
Max stack size 8388608 unlimited bytes
Max core file size 0 unlimited bytes
Max resident set unlimited unlimited bytes
Max processes 257645 257645 processes
Max open files 500000 500000 files
Max locked memory 65536 65536 bytes
Max address space unlimited unlimited bytes
Max file locks unlimited unlimited locks
Max pending signals 257645 257645 signals
Max msgqueue size 819200 819200 bytes
Max nice priority 0 0
Max realtime priority 0 0
Max realtime timeout unlimited unlimited us

sysctl

net.ipv4.ip_local_port_range = 10000    61000

最佳答案

默认情况下,Node 只有 4 个工作线程用于解析 DNS 查询。如果您的 DNS 查询耗时较长,请求就会阻塞在 DNS 阶段,其症状正是 ESOCKETTIMEDOUT 或 ETIMEDOUT。

尝试增加你的 uv 线程池大小:

export UV_THREADPOOL_SIZE=128
node ...

或在 index.js 中(或任何你的入口点):

#!/usr/bin/env node
process.env.UV_THREADPOOL_SIZE = 128;

function main() {
...
}

编辑:我还就此写了一篇博客文章。

关于javascript - Node.js GET 请求 ETIMEDOUT & ESOCKETTIMEDOUT,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/24320578/

30 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com