
java - MapReduce in Hadoop exceeds the GC overhead limit when using files larger than 200 MB


I am running MapReduce code on a Hadoop multi-node cluster (2.4.1). When I try to run it with two input files of 200 MB each, I get the error "GC overhead limit exceeded". When I use very small files, it runs perfectly and produces the correct output.
My goal is to compare every flow record in the first file against every flow record in the second file, compute a distance, then take the 10 largest values and send output to the reducer based on those 10 values.

Sample flow record from both files - 194.144.0.27|192.168.1.5|0.0.0.0|0|0|2|104|1410985350|1410985350|51915|51413|6|6

A few screenshots: http://goo.gl/5tUhJJ and http://goo.gl/lh1Qvm
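
For reference, each record is a single pipe-delimited line with 13 fields (indices 0 through 12), which is why the mapper below splits on "\\|" and loops over exactly 13 parts. A minimal, self-contained sketch (the class name RecordSplitDemo is only for illustration):

// Illustrative only: shows that the sample record splits into 13 fields,
// matching the x < 13 loop in the mapper below.
public class RecordSplitDemo {
    public static void main(String[] args) {
        String record = "194.144.0.27|192.168.1.5|0.0.0.0|0|0|2|104|1410985350|1410985350|51915|51413|6|6";
        String[] fields = record.split("\\|");   // "|" is a regex metacharacter, so it must be escaped
        System.out.println(fields.length);        // prints 13
    }
}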

Here is the mapper class:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable five = new IntWritable(5);

    private Text counter1;

    // Every record of the cached (second) file is kept in memory for the whole task
    ArrayList<String> lines = new ArrayList<String>();
    String str;
    int ddos_line = 0;
    int normal_line = 0, total_testing_records = 4000;
    int K = 10;   // number of nearest records to keep per input record

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read the distributed-cache file line by line into the in-memory list
        Configuration conf = context.getConfiguration();
        URI[] cachefiles = context.getCacheFiles();

        FileSystem fs = FileSystem.get(conf);
        FileStatus[] status = fs.listStatus(new Path(cachefiles[0].toString()));
        BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(status[0].getPath())));

        while ((str = in.readLine()) != null) {
            lines.add(str);
        }
        in.close();
    }

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line1 = value.toString();
        ddos_line++;
        normal_line = 0;

        // count[]  : the K largest similarity scores seen so far (kept in descending order)
        // lineIndex: the cached-line index each of those scores came from
        double[] count = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
        int[] lineIndex = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

        String[] parts = line1.split("\\|");
        String[] linesArray = lines.toArray(new String[lines.size()]);

        boolean bool = true;
        int t1 = 0;
        double sum = 0;

        // Compare the current input record against every cached record, K lines at a time
        while (bool) {
            for (int i = 0; i < K; i++) {
                if (bool == false) break;
                sum = 0;
                String[] parts2 = linesArray[normal_line].split("\\|");

                // Similarity = sqrt(number of matching fields out of the 13)
                for (int x = 0; x < 13; x++) {
                    if (parts[x].equals(parts2[x])) {
                        t1 = 1;
                    } else {
                        t1 = 0;
                    }
                    sum += t1;
                }
                sum = Math.sqrt(sum);

                // Replace the smallest of the current top K if this score is at least as large
                if (count[K - 1] <= sum) {
                    count[K - 1] = sum;
                    lineIndex[K - 1] = normal_line;
                }

                // Keep the top-K arrays sorted in descending order (bubble sort)
                for (int k = 0; k < K; k++) {
                    for (int j = 0; j < K - 1; j++) {
                        if (count[j] < count[j + 1]) {
                            double temp2 = count[j + 1];
                            count[j + 1] = count[j];
                            count[j] = temp2;

                            int temp3 = lineIndex[j + 1];
                            lineIndex[j + 1] = lineIndex[j];
                            lineIndex[j] = temp3;
                        }
                    }
                }

                if (normal_line + 1 < linesArray.length) {
                    normal_line++;
                    continue;
                } else {
                    bool = false;
                }
            }
        } // while end

        // Label each of the K nearest records: 'n' (normal) if it came from the first
        // half of the cached file, 'd' (ddos) otherwise
        char[] t = {'d', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd'};
        for (int i = 0; i < K; i++) {
            if (lineIndex[i] <= total_testing_records / 2) t[i] = 'n';
        }

        // Majority vote over the K labels
        int counter_normal = 0, counter_ddos = 0;
        for (int i = 0; i < K; i++) {
            if (t[i] == 'n')
                counter_normal++;
            else
                counter_ddos++;
        }

        if (counter_normal <= K / 2) {
            counter1 = new Text(ddos_line + " : d : " + counter_ddos);
        } else {
            counter1 = new Text(ddos_line + " : n : " + counter_normal);
        }

        context.write(counter1, five);
    }

    @Override
    public void run(Context context) throws IOException, InterruptedException {
        setup(context);
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
        cleanup(context);
    }
}

Best Answer

Then simply increase the memory for your tasks: set mapred.child.java.opts in your job configuration to -Xmx1024m, or more, whatever you need to read this file and process it.
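
As a rough sketch (the driver class and job name below are hypothetical), the property can be set in the job configuration before the job is submitted; on Hadoop 2.x the per-task keys mapreduce.map.java.opts and mapreduce.reduce.java.opts serve the same purpose:

// Minimal driver sketch, not the poster's actual code: raises the task JVM heap before submitting.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class FlowJobDriver {                                  // hypothetical driver class
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Legacy key mentioned in the answer (still recognized by Hadoop 2.x as a fallback)
        conf.set("mapred.child.java.opts", "-Xmx1024m");
        // Hadoop 2.x per-task equivalents (values here are illustrative)
        conf.set("mapreduce.map.java.opts", "-Xmx1024m");
        conf.set("mapreduce.reduce.java.opts", "-Xmx1024m");

        Job job = Job.getInstance(conf, "flow-record-knn");    // job name is hypothetical
        // ... set mapper/reducer classes and input/output paths as usual, then:
        // job.waitForCompletion(true);
    }
}

If the driver runs through ToolRunner, the same setting can also be passed on the command line as -Dmapred.child.java.opts=-Xmx1024m.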

For "java - MapReduce in Hadoop exceeds the GC overhead limit when using files larger than 200 MB", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/27175278/
