java - Parsing Stackoverflow's posts.xml on hadoop

I am following this article by Anoop Madhusudanan on codeproject to build a recommendation engine, not on a cluster but on my own machine.

The problem comes when I try to parse posts.xml, whose rows are structured like this:

 <row Id="99" PostTypeId="2" ParentId="88" CreationDate="2008-08-01T14:55:08.477" Score="2" Body="&lt;blockquote&gt;&#xD;&#xA;  &lt;p&gt;The actual resolution of gettimeofday() depends on the hardware architecture. Intel processors as well as SPARC machines offer high resolution timers that measure microseconds. Other hardware architectures fall back to the system’s timer, which is typically set to 100 Hz. In such cases, the time resolution will be less accurate. &lt;/p&gt;&#xD;&#xA;&lt;/blockquote&gt;&#xD;&#xA;&#xD;&#xA;&lt;p&gt;I obtained this answer from &lt;a href=&quot;http://www.informit.com/guides/content.aspx?g=cplusplus&amp;amp;seqNum=272&quot; rel=&quot;nofollow&quot;&gt;High Resolution Time Measurement and Timers, Part I&lt;/a&gt;&lt;/p&gt;" OwnerUserId="25" LastActivityDate="2008-08-01T14:55:08.477" />

Now I need to parse this file (1.4 GB in size) on hadoop. I have written the code in java and created its jar. The java class is as follows:

import java.io.IOException;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.Element;

import java.io.File;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;


public class Recommend {

    static class Map extends Mapper<Text, Text, Text, Text> {
        Path path;
        String fXmlFile;
        DocumentBuilderFactory dbFactory;
        DocumentBuilder dBuilder;
        Document doc;

        /**
         * Parse the xml file named by the input value and emit
         * OwnerUserId as the key and "OwnerUserId ParentId" as the value.
         */
        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
            try {
                fXmlFile = value.toString();
                dbFactory = DocumentBuilderFactory.newInstance();
                dBuilder = dbFactory.newDocumentBuilder();
                doc = dBuilder.parse(fXmlFile);

                doc.getDocumentElement().normalize();
                NodeList nList = doc.getElementsByTagName("row");

                for (int temp = 0; temp < nList.getLength(); temp++) {
                    Node nNode = nList.item(temp);
                    Element eElement = (Element) nNode;

                    Text keyWords = new Text(eElement.getAttribute("OwnerUserId"));
                    Text valueWords = new Text(eElement.getAttribute("ParentId"));
                    String val = keyWords.toString() + " " + valueWords.toString();
                    // Write the pair
                    if (keyWords != null && valueWords != null) {
                        output.collect(keyWords, new Text(val));
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * @throws IOException
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        /*if (args.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }*/
        // FileSystem fs = FileSystem.get(conf);
        Job job = new Job(conf, "Recommend");
        job.setJarByClass(Recommend.class);

        // output types for the job
        job.setOutputKeyClass(Text.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(Map.class);
        //conf.setReducerClass(Reduce.class);

        // Remove any previous output so the job does not fail on an existing directory
        Path outPath = new Path(args[1]);
        FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
        if (dfs.exists(outPath)) {
            dfs.delete(outPath, true);
        }

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, outPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

I want the output to be a file in hadoop containing OwnerUserId ParentId pairs, but instead I get output like:

1599788   <row Id="2292" PostTypeId="2" ParentId="2284" CreationDate="2008-08-05T13:28:06.700" Score="0" ViewCount="0" Body="&lt;p&gt;The first thing you should do is contact the main people who run the open source project. Ask them if it is ok to contribute to the code and go from there.&lt;/p&gt;&#xD;&#xA;&#xD;&#xA;&lt;p&gt;Simply writing your improved code and then giving it to them may result in your code being rejected.&lt;/p&gt;" OwnerUserId="383" LastActivityDate="2008-08-05T13:28:06.700" />

I have no idea where 1599788 comes from as the key emitted by the mapper.

I don't really understand how to write a mapper class for hadoop, and I need help modifying my code to get the desired output.

Thanks in advance.

Best Answer

After a lot of research and experimentation, I finally learned how to write a mapper for parsing xml files with the structure I posted. I also found where the stray key came from: my map method did not match the new-API Mapper.map(LongWritable, Text, Context) signature, so it never overrode anything; the default identity mapper ran instead and emitted each line's byte offset (e.g. 1599788) as the key and the raw line as the value. I changed my approach; here is my new mapper code, and it works for my use case.

Hope it helps someone and saves them some time :)

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class Map extends Mapper<LongWritable, Text, NullWritable, Text> {
    // Use the NullWritable singleton; a null key would fail if the output
    // ever went through the shuffle.
    NullWritable obj = NullWritable.get();

    @Override
    public void map(LongWritable key, Text value, Context context) throws InterruptedException {
        // Each input value is one <row .../> line; tokenize on whitespace and
        // pick the attributes out of the key="value" tokens.
        StringTokenizer tok = new StringTokenizer(value.toString());
        String pa = null, ow = null, pi = null, v;
        while (tok.hasMoreTokens()) {
            String[] arr;
            String val = tok.nextToken();
            if (val.contains("PostTypeId")) {
                // The attribute value is the last piece after splitting on quotes
                arr = val.split("[\"]");
                pi = arr[arr.length - 1];
                if (pi.equals("2")) {
                    continue;   // an answer: keep scanning this row
                } else {
                    break;      // not an answer: skip the rest of the row
                }
            }
            if (val.contains("ParentId")) {
                arr = val.split("[\"]");
                pa = arr[arr.length - 1];
            } else if (val.contains("OwnerUserId")) {
                arr = val.split("[\"]");
                ow = arr[arr.length - 1];
                try {
                    if (pa != null && ow != null) {
                        // Java format specifiers are %s; the "{0},{1}" style
                        // would be written out literally.
                        v = String.format("%s,%s", ow, pa);
                        context.write(obj, new Text(v));
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
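
For completeness, here is a minimal driver sketch for wiring this mapper into a job. The RecommendDriver class name and the argument handling are placeholders for illustration; the sketch assumes the default TextInputFormat, which hands the mapper one line at a time with the line's byte offset as the key, which is exactly what this mapper expects. Since no reducer is needed, the reduce phase is disabled:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RecommendDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "Recommend");
        job.setJarByClass(RecommendDriver.class);

        job.setMapperClass(Map.class);
        // Map-only job: mapper output is written directly as the final output
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // posts.xml on HDFS
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // must not already exist

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With zero reduce tasks the map output goes straight to part-m-* files in the output directory, so each line of the result is an OwnerUserId,ParentId pair.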

For more on java - Parsing Stackoverflow's posts.xml on hadoop, see the similar question we found on Stack Overflow: https://stackoverflow.com/questions/19445528/
