
hadoop - How to insert MapReduce output into HBase using the same program


I wrote a program that takes a PDF as input and produces its full text as output. I want to use the same program to load this text into HBase. Is there a way to do that? Any help would be appreciated.

//Driver Class
package com.tcs;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class PdfInputDriver {

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        args = parser.getRemainingArgs();

        // Job.getInstance replaces the deprecated Job(conf, name) constructor.
        Job job = Job.getInstance(conf, "Pdftext");
        job.setJarByClass(PdfInputDriver.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setInputFormatClass(PdfInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        System.out.println(job.waitForCompletion(true));
    }
}

//InputFormatClass
package com.tcs;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class PdfInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new PdfRecordReader();
    }
}

//PDF Record Reader class
package com.tcs;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

public class PdfRecordReader extends RecordReader<LongWritable, Text> {

    private String[] lines = null;
    private LongWritable key = null;
    private Text value = null;

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {

        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        final Path file = split.getPath();

        /*
         * Open the file backing this split and extract its text with PDFBox.
         * The whole document is parsed in one go and split into lines, which
         * the reader then emits one key/value pair at a time.
         */
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(file);
        PDDocument pdf = PDDocument.load(fileIn);
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            String parsedText = stripper.getText(pdf);
            // Split on newlines ("\n"), not the literal string "/n".
            this.lines = parsedText.split("\n");
        } finally {
            pdf.close();
            fileIn.close();
        }
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            // First call: emit the first line with key 1.
            key = new LongWritable(1);
            value = new Text(lines[0]);
            return true;
        }
        int count = (int) key.get();
        if (count < lines.length - 1) {
            // Emit the next line; the key tracks the 1-based line index.
            value = new Text(lines[count]);
            key = new LongWritable(count + 1);
            return true;
        }
        return false;
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (lines == null || lines.length == 0 || key == null) {
            return 0;
        }
        return Math.min(1.0f, key.get() / (float) lines.length);
    }

    @Override
    public void close() throws IOException {
        // Nothing to clean up: the PDF and input stream are closed in initialize().
    }
}

//Mapper Class
package com.tcs;


import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit each line of extracted text as the key and its line number as the value.
        context.write(value, key);
    }
}


//Reducer Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // Write each line of text back out with its line number.
        for (LongWritable lineNumber : values) {
            context.write(key, lineNumber);
        }
    }
}

Best answer

I think you are already packaging the job as a jar and running it. Just take the part-r-00000 file generated as the MapReduce output, create a table in HBase, and load that file into it.
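Another way to do it from the same program is to point the reduce side of the job at HBase with TableMapReduceUtil and a TableReducer instead of writing text files. The sketch below is an assumption-laden illustration rather than code from the question: the class name PdfToHBaseDriver, the table name "pdf_lines", the column family "cf" and the qualifier "line" are all made up, the table is expected to exist already (with column family "cf"), and the HBase client and mapreduce jars must be on the job's classpath.

//HBase driver sketch (hypothetical names)
package com.tcs;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class PdfToHBaseDriver {

    // Turns each (line text, line number) pair coming from WordCountMapper into an HBase Put.
    public static class HBaseWriterReducer
            extends TableReducer<Text, LongWritable, ImmutableBytesWritable> {

        private static final byte[] CF = Bytes.toBytes("cf");    // assumed column family
        private static final byte[] COL = Bytes.toBytes("line"); // assumed qualifier

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            for (LongWritable lineNumber : values) {
                // Row key = line number, cell value = the line of text.
                byte[] row = Bytes.toBytes(String.valueOf(lineNumber.get()));
                Put put = new Put(row);
                put.addColumn(CF, COL, Bytes.toBytes(key.toString()));
                context.write(new ImmutableBytesWritable(row), put);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Job job = Job.getInstance(conf, "PdfToHBase");
        job.setJarByClass(PdfToHBaseDriver.class);

        job.setInputFormatClass(PdfInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Sets the reducer, the TableOutputFormat and the target table name on the job.
        TableMapReduceUtil.initTableReducerJob("pdf_lines", HBaseWriterReducer.class, job);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Alternatively, keeping the existing text output, the part-r-00000 file can be loaded afterwards with HBase's bundled ImportTsv tool, since TextOutputFormat writes tab-separated key/value lines by default.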

Regarding hadoop - How to insert MapReduce output into HBase using the same program, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/44496777/
