java - 如何打印 mahout lda cvb 主题-6ren

java - 如何打印 mahout lda cvb 主题

转载作者：塔克拉玛干更新时间：2023-11-02 20:08:52

我想使用 Mahout 的 CVB0Driver API 运行 LDA CVB 聚类作业，但我不知道如何打印结果。这是我的代码。

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.lda.cvb.CVB0Driver;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.text.SequenceFilesFromDirectory;
import org.apache.mahout.utils.vectors.RowIdJob;
import org.apache.mahout.utils.vectors.VectorDumper;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LDAJob extends AbstractJob {
    private static final Logger log = LoggerFactory.getLogger(Job.class);
    static int numTopics = 20;
    static double doc_topic_smoothening = 0.0001;
    static double term_topic_smoothening = 0.0001;
    static int maxIter = 10;
    static int iteration_block_size = 10;
    static double convergenceDelta = 0;
    static float testFraction = 0.0f;
    static int numTrainThreads = 4;
    static int numUpdateThreads = 1;
    static int maxItersPerDoc = 10;
    static int numReduceTasks = 10;
    static boolean backfillPerplexity = false;

public static void main(String args[]) throws Exception {
    // String baseFileLocation = args[0];
    String baseFileLocation = "/Users/pin/java";
    Path output = new Path(baseFileLocation, "/output");
    Configuration conf = new Configuration();
    HadoopUtil.delete(conf, output);
    String[] ldaArgs = { "-DbaseFileLocation=" + baseFileLocation };
    // String[] strings =
    // {"-Dmapred.input.dir=VectorFile/tfidf-vectors/part-r-00000"};
    ToolRunner.run(new LDAJob(), ldaArgs);
    System.out.println("done");
}

public int run(String[] arg0) throws Exception {
    Configuration conf = getConf();
    // String baseFileLocation = "/Users/pin/java";
    String baseFileLocation = conf.get("baseFileLocation");
    Path input = new Path(baseFileLocation, "/reuters-out");
    System.out.println(input.toString());
    String seqFileOutput = "SeqFile";
    String vectorOutFile = "VectorFile";
    String rowIDOutFile = "RowIdOutput";
    String ldaOutputFile = "topicModelOutputPath";
    String dictionaryFileName = vectorOutFile + "/dictionary.file-0";
    String tempLDAModelFile = "modelTempPath";
    String docTopicOutput = "docTopicOutputPath";
    String topicTermVectorDumpPath = "topicTermVectorDump";
    String docTopicVectorDumpPath = "docTopicVectorDump";

    // String topicTermVectorDump = "topicTermVectorDump";

    log.info("Deleting all the previous files.");
    HadoopUtil.delete(conf, new Path(seqFileOutput));
    HadoopUtil.delete(conf, new Path(vectorOutFile));
    HadoopUtil.delete(conf, new Path(rowIDOutFile));
    HadoopUtil.delete(conf, new Path(ldaOutputFile));
    HadoopUtil.delete(conf, new Path(docTopicOutput));
    HadoopUtil.delete(conf, new Path(tempLDAModelFile));
    HadoopUtil.delete(conf, new Path(topicTermVectorDumpPath));
    HadoopUtil.delete(conf, new Path(docTopicVectorDumpPath));

    // S3FileSystem.
    log.info("Step1: convert the directory into seqFile.");
    System.out.println("starting dir to seq job");
    String[] dirToSeqArgs = { "--input", input.toString(), "--output",
            seqFileOutput };
    ToolRunner.run(new SequenceFilesFromDirectory(), dirToSeqArgs);
    System.out.println("finished dir to seq job");

    log.info("Step 2: converting the seq to vector.");
    System.out.println("starting seq To Vector job");
    String[] seqToVectorArgs = { "--input", seqFileOutput, "--output",
            vectorOutFile, "--maxDFPercent", "70", "--maxNGramSize", "2",
            "--namedVector", "--analyzerName",
            "org.apache.lucene.analysis.WhitespaceAnalyzer" };
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), seqToVectorArgs);
    System.out.println("finished seq to vector job");

    log.info("Step3: convert SequenceFile<Text, VectorWritable> to  SequenceFile<IntWritable, VectorWritable>");
    System.out.println("starting rowID job");
    String[] rowIdArgs = {
            "-Dmapred.input.dir=" + vectorOutFile
                    + "/tfidf-vectors/part-r-00000",
            "-Dmapred.output.dir=" + rowIDOutFile };
    ToolRunner.run(new RowIdJob(), rowIdArgs);
    System.out.println("finished rowID job");

    log.info("Step4: Run the LDA algo");
    System.out.println("starting caluclulating the number of terms");
    //int numTerms = getNumTerms(new Path(dictionaryFileName));
    System.out.println("finished calculating the number of terms");
    long seed = System.nanoTime() % 10000;
    System.out.println("starting the CVB job");
    CVB0Driver.run(conf, new Path(rowIDOutFile + "/matrix"), new Path(
            ldaOutputFile), numTopics, 0, doc_topic_smoothening,
            term_topic_smoothening, maxIter, iteration_block_size,
            convergenceDelta, new Path(dictionaryFileName), new Path(
                    docTopicOutput), new Path(tempLDAModelFile), seed,
            testFraction, numTrainThreads, numUpdateThreads,
            maxItersPerDoc, numReduceTasks, backfillPerplexity);
    //String[] runArgs ={};
    System.out.println("finished the cvb job");

    log.info("Step5: vectordump topic-term");

    System.out.println("starting the vector dumper for topic term");
    String[] topicTermDumperArg = {"--seqFile", ldaOutputFile+"/part-m-00000",  "--dictionary", 
            dictionaryFileName, "-dt", "sequencefile"  };
    //ToolRunner.run(new Configuration(), new CustomVectorDumper(), topicTermDumperArg);
    //VectorDumper.main(topicTermDumperArg);
    //SequenceFileDumper.main(topicTermDumperArg);
    //String[] topicTermDumperArg = {"--input", ldaOutputFile, "--output", topicTermVectorDumpPath,  "--dictionary", 
    //        dictionaryFileName, "-dt", "sequencefile" ,"--vectorSize", "25" ,"-sort", "testsortVectors" };
    //LDAPrintTopics.main(topicTermDumperArg);
    //String[] topicTermDumperArg = {"-seq"};
    VectorDumper.main(topicTermDumperArg);
    System.out.println("finisher the vector dumper for topicterm");
    //System.out.println("starting the vector dumper for doctopic dumper");
    //String[] docTopicDumperArg = {"--input", docTopicOutput, "--output", docTopicVectorDumpPath};
    //ToolRunner.run(new Configuration(), new CustomVectorDumper(), docTopicDumperArg);
    //VectorDumper.main(docTopicDumperArg);
    System.out.println("finsiher the vector dumper for doctopic dumper");

    //printLdaResults(ldaOutputFile, numTerms);
    //MongoDumper dumper = new MongoDumper();
    //dumper.writeTopicCollection(topicTermVectorDumpPath.toString());
    return 0;
}
}

程序在运行到 VectorDumper.main(topicTermDumperArg); 时卡住了。

我使用的是 mahout-core-0.7 和 mahout-utils-0.5；新闻资源可从此处下载（click here）。

最佳答案

我使用的是 Mahout 0.9，以下适用于我:

     // Dump the topic-term distribution produced under OUTPUT_DIR, using the
     // sequence-file dictionary to label the entries.
     VectorDumper.main(new String[] {
             "-i", OUTPUT_DIR + "/topic-term-dist/part-m-00000",  // input
             "-o", OUTPUT_DIR + "/results",                       // output
             "-d", OUTPUT_DIR + "/dictionary.file-0",             // dictionary
             "-dt", "sequencefile",                               // dictionary type
             "-sort", "true",
             "-vs", "20" });                                      // vector size

上面代码中的 OUTPUT_DIR 是我运行 LDA 作业的文件夹。这段代码会打印每个主题的前 20 个术语。

关于java - 如何打印 mahout lda cvb 主题，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/16994529/

文章推荐： java - 无法从 netty 服务器接收客户端响应

文章推荐： ios - 报亭原子提要的使用和限制

文章推荐： ios - Twitter SLComposeViewController 经常触发失败

文章推荐： android - 使用 Robolectric 和 Mockito 测试 CursorLoader

java - Android 主题.AppCompat 主题
这个问题已经有答案了: Cannot create AlertDialog: AppCompat error (2 个回答) 已关闭 6 年前。当我在列表项中调用警报对话框时，我的应用程序崩溃了。我
angular - node_modules/rxjs/主题 "' has no exported member ' 主题'
我在 Angular 应用程序中安装了 Material UI，现在我收到错误，没有导出的成员 Observable 错误，我删除了节点模块并重新安装，问题仍然存在 ERROR in node_mo
java - 'exactly once' 是否仅适用于流(主题 1 -> 应用程序 -> 主题 2)？
我有一个架构，其中有两个独立的应用程序。原始来源是一个sql数据库。 App1 监听 CDC 表以跟踪对该数据库中表的更改、规范化和序列化这些更改。它获取这些序列化消息并将它们发送到 Kafka 主题
android - 主题 Material Android : You need to use a Theme. AppCompat 主题(或后代)与设计库
这个问题在这里已经有了答案: Material Design, AppCompat, and Backwards Compatibility (1 个回答) 关闭 6 年前。我收到如下错误信息:
.net - 使 WPF 应用程序看起来像 Metro 风格，即使在 Windows 7 中也是如此？ (窗口 Chrome/主题/主题)
我喜欢新 Office 套件和 Visual Studio 上的窗口镶边: 当然，我仍在为 Windows 7 开发应用程序，但我想知道是否有一种快速且简单的方法(阅读:WPF 样式或 Windows
android - _HoloActivity$HoloThemeException : You must apply Holo. 主题，Holo.Theme.Light 或 Holo.Theme.Light.DarkActionBar 主题
我正在使用 HoloEverywhere-1.6.8。我有一个基于 Holo.Theme 的自定义主题。 ... 我遇到了下面的崩溃，但它只出现在以下设备上: Galaxy Tab 10.1 P
Angular DevExtreme 主题
我正在尝试为 Angular 的 DevExtreme 小部件加载主题。我采用了不同的方法: 在 angular.json 中设置样式但不会产生任何影响: "projects": { "my-proj
android - 扩展外部应用程序样式/主题
我想定义一个 android 样式，它扩展了一个在不同的应用程序/包中定义的样式，而不是作为库导入。从对android资源的xml引用的定义here : @[:]/ 似乎可以在定义资源的地方指定一个
android - 主题、样式和别名嵌套不起作用
我正在尝试测试一种制作主题的方法，但我使用的方法并没有给我预期的结果。这是我的设置: drawable/dummy.xml 值/mythemes.xml @style
IMAP FETCH 主题
通过 telnet 使用 IMAP，我希望能够从特定的给定电子邮件中提取主题。现在我知道 fetch 命令负责从电子邮件中获取数据。我的问题是，如何在不使用对 BODY[HEADER.FIELDS
RStudio knitr 主题
我刚刚开始使用 RStudio 中的一些新的 knitr 功能。我已经尝试在 R Markdown 设置对话框中选择几个不同的主题，但这些似乎对我生成的文档的样式没有任何明显的影响。应该，还是我错过
CSS 模块和多个布局/主题？
在我的应用程序中，我有多种主题样式(您可以将它们视为不同的、单独的 CSS 样式文件)。我想开始使用 CSS 模块，但我什至不知道如何 import我的第一个文件。让我们假设以下(简单)目录结构:
Azure 主题 - 同一订阅上的多个监听器
有没有一种方法可以在一个 Azure 主题订阅上拥有多个监听客户端，并且它们都接收所有消息？我的理解是订阅的唯一实现是发布的消息仅传递到该订阅上的一个客户端，因为它就像一个队列。可以使用同一订阅将这
Vim:根据一天中的时间设置颜色/主题
我有一台 super 光滑的显示器，所以白天我可以比深色主题上的代码更好地看到自己的倒影。因此，我认为如果我可以在 vimrc 中有一个简单的 if 开关来根据一天中的时间设置深色主题或浅色主题，那就
themes - Symfony2 主题
我希望在我的 Symfony2 项目中提供基本的主题支持，因此我希望为每个主题提供单独的静态文件(css、js、img)。我尝试添加 assetic: read_from: %kernel
Azure 主题 - 同一订阅上的多个监听器
有没有一种方法可以在一个 Azure 主题订阅上拥有多个监听客户端，并且它们都接收所有消息？我的理解是订阅的唯一实现是发布的消息仅传递到该订阅上的一个客户端，因为它就像一个队列。可以使用同一订阅将这
Nolio NES 主题
在 NES 上有多个处于 WAITING 状态的“Discovery Worker”和“Keep Alive”线程是预期的行为吗？ "DiscoveryWorker-10" Id=62 WAITING
java - 根据图像修改背景颜色(主题)
我正在尝试找到最适合加载图像的颜色并将其应用到背景中。适应图像并使 UI 感觉更自然。到目前为止我已经找到了 2 个方案: 1> 平均像素(下面的代码): final Color acclimati
java - 主题网络环境中的本地问题
我知道每个请求都由一个 servlet 线程提供服务，但是对于一个用户 session ，两个请求是否可以由两个不同的线程提供服务？如果上述情况真的发生，那么第一个请求服务线程存储的线程局部变量被第
java - 主题、背景颜色和操作栏的交互
我无法理解操作栏外观与主题化之间的交互模式。我的应用设置为使用默认主题，我认为它是深色的: 通过应用范围内的样式从应用中删除操作栏会导致主要 Activity 的黑色背景: 没有 and

塔克拉玛干

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

java - 如何打印 mahout lda cvb 主题