gpt4 book ai didi

java - java中的tfidf计算和矩阵存储

转载 作者:行者123 更新时间:2023-11-29 08:51:56 24 4
gpt4 key购买 nike

我有一个 txt 文件语料库,我想在其中计算它们的 Tfidf 值。 我想我必须首先将文件标记为单词,然后计算权重。我需要这个程序的输出是一个矩阵,其中行是文件,列是单词,剩余的矩阵单元格是 Tfidf 值。

我卡在了矩阵部分。这是我的尝试

 import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


public class DocumentParser {

//This variable will hold all terms of each document in an array.
private List<String[]> termsDocsArray = new ArrayList<String[]>();
private List<String> allTerms = new ArrayList<String>(); //to hold all terms
private List<double[]> tfidfDocsVector = new ArrayList<double[]>();
private List fileNameList = new ArrayList();
private File[] allfiles;
private StringBuilder sb;
private BufferedReader in = null;

/**
* Method to read files and store in array.
* @param filePath : source file path
* @throws FileNotFoundException
* @throws IOException
*/

public void parseFiles(String filePath) throws FileNotFoundException, IOException {
allfiles = new File(filePath).listFiles();
for (File f : allfiles) {
if (f.getName().endsWith(".txt")) {
fileNameList.add(f.getName());
in = new BufferedReader(new FileReader(f));
sb = new StringBuilder();
String s = null;
while ((s = in.readLine()) != null) {
sb.append(s);
}
String[] tokenizedTerms = sb.toString().replaceAll("[\\W&&[^\\s]]", "").split("\\W+"); //to get individual terms
for (String term : tokenizedTerms) {
if (!allTerms.contains(term)) { //avoid duplicate entry
allTerms.add(term);
}
}
termsDocsArray.add(tokenizedTerms);
}
}

}

/**
* Method to create termVector according to its tfidf score.
* @return
*/
public double tfIdfCalculator(String file, String word) {
double tf; //term frequency
double idf; //inverse document frequency
double tfidf = 0; //term requency inverse document frequency
for (String[] docTermsArray : termsDocsArray) {
double[] tfidfvectors = new double[allTerms.size()];
int count = 0;
for (String terms : allTerms) {
tf = new TfIdf().tfCalculator(docTermsArray, terms);
idf = new TfIdf().idfCalculator(termsDocsArray, terms);
tfidf = tf * idf;
System.out.println(terms+"\t" + tfidf);
tfidfvectors[count] = tfidf;
count++;


}
tfidfDocsVector.add(tfidfvectors); //storing document vectors;
}

return tfidf;
}


public void TfIdfMatrix() throws IOException {

int r=allTerms.size();
int c=tfidfDocsVector.size();

String mat[][]= new String [r][c];

int rNumber=0;

for (int i = 0; i < fileNameList.size(); i++) {

rNumber++;

mat[rNumber][0]=(String) fileNameList.get(i);

}

String s;
while ((s = in.readLine()) != null) {

rNumber++;

mat[0][rNumber]=s;

}


//System.out.print(mat);

for (int row = 1; row <= rNumber; row++){
for (int col = 1; col <= rNumber; col++){
double ifidfValue=tfIdfCalculator(mat[0][col], mat[row][0]);
mat[row][col]=Double.toString(ifidfValue);

System.out.print(mat[row][col]);
}
}
}

请帮忙!!

最佳答案

这是我正在使用的代码示例。

public void tfIdfCalculator() {
double tf; //term frequency
double idf; //inverse document frequency
double tfidf; //term requency inverse document frequency

for (String[] docTermsArray : termsDocsArray) {
double[] tfidfvectors = new double[allTerms.size()];
int count = 0;
for (String terms : allTerms) {
tf = new TfIdf().tfCalculator(docTermsArray, terms);
idf = new TfIdf().idfCalculator(termsDocsArray, terms);
tfidf = tf * idf;
tfidfvectors[count] = tfidf;
count++;
}
tfidfDocsVector.add(tfidfvectors); //storing document vectors;
}
}

关于java - java中的tfidf计算和矩阵存储,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/22377097/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com