Java indexer speed

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import org.apache.commons.io.FileUtils;

public class Indexer {

    public static void main(String[] args) throws IOException {

        // HindiStemmerLight shl = new HindiStemmerLight(); // optional Hindi stemmer, disabled here
        Scanner in1 = new Scanner(System.in);
        System.out.println();
        System.out.println("Enter the file path");
        String path = in1.next();

        File folder = new File(path);
        File[] listOfFiles = folder.listFiles();

        // Load the stop-word list; splitting on any whitespace handles both
        // space-separated and one-word-per-line files.
        ArrayList<String> stopwords = new ArrayList<String>();
        File files = new File("/home/gaurav/stop-words_hindi_1_hi.txt");
        String stopWordsFile = FileUtils.readFileToString(files);
        for (String str : stopWordsFile.split("\\s+")) {
            stopwords.add(str);
        }
        System.out.println();

        // First pass: read each .txt file and collect every non-stop-word token.
        ArrayList<String> array = new ArrayList<String>();
        for (int i = 0; i < listOfFiles.length; i++) {
            File file = listOfFiles[i];
            if (file.isFile() && file.getName().endsWith(".txt")) {
                String content = FileUtils.readFileToString(file);
                // split("\\s+") tokenizes on whitespace; the original split("")
                // broke the text into single characters.
                for (String s : content.split("\\s+")) {
                    s = s.trim();
                    if (!stopwords.contains(s)) {
                        // shl.stem(s);            // apply the Hindi stemmer to each word
                        // if (!array.contains(s)) // keep each word only once
                        array.add(s);
                    }
                }
            }
        }

        // Sort the files by name.
        Arrays.sort(listOfFiles, new Comparator<File>() {
            @Override
            public int compare(File f1, File f2) {
                return f1.getName().compareTo(f2.getName());
            }
        });

        // Second pass: for every collected word, re-read every file and count
        // the word's occurrences in it.
        Map<String, ArrayList<HashMap<String, Integer>>> words =
                new TreeMap<String, ArrayList<HashMap<String, Integer>>>();
        Collections.sort(array);
        for (int i = 0; i < array.size(); i++) {
            String s = array.get(i);
            ArrayList<HashMap<String, Integer>> hash = new ArrayList<HashMap<String, Integer>>();
            for (File newFile : listOfFiles) {
                if (!(newFile.isFile() && newFile.getName().endsWith(".txt"))) {
                    continue; // the first pass only indexed .txt files
                }
                HashMap<String, Integer> doc = new HashMap<String, Integer>();
                int count = 0;
                String docId = newFile.getName();
                String c = FileUtils.readFileToString(newFile);
                for (String s1 : c.split("\\s+")) {
                    if (s.equals(s1)) {
                        count++;
                    }
                }
                if (count != 0) {
                    doc.put(docId, count);
                    hash.add(doc);
                }
            }
            words.put(s, hash);
        }

        // Redirect System.out to output.txt and dump the index.
        PrintStream out = new PrintStream(new FileOutputStream("output.txt"));
        System.setOut(out);
        for (String name : words.keySet()) {
            System.out.println(name + " " + words.get(name));
        }
    }
}

I built an indexer in Java. It works well when the document collection (corpus) is small, but when the corpus grows to 50,000 text files it throws an out-of-memory error (Java heap space) and takes a very long time to run. Please suggest what changes would reduce its complexity.

Best answer

Index in small batches, and don't keep the whole dataset in memory. As written, the code holds every token of every file in a single ArrayList and then re-reads the entire corpus once per token, so both heap use and run time grow far faster than the corpus; a single-pass sketch of the suggested change follows.
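A minimal sketch of that idea, not the original poster's code: it assumes the same stop-word file path and whitespace tokenization, takes the corpus folder as a command-line argument instead of prompting, and StreamingIndexer and its identifiers are illustrative names. Each file is read exactly once and its counts are folded straight into a term-to-postings map, instead of collecting every token in a list and re-reading every file per token.

    import java.io.IOException;
    import java.io.PrintWriter;
    import java.nio.file.DirectoryStream;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;
    import java.util.TreeMap;

    public class StreamingIndexer {

        public static void main(String[] args) throws IOException {
            Path folder = Paths.get(args[0]); // corpus folder as a command-line argument

            // A HashSet gives O(1) stop-word lookups; the ArrayList in the
            // original scans the whole list for every token.
            Set<String> stopwords = new HashSet<>(
                    Files.readAllLines(Paths.get("/home/gaurav/stop-words_hindi_1_hi.txt")));

            // term -> (docId -> frequency); TreeMap keeps the terms sorted for output.
            Map<String, Map<String, Integer>> index = new TreeMap<>();

            // Single pass: every file is read exactly once and its tokens are folded
            // straight into the postings map, so no global token list is held in memory.
            try (DirectoryStream<Path> txtFiles = Files.newDirectoryStream(folder, "*.txt")) {
                for (Path file : txtFiles) {
                    String docId = file.getFileName().toString();
                    for (String line : Files.readAllLines(file)) {
                        for (String token : line.split("\\s+")) {
                            if (token.isEmpty() || stopwords.contains(token)) {
                                continue;
                            }
                            index.computeIfAbsent(token, k -> new HashMap<>())
                                 .merge(docId, 1, Integer::sum);
                        }
                    }
                }
            }

            // Write the index directly instead of redirecting System.out.
            try (PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get("output.txt")))) {
                for (Map.Entry<String, Map<String, Integer>> e : index.entrySet()) {
                    out.println(e.getKey() + " " + e.getValue());
                }
            }
        }
    }

If the vocabulary itself eventually outgrows the heap, the same loop can be flushed to disk every few thousand files and the partial indexes merged afterwards (the classic blocked single-pass indexing approach), which is what indexing "in small batches" means in practice.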

On the subject of Java indexer speed, a similar question was found on Stack Overflow: https://stackoverflow.com/questions/35285263/
