gpt4 book ai didi

java - java中的停用词删除方法不起作用

转载 作者:太空宇宙 更新时间:2023-11-04 12:16:12 25 4
gpt4 key购买 nike

我正在尝试编写计算比读取所有 cran 字段(信息重审中的热门主题)的 Java 代码,以便进行标记化、计算总标记、找到 50 个常用单词并删除预定义的停用词。它可以工作,除了 StopWordsRemoval 方法(代码中的最后一个),它不会根据需要更改输出,此方法之前/之后的输出是相同的!

你能帮我找出问题所在吗?这是我用 Java 编写的第一个代码:(

import java.io.*;
import java.util.*;

public class Information_Retrieval_Hw1 {

//Global variables
public static BufferedReader buffer;
public static Hashtable<String, Integer> wordList = new Hashtable<String, Integer>();
public static ArrayList<Hashtable <String,Integer>> fileMap = new ArrayList<Hashtable<String,Integer>>();
public static Set<String> tagNames = new HashSet<String>();
//public static ArrayList<Map.Entry<String, Integer>> list;

public static int documentsCount = 0;
public static int totalTokens = 0;
public static int uniqueWords = 0;
public static int tagCount = 0;
public static int singleOccureneWords = 0;

public static ArrayList<Map.Entry<String, Integer>> sortedList;


public Information_Retrieval_Hw1() {
// TODO Auto-generated constructor stub
}

public static void main(String[] args) throws IOException {

String cranfield = "/Users/Manal/Desktop/semster1/IR/assigenment 1/cranfieldDocs";
File cranfieldFiles = new File(cranfield);
ReadFile(cranfieldFiles);

System.out.println("Total number of documents: " + fileMap.size());

//Calculate total number of tokens
totalTokens = CalculateNumberOfTokens(wordList);
System.out.println("Total number Of words = " + totalTokens);

//Calculate number of unique words
uniqueWords = CalculateUniqueWords(wordList);
System.out.println("Total number Of distinct words = " + uniqueWords);

//Calculate number of unique words
singleOccureneWords = CalculateSingleOccurenceWords(wordList);
System.out.println("Total number Of words that occur only once = " + singleOccureneWords);

//Find the 30 most frequent words
FindFiftyMostFrequentWords(wordList);

StopWordsRemoval (cranfieldFiles,wordList);
//reprint all information after removing stopword;

System.out.println("\n***********************************\nAfter removing stop words \n***********************************\n");

//Calculate total number of tokens
totalTokens = CalculateNumberOfTokens(wordList);
System.out.println("Total number Of words = " + totalTokens);

//Calculate number of unique words
uniqueWords = CalculateUniqueWords(wordList);
System.out.println("Total number Of distinct words = " + uniqueWords);

//Calculate number of unique words
singleOccureneWords = CalculateSingleOccurenceWords(wordList);
System.out.println("Total number Of words that occur only once = " + singleOccureneWords);

//Find the 30 most frequent words
FindFiftyMostFrequentWords(wordList);

}

public static void ReadFile(File cranfieldFiles) throws IOException{
for (File file: cranfieldFiles.listFiles())
{
//read files recursively if path contains folder
if(file.isDirectory())
{
ReadFile(file);
}

else
{
documentsCount++;
try
{
buffer = new BufferedReader(new FileReader(file));
}
catch (FileNotFoundException e)
{
System.out.println("File not Found");

}
//find the tags and their count
tagCount = tagCount + TagHandler(file, tagNames);
//find words in the cranfield
TokenHandler(file, tagNames);

}
}


}

public static int TagHandler(File file, Set<String> tagNames) throws IOException
{
String line;
int tag_count = 0;


buffer = new BufferedReader(new FileReader(file));
while((line = buffer.readLine()) != null)
{
/*
* If the line contains a '<', it is considered a tag and tag_count is incremented.
*/
if(line.contains("<"))
{
tag_count++;

String b = line.replaceAll("[<*>/]", "");
tagNames.add(b);
}

}
tag_count/=2; //Since each tag represent the beginning and the end, we divide it by two to get the actual count.
return tag_count;
}

public static void TokenHandler(File file, Set<String> tagNames) throws IOException
{
String line;
String words[];

buffer = new BufferedReader(new FileReader(file));
Hashtable<String, Integer> tempMap = new Hashtable<String, Integer>();

while((line = buffer.readLine()) != null)
{

String s1 = line.replaceAll("[^a-zA-Z.]+"," "); //Replace everything that is not an alphabet with a blank space.
String s2 = s1.replaceAll("[.]", "");//Replace words with . (eg U.S) as 1 word
words = s2.split(" ");

for(String word : words)
{
//Handle the tags properly
if(!tagNames.contains(word) && !word.equals(""))
{
word = word.toLowerCase(); // Converts all words to lower case.

//add word if it isn't added already
if(!wordList.containsKey(word))
{
//first occurance of this word
wordList.put(word, 1);

//Following is to compute the unique words in each document
if(!tempMap.containsKey(word))
{
tempMap.put(word,1);

}
else
{
tempMap.put(word, tempMap.get(word)+ 1);

}
}
else
{
//Increament the count of that word
wordList.put(word, wordList.get(word) + 1);
if(!tempMap.containsKey(word))
{
tempMap.put(word,1);

}
else
{
tempMap.put(word, tempMap.get(word)+ 1);
}
}
}
}
}

//Add count to file map and after reading every file
fileMap.add(tempMap);
}

//Function to find the total number of tokens in the cranfield database

public static int CalculateNumberOfTokens(Hashtable<String, Integer> myWordList)
{
int noOfTokens = 0;

for (Integer value: myWordList.values())
{
noOfTokens = noOfTokens + value;
}
return noOfTokens;
}

public static int CalculateUniqueWords(Hashtable<String, Integer> myWordList)
{

return myWordList.size();
}

public static int CalculateSingleOccurenceWords(Hashtable<String, Integer> myWordList)
{
int count = 0;

for (Integer value: myWordList.values())
{
if(value == 1)
{
count++;
}
}
return count;
}

//Sorting the hashTable
public static ArrayList<Map.Entry<String, Integer>> SortHashTable(Hashtable<String, Integer> myWordList)
{
ArrayList<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(myWordList.entrySet());
Collections.sort(list, new Comparator<Map.Entry<String, Integer>>(){
public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
return o2.getValue().compareTo(o1.getValue());
}});
return list;
}

public static void FindFiftyMostFrequentWords(Hashtable<String, Integer> myWordList)
{
//Sort the hashtable based on value

sortedList = SortHashTable(myWordList);
System.out.println("The 50 most frequent words are: ");
for(int i=0;i<50;i++)
{
System.out.println("\t" + (i+1) + "." + " " + sortedList.get(i));
}
}

public static Hashtable<String, Integer> StopWordsRemoval (File file, Hashtable<String, Integer> wordList) throws IOException {
int k=0,j;
String sCurrentLine;
String[] stopwords = new String[2000];
try
{
FileReader fr=new FileReader("/Users/Manal/Desktop/semster1/IR/assigenment 1/xid-10624858_1.txt");
BufferedReader br= new BufferedReader(fr);
while ((sCurrentLine = br.readLine()) != null){
stopwords[k]=sCurrentLine;
k++;
}
Set<String> keys = wordList.keySet();
for(String key: keys)
{
for(j = 0; j < k; j++)
{
if(wordList.keySet().equals(stopwords[j]))
wordList.remove(key);
}
}
}
catch(Exception ex)
{System.out.println(ex);}

return wordList;
}
}

最佳答案

我认为这是代码中的问题

if(wordList.keySet().equals(stopwords[j]))

您正在做的是检查 keySet 是否等于单词(keySet() 返回 Set)以及 keySet 是否包含该单词。试试这个:

if(wordList.keySet().contains(stopwords[j]))

请告诉我这是否可以解决您的问题。

关于java - java中的停用词删除方法不起作用,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/39389735/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com