- android - 多次调用 OnPrimaryClipChangedListener
- android - 无法更新 RecyclerView 中的 TextView 字段
- android.database.CursorIndexOutOfBoundsException : Index 0 requested, 光标大小为 0
- android - 使用 AppCompat 时,我们是否需要明确指定其 UI 组件(Spinner、EditText)颜色
我正在尝试编写计算比读取所有 cran 字段(信息重审中的热门主题)的 Java 代码,以便进行标记化、计算总标记、找到 50 个常用单词并删除预定义的停用词。它可以工作,除了 StopWordsRemoval
方法(代码中的最后一个),它不会根据需要更改输出,此方法之前/之后的输出是相同的!
你能帮我找出问题所在吗?这是我用 Java 编写的第一个代码:(
import java.io.*;
import java.util.*;
public class Information_Retrieval_Hw1 {
//Global variables
public static BufferedReader buffer;
public static Hashtable<String, Integer> wordList = new Hashtable<String, Integer>();
public static ArrayList<Hashtable <String,Integer>> fileMap = new ArrayList<Hashtable<String,Integer>>();
public static Set<String> tagNames = new HashSet<String>();
//public static ArrayList<Map.Entry<String, Integer>> list;
public static int documentsCount = 0;
public static int totalTokens = 0;
public static int uniqueWords = 0;
public static int tagCount = 0;
public static int singleOccureneWords = 0;
public static ArrayList<Map.Entry<String, Integer>> sortedList;
public Information_Retrieval_Hw1() {
// TODO Auto-generated constructor stub
}
public static void main(String[] args) throws IOException {
String cranfield = "/Users/Manal/Desktop/semster1/IR/assigenment 1/cranfieldDocs";
File cranfieldFiles = new File(cranfield);
ReadFile(cranfieldFiles);
System.out.println("Total number of documents: " + fileMap.size());
//Calculate total number of tokens
totalTokens = CalculateNumberOfTokens(wordList);
System.out.println("Total number Of words = " + totalTokens);
//Calculate number of unique words
uniqueWords = CalculateUniqueWords(wordList);
System.out.println("Total number Of distinct words = " + uniqueWords);
//Calculate number of unique words
singleOccureneWords = CalculateSingleOccurenceWords(wordList);
System.out.println("Total number Of words that occur only once = " + singleOccureneWords);
//Find the 30 most frequent words
FindFiftyMostFrequentWords(wordList);
StopWordsRemoval (cranfieldFiles,wordList);
//reprint all information after removing stopword;
System.out.println("\n***********************************\nAfter removing stop words \n***********************************\n");
//Calculate total number of tokens
totalTokens = CalculateNumberOfTokens(wordList);
System.out.println("Total number Of words = " + totalTokens);
//Calculate number of unique words
uniqueWords = CalculateUniqueWords(wordList);
System.out.println("Total number Of distinct words = " + uniqueWords);
//Calculate number of unique words
singleOccureneWords = CalculateSingleOccurenceWords(wordList);
System.out.println("Total number Of words that occur only once = " + singleOccureneWords);
//Find the 30 most frequent words
FindFiftyMostFrequentWords(wordList);
}
public static void ReadFile(File cranfieldFiles) throws IOException{
for (File file: cranfieldFiles.listFiles())
{
//read files recursively if path contains folder
if(file.isDirectory())
{
ReadFile(file);
}
else
{
documentsCount++;
try
{
buffer = new BufferedReader(new FileReader(file));
}
catch (FileNotFoundException e)
{
System.out.println("File not Found");
}
//find the tags and their count
tagCount = tagCount + TagHandler(file, tagNames);
//find words in the cranfield
TokenHandler(file, tagNames);
}
}
}
public static int TagHandler(File file, Set<String> tagNames) throws IOException
{
String line;
int tag_count = 0;
buffer = new BufferedReader(new FileReader(file));
while((line = buffer.readLine()) != null)
{
/*
* If the line contains a '<', it is considered a tag and tag_count is incremented.
*/
if(line.contains("<"))
{
tag_count++;
String b = line.replaceAll("[<*>/]", "");
tagNames.add(b);
}
}
tag_count/=2; //Since each tag represent the beginning and the end, we divide it by two to get the actual count.
return tag_count;
}
public static void TokenHandler(File file, Set<String> tagNames) throws IOException
{
String line;
String words[];
buffer = new BufferedReader(new FileReader(file));
Hashtable<String, Integer> tempMap = new Hashtable<String, Integer>();
while((line = buffer.readLine()) != null)
{
String s1 = line.replaceAll("[^a-zA-Z.]+"," "); //Replace everything that is not an alphabet with a blank space.
String s2 = s1.replaceAll("[.]", "");//Replace words with . (eg U.S) as 1 word
words = s2.split(" ");
for(String word : words)
{
//Handle the tags properly
if(!tagNames.contains(word) && !word.equals(""))
{
word = word.toLowerCase(); // Converts all words to lower case.
//add word if it isn't added already
if(!wordList.containsKey(word))
{
//first occurance of this word
wordList.put(word, 1);
//Following is to compute the unique words in each document
if(!tempMap.containsKey(word))
{
tempMap.put(word,1);
}
else
{
tempMap.put(word, tempMap.get(word)+ 1);
}
}
else
{
//Increament the count of that word
wordList.put(word, wordList.get(word) + 1);
if(!tempMap.containsKey(word))
{
tempMap.put(word,1);
}
else
{
tempMap.put(word, tempMap.get(word)+ 1);
}
}
}
}
}
//Add count to file map and after reading every file
fileMap.add(tempMap);
}
//Function to find the total number of tokens in the cranfield database
public static int CalculateNumberOfTokens(Hashtable<String, Integer> myWordList)
{
int noOfTokens = 0;
for (Integer value: myWordList.values())
{
noOfTokens = noOfTokens + value;
}
return noOfTokens;
}
public static int CalculateUniqueWords(Hashtable<String, Integer> myWordList)
{
return myWordList.size();
}
public static int CalculateSingleOccurenceWords(Hashtable<String, Integer> myWordList)
{
int count = 0;
for (Integer value: myWordList.values())
{
if(value == 1)
{
count++;
}
}
return count;
}
//Sorting the hashTable
public static ArrayList<Map.Entry<String, Integer>> SortHashTable(Hashtable<String, Integer> myWordList)
{
ArrayList<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(myWordList.entrySet());
Collections.sort(list, new Comparator<Map.Entry<String, Integer>>(){
public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
return o2.getValue().compareTo(o1.getValue());
}});
return list;
}
public static void FindFiftyMostFrequentWords(Hashtable<String, Integer> myWordList)
{
//Sort the hashtable based on value
sortedList = SortHashTable(myWordList);
System.out.println("The 50 most frequent words are: ");
for(int i=0;i<50;i++)
{
System.out.println("\t" + (i+1) + "." + " " + sortedList.get(i));
}
}
public static Hashtable<String, Integer> StopWordsRemoval (File file, Hashtable<String, Integer> wordList) throws IOException {
int k=0,j;
String sCurrentLine;
String[] stopwords = new String[2000];
try
{
FileReader fr=new FileReader("/Users/Manal/Desktop/semster1/IR/assigenment 1/xid-10624858_1.txt");
BufferedReader br= new BufferedReader(fr);
while ((sCurrentLine = br.readLine()) != null){
stopwords[k]=sCurrentLine;
k++;
}
Set<String> keys = wordList.keySet();
for(String key: keys)
{
for(j = 0; j < k; j++)
{
if(wordList.keySet().equals(stopwords[j]))
wordList.remove(key);
}
}
}
catch(Exception ex)
{System.out.println(ex);}
return wordList;
}
}
最佳答案
我认为这是代码中的问题
if(wordList.keySet().equals(stopwords[j]))
您正在做的是检查 keySet 是否等于单词(keySet() 返回 Set)以及 keySet 是否包含该单词。试试这个:
if(wordList.keySet().contains(stopwords[j]))
请告诉我这是否可以解决您的问题。
关于java - java中的停用词删除方法不起作用,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/39389735/
我有以下案例要解决。 在短语中突出显示关键字的 Javascript 方法。 vm.highlightKeywords = (phrase, keywords) => { keywords =
我要匹配文本中的所有美元符号单词。例如,"Hello $VARONE this is $VARTWO"可以匹配$VARONE和$VARTWO。 正则表达式应该是/\$(\w+)/g,但是当我在Dart
在 redux 中,对于将状态作为参数、更改状态并返回新状态的特定操作,您会在 switch 语句中调用什么函数? function reducer(state = DEFAULT_STATE, ac
在 MySQL 5.1 中,我将一个字段命名为“Starting”。但是,每次我使用 SQL 查询时,它都会说无效的 SQL 语法。经过一些谷歌搜索,我发现 STARTING 是一个保留的 SQL 词
我必须使用函数 isIn(secretWord,lettersGuessed) 从列表中找到密码。在下面发布我的代码。 def isWordGuessed(secretWord, lettersGue
一段时间以来,我一直无法找到两个字符串中最长的常用词。首先我想到了用“isspace”函数来做这件事,但不知道如何找到一个常用词。然后我想到了“strcmp”,但到目前为止我只能比较两个字符串。我在想
我目前正在尝试制作一种“单词混合器”:对于两个给定的单词和指定的所需长度,程序应返回这两个单词的“混合”。然而,它可以是任何类型的混合:它可以是第一个单词的前半部分与第二个单词的后半部分相结合,它可以
如果 After 之后(逗号之前)没有 -ing 词,我想匹配它。所以 After 和逗号之间不应该有 -ing 词。 所需的匹配项(粗体): After sitting down, he began
我一直在试验 Stanford NLP 工具包及其词形还原功能。我很惊讶它如何使一些词词形还原。例如: depressing -> depressing depressed -> depressed
js 并尝试根据 [这里] 中的示例代码来做词云:https://github.com/jasondavies/d3-cloud .我想做的是单词的字体大小是基于数组中单词的频率。例如我有 [a,a,
我正在处理一个文本分类问题(在法语语料库上),并且正在试验不同的词嵌入。我对 ConceptNet 提供的内容非常感兴趣,所以我决定试一试。 我无法为我的特定任务找到专门的教程,所以我听取了他们的建议
当我在文本中搜索时,我输入 C-s,然后输入单词,然后一次又一次地输入 C-s,光标前进到找到的单词的下一个位置。问题是,一旦我转到下一个单词,我无法在按钮处编辑迷你缓冲区中的搜索单词,如果我按 Ba
我正在尝试按照以下结构运行这个 maven Hello Word: ├── pom.xml └── src └── Main.java 使用pom.xml设置: 4.0.0
所以,从我可以开始的.. 我正在使用 OCR。该脚本非常适合我的需要。它检测单词的准确性对我来说还可以。 这是结果:附加图像 100% 准确。 from PIL import Image import
Closed. This question does not meet Stack Overflow guidelines。它当前不接受答案。 想要改善这个问题吗?更新问题,以便将其作为on-topi
这是细节,但我想知道为什么会这样。 示例代码: Class klasa = Enum.class; for(Type t : klasa.getGenericInterfaces()) Syst
我在用: var header = ""+ "Export HTML to Word Document with JavaScript"; var footer = ""; /
我有一个程序可以像这样将数据打印到控制台(以空格分隔): variable1 value1 variable2 value2 variable3 value3 varialbe4 value4 编辑:
我有一个程序可以像这样将数据打印到控制台(以空格分隔): variable1 value1 variable2 value2 variable3 value3 varialbe4 value4 编辑:
最近我在查看与goliath相关的一些代码时,偶然在Ruby代码中看到了这个词use。 , 中间件等。看起来它不同于include/extend, and require. 有人可以解释为什么存在这个
我是一名优秀的程序员,十分优秀!