- html - 出于某种原因,IE8 对我的 Sass 文件中继承的 html5 CSS 不友好?
- JMeter 在响应断言中使用 span 标签的问题
- html - 在 :hover and :active? 上具有不同效果的 CSS 动画
- html - 相对于居中的 html 内容固定的 CSS 重复背景?
下面是我从这里获取的用于 JavaScript 的 Porters 词干算法:http://tartarus.org/~martin/PorterStemmer/js.txt
我希望能够通过简单地调用来使用该算法:
var stemmed_word = porter_stemming_alg( "some_word_to_stem" );
有人对我如何将此功能合并到下面的代码中有任何建议吗?
类似的东西:
function porter_stemming_alg( word ){
//...
// algorithm goes here...
//...
}
任何建议感谢大家....
var stemmer = (function(){
var step2list = {
"ational" : "ate",
"tional" : "tion",
"enci" : "ence",
"anci" : "ance",
"izer" : "ize",
"bli" : "ble",
"alli" : "al",
"entli" : "ent",
"eli" : "e",
"ousli" : "ous",
"ization" : "ize",
"ation" : "ate",
"ator" : "ate",
"alism" : "al",
"iveness" : "ive",
"fulness" : "ful",
"ousness" : "ous",
"aliti" : "al",
"iviti" : "ive",
"biliti" : "ble",
"logi" : "log"
},
step3list = {
"icate" : "ic",
"ative" : "",
"alize" : "al",
"iciti" : "ic",
"ical" : "ic",
"ful" : "",
"ness" : ""
},
c = "[^aeiou]", // consonant
v = "[aeiouy]", // vowel
C = c + "[^aeiouy]*", // consonant sequence
V = v + "[aeiou]*", // vowel sequence
mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0
meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1
mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1
s_v = "^(" + C + ")?" + v; // vowel in stem
return function (w) {
var stem,
suffix,
firstch,
re,
re2,
re3,
re4,
origword = w;
if (w.length < 3) { return w; }
firstch = w.substr(0,1);
if (firstch == "y") {
w = firstch.toUpperCase() + w.substr(1);
}
// Step 1a
re = /^(.+?)(ss|i)es$/;
re2 = /^(.+?)([^s])s$/;
if (re.test(w)) { w = w.replace(re,"$1$2"); }
else if (re2.test(w)) { w = w.replace(re2,"$1$2"); }
// Step 1b
re = /^(.+?)eed$/;
re2 = /^(.+?)(ed|ing)$/;
if (re.test(w)) {
var fp = re.exec(w);
re = new RegExp(mgr0);
if (re.test(fp[1])) {
re = /.$/;
w = w.replace(re,"");
}
} else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1];
re2 = new RegExp(s_v);
if (re2.test(stem)) {
w = stem;
re2 = /(at|bl|iz)$/;
re3 = new RegExp("([^aeiouylsz])\\1$");
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re2.test(w)) { w = w + "e"; }
else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); }
else if (re4.test(w)) { w = w + "e"; }
}
}
// Step 1c
re = /^(.+?)y$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(s_v);
if (re.test(stem)) { w = stem + "i"; }
}
// Step 2
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem)) {
w = stem + step2list[suffix];
}
}
// Step 3
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem)) {
w = stem + step3list[suffix];
}
}
// Step 4
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
re2 = /^(.+?)(s|t)(ion)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
if (re.test(stem)) {
w = stem;
}
} else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1] + fp[2];
re2 = new RegExp(mgr1);
if (re2.test(stem)) {
w = stem;
}
}
// Step 5
re = /^(.+?)e$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
re2 = new RegExp(meq1);
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
w = stem;
}
}
re = /ll$/;
re2 = new RegExp(mgr1);
if (re.test(w) && re2.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
// and turn initial Y back to y
if (firstch == "y") {
w = firstch.toLowerCase() + w.substr(1);
}
return w;
}
})();
最佳答案
这已经是一个函数了。您可以使用 stemmer( "some_word_to_stem")
调用它。如果您想更改函数的名称,请将第一行编辑为 var porter_stemming_alg = ...
或在末尾添加 var porter_stemming_alg = stemmer;
。
关于javascript - 搬运工词干算法 Javascript,如何,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/7133389/
我想搜索一个单词及其在句子中的变体。例如,单词“happy”的变体是“happiest”、“happier”。有java库支持吗? 最佳答案 您可以使用Porter stemming algorith
愚蠢的,但我无法访问 NLTK 中的主干函数。 我打电话 import nltk nltk.stem.porter.step1ab() 但我被告知该对象没有 step1ab 属性。我想我以某种方式没有
我想将 DataFrame 的列和索引限制为这样的内容 ps = PorterStemmer() df_dic = pd.read_csv('inquirerbasic_clean.csv', sep
查看以下脚本tortest.py,它反复切换TOR电路并检查IP地址是否已更改: #!/usr/bin/env python2.7 # -*- coding: utf-8 -*- import jso
我是一名优秀的程序员,十分优秀!