
python - How to adjust NLTK Python code so the classifier is trained only once


I am trying to do sentiment analysis on a large dataset of about 10000 sentences. Right now, when I use the NLTK Python code below to train and test with Naive Bayes, the classifier is retrained every time I need to classify a new set of sentences, and this takes a lot of time. Is there a way to save the output of the training step and reuse it for classification, so that the time spent retraining is avoided? This is the NLTK code I am using.

import nltk
import re
import csv
#Read the tweets one by one and process it


def processTweet(tweet):
    # process the tweets
    #convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','AT_USER',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet

def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end

#start getStopWordList
def getStopWordList(stopWordListFileName):
    #read the stopwords file and build a list
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('url')
    stopWords.append('URL')
    stopWords.append('rt')

    fp = open(stopWordListFileName)
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end

#start getfeatureVector
def getFeatureVector(tweet):
    featureVector = []
    #split tweet into words
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word starts with an alphabet
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)

        #ignore if it is a stop word
        if(w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end

def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

inpTweets = csv.reader(open('sheet3.csv', 'rb'), delimiter=',')
stopWords = getStopWordList('stopwords.txt')
featureList = []


# Get tweet words
tweets = []
for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment));
#end loop

# Remove featureList duplicates
featureList = list(set(featureList))

# Extract feature vector for all tweets in one shot
training_set = nltk.classify.util.apply_features(extract_features, tweets)

NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

ft = open("april2.tsv")
line = ft.readline()

fo = open("dunno.tsv", "w")

fo.seek(0,0)
while line:
    testTweet = line
    processedTestTweet = processTweet(testTweet)
    line1 = fo.write( NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet))) + "\n");
    line = ft.readline()

fo.close()
ft.close()

Best Answer

If you want to stick with NLTK, try pickle, e.g. as done in https://spaghetti-tagger.googlecode.com/svn/spaghetti.py , see https://docs.python.org/2/library/pickle.html :

#-*- coding: utf8 -*-

from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from cPickle import dump,load

def loadtagger(taggerfilename):
    infile = open(taggerfilename,'rb')
    tagger = load(infile); infile.close()
    return tagger

def traintag(corpusname, corpus):
    # Function to save tagger.
    def savetagger(tagfilename,tagger):
        outfile = open(tagfilename, 'wb')
        dump(tagger,outfile,-1); outfile.close()
        return
    # Training UnigramTagger.
    uni_tag = ut(corpus)
    savetagger(corpusname+'_unigram.tagger',uni_tag)
    # Training BigramTagger.
    bi_tag = bt(corpus)
    savetagger(corpusname+'_bigram.tagger',bi_tag)
    print "Tagger trained with",corpusname,"using" +\
        "UnigramTagger and BigramTagger."
    return
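
The same idea carries over directly to the Naive Bayes classifier in the question: train it once, dump it to disk, and load it back in later runs instead of calling train again. Below is a minimal sketch assuming the training_set, featureList, extract_features, getFeatureVector and processedTestTweet from the question's code; the file name classifier.pickle is just a placeholder.

import pickle
import nltk

# Train once, then persist both the classifier and the feature list
# (extract_features needs featureList when classifying later).
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
with open('classifier.pickle', 'wb') as out:
    pickle.dump((NBClassifier, featureList), out)

# In a later run, load the saved objects instead of retraining.
with open('classifier.pickle', 'rb') as inp:
    NBClassifier, featureList = pickle.load(inp)

label = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet)))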

Otherwise, try another machine learning library such as sklearn or shogun.
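
For example, a minimal scikit-learn sketch could pair CountVectorizer with MultinomialNB and persist the fitted pipeline with pickle; the example texts, labels and file name below are made up purely for illustration.

import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Placeholder training data standing in for the processed tweets and their sentiments.
texts = ['i love this phone', 'this movie was awful']
labels = ['positive', 'negative']

# Fit the vectorizer and classifier once, then save the whole pipeline.
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(texts, labels)
with open('sentiment_model.pickle', 'wb') as out:
    pickle.dump(model, out)

# Reload the trained pipeline later instead of retraining.
with open('sentiment_model.pickle', 'rb') as inp:
    model = pickle.load(inp)
print(model.predict(['what a great day']))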

Regarding "python - How to adjust NLTK Python code so the classifier is trained only once", a similar question was found on Stack Overflow: https://stackoverflow.com/questions/23801244/
