gpt4 book ai didi

python删除标点符号电子邮件垃圾邮件

转载 作者:行者123 更新时间:2023-12-04 09:38:14 25 4
gpt4 key购买 nike

我正在尝试从单词列表中删除标点符号。我是 Python 编程的新手,如果有人能提供帮助就太好了。这段代码用于电子邮件垃圾邮件分类。之前我是先检查字符是否为标点符号,再把结果拼接起来,但那样得到的是单个字符而不是完整的单词。为了得到完整的单词我修改了代码,下面就是目前的版本;但修改之后,原来删除标点符号的方式不再起作用,所以现在需要重新实现删除标点符号的功能。

import os
import string
from collections import Counter
from os import listdir # return all files and folders in the directory

import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# used for importing the lingspam dataset
def importLingspamDataset(dir):
    """Read every file found directly inside a Ling-Spam directory.

    Args:
        dir: Path of the directory to read.  The parameter name shadows the
            builtin ``dir`` but is kept so existing keyword callers still
            work.

    Returns:
        A tuple ``(allEmails, fileNames)`` of two parallel lists: the text of
        each file and the corresponding file name, in ``listdir`` order.
    """
    allEmails = []  # text of each e-mail, in listing order
    fileNames = []  # file name for the e-mail at the same index
    for file in listdir(dir):
        # ``with`` closes the handle even when read() raises (the previous
        # open()/close() pair leaked it on error).
        with open(os.path.join(dir, file), "r") as f:
            allEmails.append(f.read())
        fileNames.append(file)
    return allEmails, fileNames

def importEnronDataset(dir):
    """Read every file found directly inside an Enron dataset directory.

    Args:
        dir: Path of the directory to read (a trailing separator is fine).
            The parameter name shadows the builtin ``dir`` but is kept so
            existing keyword callers still work.

    Returns:
        A tuple ``(allEmails, fileNames)`` of two parallel lists: the text of
        each file and the corresponding file name, in ``listdir`` order.
    """
    allEmails = []  # text of each e-mail, in listing order
    fileNames = []  # file name for the e-mail at the same index
    for file in listdir(dir):
        # ``with`` closes the handle even when read() raises (the previous
        # open()/close() pair leaked it on error).
        with open(os.path.join(dir, file), "r") as f:
            allEmails.append(f.read())
        fileNames.append(file)
    return allEmails, fileNames

# used to remove punctuation from the emails as this is of no use for detecting spam
def removePunctuation(cleanedEmails):
    """Split e-mails into words and strip ASCII punctuation from each word.

    The previous version reset its accumulator on every iteration (so only
    the last e-mail survived), compared whole words against the punctuation
    string, glued the remaining words together without spaces, and raised
    ``NameError`` for an empty input.

    Args:
        cleanedEmails: Iterable of e-mail texts (strings).

    Returns:
        A single flat list with the words of all e-mails, in order, with
        every character from ``string.punctuation`` removed.  Tokens that
        consisted only of punctuation are dropped.
    """
    # One translation table for the whole run: maps each punctuation
    # character to "delete".
    table = str.maketrans('', '', string.punctuation)
    allWords = []
    for line in cleanedEmails:
        for token in line.split():
            stripped = token.translate(table)
            if stripped:  # skip tokens such as "!!!" that become empty
                allWords.append(stripped)
    return allWords

# used to remove stopwords i.e. words of no use in detecting spam
def removeStopwords(cleanedEmails):
    """Drop English stopwords from a list of words.

    The previous loop iterated over the stopword set itself and tested
    ``stopw not in removeWords``, which is never true, so the input came
    back unfiltered.

    Args:
        cleanedEmails: Iterable of words (strings).

    Returns:
        A new list with the words that are not NLTK English stopwords, in
        their original order and casing.  The comparison is done on the
        lower-cased word so capitalised stopwords are removed as well.
    """
    removeWords = set(stopwords.words('english'))  # set gives O(1) lookups
    return [word for word in cleanedEmails if word.lower() not in removeWords]

# function to return words to their root form - allows simplicity
def lemmatizeEmails(cleanedEmails):
    """Lemmatize every item of ``cleanedEmails`` with WordNet.

    Args:
        cleanedEmails: Iterable of strings passed one by one to
            ``WordNetLemmatizer.lemmatize``.

    Returns:
        A list holding the lemmatized form of each item, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, cleanedEmails))

# function to allow a systematic process of eliminating the undesired elements within the emails
def cleanAllEmails(cleanedEmails):
    """Run the full cleaning pipeline over the given e-mails.

    The stages are applied in a fixed order, each one receiving the output
    of the previous one: punctuation removal, stopword removal, then
    lemmatization.

    Args:
        cleanedEmails: The e-mails to clean, as accepted by
            ``removePunctuation``.

    Returns:
        Whatever ``lemmatizeEmails`` returns for the output of the earlier
        stages.
    """
    result = cleanedEmails
    for stage in (removePunctuation, removeStopwords, lemmatizeEmails):
        result = stage(result)
    return result

def createDictionary(email, maxWords=3000, outputFile='test1.png'):
    """Count word frequencies, show them as a word cloud and save the image.

    The previous version discarded the result of ``most_common`` and
    returned ``None`` even though the caller stores the return value.

    Args:
        email: Iterable of words to count.
        maxWords: Number of most frequent words to keep (default 3000, the
            value that was previously hard-coded).
        outputFile: Path the word-cloud image is written to (default
            ``'test1.png'``, the previously hard-coded name).

    Returns:
        A ``Counter`` with the ``maxWords`` most frequent words and their
        counts.  It is empty when ``email`` contains no words, in which case
        no image is displayed or written.
    """
    dictionary = Counter(email)
    # most_common() returns a new list; keep it instead of throwing it away.
    topWords = Counter(dict(dictionary.most_common(maxWords)))
    if not topWords:
        # NOTE(review): WordCloud is expected to reject an empty frequency
        # table, so skip rendering rather than fail - confirm this is the
        # desired behaviour for empty input.
        return topWords
    word_cloud = WordCloud(width=400, height=400, background_color='white',
                           min_font_size=12).generate_from_frequencies(topWords)
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()
    word_cloud.to_file(outputFile)
    return topWords

def featureExtraction(email, vocabularySize=3000):
    """Allocate the zero-initialised feature matrix for a set of e-mails.

    The previous version built the matrix and then dropped it, so the
    function always returned ``None``.

    Args:
        email: Iterable with one entry per e-mail; only the number of
            entries is used.
        vocabularySize: Number of feature columns (default 3000, the value
            that was previously hard-coded).

    Returns:
        A ``numpy`` array of zeros with shape
        ``(number of e-mails, vocabularySize)``.
    """
    emailFiles = list(email)  # materialise so generators can be counted
    featureMatrix = np.zeros((len(emailFiles), vocabularySize))
    # TODO: fill the matrix with per-e-mail word counts; only the allocation
    # exists so far.
    return featureMatrix


def classifyLingspamDataset(email):
    """Flag each entry that contains the substring ``"spmsg"``.

    Args:
        email: Iterable of strings to inspect.
            NOTE(review): presumably these are the Ling-Spam file names,
            where spam files carry an "spmsg" prefix - confirm with caller.

    Returns:
        A list of booleans, one per entry and in the same order: ``True``
        when ``"spmsg"`` occurs in the entry, ``False`` otherwise.
    """
    return ["spmsg" in name for name in email]

# ---- Ling-Spam dataset ----

# Load the training e-mails and their file names from disk.
(trainingDataLingspam,
 trainingLingspamFilename) = importLingspamDataset(
    "spam-non-spam-dataset/train-mails")
# Disabled: the same import for the test split.
#testingDataLingspam, testingLingspamFilename = importLingspamDataset("spam-non-spam-dataset/test-mails")

# Run the cleaning pipeline over the training e-mails.
trainingDataLingspamClean = cleanAllEmails(trainingDataLingspam)
#testingDataLingspamClean = cleanAllEmails(testingDataLingspam)

# Disabled: spam / non-spam labels for both splits.
#trainClassifyLingspam = classifyLingspamDataset(trainingDataLingspam)
#testClassifyLingspam = classifyLingspamDataset(testingDataLingspam)

# Count the cleaned training words and render them as a word cloud.
trainDictionary = createDictionary(trainingDataLingspamClean)
#createDictionary(testingDataLingspamClean)

# Disabled: import of the Enron training set.
#trainingDataEnron, trainingEnronFilename = importEnronDataset("spam-non-spam-dataset-enron/bigEmailDump/training/")

最佳答案

根据您的问题,我假设您有一个电子邮件列表,您希望为每封电子邮件删除标点符号。此答案基于您发布的代码的第一次修订。

import string


def removePunctuation(emails):
    """Strip ASCII punctuation from every e-mail in a list.

    Args:
        emails: Iterable of e-mail texts (strings).

    Returns:
        A list with one string per input e-mail, in the same order, with
        every character from ``string.punctuation`` removed.  Whitespace
        (including newlines) is left untouched.
    """
    # str.translate only accepts a translation table, so build one with
    # str.maketrans whose third argument lists the characters to delete.
    # It is built once here rather than once per e-mail.
    table = str.maketrans('', '', string.punctuation)
    return [email.translate(table) for email in emails]


if __name__ == '__main__':
    # Small demonstration.  ``emails`` stands in for the list of e-mails the
    # question calls ``cleanedEmails``; the punctuation-free result is kept
    # in ``cleaned_emails`` and printed.
    emails = [
        "This is a, test!",
        "This is another#@! \ntest",
    ]
    cleaned_emails = removePunctuation(emails)
    print(cleaned_emails)
input: ["This is a, test!", "This is another#@! \ntest"]
output: ['This is a test', 'This is another \ntest']

编辑:

与 OP 沟通后,问题已经解决。OP 当时在使用 WordCloud 时遇到了问题,而我提供的删除标点符号的解决方案可以正常工作。我已指导 OP 让 WordCloud 正常运行,OP 现在正在微调 WordCloud 的输出结果。

关于python删除标点符号电子邮件垃圾邮件,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/62448491/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com