如何删除阿拉伯语文本中的停用词?
我有一个训练文件和一个测试文件,想使用机器学习算法检测推文中的情感。下面的代码对阿拉伯语训练数据集执行预处理步骤,但在删除停用词(stop_words)时出现了文末所示的错误。我需要自己下载阿拉伯语停用词文件,还是可以直接从 NLTK 导入?
# Load the training split from CSV into a DataFrame.
# The path is absolute and machine-specific; adjust it when running elsewhere.
df=pd.read_csv("C:/Users/User/Desktop/2018-EI-oc-Ar-fear-train.csv")
# Load the test split (file name says "test-gold") from CSV into a DataFrame.
df_test=pd.read_csv("C:/Users/User/Desktop/2018-EI-oc-Ar-fear-test-gold.csv")
def stopWordRmove(text):
    """Return *text* with the Arabic stop words removed.

    The stop words are read from ``ar_stop_word_list.txt``, one word per
    line. The path is relative, so the file must exist in the current
    working directory; otherwise ``open`` raises ``FileNotFoundError``.

    NOTE(review): a second ``stopWordRmove`` is defined later in this file
    and replaces this one at import time.

    Args:
        text: The tweet text to filter.

    Returns:
        The tokens that are not stop words, joined by single spaces.

    Raises:
        FileNotFoundError: If ``ar_stop_word_list.txt`` cannot be found.
    """
    # Use a context manager so the handle is always closed, and decode
    # explicitly instead of relying on the platform default encoding.
    # Assumes the list is saved as UTF-8 -- TODO confirm.
    with open("ar_stop_word_list.txt", "r", encoding="utf-8") as ar_stop_list:
        # A set gives O(1) membership tests; blank lines and stray
        # surrounding whitespace are dropped so entries match tokens exactly.
        stop_words = {line.strip() for line in ar_stop_list if line.strip()}
    needed_words = [w for w in word_tokenize(text) if w not in stop_words]
    return " ".join(needed_words)
def noramlize(Tweet):
    """Normalize the Arabic text in *Tweet*.

    Steps, in order:
      1. Unify the alef variants to a bare alef.
      2. Map alef maqsura to ya.
      3. Map hamza-on-waw and hamza-on-ya to a standalone hamza.
      4. Drop every character that is not a space or inside the ا-ي range.
      5. Drop diacritics (tashkeel) and the tatweel/kashida character.

    Args:
        Tweet: The tweet text to normalize.

    Returns:
        The normalized string.
    """
    Tweet = re.sub(r"[إأٱآا]", "ا", Tweet)
    Tweet = re.sub(r"ى", "ي", Tweet)
    Tweet = re.sub(r"ؤ", "ء", Tweet)
    # The replacement argument was missing here, which raised TypeError on
    # every call; map hamza-on-ya to a standalone hamza like the line above.
    Tweet = re.sub(r"ئ", "ء", Tweet)
    # NOTE(review): the standalone hamza (U+0621) sorts before alef (U+0627),
    # so this range filter also removes the hamza produced by the two
    # substitutions above -- confirm whether the range should start at it.
    Tweet = re.sub(r'[^ا-ي ]', "", Tweet)
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    Tweet = re.sub(noise, '', Tweet)
    return Tweet
def stopWordRmove(Tweet):
    """Return *Tweet* with the Arabic stop words removed.

    The stop words are read from ``ar_stop_word_list.txt``, one word per
    line. The path is relative, so the file must exist in the current
    working directory; otherwise ``open`` raises ``FileNotFoundError``.

    Args:
        Tweet: The tweet text to filter.

    Returns:
        The tokens that are not stop words, joined by single spaces.

    Raises:
        FileNotFoundError: If ``ar_stop_word_list.txt`` cannot be found.
    """
    # Use a context manager so the handle is always closed, and decode
    # explicitly instead of relying on the platform default encoding.
    # Assumes the list is saved as UTF-8 -- TODO confirm.
    with open("ar_stop_word_list.txt", "r", encoding="utf-8") as ar_stop_list:
        # A set gives O(1) membership tests; blank lines and stray
        # surrounding whitespace are dropped so entries match tokens exactly.
        stop_words = {line.strip() for line in ar_stop_list if line.strip()}
    needed_words = [w for w in word_tokenize(Tweet) if w not in stop_words]
    return " ".join(needed_words)
def stemming(Tweet):
    """Stem every token of *Tweet* and return them as one string.

    The text is split with ``word_tokenize`` and each token is passed
    through ``ISRIStemmer.stem`` (presumably NLTK's ISRI Arabic stemmer;
    the import is not visible here).

    Args:
        Tweet: The tweet text to stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    isri = ISRIStemmer()
    return " ".join(isri.stem(token) for token in word_tokenize(Tweet))
def prepareDataSets(df):
    """Preprocess every tweet in *df* and return the result as a new DataFrame.

    Each tweet goes through stop-word removal, normalization and stemming,
    in that order, with every stage consuming the previous stage's output.

    Args:
        df: DataFrame with a 'Tweet' column and an 'Affect Dimension' column.

    Returns:
        A DataFrame with columns ['Tweet', 'Affect Dimension'] holding the
        preprocessed tweet text and the row's original 'Affect Dimension'.
    """
    sentences = []
    for tweet, affect in zip(df['Tweet'], df['Affect Dimension']):
        # Chain the stages: previously each one was fed the raw tweet again,
        # so only the last stage's output survived.
        text = stopWordRmove(tweet)
        text = noramlize(text)
        text = stemming(text)
        # Collect the row; previously nothing was appended and the returned
        # DataFrame was always empty.
        sentences.append([text, affect])
    df_sentences = DataFrame(sentences, columns=['Tweet', 'Affect Dimension'])
    return df_sentences
# Run the preprocessing pipeline over the training DataFrame loaded above.
preprocessed_df = prepareDataSets(df)
FileNotFoundError: [Errno 2] No such file or directory: 'ar_stop_word_list.txt'
如何从阿拉伯语推文中删除停用词?
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。