How to fix text clustering: duplicates in the list of centroids
While inspecting the centroids of the clusters produced by KMeans, I found duplicates in this list of centroids.
What does this mean, and how can I get rid of the duplicates?
Could this be the reason I end up with some "black hole" clusters that absorb 30-40% of the whole dataset, which is a huge share given that I have around 40-80 clusters? Can you give me some tips on how to make the clusters more balanced?
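For reference, a minimal sketch (reusing the fitted mbk model from the code below) of how one might check whether any centroid rows really are exact duplicates of one another:

import numpy as np

# Round first so that float noise does not hide near-identical rows.
centers = mbk.cluster_centers_
unique_centers = np.unique(centers.round(decimals=8), axis=0)
print("centroids:", centers.shape[0], "unique:", unique_centers.shape[0])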
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# Preprocessing and transforming the DataFrame into a list of text strings
import re
corpus = []
for i in df["text"]:
    y = re.sub(r'[^\w\s]', ' ', i)    # replace punctuation with spaces
    y = re.sub(r'[\-]', ' ', y)       # drop hyphens
    y = re.sub(r'[a-zA-Z]', '', y)    # drop Latin letters (the texts are Russian)
    y = re.sub(r'\n', ' ', y)         # drop line breaks
    y = re.sub(r'\d', '', y)          # drop digits
    y = re.sub(r'[()]', ' ', y)       # drop parentheses
    y = y.lower()
    corpus.append(y)
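As a quick usage check (the sample string here is hypothetical), running a single string through the same substitutions shows what survives:

import re

sample = "Привет, мир! Visit example.com (2023)"
y = re.sub(r'[^\w\s]', ' ', sample)   # punctuation to spaces
y = re.sub(r'[a-zA-Z]', '', y)        # drop Latin letters
y = re.sub(r'\d', '', y)              # drop digits
print(y.lower().split())              # ['привет', 'мир'] - only Cyrillic tokens remain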
# corpus - list of preprocessed strings
# PyMystem - lemmatizer for Russian
!wget http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz
!tar -xvf mystem-3.0-linux3.1-64bit.tar.gz
!cp mystem /root/.local/bin/mystem
from pymystem3 import Mystem
m = Mystem()
corp_lemmz = []
stop_words = [LIST OF STOP WORDS]
for row in corpus:
    lemm = m.lemmatize(row)
    # filter with a list comprehension: calling remove() while iterating
    # over the same list skips elements
    lemm = [word for word in lemm if word not in stop_words]
    lemmas = "".join(lemm)
    lemmas = re.sub(r'\n', ' ', lemmas)
    corp_lemmz.append(lemmas)
# corp_lemmz - list of lemmatized strings
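For context, Mystem.lemmatize returns a list of lemmas with the whitespace tokens preserved, which is why "".join(lemm) reassembles a sentence. A small sketch, reusing the m object above with a hypothetical sample sentence:

print(m.lemmatize("кошки ловят мышей"))
# e.g. -> ['кошка', ' ', 'ловить', ' ', 'мышь', '\n']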
# Vectorizing the preprocessed strings
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True, analyzer='word', stop_words=stop_words)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(corp_lemmz)
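One sanity check worth running at this point (not in my original code): documents that lose every token during preprocessing become all-zero rows, and identical zero vectors all receive the same cluster label, which can inflate one cluster. A minimal sketch using the variables above:

import numpy as np

print("matrix shape:", tfidf_vectorizer_vectors.shape)
row_sums = np.asarray(tfidf_vectorizer_vectors.sum(axis=1)).ravel()
print("empty documents:", int((row_sums == 0).sum()))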
# Trying to count duplicates, but the counter stays at 0
counter = 0
count_list = []
for i in tfidf_vectorizer.get_feature_names():
    if i not in count_list:
        count_list.append(i)
    else:
        counter += 1
print(counter)
print("Top terms per cluster:")
n_clusters = 80
mbk = KMeans(n_clusters=n_clusters,init = "k-means++",random_state=20)
mbk.fit(tfidf_vectorizer_vectors)
order_centroids = mbk.cluster_centers_.argsort()[:,::-1]
labels = mbk.labels_
terms = tfidf_vectorizer.get_feature_names()
for i in range(n_clusters):
print("Cluster %d:" % i,end='')
for ind in order_centroids[i,:10]:
print(' %s' % terms[ind],end='')
print()
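To see how severe the imbalance actually is, a short sketch (reusing labels and n_clusters from above) that prints the largest clusters and their share of the corpus:

import numpy as np

sizes = np.bincount(labels, minlength=n_clusters)
for idx in np.argsort(sizes)[::-1][:10]:
    share = 100.0 * sizes[idx] / sizes.sum()
    print("cluster %d: %d documents (%.1f%% of the corpus)" % (idx, sizes[idx], share))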
# Count duplicates among the top terms of all clusters
import collections

c = collections.Counter()
for i in range(n_clusters):
    for ind in order_centroids[i, :10]:
        c[ind] += 1   # Counter defaults missing keys to 0
c.most_common()
Output: [(981, 27), (982, 26), (983, 22), (980, 21), (2938, 20), ...
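To read that output, the feature indices can be mapped back to the vocabulary, for example with a small sketch like this (reusing terms and c from above):

for ind, freq in c.most_common(10):
    print("'%s' appears in the top-10 of %d clusters" % (terms[ind], freq))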
# matching each preprocessed sentence with its cluster label
match = {}
for i in range(len(labels)):
    match[corpus[i]] = labels[i]
a = sorted(match.items(), key=lambda x: x[1])
# Creating a dict that maps each cluster label to its list of sentences
output = {}
for x, y in a:
    if y in output:
        output[y].append(x)
    else:
        output[y] = [x]
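Note that using the sentence text as the dict key in match silently merges duplicate sentences into one entry. A hedged alternative sketch that groups by index instead, so every document keeps its label:

from collections import defaultdict

output_by_idx = defaultdict(list)
for idx, label in enumerate(labels):
    output_by_idx[label].append(corpus[idx])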
# Exporting the result to txt files
with open("sentence_plus_cluster_80_idf_True_Predicted_MiniBatch.txt", "w") as file:
    for i in range(len(output)):
        if output.get(i) is not None:
            file.write("Cluster " + str(i) + '\n' + 'Len of clust ' + str(len(output.get(i))) + '\n')
            for x in output.get(i):
                file.write(x + '\n')

with open("top_words_per_clust_80_idf_True_PRedicted_MiniBatch.txt", "w") as file:
    file.write("KMeans")  # %d" % n_clusters + '\n')
    for i in range(n_clusters):
        print("Cluster %d:" % i, end='')
        file.write("Cluster %d:" % i)
        for ind in order_centroids[i, :20]:
            print(' %s' % terms[ind], end='')
            file.write(terms[ind] + '\n')
        print()