How to fix text clustering: duplicates in the list of centroids
While inspecting the centroids of the clusters produced by KMeans, I found duplicates in this list of centroids.
What does this mean, and how can I get rid of the duplicates?
Could this be the reason I end up with some "black hole" clusters that absorb 30-40% of the whole dataset, which is a huge share given that I have around 40-80 clusters? Can you give me some tips on how to make the clusters more balanced?
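For reference, a minimal sketch (reusing the fitted mbk model from the code below) of how one might check whether any centroid rows really are exact duplicates of one another:

import numpy as np

# Round first so that float noise does not hide near-identical rows.
centers = mbk.cluster_centers_
unique_centers = np.unique(centers.round(decimals=8), axis=0)
print("centroids:", centers.shape[0], "unique:", unique_centers.shape[0])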
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# Preprocessing and transforming the DataFrame into a list of text strings
import re
corpus = []
for i in df["text"]:
    y = re.sub(r'[^\w\s]', ' ', i)    # replace punctuation with spaces
    y = re.sub(r'[\-]', ' ', y)       # drop hyphens
    y = re.sub(r'[a-zA-Z]', '', y)    # drop Latin letters (the texts are Russian)
    y = re.sub(r'\n', ' ', y)         # drop line breaks
    y = re.sub(r'\d', '', y)          # drop digits
    y = re.sub(r'[()]', ' ', y)       # drop parentheses
    y = y.lower()
    corpus.append(y)
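As a quick usage check (the sample string here is hypothetical), running a single string through the same substitutions shows what survives:

import re

sample = "Привет, мир! Visit example.com (2023)"
y = re.sub(r'[^\w\s]', ' ', sample)   # punctuation to spaces
y = re.sub(r'[a-zA-Z]', '', y)        # drop Latin letters
y = re.sub(r'\d', '', y)              # drop digits
print(y.lower().split())              # ['привет', 'мир'] - only Cyrillic tokens remain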
# corpus - list of preprocessed strings
# PyMystem - lemmatizer for Russian
!wget http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz
!tar -xvf mystem-3.0-linux3.1-64bit.tar.gz
!cp mystem /root/.local/bin/mystem
from pymystem3 import Mystem
m = Mystem()
corp_lemmz = []
stop_words = [LIST OF STOP WORDS]
for row in corpus:
    lemm = m.lemmatize(row)
    # filter with a list comprehension: calling remove() while iterating
    # over the same list skips elements
    lemm = [word for word in lemm if word not in stop_words]
    lemmas = "".join(lemm)
    lemmas = re.sub(r'\n', ' ', lemmas)
    corp_lemmz.append(lemmas)
# corp_lemmz - list of lemmatized strings
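For context, Mystem.lemmatize returns a list of lemmas with the whitespace tokens preserved, which is why "".join(lemm) reassembles a sentence. A small sketch, reusing the m object above with a hypothetical sample sentence:

print(m.lemmatize("кошки ловят мышей"))
# e.g. -> ['кошка', ' ', 'ловить', ' ', 'мышь', '\n']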
# Vectorizing the preprocessed strings
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True, analyzer='word', stop_words=stop_words)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(corp_lemmz)
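One sanity check worth running at this point (not in my original code): documents that lose every token during preprocessing become all-zero rows, and identical zero vectors all receive the same cluster label, which can inflate one cluster. A minimal sketch using the variables above:

import numpy as np

print("matrix shape:", tfidf_vectorizer_vectors.shape)
row_sums = np.asarray(tfidf_vectorizer_vectors.sum(axis=1)).ravel()
print("empty documents:", int((row_sums == 0).sum()))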
# Trying to count duplicates, but the counter stays at 0
counter = 0
count_list = []
for i in tfidf_vectorizer.get_feature_names():
    if i not in count_list:
        count_list.append(i)
    else:
        counter += 1
print(counter)
print("Top terms per cluster:")
n_clusters = 80
mbk = KMeans(n_clusters=n_clusters,init = "k-means++",random_state=20)
mbk.fit(tfidf_vectorizer_vectors)
order_centroids = mbk.cluster_centers_.argsort()[:,::-1]
labels = mbk.labels_
terms = tfidf_vectorizer.get_feature_names()
for i in range(n_clusters):
print("Cluster %d:" % i,end='')
for ind in order_centroids[i,:10]:
print(' %s' % terms[ind],end='')
print()
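To see how severe the imbalance actually is, a short sketch (reusing labels and n_clusters from above) that prints the largest clusters and their share of the corpus:

import numpy as np

sizes = np.bincount(labels, minlength=n_clusters)
for idx in np.argsort(sizes)[::-1][:10]:
    share = 100.0 * sizes[idx] / sizes.sum()
    print("cluster %d: %d documents (%.1f%% of the corpus)" % (idx, sizes[idx], share))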
# Count duplicates among the top terms of all clusters
import collections

c = collections.Counter()
for i in range(n_clusters):
    for ind in order_centroids[i, :10]:
        c[ind] += 1   # Counter defaults missing keys to 0
c.most_common()
Output: [(981, 27), (982, 26), (983, 22), (980, 21), (2938, 20), ...
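To read that output, the feature indices can be mapped back to the vocabulary, for example with a small sketch like this (reusing terms and c from above):

for ind, freq in c.most_common(10):
    print("'%s' appears in the top-10 of %d clusters" % (terms[ind], freq))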
# matching each preprocessed sentence with its cluster label
match = {}
for i in range(len(labels)):
    match[corpus[i]] = labels[i]
a = sorted(match.items(), key=lambda x: x[1])
# Creating a dict that maps each cluster label to its list of sentences
output = {}
for x, y in a:
    if y in output:
        output[y].append(x)
    else:
        output[y] = [x]
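Note that using the sentence text as the dict key in match silently merges duplicate sentences into one entry. A hedged alternative sketch that groups by index instead, so every document keeps its label:

from collections import defaultdict

output_by_idx = defaultdict(list)
for idx, label in enumerate(labels):
    output_by_idx[label].append(corpus[idx])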
# Exporting the result to txt files
with open("sentence_plus_cluster_80_idf_True_Predicted_MiniBatch.txt", "w") as file:
    for i in range(len(output)):
        if output.get(i) is not None:
            file.write("Cluster " + str(i) + '\n' + 'Len of clust ' + str(len(output.get(i))) + '\n')
            for x in output.get(i):
                file.write(x + '\n')

with open("top_words_per_clust_80_idf_True_PRedicted_MiniBatch.txt", "w") as file:
    file.write("KMeans")  # %d" % n_clusters + '\n')
    for i in range(n_clusters):
        print("Cluster %d:" % i, end='')
        file.write("Cluster %d:" % i)
        for ind in order_centroids[i, :20]:
            print(' %s' % terms[ind], end='')
            file.write(terms[ind] + '\n')
        print()