Optimizing for text clustering
Improving the TF-IDF feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=2,
    max_df=0.8,
    token_pattern=r"\b\w+\b"
)
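One caveat: with token_pattern=r"\b\w+\b", unsegmented Chinese text produces very coarse tokens, because consecutive Chinese characters contain no word boundaries. The text later fed to this vectorizer (comments_to_cluster) is therefore assumed to be pre-segmented and space-joined. A minimal sketch of that preprocessing step, where raw_comments is a hypothetical list of unsegmented comment strings not taken from the original code:

import jieba

# segment each comment with jieba and re-join with spaces so that
# token_pattern=r"\b\w+\b" can split the text into individual words
comments_to_cluster = [" ".join(jieba.lcut(text)) for text in raw_comments]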
Dynamically selecting the best number of clusters n_clusters
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = tfidf.fit_transform(comments_to_cluster)
best_k = 0
best_silhouette = -1
for k in range(5, 9):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    if score > best_silhouette:
        best_silhouette = score
        best_k = k
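It can also help to record the silhouette score for every candidate k rather than only the winner, so the choice of best_k is easy to audit. A small sketch of that variation (the scores dict below is new here, not part of the original code):

scores = {}
for k in range(5, 9):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
    scores[k] = silhouette_score(X, labels)

for k, s in scores.items():
    print(f"k={k}: silhouette={s:.4f}")
print(f"chosen best_k = {max(scores, key=scores.get)}")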
Improving the clustering algorithm
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

kmeans_predictor = make_pipeline(
    TfidfVectorizer(
        tokenizer=jieba.lcut,        # segment Chinese text with jieba
        ngram_range=(1, 2),
        max_features=5000,
        min_df=2,
        max_df=0.8,
        token_pattern=r"\b\w+\b"     # ignored when tokenizer is set; sklearn will warn
    ),
    Normalizer(norm="l2"),           # L2-normalize so KMeans approximates cosine distance
    KMeans(n_clusters=best_k, random_state=42, n_init=10)
)
comments_data_clean = comments_data[comments_data["sentiment_category"].isin([1, 3])]

kmeans_predictor.fit(comments_data_clean["comment_text"])
kmeans_cluster_label = kmeans_predictor.predict(comments_data_clean["comment_text"])

kmeans_top_word = []
tfidf_vectorizer = kmeans_predictor.named_steps["tfidfvectorizer"]
kmeans_model = kmeans_predictor.named_steps["kmeans"]
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_centers = kmeans_model.cluster_centers_
for i in range(kmeans_model.n_clusters):
    # sort the cluster center weights in descending order to find the dominant terms
    top_feature_indices = cluster_centers[i].argsort()[::-1]
    # top_n_words (the number of keywords per theme) is assumed to be defined earlier
    top_word = " ".join([feature_names[idx] for idx in top_feature_indices[:top_n_words]])
    kmeans_top_word.append(top_word)

comments_data.loc[
    comments_data["sentiment_category"].isin([1, 3]),
    "positive_cluster_theme",
] = [kmeans_top_word[x] for x in kmeans_cluster_label]
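To quickly sanity-check the assignment, the distribution of comments over the extracted themes can be summarized with a value count. This snippet is a follow-up, not part of the original code:

# how many of the clustered comments fall under each extracted theme
print(
    comments_data.loc[
        comments_data["sentiment_category"].isin([1, 3]),
        "positive_cluster_theme",
    ].value_counts()
)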
Submission score