from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import cossim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# If you have not yet downloaded NLTK's tokenizer and stopword data,
# uncomment the lines below and run them once
# nltk.download('punkt')
# nltk.download('punkt_tab')  # also needed on newer NLTK releases
# nltk.download('stopwords')
# Preprocessing function
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]  # keep alphabetic tokens only
    tokens = [word for word in tokens if word not in stop_words]  # drop English stopwords
    return tokens
# Sample documents
documents = [
? ? "Text processing using LDA is interesting.",
? ? "Another document example for LDA.",
? ? "Text mining and natural language processing.",
? ? "LDA helps in topic modeling and finding patterns.",
? ? "This document is for testing LDA similarity."
]
# Preprocess the documents
texts = [preprocess(doc) for doc in documents]
# Build the dictionary
dictionary = corpora.Dictionary(texts)
# Convert each document to a bag-of-words vector
corpus = [dictionary.doc2bow(text) for text in texts]
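# doc2bow represents each document as a sparse list of (token_id, count) pairs;
# tokens missing from the dictionary (e.g. unseen words in a new document) are silently dropped.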
# Train the LDA model
num_topics = 2
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
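# Note: LDA training is stochastic, so the learned topics can differ between runs;
# passing e.g. random_state=42 to LdaModel makes the result reproducible.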
# Infer the topic distribution of a new document
new_doc = "New text for testing similarity with LDA."
new_doc_preprocessed = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)
new_doc_topics = lda_model.get_document_topics(new_doc_bow)
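# get_document_topics returns a sparse list of (topic_id, probability) pairs and,
# by default, omits topics below the model's minimum_probability threshold (0.01);
# pass minimum_probability=0.0 to compare full distributions.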
# Get the topic distribution of each original document
doc_topics = [lda_model.get_document_topics(doc_bow) for doc_bow in corpus]
# Compute the similarity between the new document and each original document
similarities = []
for i, doc_topic in enumerate(doc_topics):
    similarity = cossim(new_doc_topics, doc_topic)
    similarities.append((i, similarity))
# Print the similarity results
print("Similarity between new document and each original document:")
for i, similarity in similarities:
? ? print(f"Document {i}: Similarity = {similarity}")
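# A possible follow-up (not in the original script): rank the documents by
# similarity and print the learned topics to sanity-check the model.
best_doc, best_score = max(similarities, key=lambda pair: pair[1])
print(f"Most similar document: {best_doc} (score {best_score:.4f})")
for topic_id, topic_terms in lda_model.print_topics(num_topics=num_topics, num_words=5):
    print(f"Topic {topic_id}: {topic_terms}")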