Complete PyTorch code for Word2Vec (the Skip-gram model), trained on a Chinese corpus, covering data preprocessing, model definition, training, and testing.
1. Key Features
- Works with Chinese text, using jieba for word segmentation
- Trains a Skip-gram model, suited to small datasets
- Provides a negative_sampling() helper based on a 0.75-power unigram distribution (the simple training loop below still uses full-softmax cross-entropy; see section 5)
- Uses cosine similarity to find similar words

Complete code:
```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import jieba
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

# ========== 1. Data preprocessing ==========
corpus = [
    "我們 喜歡 深度 學習",
    "自然 語言 處理 是 有趣 的",
    "人工智能 改變 了 世界",
    "深度 學習 是 人工智能 的 重要 組成部分",
]

# Hyperparameters
window_size = 2        # context window size
embedding_dim = 10     # word-vector dimension
num_epochs = 100       # number of training epochs
learning_rate = 0.01   # learning rate
batch_size = 4         # batch size (not used by the per-pair loop below)
neg_samples = 5        # number of negative samples per positive pair

# Tokenize and build the vocabulary (drop the whitespace tokens jieba emits
# for the pre-spaced sentences)
tokenized_corpus = [[w for w in jieba.cut(sentence) if w.strip()] for sentence in corpus]
vocab = set(word for sentence in tokenized_corpus for word in sentence)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

# Count word frequencies
word_counts = Counter(word for sentence in tokenized_corpus for word in sentence)
total_words = sum(word_counts.values())

# Negative-sampling distribution: unigram frequencies raised to the 0.75 power, renormalized
word_freqs = {word: count / total_words for word, count in word_counts.items()}
word_powers = {word: freq ** 0.75 for word, freq in word_freqs.items()}
Z = sum(word_powers.values())
word_distribution = {word: prob / Z for word, prob in word_powers.items()}

# Negative-sampling helper
def negative_sampling(positive_word, num_samples=5):
    words = list(word_distribution.keys())
    probabilities = list(word_distribution.values())
    negatives = []
    while len(negatives) < num_samples:
        neg = np.random.choice(words, p=probabilities)
        if neg != positive_word:
            negatives.append(neg)
    return negatives

# Generate Skip-gram training pairs (center word, context word)
data = []
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    for center_idx in range(len(indices)):
        center_word = indices[center_idx]
        for offset in range(-window_size, window_size + 1):
            context_idx = center_idx + offset
            if 0 <= context_idx < len(indices) and context_idx != center_idx:
                context_word = indices[context_idx]
                data.append((center_word, context_word))

# Convert the pairs to PyTorch tensors
data = [(torch.tensor(center), torch.tensor(context)) for center, context in data]

# ========== 2. Define the Word2Vec (Skip-gram) model ==========
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word):
        embed = self.embedding(center_word)   # look up the center-word vector
        out = self.output_layer(embed)        # logits over the vocabulary
        return out

# Initialize the model
model = Word2Vec(len(vocab), embedding_dim)

# ========== 3. Train Word2Vec ==========
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    total_loss = 0
    random.shuffle(data)  # shuffle the pairs every epoch
    for center_word, context_word in data:
        optimizer.zero_grad()
        output = model(center_word.unsqueeze(0))             # predicted logits over the vocabulary
        loss = criterion(output, context_word.unsqueeze(0))  # cross-entropy against the true context word
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")

# ========== 4. Test the word vectors ==========
word_vectors = model.embedding.weight.data.numpy()

# Find the most similar words via cosine similarity
def most_similar(word, top_n=3):
    if word not in word2idx:
        return "Word not in vocabulary"
    word_vec = word_vectors[word2idx[word]].reshape(1, -1)
    similarities = cosine_similarity(word_vec, word_vectors)[0]
    # Take the top_n highest similarities, excluding the word itself
    similar_idx = similarities.argsort()[::-1][1:top_n + 1]
    return [(idx2word[idx], similarities[idx]) for idx in similar_idx]

# Query similar words
test_words = ["深度", "學習", "人工智能"]
for word in test_words:
    print(f"Words similar to '{word}':", most_similar(word))
```
Data preprocessing
- Uses jieba.cut() for word segmentation
- Builds the word2idx and idx2word mappings
- Generates (center word, context word) training pairs with a sliding window (see the sketch after this list)
- Implements negative_sampling() to improve training efficiency
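
To make the sliding-window step concrete, here is a minimal standalone sketch of the pairs it produces for one already-tokenized sentence (the example sentence is illustrative; the real code works on indices from word2idx):

```python
# Minimal sketch: skip-gram (center, context) pairs for one tokenized sentence.
# With window_size = 2, each word is paired with up to two neighbors on each side.
sentence = ["我們", "喜歡", "深度", "學習"]
window_size = 2

pairs = []
for i, center in enumerate(sentence):
    for offset in range(-window_size, window_size + 1):
        j = i + offset
        if 0 <= j < len(sentence) and j != i:
            pairs.append((center, sentence[j]))

print(pairs)
# [('我們', '喜歡'), ('我們', '深度'),
#  ('喜歡', '我們'), ('喜歡', '深度'), ('喜歡', '學習'),
#  ('深度', '我們'), ('深度', '喜歡'), ('深度', '學習'),
#  ('學習', '喜歡'), ('學習', '深度')]
```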
Model
- An Embedding layer learns the word vectors
- A Linear layer maps each center-word vector to a score (logit) for every word in the vocabulary (a quick shape check follows this list)
- CrossEntropyLoss measures how well the predicted distribution matches the true context word
- Adam performs the gradient updates
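
As a small sanity-check sketch (assuming the model and vocab from the code above are in scope), each batch of center-word indices maps to one row of vocabulary logits:

```python
# Sketch: shape check for the forward pass (assumes `model` and `vocab` from above).
some_ids = torch.tensor(list(range(min(2, len(vocab)))))  # two arbitrary word indices
logits = model(some_ids)                                  # shape: [2, len(vocab)]
probs = torch.softmax(logits, dim=-1)                     # softmax turns logits into probabilities
print(logits.shape, probs.sum(dim=-1))                    # each row sums to 1
```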
Computing word similarity
- Uses cosine_similarity to compare word vectors (a minimal formula sketch follows below)
- Returns the top_n most similar words, excluding the query word itself
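
For reference, the cosine similarity of two vectors u and v is their dot product divided by the product of their norms, u·v / (‖u‖‖v‖); a minimal NumPy equivalent of the sklearn call used above:

```python
# Sketch: cosine similarity by hand, matching sklearn's cosine_similarity for a single pair.
import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

# e.g. cosine(word_vectors[0], word_vectors[1]) compares the first two vocabulary entries
```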
5. Possible Improvements
- Train on a larger Chinese corpus (e.g. THUCNews)
- Visualize the word vectors with t-SNE
- Train with negative sampling to improve efficiency (a sketch follows below)
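
A minimal sketch, under the assumptions above, of what switching to skip-gram with negative sampling (SGNS) could look like: two embedding tables and a binary logistic loss over one positive pair plus a few sampled negatives, reusing the negative_sampling() helper already defined. The class and attribute names (SGNS, in_embed, out_embed) are illustrative, not part of the original code.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Sketch only: skip-gram with negative sampling, assuming vocab, word2idx, idx2word,
# data, neg_samples and negative_sampling() from the code above are in scope.
class SGNS(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)   # center-word vectors
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)  # context-word vectors

    def forward(self, center, context, negatives):
        v = self.in_embed(center)                                  # [B, D]
        u_pos = self.out_embed(context)                            # [B, D]
        u_neg = self.out_embed(negatives)                          # [B, K, D]
        pos_score = (v * u_pos).sum(dim=-1)                        # [B]
        neg_score = torch.bmm(u_neg, v.unsqueeze(-1)).squeeze(-1)  # [B, K]
        # Maximize log sigmoid(positive score) + sum of log sigmoid(-negative scores)
        return -(F.logsigmoid(pos_score).mean() + F.logsigmoid(-neg_score).mean())

# Usage idea, mirroring the one-pair-at-a-time loop above:
# sgns = SGNS(len(vocab), embedding_dim)
# center, context = data[0]
# negs = torch.tensor([word2idx[w] for w in negative_sampling(idx2word[int(context)], neg_samples)])
# loss = sgns(center.unsqueeze(0), context.unsqueeze(0), negs.unsqueeze(0))
```

Scoring only a handful of sampled negatives per positive pair, instead of a softmax over the whole vocabulary, is what lets this objective scale to much larger vocabularies than the basic loop above.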