Complete PyTorch code for Word2Vec (the Skip-gram model), trained on a Chinese corpus, covering data preprocessing, model definition, training, and testing.
1. Key Features
- Works with Chinese text, using jieba for word segmentation
- Trains a Skip-gram model, suited to small datasets
- Provides a negative_sampling() helper based on a 0.75-power unigram distribution (the simple training loop below still uses full-softmax cross-entropy; see section 5)
- Uses cosine similarity to find similar words

Complete code:
```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import jieba
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

# ========== 1. Data preprocessing ==========
corpus = [
    "我們 喜歡 深度 學習",
    "自然 語言 處理 是 有趣 的",
    "人工智能 改變 了 世界",
    "深度 學習 是 人工智能 的 重要 組成部分",
]

# Hyperparameters
window_size = 2        # context window size
embedding_dim = 10     # word-vector dimension
num_epochs = 100       # number of training epochs
learning_rate = 0.01   # learning rate
batch_size = 4         # batch size (not used by the per-pair loop below)
neg_samples = 5        # number of negative samples per positive pair

# Tokenize and build the vocabulary (drop the whitespace tokens jieba emits
# for the pre-spaced sentences)
tokenized_corpus = [[w for w in jieba.cut(sentence) if w.strip()] for sentence in corpus]
vocab = set(word for sentence in tokenized_corpus for word in sentence)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

# Count word frequencies
word_counts = Counter(word for sentence in tokenized_corpus for word in sentence)
total_words = sum(word_counts.values())

# Negative-sampling distribution: unigram frequencies raised to the 0.75 power, renormalized
word_freqs = {word: count / total_words for word, count in word_counts.items()}
word_powers = {word: freq ** 0.75 for word, freq in word_freqs.items()}
Z = sum(word_powers.values())
word_distribution = {word: prob / Z for word, prob in word_powers.items()}

# Negative-sampling helper
def negative_sampling(positive_word, num_samples=5):
    words = list(word_distribution.keys())
    probabilities = list(word_distribution.values())
    negatives = []
    while len(negatives) < num_samples:
        neg = np.random.choice(words, p=probabilities)
        if neg != positive_word:
            negatives.append(neg)
    return negatives

# Generate Skip-gram training pairs (center word, context word)
data = []
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    for center_idx in range(len(indices)):
        center_word = indices[center_idx]
        for offset in range(-window_size, window_size + 1):
            context_idx = center_idx + offset
            if 0 <= context_idx < len(indices) and context_idx != center_idx:
                context_word = indices[context_idx]
                data.append((center_word, context_word))

# Convert the pairs to PyTorch tensors
data = [(torch.tensor(center), torch.tensor(context)) for center, context in data]

# ========== 2. Define the Word2Vec (Skip-gram) model ==========
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word):
        embed = self.embedding(center_word)   # look up the center-word vector
        out = self.output_layer(embed)        # logits over the vocabulary
        return out

# Initialize the model
model = Word2Vec(len(vocab), embedding_dim)

# ========== 3. Train Word2Vec ==========
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    total_loss = 0
    random.shuffle(data)  # shuffle the pairs every epoch
    for center_word, context_word in data:
        optimizer.zero_grad()
        output = model(center_word.unsqueeze(0))             # predicted logits over the vocabulary
        loss = criterion(output, context_word.unsqueeze(0))  # cross-entropy against the true context word
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")

# ========== 4. Test the word vectors ==========
word_vectors = model.embedding.weight.data.numpy()

# Find the most similar words via cosine similarity
def most_similar(word, top_n=3):
    if word not in word2idx:
        return "Word not in vocabulary"
    word_vec = word_vectors[word2idx[word]].reshape(1, -1)
    similarities = cosine_similarity(word_vec, word_vectors)[0]
    # Take the top_n highest similarities, excluding the word itself
    similar_idx = similarities.argsort()[::-1][1:top_n + 1]
    return [(idx2word[idx], similarities[idx]) for idx in similar_idx]

# Query similar words
test_words = ["深度", "學習", "人工智能"]
for word in test_words:
    print(f"Words similar to '{word}':", most_similar(word))
```
Data preprocessing
- Uses jieba.cut() for word segmentation
- Builds the word2idx and idx2word mappings
- Generates (center word, context word) training pairs with a sliding window (see the sketch after this list)
- Implements negative_sampling() to improve training efficiency
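
To make the sliding-window step concrete, here is a minimal standalone sketch of the pairs it produces for one already-tokenized sentence (the example sentence is illustrative; the real code works on indices from word2idx):

```python
# Minimal sketch: skip-gram (center, context) pairs for one tokenized sentence.
# With window_size = 2, each word is paired with up to two neighbors on each side.
sentence = ["我們", "喜歡", "深度", "學習"]
window_size = 2

pairs = []
for i, center in enumerate(sentence):
    for offset in range(-window_size, window_size + 1):
        j = i + offset
        if 0 <= j < len(sentence) and j != i:
            pairs.append((center, sentence[j]))

print(pairs)
# [('我們', '喜歡'), ('我們', '深度'),
#  ('喜歡', '我們'), ('喜歡', '深度'), ('喜歡', '學習'),
#  ('深度', '我們'), ('深度', '喜歡'), ('深度', '學習'),
#  ('學習', '喜歡'), ('學習', '深度')]
```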
Model
- An Embedding layer learns the word vectors
- A Linear layer maps each center-word vector to a score (logit) for every word in the vocabulary (a quick shape check follows this list)
- CrossEntropyLoss measures how well the predicted distribution matches the true context word
- Adam performs the gradient updates
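
As a small sanity-check sketch (assuming the model and vocab from the code above are in scope), each batch of center-word indices maps to one row of vocabulary logits:

```python
# Sketch: shape check for the forward pass (assumes `model` and `vocab` from above).
some_ids = torch.tensor(list(range(min(2, len(vocab)))))  # two arbitrary word indices
logits = model(some_ids)                                  # shape: [2, len(vocab)]
probs = torch.softmax(logits, dim=-1)                     # softmax turns logits into probabilities
print(logits.shape, probs.sum(dim=-1))                    # each row sums to 1
```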
Computing word similarity
- Uses cosine_similarity to compare word vectors (a minimal formula sketch follows below)
- Returns the top_n most similar words, excluding the query word itself
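
For reference, the cosine similarity of two vectors u and v is their dot product divided by the product of their norms, u·v / (‖u‖‖v‖); a minimal NumPy equivalent of the sklearn call used above:

```python
# Sketch: cosine similarity by hand, matching sklearn's cosine_similarity for a single pair.
import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

# e.g. cosine(word_vectors[0], word_vectors[1]) compares the first two vocabulary entries
```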
5. Possible Improvements
- Train on a larger Chinese corpus (e.g. THUCNews)
- Visualize the word vectors with t-SNE
- Train with negative sampling to improve efficiency (a sketch follows below)
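
A minimal sketch, under the assumptions above, of what switching to skip-gram with negative sampling (SGNS) could look like: two embedding tables and a binary logistic loss over one positive pair plus a few sampled negatives, reusing the negative_sampling() helper already defined. The class and attribute names (SGNS, in_embed, out_embed) are illustrative, not part of the original code.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Sketch only: skip-gram with negative sampling, assuming vocab, word2idx, idx2word,
# data, neg_samples and negative_sampling() from the code above are in scope.
class SGNS(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)   # center-word vectors
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)  # context-word vectors

    def forward(self, center, context, negatives):
        v = self.in_embed(center)                                  # [B, D]
        u_pos = self.out_embed(context)                            # [B, D]
        u_neg = self.out_embed(negatives)                          # [B, K, D]
        pos_score = (v * u_pos).sum(dim=-1)                        # [B]
        neg_score = torch.bmm(u_neg, v.unsqueeze(-1)).squeeze(-1)  # [B, K]
        # Maximize log sigmoid(positive score) + sum of log sigmoid(-negative scores)
        return -(F.logsigmoid(pos_score).mean() + F.logsigmoid(-neg_score).mean())

# Usage idea, mirroring the one-pair-at-a-time loop above:
# sgns = SGNS(len(vocab), embedding_dim)
# center, context = data[0]
# negs = torch.tensor([word2idx[w] for w in negative_sampling(idx2word[int(context)], neg_samples)])
# loss = sgns(center.unsqueeze(0), context.unsqueeze(0), negs.unsqueeze(0))
```

Scoring only a handful of sampled negatives per positive pair, instead of a softmax over the whole vocabulary, is what lets this objective scale to much larger vocabularies than the basic loop above.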