完整代碼:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np# 增強的數據集:更多的垃圾郵件與正常郵件樣本
# Toy corpus: a mix of spam and legitimate e-mails.
X = [
    "Congratulations! You've won a $1000 gift card. Claim it now!",
    "Dear friend, I hope you are doing well. Let's catch up soon.",
    "Urgent: Your bank account has been compromised. Please contact support immediately.",
    "Hello, just wanted to confirm our meeting at 2 PM today.",
    "You have a new message from your friend. Click here to read.",
    "Get a free iPhone now! Limited offer, click here.",
    "Last chance to claim your prize, you won $500!",
    "Meeting scheduled for tomorrow. Please confirm.",
    "Hello! You are invited to an exclusive event!",
    "Click here to get free lottery tickets. Hurry up!",
    "Reminder: Your subscription will expire soon, renew now.",
    "Don't forget to submit your report by end of day today.",
]
# Labels aligned with X: 1 = spam, 0 = not spam.
y = [1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0]

# Split the RAW texts before vectorizing. Fitting TF-IDF on the full corpus
# would let the test e-mails influence the vocabulary and IDF weights
# (data leakage), which makes the reported test accuracy overly optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Vectorize with TF-IDF, dropping English stop words. The vectorizer is
# fitted on the training split only; the test split is merely transformed.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_text).toarray()
X_test = vectorizer.transform(X_test_text).toarray()

# Whole corpus expressed in the train-fitted feature space. Kept so the
# X_vec name stays available to any code that still refers to it.
X_vec = vectorizer.transform(X).toarray()

# Define the logistic regression model
class LogisticRegressionModel(nn.Module):
    """Logistic regression as a single linear layer followed by a sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Linear layer: input_dim features in, a single value out.
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return sigmoid(fc(x)), i.e. one value between 0 and 1 per row."""
        return torch.sigmoid(self.fc(x))


# Define the training procedure
def train_model(model, X_train, y_train, num_epochs=200, learning_rate=0.001):
    """Train ``model`` in place using full-batch Adam and binary cross-entropy.

    Args:
        model: Module whose forward pass returns one probability per sample
            (``nn.BCELoss`` is applied directly to its output).
        X_train: Training features; converted to a float32 tensor.
        y_train: Training labels; converted to a float32 tensor and reshaped
            to a single column so it lines up with the model output.
        num_epochs: Number of optimisation steps over the whole training set.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        None. The model's parameters are updated in place and the loss is
        printed every 10 epochs.
    """
    criterion = nn.BCELoss()  # binary cross-entropy loss
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

    # Training mode does not change between epochs, so set it once up front.
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


# Evaluate the model
def evaluate_model(model, X_test, y_test):
    """Print the accuracy of ``model`` on a held-out set.

    Args:
        model: Module whose forward pass returns one probability per sample.
        X_test: Test features; converted to a float32 tensor.
        y_test: True labels, compared against the thresholded predictions.

    Returns:
        None. The accuracy is printed as a percentage.
    """
    model.eval()
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

    with torch.no_grad():
        outputs = model(X_test_tensor)

    # Threshold the probabilities at 0.5 and flatten to 1-D so the
    # predictions have the same shape as the label sequence.
    predictions = (outputs >= 0.5).float().reshape(-1)
    accuracy = accuracy_score(y_test, predictions.numpy())
    print(f'Accuracy: {accuracy * 100:.2f}%')


# Train and evaluate the model
# The model's input width is the number of feature columns in X_train.
input_dim = X_train.shape[1]
model = LogisticRegressionModel(input_dim)

# Fit on the training split, then report accuracy on the test split.
train_model(
    model,
    X_train,
    y_train,
    num_epochs=200,
    learning_rate=0.001,
)
evaluate_model(model, X_test, y_test)

# Predict new e-mails
def predict(model, new_email):
    """Classify a single e-mail text as ``"Spam"`` or ``"Not Spam"``.

    The text is vectorized with the module-level ``vectorizer`` so that it
    is mapped into the same feature space the model was built for.

    Args:
        model: Module whose forward pass returns a probability.
        new_email: The e-mail text to classify.

    Returns:
        ``"Spam"`` when the predicted probability is at least 0.5,
        otherwise ``"Not Spam"``.
    """
    model.eval()
    new_email_vec = vectorizer.transform([new_email]).toarray()
    new_email_tensor = torch.tensor(new_email_vec, dtype=torch.float32)

    with torch.no_grad():
        prediction = model(new_email_tensor)

    # One sample in, one probability out: compare it as a plain Python float.
    return "Spam" if prediction.item() >= 0.5 else "Not Spam"


# Classify new e-mails
email_1 = "Congratulations! You have a limited time offer for a free cruise."
email_2 = "Hi, let's discuss the project updates tomorrow."

# The outputs noted below are what is expected, not guaranteed: the model
# is trained on a very small corpus.
print(f"Email 1: {predict(model, email_1)}")  # likely output: Spam
print(f"Email 2: {predict(model, email_2)}")  # likely output: Not Spam
1. 數據預處理
- 準備數據集:包含垃圾郵件(Spam)和正常郵件(Not Spam)。
- 文本向量化:使用 `TfidfVectorizer` 將文本轉換為數值特征,使模型能夠處理。
- 去除停用詞:排除無意義的常見詞(如 "the", "is", "and"),提高模型性能。
2. 訓練集與測試集劃分
- 將數據集拆分為訓練集和測試集,以 67% 訓練,33% 測試,保證模型有足夠數據訓練,同時可以評估其泛化能力。
3. 邏輯回歸模型
- 搭建 PyTorch 邏輯回歸模型:
  - 采用 `nn.Linear()` 構建一個單層神經網絡(輸入為文本特征,輸出為 1 個數值)。
  - 使用 `sigmoid` 作為激活函數,將輸出轉換為 0-1 之間的概率值。
4. 訓練模型
- 定義損失函數:使用二元交叉熵損失 (`BCELoss`),適用于二分類問題。
- 優化器:采用 `Adam` 優化器,以 `0.001` 學習率進行參數優化。
- 訓練流程:
  - 計算前向傳播的輸出。
  - 計算損失值,衡量預測結果與真實標簽的差距。
  - 進行反向傳播,更新權重參數。
  - 迭代多輪(如 200 輪),不斷優化模型。
5. 評估模型
- 將測試數據輸入模型,預測結果并與真實標簽進行對比。
- 計算準確率,評估模型在未見過的數據上的表現。
6. 預測新郵件
- 將新郵件轉換為數值特征(與訓練時相同的方法)。
- 使用訓練好的模型進行預測。
- 閾值判斷:如果輸出概率 ≥ 0.5,則判斷為垃圾郵件,否則為正常郵件。