基于jieba分詞的文本多分類
- 目標
- 數據準備
- 參數配置
- 數據處理
- 模型構建
- 主程序
- 測試與評估
- 測試結果
目標
本文基于給定的詞表,將輸入的文本基于jieba分詞分割為若干個詞,然后將詞基于詞表進行初步編碼,之后經過網絡層,輸出在已知類別標簽上的概率分布,從而實現一個簡單文本的多分類。
數據準備
詞表文件chars.txt
類別標簽文件schema.json
{"停機保號": 0,"密碼重置": 1,"寬泛業務問題": 2,"親情號碼設置與修改": 3,"固話密碼修改": 4,"來電顯示開通": 5,"親情號碼查詢": 6,"密碼修改": 7,"無線套餐變更": 8,"月返費查詢": 9,"移動密碼修改": 10,"固定寬帶服務密碼修改": 11,"UIM反查手機號": 12,"有限寬帶障礙報修": 13,"暢聊套餐變更": 14,"呼叫轉移設置": 15,"短信套餐取消": 16,"套餐余量查詢": 17,"緊急停機": 18,"VIP密碼修改": 19,"移動密碼重置": 20,"彩信套餐變更": 21,"積分查詢": 22,"話費查詢": 23,"短信套餐開通立即生效": 24,"固話密碼重置": 25,"解掛失": 26,"掛失": 27,"無線寬帶密碼修改": 28
}
訓練集數據 train.json
驗證集數據 valid.json
參數配置
config.py
# -*- coding: utf-8 -*-
"""
Configuration parameters for the text-classification pipeline.
"""

Config = {
    "model_path": "model_output",            # directory where checkpoints are saved
    "schema_path": "../data/schema.json",    # label -> class-index mapping
    "train_data_path": "../data/train.json",
    "valid_data_path": "../data/valid.json",
    "vocab_path": "../chars.txt",            # character (or word) vocabulary file
    "max_length": 20,                        # sequences are padded/truncated to this length
    "hidden_size": 128,
    "epoch": 10,
    "batch_size": 32,
    "optimizer": "adam",                     # "adam" or "sgd"
    "learning_rate": 1e-3,
}
數據處理
loader.py
# -*- coding: utf-8 -*-import json
import re
import os
import torch
import random
import jieba
import numpy as np
from torch.utils.data import Dataset, DataLoader

"""
Data loading.
"""

class DataGenerator:
    """Load a JSONL dataset and encode each sentence as a fixed-length
    sequence of vocabulary indices paired with its class-label index.

    Side effect: writes "vocab_size" and "class_num" back into config so the
    model can be built from the same dict.
    """

    def __init__(self, data_path, config):
        self.config = config
        self.path = data_path
        self.vocab = load_vocab(config["vocab_path"])
        self.config["vocab_size"] = len(self.vocab)
        self.schema = load_schema(config["schema_path"])
        self.config["class_num"] = len(self.schema)
        self.load()

    def load(self):
        """Parse the JSONL file.

        Each line is either a dict {"questions": [...], "target": label}
        (training format) or a [question, label] pair (validation format).
        """
        self.data = []
        with open(self.path, encoding="utf8") as f:
            for line in f:
                line = json.loads(line)
                if isinstance(line, dict):
                    # training format: several paraphrased questions share one label
                    questions = line["questions"]
                    label = line["target"]
                    label_index = torch.LongTensor([self.schema[label]])
                    for question in questions:
                        input_id = self.encode_sentence(question)
                        input_id = torch.LongTensor(input_id)
                        self.data.append([input_id, label_index])
                else:
                    assert isinstance(line, list)
                    question, label = line
                    input_id = self.encode_sentence(question)
                    input_id = torch.LongTensor(input_id)
                    label_index = torch.LongTensor([self.schema[label]])
                    self.data.append([input_id, label_index])
        return

    def encode_sentence(self, text):
        """Map text to vocabulary indices: jieba word segmentation for a word
        vocabulary, per-character lookup otherwise. Unknown tokens map to [UNK].
        """
        input_id = []
        # FIX: the original compared the full path against "words.txt", which
        # can never match a real path such as "../words.txt"; compare the
        # trailing file name instead.
        if self.config["vocab_path"].endswith("words.txt"):
            for word in jieba.cut(text):
                input_id.append(self.vocab.get(word, self.vocab["[UNK]"]))
        else:
            for char in text:
                input_id.append(self.vocab.get(char, self.vocab["[UNK]"]))
        input_id = self.padding(input_id)
        return input_id

    # Pad or truncate the sequence so a whole batch can be computed together.
    def padding(self, input_id):
        input_id = input_id[:self.config["max_length"]]
        input_id += [0] * (self.config["max_length"] - len(input_id))
        return input_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

# load the character or word vocabulary
def load_vocab(vocab_path):
    """Read one token per line and return a token -> index mapping.

    Index 0 is reserved for the padding position, so numbering starts at 1.
    """
    with open(vocab_path, encoding="utf8") as f:
        return {line.strip(): idx for idx, line in enumerate(f, start=1)}

# load the schema
def load_schema(schema_path):
    """Return the label -> class-index mapping parsed from a JSON file."""
    with open(schema_path, encoding="utf8") as f:
        return json.load(f)

# wrap the dataset with torch's built-in DataLoader
def load_data(data_path, config, shuffle=True):
    """Build a DataGenerator for data_path and wrap it in a DataLoader."""
    generator = DataGenerator(data_path, config)
    return DataLoader(generator, batch_size=config["batch_size"], shuffle=shuffle)


if __name__ == "__main__":
    from config import Config

    dg = DataGenerator("valid_tag_news.json", Config)
    print(dg[1])
主要實現一個自定義數據加載器 DataGenerator,用于加載和處理文本數據。它通過詞匯表和標簽映射將輸入文本轉化為索引序列,并進行補齊或截斷。
模型構建
model.py
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
from torch.optim import Adam, SGD

"""
Network model definition.
"""


class TorchModel(nn.Module):
    """Average-pooled text classifier.

    Pipeline: embedding -> linear -> average pooling over the sequence ->
    linear classifier. Loss is cross entropy. The relu/dropout attributes are
    created but not applied in forward (kept for interface compatibility).
    """

    def __init__(self, config):
        super(TorchModel, self).__init__()
        hidden_size = config["hidden_size"]
        vocab_size = config["vocab_size"] + 1  # +1 because index 0 is padding
        max_length = config["max_length"]
        class_num = config["class_num"]
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        self.layer = nn.Linear(hidden_size, hidden_size)
        self.classify = nn.Linear(hidden_size, class_num)
        self.pool = nn.AvgPool1d(max_length)
        self.activation = torch.relu  # relu as activation (NOTE: unused in forward)
        self.dropout = nn.Dropout(0.1)  # NOTE: unused in forward
        self.loss = nn.functional.cross_entropy  # cross-entropy loss

    def forward(self, x, target=None):
        """Return the loss when a target is given, otherwise the class logits.

        x: (batch_size, sen_len) LongTensor of vocabulary indices.
        target: optional (batch_size, 1) LongTensor of class indices.
        """
        x = self.embedding(x)  # (batch_size, sen_len) -> (batch_size, sen_len, hidden)
        x = self.layer(x)      # (batch_size, sen_len, hidden)
        # FIX: squeeze only the pooled length dimension; a bare .squeeze()
        # also dropped the batch dimension when batch_size == 1.
        x = self.pool(x.transpose(1, 2)).squeeze(-1)  # (batch_size, hidden)
        predict = self.classify(x)  # (batch_size, class_num)
        if target is not None:
            # FIX: squeeze(-1) keeps the target 1-D even when batch_size == 1
            # (a bare .squeeze() produced a 0-dim tensor there).
            return self.loss(predict, target.squeeze(-1))
        return predict


def choose_optimizer(config, model):
    """Build the optimizer named by config["optimizer"] ("adam" or "sgd")."""
    optimizer = config["optimizer"]
    learning_rate = config["learning_rate"]
    if optimizer == "adam":
        return Adam(model.parameters(), lr=learning_rate)
    elif optimizer == "sgd":
        return SGD(model.parameters(), lr=learning_rate)
    # FIX: fail loudly instead of silently returning None
    raise ValueError("unknown optimizer: %s" % optimizer)
定義了一個神經網絡模型 TorchModel
,繼承自 nn.Module
,用于文本分類任務。模型包括嵌入層、線性層、平均池化層和分類層,使用 ReLU 激活函數和 Dropout 防止過擬合。前向傳播根據輸入返回預測值或損失值(若提供標簽)。choose_optimizer
函數根據配置選擇 Adam 或 SGD 優化器,并設置學習率。模型通過交叉熵損失進行訓練。
主程序
main.py
# -*- coding: utf-8 -*-
import torch
import os
import random
import numpy as np
import logging

from config import Config
from model import TorchModel, choose_optimizer
from evaluate import Evaluator
from loader import load_data, load_schema

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

"""
Model training entry point.
"""


def main(config):
    """Train the classifier described by config; return (model, train_data)."""
    # create the directory for saved models
    if not os.path.isdir(config["model_path"]):
        os.mkdir(config["model_path"])
    # load training data
    train_data = load_data(config["train_data_path"], config)
    # build the model
    model = TorchModel(config)
    # move the model to GPU when one is available
    cuda_flag = torch.cuda.is_available()
    if cuda_flag:
        logger.info("gpu可以使用,遷移模型至gpu")
        model = model.cuda()
    # optimizer and evaluation helper
    optimizer = choose_optimizer(config, model)
    evaluator = Evaluator(config, model, logger)
    # training loop
    for epoch in range(config["epoch"]):
        epoch += 1
        model.train()
        logger.info("epoch %d begin" % epoch)
        train_loss = []
        # FIX: guard against ZeroDivisionError when the loader has < 2 batches
        log_interval = max(1, len(train_data) // 2)
        for index, batch_data in enumerate(train_data):
            optimizer.zero_grad()
            if cuda_flag:
                batch_data = [d.cuda() for d in batch_data]
            input_id, labels = batch_data  # adjust here for multi-input/-output data
            loss = model(input_id, labels)
            train_loss.append(loss.item())
            if index % log_interval == 0:
                logger.info("batch loss %f" % loss)
            loss.backward()
            optimizer.step()
        logger.info("epoch average loss: %f" % np.mean(train_loss))
        evaluator.eval(epoch)
    # save only the final model (filename carries the last epoch number)
    model_path = os.path.join(config["model_path"], "epoch_%d.pth" % epoch)
    torch.save(model.state_dict(), model_path)
    return model, train_data


def ask(model, question):
    """Classify a question string and return the matching schema label name.

    NOTE(review): relies on the module-level ``train_data`` assigned in the
    __main__ block for sentence encoding — fragile; confirm before reuse.
    """
    input_id = train_data.dataset.encode_sentence(question)
    model.eval()
    model = model.cpu()
    cls = torch.argmax(model(torch.LongTensor([input_id])))
    schemes = load_schema(Config["schema_path"])
    ans = ""
    for name, val in schemes.items():
        # FIX: compare plain ints; the original compared an int against a
        # 0-dim tensor.
        if val == int(cls):
            ans = name
    return ans


if __name__ == "__main__":
    model, train_data = main(Config)
    print(ask(model, "積分是怎么積的"))
    while True:
        question = input("請輸入問題:")
        res = ask(model, question)
        print("命中問題:", res)
        print("-----------")
實現一個基于 PyTorch 的文本分類模型的訓練和推理過程。首先,通過 main
函數創建模型訓練的主流程。代碼首先檢查是否有 GPU 可用,并將模型遷移至 GPU(如果可用)。然后加載訓練數據、模型、優化器以及效果評估類。訓練過程中,模型使用交叉熵損失函數計算訓練誤差并進行反向傳播更新參數,每個 epoch 后記錄并輸出平均損失。同時,訓練結束后,將模型保存至指定路徑。
在訓練完成后,ask
函數用于推理,輸入問題并通過模型進行預測。它首先將輸入問題轉化為模型所需的格式,然后利用訓練好的模型進行分類,最后返回匹配的答案。整個程序支持通過命令行輸入問題,模型根據訓練結果給出對應的答案。
在主程序中,首先進行一次初始化訓練,之后進入循環,可以持續輸入問題并得到模型的預測答案。
測試與評估
evaluate.py
# -*- coding: utf-8 -*-
import torch

from loader import load_data

"""
Model evaluation.
"""


class Evaluator:
    """Runs the model over the validation set and reports accuracy."""

    def __init__(self, config, model, logger):
        self.config = config
        self.model = model
        self.logger = logger
        self.valid_data = load_data(config["valid_data_path"], config, shuffle=False)
        self.stats_dict = {"correct": 0, "wrong": 0}  # holds evaluation results

    def eval(self, epoch):
        """Evaluate the current model on the validation set for one epoch."""
        self.logger.info("開始測試第%d輪模型效果:" % epoch)
        self.stats_dict = {"correct": 0, "wrong": 0}  # reset previous round's results
        self.model.eval()
        for index, batch_data in enumerate(self.valid_data):
            if torch.cuda.is_available():
                batch_data = [d.cuda() for d in batch_data]
            input_id, labels = batch_data  # adjust here for multi-input/-output data
            with torch.no_grad():
                # no labels passed: the model returns predictions
                pred_results = self.model(input_id)
            self.write_stats(labels, pred_results)
        self.show_stats()
        return

    def write_stats(self, labels, pred_results):
        """Compare predictions with true labels and update the counters."""
        assert len(labels) == len(pred_results)
        for true_label, pred_label in zip(labels, pred_results):
            pred_label = torch.argmax(pred_label)
            if int(true_label) == int(pred_label):
                self.stats_dict["correct"] += 1
            else:
                self.stats_dict["wrong"] += 1
        return

    def show_stats(self):
        """Log totals, correct/wrong counts, and accuracy."""
        correct = self.stats_dict["correct"]
        wrong = self.stats_dict["wrong"]
        total = correct + wrong
        self.logger.info("預測集合條目總量:%d" % total)
        self.logger.info("預測正確條目:%d,預測錯誤條目:%d" % (correct, wrong))
        # FIX: avoid ZeroDivisionError when the validation set is empty
        self.logger.info("預測準確率:%f" % (correct / total if total else 0.0))
        self.logger.info("--------------------")
        return
定義一個 Evaluator
類,用于評估深度學習模型在驗證集上的表現。Evaluator
初始化時接受配置文件、模型和日志記錄器,并加載驗證數據。eval
方法用于進行模型評估,在每輪評估開始時清空統計信息,設置模型為評估模式,然后通過遍歷驗證數據集進行預測。預測結果通過 write_stats
方法與真實標簽進行比對,統計正確和錯誤的預測條目。最后,show_stats
方法輸出總預測條目數、正確條目數、錯誤條目數以及準確率。該類的作用是幫助監控模型在驗證集上的性能,便于調整和優化模型。
測試結果
請輸入問題:在官網上如何修改移動密碼
命中問題: 移動密碼修改
-----------
請輸入問題:我想多加一個號碼作為親情號
命中問題: 親情號碼設置與修改
-----------
請輸入問題:我已經交足了話費請立即幫我開機
命中問題: 話費查詢
-----------
請輸入問題:密碼想換一下
命中問題: 密碼修改