Fine-tuning the bert-base-chinese pretrained model on a binary classification task
import pandas as pd
from transformers import BertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

model_name = "./bert-base-chinese"
path = "./abuse_22.csv"
df = pd.read_csv(path, encoding="utf-8")
# Use the first 1000 rows: "content" holds the text, "punish_result" the 0/1 label
texts = df["content"][:1000].tolist()
labels = df["punish_result"][:1000].tolist()
texts = [str(x) for x in texts]  # coerce every cell to str (guards against NaN / numeric rows)

class Dataset(torch.utils.data.Dataset):
    # Wraps the tokenizer output and the label list as a PyTorch dataset.
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Reference: https://blog.csdn.net/weixin_42924890/article/details/139269528
train_encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
train_dataset = Dataset(train_encodings, labels)

args = TrainingArguments(
    output_dir='./output_dir',
    # The original evaluation_strategy='epoch' fails here, because no
    # eval_dataset is passed to the Trainer below; disable evaluation instead.
    evaluation_strategy='no',
    no_cuda=True,
    num_train_epochs=2,
    learning_rate=1e-4,  # note: BERT fine-tuning more commonly uses 2e-5 to 5e-5
    weight_decay=1e-2,
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
)

# Start training
trainer.train()
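
The original arguments asked for per-epoch evaluation but never built a held-out set, which is why evaluation was disabled above. Below is a minimal sketch of how that intent could be restored; the 80/20 split ratio and the accuracy metric are illustrative assumptions, not part of the original script.

import numpy as np

# Build an 80/20 train/eval split from the same texts/labels used above
split = int(len(texts) * 0.8)
train_encodings = tokenizer(texts[:split], truncation=True, padding=True, max_length=512)
eval_encodings = tokenizer(texts[split:], truncation=True, padding=True, max_length=512)
train_dataset = Dataset(train_encodings, labels[:split])
eval_dataset = Dataset(eval_encodings, labels[split:])

def compute_metrics(eval_pred):
    # eval_pred carries the model logits and the gold labels
    logits, label_ids = eval_pred.predictions, eval_pred.label_ids
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float((preds == label_ids).mean())}

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir='./output_dir',
        evaluation_strategy='epoch',  # now valid: an eval_dataset is supplied
        no_cuda=True,
        num_train_epochs=2,
        learning_rate=1e-4,
        weight_decay=1e-2,
        per_device_eval_batch_size=32,
        per_device_train_batch_size=32,
    ),
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

After training, the fine-tuned model can be used directly for prediction; the input sentence here is a placeholder:

inputs = tokenizer("這是一段待分類的文本", truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.argmax(dim=-1).item())  # 0 or 1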