And that should wrap up the series. Thanks again to Teacher 土堆!
Model training
Model training generally breaks down into the following steps, executed in order:
load the dataset → wrap it with a DataLoader → build the model → set the loss function → set the optimizer → train → during training, compute the loss and update the parameters with the optimizer → test the model → save the model
Conventionally the model and the training code go in separate files, though mixing them in one file is perfectly fine at first. Here is a complete example (a sketch of the model.py it imports follows the script):
# train.py
import torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
import torch.nn as nn
from model import *
import time
from torch.utils.tensorboard import SummaryWriter

data_transforms = transforms.Compose([transforms.ToTensor()])

# load the dataset
train_data = datasets.CIFAR10("./dataset", train=True, transform=data_transforms, download=True)
test_data = datasets.CIFAR10("./dataset", train=False, transform=data_transforms, download=True)

# wrap it with DataLoaders
train_dataloader = DataLoader(train_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)

start_time = time.time()

# build the model
my_module = MyModule()

# set the loss function
cross_loss = nn.CrossEntropyLoss()

# set the optimizer and learning rate
learning_rate = 1e-2
optimizer = torch.optim.SGD(my_module.parameters(), lr=learning_rate)

# train
epoch = 10  # number of epochs
total_train_steps = 0
writer = SummaryWriter("train_logs")

for i in range(epoch):
    print("Epoch {}".format(i + 1))

    # training
    my_module.train()  # only affects certain layers (e.g. Dropout, BatchNorm)
    for data in train_dataloader:
        imgs, targets = data
        outputs = my_module(imgs)
        # compute the loss
        loss = cross_loss(outputs, targets)
        # optimize the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_train_steps += 1
        if total_train_steps % 100 == 0:
            print("Step: {}, Loss: {}".format(total_train_steps, loss.item()))
            writer.add_scalar("train_loss", loss.item(), total_train_steps)

    # testing: no more gradient descent
    my_module.eval()  # likewise only affects certain layers
    total_test_loss = 0
    total_accuracy = 0
    test_data_size = len(test_data)
    with torch.no_grad():
        for data in test_dataloader:
            imgs, targets = data
            outputs = my_module(imgs)
            loss = cross_loss(outputs, targets)
            total_test_loss += loss.item()
            # for classification we can also count correct predictions (optional)
            # argmax(1) takes the index of the max along each row; argmax(0) along each column
            accuracy = (outputs.argmax(1) == targets).sum()
            total_accuracy += accuracy
    print("Epoch {} test loss: {}".format(i + 1, total_test_loss))
    print("Test accuracy: {}".format(total_accuracy / test_data_size))
    writer.add_scalar("test_loss", total_test_loss, i)

    end_time = time.time()
    print("time: {}".format(end_time - start_time))

    # save the model
    if i % 5 == 0:
        torch.save(my_module, "my_module_{}.pth".format(i))
        print("model saved")

writer.close()
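All the training scripts here import MyModule from a separate model.py that is not reproduced in this section. For reference, here is a minimal sketch of a CIFAR-10 classifier that works with the training loop above; the architecture is my assumption (the usual CIFAR-10 quick model used in the course), so your model.py may differ:

# model.py (sketch; exact architecture may differ from the course's)
import torch
import torch.nn as nn

class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        # 3x32x32 input -> 10 class scores
        self.model = nn.Sequential(
            nn.Conv2d(3, 32, 5, padding=2),   # 32x32 -> 32x32
            nn.MaxPool2d(2),                  # -> 16x16
            nn.Conv2d(32, 32, 5, padding=2),
            nn.MaxPool2d(2),                  # -> 8x8
            nn.Conv2d(32, 64, 5, padding=2),
            nn.MaxPool2d(2),                  # -> 4x4
            nn.Flatten(),                     # -> 64*4*4 = 1024
            nn.Linear(64 * 4 * 4, 64),
            nn.Linear(64, 10),
        )

    def forward(self, x):
        return self.model(x)

if __name__ == "__main__":
    # quick shape check with a fake batch of 64 CIFAR-10 images
    x = torch.ones((64, 3, 32, 32))
    print(MyModule()(x).shape)  # expect torch.Size([64, 10])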
Accelerating training with a GPU (method 1)
The code above trains on the CPU by default, which is slow. To speed things up we can train on the GPU. There are two ways to do this; method 2 is the recommended one (and is what most example code uses).
Three parts of the code need to be moved to the GPU: the network model, the loss function, and the data (images and labels). Each of them has a .cuda() method, so it's enough to call .cuda() on it where it originally appears. Since some machines have no GPU, it's best to guard each call with torch.cuda.is_available(). The full example:
# train_gpu1.py
import torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
import torch.nn as nn
from model import *
import time
from torch.utils.tensorboard import SummaryWriter

data_transforms = transforms.Compose([transforms.ToTensor()])

# load the dataset
train_data = datasets.CIFAR10("./dataset", train=True, transform=data_transforms, download=True)
test_data = datasets.CIFAR10("./dataset", train=False, transform=data_transforms, download=True)

# wrap it with DataLoaders
train_dataloader = DataLoader(train_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)

start_time = time.time()

# build the model
my_module = MyModule()
if torch.cuda.is_available():
    my_module.cuda()  # modules are moved in place

# set the loss function
cross_loss = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    cross_loss.cuda()

# set the optimizer and learning rate
learning_rate = 1e-2
optimizer = torch.optim.SGD(my_module.parameters(), lr=learning_rate)

# train
epoch = 10  # number of epochs
total_train_steps = 0
writer = SummaryWriter("train_logs")

for i in range(epoch):
    print("Epoch {}".format(i + 1))

    # training
    my_module.train()  # only affects certain layers (e.g. Dropout, BatchNorm)
    for data in train_dataloader:
        imgs, targets = data
        if torch.cuda.is_available():
            imgs = imgs.cuda()        # tensors are NOT moved in place,
            targets = targets.cuda()  # so reassignment is required
        outputs = my_module(imgs)
        # compute the loss
        loss = cross_loss(outputs, targets)
        # optimize the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_train_steps += 1
        if total_train_steps % 100 == 0:
            print("Step: {}, Loss: {}".format(total_train_steps, loss.item()))
            writer.add_scalar("train_loss", loss.item(), total_train_steps)

    # testing: no more gradient descent
    my_module.eval()  # likewise only affects certain layers
    total_test_loss = 0
    total_accuracy = 0
    test_data_size = len(test_data)
    with torch.no_grad():
        for data in test_dataloader:
            imgs, targets = data
            if torch.cuda.is_available():
                imgs = imgs.cuda()
                targets = targets.cuda()
            outputs = my_module(imgs)
            loss = cross_loss(outputs, targets)
            total_test_loss += loss.item()
            # for classification we can also count correct predictions (optional)
            # argmax(1) takes the index of the max along each row; argmax(0) along each column
            accuracy = (outputs.argmax(1) == targets).sum()
            total_accuracy += accuracy
    print("Epoch {} test loss: {}".format(i + 1, total_test_loss))
    print("Test accuracy: {}".format(total_accuracy / test_data_size))
    writer.add_scalar("test_loss", total_test_loss, i)

    end_time = time.time()
    print("time: {}".format(end_time - start_time))

    # save the model
    if i % 5 == 0:
        torch.save(my_module.state_dict(), "my_module_{}.pth".format(i))
        print("model saved")

writer.close()
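One caveat before reusing these checkpoints elsewhere: a state_dict saved while the model lived on the GPU contains CUDA tensors, so loading it on a machine without a GPU needs map_location. A minimal sketch, assuming the my_module_5.pth produced above:

# load_cpu.py (sketch)
import torch
from model import MyModule

model = MyModule()
# map_location remaps the stored CUDA tensors onto the CPU at load time
state_dict = torch.load("my_module_5.pth", map_location=torch.device("cpu"))
model.load_state_dict(state_dict)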
Accelerating training with a GPU (method 2)
The more common pattern nowadays is device plus .to(device). What needs to be moved is exactly the same as in method 1; only the syntax differs slightly. Here is the full example, which should make the usage clear:
# train_gpu2.py
import torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
import torch.nn as nn
from model import *
from torch.utils.tensorboard import SummaryWriter

data_transforms = transforms.Compose([transforms.ToTensor()])

# load the dataset
train_data = datasets.CIFAR10("./dataset", train=True, transform=data_transforms, download=True)
test_data = datasets.CIFAR10("./dataset", train=False, transform=data_transforms, download=True)

# wrap it with DataLoaders
train_dataloader = DataLoader(train_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)

# pick the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# build the model
my_module = MyModule()
my_module.to(device)  # modules are moved in place

# set the loss function
cross_loss = nn.CrossEntropyLoss()
cross_loss.to(device)

# set the optimizer and learning rate
learning_rate = 1e-2
optimizer = torch.optim.SGD(my_module.parameters(), lr=learning_rate)

# train
epoch = 10  # number of epochs
total_train_steps = 0
writer = SummaryWriter("train_logs")

for i in range(epoch):
    print("Epoch {}".format(i + 1))

    # training
    my_module.train()  # only affects certain layers (e.g. Dropout, BatchNorm)
    for data in train_dataloader:
        imgs, targets = data
        imgs, targets = imgs.to(device), targets.to(device)  # tensors must be reassigned
        outputs = my_module(imgs)
        # compute the loss
        loss = cross_loss(outputs, targets)
        # optimize the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_train_steps += 1
        if total_train_steps % 100 == 0:
            print("Step: {}, Loss: {}".format(total_train_steps, loss.item()))
            writer.add_scalar("train_loss", loss.item(), total_train_steps)

    # testing: no more gradient descent
    my_module.eval()  # likewise only affects certain layers
    total_test_loss = 0
    total_accuracy = 0
    test_data_size = len(test_data)
    with torch.no_grad():
        for data in test_dataloader:
            imgs, targets = data
            imgs, targets = imgs.to(device), targets.to(device)
            outputs = my_module(imgs)
            loss = cross_loss(outputs, targets)
            total_test_loss += loss.item()
            # for classification we can also count correct predictions (optional)
            # argmax(1) takes the index of the max along each row; argmax(0) along each column
            accuracy = (outputs.argmax(1) == targets).sum()
            total_accuracy += accuracy
    print("Epoch {} test loss: {}".format(i + 1, total_test_loss))
    print("Test accuracy: {}".format(total_accuracy / test_data_size))
    writer.add_scalar("test_loss", total_test_loss, i)

    # save the model
    if i % 5 == 0:
        torch.save(my_module.state_dict(), "my_module_{}.pth".format(i))
        print("model saved")

writer.close()
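The scripts above only save the model weights. If you also want to resume training later, a common pattern is to bundle the optimizer state and the epoch counter into the same file; a minimal sketch (the dictionary keys are my own naming, not anything the course prescribes):

# checkpoint.py (sketch; key names are illustrative)
import torch
from model import MyModule

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
my_module = MyModule().to(device)
optimizer = torch.optim.SGD(my_module.parameters(), lr=1e-2)

# saving: bundle everything needed to resume (this would go inside the epoch loop)
i = 5  # current epoch
torch.save({
    "epoch": i,
    "model_state_dict": my_module.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
}, "checkpoint_{}.pth".format(i))

# resuming: restore the bundle before continuing training
checkpoint = torch.load("checkpoint_5.pth", map_location=device)
my_module.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
start_epoch = checkpoint["epoch"] + 1
print("resuming from epoch", start_epoch)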
Using the model on new data
I spent a long time thinking about what to call this section so the title would match the content. Teacher 土堆 originally called it "model validation", which had me assuming it was the evaluate process until I watched it; it turns out to be the test process.
Finishing training and getting a pile of accuracy numbers doesn't mean the whole job is done; put plainly, by itself that isn't much use yet. So this part shows how to apply the trained model to other data. It's fairly simple and closely resembles the evaluate part of the training loop. The main thing to remember is to reshape the image into a shape that includes the batch_size (especially for a single image); if you hit an error, a shape mismatch is a good first thing to check.
# test.py
import torch
from PIL import Image
import torchvision
from torchvision import transforms
from model import *

image_path = "./test_imgs/cat.jpg"
image = Image.open(image_path)
# image = image.convert('RGB')  # needed for png images, which carry an extra alpha channel

data_transforms = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor()
])
image = data_transforms(image)
print(image.shape)

model = MyModule()
# if the weights were saved on the GPU, add map_location=torch.device("cpu") when running without one
model.load_state_dict(torch.load("my_module_5.pth"))

# add the batch dimension: (3, 32, 32) -> (1, 3, 32, 32)
image = torch.reshape(image, (1, 3, 32, 32))

model.eval()
with torch.no_grad():
    output = model(image)
print(output)
print(output.argmax(1))

# torch.Size([3, 32, 32])
# tensor([[-1.5852, -1.3985,  1.0891,  2.5762,  0.1534,  2.0844,  0.6164,  1.7049,
#          -4.7464, -1.4447]])
# tensor([3])
My model's accuracy actually isn't very high, but it surprisingly classified this image correctly (class 3 is cat).
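For reference, index 3 really is cat: torchvision's CIFAR10 dataset exposes the class list through its classes attribute, so the predicted index can be turned into a label directly:

# class_names.py (sketch)
from torchvision import datasets

test_data = datasets.CIFAR10("./dataset", train=False, download=True)
print(test_data.classes)
# ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

pred = 3  # the index test.py printed via output.argmax(1)
print(test_data.classes[pred])  # cat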