- 🍨 This post is a learning-record blog entry for the 🔗365天深度學習訓練營 (365-day deep learning training camp)
- 🍖 Original author: K同學啊
Through its self-attention mechanism, the Transformer changed how sequences are modeled and has become a foundational architecture in AI.
Encoder: understands the input and extracts contextual features.
Decoder: generates the output step by step, conditioned on the encoded features.
1. Multi-Head Attention
import math
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class MultiHeadAttention(nn.Module):
    # n_heads: number of attention heads
    # hid_dim: output vector dimension for each token
    def __init__(self, hid_dim, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.hid_dim = hid_dim
        self.n_heads = n_heads

        # hid_dim must be divisible by n_heads
        assert hid_dim % n_heads == 0

        # W_q projection
        self.w_q = nn.Linear(hid_dim, hid_dim)
        # W_k projection
        self.w_k = nn.Linear(hid_dim, hid_dim)
        # W_v projection
        self.w_v = nn.Linear(hid_dim, hid_dim)
        self.fc = nn.Linear(hid_dim, hid_dim)
        # scaling factor sqrt(d_k); moved to the same device as the inputs
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim // n_heads])).to(device)

    def forward(self, query, key, value, mask=None):
        # Q, K and V may differ in sequence length, but share the batch size and hidden size
        # K: [64, 10, 300] -- batch_size 64, 10 tokens, each key vector is 300-dimensional
        bsz = query.shape[0]
        Q = self.w_q(query)
        K = self.w_k(key)
        V = self.w_v(value)

        # Split Q, K, V into multiple attention heads.
        # The last dimension is self.hid_dim // self.n_heads, the vector length per head: 300 / 6 = 50
        # 64 is the batch size, 6 the number of heads, 10 the number of tokens, 50 the per-head vector length
        # K: [64, 10, 300] -> split into heads -> [64, 10, 6, 50] -> transpose -> [64, 6, 10, 50]
        # The transpose moves the head dimension (6) forward so the per-head matrix products below line up
        Q = Q.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        K = K.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        V = V.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)

        # Step 1: Q times K transposed, divided by the scale
        # [64, 6, 12, 50] * [64, 6, 50, 10] = [64, 6, 12, 10]
        # attention: [64, 6, 12, 10]
        attention = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

        # If a mask is given, set the attention score to -1e10 wherever the mask is 0.
        # A 0 marks positions that must not be attended to, e.g. padding positions.
        if mask is not None:
            attention = attention.masked_fill(mask == 0, -1e10)

        # Step 2: softmax over the scores to get the attention weights.
        # The softmax is taken over the last dimension, i.e. over the input sequence positions.
        # attention: [64, 6, 12, 10]
        attention = torch.softmax(attention, dim=-1)

        # Step 3: multiply the attention weights with V to get the per-head outputs
        # [64, 6, 12, 10] * [64, 6, 10, 50] = [64, 6, 12, 50]
        # x: [64, 6, 12, 50]
        x = torch.matmul(attention, V)

        # The query has 12 tokens, so move 12 forward and 6 and 50 to the back so the heads can be concatenated
        # x: [64, 6, 12, 50] -> transpose -> [64, 12, 6, 50]
        x = x.permute(0, 2, 1, 3).contiguous()

        # This reshape concatenates the results of all heads
        # x: [64, 12, 6, 50] -> [64, 12, 300]
        x = x.view(bsz, -1, self.n_heads * (self.hid_dim // self.n_heads))
        x = self.fc(x)
        return x
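A quick shape check of the layer above. The sizes (batch 64, 12 query tokens, 10 key/value tokens, hid_dim=300, n_heads=6) are just the values assumed in the comments, not anything fixed by the model:

# Minimal shape check for MultiHeadAttention (sizes chosen for illustration only).
mha = MultiHeadAttention(hid_dim=300, n_heads=6).to(device)
query = torch.rand(64, 12, 300).to(device)   # 12 query tokens
key   = torch.rand(64, 10, 300).to(device)   # 10 key tokens
value = torch.rand(64, 10, 300).to(device)   # value length must match key length
out = mha(query, key, value)                 # no mask: every position may attend to every other
print(out.shape)                             # torch.Size([64, 12, 300])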
2. Feed-Forward Network
class Feedforward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(Feedforward, self).__init__()
        # Two linear projections with a ReLU activation and dropout in between
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = torch.nn.functional.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x
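This is the standard position-wise feed-forward block: it only acts on the last (feature) dimension, so every position in the sequence is transformed independently. A quick shape check with illustrative sizes (d_ff=512 here is arbitrary):

# Illustrative shape check: batch and sequence dimensions pass through unchanged.
ffn = Feedforward(d_model=128, d_ff=512, dropout=0.1)
x = torch.rand(32, 10, 128)
print(ffn(x).shape)   # torch.Size([32, 10, 128])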
3. Positional Encoding
class PositionalEncoding(nn.Module):
    "Implements sinusoidal positional encoding."
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Initialise the positional encoding (PE) table with shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model).to(device)
        # Position indices [[0], [1], [2], ...]
        position = torch.arange(0, max_len).unsqueeze(1)
        # The term inside sin/cos, rewritten via exp and log
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)   # PE(pos, 2i)
        pe[:, 1::2] = torch.cos(position * div_term)   # PE(pos, 2i+1)
        pe = pe.unsqueeze(0)   # add a batch dimension up front for easy broadcasting

        # register_buffer stores a tensor that does not take part in gradient descent
        # but should still be saved together with the model's state dict
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        x is the embedded input, e.g. (1, 7, 128): batch size 1, 7 tokens, embedding dimension 128
        """
        # Add the positional encoding to x
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        return self.dropout(x)
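In formula form, this is the standard sinusoidal encoding; div_term is simply $10000^{-2i/d_{model}}$ rewritten as $\exp(-2i \cdot \ln 10000 / d_{model})$ for numerical stability:

$$
PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{model}}}\right), \qquad
PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{model}}}\right)
$$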
4. Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        # An encoder layer consists of self-attention and a feed-forward network
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.feedforward = Feedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # Self-attention with a residual connection followed by layer norm
        atten_output = self.self_attn(x, x, x, mask)
        x = x + self.dropout(atten_output)
        x = self.norm1(x)

        # Feed-forward network with a residual connection followed by layer norm
        ff_output = self.feedforward(x)
        x = x + self.dropout(ff_output)
        x = self.norm2(x)
        return x
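A sanity check of a single encoder layer. The mask is a padding mask broadcast to [batch, 1, 1, seq_len] so the same mask applies to every head and every query position; the sizes below are illustrative only:

# Illustrative forward pass through one encoder layer (no padding in this toy input).
enc_layer = EncoderLayer(d_model=128, n_heads=8, d_ff=512, dropout=0.1).to(device)
x = torch.rand(32, 10, 128).to(device)                              # embedded source sequence
src_mask = torch.ones(32, 1, 1, 10, dtype=torch.bool).to(device)    # 1 = attend, 0 = ignore
print(enc_layer(x, src_mask).shape)                                 # torch.Size([32, 10, 128])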
5. Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(DecoderLayer, self).__init__()
        # A decoder layer consists of self-attention, encoder-decoder attention and a feed-forward network
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.enc_attn = MultiHeadAttention(d_model, n_heads)
        self.feedforward = Feedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, self_mask, context_mask):
        # Self-attention over the target sequence
        attn_output = self.self_attn(x, x, x, self_mask)
        x = x + self.dropout(attn_output)
        x = self.norm1(x)

        # Encoder-decoder attention: queries come from the decoder, keys and values from the encoder output
        attn_output = self.enc_attn(x, enc_output, enc_output, context_mask)
        x = x + self.dropout(attn_output)
        x = self.norm2(x)

        # Feed-forward network
        ff_output = self.feedforward(x)
        x = x + self.dropout(ff_output)
        x = self.norm3(x)
        return x
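The cross-attention step is where the two sequence lengths meet: x carries the target length, enc_output the source length. A shape sketch with illustrative sizes (20 target tokens, 10 source tokens):

# Illustrative forward pass through one decoder layer.
dec_layer = DecoderLayer(d_model=128, n_heads=8, d_ff=512, dropout=0.1).to(device)
trg_x = torch.rand(32, 20, 128).to(device)                           # embedded target sequence
enc_out = torch.rand(32, 10, 128).to(device)                         # encoder output
trg_mask = torch.ones(32, 1, 1, 20, dtype=torch.bool).to(device)     # target padding mask
src_mask = torch.ones(32, 1, 1, 10, dtype=torch.bool).to(device)     # source padding mask
print(dec_layer(trg_x, enc_out, trg_mask, src_mask).shape)           # torch.Size([32, 20, 128])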
6. Building the Transformer Model
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, n_encoder_layers, n_decoder_layers, d_ff, dropout=0.1):
        super(Transformer, self).__init__()
        # The Transformer consists of token embedding, positional encoding, an encoder stack and a decoder stack
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_encoder_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_decoder_layers)])
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, trg, src_mask, trg_mask):
        # Token embedding and positional encoding
        src = self.embedding(src)
        src = self.positional_encoding(src)
        trg = self.embedding(trg)
        trg = self.positional_encoding(trg)

        # Encoder stack
        for layer in self.encoder_layers:
            src = layer(src, src_mask)

        # Decoder stack
        for layer in self.decoder_layers:
            trg = layer(trg, src, trg_mask, src_mask)

        # Output projection to vocabulary logits
        output = self.fc_out(trg)
        return output
# Hyperparameters
vocab_size = 10000
d_model = 128
n_heads = 8
n_encoder_layers = 6
n_decoder_layers = 6
d_ff = 2048
dropout = 0.1

device = torch.device('cpu')   # run the demo on CPU

transformer_model = Transformer(vocab_size, d_model, n_heads, n_encoder_layers, n_decoder_layers, d_ff, dropout)

# Define the inputs
src = torch.randint(0, vocab_size, (32, 10))      # source-language sentences
trg = torch.randint(0, vocab_size, (32, 20))      # target-language sentences
src_mask = (src != 0).unsqueeze(1).unsqueeze(2)   # mask that hides padding positions
trg_mask = (trg != 0).unsqueeze(1).unsqueeze(2)   # mask that hides padding positions

# Forward pass
output = transformer_model(src, trg, src_mask, trg_mask)
print(output.shape)   # torch.Size([32, 20, 10000])
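One caveat worth flagging: trg_mask above only hides padding, so during training every target position can still attend to future target positions. The usual remedy is to combine the padding mask with a lower-triangular "subsequent" (causal) mask. A minimal sketch; the helper make_trg_mask is my own illustration, not part of the original code:

# Hypothetical helper: combine the padding mask with a causal (no-peek) mask.
def make_trg_mask(trg, pad_idx=0):
    pad_mask = (trg != pad_idx).unsqueeze(1).unsqueeze(2)          # [batch, 1, 1, trg_len]
    trg_len = trg.shape[1]
    causal = torch.tril(torch.ones(trg_len, trg_len,
                                   dtype=torch.bool, device=trg.device))  # [trg_len, trg_len]
    return pad_mask & causal                                       # [batch, 1, trg_len, trg_len]

trg_mask = make_trg_mask(trg)
output = transformer_model(src, trg, src_mask, trg_mask)
print(output.shape)   # still torch.Size([32, 20, 10000])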