GPT End-Token Design: nanoGPT as an Example
Contents
GPT End-Token Design: nanoGPT as an Example
1. Overview
2. Tokenizer Design
3. Stopping on the End Token
1. Overview
When hand-rolling a GPT, you will sooner or later run into efficiency questions: does the model really need to generate everything, and how can resources be saved?
Output length is capped by max_new_tokens, so longer outputs simply get cut off at that limit; conversely, if no end token was designed into the vocabulary, the model keeps generating until it hits that cap every time.
This matters whenever only part of the output is needed, or in certain specific scenarios, for example:
1. GPT-driven automation
2. Clean, well-formed output
3. Small, specialized business scenarios with domain-specific processing
The scenarios above call for purpose-built models, i.e., small LLMs that do not need a large parameter count; with light weight and small size in mind, something in the 1M to 100M parameter range is enough.
For cost and development-speed reasons, nanoGPT works well for the initial training and development, followed by further fine-tuning iterations. The resulting performance and quality largely meet the requirements, iteration is fast, and the workflow suits a single developer or a small team building for a specific scenario.
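As a rough reference point, a nanoGPT training config for a model in this size range could look like the sketch below. The values are illustrative (close to nanoGPT's shakespeare_char defaults, roughly 10M parameters), not tuned settings for the music task, and the config and dataset names are assumptions.

# config/train_music_char.py  (hypothetical file name; run with: python train.py config/train_music_char.py)
out_dir = 'out-music'
dataset = 'music'                 # assumes the prepare script from the next section lives in data/music/

# a small model: 6 layers, 6 heads, 384-dim embeddings (roughly 10M parameters)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

block_size = 256                  # context length
batch_size = 64
gradient_accumulation_steps = 1

learning_rate = 1e-3              # small models tolerate a higher learning rate
max_iters = 5000
lr_decay_iters = 5000
min_lr = 1e-4
warmup_iters = 100

eval_interval = 250
eval_iters = 200
always_save_checkpoint = False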
2. Tokenizer Design
Below is a development scenario I worked on earlier: generating key presses for music.
An 'end' entry is added to the dictionary as a dedicated end marker. As a later extension, style identifiers could be placed before or after 'end', so that the style tags keep each generated piece stylistically consistent.
# custom dictionary
word_dict = set(['\n', ' ', '+', '.', '0', '1', '2', '3', '4',
                 '6', '7', '8', '9', ':', "'a'", "'b'", "'c'", "'d'",
                 "'e'", "'f'", "'g'", "'h'", "'j'", "'n'", "'m'", "'q'", "'w'", "'r'", "'t'", "'y'", "'u'",
                 "'s'", "'v'", "'x'", "'z'", '<96>', '<97>', '<98>', '<99>', '<100>',
                 '<101>', '<102>', '<103>', '<104>', '<105>', 'end'])
seg_list = max_forward_matching(data, word_dict, max(len(word) for word in word_dict))
words = list(seg_list)
# default dict that hands out a fresh id the first time a word is seen
word_to_id = defaultdict(lambda: len(word_to_id))
# list mapping id back to word (optional)
id_to_word = []
# build the word-to-id mapping
for word in words:
    word_id = word_to_id[word]
    if word_id == len(id_to_word):  # a new id was just assigned, record the reverse mapping
        id_to_word.append(word)
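Before looking at the full prepare script, a quick sanity check helps confirm that 'end' is segmented as a single token and receives exactly one id. This is a sketch only: it uses the max_forward_matching helper defined in the full script below, and the expected output depends on the dictionary and data above.

# usage sketch: 'end' should come out of the segmenter as one token
demo = "1:'a'+'b'end\n"
print(max_forward_matching(demo, word_dict, max(len(w) for w in word_dict)))
# expected with the dictionary above: ['1', ':', "'a'", '+', "'b'", 'end', '\n']
print(word_to_id['end'])  # the id that the stop check in section 3 compares against
# (word_to_id is a defaultdict, so this would silently create a new id if 'end'
#  never occurred in the training data)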
import os
import pickle
import requests
import numpy as np
from collections import defaultdict

# download the tiny shakespeare dataset as a placeholder if music.txt is missing
input_file_path = os.path.join(os.path.dirname(__file__), 'music.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r', encoding="utf-8") as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# build our own tokenizer: maximum forward matching against the custom dictionary
def max_forward_matching(text, word_dict, max_len):
    result = []
    index = 0
    while index < len(text):
        found = False
        for size in range(max_len, 0, -1):  # try the longest match first
            piece = text[index:index + size]
            if piece in word_dict:
                result.append(piece)
                index += size
                found = True
                break
        if not found:  # no dictionary match, fall back to a single character
            result.append(text[index])
            index += 1
    return result

# custom dictionary
word_dict = set(['\n', ' ', '+', '.', '0', '1', '2', '3', '4',
                 '6', '7', '8', '9', ':', "'a'", "'b'", "'c'", "'d'",
                 "'e'", "'f'", "'g'", "'h'", "'j'", "'n'", "'m'", "'q'", "'w'", "'r'", "'t'", "'y'", "'u'",
                 "'s'", "'v'", "'x'", "'z'", '<96>', '<97>', '<98>', '<99>', '<100>',
                 '<101>', '<102>', '<103>', '<104>', '<105>', 'end'])
seg_list = max_forward_matching(data, word_dict, max(len(word) for word in word_dict))
words = list(seg_list)

# default dict that hands out a fresh id the first time a word is seen
word_to_id = defaultdict(lambda: len(word_to_id))
# list mapping id back to word (optional)
id_to_word = []
# build the word-to-id mapping
for word in words:
    word_id = word_to_id[word]
    if word_id == len(id_to_word):  # a new id was just assigned, record the reverse mapping
        id_to_word.append(word)

# get all the unique tokens that occur in this text
chars = list(word_to_id)
print(chars)
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")
# Myzzb: a segmenter such as jieba could also be used to cut the text here
print(chars)

# create a mapping from tokens to integers
stoi = { ch:i for i,ch in enumerate(chars) }
print(stoi)
itos = { i:ch for i,ch in enumerate(chars) }
print(itos)

def encode(s):
    # encoder: segment the string with the dictionary, then map each token to its id
    seg = max_forward_matching(s, word_dict, max(len(word) for word in word_dict))
    return [stoi[w] for w in seg]

def decode(l):
    # decoder: take a list of integers, output a string
    return ''.join([itos[i] for i in l])

# create the train and test splits
n = len(data)
train_data = data[:int(n*0.95)]  # no sorted vocabulary here, so the split is kept simple
val_data = data[int(n*0.95):]
# print(val_data)

# encode both to integers
train_ids = encode(train_data)
print(train_ids)
val_ids = encode(val_data)
print(val_ids)
# print(val_ids)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)
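Once meta.pkl has been written, the id assigned to 'end' can be recovered at any time; the stop check in the next section relies on exactly this. A minimal sketch, assuming the meta.pkl produced by the script above:

# sketch: recover the end token's id from meta.pkl
import os
import pickle

with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'rb') as f:
    meta = pickle.load(f)
print(meta['stoi']['end'])  # the id to compare against newly generated tokens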
3. Stopping on the End Token
During inference we check whether each newly generated id matches the end token. Because the design above segments the text with a dictionary and then assigns ids, the id of the end token can be looked up directly in code.
For example, print the encodings of candidate end tokens while preparing the data:
print(encode("\n"))
print(encode("\t"))
Add those lines to the prepare script and the end token's id is known:
"""
Prepare the Shakespeare dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin containing the ids, and meta.pkl containing the
encoder and decoder and some other related info.
"""
import os
import pickle
import requests
import numpy as np

# download the tiny shakespeare dataset
input_file_path = os.path.join(os.path.dirname(__file__), 'say.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r', encoding="utf-8", errors='replace') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

def encode(s):
    return [stoi[c] for c in s]  # encoder: take a string, output a list of integers

def decode(l):
    return ''.join([itos[i] for i in l])  # decoder: take a list of integers, output a string

print(encode("\n"))
print(encode("\t"))

# create the train and test splits
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode both to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)

# length of dataset in characters: 1115394
# all the unique characters:
# !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
# vocab size: 65
# train has 1003854 tokens
# val has 111540 tokens
Only one extra line of code is needed in the sampling loop:
# check whether the end token was generated; the ids of most candidate end tokens can be
# looked up for this comparison, or a custom end marker can be defined as a unique
# identifier to avoid collisions
if 1 in idx_next[0].tolist():
    break
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    """
    Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
    the sequence max_new_tokens times, feeding the predictions back into the model each time.
    Most likely you'll want to make sure to be in model.eval() mode of operation for this.
    """
    for _ in range(max_new_tokens):
        # if the sequence context is growing too long we must crop it at block_size
        idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
        # forward the model to get the logits for the index in the sequence
        logits, _ = self(idx_cond)
        # pluck the logits at the final step and scale by desired temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop the logits to only the top k options
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')
        # apply softmax to convert logits to (normalized) probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1)
        # check whether the end token was generated; its id can be looked up as above,
        # or a custom end marker can be used as a unique identifier to avoid collisions
        if 1 in idx_next[0].tolist():
            break
        # append sampled index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)
    return idx
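The hard-coded 1 above only works if that happens to be the id assigned to the chosen end token. A slightly more flexible variant, sketched below under the assumption that the meta.pkl from section 2 is available, looks the id up at load time and also truncates defensively after sampling; the stop_id parameter and the generate call are illustrative, not part of upstream nanoGPT.

# sketch: look the end-token id up from meta.pkl instead of hard-coding it
import os
import pickle

with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'rb') as f:
    meta = pickle.load(f)
stop_id = meta['stoi']['end']

def truncate_at(ids, stop_id):
    # post-processing fallback: keep everything before the first end token
    return ids[:ids.index(stop_id)] if stop_id in ids else ids

# with generate() extended to accept stop_id (an assumed extra parameter),
# the in-loop check becomes:
#     if stop_id is not None and stop_id in idx_next[0].tolist():
#         break
# and the decoded output can additionally be cut at the end token:
# y = model.generate(x, max_new_tokens=500)            # hypothetical call
# print(decode(truncate_at(y[0].tolist(), stop_id)))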