做一次按NLP項目常見工具的使用拆解
1. tokenizer
# 1. Tokenizer demo: 'basic_english' lowercases the text and splits it into
# word and punctuation tokens.
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')

text_sample = "We're going on an adventure! The weather is really nice today."
tokens = tokenizer(text_sample)
print(tokens)
['we', "'", 're', 'going', 'on', 'an', 'adventure', '!', 'the', 'weather', 'is', 'really', 'nice', 'today', '.']
2. vocab
# 2. Vocabulary demo: build a token->index mapping from an iterator of
# token lists, with special tokens for unknown words and padding.
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# Create the tokenizer
tokenizer = get_tokenizer('basic_english')

# Test data
test_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
]

vocab = build_vocab_from_iterator(
    (tokenizer(sentence) for sentence in test_sentences),
    specials=['<unk>', '<pad>'],
    min_freq=1,  # keep every token that appears at least once
)
# Out-of-vocabulary tokens map to the '<unk>' index instead of raising.
vocab.set_default_index(vocab['<unk>'])

print("詞表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])
詞表大小: 21
'fox'的索引: 10
3. Dataloader(示例1)
# 3. DataLoader example 1: unlabeled sentences -> padded index batches.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# 1. Create the tokenizer
tokenizer = get_tokenizer('basic_english')

# 2. Test data
train_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
    # Add more training sentences here
]
test_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
]

# 3. Build the vocabulary (from the training sentences only)
vocab = build_vocab_from_iterator(
    (tokenizer(sentence) for sentence in train_sentences),
    specials=['<unk>', '<pad>'],
    min_freq=1,
)
vocab.set_default_index(vocab['<unk>'])

print("詞表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])


# 4. Custom Dataset: maps a sentence to a 1-D LongTensor of token indices.
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, tokenizer):
        self.sentences = sentences
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.sentences[idx])
        indices = [self.vocab[token] for token in tokens]
        return torch.tensor(indices, dtype=torch.long)


# 5. Create Dataset instances
train_dataset = TextDataset(train_sentences, vocab, tokenizer)
test_dataset = TextDataset(test_sentences, vocab, tokenizer)


# 6. DataLoader with a padding collate function
def collate_fn(batch):
    # batch is a list of 1-D tensors of varying length; pad to the longest.
    return pad_sequence(batch, batch_first=True, padding_value=vocab['<pad>'])


train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# 7. Inspect one batch from each DataLoader
print("\n=== Train Batch Indices ===")
for batch in train_loader:
    print(batch)
    break

print("\n=== Test Batch Indices ===")
for batch in test_loader:
    print(batch)
    break
=== Train Batch Indices ===
tensor([[11, 20,  4, 18, 12,  5, 17,  9,  7, 19,  2],
        [ 3, 16,  6, 10, 13, 15,  3, 14,  8,  2,  1]])

=== Test Batch Indices ===
tensor([[ 3, 16,  6, 10, 13, 15,  3, 14,  8,  2,  1],
        [11, 20,  4, 18, 12,  5, 17,  9,  7, 19,  2]])
4. Dataloader(示例2)
# 4. DataLoader example 2: labeled (sentence, label) pairs -> padded index
# batches plus a label tensor, suitable for classification training.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# 1. Create the tokenizer
tokenizer = get_tokenizer('basic_english')

# 2. Labeled train/test data as (sentence, label) pairs
train_data = [
    ("The quick brown fox jumps over the lazy dog.", 1),   # positive sentiment
    ("Hello world! This is a test for building vocabulary.", 0),  # negative sentiment
    # Add more (sentence, label) pairs here
]
test_data = [
    ("The quick brown fox jumps over the lazy dog.", 1),
    ("Hello world! This is a test for building vocabulary.", 0),
]

# 3. Build the vocabulary from the training sentences only
vocab = build_vocab_from_iterator(
    (tokenizer(sentence) for sentence, _ in train_data),
    specials=['<unk>', '<pad>'],
    min_freq=1,
)
vocab.set_default_index(vocab['<unk>'])

print("詞表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])


# 4. Custom Dataset: returns (indices_tensor, label_tensor) per item.
class TextDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence, label = self.data[idx]
        tokens = self.tokenizer(sentence)
        indices = [self.vocab[token] for token in tokens]
        return (
            torch.tensor(indices, dtype=torch.long),
            torch.tensor(label, dtype=torch.long),
        )


# 5. Padding collate function: pad sequences, stack labels.
def collate_fn(batch):
    sequences, labels = zip(*batch)
    padded_seqs = pad_sequence(sequences, batch_first=True, padding_value=vocab['<pad>'])
    labels_tensor = torch.stack(labels)
    return padded_seqs, labels_tensor


# 6. Create the DataLoaders
train_dataset = TextDataset(train_data, vocab, tokenizer)
test_dataset = TextDataset(test_data, vocab, tokenizer)

train_loader = DataLoader(
    train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn
)

# 7. Inspect one batch from each DataLoader
print("\n=== Train Batch ===")
for seq_batch, label_batch in train_loader:
    print("Sequences:", seq_batch)
    print("Labels:  ", label_batch)
    break

print("\n=== Test Batch ===")
for seq_batch, label_batch in test_loader:
    print("Sequences:", seq_batch)
    print("Labels:  ", label_batch)
    break