import collections
import re
from d2l import torch as d2l

def read_time_machine():
    """Load the time machine dataset into a list of text lines."""
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    # Keep only letters, collapse everything else to spaces, and lowercase
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
def tokenize(lines, token='word'):
    """Split text lines into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('Error: unknown token type: ' + token)
lines = read_time_machine()
tokens = tokenize(lines)
for i in range(11):
    print(tokens[i])
class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort by token frequency in descending order
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        # The unknown token <unk> gets index 0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # Index of the unknown token
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs
def count_corpus(tokens):
    """Count token frequencies."""
    # tokens may be a flat list or a list of token lines (2D); flatten if needed
    if len(tokens) == 0 or isinstance(tokens[0], list):
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)
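A toy sanity check (the input here is made up, not taken from the dataset): passing a 2D list of token lines flattens it before counting, so count_corpus([['the', 'time'], ['the', 'machine']]) returns a Counter in which 'the' is counted twice and 'time' and 'machine' once each.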
def load_corpus_time_machine(max_tokens=-1):
    """Return token indices and the vocabulary of the time machine dataset."""
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    # Flatten all lines into a single list of character indices
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
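As a quick sanity check (a minimal sketch; it assumes the 'time_machine' entry is registered in d2l.DATA_HUB so the download in read_time_machine succeeds), we can load the corpus and print the number of character indices and the vocabulary size:

corpus, vocab = load_corpus_time_machine()
print(len(corpus), len(vocab))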
def read_time_machine():
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
tokens = d2l.tokenize(read_time_machine())
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
vocab.token_freqs[:10]
Words that occur very frequently are often not particularly informative; such words are usually called stop words and can therefore be filtered out. Nevertheless, they do carry meaning, so we will still use them in our models. A further issue is that word frequency decays quite rapidly.
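The log-log plot below makes this decay concrete: beyond the first few (stop) words, the frequency of the i-th most frequent word roughly follows Zipf's law, n(i) ∝ 1/i^α. As an optional check (a rough sketch only; it assumes numpy is installed, reuses the vocab built above, and the cut-off of 10 head words is arbitrary), the exponent can be estimated with a linear fit in log space:

import numpy as np

freqs = [freq for _, freq in vocab.token_freqs]
ranks = np.arange(1, len(freqs) + 1)
# Fit log n(i) = slope * log i + c on the tail of the distribution
slope, c = np.polyfit(np.log(ranks[10:]), np.log(freqs[10:]), deg=1)
print(f'estimated Zipf exponent alpha ~ {-slope:.2f}')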
freqs = [freq for token, freq in vocab.token_freqs]
d2l.plot(freqs, xlabel='token: x', ylabel='frequency: n(x)',
         xscale='log', yscale='log')
import random
import torch

def seq_data_iter_random(corpus, batch_size, num_steps):
    """Generate minibatches of subsequences using random sampling."""
    # Start from a random offset so different epochs see different partitions
    corpus = corpus[random.randint(0, num_steps - 1):]
    # Subtract 1 because the labels are shifted by one position
    num_subseqs = (len(corpus) - 1) // num_steps
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        return corpus[pos:pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i:i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)
Below we generate a sequence of the integers from 0 to 34. Suppose the batch size is 2 and the number of time steps is 5.
my_seq = list(range(35))
for X, Y in seq_data_iter_random(my_seq, 2, 5):
    print(X, Y)
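Each X, Y pair printed above is a tensor of shape (batch_size, num_steps) = (2, 5), and Y is simply X shifted forward by one position in my_seq, so the label for every token is the token that follows it.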
def read_time_machine():  #@save
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
def load_corpus_time_machine(max_tokens=-1):  #@save
    """Return token indices and the vocabulary of the time machine dataset."""
    lines = read_time_machine()
    tokens = d2l.tokenize(lines, 'char')
    vocab = d2l.Vocab(tokens)
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Generate minibatches of subsequences using sequential partitioning."""
    # Start from a random offset to vary the partition across epochs
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = torch.tensor(corpus[offset:offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1:offset + num_tokens + 1])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i:i + num_steps]
        Y = Ys[:, i:i + num_steps]
        yield X, Y
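A quick usage sketch (using the same toy sequence of 35 integers as before; the exact tensors depend on the random offset): with sequential partitioning, the i-th row of one minibatch continues the i-th row of the previous minibatch along the time dimension.

for X, Y in seq_data_iter_sequential(list(range(35)), batch_size=2, num_steps=5):
    print(X, Y)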
def predict(prefix, num_preds, net, vocab, device):
    """Generate new characters following the prefix."""
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1, 1))
    # Warm-up period: feed the prefix to build up the hidden state
    for y in prefix[1:]:
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    # Predict num_preds new characters
    for _ in range(num_preds):
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])
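For example (a sketch assuming a model object net and the character-level vocab built earlier are already in scope; d2l.try_gpu() falls back to the CPU when no GPU is available), an untrained model will simply produce an arbitrary continuation of the prefix:

predict('time traveller ', 10, net, vocab, d2l.try_gpu())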
import math
from torch import nn

def train_epoch(net, train_iter, loss, updater, device, use_random_iter):
    """Train the network for one epoch."""
    state = None
    metric = d2l.Accumulator(2)  # Sum of training loss, number of tokens
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Initialize state for the first batch or when sampling randomly
            state = net.begin_state(X.shape[0], device)
        else:
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                # state is a single tensor for nn.GRU
                state.detach_()
            else:
                # state is a tuple of tensors for nn.LSTM and for the model implemented from scratch
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)
        l = loss(y_hat, y.long()).mean()
        updater.zero_grad()
        l.backward()
        grad_clipping(net, 1)
        updater.step()
        metric.add(l * y.numel(), y.numel())
    return math.exp(metric[0] / metric[1])
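The value returned at the end of the epoch is the perplexity, i.e. the exponential of the average per-token cross-entropy loss,

$$\exp\left(-\frac{1}{n}\sum_{t=1}^{n}\log P(x_t \mid x_{t-1}, \ldots, x_1)\right),$$

which is why the accumulated loss is divided by the token count before being exponentiated.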
def read_time_machine():
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
def tokenize(lines, token='word'):
    if token == 'word':
        return [line.split() for line in lines]
    else:
        return [list(line) for line in lines]
import collections
def count_corpus(tokens):
    if len(tokens) == 0 or isinstance(tokens[0], list):
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)
class Vocab:
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort token frequencies in descending order
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        # The unknown token <unk> gets index 0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # Index of the unknown token
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs
def seq_data_iter_random(corpus, batch_size, num_steps):
    # Random sampling: drop a random prefix, then shuffle subsequence start indices
    corpus = corpus[random.randint(0, num_steps - 1):]
    num_subseqs = (len(corpus) - 1) // num_steps
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        return corpus[pos:pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, num_batches * batch_size, batch_size):
        initial_indices_per_batch = initial_indices[i:i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = torch.tensor(corpus[offset:offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1:offset + num_tokens + 1])
    Xs, Ys = Xs.reshape((batch_size, -1)), Ys.reshape((batch_size, -1))
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i:i + num_steps]
        Y = Ys[:, i:i + num_steps]
        yield X, Y
def load_corpus_time_machine(max_tokens=-1):
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
class SeqDataLoader:
    """An iterator that loads minibatches of sequence data."""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = seq_data_iter_random
        else:
            self.data_iter_fn = seq_data_iter_sequential
        self.corpus, self.vocab = load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
def forward(self, inputs, state):
    # inputs has shape (batch_size, num_steps); transpose and one-hot encode
    # so that X has shape (num_steps, batch_size, vocab_size)
    X = F.one_hot(inputs.T.long(), self.vocab_size)
    X = X.type(torch.float32)
    Y, state = self.rnn(X, state)
    # The fully connected layer maps Y to shape (num_steps * batch_size, vocab_size)
    output = self.linear(Y.reshape(-1, Y.shape[-1]))
    return output, state
device = 'cuda:0'
net = RNNModel(rnn_layer, len(vocab))
net = net.to(device)
def predict(prefix, num_preds, net, vocab, device):
    outputs = [vocab[prefix[0]]]
    state = net.begin_state(batch_size=1, device=device)
    get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1, 1))
    # Warm-up: feed the prefix through the model to build up the hidden state
    for y in prefix[1:]:
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    # Generate num_preds new characters greedily
    for _ in range(num_preds):
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])
def grad_clipping(params, theta):
    # Clip gradients so that their global L2 norm does not exceed theta
    norm = torch.sqrt(sum(torch.sum(p.grad ** 2) for p in params))
    if norm > theta:
        for p in params:
            p.grad[:] *= theta / norm
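In formula form, clipping rescales the gradient g in place:

$$\mathbf{g} \leftarrow \min\left(1, \frac{\theta}{\lVert \mathbf{g} \rVert}\right)\mathbf{g},$$

so the direction of the update is preserved while its norm is capped at θ.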
def train_epoch(net, train_iter, loss, updater, use_random_iter):
    state = None
    total_loss, n_train = 0, 0  # Sum of training loss, number of tokens seen
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Re-initialize the state for the first batch or when sampling randomly
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                # state is a single tensor (e.g. nn.GRU)
                state.detach_()
            else:
                # state is a tuple of tensors (e.g. nn.LSTM or the from-scratch model)
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)
        l = loss(y_hat, y.long()).mean()