diff --git a/chapter_installation/index.md b/chapter_installation/index.md index d75d133cf9..8c9ac5810a 100644 --- a/chapter_installation/index.md +++ b/chapter_installation/index.md @@ -21,10 +21,10 @@ For example, if you are using macOS and Python 3.x you would download the bash script whose name contains the strings "Miniconda3" and "MacOSX", navigate to the download location, -and execute the installation as follows: +and execute the installation as follows (taking Intel Macs as an example): ```bash -sh Miniconda3-latest-MacOSX-x86_64.sh -b +sh Miniconda3-py39_4.12.0-MacOSX-x86_64.sh -b ``` @@ -50,7 +50,7 @@ You should be able to create a new environment as follows: ```bash -conda create --name d2l python=3.8 -y +conda create --name d2l python=3.9 -y ``` @@ -179,20 +179,20 @@ pip uninstall mxnet We now need to find out what version of CUDA you have installed. You can check this by running `nvcc --version` or `cat /usr/local/cuda/version.txt`. -Assume that you have installed CUDA 10.1, +Assume that you have installed CUDA 10.2, then you can install with the following command: ```bash # For Windows users -pip install mxnet-cu101==1.7.0 -f https://dist.mxnet.io/python +pip install mxnet-cu102==1.7.0 -f https://dist.mxnet.io/python # For Linux and macOS users -pip install mxnet-cu101==1.7.0 +pip install mxnet-cu102==1.7.0 ``` -You may change the last digits according to your CUDA version, e.g., `cu100` for -CUDA 10.0 and `cu90` for CUDA 9.0. +You may change the last digits according to your CUDA version, e.g., `cu101` for +CUDA 10.1 and `cu90` for CUDA 9.0. :end_tab: diff --git a/config.ini b/config.ini index fbcd4202a6..97ccc35126 100644 --- a/config.ini +++ b/config.ini @@ -12,7 +12,7 @@ author = Aston Zhang, Zachary C. Lipton, Mu Li, and Alexander J. Smola copyright = 2022, All authors. Licensed under CC-BY-SA-4.0 and MIT-0. 
-release = 0.17.5 +release = 0.17.6 diff --git a/d2l/__init__.py b/d2l/__init__.py index f2f7468270..ed8ec6846a 100644 --- a/d2l/__init__.py +++ b/d2l/__init__.py @@ -5,7 +5,8 @@ from d2l import mxnet as d2l # Use MXNet as the backend from d2l import torch as d2l # Use PyTorch as the backend from d2l import tensorflow as d2l # Use TensorFlow as the backend +from d2l import paddle as d2l # Use PaddlePaddle as the backend """ -__version__ = "0.17.5" +__version__ = "0.17.6" diff --git a/d2l/paddle.py b/d2l/paddle.py new file mode 100644 index 0000000000..6c5813aebb --- /dev/null +++ b/d2l/paddle.py @@ -0,0 +1,2680 @@ +################# WARNING ################ +# The below part is generated automatically through: +# d2lbook build lib +# Don't edit it directly + +import collections +import hashlib +import math +import os +import random +import re +import shutil +import sys +import tarfile +import time +import zipfile +from collections import defaultdict +import pandas as pd +import requests +from IPython import display +from matplotlib import pyplot as plt +from matplotlib_inline import backend_inline + +d2l = sys.modules[__name__] + +import warnings +import numpy as np + +warnings.filterwarnings("ignore") +import paddle +import paddle.vision as paddlevision +from paddle import nn +from paddle.nn import functional as F +from paddle.vision import transforms +from PIL import Image + +paddle.disable_signal_handler() + +def use_svg_display(): + """使用svg格式在Jupyter中显示绘图 + + Defined in :numref:`sec_calculus`""" + backend_inline.set_matplotlib_formats('svg') + +def set_figsize(figsize=(3.5, 2.5)): + """设置matplotlib的图表大小 + + Defined in :numref:`sec_calculus`""" + use_svg_display() + d2l.plt.rcParams['figure.figsize'] = figsize + +def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend): + """设置matplotlib的轴 + + Defined in :numref:`sec_calculus`""" + axes.set_xlabel(xlabel) + axes.set_ylabel(ylabel) + axes.set_xscale(xscale) + axes.set_yscale(yscale) + axes.set_xlim(xlim) + axes.set_ylim(ylim) + if legend: + axes.legend(legend) + axes.grid() + +def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None, + ylim=None, xscale='linear', yscale='linear', + fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None): + """绘制数据点 + + Defined in :numref:`sec_calculus`""" + if legend is None: + legend = [] + + set_figsize(figsize) + axes = axes if axes else d2l.plt.gca() + + # 如果X有一个轴,输出True + def has_one_axis(X): + return (hasattr(X, "ndim") and X.ndim == 1 or isinstance(X, list) + and not hasattr(X[0], "__len__")) + + if has_one_axis(X): + X = [X] + if Y is None: + X, Y = [[]] * len(X), X + elif has_one_axis(Y): + Y = [Y] + if len(X) != len(Y): + X = X * len(Y) + axes.cla() + for x, y, fmt in zip(X, Y, fmts): + if len(x): + axes.plot(x, y, fmt) + else: + axes.plot(y, fmt) + set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend) + +class Timer: + """记录多次运行时间""" + def __init__(self): + """Defined in :numref:`subsec_linear_model`""" + self.times = [] + self.start() + + def start(self): + """启动计时器""" + self.tik = time.time() + + def stop(self): + """停止计时器并将时间记录在列表中""" + self.times.append(time.time() - self.tik) + return self.times[-1] + + def avg(self): + """返回平均时间""" + return sum(self.times) / len(self.times) + + def sum(self): + """返回时间总和""" + return sum(self.times) + + def cumsum(self): + """返回累计时间""" + return np.array(self.times).cumsum().tolist() + +def synthetic_data(w, b, num_examples): + """生成y=Xw+b+噪声 + + Defined in :numref:`sec_linear_scratch`""" + X = d2l.normal(0, 
1, (num_examples, len(w))) + y = d2l.matmul(X, w) + b + y += d2l.normal(0, 0.01, y.shape) + return X, d2l.reshape(y, (-1, 1)) + +def linreg(X, w, b): + """线性回归模型 + + Defined in :numref:`sec_linear_scratch`""" + return d2l.matmul(X, w) + b + +def squared_loss(y_hat, y): + """均方损失 + + Defined in :numref:`sec_linear_scratch`""" + return (y_hat - d2l.reshape(y, y_hat.shape)) ** 2 / 2 + +def sgd(params, lr, batch_size): + """小批量随机梯度下降 + + Defined in :numref:`sec_linear_scratch`""" + with paddle.no_grad(): + for i, param in enumerate(params): + param -= lr * params[i].grad / batch_size + params[i].set_value(param) + params[i].clear_gradient() + +def load_array(data_arrays, batch_size, is_train=True): + """构造一个Paddle数据迭代器 + + Defined in :numref:`sec_linear_concise`""" + dataset = paddle.io.TensorDataset(data_arrays) + return paddle.io.DataLoader(dataset, batch_size=batch_size, + shuffle=is_train, + return_list=True) + +def get_fashion_mnist_labels(labels): + """返回Fashion-MNIST数据集的文本标签 + + Defined in :numref:`sec_fashion_mnist`""" + text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', + 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot'] + return [text_labels[int(i)] for i in labels] + +def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5): + """绘制图像列表 + + Defined in :numref:`sec_fashion_mnist`""" + figsize = (num_cols * scale, num_rows * scale) + _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize) + axes = axes.flatten() + for i, (ax, img) in enumerate(zip(axes, imgs)): + if paddle.is_tensor(img): + # 图片张量 + ax.imshow(img.numpy()) + else: + # PIL图片 + ax.imshow(img) + ax.axes.get_xaxis().set_visible(False) + ax.axes.get_yaxis().set_visible(False) + if titles: + ax.set_title(titles[i]) + return axes + +def get_dataloader_workers(): + """使用4个进程来读取数据 + + Defined in :numref:`sec_fashion_mnist`""" + return 4 + +def load_data_fashion_mnist(batch_size, resize=None): + """下载Fashion-MNIST数据集,然后将其加载到内存中 + + Defined in :numref:`sec_fashion_mnist`""" + trans = [transforms.ToTensor()] + if resize: + trans.insert(0, transforms.Resize(resize)) + trans = transforms.Compose(trans) + mnist_train = paddle.vision.datasets.FashionMNIST(mode="train", + transform=trans) + mnist_test = paddle.vision.datasets.FashionMNIST(mode="test", + transform=trans) + return (paddle.io.DataLoader(dataset=mnist_train, + batch_size=batch_size, + shuffle=True, + return_list=True, + num_workers=get_dataloader_workers()), + paddle.io.DataLoader(dataset=mnist_test, + batch_size=batch_size, + return_list=True, + shuffle=True, + num_workers=get_dataloader_workers())) + +def accuracy(y_hat, y): + """计算预测正确的数量 + + Defined in :numref:`sec_softmax_scratch`""" + if len(y_hat.shape) > 1 and y_hat.shape[1] > 1: + y_hat = d2l.argmax(y_hat, axis=1) + cmp = d2l.astype(y_hat, y.dtype) == y + return float(d2l.reduce_sum(d2l.astype(cmp, y.dtype))) + +def accuracy(y_hat, y): + """计算预测正确的数量 + + Defined in :numref:`sec_softmax_scratch`""" + if len(y_hat.shape) > 1 and y_hat.shape[1] > 1: + y_hat = y_hat.argmax(axis=1) + if len(y_hat.shape) < len(y.shape): + cmp = y_hat.astype(y.dtype) == y.squeeze() + else: + cmp = y_hat.astype(y.dtype) == y + return float(cmp.astype(y.dtype).sum()) + +def evaluate_accuracy(net, data_iter): + """计算在指定数据集上模型的精度 + + Defined in :numref:`sec_softmax_scratch`""" + if isinstance(net, paddle.nn.Layer): + net.eval() # 将模型设置为评估模式 + metric = Accumulator(2) # 正确预测数、预测总数 + with paddle.no_grad(): + for X, y in data_iter: + metric.add(accuracy(net(X), y), d2l.size(y)) + return metric[0] / metric[1] + 
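A quick check of the `accuracy` helper defined above, on a toy batch of three-class predictions (illustrative values only; `evaluate_accuracy` simply accumulates these per-batch counts over a whole data iterator):

```python
import paddle

y_hat = paddle.to_tensor([[0.1, 0.3, 0.6],
                          [0.3, 0.2, 0.5]])
y = paddle.to_tensor([0, 2])
# Row 0 predicts class 2 while the label is 0; row 1 is correct.
accuracy(y_hat, y) / len(y)  # 0.5
```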
+class Accumulator: + """在n个变量上累加""" + def __init__(self, n): + """Defined in :numref:`sec_softmax_scratch`""" + self.data = [0.0] * n + + def add(self, *args): + self.data = [a + float(b) for a, b in zip(self.data, args)] + + def reset(self): + self.data = [0.0] * len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + +def train_epoch_ch3(net, train_iter, loss, updater): + """训练模型一个迭代周期(定义见第3章) + + Defined in :numref:`sec_softmax_scratch`""" + # 将模型设置为训练模式 + if isinstance(net, paddle.nn.Layer): + net.train() + # 训练损失总和、训练准确度总和、样本数 + metric = Accumulator(3) + + for X, y in train_iter: + # 计算梯度并更新参数 + y_hat = net(X) + l = loss(y_hat, y) + if isinstance(updater, paddle.optimizer.Optimizer): + # 使用PaddlePaddle内置的优化器和损失函数 + updater.clear_grad() + l.mean().backward() + updater.step() + else: + # 使用定制的优化器和损失函数 + l.sum().backward() + updater(X.shape[0]) + metric.add(float(l.sum()), accuracy(y_hat, y), y.numel()) + return metric[0] / metric[2], metric[1] / metric[2] + +class Animator: + """在动画中绘制数据""" + def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None, + ylim=None, xscale='linear', yscale='linear', + fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1, + figsize=(3.5, 2.5)): + """Defined in :numref:`sec_softmax_scratch`""" + # 增量地绘制多条线 + if legend is None: + legend = [] + d2l.use_svg_display() + self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize) + if nrows * ncols == 1: + self.axes = [self.axes, ] + # 使用lambda函数捕获参数 + self.config_axes = lambda: d2l.set_axes( + self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend) + self.X, self.Y, self.fmts = None, None, fmts + + def add(self, x, y): + # 向图表中添加多个数据点 + if not hasattr(y, "__len__"): + y = [y] + n = len(y) + if not hasattr(x, "__len__"): + x = [x] * n + if not self.X: + self.X = [[] for _ in range(n)] + if not self.Y: + self.Y = [[] for _ in range(n)] + for i, (a, b) in enumerate(zip(x, y)): + if a is not None and b is not None: + self.X[i].append(a) + self.Y[i].append(b) + self.axes[0].cla() + for x, y, fmt in zip(self.X, self.Y, self.fmts): + self.axes[0].plot(x, y, fmt) + self.config_axes() + display.display(self.fig) + display.clear_output(wait=True) + +def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater): + """训练模型(定义见第3章) + + Defined in :numref:`sec_softmax_scratch`""" + animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9], + legend=['train loss', 'train acc', 'test acc']) + for epoch in range(num_epochs): + train_metrics = train_epoch_ch3(net, train_iter, loss, updater) + test_acc = evaluate_accuracy(net, test_iter) + animator.add(epoch + 1, train_metrics + (test_acc,)) + train_loss, train_acc = train_metrics + assert train_loss < 0.5, train_loss + assert train_acc <= 1 and train_acc > 0.7, train_acc + assert test_acc <= 1 and test_acc > 0.7, test_acc + +def predict_ch3(net, test_iter, n=6): + """预测标签(定义见第3章) + + Defined in :numref:`sec_softmax_scratch`""" + for X, y in test_iter: + break + trues = d2l.get_fashion_mnist_labels(y) + preds = d2l.get_fashion_mnist_labels(d2l.argmax(net(X), axis=1)) + titles = [true +'\n' + pred for true, pred in zip(trues, preds)] + d2l.show_images( + d2l.reshape(X[0:n], (n, 28, 28)), 1, n, titles=titles[0:n]) + +def evaluate_loss(net, data_iter, loss): + """评估给定数据集上模型的损失。 + + Defined in :numref:`sec_model_selection`""" + metric = d2l.Accumulator(2) # 损失的总和, 样本数量 + for X, y in data_iter: + out = net(X) + y = y.reshape(out.shape) + l = loss(out, y) + metric.add(l.sum(), l.numel()) + return metric[0] / metric[1] + 
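Taken together, the chapter-3 helpers above are driven roughly as follows. This is a minimal sketch assuming a working PaddlePaddle install; the linear classifier, learning rate, and epoch count are illustrative stand-ins:

```python
import paddle
from paddle import nn
from d2l import paddle as d2l

batch_size, lr, num_epochs = 256, 0.1, 10
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# A bare softmax-regression model, just enough to exercise the training loop.
net = nn.Sequential(nn.Flatten(), nn.Linear(784, 10))
loss = nn.CrossEntropyLoss(reduction='none')
trainer = paddle.optimizer.SGD(learning_rate=lr, parameters=net.parameters())

d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
d2l.predict_ch3(net, test_iter)
```

Run in a notebook, `train_ch3` uses the `Animator` defined above to plot training loss and accuracy as the epochs progress.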
+DATA_HUB = dict() +DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/' + +def download(name, cache_dir=os.path.join('..', 'data')): + """下载一个DATA_HUB中的文件,返回本地文件名 + + Defined in :numref:`sec_kaggle_house`""" + assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}" + url, sha1_hash = DATA_HUB[name] + os.makedirs(cache_dir, exist_ok=True) + fname = os.path.join(cache_dir, url.split('/')[-1]) + if os.path.exists(fname): + sha1 = hashlib.sha1() + with open(fname, 'rb') as f: + while True: + data = f.read(1048576) + if not data: + break + sha1.update(data) + if sha1.hexdigest() == sha1_hash: + return fname # 命中缓存 + print(f'正在从{url}下载{fname}...') + r = requests.get(url, stream=True, verify=True) + with open(fname, 'wb') as f: + f.write(r.content) + return fname + +def download_extract(name, folder=None): + """下载并解压zip/tar文件 + + Defined in :numref:`sec_kaggle_house`""" + fname = download(name) + base_dir = os.path.dirname(fname) + data_dir, ext = os.path.splitext(fname) + if ext == '.zip': + fp = zipfile.ZipFile(fname, 'r') + elif ext in ('.tar', '.gz'): + fp = tarfile.open(fname, 'r') + else: + assert False, '只有zip/tar文件可以被解压缩' + fp.extractall(base_dir) + return os.path.join(base_dir, folder) if folder else data_dir + +def download_all(): + """下载DATA_HUB中的所有文件 + + Defined in :numref:`sec_kaggle_house`""" + for name in DATA_HUB: + download(name) + +DATA_HUB['kaggle_house_train'] = ( + DATA_URL + 'kaggle_house_pred_train.csv', + '585e9cc93e70b39160e7921475f9bcd7d31219ce') + +DATA_HUB['kaggle_house_test'] = ( + DATA_URL + 'kaggle_house_pred_test.csv', + 'fa19780a7b011d9b009e8bff8e99922a8ee2eb90') + +def try_gpu(i=0): + """如果存在,则返回gpu(i),否则返回cpu()。 + + Defined in :numref:`sec_use_gpu`""" + if paddle.device.cuda.device_count() >= i + 1: + return paddle.CUDAPlace(i) + return paddle.CPUPlace() + +def try_all_gpus(): + """返回所有可用的GPU,如果没有GPU,则返回[cpu(),]。 + + Defined in :numref:`sec_use_gpu`""" + devices = [paddle.CUDAPlace(i) + for i in range(paddle.device.cuda.device_count())] + return devices if devices else paddle.CPUPlace() + +def corr2d(X, K): + """计算二维互相关运算 + + Defined in :numref:`sec_conv_layer`""" + h, w = K.shape + Y = d2l.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1)) + for i in range(Y.shape[0]): + for j in range(Y.shape[1]): + Y[i, j] = d2l.reduce_sum((X[i: i + h, j: j + w] * K)) + return Y + +def evaluate_accuracy_gpu(net, data_iter, device=None): + """使用GPU计算模型在数据集上的精度 + + Defined in :numref:`sec_lenet`""" + if isinstance(net, nn.Layer): + net.eval() # 设置为评估模式 + if not device: + device = next(iter(net.parameters())).place + paddle.set_device("gpu:{}".format(str(device)[-2])) + # 正确预测的数量,总预测的数量 + metric = d2l.Accumulator(2) + with paddle.no_grad(): + for X, y in data_iter: + if isinstance(X, list): + # BERT微调所需的 + X = [paddle.to_tensor(x, place=device) for x in X] + else: + X = paddle.to_tensor(X, place=device) + y = paddle.to_tensor(y, place=device) + metric.add(d2l.accuracy(net(X), y), d2l.size(y)) + return metric[0] / metric[1] + +def train_ch6(net, train_iter, test_iter, num_epochs, lr, device): + """用GPU训练模型(在第六章定义) + + Defined in :numref:`sec_lenet`""" + def init_weights(m): + if type(m) == nn.Linear or type(m) == nn.Conv2D: + nn.initializer.XavierUniform(m.weight) + net.apply(init_weights) + print('training on', device) + net.to(device) + optimizer = paddle.optimizer.SGD(learning_rate=lr, parameters=net.parameters()) + loss = nn.CrossEntropyLoss() + animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], + legend=['train loss', 'train acc', 'test acc']) + timer, num_batches = 
d2l.Timer(), len(train_iter) + for epoch in range(num_epochs): + # 训练损失之和,训练准确率之和,样本数 + metric = d2l.Accumulator(3) + net.train() + for i, (X, y) in enumerate(train_iter): + timer.start() + optimizer.clear_grad() + X, y = paddle.to_tensor(X, place=device), paddle.to_tensor(y, place=device) + y_hat = net(X) + l = loss(y_hat, y) + l.backward() + optimizer.step() + with paddle.no_grad(): + metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0]) + timer.stop() + train_l = metric[0] / metric[2] + train_acc = metric[1] / metric[2] + if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1: + animator.add(epoch + (i + 1) / num_batches, + (train_l, train_acc, None)) + test_acc = evaluate_accuracy_gpu(net, test_iter) + animator.add(epoch + 1, (None, None, test_acc)) + print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, ' + f'test acc {test_acc:.3f}') + print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec ' + f'on {str(device)}') + +class Residual(nn.Layer): + def __init__(self, input_channels, num_channels, use_1x1conv=False, + strides=1): + super(Residual, self).__init__() + self.conv1 = nn.Conv2D(input_channels, num_channels, kernel_size=3, + padding=1, stride=strides) + self.conv2 = nn.Conv2D(num_channels, num_channels, kernel_size=3, + padding=1) + if use_1x1conv: + self.conv3 = nn.Conv2D(input_channels, num_channels, + kernel_size=1, stride=strides) + else: + self.conv3 = None + self.bn1 = nn.BatchNorm2D(num_channels) + self.bn2 = nn.BatchNorm2D(num_channels) + self.relu = nn.ReLU() + + def forward(self, X): + Y = F.relu(self.bn1(self.conv1(X))) + Y = self.bn2(self.conv2(Y)) + if self.conv3: + X = self.conv3(X) + Y += X + return F.relu(Y) + +d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt', + '090b5e7e70c295757f55df93cb0a180b9691891a') + +def read_time_machine(): + """将时间机器数据集加载到文本行的列表中 + + Defined in :numref:`sec_text_preprocessing`""" + with open(d2l.download('time_machine'), 'r') as f: + lines = f.readlines() + return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines] + +def tokenize(lines, token='word'): + """将文本行拆分为单词或字符词元 + + Defined in :numref:`sec_text_preprocessing`""" + if token == 'word': + return [line.split() for line in lines] + elif token == 'char': + return [list(line) for line in lines] + else: + print('错误:未知词元类型:' + token) + +class Vocab: + """文本词表""" + def __init__(self, tokens=None, min_freq=0, reserved_tokens=None): + """Defined in :numref:`sec_text_preprocessing`""" + if tokens is None: + tokens = [] + if reserved_tokens is None: + reserved_tokens = [] + # 按出现频率排序 + counter = count_corpus(tokens) + self._token_freqs = sorted(counter.items(), key=lambda x: x[1], + reverse=True) + # 未知词元的索引为0 + self.idx_to_token = [''] + reserved_tokens + self.token_to_idx = {token: idx + for idx, token in enumerate(self.idx_to_token)} + for token, freq in self._token_freqs: + if freq < min_freq: + break + if token not in self.token_to_idx: + self.idx_to_token.append(token) + self.token_to_idx[token] = len(self.idx_to_token) - 1 + + def __len__(self): + return len(self.idx_to_token) + + def __getitem__(self, tokens): + if not isinstance(tokens, (list, tuple)): + return self.token_to_idx.get(tokens, self.unk) + return [self.__getitem__(token) for token in tokens] + + def to_tokens(self, indices): + if not isinstance(indices, (list, tuple)): + return self.idx_to_token[indices] + return [self.idx_to_token[index] for index in indices] + + @property + def unk(self): # 未知词元的索引为0 + return 0 + + @property + def token_freqs(self): + return 
self._token_freqs + +def count_corpus(tokens): + """统计词元的频率 + + Defined in :numref:`sec_text_preprocessing`""" + # 这里的tokens是1D列表或2D列表 + if len(tokens) == 0 or isinstance(tokens[0], list): + # 将词元列表展平成一个列表 + tokens = [token for line in tokens for token in line] + return collections.Counter(tokens) + +def load_corpus_time_machine(max_tokens=-1): + """返回时光机器数据集的词元索引列表和词表 + + Defined in :numref:`sec_text_preprocessing`""" + lines = read_time_machine() + tokens = tokenize(lines, 'char') + vocab = Vocab(tokens) + # 因为时光机器数据集中的每个文本行不一定是一个句子或一个段落, + # 所以将所有文本行展平到一个列表中 + corpus = [vocab[token] for line in tokens for token in line] + if max_tokens > 0: + corpus = corpus[:max_tokens] + return corpus, vocab + +def seq_data_iter_random(corpus, batch_size, num_steps): + """使用随机抽样生成一个小批量子序列 + + Defined in :numref:`sec_language_model`""" + # 从随机偏移量开始对序列进行分区,随机范围包括num_steps-1 + corpus = corpus[random.randint(0, num_steps - 1):] + # 减去1,是因为我们需要考虑标签 + num_subseqs = (len(corpus) - 1) // num_steps + # 长度为num_steps的子序列的起始索引 + initial_indices = list(range(0, num_subseqs * num_steps, num_steps)) + # 在随机抽样的迭代过程中, + # 来自两个相邻的、随机的、小批量中的子序列不一定在原始序列上相邻 + random.shuffle(initial_indices) + + def data(pos): + # 返回从pos位置开始的长度为num_steps的序列 + return corpus[pos: pos + num_steps] + + num_batches = num_subseqs // batch_size + for i in range(0, batch_size * num_batches, batch_size): + # 在这里,initial_indices包含子序列的随机起始索引 + initial_indices_per_batch = initial_indices[i: i + batch_size] + X = [data(j) for j in initial_indices_per_batch] + Y = [data(j + 1) for j in initial_indices_per_batch] + yield d2l.tensor(X), d2l.tensor(Y) + +def seq_data_iter_sequential(corpus, batch_size, num_steps): + """使用顺序分区生成一个小批量子序列 + + Defined in :numref:`sec_language_model`""" + # 从随机偏移量开始划分序列 + offset = random.randint(0, num_steps) + num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size + Xs = d2l.tensor(corpus[offset: offset + num_tokens]) + Ys = d2l.tensor(corpus[offset + 1: offset + 1 + num_tokens]) + Xs, Ys = Xs.reshape((batch_size, -1)), Ys.reshape((batch_size, -1)) + num_batches = Xs.shape[1] // num_steps + for i in range(0, num_steps * num_batches, num_steps): + X = Xs[:, i: i + num_steps] + Y = Ys[:, i: i + num_steps] + yield X, Y + +class SeqDataLoader: + """加载序列数据的迭代器""" + def __init__(self, batch_size, num_steps, use_random_iter, max_tokens): + """Defined in :numref:`sec_language_model`""" + if use_random_iter: + self.data_iter_fn = d2l.seq_data_iter_random + else: + self.data_iter_fn = d2l.seq_data_iter_sequential + self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens) + self.batch_size, self.num_steps = batch_size, num_steps + + def __iter__(self): + return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps) + +def load_data_time_machine(batch_size, num_steps, + use_random_iter=False, max_tokens=10000): + """返回时光机器数据集的迭代器和词表 + + Defined in :numref:`sec_language_model`""" + data_iter = SeqDataLoader( + batch_size, num_steps, use_random_iter, max_tokens) + return data_iter, data_iter.vocab + +class RNNModelScratch: + """从零开始实现的循环神经网络模型""" + def __init__(self, vocab_size, num_hiddens, + get_params, init_state, forward_fn): + """Defined in :numref:`sec_rnn_scratch`""" + self.vocab_size, self.num_hiddens = vocab_size, num_hiddens + self.params = get_params(vocab_size, num_hiddens) + self.init_state, self.forward_fn = init_state, forward_fn + + def __call__(self, X, state): + X = F.one_hot(X.T, self.vocab_size) + return self.forward_fn(X, state, self.params) + + def begin_state(self, batch_size): + 
return self.init_state(batch_size, self.num_hiddens) + +def predict_ch8(prefix, num_preds, net, vocab, device): + """在prefix后面生成新字符 + + Defined in :numref:`sec_rnn_scratch`""" + state = net.begin_state(batch_size=1) + outputs = [vocab[prefix[0]]] + get_input = lambda: d2l.reshape(d2l.tensor(outputs[-1], place=device), (1, 1)) + for y in prefix[1:]: # 预热期 + _, state = net(get_input(), state) + outputs.append(vocab[y]) + for _ in range(num_preds): # 预测num_preds步 + y, state = net(get_input(), state) + outputs.append(int(paddle.reshape(paddle.argmax(y,axis=1),shape=[1]))) + return ''.join([vocab.idx_to_token[i] for i in outputs]) + +def grad_clipping(net, theta): + """裁剪梯度 + + Defined in :numref:`sec_rnn_scratch`""" + if isinstance(net, nn.Layer): + params = [p for p in net.parameters() if not p.stop_gradient] + else: + params = net.params + norm = paddle.sqrt(sum(paddle.sum((p.grad ** 2)) for p in params)) + if norm > theta: + with paddle.no_grad(): + for param in params: + param.grad.set_value(param.grad * theta / norm) + +def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter): + """训练网络一个迭代周期(定义见第8章) + + Defined in :numref:`sec_rnn_scratch`""" + state, timer = None, d2l.Timer() + metric = d2l.Accumulator(2) # 训练损失之和,词元数量 + for X, Y in train_iter: + if state is None or use_random_iter: + # 在第一次迭代或使用随机抽样时初始化state + state = net.begin_state(batch_size=X.shape[0]) + else: + if isinstance(net, nn.Layer) and not isinstance(state, tuple): + # state对于nn.GRU是个张量 + state.stop_gradient=True + else: + # state对于nn.LSTM或对于我们从零开始实现的模型是个张量 + for s in state: + s.stop_gradient=True + y = paddle.reshape(Y.T,shape=[-1]) + X = paddle.to_tensor(X, place=device) + y = paddle.to_tensor(y, place=device) + y_hat, state = net(X, state) + l = loss(y_hat, y).mean() + if isinstance(updater, paddle.optimizer.Optimizer): + updater.clear_grad() + l.backward() + grad_clipping(net, 1) + updater.step() + else: + l.backward() + grad_clipping(net, 1) + # 因为已经调用了mean函数 + updater(batch_size=1) + + metric.add(l * d2l.size(y), d2l.size(y)) + return math.exp(metric[0] / metric[1]), metric[1] / timer.stop() + +def train_ch8(net, train_iter, vocab, lr, num_epochs, device, use_random_iter=False): + """训练模型(定义见第8章) + + Defined in :numref:`sec_rnn_scratch`""" + loss = nn.CrossEntropyLoss() + animator = d2l.Animator(xlabel='epoch', ylabel='perplexity', + legend=['train'], xlim=[10, num_epochs]) + # 初始化 + if isinstance(net, nn.Layer): + updater = paddle.optimizer.SGD( + learning_rate=lr, parameters=net.parameters()) + else: + updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size) + predict = lambda prefix: predict_ch8(prefix, 50, net, vocab, device) + # 训练和预测 + for epoch in range(num_epochs): + ppl, speed = train_epoch_ch8( + net, train_iter, loss, updater, device, use_random_iter) + if (epoch + 1) % 10 == 0: + print(predict('time traveller')) + animator.add(epoch + 1, [ppl]) + print(f'困惑度 {ppl:.1f}, {speed:.1f} 词元/秒 {str(device)}') + print(predict('time traveller')) + print(predict('traveller')) + +class RNNModel(nn.Layer): + """循环神经网络模型 + + Defined in :numref:`sec_rnn-concise`""" + def __init__(self, rnn_layer, vocab_size, **kwargs): + super(RNNModel, self).__init__(**kwargs) + self.rnn = rnn_layer + self.vocab_size = vocab_size + self.num_hiddens = self.rnn.hidden_size + # 如果RNN是双向的(之后将介绍),num_directions应该是2,否则应该是1 + if self.rnn.num_directions==1: + self.num_directions = 1 + self.linear = nn.Linear(self.num_hiddens, self.vocab_size) + else: + self.num_directions = 2 + self.linear = 
nn.Linear(self.num_hiddens * 2, self.vocab_size) + + def forward(self, inputs, state): + X = F.one_hot(inputs.T, self.vocab_size) + Y, state = self.rnn(X, state) + # 全连接层首先将Y的形状改为(时间步数*批量大小,隐藏单元数) + # 它的输出形状是(时间步数*批量大小,词表大小)。 + output = self.linear(Y.reshape((-1, Y.shape[-1]))) + return output, state + + def begin_state(self, batch_size=1): + if not isinstance(self.rnn, nn.LSTM): + # nn.GRU以张量作为隐状态 + return paddle.zeros(shape=[self.num_directions * self.rnn.num_layers, + batch_size, self.num_hiddens]) + else: + # nn.LSTM以元组作为隐状态 + return (paddle.zeros( + shape=[self.num_directions * self.rnn.num_layers, + batch_size, self.num_hiddens]), + paddle.zeros( + shape=[self.num_directions * self.rnn.num_layers, + batch_size, self.num_hiddens])) + +d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip', + '94646ad1522d915e7b0f9296181140edcf86a4f5') + +def read_data_nmt(): + """载入“英语-法语”数据集 + + Defined in :numref:`sec_machine_translation`""" + data_dir = d2l.download_extract('fra-eng') + with open(os.path.join(data_dir, 'fra.txt'), 'r', + encoding='utf-8') as f: + return f.read() + +def preprocess_nmt(text): + """预处理“英语-法语”数据集 + + Defined in :numref:`sec_machine_translation`""" + def no_space(char, prev_char): + return char in set(',.!?') and prev_char != ' ' + + # 使用空格替换不间断空格 + # 使用小写字母替换大写字母 + text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower() + # 在单词和标点符号之间插入空格 + out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char + for i, char in enumerate(text)] + return ''.join(out) + +def tokenize_nmt(text, num_examples=None): + """词元化“英语-法语”数据数据集 + + Defined in :numref:`sec_machine_translation`""" + source, target = [], [] + for i, line in enumerate(text.split('\n')): + if num_examples and i > num_examples: + break + parts = line.split('\t') + if len(parts) == 2: + source.append(parts[0].split(' ')) + target.append(parts[1].split(' ')) + return source, target + +def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist): + """绘制列表长度对的直方图 + + Defined in :numref:`sec_machine_translation`""" + d2l.set_figsize() + _, _, patches = d2l.plt.hist( + [[len(l) for l in xlist], [len(l) for l in ylist]]) + d2l.plt.xlabel(xlabel) + d2l.plt.ylabel(ylabel) + for patch in patches[1].patches: + patch.set_hatch('/') + d2l.plt.legend(legend) + +def truncate_pad(line, num_steps, padding_token): + """截断或填充文本序列 + + Defined in :numref:`sec_machine_translation`""" + if len(line) > num_steps: + return line[:num_steps] # 截断 + return line + [padding_token] * (num_steps - len(line)) # 填充 + +def build_array_nmt(lines, vocab, num_steps): + """将机器翻译的文本序列转换成小批量 + + Defined in :numref:`subsec_mt_data_loading`""" + lines = [vocab[l] for l in lines] + lines = [l + [vocab['']] for l in lines] + array = d2l.tensor([truncate_pad( + l, num_steps, vocab['']) for l in lines]) + valid_len = d2l.reduce_sum( + d2l.astype(array != vocab[''], d2l.int32), 1) + return array, valid_len + +def load_data_nmt(batch_size, num_steps, num_examples=600): + """返回翻译数据集的迭代器和词表 + + Defined in :numref:`subsec_mt_data_loading`""" + text = preprocess_nmt(read_data_nmt()) + source, target = tokenize_nmt(text, num_examples) + src_vocab = d2l.Vocab(source, min_freq=2, + reserved_tokens=['', '', '']) + tgt_vocab = d2l.Vocab(target, min_freq=2, + reserved_tokens=['', '', '']) + src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps) + tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps) + data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len) + data_iter = d2l.load_array(data_arrays, 
batch_size) + return data_iter, src_vocab, tgt_vocab + +class Encoder(nn.Layer): + """编码器-解码器架构的基本编码器接口""" + def __init__(self, **kwargs): + super(Encoder, self).__init__(**kwargs) + + def forward(self, X, *args): + raise NotImplementedError + +class Decoder(nn.Layer): + """编码器-解码器架构的基本解码器接口 + + Defined in :numref:`sec_encoder-decoder`""" + def __init__(self, **kwargs): + super(Decoder, self).__init__(**kwargs) + + def init_state(self, enc_outputs, *args): + raise NotImplementedError + + def forward(self, X, state): + raise NotImplementedError + +class EncoderDecoder(nn.Layer): + """编码器-解码器架构的基类 + + Defined in :numref:`sec_encoder-decoder`""" + def __init__(self, encoder, decoder, **kwargs): + super(EncoderDecoder, self).__init__(**kwargs) + self.encoder = encoder + self.decoder = decoder + + def forward(self, enc_X, dec_X, *args): + enc_outputs = self.encoder(enc_X, *args) + dec_state = self.decoder.init_state(enc_outputs, *args) + return self.decoder(dec_X, dec_state) + +class Seq2SeqEncoder(d2l.Encoder): + """用于序列到序列学习的循环神经网络编码器 + + Defined in :numref:`sec_seq2seq`""" + def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, + dropout=0, **kwargs): + super(Seq2SeqEncoder, self).__init__(**kwargs) + weight_ih_attr = paddle.ParamAttr(initializer=nn.initializer.XavierUniform()) + weight_hh_attr = paddle.ParamAttr(initializer=nn.initializer.XavierUniform()) + # 嵌入层 + self.embedding = nn.Embedding(vocab_size, embed_size) + self.rnn = nn.GRU(embed_size, num_hiddens, num_layers, dropout=dropout, + time_major=True, weight_ih_attr=weight_ih_attr, weight_hh_attr=weight_hh_attr) + + def forward(self, X, *args): + # 输出'X'的形状:(batch_size,num_steps,embed_size) + X = self.embedding(X) + # 在循环神经网络模型中,第一个轴对应于时间步 + X = X.transpose([1, 0, 2]) + # 如果未提及状态,则默认为0 + output, state = self.rnn(X) + # PaddlePaddle的GRU层output的形状:(batch_size,time_steps,num_directions * num_hiddens), + # 需设定time_major=True,指定input的第一个维度为time_steps + # state[0]的形状:(num_layers,batch_size,num_hiddens) + return output, state + +def sequence_mask(X, valid_len, value=0): + """在序列中屏蔽不相关的项 + + Defined in :numref:`sec_seq2seq_decoder`""" + maxlen = X.shape[1] + mask = paddle.arange((maxlen), dtype=paddle.float32)[None, :] < valid_len[:, None] + Xtype = X.dtype + X = X.astype(paddle.float32) + X[~mask] = float(value) + return X.astype(Xtype) + +class MaskedSoftmaxCELoss(nn.CrossEntropyLoss): + """带遮蔽的softmax交叉熵损失函数 + + Defined in :numref:`sec_seq2seq_decoder`""" + # pred的形状:(batch_size,num_steps,vocab_size) + # label的形状:(batch_size,num_steps) + # valid_len的形状:(batch_size,) + def forward(self, pred, label, valid_len): + weights = paddle.ones_like(label) + weights = sequence_mask(weights, valid_len) + self.reduction='none' + unweighted_loss = super(MaskedSoftmaxCELoss, self).forward( + pred, label) + weighted_loss = (unweighted_loss * weights).mean(axis=1) + return weighted_loss + +def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device): + """训练序列到序列模型 + + Defined in :numref:`sec_seq2seq_decoder`""" + optimizer = paddle.optimizer.Adam(learning_rate=lr, parameters=net.parameters()) + loss = MaskedSoftmaxCELoss() + net.train() + animator = d2l.Animator(xlabel='epoch', ylabel='loss', + xlim=[10, num_epochs]) + for epoch in range(num_epochs): + timer = d2l.Timer() + metric = d2l.Accumulator(2) # 训练损失总和,词元数量 + for batch in data_iter: + optimizer.clear_grad() + X, X_valid_len, Y, Y_valid_len = [paddle.to_tensor(x, place=device) for x in batch] + bos = paddle.to_tensor([tgt_vocab['']] * Y.shape[0]).reshape([-1, 1]) + dec_input 
= paddle.concat([bos, Y[:, :-1]], 1) # 强制教学 + Y_hat, _ = net(X, dec_input, X_valid_len.squeeze()) + l = loss(Y_hat, Y, Y_valid_len.squeeze()) + l.backward() # 损失函数的标量进行“反向传播” + d2l.grad_clipping(net, 1) + num_tokens = Y_valid_len.sum() + optimizer.step() + with paddle.no_grad(): + metric.add(l.sum(), num_tokens) + if (epoch + 1) % 10 == 0: + animator.add(epoch + 1, (metric[0] / metric[1],)) + print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} ' + f'tokens/sec on {str(device)}') + +def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps, + device, save_attention_weights=False): + """序列到序列模型的预测 + + Defined in :numref:`sec_seq2seq_training`""" + # 在预测时将net设置为评估模式 + net.eval() + src_tokens = src_vocab[src_sentence.lower().split(' ')] + [ + src_vocab['']] + enc_valid_len = paddle.to_tensor([len(src_tokens)], place=device) + src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['']) + # 添加批量轴 + enc_X = paddle.unsqueeze( + paddle.to_tensor(src_tokens, dtype=paddle.int64, place=device), axis=0) + enc_outputs = net.encoder(enc_X, enc_valid_len) + dec_state = net.decoder.init_state(enc_outputs, enc_valid_len) + # 添加批量轴 + dec_X = paddle.unsqueeze(paddle.to_tensor( + [tgt_vocab['']], dtype=paddle.int64, place=device), axis=0) + output_seq, attention_weight_seq = [], [] + for _ in range(num_steps): + Y, dec_state = net.decoder(dec_X, dec_state) + # 我们使用具有预测最高可能性的词元,作为解码器在下一时间步的输入 + dec_X = Y.argmax(axis=2) + pred = dec_X.squeeze(axis=0).astype(paddle.int32).item() + # 保存注意力权重(稍后讨论) + if save_attention_weights: + attention_weight_seq.append(net.decoder.attention_weights) + # 一旦序列结束词元被预测,输出序列的生成就完成了 + if pred == tgt_vocab['']: + break + output_seq.append(pred) + return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq + +def bleu(pred_seq, label_seq, k): + """计算BLEU + + Defined in :numref:`sec_seq2seq_training`""" + pred_tokens, label_tokens = pred_seq.split(' '), label_seq.split(' ') + len_pred, len_label = len(pred_tokens), len(label_tokens) + score = math.exp(min(0, 1 - len_label / len_pred)) + for n in range(1, k + 1): + num_matches, label_subs = 0, collections.defaultdict(int) + for i in range(len_label - n + 1): + label_subs[' '.join(label_tokens[i: i + n])] += 1 + for i in range(len_pred - n + 1): + if label_subs[' '.join(pred_tokens[i: i + n])] > 0: + num_matches += 1 + label_subs[' '.join(pred_tokens[i: i + n])] -= 1 + score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n)) + return score + +def show_heatmaps(matrices, xlabel, ylabel, titles=None, figsize=(2.5, 2.5), + cmap='Reds'): + """显示矩阵热图 + + Defined in :numref:`sec_attention-cues`""" + d2l.use_svg_display() + num_rows, num_cols = matrices.shape[0], matrices.shape[1] + fig, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize, + sharex=True, sharey=True, squeeze=False) + for i, (row_axes, row_matrices) in enumerate(zip(axes, matrices)): + for j, (ax, matrix) in enumerate(zip(row_axes, row_matrices)): + pcm = ax.imshow(d2l.numpy(matrix), cmap=cmap) + if i == num_rows - 1: + ax.set_xlabel(xlabel) + if j == 0: + ax.set_ylabel(ylabel) + if titles: + ax.set_title(titles[j]) + fig.colorbar(pcm, ax=axes, shrink=0.6); + +def masked_softmax(X, valid_lens): + """通过在最后一个轴上掩蔽元素来执行softmax操作 + + Defined in :numref:`sec_attention-scoring-functions`""" + # X:3D张量,valid_lens:1D或2D张量 + if valid_lens is None: + return nn.functional.softmax(X, axis=-1) + else: + shape = X.shape + if valid_lens.dim() == 1: + valid_lens = paddle.repeat_interleave(valid_lens, shape[1]) + else: + 
valid_lens = valid_lens.reshape((-1,)) + # 最后一轴上被掩蔽的元素使用一个非常大的负值替换,从而其softmax输出为0 + X = d2l.sequence_mask(X.reshape((-1, shape[-1])), valid_lens, + value=-1e6) + return nn.functional.softmax(X.reshape(shape), axis=-1) + +class AdditiveAttention(nn.Layer): + """加性注意力 + + Defined in :numref:`sec_attention-scoring-functions`""" + def __init__(self, key_size, query_size, num_hiddens, dropout, **kwargs): + super(AdditiveAttention, self).__init__(**kwargs) + self.W_k = nn.Linear(key_size, num_hiddens, bias_attr=False) + self.W_q = nn.Linear(query_size, num_hiddens, bias_attr=False) + self.w_v = nn.Linear(num_hiddens, 1, bias_attr=False) + self.dropout = nn.Dropout(dropout) + + def forward(self, queries, keys, values, valid_lens): + queries, keys = self.W_q(queries), self.W_k(keys) + # 在维度扩展后, + # queries的形状:(batch_size,查询的个数,1,num_hidden) + # key的形状:(batch_size,1,“键-值”对的个数,num_hiddens) + # 使用广播方式进行求和 + features = queries.unsqueeze(2) + keys.unsqueeze(1) + features = paddle.tanh(features) + # self.w_v仅有一个输出,因此从形状中移除最后那个维度。 + # scores的形状:(batch_size,查询的个数,“键-值”对的个数) + scores = self.w_v(features).squeeze(-1) + self.attention_weights = masked_softmax(scores, valid_lens) + # values的形状:(batch_size,“键-值”对的个数,值的维度) + return paddle.bmm(self.dropout(self.attention_weights), values) + +class DotProductAttention(nn.Layer): + """缩放点积注意力 + + Defined in :numref:`subsec_additive-attention`""" + def __init__(self, dropout, **kwargs): + super(DotProductAttention, self).__init__(**kwargs) + self.dropout = nn.Dropout(dropout) + + # queries的形状:(batch_size,查询的个数,d) + # keys的形状:(batch_size,“键-值”对的个数,d) + # values的形状:(batch_size,“键-值”对的个数,值的维度) + # valid_lens的形状:(batch_size,)或者(batch_size,查询的个数) + def forward(self, queries, keys, values, valid_lens=None): + d = queries.shape[-1] + # 设置transpose_b=True为了交换keys的最后两个维度 + scores = paddle.bmm(queries, keys.transpose((0,2,1))) / math.sqrt(d) + self.attention_weights = masked_softmax(scores, valid_lens) + return paddle.bmm(self.dropout(self.attention_weights), values) + +class AttentionDecoder(d2l.Decoder): + """带有注意力机制解码器的基本接口 + + Defined in :numref:`sec_seq2seq_attention`""" + def __init__(self, **kwargs): + super(AttentionDecoder, self).__init__(**kwargs) + + @property + def attention_weights(self): + raise NotImplementedError + +class MultiHeadAttention(nn.Layer): + """Defined in :numref:`sec_multihead-attention`""" + def __init__(self, key_size, query_size, value_size, num_hiddens, + num_heads, dropout, bias=False, **kwargs): + super(MultiHeadAttention, self).__init__(**kwargs) + self.num_heads = num_heads + self.attention = d2l.DotProductAttention(dropout) + self.W_q = nn.Linear(query_size, num_hiddens, bias_attr=bias) + self.W_k = nn.Linear(key_size, num_hiddens, bias_attr=bias) + self.W_v = nn.Linear(value_size, num_hiddens, bias_attr=bias) + self.W_o = nn.Linear(num_hiddens, num_hiddens, bias_attr=bias) + + def forward(self, queries, keys, values, valid_lens): + # queries,keys,values的形状: + # (batch_size,查询或者“键-值”对的个数,num_hiddens) + # valid_lens 的形状: + # (batch_size,)或(batch_size,查询的个数) + # 经过变换后,输出的queries,keys,values 的形状: + # (batch_size*num_heads,查询或者“键-值”对的个数, + # num_hiddens/num_heads) + queries = transpose_qkv(self.W_q(queries), self.num_heads) + keys = transpose_qkv(self.W_k(keys), self.num_heads) + values = transpose_qkv(self.W_v(values), self.num_heads) + if valid_lens is not None: + # 在轴0,将第一项(标量或者矢量)复制num_heads次, + # 然后如此复制第二项,然后诸如此类。 + valid_lens = paddle.repeat_interleave( + valid_lens, repeats=self.num_heads, axis=0) + + # 
output的形状:(batch_size*num_heads,查询的个数, + # num_hiddens/num_heads) + output = self.attention(queries, keys, values, valid_lens) + + # output_concat的形状:(batch_size,查询的个数,num_hiddens) + output_concat = transpose_output(output, self.num_heads) + return self.W_o(output_concat) + +def transpose_qkv(X, num_heads): + """为了多注意力头的并行计算而变换形状 + + Defined in :numref:`sec_multihead-attention`""" + # 输入X的形状:(batch_size,查询或者“键-值”对的个数,num_hiddens) + # 输出X的形状:(batch_size,查询或者“键-值”对的个数,num_heads, + # num_hiddens/num_heads) + X = X.reshape((X.shape[0], X.shape[1], num_heads, -1)) + + # 输出X的形状:(batch_size,num_heads,查询或者“键-值”对的个数, + # num_hiddens/num_heads) + X = X.transpose((0, 2, 1, 3)) + + # 最终输出的形状:(batch_size*num_heads,查询或者“键-值”对的个数, + # num_hiddens/num_heads) + return X.reshape((-1, X.shape[2], X.shape[3])) + + +def transpose_output(X, num_heads): + """逆转transpose_qkv函数的操作 + + Defined in :numref:`sec_multihead-attention`""" + X = X.reshape((-1, num_heads, X.shape[1], X.shape[2])) + X = X.transpose((0, 2, 1, 3)) + return X.reshape((X.shape[0], X.shape[1], -1)) + +class PositionalEncoding(nn.Layer): + """位置编码 + + Defined in :numref:`sec_self-attention-and-positional-encoding`""" + def __init__(self, num_hiddens, dropout, max_len=1000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(dropout) + # 创建一个足够长的P + self.P = paddle.zeros((1, max_len, num_hiddens)) + X = paddle.arange(max_len, dtype=paddle.float32).reshape( + (-1, 1)) / paddle.pow(paddle.to_tensor([10000.0]), paddle.arange( + 0, num_hiddens, 2, dtype=paddle.float32) / num_hiddens) + self.P[:, :, 0::2] = paddle.sin(X) + self.P[:, :, 1::2] = paddle.cos(X) + + def forward(self, X): + X = X + self.P[:, :X.shape[1], :] + return self.dropout(X) + +class PositionWiseFFN(nn.Layer): + """基于位置的前馈网络 + + Defined in :numref:`sec_transformer`""" + def __init__(self, ffn_num_input, ffn_num_hiddens, ffn_num_outputs, + **kwargs): + super(PositionWiseFFN, self).__init__(**kwargs) + self.dense1 = nn.Linear(ffn_num_input, ffn_num_hiddens) + self.relu = nn.ReLU() + self.dense2 = nn.Linear(ffn_num_hiddens, ffn_num_outputs) + + def forward(self, X): + return self.dense2(self.relu(self.dense1(X))) + +class AddNorm(nn.Layer): + """残差连接后进行层规范化 + + Defined in :numref:`sec_transformer`""" + def __init__(self, normalized_shape, dropout, **kwargs): + super(AddNorm, self).__init__(**kwargs) + self.dropout = nn.Dropout(dropout) + self.ln = nn.LayerNorm(normalized_shape) + + def forward(self, X, Y): + return self.ln(self.dropout(Y) + X) + +class EncoderBlock(nn.Layer): + """transformer编码器块 + + Defined in :numref:`sec_transformer`""" + def __init__(self, key_size, query_size, value_size, num_hiddens, + norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, + dropout, use_bias=False, **kwargs): + super(EncoderBlock, self).__init__(**kwargs) + self.attention = d2l.MultiHeadAttention( + key_size, query_size, value_size, num_hiddens, num_heads, dropout, + use_bias) + self.addnorm1 = AddNorm(norm_shape, dropout) + self.ffn = PositionWiseFFN( + ffn_num_input, ffn_num_hiddens, num_hiddens) + self.addnorm2 = AddNorm(norm_shape, dropout) + + def forward(self, X, valid_lens): + Y = self.addnorm1(X, self.attention(X, X, X, valid_lens)) + return self.addnorm2(Y, self.ffn(Y)) + +class TransformerEncoder(d2l.Encoder): + """transformer编码器 + + Defined in :numref:`sec_transformer`""" + def __init__(self, vocab_size, key_size, query_size, value_size, + num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, + num_heads, num_layers, dropout, use_bias=False, **kwargs): + 
super(TransformerEncoder, self).__init__(**kwargs) + self.num_hiddens = num_hiddens + self.embedding = nn.Embedding(vocab_size, num_hiddens) + self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout) + self.blks = nn.Sequential() + for i in range(num_layers): + self.blks.add_sublayer(str(i), + EncoderBlock(key_size, query_size, value_size, num_hiddens, + norm_shape, ffn_num_input, ffn_num_hiddens, + num_heads, dropout, use_bias)) + + def forward(self, X, valid_lens, *args): + # 因为位置编码值在-1和1之间, + # 因此嵌入值乘以嵌入维度的平方根进行缩放, + # 然后再与位置编码相加。 + X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens)) + self.attention_weights = [None] * len(self.blks) + for i, blk in enumerate(self.blks): + X = blk(X, valid_lens) + self.attention_weights[ + i] = blk.attention.attention.attention_weights + return X + +def annotate(text, xy, xytext): + d2l.plt.gca().annotate(text, xy=xy, xytext=xytext, + arrowprops=dict(arrowstyle='->')) + +def train_2d(trainer, steps=20, f_grad=None): + """用定制的训练机优化2D目标函数 + + Defined in :numref:`subsec_gd-learningrate`""" + # s1和s2是稍后将使用的内部状态变量 + x1, x2, s1, s2 = -5, -2, 0, 0 + results = [(x1, x2)] + for i in range(steps): + if f_grad: + x1, x2, s1, s2 = trainer(x1, x2, s1, s2, f_grad) + else: + x1, x2, s1, s2 = trainer(x1, x2, s1, s2) + results.append((x1, x2)) + print(f'epoch {i + 1}, x1: {float(x1):f}, x2: {float(x2):f}') + return results + +def show_trace_2d(f, results): + """显示优化过程中2D变量的轨迹 + + Defined in :numref:`subsec_gd-learningrate`""" + d2l.set_figsize() + d2l.plt.plot(*zip(*results), '-o', color='#ff7f0e') + x1, x2 = d2l.meshgrid(d2l.arange(-5.5, 1.0, 0.1, dtype='float32'), + d2l.arange(-3.0, 1.0, 0.1, dtype='float32')) + d2l.plt.contour(x1, x2, f(x1, x2), colors='#1f77b4') + d2l.plt.xlabel('x1') + d2l.plt.ylabel('x2') + +d2l.DATA_HUB['airfoil'] = (d2l.DATA_URL + 'airfoil_self_noise.dat', + '76e5be1548fd8222e5074cf0faae75edff8cf93f') + +def get_data_ch11(batch_size=10, n=1500): + """Defined in :numref:`sec_minibatches`""" + data = np.genfromtxt(d2l.download('airfoil'), + dtype=np.float32, delimiter='\t') + data = d2l.tensor((data - data.mean(axis=0)) / data.std(axis=0)) + data_iter = d2l.load_array((data[:n, :-1], data[:n, -1]), + batch_size, is_train=True) + return data_iter, data.shape[1]-1 + +def train_ch11(trainer_fn, states, hyperparams, data_iter, + feature_dim, num_epochs=2): + """Defined in :numref:`sec_minibatches`""" + # 初始化模型 + w = d2l.tensor(d2l.normal(mean=0.0, std=0.01, shape=(feature_dim, 1)), stop_gradient=False) + b = d2l.tensor(d2l.zeros((1,)), stop_gradient=False) + net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss + # 训练模型 + animator = d2l.Animator(xlabel='epoch', ylabel='loss', + xlim=[0, num_epochs], ylim=[0.22, 0.35]) + n, timer = 0, d2l.Timer() + for _ in range(num_epochs): + for X, y in data_iter: + l = loss(net(X), y).mean() + l.backward() + w, b = trainer_fn([w, b], states, hyperparams) + n += X.shape[0] + if n % 200 == 0: + timer.stop() + animator.add(n/X.shape[0]/len(data_iter), + (d2l.evaluate_loss(net, data_iter, loss),)) + timer.start() + print(f'loss: {animator.Y[0][-1]:.3f}, {timer.avg():.3f} sec/epoch') + return timer.cumsum(), animator.Y[0] + +def train_concise_ch11(trainer_fn, hyperparams, data_iter, num_epochs=4): + """Defined in :numref:`sec_minibatches`""" + # 初始化模型 + net = nn.Sequential(nn.Linear(5, 1)) + def init_weights(m): + if type(m) == nn.Linear: + paddle.nn.initializer.Normal(m.weight, std=0.01) + + net.apply(init_weights) + + optimizer = trainer_fn(parameters=net.parameters(), **hyperparams) + 
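+    # Unlike train_ch11 above, the optimizer and loss now come from Paddle itself: a built-in optimizer object and elementwise nn.MSELoss.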
loss = nn.MSELoss(reduction='none') + animator = d2l.Animator(xlabel='epoch', ylabel='loss', + xlim=[0, num_epochs], ylim=[0.22, 0.35]) + n, timer = 0, d2l.Timer() + for _ in range(num_epochs): + for X, y in data_iter: + optimizer.clear_grad() + out = net(X) + y = y.reshape(out.shape) + l = loss(out, y) + l.mean().backward() + optimizer.step() + n += X.shape[0] + if n % 200 == 0: + timer.stop() + # MSELoss计算平方误差时不带系数1/2 + animator.add(n/X.shape[0]/len(data_iter), + (d2l.evaluate_loss(net, data_iter, loss) / 2,)) + timer.start() + print(f'loss: {animator.Y[0][-1]:.3f}, {timer.avg():.3f} sec/epoch') + +class Benchmark: + """用于测量运行时间""" + def __init__(self, description='Done'): + """Defined in :numref:`sec_hybridize`""" + self.description = description + + def __enter__(self): + self.timer = d2l.Timer() + return self + + def __exit__(self, *args): + print(f'{self.description}: {self.timer.stop():.4f} sec') + +def split_batch(X, y, devices): + """将X和y拆分到多个设备上 + + Defined in :numref:`sec_multi_gpu`""" + assert X.shape[0] == y.shape[0] + return (paddlescatter(X, devices), + paddlescatter(y, devices)) + +def resnet18(num_classes, in_channels=1): + """稍加修改的ResNet-18模型 + + Defined in :numref:`sec_multi_gpu_concise`""" + def resnet_block(in_channels, out_channels, num_residuals, + first_block=False): + blk = [] + for i in range(num_residuals): + if i == 0 and not first_block: + blk.append(d2l.Residual(in_channels, out_channels, + use_1x1conv=True, strides=2)) + else: + blk.append(d2l.Residual(out_channels, out_channels)) + return nn.Sequential(*blk) + + # 该模型使用了更小的卷积核、步长和填充,而且删除了最大汇聚层 + net = nn.Sequential( + nn.Conv2D(in_channels, 64, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2D(64), + nn.ReLU()) + net.add_sublayer("resnet_block1", resnet_block( + 64, 64, 2, first_block=True)) + net.add_sublayer("resnet_block2", resnet_block(64, 128, 2)) + net.add_sublayer("resnet_block3", resnet_block(128, 256, 2)) + net.add_sublayer("resnet_block4", resnet_block(256, 512, 2)) + net.add_sublayer("global_avg_pool", nn.AdaptiveAvgPool2D((1, 1))) + net.add_sublayer("fc", nn.Sequential(nn.Flatten(), + nn.Linear(512, num_classes))) + return net + +def train_batch_ch13(net, X, y, loss, trainer, devices): + """Defined in :numref:`sec_image_augmentation`""" + """用多GPU进行小批量训练 + 飞桨不支持在notebook上进行多GPU训练 + Defined in :numref:`sec_image_augmentation`""" + if isinstance(X, list): + # 微调BERT中所需(稍后讨论) + X = [paddle.to_tensor(x, place=devices[0]) for x in X] + else: + X = paddle.to_tensor(X, place=devices[0]) + y = paddle.to_tensor(y, place=devices[0]) + net.train() + trainer.clear_grad() + pred = net(X) + l = loss(pred, y) + l.sum().backward() + trainer.step() + train_loss_sum = l.sum() + train_acc_sum = d2l.accuracy(pred, y) + return train_loss_sum, train_acc_sum + +def train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, + devices=d2l.try_all_gpus()): + """Defined in :numref:`sec_image_augmentation`""" + """用多GPU进行模型训练 + Defined in :numref:`sec_image_augmentation`""" + timer, num_batches = d2l.Timer(), len(train_iter) + animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1], + legend=['train loss', 'train acc', 'test acc']) + net = paddle.DataParallel(net) + for epoch in range(num_epochs): + # 4个维度:储存训练损失,训练准确度,实例数,特点数 + metric = d2l.Accumulator(4) + for i, (features, labels) in enumerate(train_iter): + timer.start() + l, acc = train_batch_ch13( + net, features, labels, loss, trainer, devices) + metric.add(l, acc, labels.shape[0], labels.numel()) + timer.stop() + if (i + 1) % 
(num_batches // 5) == 0 or i == num_batches - 1: + animator.add(epoch + (i + 1) / num_batches, + (metric[0] / metric[2], metric[1] / metric[3], + None)) + test_acc = d2l.evaluate_accuracy_gpu(net, test_iter) + animator.add(epoch + 1, (None, None, test_acc)) + print(f'loss {metric[0] / metric[2]:.3f}, train acc ' + f'{metric[1] / metric[3]:.3f}, test acc {test_acc:.3f}') + print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec on ' + f'{str(devices)}') + +d2l.DATA_HUB['hotdog'] = (d2l.DATA_URL + 'hotdog.zip', + 'fba480ffa8aa7e0febbb511d181409f899b9baa5') + +def box_corner_to_center(boxes): + """从(左上,右下)转换到(中间,宽度,高度) + + Defined in :numref:`sec_bbox`""" + x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] + cx = (x1 + x2) / 2 + cy = (y1 + y2) / 2 + w = x2 - x1 + h = y2 - y1 + boxes = d2l.stack((cx, cy, w, h), axis=-1) + return boxes + +def box_center_to_corner(boxes): + """从(中间,宽度,高度)转换到(左上,右下) + + Defined in :numref:`sec_bbox`""" + cx, cy, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] + x1 = cx - 0.5 * w + y1 = cy - 0.5 * h + x2 = cx + 0.5 * w + y2 = cy + 0.5 * h + boxes = d2l.stack((x1, y1, x2, y2), axis=-1) + return boxes + +def bbox_to_rect(bbox, color): + """Defined in :numref:`sec_bbox`""" + # 将边界框(左上x,左上y,右下x,右下y)格式转换成matplotlib格式: + # ((左上x,左上y),宽,高) + return d2l.plt.Rectangle( + xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1], + fill=False, edgecolor=color, linewidth=2) + +def multibox_prior(data, sizes, ratios): + """生成以每个像素为中心具有不同形状的锚框 + + Defined in :numref:`sec_anchor`""" + in_height, in_width = data.shape[-2:] + place, num_sizes, num_ratios = data.place, len(sizes), len(ratios) + boxes_per_pixel = (num_sizes + num_ratios - 1) + size_tensor = paddle.to_tensor(sizes, place=place) + ratio_tensor = paddle.to_tensor(ratios, place=place) + + # 为了将锚点移动到像素的中心,需要设置偏移量。 + # 因为一个像素的的高为1且宽为1,我们选择偏移我们的中心0.5 + offset_h, offset_w = 0.5, 0.5 + steps_h = 1.0 / in_height # 在y轴上缩放步长 + steps_w = 1.0 / in_width # 在x轴上缩放步长 + + # 生成锚框的所有中心点 + center_h = (paddle.arange(in_height) + offset_h) * steps_h + center_w = (paddle.arange(in_width) + offset_w) * steps_w + shift_y, shift_x = paddle.meshgrid(center_h, center_w) + shift_y, shift_x = shift_y.reshape([-1]), shift_x.reshape([-1]) + + # 生成“boxes_per_pixel”个高和宽, + # 之后用于创建锚框的四角坐标(xmin,xmax,ymin,ymax) + w = paddle.concat((size_tensor * paddle.sqrt(ratio_tensor[0]), + sizes[0] * paddle.sqrt(ratio_tensor[1:])))\ + * in_height / in_width # 处理矩形输入 + h = paddle.concat((size_tensor / paddle.sqrt(ratio_tensor[0]), + sizes[0] / paddle.sqrt(ratio_tensor[1:]))) + # 除以2来获得半高和半宽 + anchor_manipulations = paddle.tile(paddle.stack((-w, -h, w, h)).T, + (in_height * in_width, 1)) / 2 + + # 每个中心点都将有“boxes_per_pixel”个锚框, + # 所以生成含所有锚框中心的网格,重复了“boxes_per_pixel”次 + out_grid = paddle.stack([shift_x, shift_y, shift_x, shift_y], axis=1) + out_grid = paddle.tile(out_grid, repeat_times=[boxes_per_pixel]).reshape((-1, out_grid.shape[1])) + output = out_grid + anchor_manipulations + return output.unsqueeze(0) + +def show_bboxes(axes, bboxes, labels=None, colors=None): + """显示所有边界框 + + Defined in :numref:`sec_anchor`""" + def _make_list(obj, default_values=None): + if obj is None: + obj = default_values + elif not isinstance(obj, (list, tuple)): + obj = [obj] + return obj + + labels = _make_list(labels) + colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c']) + for i, bbox in enumerate(bboxes): + color = colors[i % len(colors)] + rect = d2l.bbox_to_rect(d2l.numpy(bbox), color) + axes.add_patch(rect) + if labels and 
len(labels) > i: + text_color = 'k' if color == 'w' else 'w' + axes.text(rect.xy[0], rect.xy[1], labels[i], + va='center', ha='center', fontsize=9, color=text_color, + bbox=dict(facecolor=color, lw=0)) + +def box_iou(boxes1, boxes2): + """计算两个锚框或边界框列表中成对的交并比 + + Defined in :numref:`sec_anchor`""" + box_area = lambda boxes: ((boxes[:, 2] - boxes[:, 0]) * + (boxes[:, 3] - boxes[:, 1])) + # boxes1,boxes2,areas1,areas2的形状: + # boxes1:(boxes1的数量,4), + # boxes2:(boxes2的数量,4), + # areas1:(boxes1的数量,), + # areas2:(boxes2的数量,) + areas1 = box_area(boxes1) + areas2 = box_area(boxes2) + # inter_upperlefts,inter_lowerrights,inters的形状: + # (boxes1的数量,boxes2的数量,2) + inter_upperlefts = paddle.maximum(boxes1[:, None, :2], boxes2[:, :2]) + inter_lowerrights = paddle.minimum(boxes1[:, None, 2:], boxes2[:, 2:]) + inters = (inter_lowerrights - inter_upperlefts).clip(min=0) + # inter_areasandunion_areas的形状:(boxes1的数量,boxes2的数量) + inter_areas = inters[:, :, 0] * inters[:, :, 1] + union_areas = areas1[:, None] + areas2 - inter_areas + return inter_areas / union_areas + +def assign_anchor_to_bbox(ground_truth, anchors, place, iou_threshold=0.5): + """将最接近的真实边界框分配给锚框 + + Defined in :numref:`sec_anchor`""" + num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0] + # 位于第i行和第j列的元素x_ij是锚框i和真实边界框j的IoU + jaccard = box_iou(anchors, ground_truth) + # 对于每个锚框,分配的真实边界框的张量 + anchors_bbox_map = paddle.full((num_anchors,), -1, dtype=paddle.int64) + # 根据阈值,决定是否分配真实边界框 + max_ious = paddle.max(jaccard, axis=1) + indices = paddle.argmax(jaccard, axis=1) + anc_i = paddle.nonzero(max_ious >= 0.5).reshape([-1]) + box_j = indices[max_ious >= 0.5] + anchors_bbox_map[anc_i] = box_j + col_discard = paddle.full((num_anchors,), -1) + row_discard = paddle.full((num_gt_boxes,), -1) + for _ in range(num_gt_boxes): + max_idx = paddle.argmax(jaccard) + box_idx = paddle.cast((max_idx % num_gt_boxes), dtype='int64') + anc_idx = paddle.cast((max_idx / num_gt_boxes), dtype='int64') + anchors_bbox_map[anc_idx] = box_idx + jaccard[:, box_idx] = col_discard + jaccard[anc_idx, :] = row_discard + return anchors_bbox_map + +def offset_boxes(anchors, assigned_bb, eps=1e-6): + """对锚框偏移量的转换 + + Defined in :numref:`subsec_labeling-anchor-boxes`""" + c_anc = d2l.box_corner_to_center(anchors) + c_assigned_bb = d2l.box_corner_to_center(assigned_bb) + offset_xy = 10 * (c_assigned_bb[:, :2] - c_anc[:, :2]) / c_anc[:, 2:] + offset_wh = 5 * d2l.log(eps + c_assigned_bb[:, 2:] / c_anc[:, 2:]) + offset = d2l.concat([offset_xy, offset_wh], axis=1) + return offset + +def multibox_target(anchors, labels): + """使用真实边界框标记锚框 + + Defined in :numref:`subsec_labeling-anchor-boxes`""" + batch_size, anchors = labels.shape[0], anchors.squeeze(0) + batch_offset, batch_mask, batch_class_labels = [], [], [] + place, num_anchors = anchors.place, anchors.shape[0] + for i in range(batch_size): + label = labels[i, :, :] + anchors_bbox_map = assign_anchor_to_bbox( + label[:, 1:], anchors, place) + bbox_mask = paddle.tile(paddle.to_tensor((anchors_bbox_map >= 0), dtype='float32').unsqueeze(-1), (1, 4)) + # 将类标签和分配的边界框坐标初始化为零 + class_labels = paddle.zeros(paddle.to_tensor(num_anchors), dtype=paddle.int64) + assigned_bb = paddle.zeros(paddle.to_tensor((num_anchors, 4)), dtype=paddle.float32) + # 使用真实边界框来标记锚框的类别。 + # 如果一个锚框没有被分配,我们标记其为背景(值为零) + indices_true = paddle.nonzero(anchors_bbox_map >= 0).numpy() + bb_idx = anchors_bbox_map[indices_true].numpy() + class_labels[indices_true] = label.numpy()[bb_idx, 0][:] + 1 + assigned_bb[indices_true] = label.numpy()[bb_idx, 1:] + 
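+        # The labelling above relied on NumPy indexing, so wrap class_labels and assigned_bb back into Paddle tensors before computing offsets.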
class_labels = paddle.to_tensor(class_labels) + assigned_bb = paddle.to_tensor(assigned_bb) + # 偏移量转换 + offset = offset_boxes(anchors, assigned_bb) * bbox_mask + batch_offset.append(offset.reshape([-1])) + batch_mask.append(bbox_mask.reshape([-1])) + batch_class_labels.append(class_labels) + bbox_offset = paddle.stack(batch_offset) + bbox_mask = paddle.stack(batch_mask) + class_labels = paddle.stack(batch_class_labels) + return (bbox_offset, bbox_mask, class_labels) + +def offset_inverse(anchors, offset_preds): + """根据带有预测偏移量的锚框来预测边界框 + + Defined in :numref:`subsec_labeling-anchor-boxes`""" + anc = d2l.box_corner_to_center(anchors) + pred_bbox_xy = (offset_preds[:, :2] * anc[:, 2:] / 10) + anc[:, :2] + pred_bbox_wh = d2l.exp(offset_preds[:, 2:] / 5) * anc[:, 2:] + pred_bbox = d2l.concat((pred_bbox_xy, pred_bbox_wh), axis=1) + predicted_bbox = d2l.box_center_to_corner(pred_bbox) + return predicted_bbox + +def nms(boxes, scores, iou_threshold): + """对预测边界框的置信度进行排序 + + Defined in :numref:`subsec_predicting-bounding-boxes-nms`""" + B = paddle.argsort(scores, axis=-1, descending=True) + keep = [] # 保留预测边界框的指标 + while B.numel().item() > 0: + i = B[0] + keep.append(i.item()) + if B.numel().item() == 1: break + iou = box_iou(boxes[i.numpy(), :].reshape([-1, 4]), + paddle.to_tensor(boxes.numpy()[B[1:].numpy(), :]).reshape([-1, 4])).reshape([-1]) + inds = paddle.nonzero(iou <= iou_threshold).numpy().reshape([-1]) + B = paddle.to_tensor(B.numpy()[inds + 1]) + return paddle.to_tensor(keep, place=boxes.place, dtype='int64') + +def multibox_detection(cls_probs, offset_preds, anchors, nms_threshold=0.5, + pos_threshold=0.009999999): + """使用非极大值抑制来预测边界框 + + Defined in :numref:`subsec_predicting-bounding-boxes-nms`""" + batch_size = cls_probs.shape[0] + anchors = anchors.squeeze(0) + num_classes, num_anchors = cls_probs.shape[1], cls_probs.shape[2] + out = [] + for i in range(batch_size): + cls_prob, offset_pred = cls_probs[i], offset_preds[i].reshape([-1, 4]) + conf = paddle.max(cls_prob[1:], 0) + class_id = paddle.argmax(cls_prob[1:], 0) + predicted_bb = offset_inverse(anchors, offset_pred) + keep = nms(predicted_bb, conf, nms_threshold) + + # 找到所有的non_keep索引,并将类设置为背景 + all_idx = paddle.arange(num_anchors, dtype='int64') + combined = paddle.concat((keep, all_idx)) + uniques, counts = combined.unique(return_counts=True) + non_keep = uniques[counts == 1] + all_id_sorted = paddle.concat([keep, non_keep]) + class_id[non_keep.numpy()] = -1 + class_id = class_id[all_id_sorted] + conf, predicted_bb = conf[all_id_sorted], predicted_bb[all_id_sorted] + # pos_threshold是一个用于非背景预测的阈值 + below_min_idx = (conf < pos_threshold) + class_id[below_min_idx.numpy()] = -1 + conf[below_min_idx.numpy()] = 1 - conf[below_min_idx.numpy()] + pred_info = paddle.concat((paddle.to_tensor(class_id, dtype='float32').unsqueeze(1), + paddle.to_tensor(conf, dtype='float32').unsqueeze(1), + predicted_bb), axis=1) + out.append(pred_info) + return paddle.stack(out) + +d2l.DATA_HUB['banana-detection'] = ( + d2l.DATA_URL + 'banana-detection.zip', + '5de26c8fce5ccdea9f91267273464dc968d20d72') + +def read_data_bananas(is_train=True): + """读取香蕉检测数据集中的图像和标签 + + Defined in :numref:`sec_object-detection-dataset`""" + data_dir = d2l.download_extract('banana-detection') + csv_fname = os.path.join(data_dir, 'bananas_train' if is_train + else 'bananas_val', 'label.csv') + csv_data = pd.read_csv(csv_fname) + csv_data = csv_data.set_index('img_name') + images, targets = [], [] + for img_name, target in csv_data.iterrows(): + 
paddle.vision.set_image_backend('cv2') + images.append(paddlevision.image_load(os.path.join(data_dir, 'bananas_train' if is_train else + 'bananas_val', 'images', f'{img_name}'))[..., ::-1]) + # 这里的target包含(类别,左上角x,左上角y,右下角x,右下角y) + # 其中所有图像都具有相同的香蕉类(索引为0) + targets.append(list(target)) + return images, paddle.to_tensor(targets).unsqueeze(1) / 256 + +class BananasDataset(paddle.io.Dataset): + """一个用于加载香蕉检测数据集的自定义数据集 + + Defined in :numref:`sec_object-detection-dataset`""" + def __init__(self, is_train): + self.features, self.labels = read_data_bananas(is_train) + print('read ' + str(len(self.features)) + (f' training examples' if + is_train else f' validation examples')) + + def __getitem__(self, idx): + return (paddle.to_tensor(self.features[idx], dtype='float32').transpose([2, 0, 1]), self.labels[idx]) + + def __len__(self): + return len(self.features) + +def load_data_bananas(batch_size): + """加载香蕉检测数据集 + + Defined in :numref:`sec_object-detection-dataset`""" + train_iter = paddle.io.DataLoader(BananasDataset(is_train=True), + batch_size=batch_size, return_list=True, shuffle=True) + val_iter = paddle.io.DataLoader(BananasDataset(is_train=False), + batch_size=batch_size, return_list=True) + return train_iter, val_iter + +d2l.DATA_HUB['voc2012'] = (d2l.DATA_URL + 'VOCtrainval_11-May-2012.tar', + '4e443f8a2eca6b1dac8a6c57641b67dd40621a49') + +def read_voc_images(voc_dir, is_train=True): + """Defined in :numref:`sec_semantic_segmentation` + + Defined in :numref:`sec_semantic_segmentation`""" + """读取所有VOC图像并标注 + Defined in :numref:`sec_semantic_segmentation`""" + txt_fname = os.path.join(voc_dir, 'ImageSets', 'Segmentation', + 'train.txt' if is_train else 'val.txt') + with open(txt_fname, 'r') as f: + images = f.read().split() + features, labels = [], [] + for i, fname in enumerate(images): + features.append(paddle.vision.image.image_load(os.path.join( + voc_dir, 'JPEGImages', f'{fname}.jpg'), backend='cv2')[..., ::-1].transpose( + [2, 0, 1])) + labels.append(paddle.vision.image.image_load(os.path.join( + voc_dir, 'SegmentationClass', f'{fname}.png'), backend='cv2')[..., ::-1].transpose( + [2, 0, 1])) + return features, labels + +VOC_COLORMAP = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], + [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128], + [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0], + [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128], + [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0], + [0, 64, 128]] + +VOC_CLASSES = ['background', 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', + 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'potted plant', 'sheep', 'sofa', 'train', 'tv/monitor'] + +def voc_colormap2label(): + """构建从RGB到VOC类别索引的映射 + + Defined in :numref:`sec_semantic_segmentation`""" + colormap2label = paddle.zeros([256 ** 3], dtype=paddle.int64) + for i, colormap in enumerate(VOC_COLORMAP): + colormap2label[ + (colormap[0] * 256 + colormap[1]) * 256 + colormap[2]] = i + return colormap2label + +def voc_label_indices(colormap, colormap2label): + """将VOC标签中的RGB值映射到它们的类别索引 + + Defined in :numref:`sec_semantic_segmentation`""" + colormap = colormap.transpose([1, 2, 0]).astype('int32') + idx = ((colormap[:, :, 0] * 256 + colormap[:, :, 1]) * 256 + + colormap[:, :, 2]) + return colormap2label[idx] + +def voc_rand_crop(feature, label, height, width): + """随机裁剪特征和标签图像 + + Defined in :numref:`sec_semantic_segmentation`""" + rect = paddle.vision.transforms.RandomCrop((height, width))._get_param( + 
img=feature, output_size=(height, width)) + feature = paddle.vision.transforms.crop(feature, *rect) + label = paddle.vision.transforms.crop(label, *rect) + return feature, label + +class VOCSegDataset(paddle.io.Dataset): + """Defined in :numref:`sec_semantic_segmentation`""" + """一个用于加载VOC数据集的自定义数据集 + Defined in :numref:`sec_semantic_segmentation`""" + + def __init__(self, is_train, crop_size, voc_dir): + self.transform = paddle.vision.transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + self.crop_size = crop_size + features, labels = read_voc_images(voc_dir, is_train=is_train) + self.features = [self.normalize_image(feature) + for feature in self.filter(features)] + self.labels = self.filter(labels) + self.colormap2label = voc_colormap2label() + print('read ' + str(len(self.features)) + ' examples') + + def normalize_image(self, img): + return self.transform(img.astype("float32") / 255) + + def filter(self, imgs): + return [img for img in imgs if ( + img.shape[1] >= self.crop_size[0] and + img.shape[2] >= self.crop_size[1])] + + def __getitem__(self, idx): + feature = paddle.to_tensor(self.features[idx],dtype='float32') + label = paddle.to_tensor(self.labels[idx],dtype='float32') + feature, label = voc_rand_crop(feature,label, + *self.crop_size) + return (feature, voc_label_indices(label, self.colormap2label)) + + def __len__(self): + return len(self.features) + +def load_data_voc(batch_size, crop_size): + """加载VOC语义分割数据集 + + Defined in :numref:`sec_semantic_segmentation`""" + voc_dir = d2l.download_extract('voc2012', os.path.join( + 'VOCdevkit', 'VOC2012')) + num_workers = d2l.get_dataloader_workers() + train_iter = paddle.io.DataLoader( + VOCSegDataset(True, crop_size, voc_dir), batch_size=batch_size, + shuffle=True, return_list=True, drop_last=True, num_workers=num_workers) + test_iter = paddle.io.DataLoader( + VOCSegDataset(False, crop_size, voc_dir), batch_size=batch_size, + drop_last=True, return_list=True, num_workers=num_workers) + return train_iter, test_iter + +d2l.DATA_HUB['cifar10_tiny'] = (d2l.DATA_URL + 'kaggle_cifar10_tiny.zip', + '2068874e4b9a9f0fb07ebe0ad2b29754449ccacd') + +def read_csv_labels(fname): + """读取fname来给标签字典返回一个文件名 + + Defined in :numref:`sec_kaggle_cifar10`""" + with open(fname, 'r') as f: + # 跳过文件头行(列名) + lines = f.readlines()[1:] + tokens = [l.rstrip().split(',') for l in lines] + return dict(((name, label) for name, label in tokens)) + +def copyfile(filename, target_dir): + """将文件复制到目标目录 + + Defined in :numref:`sec_kaggle_cifar10`""" + os.makedirs(target_dir, exist_ok=True) + shutil.copy(filename, target_dir) + +def reorg_train_valid(data_dir, labels, valid_ratio): + """将验证集从原始的训练集中拆分出来 + + Defined in :numref:`sec_kaggle_cifar10`""" + # 训练数据集中样本最少的类别中的样本数 + n = collections.Counter(labels.values()).most_common()[-1][1] + # 验证集中每个类别的样本数 + n_valid_per_label = max(1, math.floor(n * valid_ratio)) + label_count = {} + for train_file in os.listdir(os.path.join(data_dir, 'train')): + label = labels[train_file.split('.')[0]] + fname = os.path.join(data_dir, 'train', train_file) + copyfile(fname, os.path.join(data_dir, 'train_valid_test', + 'train_valid', label)) + if label not in label_count or label_count[label] < n_valid_per_label: + copyfile(fname, os.path.join(data_dir, 'train_valid_test', + 'valid', label)) + label_count[label] = label_count.get(label, 0) + 1 + else: + copyfile(fname, os.path.join(data_dir, 'train_valid_test', + 'train', label)) + return n_valid_per_label + +def reorg_test(data_dir): + """在预测期间整理测试集,以方便读取 + + 
Defined in :numref:`sec_kaggle_cifar10`""" + for test_file in os.listdir(os.path.join(data_dir, 'test')): + copyfile(os.path.join(data_dir, 'test', test_file), + os.path.join(data_dir, 'train_valid_test', 'test', + 'unknown')) + +d2l.DATA_HUB['dog_tiny'] = (d2l.DATA_URL + 'kaggle_dog_tiny.zip', + '0cb91d09b814ecdc07b50f31f8dcad3e81d6a86d') + +d2l.DATA_HUB['ptb'] = (d2l.DATA_URL + 'ptb.zip', + '319d85e578af0cdc590547f26231e4e31cdf1e42') + +def read_ptb(): + """将PTB数据集加载到文本行的列表中 + + Defined in :numref:`sec_word2vec_data`""" + data_dir = d2l.download_extract('ptb') + # Readthetrainingset. + with open(os.path.join(data_dir, 'ptb.train.txt')) as f: + raw_text = f.read() + return [line.split() for line in raw_text.split('\n')] + +def subsample(sentences, vocab): + """下采样高频词 + + Defined in :numref:`sec_word2vec_data`""" + # 排除未知词元'' + sentences = [[token for token in line if vocab[token] != vocab.unk] + for line in sentences] + counter = d2l.count_corpus(sentences) + num_tokens = sum(counter.values()) + + # 如果在下采样期间保留词元,则返回True + def keep(token): + return(random.uniform(0, 1) < + math.sqrt(1e-4 / counter[token] * num_tokens)) + + return ([[token for token in line if keep(token)] for line in sentences], + counter) + +def get_centers_and_contexts(corpus, max_window_size): + """返回跳元模型中的中心词和上下文词 + + Defined in :numref:`sec_word2vec_data`""" + centers, contexts = [], [] + for line in corpus: + # 要形成“中心词-上下文词”对,每个句子至少需要有2个词 + if len(line) < 2: + continue + centers += line + for i in range(len(line)): # 上下文窗口中间i + window_size = random.randint(1, max_window_size) + indices = list(range(max(0, i - window_size), + min(len(line), i + 1 + window_size))) + # 从上下文词中排除中心词 + indices.remove(i) + contexts.append([line[idx] for idx in indices]) + return centers, contexts + +class RandomGenerator: + """根据n个采样权重在{1,...,n}中随机抽取""" + def __init__(self, sampling_weights): + """Defined in :numref:`sec_word2vec_data`""" + # Exclude + self.population = list(range(1, len(sampling_weights) + 1)) + self.sampling_weights = sampling_weights + self.candidates = [] + self.i = 0 + + def draw(self): + if self.i == len(self.candidates): + # 缓存k个随机采样结果 + self.candidates = random.choices( + self.population, self.sampling_weights, k=10000) + self.i = 0 + self.i += 1 + return self.candidates[self.i - 1] + +generator = RandomGenerator([2, 3, 4]) +[generator.draw() for _ in range(10)] + +def get_negatives(all_contexts, vocab, counter, K): + """返回负采样中的噪声词 + + Defined in :numref:`sec_word2vec_data`""" + # 索引为1、2、...(索引0是词表中排除的未知标记) + sampling_weights = [counter[vocab.to_tokens(i)]**0.75 + for i in range(1, len(vocab))] + all_negatives, generator = [], RandomGenerator(sampling_weights) + for contexts in all_contexts: + negatives = [] + while len(negatives) < len(contexts) * K: + neg = generator.draw() + # 噪声词不能是上下文词 + if neg not in contexts: + negatives.append(neg) + all_negatives.append(negatives) + return all_negatives + +def batchify(data): + """返回带有负采样的跳元模型的小批量样本 + + Defined in :numref:`sec_word2vec_data`""" + max_len = max(len(c) + len(n) for _, c, n in data) + centers, contexts_negatives, masks, labels = [], [], [], [] + for center, context, negative in data: + cur_len = len(context) + len(negative) + centers += [center] + contexts_negatives += \ + [context + negative + [0] * (max_len - cur_len)] + masks += [[1] * cur_len + [0] * (max_len - cur_len)] + labels += [[1] * len(context) + [0] * (max_len - len(context))] + return (d2l.reshape(d2l.tensor(centers), (-1, 1)), d2l.tensor( + contexts_negatives), d2l.tensor(masks), 
d2l.tensor(labels)) + +def load_data_ptb(batch_size, max_window_size, num_noise_words): + """下载PTB数据集,然后将其加载到内存中 + + Defined in :numref:`subsec_word2vec-minibatch-loading`""" + num_workers = d2l.get_dataloader_workers() + sentences = read_ptb() + vocab = d2l.Vocab(sentences, min_freq=10) + subsampled, counter = subsample(sentences, vocab) + corpus = [vocab[line] for line in subsampled] + all_centers, all_contexts = get_centers_and_contexts( + corpus, max_window_size) + all_negatives = get_negatives( + all_contexts, vocab, counter, num_noise_words) + + class PTBDataset(paddle.io.Dataset): + def __init__(self, centers, contexts, negatives): + assert len(centers) == len(contexts) == len(negatives) + self.centers = centers + self.contexts = contexts + self.negatives = negatives + + def __getitem__(self, index): + return (self.centers[index], self.contexts[index], + self.negatives[index]) + + def __len__(self): + return len(self.centers) + + dataset = PTBDataset(all_centers, all_contexts, all_negatives) + + data_iter = paddle.io.DataLoader( + dataset, batch_size=batch_size, shuffle=True, return_list=True, + collate_fn=batchify, num_workers=num_workers) + return data_iter, vocab + +d2l.DATA_HUB['glove.6b.50d'] = (d2l.DATA_URL + 'glove.6B.50d.zip', + '0b8703943ccdb6eb788e6f091b8946e82231bc4d') + +d2l.DATA_HUB['glove.6b.100d'] = (d2l.DATA_URL + 'glove.6B.100d.zip', + 'cd43bfb07e44e6f27cbcc7bc9ae3d80284fdaf5a') + +d2l.DATA_HUB['glove.42b.300d'] = (d2l.DATA_URL + 'glove.42B.300d.zip', + 'b5116e234e9eb9076672cfeabf5469f3eec904fa') + +d2l.DATA_HUB['wiki.en'] = (d2l.DATA_URL + 'wiki.en.zip', + 'c1816da3821ae9f43899be655002f6c723e91b88') + +class TokenEmbedding: + """GloVe嵌入""" + def __init__(self, embedding_name): + """Defined in :numref:`sec_synonyms`""" + self.idx_to_token, self.idx_to_vec = self._load_embedding( + embedding_name) + self.unknown_idx = 0 + self.token_to_idx = {token: idx for idx, token in + enumerate(self.idx_to_token)} + + def _load_embedding(self, embedding_name): + idx_to_token, idx_to_vec = [''], [] + data_dir = d2l.download_extract(embedding_name) + # GloVe网站:https://nlp.stanford.edu/projects/glove/ + # fastText网站:https://fasttext.cc/ + with open(os.path.join(data_dir, 'vec.txt'), 'r') as f: + for line in f: + elems = line.rstrip().split(' ') + token, elems = elems[0], [float(elem) for elem in elems[1:]] + # 跳过标题信息,例如fastText中的首行 + if len(elems) > 1: + idx_to_token.append(token) + idx_to_vec.append(elems) + idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec + return idx_to_token, d2l.tensor(idx_to_vec) + + def __getitem__(self, tokens): + indices = [self.token_to_idx.get(token, self.unknown_idx) + for token in tokens] + vecs = self.idx_to_vec[d2l.tensor(indices)] + return vecs + + def __len__(self): + return len(self.idx_to_token) + +def get_tokens_and_segments(tokens_a, tokens_b=None): + """获取输入序列的词元及其片段索引 + + Defined in :numref:`sec_bert`""" + tokens = [''] + tokens_a + [''] + # 0和1分别标记片段A和B + segments = [0] * (len(tokens_a) + 2) + if tokens_b is not None: + tokens += tokens_b + [''] + segments += [1] * (len(tokens_b) + 1) + return tokens, segments + +class BERTEncoder(nn.Layer): + """BERT编码器 + + Defined in :numref:`subsec_bert_input_rep`""" + def __init__(self, vocab_size, num_hiddens, norm_shape, ffn_num_input, + ffn_num_hiddens, num_heads, num_layers, dropout, + max_len=1000, key_size=768, query_size=768, value_size=768, + **kwargs): + super(BERTEncoder, self).__init__(**kwargs) + self.token_embedding = nn.Embedding(vocab_size, num_hiddens) + self.segment_embedding = 
nn.Embedding(2, num_hiddens) + self.blks = nn.Sequential() + for i in range(num_layers): + self.blks.add_sublayer(f"{i}", d2l.EncoderBlock( + key_size, query_size, value_size, num_hiddens, norm_shape, + ffn_num_input, ffn_num_hiddens, num_heads, dropout, True)) + # 在BERT中,位置嵌入是可学习的,因此我们创建一个足够长的位置嵌入参数 + x = paddle.randn([1, max_len, num_hiddens]) + self.pos_embedding = paddle.create_parameter(shape=x.shape, dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + def forward(self, tokens, segments, valid_lens): + # 在以下代码段中,X的形状保持不变:(批量大小,最大序列长度,num_hiddens) + X = self.token_embedding(tokens) + self.segment_embedding(segments) + X = X + self.pos_embedding[:, :X.shape[1], :] + for blk in self.blks: + X = blk(X, valid_lens) + return X + +class MaskLM(nn.Layer): + """BERT的掩蔽语言模型任务 + + Defined in :numref:`subsec_bert_input_rep`""" + def __init__(self, vocab_size, num_hiddens, num_inputs=768, **kwargs): + super(MaskLM, self).__init__(**kwargs) + self.mlp = nn.Sequential(nn.Linear(num_inputs, num_hiddens), + nn.ReLU(), + nn.LayerNorm(num_hiddens), + nn.Linear(num_hiddens, vocab_size)) + + def forward(self, X, pred_positions): + num_pred_positions = pred_positions.shape[1] + pred_positions = pred_positions.reshape([-1]) + batch_size = X.shape[0] + batch_idx = paddle.arange(0, batch_size) + # 假设batch_size=2,num_pred_positions=3 + # 那么batch_idx是np.array([0,0,0,1,1]) + batch_idx = paddle.repeat_interleave(batch_idx, num_pred_positions) + masked_X = X[batch_idx, pred_positions] + masked_X = masked_X.reshape((batch_size, num_pred_positions, -1)) + mlm_Y_hat = self.mlp(masked_X) + return mlm_Y_hat + +class NextSentencePred(nn.Layer): + """BERT的下一句预测任务 + + Defined in :numref:`subsec_mlm`""" + def __init__(self, num_inputs, **kwargs): + super(NextSentencePred, self).__init__(**kwargs) + self.output = nn.Linear(num_inputs, 2) + + def forward(self, X): + # X的形状:(batchsize,num_hiddens) + return self.output(X) + +class BERTModel(nn.Layer): + """BERT模型 + + Defined in :numref:`subsec_nsp`""" + def __init__(self, vocab_size, num_hiddens, norm_shape, ffn_num_input, + ffn_num_hiddens, num_heads, num_layers, dropout, + max_len=1000, key_size=768, query_size=768, value_size=768, + hid_in_features=768, mlm_in_features=768, + nsp_in_features=768): + super(BERTModel, self).__init__() + self.encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape, + ffn_num_input, ffn_num_hiddens, num_heads, num_layers, + dropout, max_len=max_len, key_size=key_size, + query_size=query_size, value_size=value_size) + self.hidden = nn.Sequential(nn.Linear(hid_in_features, num_hiddens), + nn.Tanh()) + self.mlm = MaskLM(vocab_size, num_hiddens, mlm_in_features) + self.nsp = NextSentencePred(nsp_in_features) + + def forward(self, tokens, segments, valid_lens=None, + pred_positions=None): + encoded_X = self.encoder(tokens, segments, valid_lens) + if pred_positions is not None: + mlm_Y_hat = self.mlm(encoded_X, pred_positions) + else: + mlm_Y_hat = None + # 用于下一句预测的多层感知机分类器的隐藏层,0是“”标记的索引 + nsp_Y_hat = self.nsp(self.hidden(encoded_X[:, 0, :])) + return encoded_X, mlm_Y_hat, nsp_Y_hat + +d2l.DATA_HUB['wikitext-2'] = ( + 'https://s3.amazonaws.com/research.metamind.io/wikitext/' + 'wikitext-2-v1.zip', '3c914d17d80b1459be871a5039ac23e752a53cbe') + +def _read_wiki(data_dir): + """Defined in :numref:`sec_bert-dataset`""" + file_name = os.path.join(data_dir, 'wiki.train.tokens') + with open(file_name, 'r') as f: + lines = f.readlines() + # 大写字母转换为小写字母 + paragraphs = [line.strip().lower().split(' . 
')
+                  for line in lines if len(line.split(' . ')) >= 2]
+    random.shuffle(paragraphs)
+    return paragraphs
+
+def _get_next_sentence(sentence, next_sentence, paragraphs):
+    """Defined in :numref:`sec_bert-dataset`"""
+    if random.random() < 0.5:
+        is_next = True
+    else:
+        # paragraphs是三重列表的嵌套
+        next_sentence = random.choice(random.choice(paragraphs))
+        is_next = False
+    return sentence, next_sentence, is_next
+
+def _get_nsp_data_from_paragraph(paragraph, paragraphs, vocab, max_len):
+    """Defined in :numref:`sec_bert-dataset`"""
+    nsp_data_from_paragraph = []
+    for i in range(len(paragraph) - 1):
+        tokens_a, tokens_b, is_next = _get_next_sentence(
+            paragraph[i], paragraph[i + 1], paragraphs)
+        # 考虑1个'<cls>'词元和2个'<sep>'词元
+        if len(tokens_a) + len(tokens_b) + 3 > max_len:
+            continue
+        tokens, segments = d2l.get_tokens_and_segments(tokens_a, tokens_b)
+        nsp_data_from_paragraph.append((tokens, segments, is_next))
+    return nsp_data_from_paragraph
+
+def _replace_mlm_tokens(tokens, candidate_pred_positions, num_mlm_preds,
+                        vocab):
+    """Defined in :numref:`sec_bert-dataset`"""
+    # 为遮蔽语言模型的输入创建新的词元副本,其中输入可能包含替换的“<mask>”或随机词元
+    mlm_input_tokens = [token for token in tokens]
+    pred_positions_and_labels = []
+    # 打乱后用于在遮蔽语言模型任务中获取15%的随机词元进行预测
+    random.shuffle(candidate_pred_positions)
+    for mlm_pred_position in candidate_pred_positions:
+        if len(pred_positions_and_labels) >= num_mlm_preds:
+            break
+        masked_token = None
+        # 80%的时间:将词替换为“<mask>”词元
+        if random.random() < 0.8:
+            masked_token = '<mask>'
+        else:
+            # 10%的时间:保持词不变
+            if random.random() < 0.5:
+                masked_token = tokens[mlm_pred_position]
+            # 10%的时间:用随机词替换该词
+            else:
+                masked_token = random.choice(vocab.idx_to_token)
+        mlm_input_tokens[mlm_pred_position] = masked_token
+        pred_positions_and_labels.append(
+            (mlm_pred_position, tokens[mlm_pred_position]))
+    return mlm_input_tokens, pred_positions_and_labels
+
+def _get_mlm_data_from_tokens(tokens, vocab):
+    """Defined in :numref:`subsec_prepare_mlm_data`"""
+    candidate_pred_positions = []
+    # tokens是一个字符串列表
+    for i, token in enumerate(tokens):
+        # 在遮蔽语言模型任务中不会预测特殊词元
+        if token in ['<cls>', '<sep>']:
+            continue
+        candidate_pred_positions.append(i)
+    # 遮蔽语言模型任务中预测15%的随机词元
+    num_mlm_preds = max(1, round(len(tokens) * 0.15))
+    mlm_input_tokens, pred_positions_and_labels = _replace_mlm_tokens(
+        tokens, candidate_pred_positions, num_mlm_preds, vocab)
+    pred_positions_and_labels = sorted(pred_positions_and_labels,
+                                       key=lambda x: x[0])
+    pred_positions = [v[0] for v in pred_positions_and_labels]
+    mlm_pred_labels = [v[1] for v in pred_positions_and_labels]
+    return vocab[mlm_input_tokens], pred_positions, vocab[mlm_pred_labels]
+
+def _pad_bert_inputs(examples, max_len, vocab):
+    """Defined in :numref:`subsec_prepare_mlm_data`"""
+    max_num_mlm_preds = round(max_len * 0.15)
+    all_token_ids, all_segments, valid_lens, = [], [], []
+    all_pred_positions, all_mlm_weights, all_mlm_labels = [], [], []
+    nsp_labels = []
+    for (token_ids, pred_positions, mlm_pred_label_ids, segments,
+         is_next) in examples:
+        all_token_ids.append(paddle.to_tensor(token_ids + [vocab['<pad>']] * (
+            max_len - len(token_ids)), dtype=paddle.int64))
+        all_segments.append(paddle.to_tensor(segments + [0] * (
+            max_len - len(segments)), dtype=paddle.int64))
+        # valid_lens不包括'<pad>'的计数
+        valid_lens.append(paddle.to_tensor(len(token_ids), dtype=paddle.float32))
+        all_pred_positions.append(paddle.to_tensor(pred_positions + [0] * (
+            max_num_mlm_preds - len(pred_positions)), dtype=paddle.int64))
+        # 填充词元的预测将通过乘以0权重在损失中过滤掉
+        all_mlm_weights.append(
+            paddle.to_tensor([1.0] * 
len(mlm_pred_label_ids) + [0.0] * ( + max_num_mlm_preds - len(pred_positions)), + dtype=paddle.float32)) + all_mlm_labels.append(paddle.to_tensor(mlm_pred_label_ids + [0] * ( + max_num_mlm_preds - len(mlm_pred_label_ids)), dtype=paddle.int64)) + nsp_labels.append(paddle.to_tensor(is_next, dtype=paddle.int64)) + return (all_token_ids, all_segments, valid_lens, all_pred_positions, + all_mlm_weights, all_mlm_labels, nsp_labels) + +class _WikiTextDataset(paddle.io.Dataset): + """Defined in :numref:`subsec_prepare_mlm_data`""" + def __init__(self, paragraphs, max_len): + # 输入paragraphs[i]是代表段落的句子字符串列表; + # 而输出paragraphs[i]是代表段落的句子列表,其中每个句子都是词元列表 + paragraphs = [d2l.tokenize( + paragraph, token='word') for paragraph in paragraphs] + sentences = [sentence for paragraph in paragraphs + for sentence in paragraph] + self.vocab = d2l.Vocab(sentences, min_freq=5, reserved_tokens=[ + '', '', '', '']) + # 获取下一句子预测任务的数据 + examples = [] + for paragraph in paragraphs: + examples.extend(_get_nsp_data_from_paragraph( + paragraph, paragraphs, self.vocab, max_len)) + # 获取遮蔽语言模型任务的数据 + examples = [(_get_mlm_data_from_tokens(tokens, self.vocab) + + (segments, is_next)) + for tokens, segments, is_next in examples] + # 填充输入 + (self.all_token_ids, self.all_segments, self.valid_lens, + self.all_pred_positions, self.all_mlm_weights, + self.all_mlm_labels, self.nsp_labels) = _pad_bert_inputs( + examples, max_len, self.vocab) + + def __getitem__(self, idx): + return (self.all_token_ids[idx], self.all_segments[idx], + self.valid_lens[idx], self.all_pred_positions[idx], + self.all_mlm_weights[idx], self.all_mlm_labels[idx], + self.nsp_labels[idx]) + + def __len__(self): + return len(self.all_token_ids) + +def load_data_wiki(batch_size, max_len): + """加载WikiText-2数据集 + + Defined in :numref:`subsec_prepare_mlm_data`""" + num_workers = d2l.get_dataloader_workers() + data_dir = d2l.download_extract('wikitext-2', 'wikitext-2') + paragraphs = _read_wiki(data_dir) + train_set = _WikiTextDataset(paragraphs, max_len) + train_iter = paddle.io.DataLoader(dataset=train_set, batch_size=batch_size, return_list=True, + shuffle=True, num_workers=num_workers) + return train_iter, train_set.vocab + +def _get_batch_loss_bert(net, loss, vocab_size, tokens_X, + segments_X, valid_lens_x, + pred_positions_X, mlm_weights_X, + mlm_Y, nsp_y): + """Defined in :numref:`sec_bert-pretraining`""" + # 前向传播 + _, mlm_Y_hat, nsp_Y_hat = net(tokens_X, segments_X, + valid_lens_x.reshape([-1]), + pred_positions_X) + # 计算遮蔽语言模型损失 + mlm_l = loss(mlm_Y_hat.reshape([-1, vocab_size]), mlm_Y.reshape([-1])) *\ + mlm_weights_X.reshape([-1, 1]) + mlm_l = mlm_l.sum() / (mlm_weights_X.sum() + 1e-8) + # 计算下一句子预测任务的损失 + nsp_l = loss(nsp_Y_hat, nsp_y) + l = mlm_l + nsp_l + return mlm_l, nsp_l, l + +d2l.DATA_HUB['aclImdb'] = ( + 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz', + '01ada507287d82875905620988597833ad4e0903') + +def read_imdb(data_dir, is_train): + """读取IMDb评论数据集文本序列和标签 + + Defined in :numref:`sec_sentiment`""" + data, labels = [], [] + for label in ('pos', 'neg'): + folder_name = os.path.join(data_dir, 'train' if is_train else 'test', + label) + for file in os.listdir(folder_name): + with open(os.path.join(folder_name, file), 'rb') as f: + review = f.read().decode('utf-8').replace('\n', '') + data.append(review) + labels.append(1 if label == 'pos' else 0) + return data, labels + +def load_data_imdb(batch_size, num_steps=500): + """返回数据迭代器和IMDb评论数据集的词表 + + Defined in :numref:`sec_sentiment`""" + data_dir = d2l.download_extract('aclImdb', 
'aclImdb') + train_data = read_imdb(data_dir, True) + test_data = read_imdb(data_dir, False) + train_tokens = d2l.tokenize(train_data[0], token='word') + test_tokens = d2l.tokenize(test_data[0], token='word') + vocab = d2l.Vocab(train_tokens, min_freq=5) + train_features = d2l.tensor([d2l.truncate_pad( + vocab[line], num_steps, vocab['']) for line in train_tokens]) + test_features = d2l.tensor([d2l.truncate_pad( + vocab[line], num_steps, vocab['']) for line in test_tokens]) + train_iter = d2l.load_array((train_features, d2l.tensor(train_data[1])), + batch_size) + test_iter = d2l.load_array((test_features, d2l.tensor(test_data[1])), + batch_size, + is_train=False) + return train_iter, test_iter, vocab + +def predict_sentiment(net, vocab, sequence): + """预测文本序列的情感 + + Defined in :numref:`sec_sentiment_rnn`""" + sequence = paddle.to_tensor(vocab[sequence.split()], place=d2l.try_gpu()) + label = paddle.argmax(net(sequence.reshape((1, -1))), axis=1) + return 'positive' if label == 1 else 'negative' + +d2l.DATA_HUB['SNLI'] = ( + 'https://nlp.stanford.edu/projects/snli/snli_1.0.zip', + '9fcde07509c7e87ec61c640c1b2753d9041758e4') + +def read_snli(data_dir, is_train): + """将SNLI数据集解析为前提、假设和标签 + + Defined in :numref:`sec_natural-language-inference-and-dataset`""" + def extract_text(s): + # 删除我们不会使用的信息 + s = re.sub('\\(', '', s) + s = re.sub('\\)', '', s) + # 用一个空格替换两个或多个连续的空格 + s = re.sub('\\s{2,}', ' ', s) + return s.strip() + label_set = {'entailment': 0, 'contradiction': 1, 'neutral': 2} + file_name = os.path.join(data_dir, 'snli_1.0_train.txt' + if is_train else 'snli_1.0_test.txt') + with open(file_name, 'r') as f: + rows = [row.split('\t') for row in f.readlines()[1:]] + premises = [extract_text(row[1]) for row in rows if row[0] in label_set] + hypotheses = [extract_text(row[2]) for row in rows if row[0] \ + in label_set] + labels = [label_set[row[0]] for row in rows if row[0] in label_set] + return premises, hypotheses, labels + +class SNLIDataset(paddle.io.Dataset): + """用于加载SNLI数据集的自定义数据集 + + Defined in :numref:`sec_natural-language-inference-and-dataset`""" + def __init__(self, dataset, num_steps, vocab=None): + self.num_steps = num_steps + all_premise_tokens = d2l.tokenize(dataset[0]) + all_hypothesis_tokens = d2l.tokenize(dataset[1]) + if vocab is None: + self.vocab = d2l.Vocab(all_premise_tokens + \ + all_hypothesis_tokens, min_freq=5, reserved_tokens=['']) + else: + self.vocab = vocab + self.premises = self._pad(all_premise_tokens) + self.hypotheses = self._pad(all_hypothesis_tokens) + self.labels = paddle.to_tensor(dataset[2]) + print('read ' + str(len(self.premises)) + ' examples') + + def _pad(self, lines): + return paddle.to_tensor([d2l.truncate_pad( + self.vocab[line], self.num_steps, self.vocab['']) + for line in lines]) + + def __getitem__(self, idx): + return (self.premises[idx], self.hypotheses[idx]), self.labels[idx] + + def __len__(self): + return len(self.premises) + +def load_data_snli(batch_size, num_steps=50): + """下载SNLI数据集并返回数据迭代器和词表 + + Defined in :numref:`sec_natural-language-inference-and-dataset`""" + num_workers = d2l.get_dataloader_workers() + data_dir = d2l.download_extract('SNLI') + train_data = read_snli(data_dir, True) + test_data = read_snli(data_dir, False) + train_set = SNLIDataset(train_data, num_steps) + test_set = SNLIDataset(test_data, num_steps, train_set.vocab) + train_iter = paddle.io.DataLoader(train_set,batch_size=batch_size, + shuffle=True, + num_workers=num_workers, + return_list=True) + + test_iter = paddle.io.DataLoader(test_set, 
+                                     batch_size=batch_size,
+                                     shuffle=False,
+                                     num_workers=num_workers,
+                                     return_list=True)
+    return train_iter, test_iter, train_set.vocab
+
+def predict_snli(net, vocab, premise, hypothesis):
+    """预测前提和假设之间的逻辑关系
+
+    Defined in :numref:`sec_natural-language-inference-attention`"""
+    net.eval()
+    premise = paddle.to_tensor(vocab[premise], place=d2l.try_gpu())
+    hypothesis = paddle.to_tensor(vocab[hypothesis], place=d2l.try_gpu())
+    label = paddle.argmax(net([premise.reshape((1, -1)),
+                           hypothesis.reshape((1, -1))]), axis=1)
+
+    return 'entailment' if label == 0 else 'contradiction' if label == 1 \
+            else 'neutral'
+
+
+# Alias defined in config.ini
+nn_Module = nn.Layer
+
+ones = paddle.ones
+zeros = paddle.zeros
+tensor = paddle.to_tensor
+arange = paddle.arange
+meshgrid = paddle.meshgrid
+sin = paddle.sin
+sinh = paddle.sinh
+cos = paddle.cos
+cosh = paddle.cosh
+tanh = paddle.tanh
+linspace = paddle.linspace
+exp = paddle.exp
+log = paddle.log
+normal = paddle.normal
+rand = paddle.rand
+randn = paddle.randn
+matmul = paddle.matmul
+int32 = paddle.int32
+float32 = paddle.float32
+concat = paddle.concat
+stack = paddle.stack
+abs = paddle.abs
+eye = paddle.eye
+numpy = lambda x, *args, **kwargs: x.detach().numpy(*args, **kwargs)
+size = lambda x, *args, **kwargs: x.numel(*args, **kwargs)
+reshape = lambda x, *args, **kwargs: x.reshape(*args, **kwargs)
+to = lambda x, *args, **kwargs: x.to(*args, **kwargs)
+reduce_sum = lambda x, *args, **kwargs: x.sum(*args, **kwargs)
+argmax = lambda x, *args, **kwargs: x.argmax(*args, **kwargs)
+astype = lambda x, *args, **kwargs: x.astype(*args, **kwargs)
+transpose = lambda x, *args, **kwargs: x.t(*args, **kwargs)
+reduce_mean = lambda x, *args, **kwargs: x.mean(*args, **kwargs)
+
diff --git a/static/build.yml b/static/build.yml
index 722641de86..b5fc3af2bb 100644
--- a/static/build.yml
+++ b/static/build.yml
@@ -1,5 +1,5 @@
 dependencies:
-  - python=3.8
+  - python=3.9
   - pip
   - pip:
     - ..  # d2l
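A minimal sketch of how the new PaddlePaddle backend introduced by this release might be exercised, assuming `d2l==0.17.6` and `paddlepaddle` are installed; the feature-map shape and the `sizes`/`ratios` values below are illustrative only:

```python
# Illustrative smoke test for the new paddle backend (not part of the patch).
# Assumes: pip install d2l==0.17.6 paddlepaddle
from d2l import paddle as d2l

# The saved tensor aliases mirror the MXNet/PyTorch/TensorFlow backends.
X = d2l.reshape(d2l.arange(12, dtype=d2l.float32), (3, 4))
print(d2l.reduce_sum(X))  # forwards to paddle.Tensor.sum

# Anchor-box helper from d2l/paddle.py: with 3 sizes and 3 ratios,
# each pixel of the 2x3 feature map gets 3 + 3 - 1 = 5 anchors.
fmap = d2l.zeros((1, 3, 2, 3))
anchors = d2l.multibox_prior(fmap, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
print(anchors.shape)  # [1, 30, 4]
```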