Another important aspect of simplicity is smoothness: a function should not change its output drastically in response to small perturbations of its input. In machine learning, and in deep neural networks in particular, encouraging smoothness is crucial for improving generalization and reducing overfitting. A smooth model not only fits the training data well but also keeps its predictions stable on unseen data.
Dropout, a technique for training neural networks, is built on this idea. During forward propagation in training, dropout computes the output of each internal layer while deliberately injecting random noise into it. Concretely, each neuron in the layer is randomly "dropped" (its activation is temporarily set to zero) with a given probability p, and the activations that survive are scaled by 1/(1 - p) so that the expected output of the layer is unchanged. The technique is called "dropout" because, intuitively, it looks as if some neurons and their connections were switched off at every iteration of training. This randomness breaks up the fixed patterns and path dependence (co-adaptation between neurons) that can otherwise form during training and pushes the network to explore more possibilities, which improves the generalization and robustness of the model. Because dropout is applied throughout training, the network gradually learns to make accurate predictions even when part of the information is missing, which matters greatly for how the model performs in practical applications. At evaluation time dropout is switched off and the full network is used.
import torch
import torchvision
from torch.utils import data
from torchvision import transforms
import matplotlib.pyplot as plt
from torch import nn
def get_dataloader_workers():
    """Return the number of worker processes each DataLoader is given."""
    worker_count = 6
    return worker_count
def load_data_fashion_mnist(batch_size, resize=None):
    """Build the Fashion-MNIST training and test data loaders.

    The dataset is stored under ``./data`` and downloaded on demand.

    Args:
        batch_size: Mini-batch size used by both loaders.
        resize: Optional size handed to ``transforms.Resize``; when falsy
            no resize step is added.

    Returns:
        A ``(train_loader, test_loader)`` tuple. Only the training loader
        shuffles its samples.
    """
    # Resize, when requested, is placed ahead of the tensor conversion.
    steps = [transforms.Resize(resize)] if resize else []
    steps.append(transforms.ToTensor())
    pipeline = transforms.Compose(steps)

    train_set = torchvision.datasets.FashionMNIST(
        root="./data", train=True, transform=pipeline, download=True)
    test_set = torchvision.datasets.FashionMNIST(
        root="./data", train=False, transform=pipeline, download=True)

    train_loader = data.DataLoader(
        train_set, batch_size, shuffle=True,
        num_workers=get_dataloader_workers())
    test_loader = data.DataLoader(
        test_set, batch_size, shuffle=False,
        num_workers=get_dataloader_workers())
    return train_loader, test_loader
def accurancy(y_hat, y):
    """Count how many entries of ``y_hat`` agree with the labels ``y``.

    When ``y_hat`` has at least two dimensions and more than one column it
    is first reduced to class indices with an argmax over axis 1; otherwise
    it is compared to ``y`` as given.

    Returns:
        The number of matching entries (not a ratio), as a Python float.
    """
    predictions = y_hat
    if len(y_hat.shape) >= 2 and y_hat.shape[1] >= 2:
        predictions = y_hat.argmax(axis=1)
    # Cast to the label dtype so the element-wise comparison is like-for-like.
    matches = predictions.type(y.dtype) == y
    return float(matches.type(y.dtype).sum())
class Accumulator:
    """Keep running sums of ``n`` quantities that are updated together."""

    def __init__(self, n) -> None:
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each positional value to the running total in the same slot.

        Totals and arguments are paired with ``zip``, so the stored list
        ends up as long as the shorter of the two.
        """
        updated = []
        for total, increment in zip(self.data, args):
            updated.append(total + float(increment))
        self.data = updated

    def reset(self):
        """Set every running total back to zero."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        return self.data[idx]
def evaluate_accurancy(net, data_iter):
    """Return the fraction of samples in ``data_iter`` that ``net`` gets right.

    A ``torch.nn.Module`` is switched to evaluation mode first and is left
    in that mode afterwards. Predictions are computed with gradient
    tracking disabled.

    Args:
        net: Model called as ``net(X)``.
        data_iter: Iterable yielding ``(X, y)`` batches.

    Returns:
        Correct predictions divided by the number of label elements seen.
    """
    if isinstance(net, torch.nn.Module):
        net.eval()
    # Slot 0: correct predictions, slot 1: label elements seen.
    totals = Accumulator(2)
    with torch.no_grad():
        for features, labels in data_iter:
            num_right = accurancy(net(features), labels)
            totals.add(num_right, labels.numel())
    num_correct, num_seen = totals[0], totals[1]
    return num_correct / num_seen
def train_epoch_ch3(net, train_iter, loss, updater):
    """Train ``net`` for one pass over ``train_iter``.

    Args:
        net: Model called as ``net(X)``; put into training mode when it is
            a ``torch.nn.Module``.
        train_iter: Iterable yielding ``(X, y)`` mini-batches.
        loss: Callable invoked as ``loss(y_hat, y)``. The result is reduced
            here with ``.mean()`` / ``.sum()``, so it is presumably a
            per-sample loss (e.g. ``reduction='none'``).
        updater: Either a ``torch.optim.Optimizer`` or a callable that
            takes the batch size and applies the parameter update itself.

    Returns:
        A ``(train_loss, train_acc)`` tuple: the summed loss and the number
        of correct predictions, each divided by the number of samples seen.
    """
    if isinstance(net, torch.nn.Module):
        net.train()
    # Running totals: summed loss, correct predictions, samples seen.
    metric = Accumulator(3)
    for X, y in train_iter:
        y_hat = net(X)
        batch_loss = loss(y_hat, y)
        if isinstance(updater, torch.optim.Optimizer):
            # Built-in optimizer: step on the gradient of the mean loss.
            updater.zero_grad()
            batch_loss.mean().backward()
            updater.step()
        else:
            # Custom updater: it receives the batch size alongside the
            # gradient of the summed loss.
            batch_loss.sum().backward()
            updater(X.shape[0])
        metric.add(float(batch_loss.sum()), accurancy(y_hat, y), y.numel())
    # Both ratios use the sample count (slot 2) as the denominator. The
    # loss was previously divided by the number of correct predictions
    # (slot 1), which overstated it whenever accuracy was below 100%.
    return metric[0] / metric[2], metric[1] / metric[2]
def set_axes(axes, xlable, ylable, xlim, ylim, xscale, yscale, legend):
    """Apply labels, scales, limits, an optional legend and a grid to ``axes``.

    The legend is only drawn when ``legend`` is truthy; the grid call is
    always made.
    """
    # Applied in this fixed order: labels, then scales, then limits.
    settings = (
        ("set_xlabel", xlable),
        ("set_ylabel", ylable),
        ("set_xscale", xscale),
        ("set_yscale", yscale),
        ("set_xlim", xlim),
        ("set_ylim", ylim),
    )
    for setter_name, value in settings:
        getattr(axes, setter_name)(value)
    if legend:
        axes.legend(legend)
    axes.grid()
class Animator:
    """Line plot that is cleared and redrawn each time points are added."""

    def __init__(self, xlable=None, ylable=None, legend=None, xlim=None, ylim=None,
                 xscale='linear', yscale='linear', fmts=('-', 'm--', 'g-.', 'r:'),
                 nrows=1, ncols=1, figsize=(3.5, 2.5)):
        legend = [] if legend is None else legend
        self.fig, self.axes = plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            # Wrap the single result so that self.axes[0] can be used below.
            self.axes = [self.axes]

        def configure():
            # Re-applied after every redraw because add() clears the axes.
            return set_axes(self.axes[0], xlable, ylable, xlim, ylim,
                            xscale, yscale, legend)

        self.config_axes = configure
        self.X = None
        self.Y = None
        self.fmts = fmts

    def add(self, x, y):
        """Append one point per curve and redraw every curve.

        Args:
            x: A single x value shared by all curves, or one value per curve.
            y: A single y value, or one value per curve.

        A pair is skipped when either of its values is ``None``. Curves are
        paired with ``self.fmts`` via ``zip``, so no more curves are drawn
        than there are format strings.
        """
        ys = y if hasattr(y, "__len__") else [y]
        count = len(ys)
        xs = x if hasattr(x, "__len__") else [x] * count
        if not self.X:
            self.X = [[] for _ in range(count)]
        if not self.Y:
            self.Y = [[] for _ in range(count)]
        for curve, (x_val, y_val) in enumerate(zip(xs, ys)):
            if x_val is None or y_val is None:
                continue
            self.X[curve].append(x_val)
            self.Y[curve].append(y_val)
        self.axes[0].cla()
        for curve_x, curve_y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(curve_x, curve_y, fmt)
        self.config_axes()
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):
    """Train ``net`` for ``num_epochs`` epochs and plot the metrics per epoch.

    After each epoch the training loss, training accuracy and test accuracy
    are added to an ``Animator`` plot. Once training is done, the metrics of
    the last epoch are checked with assertions.

    Args:
        net: Model to train.
        train_iter: Training batches, passed to ``train_epoch_ch3``.
        test_iter: Test batches, passed to ``evaluate_accurancy``.
        loss: Loss callable, passed to ``train_epoch_ch3``.
        num_epochs: Number of epochs to run.
        updater: Optimizer or update callable, passed to ``train_epoch_ch3``.

    Raises:
        AssertionError: If the final training loss is not below 0.5, or the
            final training or test accuracy is not strictly between 0.7
            and 1.
    """
    animator = Animator(
        xlable='epoch',
        xlim=[1, num_epochs],
        ylim=[0.3, 0.9],
        legend=['train loss', "train acc", "test acc"],
    )
    for epoch_number in range(1, num_epochs + 1):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accurancy(net, test_iter)
        animator.add(epoch_number, train_metrics + (test_acc,))
    # Sanity-check the metrics of the final epoch.
    train_loss, train_acc = train_metrics
    assert train_loss < 0.5, train_loss
    assert 0.7 < train_acc < 1, train_acc
    assert 0.7 < test_acc < 1, test_acc
# Drop probabilities of the first and second dropout layers.
dropout1, dropout2 = 0.2, 0.5
# Training hyperparameters: epochs, SGD learning rate, mini-batch size.
num_epochs, lr, batch_size = 10, 0.5, 256


def init_weights(m):
    """Initialise the weights of linear layers from a normal distribution.

    Meant to be used with ``net.apply``. Modules that are not ``nn.Linear``
    are left untouched, and only ``weight`` is re-initialised (std 0.01);
    the bias is not modified here.
    """
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, std=0.01)


# The data loaders start worker processes (num_workers > 0). On platforms
# that spawn workers (e.g. Windows, macOS) each worker re-imports this
# module, so loading and training must not run at import time.
if __name__ == "__main__":
    # reduction='none' keeps one loss value per sample; the training loop
    # reduces it with .mean() / .sum() itself.
    loss = nn.CrossEntropyLoss(reduction='none')
    train_iter, test_iter = load_data_fashion_mnist(batch_size)

    # Two hidden layers of 256 units, each followed by ReLU and dropout.
    net = nn.Sequential(
        nn.Flatten(),
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Dropout(dropout1),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Dropout(dropout2),
        nn.Linear(256, 10),
    )
    net.apply(init_weights)

    trainer = torch.optim.SGD(net.parameters(), lr=lr)
    train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
    plt.show()