05 - Deep Learning Computation
5.1 Layers and Blocks
Layer:
(1) takes a set of inputs; (2) produces the corresponding outputs; (3) is described by a set of adjustable parameters.
Block:
(1) a component that is larger than a single layer but smaller than the whole model;
(2) can describe a single layer, a component made up of multiple layers, or the entire model itself; blocks can also be combined into larger components;
(3) a block is represented by a class; any subclass of it must define a forward propagation function that transforms its inputs into outputs, and must store any required parameters; to compute gradients, the block must also have a backpropagation function (in practice, autograd supplies this automatically).
Code for a multilayer perceptron:
import torch
from torch import nn

net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
X = torch.rand(2, 20)
print(net(X))  # shorthand for net.__call__(X)
tensor([[-0.0598, -0.2316, -0.3619, 0.1329, -0.0471, -0.1033, -0.0660, 0.1027, 0.0881, -0.0803],
[ 0.0419, -0.2622, -0.2755, 0.0526, -0.0111, -0.1644, -0.0917, 0.0586, -0.0880, -0.0593]], grad_fn=)
Code for a custom block:
from torch.nn import functional as F

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    def forward(self, X):
        # functional (stateless) version of ReLU, from torch.nn.functional
        return self.out(F.relu(self.hidden(X)))

net = MLP()
print(net(X))
tensor([[-0.0404, -0.1485, -0.2479, 0.0589, 0.1766, 0.1838, 0.1994, 0.0569, 0.0653, 0.2753],
[-0.1073, -0.0745, -0.2666, 0.0762, 0.1180, 0.0527, 0.0867, 0.0300, -0.0015, 0.2650]], grad_fn=)
Sequential block
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            # _modules is an ordered dict; registering each module here lets
            # PyTorch track its parameters
            self._modules[str(idx)] = module

    def forward(self, X):
        for block in self._modules.values():
            X = block(X)
        return X

net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
print(net(X))
tensor([[-0.2242, 0.1504, 0.1898, 0.0536, -0.1651, -0.0147, 0.1896, -0.2420, -0.1102, 0.2483],
[-0.1735, 0.0988, 0.2303, 0.0658, -0.0901, 0.0443, 0.1346, -0.1984, -0.0308, 0.1663]], grad_fn=)
When net(X) is executed, the actual call sequence is as follows:
The __call__ method is triggered.
PyTorch's nn.Module class overrides __call__, so when the instance net is called like a function, i.e. net(X), __call__ runs automatically; it handles the bookkeeping around forward propagation (e.g. hooks) and finally invokes the user-defined forward function.
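A minimal sketch of this dispatch (this is not the actual PyTorch source, which also runs pre- and post-forward hooks; the class names here are made up for illustration):
class ModuleSketch:
    """Toy stand-in for nn.Module, showing how __call__ dispatches to forward."""
    def __call__(self, *args, **kwargs):
        # The real nn.Module runs hooks and other bookkeeping here;
        # we go straight to the user-defined forward.
        return self.forward(*args, **kwargs)

    def forward(self, *args, **kwargs):
        raise NotImplementedError("subclasses must define forward")

class Doubler(ModuleSketch):
    def forward(self, x):
        return 2 * x

print(Doubler()(3))  # calling the instance triggers __call__, which calls forward -> 6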
Executing code inside the forward propagation function
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # Constant weights that are never updated during training (no gradients)
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # Reuse the same fully connected layer: both applications share parameters
        X = self.linear(X)
        # Control flow inside forward: halve X until its L1 norm is at most 1
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

net = FixedHiddenMLP()
print(net(X))
tensor(-0.2347, grad_fn=)
Mixing and matching composed blocks:
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                 nn.Linear(64, 32), nn.ReLU())
        self.Linear = nn.Linear(32, 16)

    def forward(self, X):
        return self.Linear(self.net(X))

chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
print(chimera(X))
tensor(0.5916, grad_fn=)
5.2 Parameter Management
Parameter access
Take an MLP with a single hidden layer as an example:
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
Accessing the parameters of a specific layer
print(net[2].state_dict())
OrderedDict([('weight', tensor([[-0.0279, 0.1508, -0.2685, -0.2710, -0.2749, -0.0494, -0.2761, -0.2905]])), ('bias', tensor([-0.1618]))])
Accessing a specific parameter of a specific layer
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)
print(net[2].weight)
Parameter containing:
tensor([-0.1618], requires_grad=True)
tensor([-0.1618])
Parameter containing:
tensor([[-0.0279, 0.1508, -0.2685, -0.2710, -0.2749, -0.0494, -0.2761, -0.2905]], requires_grad=True)
print(net[2].weight.grad == None) # True
Accessing all parameters
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])
print(net.state_dict()['2.bias'].data)
('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))
tensor([-0.1618])
Collecting parameters from nested blocks
Define the blocks:
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                         nn.Linear(8, 4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        # Dynamically add a submodule to the network and give it a name
        net.add_module(f'block {i}', block1())
    return net
Use the blocks:
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
Parameters of the nested blocks:
print(rgnet(X))
print(rgnet)
print(rgnet[0][1][0].bias.data)
tensor([[0.1059], [0.1059]], grad_fn=)
Sequential(
(0): Sequential(
(block 0): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
(block 1): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
(block 2): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
(block 3): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
)
(1): Linear(in_features=4, out_features=1, bias=True)
)
tensor([-0.3803, -0.0498, 0.3972, 0.1615, 0.1181, 0.1456, 0.0816, 0.0283])
Parameter initialization (built-in & custom)
Built-in:
By default, PyTorch initializes the weight and bias matrices uniformly over a range computed from the input and output dimensions.
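A quick sanity check of that default range (a sketch based on my understanding of current PyTorch defaults, where nn.Linear draws weights and biases roughly from U(-1/sqrt(fan_in), 1/sqrt(fan_in))):
import math
from torch import nn

layer = nn.Linear(4, 8)
bound = 1 / math.sqrt(layer.in_features)  # assumed default uniform bound
print(layer.weight.abs().max().item() <= bound)  # expected: True
print(layer.bias.abs().max().item() <= bound)    # expected: True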
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)

net.apply(init_normal)
print(net[0].weight.data[0], net[0].bias.data[0])
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)

net.apply(init_constant)
print(net[0].weight.data[0], net[0].bias.data[0])
# Apply different initialization methods to different layers
def init_xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)

def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)
The pattern above is self-explanatory; a few extra notes on Xavier initialization:
- For information to flow well through the network, the variance of each layer's outputs should stay roughly equal across layers.
Xavier uniform initialization:
nn.init.xavier_uniform_(layer.weight)
Xavier normal initialization:
nn.init.xavier_normal_(layer.weight)
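As a reminder of the formulas behind those two calls (with gain = 1): xavier_uniform_ samples from U(-a, a) with a = sqrt(6 / (fan_in + fan_out)), and xavier_normal_ uses a standard deviation of sqrt(2 / (fan_in + fan_out)). A small check of the uniform bound (an illustrative sketch, not from the original notes):
import math
from torch import nn

layer = nn.Linear(4, 8)
nn.init.xavier_uniform_(layer.weight)
fan_in, fan_out = layer.in_features, layer.out_features
a = math.sqrt(6 / (fan_in + fan_out))  # theoretical bound of the uniform distribution
print(layer.weight.abs().max().item() <= a)  # expected: True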
Custom:
def my_init(m):
    if type(m) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        # Keep only weights with magnitude >= 5; zero out the rest
        m.weight.data *= m.weight.data.abs() >= 5

net.apply(my_init)
print(net[0].weight[:2])
Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])
tensor([[ 0.0000, -9.4758, 7.6545, 6.8255], [ 5.1140, -9.1785, -0.0000, 0.0000]], grad_fn=)
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
print(net[0].weight.data[0])
tensor([42.0000, -8.4758, 8.6545, 7.8255])
Parameter tying
# Give the shared layer a name so that its parameters can be referenced
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure they are actually the same object, not just equal in value
print(net[2].weight.data[0] == net[4].weight.data[0])
tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
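Since net[2] and net[4] refer to the very same nn.Linear object, the gradients from both uses are accumulated into a single tensor during backpropagation. A quick extra check (not in the original notes) that confirms the identity:
print(net[2].weight is net[4].weight)  # True: same Parameter object

loss = net(X).sum()
loss.backward()
print(net[2].weight.grad is net[4].weight.grad)  # True: one shared gradient tensor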
5.4 Custom Layers
Layers without parameters
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return X - X.mean()
layer = CenteredLayer()
print(layer(torch.FloatTensor([1, 2, 3, 4, 5])))
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
Y = net(torch.rand(4, 8))
print(Y.mean())
tensor([-2., -1., 0., 1., 2.])
tensor(9.3132e-09, grad_fn=)
Because of floating-point precision, the result here is not exactly 0.
Layers with parameters
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))

    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)
linear = MyLinear(5, 3)
print(linear.weight)
print(linear(torch.rand(2,5)))
net = nn.Sequential(MyLinear(64,8),MyLinear(8,1))
print(net(torch.rand(2,64)))
Parameter containing:
tensor([[ 0.6317, -1.4218, -0.4263], [ 0.8780, -1.6636, 1.1964], [ 0.3438, 0.7003, -1.0633], [ 1.0698, 0.0897, -1.3074], [ 0.2074, 1.4064, 0.7034]], requires_grad=True)
tensor([[1.3331, 0.0000, 0.0000], [1.9260, 0.0000, 0.0000]])
tensor([[8.6352], [4.7161]])
5.5 Reading and Writing Files
Loading and saving tensors
A single tensor:
x = torch.arange(4)
torch.save(x, 'x-file')
x2 = torch.load('x-file')
print(x2)
tensor([0, 1, 2, 3])
A list of tensors:
y = torch.zeros(4)
torch.save([x, y], 'x-files')
x2, y2 = torch.load('x-files')
print((x2, y2))
(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))
Writing and reading a dictionary that maps strings to tensors:
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
print(mydict2)
{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}
Loading and saving model parameters
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)

    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))
# Save the model parameters
net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)
torch.save(net.state_dict(), 'mlp.params')
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()  # put the model in evaluation mode
Y_clone = clone(X)
print(Y_clone == Y)
tensor([[True, True, True, True, True, True, True, True, True, True],
[True, True, True, True, True, True, True, True, True, True]])
Note on the .eval() method:
- Puts the model in evaluation mode.
- Crucial for layers that behave differently during training and testing, such as dropout and batch normalization layers; an illustration follows below.
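A small illustration of that difference using a standalone Dropout layer (an illustrative sketch, not part of the original notes): in training mode Dropout randomly zeroes activations and rescales the rest, while in eval mode it is the identity.
import torch
from torch import nn

torch.manual_seed(0)
drop = nn.Dropout(p=0.5)
x = torch.ones(8)

drop.train()    # training mode: roughly half the entries are zeroed, the rest scaled by 2
print(drop(x))

drop.eval()     # evaluation mode: dropout is a no-op
print(drop(x))  # all ones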
5.6 GPU
Saved to the d2l package (via #@save):
def try_gpu(i=0):  #@save
    """Return cuda:i if it exists, otherwise return the CPU."""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():  #@save
    """Return all available GPUs, or the CPU if there is no GPU."""
    devices = [torch.device(f'cuda:{i}')
               for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]
Usage:
print(try_gpu())       # cuda:0
print(try_gpu(4))      # cpu
print(try_all_gpus())  # [device(type='cuda', index=0)]
Specifying a device
print(torch.device('cpu'))
print(torch.device('cuda'))
print(torch.device('cuda:1'))
Querying the number of available GPUs
print(torch.cuda.device_count()) # 1
Tensors and GPUs
By default, tensors are created on the CPU.
x = torch.tensor([1, 2, 3])
print(x.device)  # cpu
Storing on the GPU
X = torch.ones(2, 3, device=try_gpu())
print(X)
tensor([[1., 1., 1.], [1., 1., 1.]], device='cuda:0')
When trying to use a GPU that does not exist, the CPU is used instead.
Y = torch.rand(2, 3, device=try_gpu(1))
print(Y)
print(Y.device)  # cpu
Neural networks and GPUs
Store the model parameters on the same GPU:
net = nn.Sequential(nn.Linear(3,1))
net = net.to(device=try_gpu())
print(net(X))
print(net[0].weight.data.device)
tensor([[-0.5811], [-0.5811]], device='cuda:0', grad_fn=)
cuda:0
