05 - Deep Learning Computation
5.1 Layers and Blocks
Layer:
(1) takes a set of inputs; (2) produces the corresponding outputs; (3) is described by a set of adjustable parameters.
Block:
(1) a component that is larger than a single layer but smaller than the whole model;
(2) can describe a single layer, a component made up of multiple layers, or the entire model itself; blocks can also be combined into larger components;
(3) a block is represented by a class; any subclass of it must define a forward propagation function that transforms its inputs into outputs, and must store any required parameters; to compute gradients, the block must also have a backpropagation function (in practice, autograd supplies this automatically).
Code for a multilayer perceptron:
import torch
from torch import nn

net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
X = torch.rand(2, 20)
print(net(X))  # shorthand for net.__call__(X)
tensor([[-0.0598, -0.2316, -0.3619, 0.1329, -0.0471, -0.1033, -0.0660, 0.1027, 0.0881, -0.0803],
[ 0.0419, -0.2622, -0.2755, 0.0526, -0.0111, -0.1644, -0.0917, 0.0586, -0.0880, -0.0593]], grad_fn=)
Code for a custom block:
from torch.nn import functional as F

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    def forward(self, X):
        # functional (stateless) version of ReLU, from torch.nn.functional
        return self.out(F.relu(self.hidden(X)))

net = MLP()
print(net(X))
tensor([[-0.0404, -0.1485, -0.2479, 0.0589, 0.1766, 0.1838, 0.1994, 0.0569, 0.0653, 0.2753],
[-0.1073, -0.0745, -0.2666, 0.0762, 0.1180, 0.0527, 0.0867, 0.0300, -0.0015, 0.2650]], grad_fn=)
Sequential block
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            # _modules is an ordered dict; registering each module here lets
            # PyTorch track its parameters
            self._modules[str(idx)] = module

    def forward(self, X):
        for block in self._modules.values():
            X = block(X)
        return X

net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
print(net(X))
tensor([[-0.2242, 0.1504, 0.1898, 0.0536, -0.1651, -0.0147, 0.1896, -0.2420, -0.1102, 0.2483],
[-0.1735, 0.0988, 0.2303, 0.0658, -0.0901, 0.0443, 0.1346, -0.1984, -0.0308, 0.1663]], grad_fn=)
When net(X) is executed, the actual call sequence is as follows:
The __call__ method is triggered.
PyTorch's nn.Module class overrides __call__, so when the instance net is called like a function, i.e. net(X), __call__ runs automatically; it handles the bookkeeping around forward propagation (e.g. hooks) and finally invokes the user-defined forward function.
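A minimal sketch of this dispatch (this is not the actual PyTorch source, which also runs pre- and post-forward hooks; the class names here are made up for illustration):
class ModuleSketch:
    """Toy stand-in for nn.Module, showing how __call__ dispatches to forward."""
    def __call__(self, *args, **kwargs):
        # The real nn.Module runs hooks and other bookkeeping here;
        # we go straight to the user-defined forward.
        return self.forward(*args, **kwargs)

    def forward(self, *args, **kwargs):
        raise NotImplementedError("subclasses must define forward")

class Doubler(ModuleSketch):
    def forward(self, x):
        return 2 * x

print(Doubler()(3))  # calling the instance triggers __call__, which calls forward -> 6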
Executing code inside the forward propagation function
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # Constant weights that are never updated during training (no gradients)
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # Reuse the same fully connected layer: both applications share parameters
        X = self.linear(X)
        # Control flow inside forward: halve X until its L1 norm is at most 1
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

net = FixedHiddenMLP()
print(net(X))
tensor(-0.2347, grad_fn=)
Mixing and matching composed blocks:
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                 nn.Linear(64, 32), nn.ReLU())
        self.Linear = nn.Linear(32, 16)

    def forward(self, X):
        return self.Linear(self.net(X))

chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
print(chimera(X))
tensor(0.5916, grad_fn=)
5.2 Parameter Management
Parameter access
Take an MLP with a single hidden layer as an example:
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
Accessing the parameters of a specific layer
print(net[2].state_dict())
OrderedDict([('weight', tensor([[-0.0279, 0.1508, -0.2685, -0.2710, -0.2749, -0.0494, -0.2761, -0.2905]])), ('bias', tensor([-0.1618]))])
Accessing a specific parameter of a specific layer
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)
print(net[2].weight)
Parameter containing:
tensor([-0.1618], requires_grad=True)
tensor([-0.1618])
Parameter containing:
tensor([[-0.0279, 0.1508, -0.2685, -0.2710, -0.2749, -0.0494, -0.2761, -0.2905]], requires_grad=True)
print(net[2].weight.grad == None) # True
Accessing all parameters
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])
print(net.state_dict()['2.bias'].data)
('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))
tensor([-0.1618])
Collecting parameters from nested blocks
Define the blocks:
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                         nn.Linear(8, 4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        # Dynamically add a submodule to the network and give it a name
        net.add_module(f'block {i}', block1())
    return net
Use the blocks:
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
Parameters of the nested blocks:
print(rgnet(X))
print(rgnet)
print(rgnet[0][1][0].bias.data)
tensor([[0.1059], [0.1059]], grad_fn=)
Sequential(
(0): Sequential(
(block 0): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
(block 1): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
(block 2): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
(block 3): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
)
(1): Linear(in_features=4, out_features=1, bias=True)
)
tensor([-0.3803, -0.0498, 0.3972, 0.1615, 0.1181, 0.1456, 0.0816, 0.0283])
Parameter initialization (built-in & custom)
Built-in:
By default, PyTorch initializes the weight and bias matrices uniformly over a range computed from the input and output dimensions.
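A quick sanity check of that default range (a sketch based on my understanding of current PyTorch defaults, where nn.Linear draws weights and biases roughly from U(-1/sqrt(fan_in), 1/sqrt(fan_in))):
import math
from torch import nn

layer = nn.Linear(4, 8)
bound = 1 / math.sqrt(layer.in_features)  # assumed default uniform bound
print(layer.weight.abs().max().item() <= bound)  # expected: True
print(layer.bias.abs().max().item() <= bound)    # expected: True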
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)

net.apply(init_normal)
print(net[0].weight.data[0], net[0].bias.data[0])
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)

net.apply(init_constant)
print(net[0].weight.data[0], net[0].bias.data[0])
# Apply different initialization methods to different layers
def init_xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)

def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)
The pattern above is self-explanatory; a few extra notes on Xavier initialization:
- For information to flow well through the network, the variance of each layer's outputs should stay roughly equal across layers.
Xavier uniform initialization:
nn.init.xavier_uniform_(layer.weight)
Xavier normal initialization:
nn.init.xavier_normal_(layer.weight)
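As a reminder of the formulas behind those two calls (with gain = 1): xavier_uniform_ samples from U(-a, a) with a = sqrt(6 / (fan_in + fan_out)), and xavier_normal_ uses a standard deviation of sqrt(2 / (fan_in + fan_out)). A small check of the uniform bound (an illustrative sketch, not from the original notes):
import math
from torch import nn

layer = nn.Linear(4, 8)
nn.init.xavier_uniform_(layer.weight)
fan_in, fan_out = layer.in_features, layer.out_features
a = math.sqrt(6 / (fan_in + fan_out))  # theoretical bound of the uniform distribution
print(layer.weight.abs().max().item() <= a)  # expected: True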
Custom:
def my_init(m):
    if type(m) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        # Keep only weights with magnitude >= 5; zero out the rest
        m.weight.data *= m.weight.data.abs() >= 5

net.apply(my_init)
print(net[0].weight[:2])
Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])
tensor([[ 0.0000, -9.4758, 7.6545, 6.8255], [ 5.1140, -9.1785, -0.0000, 0.0000]], grad_fn=)
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
print(net[0].weight.data[0])
tensor([42.0000, -8.4758, 8.6545, 7.8255])
Parameter tying
# Give the shared layer a name so that its parameters can be referenced
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure they are actually the same object, not just equal in value
print(net[2].weight.data[0] == net[4].weight.data[0])
tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
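Since net[2] and net[4] refer to the very same nn.Linear object, the gradients from both uses are accumulated into a single tensor during backpropagation. A quick extra check (not in the original notes) that confirms the identity:
print(net[2].weight is net[4].weight)  # True: same Parameter object

loss = net(X).sum()
loss.backward()
print(net[2].weight.grad is net[4].weight.grad)  # True: one shared gradient tensor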
5.4 Custom Layers
Layers without parameters
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return X - X.mean()
layer = CenteredLayer()
print(layer(torch.FloatTensor([1, 2, 3, 4, 5])))
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
Y = net(torch.rand(4, 8))
print(Y.mean())
tensor([-2., -1., 0., 1., 2.])
tensor(9.3132e-09, grad_fn=)
Because of floating-point precision, the result here is not exactly 0.
Layers with parameters
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))

    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)
linear = MyLinear(5, 3)
print(linear.weight)
print(linear(torch.rand(2,5)))
net = nn.Sequential(MyLinear(64,8),MyLinear(8,1))
print(net(torch.rand(2,64)))
Parameter containing:
tensor([[ 0.6317, -1.4218, -0.4263], [ 0.8780, -1.6636, 1.1964], [ 0.3438, 0.7003, -1.0633], [ 1.0698, 0.0897, -1.3074], [ 0.2074, 1.4064, 0.7034]], requires_grad=True)
tensor([[1.3331, 0.0000, 0.0000], [1.9260, 0.0000, 0.0000]])
tensor([[8.6352], [4.7161]])
5.5 Reading and Writing Files
Loading and saving tensors
A single tensor:
x = torch.arange(4)
torch.save(x, 'x-file')
x2 = torch.load('x-file')
print(x2)
tensor([0, 1, 2, 3])
A list of tensors:
y = torch.zeros(4)
torch.save([x, y], 'x-files')
x2, y2 = torch.load('x-files')
print((x2, y2))
(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))
Writing and reading a dictionary that maps strings to tensors:
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
print(mydict2)
{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}
Loading and saving model parameters
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)

    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))
# Save the model parameters
net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)
torch.save(net.state_dict(), 'mlp.params')
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()  # put the model in evaluation mode
Y_clone = clone(X)
print(Y_clone == Y)
tensor([[True, True, True, True, True, True, True, True, True, True],
[True, True, True, True, True, True, True, True, True, True]])
Note on the .eval() method:
- Puts the model in evaluation mode.
- Crucial for layers that behave differently during training and testing, such as dropout and batch normalization layers; an illustration follows below.
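A small illustration of that difference using a standalone Dropout layer (an illustrative sketch, not part of the original notes): in training mode Dropout randomly zeroes activations and rescales the rest, while in eval mode it is the identity.
import torch
from torch import nn

torch.manual_seed(0)
drop = nn.Dropout(p=0.5)
x = torch.ones(8)

drop.train()    # training mode: roughly half the entries are zeroed, the rest scaled by 2
print(drop(x))

drop.eval()     # evaluation mode: dropout is a no-op
print(drop(x))  # all ones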
5.6 GPU
Saved to the d2l package (via #@save):
def try_gpu(i=0):  #@save
    """Return cuda:i if it exists, otherwise return the CPU."""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():  #@save
    """Return all available GPUs, or the CPU if there is no GPU."""
    devices = [torch.device(f'cuda:{i}')
               for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]
Usage:
print(try_gpu())       # cuda:0
print(try_gpu(4))      # cpu
print(try_all_gpus())  # [device(type='cuda', index=0)]
Specifying a device
print(torch.device('cpu'))
print(torch.device('cuda'))
print(torch.device('cuda:1'))
Querying the number of available GPUs
print(torch.cuda.device_count()) # 1
Tensors and GPUs
By default, tensors are created on the CPU.
x = torch.tensor([1, 2, 3])
print(x.device)  # cpu
Storing on the GPU
X = torch.ones(2, 3, device=try_gpu())
print(X)
tensor([[1., 1., 1.], [1., 1., 1.]], device='cuda:0')
When trying to use a GPU that does not exist, the CPU is used instead.
Y = torch.rand(2, 3, device=try_gpu(1))
print(Y)
print(Y.device)  # cpu
Neural networks and GPUs
Store the model parameters on the same GPU:
net = nn.Sequential(nn.Linear(3,1))
net = net.to(device=try_gpu())
print(net(X))
print(net[0].weight.data.device)
tensor([[-0.5811], [-0.5811]], device='cuda:0', grad_fn=)
cuda:0
