Reading Data
See the official example and this GitHub repo (recommended).
The Dataset class
A custom dataset must inherit from torch.utils.data.Dataset and override __len__ and __getitem__. __len__ returns the size of the dataset, and __getitem__ returns the \(i\)-th sample for a given index \(i\).
A simple example:
import os
from torch.utils.data.dataset import Dataset

class CustomDataset(Dataset):
    def __init__(self, root_dir):
        self.data_list = [os.path.join(root_dir, data_name) for data_name in os.listdir(root_dir)]

    def __getitem__(self, index):
        data_path = self.data_list[index]
        data = READ_OP(data_path)  # data reading
        return data

    def __len__(self):
        return len(self.data_list)
Given an instance dataset = CustomDataset(root_dir), data = dataset[99] (i.e. dataset.__getitem__(99)) returns the sample at index 99. Going further, the samples can be iterated over like this:
dataset = CustomDataset(root_dir)
for i in range(len(dataset)):
    sample = dataset[i]
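For instance, if root_dir contains image files, READ_OP could simply be PIL's Image.open. A minimal sketch (the class name and the './images' folder are hypothetical):

import os
from PIL import Image
from torch.utils.data import Dataset

class ImageFolderDataset(Dataset):
    """Same pattern as above, with PIL's Image.open as the READ_OP."""
    def __init__(self, root_dir):
        self.data_list = [os.path.join(root_dir, name) for name in os.listdir(root_dir)]

    def __getitem__(self, index):
        return Image.open(self.data_list[index]).convert('RGB')

    def __len__(self):
        return len(self.data_list)

dataset = ImageFolderDataset('./images')  # hypothetical folder of images
print(len(dataset), dataset[0].size)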
Using transform
Example:
import os
from torch.utils.data.dataset import Dataset
from torchvision import transforms

class CustomDataset(Dataset):
    def __init__(self, root_dir, transform):
        self.data_list = [os.path.join(root_dir, data_name) for data_name in os.listdir(root_dir)]
        self.transform = transform

    def __getitem__(self, index):
        data_path = self.data_list[index]
        data = READ_OP(data_path)  # data reading
        # apply transform
        if self.transform is not None:
            data = self.transform(data)
        return data

    def __len__(self):
        return len(self.data_list)

if __name__ == '__main__':
    transform = transforms.Compose([
        transforms.ToPILImage(),  # RandomCrop etc. require a PIL image
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])
    # apply the transformations to the dataset
    dataset = CustomDataset(root_dir, transform)
Using DataLoader
CustomDataset can fetch samples by index, but on its own it is not convenient for
- Batching the data
- Shuffling the data
- Loading the data in parallel using multiprocessing workers
Wrapping it with a DataLoader adds these capabilities:
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size=4,
                        shuffle=True, num_workers=4)
for data in dataloader:
    ...
Building the Network
Inherit from nn.Module; see the official example.
Example:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1)  # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()
print(net)
Note: torch.nn only supports mini-batches of samples, not a single sample. For example, the input to nn.Conv2d must be a 4D Tensor: Batch_size x Channels x Height x Width. To feed a single sample, use input.unsqueeze(0) to add a fake batch dimension.
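A minimal sketch, assuming the Net defined above (which expects 32x32 single-channel input):

single_image = torch.randn(1, 32, 32)   # C x H x W, a single sample
batch = single_image.unsqueeze(0)       # 1 x C x H x W, a "fake" batch of size 1
out = net(batch)
print(out.shape)                        # torch.Size([1, 10])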
Model Training
Compute the loss, backpropagate, and update the parameters. See the official example.
import torch.optim as optim
import torch.nn as nn
# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
# in your training loop:
optimizer.zero_grad() # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step() # Do the update
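Putting the pieces together, a typical training loop over a DataLoader looks roughly like this (a sketch; it assumes the dataset yields (input, target) pairs and that num_epochs, net, criterion, optimizer and dataloader are defined as above):

for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()               # clear gradients from the previous step
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()                     # backpropagate
        optimizer.step()                    # update parameters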
Using the Optimizer
See the official example.
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam([var1, var2], lr=0.0001)
Freezing some parameters
model.fc.requires_grad_(False)  # freeze all parameters of the fc layer
optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.1)
If these layers need to be trained again later, you can do:
model.fc.requires_grad_(True)
optimizer.add_param_group({'params': model.fc.parameters()})
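A quick sketch to double-check which parameters will actually receive gradients:

for name, param in model.named_parameters():
    print(name, param.requires_grad)   # True/False per parameter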
Setting different learning rules for different parameters
Pass the optimizer an iterable whose elements are dicts. Every dict must contain a params key that specifies the parameters to optimize.
optim.SGD([
    {'params': model.base.parameters()},
    {'params': model.classifier.parameters(), 'lr': 1e-3}
], lr=1e-2, momentum=0.9)
This means the model.base parameters use a learning rate of 1e-2, while the model.classifier parameters override it and use 1e-3; momentum = 0.9 is shared by both groups.
So how do we know which modules our model has and which parameters they correspond to? Check the model definition, or simply print(model). Below, the ResNet18 shipped with torchvision is used to show how to give the final classification fc layer and the earlier layers different learning rates.
See the CSDN post 《Pytorch中，动态调整学习率、不同层设置不同学习率和固定某些层训练的方法》.
from torch import nn, optim
import torchvision

def group_params_v1(model):
    cls_params = []
    for name, mod in model.named_modules():
        if type(mod) == nn.Linear:
            cls_params += mod.parameters()
        # else: x  Note: model.named_modules() traverses recursively, so an else branch here would NOT give only the non-fc parameters
    # filter(function, iterable): keeps the elements of <iterable> for which <function> returns True
    # list(map(id, cls_params)): the ids (memory addresses) of the parameters collected in cls_params
    base_params = filter(lambda x: id(x) not in list(map(id, cls_params)), model.parameters())
    return base_params, cls_params

def group_params_v2(model):
    params_dict = dict(model.named_parameters())
    cls_param_names = {
        name
        for name, param in model.named_parameters()
        if "fc" in name and param.requires_grad
    }
    base_param_names = params_dict.keys() - cls_param_names
    # print(cls_param_names, base_param_names)
    base_params = (params_dict[n] for n in sorted(base_param_names))
    cls_params = (params_dict[n] for n in sorted(cls_param_names))
    return base_params, cls_params

model = torchvision.models.resnet18(pretrained=True)
print(model)  # the last layer is the fc layer: (fc): Linear(in_features=512, out_features=1000, bias=True)

# base_params, cls_params = group_params_v1(model)
base_params, cls_params = group_params_v2(model)
optimizer = optim.SGD(
    [
        {'params': base_params, 'lr': 0.1},
        {'params': cls_params, 'lr': 0.01}
    ], momentum=0.9)
Using a scheduler
Taking the commonly used StepLR as an example, it is defined and used as follows:
from torch.optim import lr_scheduler

# StepLR: https://pytorch.org/docs/stable/optim.html#torch.optim.lr_scheduler.StepLR
scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
# MultiStepLR: https://pytorch.org/docs/stable/optim.html#torch.optim.lr_scheduler.MultiStepLR
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones, gamma=0.1)

for epoch in range(100):
    train(...)
    validate(...)
    scheduler.step()  # Learning rate scheduling should be applied after the optimizer's update
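To check the learning rate the scheduler has most recently computed (available since PyTorch 1.4), get_last_lr() returns one value per parameter group:

print(scheduler.get_last_lr())  # a list with one learning rate per parameter group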
Getting optimizer info
# === overview === #
print(optimizer)
# === output === #
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    momentum: 0.9
    nesterov: False
    weight_decay: 0
)

# get the learning rate
optimizer.param_groups[0]['lr']
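Conversely, the learning rate can also be changed by writing to param_groups directly, which is how manual schedules are usually implemented (a sketch; new_lr is a hypothetical value):

new_lr = 0.001  # hypothetical target learning rate
for param_group in optimizer.param_groups:
    param_group['lr'] = new_lr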
Training on the GPU
Training with a single GPU
See the official example.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print(device)
model.to(device) # or model.cuda()
input = input.to(device) # or input = input.cuda()
Training with multiple GPUs (DataParallel)
See the official example.
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = nn.DataParallel(model)
model.to(device)

for data in data_loader:
    input = data.to(device)
    output = model(input)
    print("Outside: input size", input.size(),
          "output_size", output.size())
DataParallel automatically splits the input batch across the GPUs and gathers the outputs back once each GPU finishes its computation.
If the machine has several GPUs and you only want some of them to be visible, you can run:
CUDA_VISIBLE_DEVICES=0,1 python train.py
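The same effect can be achieved from inside the script, as long as the variable is set before CUDA is first initialized (a sketch):

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # must be set before the first CUDA call in the process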
Inspecting data with visdom
See the notes on using visdom.
Inspecting data with tensorboard
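A minimal sketch with torch.utils.tensorboard (assumes the tensorboard package is installed; view the logs with tensorboard --logdir runs):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()  # writes event files to ./runs/ by default
for step in range(100):
    writer.add_scalar('train/loss', 1.0 / (step + 1), step)  # dummy scalar for illustration
writer.close()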
Saving and Loading Models
See the official example.
torch.save() and torch.load() use Python's pickle by default.
Saving and loading tensors
>>> t = torch.tensor([1., 2.])
>>> torch.save(t, 'tensor.pt') # save with a ‘.pt’ or ‘.pth’ extension
>>> torch.load('tensor.pt')
tensor([1., 2.])
Saving and loading multiple tensors
>>> d = {'a': torch.tensor([1., 2.]), 'b': torch.tensor([3., 4.])}
>>> torch.save(d, 'tensor_dict.pt')
>>> torch.load('tensor_dict.pt')
{'a': tensor([1., 2.]), 'b': tensor([3., 4.])}
Saving and loading model parameters
A module's state dict contains all of its parameters and persistent buffers:
>>> bn = torch.nn.BatchNorm1d(3, track_running_stats=True)
>>> bn.state_dict()
OrderedDict([('weight', tensor([1., 1., 1.])),
             ('bias', tensor([0., 0., 0.])),
             ('running_mean', tensor([0., 0., 0.])),
             ('running_var', tensor([1., 1., 1.])),
             ('num_batches_tracked', tensor(0))])
A simple example:
>>> torch.save(bn.state_dict(), 'bn.pt')
>>> bn_state_dict = torch.load('bn.pt') # load state dict from its file
>>> new_bn = torch.nn.BatchNorm1d(3, track_running_stats=True)
>>> new_bn.load_state_dict(bn_state_dict) # restore the state dict to a new module
<All keys matched successfully>
custom modules
# A module with two linear layers
>>> class MyModule(torch.nn.Module):
        def __init__(self):
            super(MyModule, self).__init__()
            self.l0 = torch.nn.Linear(4, 2)
            self.l1 = torch.nn.Linear(2, 1)

        def forward(self, input):
            out0 = self.l0(input)
            out0_relu = torch.nn.functional.relu(out0)
            return self.l1(out0_relu)

>>> m = MyModule()
>>> m.state_dict()
OrderedDict([('l0.weight', tensor([[ 0.1400, 0.4563, -0.0271, -0.4406],
                                   [-0.3289, 0.2827, 0.4588, 0.2031]])),
             ('l0.bias', tensor([ 0.0300, -0.1316])),
             ('l1.weight', tensor([[0.6533, 0.3413]])),
             ('l1.bias', tensor([-0.1112]))])
>>> torch.save(m.state_dict(), 'mymodule.pt')
>>> m_state_dict = torch.load('mymodule.pt')
>>> new_m = MyModule()
>>> new_m.load_state_dict(m_state_dict)
<All keys matched successfully>
Saving and loading a general checkpoint
See this Zhihu post.
# save
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
    # ... any other entries you need
}, PATH)

# load
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
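As the official tutorial points out, remember to put the model into the right mode after loading:

model.eval()   # for inference: disables dropout and uses BatchNorm running statistics
# -- or --
model.train()  # to resume training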
When do you need to override load_state_dict() yourself?
TODO
PyTorch Odds and Ends
PyTorch Tensors
Tensor initialization
data = [[1, 2],[3, 4]]
x_data = torch.tensor(data)
shape = (2, 3)
rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)
Converting between Tensor and numpy array
# from numpy array to tensor
tensor = torch.from_numpy(np_array)
# from tensor to numpy array
np_array = tensor.numpy()
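Note that torch.from_numpy shares memory with the numpy array, so in-place modifications on either side are visible on the other:

import numpy as np
import torch

np_array = np.ones(3)
tensor = torch.from_numpy(np_array)
np_array += 1                 # in-place change on the numpy side
print(tensor)                 # tensor([2., 2., 2.], dtype=torch.float64) -- the tensor sees it too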
Timing GPU code correctly
When computing on the GPU, you must synchronize before taking timestamps:
import time

torch.cuda.synchronize()
start = time.time()
result = model(input)
torch.cuda.synchronize()
end = time.time()
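An alternative sketch using CUDA events, which time the work on the GPU itself (model and input are assumed as above, on a CUDA device):

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

start_event.record()
result = model(input)
end_event.record()
torch.cuda.synchronize()                      # wait until the recorded events have completed
print(start_event.elapsed_time(end_event))    # elapsed time in milliseconds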
module.named_x()
named_modules(), named_children(), and named_parameters() all return an iterator. A demo:
import torch
import torch.nn as nn

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(3, 4)
        self.sequence = nn.Sequential(
            nn.Linear(5, 4),
            nn.ReLU(),
        )

model = ToyModel()
- named_modules() recursively returns all modules:
for name, mod in model.named_modules():
    print(name, mod)
'''out
ToyModel(
  (fc1): Linear(in_features=3, out_features=4, bias=True)
  (sequence): Sequential(
    (0): Linear(in_features=5, out_features=4, bias=True)
    (1): ReLU()
  )
)
fc1 Linear(in_features=3, out_features=4, bias=True)
sequence Sequential(
  (0): Linear(in_features=5, out_features=4, bias=True)
  (1): ReLU()
)
sequence.0 Linear(in_features=5, out_features=4, bias=True)
sequence.1 ReLU()
'''
- named_children() returns only the immediate children modules:
for name, child in model.named_children():
    print(name, child)
'''out
fc1 Linear(in_features=3, out_features=4, bias=True)
sequence Sequential(
  (0): Linear(in_features=5, out_features=4, bias=True)
  (1): ReLU()
)
'''
- named_parameters() returns the parameters of all modules:
for name, param in model.named_parameters():
    print(name, param.shape)
'''out
fc1.weight torch.Size([4, 3])
fc1.bias torch.Size([4])
sequence.0.weight torch.Size([4, 5])
sequence.0.bias torch.Size([4])
'''
Common Code Snippets
Other
- Getting a Python value out of a torch tensor: torch_tensor.item(). Note that .item() only works on a tensor with a single element; to pull values out of a list of tensors, use [tensor.item() for tensor in tensor_list].
- Getting detailed dtype information:
  - Floating-point types: torch.finfo(dtype), e.g.
    print(torch.finfo(torch.float32))  # finfo(resolution=1e-06, min=-3.40282e+38, max=3.40282e+38, eps=1.19209e-07, tiny=1.17549e-38, dtype=float32)
  - Integer types: torch.iinfo(dtype), e.g.
    print(torch.iinfo(torch.int8))  # iinfo(min=-128, max=127, dtype=int8)
- Bit shift: shifting by a negative amount is undefined behavior. Plain Python raises ValueError: negative shift count, but PyTorch raises no error and silently produces unexpected results, so take care with shift operations. For example:
  a = torch.tensor(8)  # dtype = torch.int64 (default)
  print(a >> 2)   # 2
  print(a << -2)  # 0 !!!
- Getting the device a model lives on:
  model = Model()
  # device = model.device()  # WRONG ==> 'Model' object has no attribute 'device'
  device = next(model.parameters()).device  # 'cpu' or 'cuda:0'
Pitfalls
ValueError: can't optimize a non-leaf Tensor
With the following code:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
scales = nn.Parameter(torch.ones(in_channels), requires_grad=True)
scales = scales.to(device)
optimizer = torch.optim.SGD([scales], lr=0.01, momentum=0.9)
you get ValueError: can't optimize a non-leaf Tensor, because scales.to(device) rebinds scales to a new, non-leaf tensor.
The correct usage is:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
equalize_scales = nn.Parameter(torch.ones(in_channels), requires_grad=True)
scale_optimizer = torch.optim.SGD([equalize_scales], lr=0.01, momentum=0.9)  # pass the leaf Parameter to the optimizer
scales = equalize_scales.to(device)  # non-leaf copy used in computation; gradients still flow back to equalize_scales
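A quick way to see what counts as a leaf tensor is to check is_leaf (a minimal sketch):

import torch
import torch.nn as nn

p = nn.Parameter(torch.ones(3))
print(p.is_leaf)    # True  -> can be handed to an optimizer
q = p * 1.0         # any op on a requires-grad tensor produces a non-leaf result, just like .to(device)
print(q.is_leaf)    # False -> optimizing q raises the same ValueError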