Hung-yi Lee ML2022

HW01

To pass the Medium Baseline, only the feature-selection step needs to change: select all of the features, plus the results, from the first four days.

feat_idx = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,53,69,86,101]
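For context, here is a minimal sketch of where this list plugs in, assuming the course template's select_feat helper (with the target in the last column):

def select_feat(train_data, valid_data, test_data, select_all=False):
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:, :-1], valid_data[:, :-1], test_data
    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        feat_idx = list(range(1, 39)) + [53, 69, 86, 101]  # same indices as the list above
    return raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx], raw_x_test[:, feat_idx], y_train, y_valid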

Strong Baseline

Adjust the neural-network architecture and add L2 regularization.

self.layers = nn.Sequential(
    nn.Linear(input_dim, 16),
    nn.LeakyReLU(0.1),
    nn.Linear(16, 8),
    nn.LeakyReLU(0.1),
    nn.Linear(8, 1)
)

optimizer = torch.optim.RMSprop(model.parameters(), lr=config['learning_rate'] * 10,
                                weight_decay=1e-3)
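In PyTorch, the optimizer's weight_decay argument is the L2-regularization coefficient, so no explicit penalty term needs to be added to the loss.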

Boss Baseline

Keep adjusting the network architecture.

The usual ordering is: input → conv1 → BatchNorm1 → activation → Dropout → conv2 → BatchNorm2 → activation

BatchNorm1d goes before the activation function, and Dropout comes after the activation.

self.layers = nn.Sequential(
    nn.Linear(input_dim, 64),
    nn.BatchNorm1d(64),  # batch normalization before the activation
    nn.LeakyReLU(0.2),
    nn.Dropout(0.2),     # random dropout after the activation to curb overfitting

    nn.Linear(64, 16),
    nn.LeakyReLU(0.2),
    nn.Dropout(0.1),

    nn.Linear(16, 1)
)

Add SelectKBest for feature selection, keeping the k highest-scoring features to suppress noise.

import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression

k = 32
selector = SelectKBest(score_func=f_regression, k=k)
result = selector.fit(x_data, y_data)
idx = np.argsort(result.scores_)[::-1]  # rank features by score, descending
selected_idx = list(np.sort(idx[:k]))   # keep the top k, restored to index order
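The resulting indices then replace the hand-picked list from before, e.g. x_data = x_data[:, selected_idx].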

Adjust the optimizer and add cosine annealing with warm restarts, which varies the learning rate automatically and helps the model escape critical points.

optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'] * 50,
                             weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,
                                                                 T_0=2, T_mult=2,
                                                                 eta_min=config['learning_rate'])
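The schedule only advances when the scheduler is stepped; a minimal sketch of the loop structure (train_one_epoch is a hypothetical placeholder for the homework's inner training loop):

for epoch in range(config['n_epochs']):
    train_one_epoch(model, train_loader, optimizer)  # hypothetical helper
    scheduler.step()  # advance the warm-restart schedule once per epoch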

HW02

For the Medium Baseline, tuning concat_nframes, hidden_layers, and hidden_dim is enough.

The Strong Baseline requires tuning the hyperparameters and adding batch normalization and dropout.

Note that to make training faster while still converging, batch_size should be increased; the number of epochs then also needs to grow to reach the same accuracy.

class BasicBlock(nn.Module):
    # only self.block changes relative to the course template
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.block = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.BatchNorm1d(output_dim),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),
        )

    def forward(self, x):
        return self.block(x)

# data parameters
concat_nframes = 19          # the number of frames to concat with, n must be odd (total 2k+1 = n frames)
train_ratio = 0.8            # the ratio of data used for training, the rest will be used for validation

# training parameters
seed = 0                     # random seed
batch_size = 2048            # batch size
num_epoch = 42               # the number of training epochs
learning_rate = 0.0001       # learning rate
model_path = './model.ckpt'  # the path where the checkpoint will be saved

# model parameters
input_dim = 39 * concat_nframes  # the input dim of the model, you should not change the value
hidden_layers = 14               # the number of hidden layers
hidden_dim = 1024                # the hidden dim
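For reference, a sketch of how these blocks and the hidden_layers / hidden_dim settings come together, assuming the course template's Classifier:

class Classifier(nn.Module):
    def __init__(self, input_dim, output_dim=41, hidden_layers=1, hidden_dim=256):
        super().__init__()
        self.fc = nn.Sequential(
            BasicBlock(input_dim, hidden_dim),
            *[BasicBlock(hidden_dim, hidden_dim) for _ in range(hidden_layers)],
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

model = Classifier(input_dim=input_dim, hidden_layers=hidden_layers, hidden_dim=hidden_dim)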

The Boss Baseline requires implementing a BiLSTM yourself (paired with a CRF in the code below).

# preparing data
import os
import random
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

def load_feat(path):
    feat = torch.load(path)
    return feat

def shift(x, n):
    if n < 0:
        left = x[0].repeat(-n, 1)
        right = x[:n]
    elif n > 0:
        right = x[-1].repeat(n, 1)
        left = x[n:]
    else:
        return x

    return torch.cat((left, right), dim=0)

def concat_feat(x, concat_n):
    assert concat_n % 2 == 1  # n must be odd
    if concat_n < 2:
        return x
    seq_len, feature_dim = x.size(0), x.size(1)
    x = x.repeat(1, concat_n)
    x = x.view(seq_len, concat_n, feature_dim).permute(1, 0, 2)  # concat_n, seq_len, feature_dim
    mid = (concat_n // 2)
    for r_idx in range(1, mid + 1):
        x[mid + r_idx, :] = shift(x[mid + r_idx], r_idx)
        x[mid - r_idx, :] = shift(x[mid - r_idx], -r_idx)

    return x.permute(1, 0, 2).view(seq_len, concat_n * feature_dim)

def preprocess_data(split, feat_dir, phone_path, concat_nframes, train_ratio=0.8, train_val_seed=1337):
    class_num = 41  # NOTE: pre-computed, should not need change
    mode = 'train' if (split == 'train' or split == 'val') else 'test'

    label_dict = {}
    if mode != 'test':
        phone_file = open(os.path.join(phone_path, f'{mode}_labels.txt')).readlines()

        for line in phone_file:
            line = line.strip('\n').split(' ')
            label_dict[line[0]] = [int(p) for p in line[1:]]

    if split == 'train' or split == 'val':
        # split training and validation data
        usage_list = open(os.path.join(phone_path, 'train_split.txt')).readlines()
        random.seed(train_val_seed)
        random.shuffle(usage_list)
        percent = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:percent] if split == 'train' else usage_list[percent:]
    elif split == 'test':
        usage_list = open(os.path.join(phone_path, 'test_split.txt')).readlines()
    else:
        raise ValueError('Invalid \'split\' argument for dataset: PhoneDataset!')

    usage_list = [line.strip('\n') for line in usage_list]
    print('[Dataset] - # phone classes: ' + str(class_num) + ', number of utterances for ' + split + ': ' + str(len(usage_list)))

    max_len = 3000000
    X = torch.empty(max_len, 39 * concat_nframes)
    if mode != 'test':
        y = torch.empty(max_len, concat_nframes, dtype=torch.long)

    idx = 0
    for i, fname in tqdm(enumerate(usage_list)):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
        cur_len = len(feat)
        feat = concat_feat(feat, concat_nframes)
        if mode != 'test':
            label = torch.LongTensor(label_dict[fname]).unsqueeze(1)
            label = concat_feat(label, concat_nframes)

        X[idx: idx + cur_len, :] = feat
        if mode != 'test':
            y[idx: idx + cur_len] = label

        idx += cur_len

    X = X[:idx, :]
    if mode != 'test':
        y = y[:idx]

    print(f'[INFO] {split} set')
    print(X.shape)
    if mode != 'test':
        print(y.shape)
        return X, y
    else:
        return X

# Define Dataset
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class LibriDataset(Dataset):
    def __init__(self, X, y=None):
        self.data = X
        if y is not None:
            self.label = torch.LongTensor(y)
        else:
            self.label = None

    def __getitem__(self, idx):
        if self.label is not None:
            return self.data[idx].view(-1, 39), self.label[idx]
        else:
            return self.data[idx].view(-1, 39)

    def __len__(self):
        return len(self.data)

# Define Model
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchcrf import CRF  # pip install pytorch-crf

# RNN with a bidirectional LSTM
class BiLSTM(nn.Module):
    def __init__(self, class_size=41, input_dim=39, hidden_dim=512, dropout=0.5):
        # Label: 41 classes, each class represents a phoneme
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.class_size = class_size
        self.lstm = nn.LSTM(input_dim, hidden_dim // 2, dropout=dropout,
                            num_layers=4, bidirectional=True, batch_first=True)
        # project to label space: emission scores for every time step
        self.hidden2tag = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, class_size)
        )

    def forward(self, x):
        feats, _ = self.lstm(x)
        # return the emission score matrix
        return self.hidden2tag(feats)

class Crf(nn.Module):
    def __init__(self, class_size=41):
        super().__init__()
        self.class_size = class_size
        self.crf = CRF(self.class_size, batch_first=True)

    def likelihood(self, x, y):
        # returns the CRF log-likelihood of y; it is negated during
        # training to obtain the negative log-likelihood (NLL) loss
        return self.crf(x, y)

    def forward(self, x):
        # returns the predicted label sequence of shape (batch_size, seq_len):
        # self.crf.decode(x) runs Viterbi decoding and returns plain Python lists,
        # and torch.LongTensor() converts them into a Long tensor
        return torch.LongTensor(self.crf.decode(x))
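
# A quick shape sanity check for the two modules (illustrative values only,
# not part of the homework code):
_x = torch.randn(8, 51, 39)                     # (batch, seq_len, MFCC dim)
_emissions = BiLSTM()(_x)                       # -> (8, 51, 41) emission scores
_labels = torch.randint(0, 41, (8, 51))         # random phoneme ids shaped like a label batch
_loss = -Crf().likelihood(_emissions, _labels)  # scalar NLL loss
_preds = Crf()(_emissions)                      # -> (8, 51) decoded label sequence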

# data parameters
concat_nframes = 51            # the number of frames to concat with, n must be odd (total 2k+1 = n frames)
mid = concat_nframes // 2
train_ratio = 0.85             # the ratio of data used for training, the rest will be used for validation
early_stopping = 12            # epochs to wait without improvement before stopping

# training parameters
seed = 0                       # random seed
batch_size = 2048              # batch size
num_epoch = 52                 # the number of training epochs
learning_rate = 0.00001        # learning rate
model1_path = './model1.ckpt'  # the path where the checkpoint will be saved
model2_path = './model2.ckpt'  # the path where the checkpoint will be saved

# model parameters
input_dim = 39 * concat_nframes  # the input dim of the model, you should not change the value
hidden_layers = 4                # the number of hidden layers
hidden_dim = 512                 # the hidden dim



import gc

# preprocess data
train_X, train_y = preprocess_data(split='train', feat_dir='./libriphone/feat', phone_path='./libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio)
val_X, val_y = preprocess_data(split='val', feat_dir='./libriphone/feat', phone_path='./libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio)

# get dataset
train_set = LibriDataset(train_X, train_y)
val_set = LibriDataset(val_X, val_y)

# remove raw feature to save memory
del train_X, train_y, val_X, val_y
gc.collect()

# get dataloader
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)


# fix random seed (same_seeds is the seed-fixing helper from the course notebook)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
same_seeds(seed)

# create model, define a loss function, and optimizer
bilstm = BiLSTM().to(device)
crf = Crf().to(device)
optimizer1 = torch.optim.AdamW(bilstm.parameters(),
                               lr=learning_rate * 200, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer1,
                                                                 T_0=4,
                                                                 T_mult=2,
                                                                 eta_min=learning_rate / 2)
optimizer2 = torch.optim.AdamW(crf.parameters(),
                               lr=learning_rate * 200, weight_decay=1e-8)

best_acc = 0.0
early_stop_count = 0
for epoch in range(num_epoch):
    # reset the running metrics
    train_acc = 0.0   # training accuracy accumulator
    train_loss = 0.0  # training loss accumulator
    val_acc = 0.0     # validation accuracy accumulator
    val_loss = 0.0    # validation loss accumulator (not actually used below)
    train_item = 0    # number of training samples processed (for the average loss)

    # ================== training ==================
    bilstm.train()  # training mode (enables Dropout)
    crf.train()     # training mode (a no-op for the CRF, kept for consistency)
    pbar = tqdm(train_loader, ncols=110)
    pbar.set_description(f'T: {epoch+1}/{num_epoch}')

    for i, batch in enumerate(pbar):
        features, labels = batch
        features = features.to(device)
        labels = labels.to(device)

        # zero the gradients of both models
        optimizer1.zero_grad()
        optimizer2.zero_grad()

        # forward pass and loss
        emissions = bilstm(features)               # BiLSTM emission scores
        loss = -crf.likelihood(emissions, labels)  # negative log-likelihood from the CRF

        # backward pass with gradient clipping
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(bilstm.parameters(), max_norm=50)  # guard against exploding gradients

        # parameter updates
        optimizer1.step()  # update the BiLSTM
        optimizer2.step()  # update the CRF

        # bookkeeping
        train_loss += loss.item()
        train_item += labels.size(0)
        lr1 = optimizer1.param_groups[0]["lr"]  # current BiLSTM learning rate
        lr2 = optimizer2.param_groups[0]["lr"]  # current CRF learning rate
        pbar.set_postfix({'lr1': lr1, 'lr2': lr2, 'loss': train_loss / train_item})

    scheduler.step()  # advance the BiLSTM learning-rate schedule
    pbar.close()

    # ================== validation ==================
    if len(val_set) > 0:
        bilstm.eval()  # evaluation mode (disables Dropout)
        crf.eval()
        with torch.no_grad():
            pbar = tqdm(val_loader, ncols=110)
            pbar.set_description(f'V: {epoch+1}/{num_epoch}')
            samples = 0
            for i, batch in enumerate(pbar):
                features, labels = batch
                features, labels = features.to(device), labels.to(device)
                outputs = crf(bilstm(features))  # decoded label sequences (on CPU)
                val_acc += (outputs[:, mid] == labels[:, mid].cpu()).sum().item()  # accuracy at the center frame
                samples += labels.size(0)
                pbar.set_postfix({'val acc': val_acc / samples})
            pbar.close()

        # checkpointing and early stopping
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(bilstm.state_dict(), model1_path)  # save the BiLSTM weights
            torch.save(crf.state_dict(), model2_path)     # save the CRF weights
            print('saving model with acc {:.3f}'.format(best_acc / len(val_set)))
            early_stop_count = 0  # reset the early-stopping counter
        else:
            early_stop_count += 1
            if early_stop_count >= early_stopping:
                print(f"Epoch: {epoch + 1}, early stopping.")
                break
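
# Prepare the test set before inference (mirrors the train/val preparation above;
# paths assumed to match those used earlier):
test_X = preprocess_data(split='test', feat_dir='./libriphone/feat', phone_path='./libriphone', concat_nframes=concat_nframes)
test_set = LibriDataset(test_X, None)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)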


# inference on the test set
pred = np.array([], dtype=np.int32)

bilstm.eval()
crf.eval()
with torch.no_grad():
    for features in tqdm(test_loader):
        features = features.to(device)
        outputs = crf(bilstm(features))  # decoded label sequences
        pred = np.concatenate((pred, outputs.detach().cpu()[:, mid]), axis=0)  # keep the center-frame prediction
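
Finally, the predictions can be written out in the homework's submission format (a sketch, assuming the usual Id,Class CSV layout):

with open('prediction.csv', 'w') as f:
    f.write('Id,Class\n')
    for i, y in enumerate(pred):
        f.write('{},{}\n'.format(i, y))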