Hung-yi Lee ML2022

HW01

To pass the Medium Baseline, only the feature-selection step needs to change: select all of the features, plus the results, from the first four days.

feat_idx = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,53,69,86,101]
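For context, here is a minimal sketch of where this list plugs in, assuming the course template's select_feat helper (with the target in the last column):

def select_feat(train_data, valid_data, test_data, select_all=False):
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:, :-1], valid_data[:, :-1], test_data
    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        feat_idx = list(range(1, 39)) + [53, 69, 86, 101]  # same indices as the list above
    return raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx], raw_x_test[:, feat_idx], y_train, y_valid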

Strong Baseline

Adjust the neural-network architecture and add L2 regularization.

self.layers = nn.Sequential(
    nn.Linear(input_dim, 16),
    nn.LeakyReLU(0.1),
    nn.Linear(16, 8),
    nn.LeakyReLU(0.1),
    nn.Linear(8, 1)
)

optimizer = torch.optim.RMSprop(model.parameters(), lr=config['learning_rate'] * 10,
                                weight_decay=1e-3)
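In PyTorch, the optimizer's weight_decay argument is the L2-regularization coefficient, so no explicit penalty term needs to be added to the loss.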

Boss Baseline

Keep adjusting the network architecture.

The usual ordering is: input → conv1 → BatchNorm1 → activation → Dropout → conv2 → BatchNorm2 → activation

BatchNorm1d goes before the activation function, and Dropout comes after the activation.

self.layers = nn.Sequential(
    nn.Linear(input_dim, 64),
    nn.BatchNorm1d(64),  # batch normalization before the activation
    nn.LeakyReLU(0.2),
    nn.Dropout(0.2),     # random dropout after the activation to curb overfitting

    nn.Linear(64, 16),
    nn.LeakyReLU(0.2),
    nn.Dropout(0.1),

    nn.Linear(16, 1)
)

Add SelectKBest for feature selection, keeping the k highest-scoring features to suppress noise.

import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression

k = 32
selector = SelectKBest(score_func=f_regression, k=k)
result = selector.fit(x_data, y_data)
idx = np.argsort(result.scores_)[::-1]  # rank features by score, descending
selected_idx = list(np.sort(idx[:k]))   # keep the top k, restored to index order
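The resulting indices then replace the hand-picked list from before, e.g. x_data = x_data[:, selected_idx].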

Adjust the optimizer and add cosine annealing with warm restarts, which varies the learning rate automatically and helps the model escape critical points.

optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'] * 50,
                             weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,
                                                                 T_0=2, T_mult=2,
                                                                 eta_min=config['learning_rate'])
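The schedule only advances when the scheduler is stepped; a minimal sketch of the loop structure (train_one_epoch is a hypothetical placeholder for the homework's inner training loop):

for epoch in range(config['n_epochs']):
    train_one_epoch(model, train_loader, optimizer)  # hypothetical helper
    scheduler.step()  # advance the warm-restart schedule once per epoch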

HW02

For the Medium Baseline, tuning concat_nframes, hidden_layers, and hidden_dim is enough.

The Strong Baseline requires tuning the hyperparameters and adding batch normalization and dropout.

Note that to make training faster while still converging, batch_size should be increased; the number of epochs then also needs to grow to reach the same accuracy.

class BasicBlock(nn.Module):
    # only self.block changes relative to the course template
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.block = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.BatchNorm1d(output_dim),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),
        )

    def forward(self, x):
        return self.block(x)

# data parameters
concat_nframes = 19          # the number of frames to concat with, n must be odd (total 2k+1 = n frames)
train_ratio = 0.8            # the ratio of data used for training, the rest will be used for validation

# training parameters
seed = 0                     # random seed
batch_size = 2048            # batch size
num_epoch = 42               # the number of training epochs
learning_rate = 0.0001       # learning rate
model_path = './model.ckpt'  # the path where the checkpoint will be saved

# model parameters
input_dim = 39 * concat_nframes  # the input dim of the model, you should not change the value
hidden_layers = 14               # the number of hidden layers
hidden_dim = 1024                # the hidden dim
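For reference, a sketch of how these blocks and the hidden_layers / hidden_dim settings come together, assuming the course template's Classifier:

class Classifier(nn.Module):
    def __init__(self, input_dim, output_dim=41, hidden_layers=1, hidden_dim=256):
        super().__init__()
        self.fc = nn.Sequential(
            BasicBlock(input_dim, hidden_dim),
            *[BasicBlock(hidden_dim, hidden_dim) for _ in range(hidden_layers)],
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

model = Classifier(input_dim=input_dim, hidden_layers=hidden_layers, hidden_dim=hidden_dim)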

The Boss Baseline requires implementing a BiLSTM yourself (paired with a CRF in the code below).

# preparing data
import os
import random
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

def load_feat(path):
    feat = torch.load(path)
    return feat

def shift(x, n):
    if n < 0:
        left = x[0].repeat(-n, 1)
        right = x[:n]
    elif n > 0:
        right = x[-1].repeat(n, 1)
        left = x[n:]
    else:
        return x

    return torch.cat((left, right), dim=0)

def concat_feat(x, concat_n):
    assert concat_n % 2 == 1  # n must be odd
    if concat_n < 2:
        return x
    seq_len, feature_dim = x.size(0), x.size(1)
    x = x.repeat(1, concat_n)
    x = x.view(seq_len, concat_n, feature_dim).permute(1, 0, 2)  # concat_n, seq_len, feature_dim
    mid = (concat_n // 2)
    for r_idx in range(1, mid + 1):
        x[mid + r_idx, :] = shift(x[mid + r_idx], r_idx)
        x[mid - r_idx, :] = shift(x[mid - r_idx], -r_idx)

    return x.permute(1, 0, 2).view(seq_len, concat_n * feature_dim)

def preprocess_data(split, feat_dir, phone_path, concat_nframes, train_ratio=0.8, train_val_seed=1337):
    class_num = 41  # NOTE: pre-computed, should not need change
    mode = 'train' if (split == 'train' or split == 'val') else 'test'

    label_dict = {}
    if mode != 'test':
        phone_file = open(os.path.join(phone_path, f'{mode}_labels.txt')).readlines()

        for line in phone_file:
            line = line.strip('\n').split(' ')
            label_dict[line[0]] = [int(p) for p in line[1:]]

    if split == 'train' or split == 'val':
        # split training and validation data
        usage_list = open(os.path.join(phone_path, 'train_split.txt')).readlines()
        random.seed(train_val_seed)
        random.shuffle(usage_list)
        percent = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:percent] if split == 'train' else usage_list[percent:]
    elif split == 'test':
        usage_list = open(os.path.join(phone_path, 'test_split.txt')).readlines()
    else:
        raise ValueError('Invalid \'split\' argument for dataset: PhoneDataset!')

    usage_list = [line.strip('\n') for line in usage_list]
    print('[Dataset] - # phone classes: ' + str(class_num) + ', number of utterances for ' + split + ': ' + str(len(usage_list)))

    max_len = 3000000
    X = torch.empty(max_len, 39 * concat_nframes)
    if mode != 'test':
        y = torch.empty(max_len, concat_nframes, dtype=torch.long)

    idx = 0
    for i, fname in tqdm(enumerate(usage_list)):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
        cur_len = len(feat)
        feat = concat_feat(feat, concat_nframes)
        if mode != 'test':
            label = torch.LongTensor(label_dict[fname]).unsqueeze(1)
            label = concat_feat(label, concat_nframes)

        X[idx: idx + cur_len, :] = feat
        if mode != 'test':
            y[idx: idx + cur_len] = label

        idx += cur_len

    X = X[:idx, :]
    if mode != 'test':
        y = y[:idx]

    print(f'[INFO] {split} set')
    print(X.shape)
    if mode != 'test':
        print(y.shape)
        return X, y
    else:
        return X

# Define Dataset
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class LibriDataset(Dataset):
    def __init__(self, X, y=None):
        self.data = X
        if y is not None:
            self.label = torch.LongTensor(y)
        else:
            self.label = None

    def __getitem__(self, idx):
        if self.label is not None:
            return self.data[idx].view(-1, 39), self.label[idx]
        else:
            return self.data[idx].view(-1, 39)

    def __len__(self):
        return len(self.data)

# Define Model
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchcrf import CRF  # pip install pytorch-crf

# RNN with a bidirectional LSTM
class BiLSTM(nn.Module):
    def __init__(self, class_size=41, input_dim=39, hidden_dim=512, dropout=0.5):
        # Label: 41 classes, each class represents a phoneme
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.class_size = class_size
        self.lstm = nn.LSTM(input_dim, hidden_dim // 2, dropout=dropout,
                            num_layers=4, bidirectional=True, batch_first=True)
        # project to label space: emission scores for every time step
        self.hidden2tag = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, class_size)
        )

    def forward(self, x):
        feats, _ = self.lstm(x)
        # return the emission score matrix
        return self.hidden2tag(feats)

class Crf(nn.Module):
    def __init__(self, class_size=41):
        super().__init__()
        self.class_size = class_size
        self.crf = CRF(self.class_size, batch_first=True)

    def likelihood(self, x, y):
        # returns the CRF log-likelihood of y; it is negated during
        # training to obtain the negative log-likelihood (NLL) loss
        return self.crf(x, y)

    def forward(self, x):
        # returns the predicted label sequence of shape (batch_size, seq_len):
        # self.crf.decode(x) runs Viterbi decoding and returns plain Python lists,
        # and torch.LongTensor() converts them into a Long tensor
        return torch.LongTensor(self.crf.decode(x))
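
# A quick shape sanity check for the two modules (illustrative values only,
# not part of the homework code):
_x = torch.randn(8, 51, 39)                     # (batch, seq_len, MFCC dim)
_emissions = BiLSTM()(_x)                       # -> (8, 51, 41) emission scores
_labels = torch.randint(0, 41, (8, 51))         # random phoneme ids shaped like a label batch
_loss = -Crf().likelihood(_emissions, _labels)  # scalar NLL loss
_preds = Crf()(_emissions)                      # -> (8, 51) decoded label sequence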

# data parameters
concat_nframes = 51            # the number of frames to concat with, n must be odd (total 2k+1 = n frames)
mid = concat_nframes // 2
train_ratio = 0.85             # the ratio of data used for training, the rest will be used for validation
early_stopping = 12            # epochs to wait without improvement before stopping

# training parameters
seed = 0                       # random seed
batch_size = 2048              # batch size
num_epoch = 52                 # the number of training epochs
learning_rate = 0.00001        # learning rate
model1_path = './model1.ckpt'  # the path where the checkpoint will be saved
model2_path = './model2.ckpt'  # the path where the checkpoint will be saved

# model parameters
input_dim = 39 * concat_nframes  # the input dim of the model, you should not change the value
hidden_layers = 4                # the number of hidden layers
hidden_dim = 512                 # the hidden dim



import gc

# preprocess data
train_X, train_y = preprocess_data(split='train', feat_dir='./libriphone/feat', phone_path='./libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio)
val_X, val_y = preprocess_data(split='val', feat_dir='./libriphone/feat', phone_path='./libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio)

# get dataset
train_set = LibriDataset(train_X, train_y)
val_set = LibriDataset(val_X, val_y)

# remove raw feature to save memory
del train_X, train_y, val_X, val_y
gc.collect()

# get dataloader
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)


# fix random seed (same_seeds is the seed-fixing helper from the course notebook)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
same_seeds(seed)

# create model, define a loss function, and optimizer
bilstm = BiLSTM().to(device)
crf = Crf().to(device)
optimizer1 = torch.optim.AdamW(bilstm.parameters(),
                               lr=learning_rate * 200, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer1,
                                                                 T_0=4,
                                                                 T_mult=2,
                                                                 eta_min=learning_rate / 2)
optimizer2 = torch.optim.AdamW(crf.parameters(),
                               lr=learning_rate * 200, weight_decay=1e-8)

best_acc = 0.0
early_stop_count = 0
for epoch in range(num_epoch):
    # reset the running metrics
    train_acc = 0.0   # training accuracy accumulator
    train_loss = 0.0  # training loss accumulator
    val_acc = 0.0     # validation accuracy accumulator
    val_loss = 0.0    # validation loss accumulator (not actually used below)
    train_item = 0    # number of training samples processed (for the average loss)

    # ================== training ==================
    bilstm.train()  # training mode (enables Dropout)
    crf.train()     # training mode (a no-op for the CRF, kept for consistency)
    pbar = tqdm(train_loader, ncols=110)
    pbar.set_description(f'T: {epoch+1}/{num_epoch}')

    for i, batch in enumerate(pbar):
        features, labels = batch
        features = features.to(device)
        labels = labels.to(device)

        # zero the gradients of both models
        optimizer1.zero_grad()
        optimizer2.zero_grad()

        # forward pass and loss
        emissions = bilstm(features)               # BiLSTM emission scores
        loss = -crf.likelihood(emissions, labels)  # negative log-likelihood from the CRF

        # backward pass with gradient clipping
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(bilstm.parameters(), max_norm=50)  # guard against exploding gradients

        # parameter updates
        optimizer1.step()  # update the BiLSTM
        optimizer2.step()  # update the CRF

        # bookkeeping
        train_loss += loss.item()
        train_item += labels.size(0)
        lr1 = optimizer1.param_groups[0]["lr"]  # current BiLSTM learning rate
        lr2 = optimizer2.param_groups[0]["lr"]  # current CRF learning rate
        pbar.set_postfix({'lr1': lr1, 'lr2': lr2, 'loss': train_loss / train_item})

    scheduler.step()  # advance the BiLSTM learning-rate schedule
    pbar.close()

    # ================== validation ==================
    if len(val_set) > 0:
        bilstm.eval()  # evaluation mode (disables Dropout)
        crf.eval()
        with torch.no_grad():
            pbar = tqdm(val_loader, ncols=110)
            pbar.set_description(f'V: {epoch+1}/{num_epoch}')
            samples = 0
            for i, batch in enumerate(pbar):
                features, labels = batch
                features, labels = features.to(device), labels.to(device)
                outputs = crf(bilstm(features))  # decoded label sequences (on CPU)
                val_acc += (outputs[:, mid] == labels[:, mid].cpu()).sum().item()  # accuracy at the center frame
                samples += labels.size(0)
                pbar.set_postfix({'val acc': val_acc / samples})
            pbar.close()

        # checkpointing and early stopping
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(bilstm.state_dict(), model1_path)  # save the BiLSTM weights
            torch.save(crf.state_dict(), model2_path)     # save the CRF weights
            print('saving model with acc {:.3f}'.format(best_acc / len(val_set)))
            early_stop_count = 0  # reset the early-stopping counter
        else:
            early_stop_count += 1
            if early_stop_count >= early_stopping:
                print(f"Epoch: {epoch + 1}, early stopping.")
                break
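
# Prepare the test set before inference (mirrors the train/val preparation above;
# paths assumed to match those used earlier):
test_X = preprocess_data(split='test', feat_dir='./libriphone/feat', phone_path='./libriphone', concat_nframes=concat_nframes)
test_set = LibriDataset(test_X, None)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)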


# inference on the test set
pred = np.array([], dtype=np.int32)

bilstm.eval()
crf.eval()
with torch.no_grad():
    for features in tqdm(test_loader):
        features = features.to(device)
        outputs = crf(bilstm(features))  # decoded label sequences
        pred = np.concatenate((pred, outputs.detach().cpu()[:, mid]), axis=0)  # keep the center-frame prediction
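
Finally, the predictions can be written out in the homework's submission format (a sketch, assuming the usual Id,Class CSV layout):

with open('prediction.csv', 'w') as f:
    f.write('Id,Class\n')
    for i, y in enumerate(pred):
        f.write('{},{}\n'.format(i, y))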