```python
import logging

import numpy as np
import torch
import torch.nn.functional as F


class VGG2L(torch.nn.Module):
    """VGG-like module

    :param int in_channel: number of input channels
    """

    def __init__(self, in_channel=1):
        super(VGG2L, self).__init__()
        # CNN layer (VGG motivated)
        self.conv1_1 = torch.nn.Conv2d(in_channel, 64, 3, stride=1, padding=1)
        self.conv1_2 = torch.nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.conv2_1 = torch.nn.Conv2d(64, 128, 3, stride=1, padding=1)
        self.conv2_2 = torch.nn.Conv2d(128, 128, 3, stride=1, padding=1)

        self.in_channel = in_channel

    def forward(self, xs_pad, ilens, **kwargs):
        """VGG2L forward

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, D)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :return: batch of padded hidden state sequences (B, Tmax // 4, 128 * D // 4)
        :rtype: torch.Tensor
        """
        logging.debug(self.__class__.__name__ + " input lengths: " + str(ilens))

        # x: utt x frame x dim
        # xs_pad = F.pad_sequence(xs_pad)

        # x: utt x 1 (input channel num) x frame x dim
        xs_pad = xs_pad.view(
            xs_pad.size(0),  # 15
            xs_pad.size(1),  # 882
            self.in_channel,  # 1
            xs_pad.size(2) // self.in_channel,  # 83
        ).transpose(1, 2)  # swap dims 1 and 2 -> (15, 1, 882, 83)

        # NOTE: max_pool1d ?
        xs_pad = F.relu(self.conv1_1(xs_pad))  # (15, 64, 882, 83)
        xs_pad = F.relu(self.conv1_2(xs_pad))  # (15, 64, 882, 83)
        xs_pad = F.max_pool2d(xs_pad, 2, stride=2, ceil_mode=True)  # (15, 64, 441, 42)

        xs_pad = F.relu(self.conv2_1(xs_pad))  # (15, 128, 441, 42)
        xs_pad = F.relu(self.conv2_2(xs_pad))  # (15, 128, 441, 42)
        xs_pad = F.max_pool2d(xs_pad, 2, stride=2, ceil_mode=True)  # (15, 128, 221, 21)

        if torch.is_tensor(ilens):
            ilens = ilens.cpu().numpy()
        else:
            ilens = np.array(ilens, dtype=np.float32)
        # each max-pool halves the time axis, rounding up (matching ceil_mode=True)
        ilens = np.array(np.ceil(ilens / 2), dtype=np.int64)
        ilens = np.array(
            np.ceil(np.array(ilens, dtype=np.float32) / 2), dtype=np.int64
        ).tolist()

        # x: utt_list of frame (remove zeropaded frames) x (input channel num x dim)
        xs_pad = xs_pad.transpose(1, 2)  # swap dims 1 and 2 back -> (15, 221, 128, 21)
        xs_pad = xs_pad.contiguous().view(
            xs_pad.size(0), xs_pad.size(1), xs_pad.size(2) * xs_pad.size(3)
        )  # (15, 221, 2688); compared with the original input (15, 882, 83),
        #    the number of frames shrinks by ~4x while the feature dimension
        #    grows many times over (channels x dim are flattened together)
        return xs_pad, ilens, None  # no state in this layer
```
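A minimal smoke test, sketching how this encoder might be exercised; the batch size 15, 882 input frames, and 83-dimensional features are assumptions taken from the shape comments above, not values fixed by the module itself:

```python
# Hypothetical usage sketch for VGG2L; shapes mirror the inline comments above.
import torch

encoder = VGG2L(in_channel=1)
xs_pad = torch.randn(15, 882, 83)   # (B, Tmax, D) padded filterbank-like features
ilens = torch.tensor([882] * 15)    # true (pre-padding) lengths per utterance

ys_pad, olens, _ = encoder(xs_pad, ilens)
print(ys_pad.shape)  # torch.Size([15, 221, 2688]): 882 -> 441 -> 221 frames, 128 * 21 dims
print(olens[:3])     # [221, 221, 221]: two rounds of ceil(len / 2)
```

Note that `olens` comes back as a plain Python list (because of the final `.tolist()`), so downstream code that expects a tensor must convert it.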