Building a multilayer GRU from single GRU cells with PyTorch.
First we use nn.GRU with 3 layers to process a batch of sequences; then we reproduce the same computation with stacked nn.GRUCell modules.
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
import numpy as np
import torch
import torch.nn as nn
import random
import torch.optim as optim
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import time
import math
import torch.nn.functional as F
torch.cuda.is_available()
True
pad_char = '#'
all_chars = 'ABCDE'+pad_char
n_chars = len(all_chars)
# Find char index from all_chars, e.g. "A" = 0
def charToIndex(char):
return all_chars.find(char)
# Just for demonstration, turn a char into a <1 x n_chars> Tensor
def charToTensor_one_hot(char):
tensor = torch.zeros(1, n_chars)
tensor[0][charToIndex(char)] = 1
return tensor
def charToTensor(char):
tensor = torch.zeros(1,dtype=torch.long)
tensor[0] = charToIndex(char)
return tensor
# Turn a line into a <line_length x 1 x n_chars>,
# or an array of one-hot char vectors
def seqToTensor_one_hot(seq):
tensor = torch.zeros(len(seq),1, n_chars)
for idx, char in enumerate(seq):
tensor[idx][0][charToIndex(char)] = 1
return tensor
def seqToTensor(seq):
tensor = torch.zeros(len(seq), dtype=torch.long)
for idx, char in enumerate(seq):
tensor[idx] = int(charToIndex(char))
return tensor
for ch in all_chars:
print(ch,':',charToTensor_one_hot(ch))
A : tensor([[1., 0., 0., 0., 0., 0.]])
B : tensor([[0., 1., 0., 0., 0., 0.]])
C : tensor([[0., 0., 1., 0., 0., 0.]])
D : tensor([[0., 0., 0., 1., 0., 0.]])
E : tensor([[0., 0., 0., 0., 1., 0.]])
# : tensor([[0., 0., 0., 0., 0., 1.]])
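The index-based encoder can be checked the same way (a small demonstration added for reference; it is not used below):
seq = 'AABC'
print(seqToTensor(seq))                #expected: tensor([0, 0, 1, 2])
print(seqToTensor_one_hot(seq).shape)  #expected: torch.Size([4, 1, 6])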
Prepare some tensor data for input: two character sequences of different lengths
sequences = ['AABC','AAAACC']
batch_size = len(sequences)
max_seqlen = 10
seq_tensors = []
for seq in sequences:
seq_tensor = seqToTensor_one_hot(seq)
seq_tensors.append(torch.squeeze(seq_tensor))
pad_char_tensor = charToTensor_one_hot(pad_char) #tensor corresponding to pad_char
batch_tensor = pad_char_tensor.repeat(batch_size, max_seqlen,1)
#print('batch_tensor', batch_tensor.shape)
for i,t in enumerate(seq_tensors):
num_chars = t.shape[0]
batch_tensor[i,-num_chars:,:] = t #Left padding is done with pad_char
print('input tensor shape=',batch_tensor.shape)
print('input tensor=\n',batch_tensor)
input tensor shape= torch.Size([2, 10, 6])
input tensor=
tensor([[[0., 0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0., 1.],
[1., 0., 0., 0., 0., 0.],
[1., 0., 0., 0., 0., 0.],
[0., 1., 0., 0., 0., 0.],
[0., 0., 1., 0., 0., 0.]],
[[0., 0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0., 1.],
[1., 0., 0., 0., 0., 0.],
[1., 0., 0., 0., 0., 0.],
[1., 0., 0., 0., 0., 0.],
[1., 0., 0., 0., 0., 0.],
[0., 0., 1., 0., 0., 0.],
[0., 0., 1., 0., 0., 0.]]])
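The same left-padded batch can also be built from index tensors with F.one_hot (a sketch assuming PyTorch >= 1.1, which provides torch.nn.functional.one_hot; it is only an alternative to the loop above):
pad_idx = charToIndex(pad_char)
idx_batch = torch.full((batch_size, max_seqlen), pad_idx, dtype=torch.long)
for i, seq in enumerate(sequences):
    idx_batch[i, -len(seq):] = seqToTensor(seq)  #left padding, as above
batch_tensor_alt = F.one_hot(idx_batch, num_classes=n_chars).float()
print(torch.equal(batch_tensor_alt, batch_tensor))  #expected: True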
Experiment 1: Simple GRU Layer (3 layers)
Takes an entire sequence as input.
class SimpleGRU(nn.Module):
def __init__(self, input_size, num_layers=1, bidirectional=False, hidden_dim=10, printVars=False):
super().__init__()
self.hidden_dim = hidden_dim
self.gru = nn.GRU(input_size, hidden_dim, num_layers, bidirectional=bidirectional, batch_first=True)
self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        #dim=2: softmax over the last dimension; defined here but not used in forward()
        self.softmax = nn.Softmax(dim=2)
self.hidden = None
self.printVars = printVars #run the print statements in forward ?
        #initialize biases and weights to a fixed value so the two experiments can be compared
        for name, param in self.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.3)
            elif 'weight' in name:
                nn.init.constant_(param, 0.3) #nn.init.xavier_normal_(param)
def init_hidden(self, batch_size):
hidden_dim = self.hidden_dim
weight = next(self.parameters()).data
h_0 = weight.new(self.num_directions*self.num_layers, batch_size, hidden_dim).zero_()
return h_0
def forward(self, batch_of_words):
batch_size = batch_of_words.shape[0]
#This is stateless GRU, so hidden states are initialized for every forward pass.
#The hidden states are not preserved across batches.
self.hidden = self.init_hidden(batch_size)
h_0 = self.hidden #initial hidden state, shape (num_direction*num_layers , batch_size, hidden_dim)
x = batch_of_words
if self.printVars:
print('forward: h_0.shape',h_0.shape)
print('forward: input to gru, x =',x.shape)
output, self.hidden = self.gru(x, self.hidden)
#output: output features h_t from the last layer of the GRU for each timestep=t
#self.hidden : tensor containing the hidden state for the last timestep t = seq_len
if self.printVars:
print('\ngru_output=',output.shape,'\n',output) #output from final layer for all timesteps.
print('\nh_out=',self.hidden.shape,'\n',self.hidden) #hidden state from last timestep for all layers
return output, self.hidden
n_hidden = 4
n_layers = 3
print('batch_size=',batch_tensor.shape[0])
print('input_size =',n_chars)
print('n_hidden =',n_hidden)
print('n_layers = ',n_layers)
gru_rnn = SimpleGRU(n_chars, num_layers=n_layers, bidirectional=False,hidden_dim=n_hidden,printVars=False)
print('input to gru = ',batch_tensor.shape)
emissions,h_n = gru_rnn(batch_tensor)
print('\n all_step_gru_output=',emissions.shape,'\n',emissions)
print('\n final_step_hidden=',h_n.shape,'\n',h_n)
batch_size= 2
input_size = 6
n_hidden = 4
n_layers = 3
input to gru = torch.Size([2, 10, 6])
all_step_gru_output= torch.Size([2, 10, 4])
tensor([[[0.1908, 0.1908, 0.1908, 0.1908],
[0.3296, 0.3296, 0.3296, 0.3296],
[0.4295, 0.4295, 0.4295, 0.4295],
[0.5042, 0.5042, 0.5042, 0.5042],
[0.5623, 0.5623, 0.5623, 0.5623],
[0.6090, 0.6090, 0.6090, 0.6090],
[0.6475, 0.6475, 0.6475, 0.6475],
[0.6800, 0.6800, 0.6800, 0.6800],
[0.7077, 0.7077, 0.7077, 0.7077],
[0.7317, 0.7317, 0.7317, 0.7317]],
[[0.1908, 0.1908, 0.1908, 0.1908],
[0.3296, 0.3296, 0.3296, 0.3296],
[0.4295, 0.4295, 0.4295, 0.4295],
[0.5042, 0.5042, 0.5042, 0.5042],
[0.5623, 0.5623, 0.5623, 0.5623],
[0.6090, 0.6090, 0.6090, 0.6090],
[0.6475, 0.6475, 0.6475, 0.6475],
[0.6800, 0.6800, 0.6800, 0.6800],
[0.7077, 0.7077, 0.7077, 0.7077],
[0.7317, 0.7317, 0.7317, 0.7317]]], grad_fn=<TransposeBackward0>)
final_step_hidden= torch.Size([3, 2, 4])
tensor([[[0.7692, 0.7692, 0.7692, 0.7692],
[0.7692, 0.7692, 0.7692, 0.7692]],
[[0.7272, 0.7272, 0.7272, 0.7272],
[0.7272, 0.7272, 0.7272, 0.7272]],
[[0.7317, 0.7317, 0.7317, 0.7317],
[0.7317, 0.7317, 0.7317, 0.7317]]], grad_fn=<StackBackward>)
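A quick sanity check (a sketch added here, not part of the original run): for a unidirectional GRU, the last layer's entry in h_n should equal the final timestep of the layer-wise output.
print(torch.allclose(h_n[-1], emissions[:, -1, :]))  #expected: True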
Experiment 2: The same GRU built from individual GRU cells (3 layers)
class Stacked_GRU_Cells(nn.Module):
""" Implements a three layer GRU cell with an output linear layer back to the size of the output categories"""
def __init__(self, input_size, hidden_dim=10):
super().__init__()
self.gru_0 = nn.GRUCell(input_size, hidden_dim)
self.gru_1 = nn.GRUCell(hidden_dim, hidden_dim)
self.gru_2 = nn.GRUCell(hidden_dim, hidden_dim)
        #initialize biases and weights to the same fixed value as in Experiment 1
        for name, param in self.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.3)
            elif 'weight' in name:
                nn.init.constant_(param, 0.3) #nn.init.xavier_normal_(param)
    def forward(self, x, h_in):
        #h_in and h_out have shape (num_layers, batch_size, hidden_dim)
        h_out = torch.zeros_like(h_in)
        h_out[0] = self.gru_0(x, h_in[0])        #layer 0 consumes the input characters
        h_out[1] = self.gru_1(h_out[0], h_in[1]) #each higher layer consumes the layer below
        h_out[2] = self.gru_2(h_out[1], h_in[2])
        x = h_out[2]                             #output of the top layer
        return x, h_out
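The three cells above are hard-wired; a variant using nn.ModuleList generalizes the same stacking to an arbitrary number of layers (a sketch under the same one-hidden-size-per-layer assumption; it is not used in the experiments below):
class StackedGRUCellsN(nn.Module):
    """Sketch: the same layer stacking for an arbitrary number of GRU cells."""
    def __init__(self, input_size, hidden_dim=10, num_layers=3):
        super().__init__()
        in_sizes = [input_size] + [hidden_dim] * (num_layers - 1)
        self.cells = nn.ModuleList([nn.GRUCell(s, hidden_dim) for s in in_sizes])
    def forward(self, x, h_in):
        #h_in: (num_layers, batch_size, hidden_dim); each layer feeds the one above
        h_out = []
        for cell, h in zip(self.cells, h_in):
            x = cell(x, h)
            h_out.append(x)
        return x, torch.stack(h_out)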
def forward_RNN_pass(gru_rnn, batch_tensor,hidden_dim):
batch_size = batch_tensor.shape[0]
seq_len = batch_tensor.shape[1]
h_init = torch.zeros(3, batch_size, hidden_dim)
print('Initial hidden state = ',h_init.shape)
h = h_init
#To gather outputs from all timesteps
gru_out = torch.zeros([batch_size,seq_len,hidden_dim])
for position in range(seq_len):
        out, h = gru_rnn(batch_tensor[:, position, :], h)
        gru_out[:,position,:] = out #store the top-layer output for this timestep
all_step_output = gru_out #output from final layer for all timesteps
final_step_hidden = h #hidden state from final timestep for all layers
return all_step_output, final_step_hidden
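For reference, this is what each nn.GRUCell computes internally, written out with PyTorch's gate ordering (reset, update, new); a sketch for illustration only, not used in the forward pass above:
def manual_gru_step(cell, x, h):
    #cell.weight_ih / cell.weight_hh stack the r, z, n blocks: (3*hidden, input) / (3*hidden, hidden)
    gi = x @ cell.weight_ih.t() + cell.bias_ih
    gh = h @ cell.weight_hh.t() + cell.bias_hh
    i_r, i_z, i_new = gi.chunk(3, dim=1)
    h_r, h_z, h_new = gh.chunk(3, dim=1)
    r = torch.sigmoid(i_r + h_r)       #reset gate
    z = torch.sigmoid(i_z + h_z)       #update gate
    n = torch.tanh(i_new + r * h_new)  #candidate hidden state
    return (1 - z) * n + z * h         #new hidden state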
n_hidden = 4
n_layers = 3 #fixed
print('batch_size=',batch_tensor.shape[0])
print('input_size =',n_chars)
print('n_hidden =',n_hidden)
print('n_layers = ',n_layers)
gru_rnn = Stacked_GRU_Cells(n_chars, hidden_dim=n_hidden)
print('input to gru = ',batch_tensor.shape)
all_step_output, final_step_hidden = forward_RNN_pass(gru_rnn, batch_tensor, n_hidden)
print('\n all_step_gru_output=', all_step_output.shape, '\n', all_step_output)
print('\n final_step_hidden=',final_step_hidden.shape,'\n',final_step_hidden)
batch_size= 2
input_size = 6
n_hidden = 4
n_layers = 3
input to gru = torch.Size([2, 10, 6])
Initial hidden state = torch.Size([3, 2, 4])
all_step_gru_output= torch.Size([2, 10, 4])
tensor([[[0.1908, 0.1908, 0.1908, 0.1908],
[0.3296, 0.3296, 0.3296, 0.3296],
[0.4295, 0.4295, 0.4295, 0.4295],
[0.5042, 0.5042, 0.5042, 0.5042],
[0.5623, 0.5623, 0.5623, 0.5623],
[0.6090, 0.6090, 0.6090, 0.6090],
[0.6475, 0.6475, 0.6475, 0.6475],
[0.6800, 0.6800, 0.6800, 0.6800],
[0.7077, 0.7077, 0.7077, 0.7077],
[0.7317, 0.7317, 0.7317, 0.7317]],
[[0.1908, 0.1908, 0.1908, 0.1908],
[0.3296, 0.3296, 0.3296, 0.3296],
[0.4295, 0.4295, 0.4295, 0.4295],
[0.5042, 0.5042, 0.5042, 0.5042],
[0.5623, 0.5623, 0.5623, 0.5623],
[0.6090, 0.6090, 0.6090, 0.6090],
[0.6475, 0.6475, 0.6475, 0.6475],
[0.6800, 0.6800, 0.6800, 0.6800],
[0.7077, 0.7077, 0.7077, 0.7077],
[0.7317, 0.7317, 0.7317, 0.7317]]], grad_fn=<CopySlices>)
final_step_hidden= torch.Size([3, 2, 4])
tensor([[[0.7692, 0.7692, 0.7692, 0.7692],
[0.7692, 0.7692, 0.7692, 0.7692]],
[[0.7272, 0.7272, 0.7272, 0.7272],
[0.7272, 0.7272, 0.7272, 0.7272]],
[[0.7317, 0.7317, 0.7317, 0.7317],
[0.7317, 0.7317, 0.7317, 0.7317]]], grad_fn=<CopySlices>)
Compare these results with the output and hidden state tensors from Experiment 1: they should be the same.
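This can be checked numerically (a sketch assuming the variables from both experiments are still in scope):
print(torch.allclose(emissions, all_step_output, atol=1e-6))  #expected: True
print(torch.allclose(h_n, final_step_hidden, atol=1e-6))      #expected: True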
Written on September 25, 2020