Implementation of End-2-End Memory Network for Language Modeling
Tensorflow implementation of End-To-End Memory Networks for the language modeling task. I tried to name the variables as closely as possible to those in the paper, following the equations, to help understand the paper. Don't forget to change the "input_file" to your input file. Some of the ideas are borrowed from an earlier implementation. The python notebook of the code can be found here.
Model parameters and input configurations
# code for Mem-N-to-N for language modelling.
import numpy as np
import os
import math
import tensorflow as tf
import sys
import random
from collections import Counter
input_file = 'ptb.train.txt' #change this to your file of input.
config = {
'batch_size' : 128, # batch_size
'emb_dim' : 150, # embedding dimension for words
'mem_size' : 100, # memory size
'init_q' : 0.1,
'n_epochs' : 50, # no. of epochs
'n_hops' : 6, # no. of hops in memory
'n_words' : None,
'init_lr' : 0.001, # initial learning rate
'std_dev' : 0.05,
'lin_dim' : 75, # no. of units to have linear activation
'max_grad_norm' : 50 #clip gradients to this norm.
}
# read words and convert it to unique integers (from https://github.com/carpedm20/MemN2N-tensorflow/)
def read_data(fname, count, word2idx):
    """Read whitespace-tokenized text from fname and return it as a list of word ids.

    Mutates `count` (list of [word, frequency] pairs) and `word2idx`
    (word -> integer id) in place so a vocabulary can be accumulated across
    calls. The special token '<eos>' always gets id 0 and is appended after
    every line of input.
    """
    with open(fname) as f:
        lines = f.readlines()

    # Flatten the file into a single token stream for frequency counting.
    words = [w for line in lines for w in line.split()]

    # '<eos>' is kept as the first count entry; its frequency is the line count.
    if not count:
        count.append(['<eos>', 0])
    count[0][1] += len(lines)
    count.extend(Counter(words).most_common())

    # Assign ids in frequency order, '<eos>' first.
    if not word2idx:
        word2idx['<eos>'] = 0
    for word, _ in count:
        if word not in word2idx:
            word2idx[word] = len(word2idx)

    # Encode the text: one id per token, '<eos>' closing each line.
    data = []
    for line in lines:
        data.extend(word2idx[word] for word in line.split())
        data.append(word2idx['<eos>'])

    print("Read %s words from %s" % (len(data), fname))
    return data
# Build the vocabulary from the training file.
count = list()
word2idx = dict()
train_data = read_data(input_file, count, word2idx)

# Unpack the configuration into module-level shorthands used by the model code
# below. n_words is only known once the vocabulary has been built, so fill it
# in now. (The original code assigned config['n_words'] twice; once is enough.)
n_words = config['n_words'] = len(word2idx)
batch_size = config['batch_size']
e_dim = config['emb_dim']
l_dim = config['lin_dim']
mem_size = config['mem_size']
n_epochs = config['n_epochs']
n_hops = config['n_hops']
current_lr = config['init_lr']
std_dev = config['std_dev']
init_q = config['init_q']
max_grad_norm = config['max_grad_norm']
Define the tensorflow model.
The comments should explain the code.
print('Defining the tensorflow model...')
# Define the tensorflow model. The Variable names are made to follow the paper as closely as possible.
# --- Graph inputs (TF1 placeholders), fed each step from the numpy buffers below ---
input_q = tf.placeholder(tf.float32, shape=[None, e_dim],name="q") #the question q, will be set to all 0.1.
input_x = tf.placeholder(tf.int32, [None, mem_size], name="x") # the context word ids
input_time = tf.placeholder(tf.int32, [None, mem_size], name="time") # to lookup temporal encoding; row b is [0..mem_size-1]
input_y = tf.placeholder(tf.float32, [None, n_words], name="target") # one-hot encoding of the next word to predict (target)
# Matrices for input memory representation
A = tf.Variable(tf.random_normal([n_words, e_dim], stddev=std_dev),name="A") #embedding matrix A for input memory representation
T_A = tf.Variable(tf.random_normal([n_words, e_dim], stddev=std_dev),name="T_A") #embedding matrix for temporal encoding
# Input memory vectors : m_i = sum A_ij * x_ij + T_A_i  (word embedding plus a learned per-position temporal term)
x_in_A = tf.nn.embedding_lookup(A, input_x) # embedding lookup, shape: batch_size x mem_size x e_dim
T_A_i = tf.nn.embedding_lookup(T_A, input_time) #T_A(i), shape: batch_size x mem_size x e_dim
mem_in = tf.add(x_in_A, T_A_i) #input memory vectors m_i, shape: batch_size x mem_size x e_dim
# Matrices for output memory representation (same structure as A/T_A, separate weights)
C = tf.Variable(tf.random_normal([n_words, e_dim], stddev=std_dev),name="C") #embedding matrix C for output memory representation
T_C = tf.Variable(tf.random_normal([n_words, e_dim], stddev=std_dev),name="T_C") #embedding matrix for temporal encoding
# Output memory vectors : c_i = sum C_ij * x_ij + T_C_i
x_in_C = tf.nn.embedding_lookup(C, input_x) # embedding lookup, shape: batch_size x mem_size x e_dim
T_C_i = tf.nn.embedding_lookup(T_C, input_time) #T_C(i), shape: batch_size x mem_size x e_dim
mem_out = tf.add(x_in_C, T_C_i) #output memory vectors c_i, shape: batch_size x mem_size x e_dim
# For linear mapping of input u between hops: u_mapped = u Hw + Hb (shared across hops)
Hw = tf.Variable(tf.random_normal([e_dim, e_dim], stddev=std_dev),name="Hw")
Hb = tf.Variable(tf.random_normal([e_dim], stddev=std_dev),name="Hb")
u_k = input_q #initialize u_k for first hop in memory, shape : batch_size x edim
# Memory hops: u_k is the controller state u from the paper, refined once per hop
# by attending over the memory. All hops share mem_in/mem_out and Hw/Hb.
for k in range(n_hops): #k indexes the hops in memory
    print('hop in memory :',k,' input u_k:',u_k)
    u_k_3d = tf.reshape(u_k, [-1, e_dim, 1]) # reshape to shape: batch_size x e_dim x 1
    # Attention scores u^T m_i, shape: batch_size x mem_size x 1
    scores = tf.matmul(mem_in, u_k_3d)
    # p_i = Softmax(u^T m_i) (equation 1)
    # BUG FIX: tf.nn.softmax normalizes over the LAST axis by default. Applied
    # directly to the batch_size x mem_size x 1 scores it would normalize over
    # the trailing size-1 axis, producing all-ones "probabilities" and making
    # the attention useless. Flatten to 2-D so the softmax runs over the
    # mem_size axis, then restore the 3-D shape.
    probs = tf.reshape(tf.nn.softmax(tf.reshape(scores, [-1, mem_size])),
                       [-1, mem_size, 1]) # shape: batch_size x mem_size x 1
    # o = sum p_i c_i (equation 2)
    o_k = tf.matmul(mem_out, probs, transpose_a=True) # shape: batch_size x e_dim x 1
    o_k_2d = tf.reshape(o_k, [-1, e_dim]) # shape: batch_size x e_dim
    #apply a linear mapping H to u : u_mapped = Hw u + Hb
    u_k_mapped = tf.add(tf.matmul(u_k,Hw),Hb)
    # u_(k+1) = H u_k + o_k (equation 4, with the linear mapping between hops)
    u_k_next_hop = tf.add(u_k_mapped,o_k_2d)
    #apply ReLU to a slice of the units, rest of the unit activations are linear
    #(the first l_dim units stay linear, as in section 5 of the paper).
    u_k_next_hop_linear = tf.slice(u_k_next_hop, [0,0], [-1,l_dim]) #slice of u_k_next_hop to have linear activations
    u_k_next_hop_relu = tf.slice(u_k_next_hop, [0,l_dim], [-1,e_dim - l_dim]) # remaining slice to have ReLU activations
    u_k_next_hop_relu = tf.nn.relu(u_k_next_hop_relu)
    u_k_next_hop = tf.concat(axis=1, values=[u_k_next_hop_linear,u_k_next_hop_relu])
    u_k = u_k_next_hop #update u_k for the next hop in memory
    print('-------------')
# Final projection to vocabulary logits: a_hat = W u (equation 3).
W = tf.Variable(tf.random_normal([n_words, e_dim], stddev=std_dev),name="W") # final weight matrix W as in the paper.
a_hat = tf.matmul(u_k, W, transpose_b=True) # shape : batch_size x n_words (equation 3), the output logits.
print('Model specification complete...')
Define the ops for model optimization.
print('Defining the ops for model optimization ...')
# Loss and optimization ops for the model defined above.
#change the softmax_cross_entropy_with_logits_v2 to softmax_cross_entropy_with_logits for older versions of tensorflow.
model_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=a_hat, labels=input_y)

lr = tf.Variable(current_lr)
opt = tf.train.GradientDescentOptimizer(lr)  # plain SGD, as in the paper
params = [A, T_A, C, T_C, Hw, Hb, W]  # the Variables to optimize

# Compute per-variable gradients, then clip them by the l2 norm of the WHOLE
# gradient (global norm) rather than clipping each variable separately.
grads_and_vars = opt.compute_gradients(model_loss, params)
gradients, variables = zip(*grads_and_vars)
clipped_gradients, _ = tf.clip_by_global_norm(list(gradients), max_grad_norm)
optim = opt.apply_gradients(list(zip(clipped_gradients, variables)))
Define the data structures to provide data input to the model. Create and run session for training.
# Define the numpy buffers that feed the placeholders each training step.
# FIX: the original constructed these with np.ndarray(...), which the numpy
# docs discourage (use np.empty/np.zeros), and left x as float64 even though
# it holds integer word ids fed to an int32 placeholder.
q = np.empty([batch_size, e_dim], dtype=np.float32)       # question q, constant init_q (0.1)
x = np.empty([batch_size, mem_size], dtype=np.int32)      # context word ids, filled per batch
time = np.empty([batch_size, mem_size], dtype=np.int32)   # temporal-encoding indices
target = np.zeros([batch_size, n_words])                  # one-hot-encoded next word
q.fill(init_q)  # fill with all 0.1
# Every row of time is [0, 1, ..., mem_size-1] (broadcast assignment).
time[:] = np.arange(mem_size, dtype=np.int32)
def train_one_epoch(epoch_no, sess, data):
    """Run one epoch of training over `data` using randomly sampled windows.

    Reuses the module-level buffers q/x/time/target and the graph ops
    optim/model_loss. Prints a running average loss per batch and the
    epoch-level loss and perplexity.
    """
    n_batches = int(math.ceil(len(data) / batch_size))  # loops per epoch
    total_loss = 0
    for batch_no in range(1, n_batches + 1):
        target.fill(0)
        for row in range(batch_size):
            # Pick a random target position that has a full mem_size context.
            t_idx = random.randrange(mem_size, len(data))
            target[row][data[t_idx]] = 1  # one-hot: the word to predict
            x[row] = data[t_idx - mem_size : t_idx]  # the mem_size preceding words
        _, batch_loss = sess.run(
            [optim, model_loss],
            feed_dict={input_q: q, input_x: x, input_time: time, input_y: target})
        total_loss += np.sum(batch_loss)
        cost = total_loss / (batch_no * batch_size)
        print('epoch=', epoch_no, ' batch=', batch_no, ' avg_loss=', cost)
    cost = total_loss / (n_batches * batch_size)
    print('epoch=', epoch_no, ' avg_loss=', cost, "epoch perplexity=", np.exp(cost))
# Create a session, initialize all variables, and run the training epochs.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_no in range(n_epochs):
        print('Running epoch = ', epoch_no)
        train_one_epoch(epoch_no, sess, train_data)
Happy training :). Please drop a line if you found the code easy to understand.
Written on October 10, 2018