Comparing a Dense layer applied with and without the TimeDistributed wrapper when the LSTM uses return_sequences=True in Keras. The results are identical.

from __future__ import print_function

import os
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K
from keras.initializers import Constant
from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense
from keras.models import Model, load_model, Sequential

import encoding

Using TensorFlow backend.

# Input feature dimension (size of the feature vector at each timestep).
n_chars = 4
# LSTM hidden state dimension, which is also the LSTM output size per timestep.
n_hidden = 5
# Output dimension of the final dense layer.
n_categories = 2

# Build one model with two output heads on top of the same LSTM so that a
# plain Dense layer and a TimeDistributed(Dense) layer can be compared on
# identical inputs.

# Variable-length sequences with n_chars features per timestep.
inputs = Input(shape=(None, n_chars))
# return_sequences=True makes the LSTM return its output for every timestep
# rather than only the last one.
lstm = LSTM(n_hidden, return_sequences=True)(inputs)

# Give both dense heads the same constant kernel value so their outputs can be
# compared directly. The initializer comes from the same `keras` package as
# the layers instead of `tf.keras`, to avoid mixing the two namespaces.
initializer = Constant(0.1)

# Head 1: feed the LSTM output of all timesteps straight into a Dense layer.
dense1 = Dense(n_categories, kernel_initializer=initializer)(lstm)

# Head 2: feed the same LSTM output into a Dense layer wrapped in
# TimeDistributed.
dense2 = TimeDistributed(Dense(n_categories, kernel_initializer=initializer))(lstm)

model = Model(inputs=inputs, outputs=[dense1, dense2])

# Optionally, you can initialize the LSTM hidden and cell states to 0 by uncommenting the lines below.

#hidden_states = K.variable(value=np.zeros([1, n_hidden]))
#cell_states = K.variable(value=np.zeros([1, n_hidden]))
#model.layers[1].states[0] = hidden_states
#model.layers[1].states[1] = cell_states 

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() emits an extra "None" line after the table.
model.summary()

# Dummy batch of all ones: 1 sequence, 3 timesteps, n_chars features per step.
# The feature axis uses n_chars (instead of a hard-coded 4) so the input always
# matches the shape declared on the model's Input layer.
X = K.constant(np.ones([1, 3, n_chars]))

# The model has two outputs, one per dense head (dense1, dense2).
o1, o2 = model(X)

# Evaluate the backend tensors into concrete arrays so they can be printed.
O1 = K.eval(o1)
O2 = K.eval(o2)

# Both dense1 and dense2 outputs should be the same.

print('dense1 output\n', O1.shape, '\n', O1)

print('------------')

print('dense2 output\n', O2.shape, '\n', O2)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            (None, None, 4)      0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, None, 5)      200         input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, None, 2)      12          lstm_1[0][0]                     
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 2)      12          lstm_1[0][0]                     
==================================================================================================
Total params: 224
Trainable params: 224
Non-trainable params: 0
__________________________________________________________________________________________________
None
dense1 output
 (1, 3, 2) 
 [[[0.03041531 0.03041531]
  [0.05476438 0.05476438]
  [0.07346614 0.07346614]]]
------------
dense2 output
 (1, 3, 2) 
 [[[0.03041531 0.03041531]
  [0.05476438 0.05476438]
  [0.07346614 0.07346614]]]

Written on January 16, 2021