Read a CSV file with a variable number of fields using TensorFlow
import tensorflow as tf
import numpy as np
# Read a comma-separated text file whose rows have differing numbers of
# fields, and pad every row to the width of the longest one with 'NA'.
filename = 'in.txt'

# Keep each line as one raw string (surrounding whitespace/newline removed);
# the splitting into fields is done by TensorFlow below.
with open(filename, 'r') as fin:
    data_read = [line.strip() for line in fin]

# One string element per input line.
data_tensor = tf.convert_to_tensor(data_read, dtype=tf.string)

# Splitting on ',' gives a sparse tensor, since rows may have different
# numbers of fields.
sparse_tensor = tf.string_split(data_tensor, ',')

# Densify the split result; positions with no field are filled with 'NA'.
dense_tensor = tf.sparse_to_dense(
    sparse_tensor.indices,
    sparse_tensor.dense_shape,
    sparse_tensor.values,
    default_value='NA',
)

with tf.Session() as sess:
    # No variable initializer is run here: the graph above only contains a
    # constant and ops derived from it, so there are no variables to
    # initialize (the former tf.initialize_all_variables() call was a
    # deprecated no-op).
    X, Y = sess.run([data_tensor, dense_tensor])
    print('Input data with shape {0} = \n'.format(X.shape), X, '\n')
    print('Padded data with shape {0} = \n'.format(Y.shape), Y)
Output
The output array is nicely padded with "NA" for the missing values.
Input data with shape (5,) =
['record_1,1,2,3,4' 'record_2,10,20,30' 'record_3,5,6,7,8,9,10,11,12'
'record_4,41,42' 'record_5,100,200,300,400,500,600,700,800,900,1000']
Padded data with shape (5, 11) =
[['record_1' '1' '2' '3' '4' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['record_2' '10' '20' '30' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['record_3' '5' '6' '7' '8' '9' '10' '11' '12' 'NA' 'NA']
['record_4' '41' '42' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['record_5' '100' '200' '300' '400' '500' '600' '700' '800'
'900' '1000']]
Written on October 11, 2017