Simple data feeding to the DeepChem framework in TensorFlow style

DeepChem provides a wonderful framework and library for developing deep learning and machine learning predictive models for small molecules. However, its understandably complex Pythonic architecture and equally inexplicable lack of documentation (beyond the raw Python function descriptions and a handful of tutorials) make it hard to get beneath the surface and engineer it to fit your own needs, particularly if you are not a physics, chemistry, deep learning, and programming major rolled into one. Here I will chronicle my efforts to run training with DeepChem while feeding the data via feed_dict (a term TensorFlow users will recognize) straight to the TensorFlow graph, instead of using DeepChem's standard fit or fit_generator functions (which tend to turn DeepChem into a black box that is difficult to understand).

The data is just small-molecule SMILES strings and a binary property, which we will learn to predict from the SMILES alone. The data in a CSV file (Tr.csv) looks like:

Smiles,Activity
S(C(C)C)c1c(/C=N/OC(=O)C)n2c(SC=C2)n1,1
O=C(OC)CCC(=O)N(C)c1nnc(-c2cnccc2)cc1,0
O=C(Nc1n(-c2ccc(C)cc2)nc2c1C[SH0](=O)C2)CCC1CCCCC1,1
Cl/C(/Cl)=C\C1C(C)(C)C1C(=O)Nc1nc(Cl)nc(N/N=C/c2ccc(OC(F)(F)F)cc2)c1,0
FC(F)(F)Oc1ccc(-c2nc(-c3cc(C#N)c(-n4ncnc4)cc3)cnc2)cc1,0
c1(-c2ccccc2)[nH]nc2-c3c(cccc3)CCc12,0
O(c1ncc(C#N)cc1)c1cc(N(CC)CC)ccc1,1
S(C)c1ccc(-c2nc(NC(=O)c3ccccc3)sn2)cc1,0
Clc1c(-c2ccccc2)cc(C/C(=N/O)/C(=O)NCCSSCCNC(=O)/C(=N\O)/Cc2cc(c(Cl)cc2)-c2ccccc2)cc1,1
Clc1c2OC(O)c3c(nc(C)c(C#N)c3)-c2cc(Cl)c1,0
O=[N+]([O-])c1cc2ncn(C(=O)NCCC)c2cc1,0
Clc1c(O)c(/C=N/NC(=O)c2ccc(C(C)(C)C)cc2)cc(Cl)c1,0
FC(F)(F)c1ccc(N2C(=O)OC=N2)cc1,0
Fc1ccc(N2CCN(Cc3nc4n(C(=O)N(CC5CC5)CC4)c3)CC2)cc1,0
Clc1ccc(-c2nc(-c3occc3)on2)cc1,1

The code starts with standard imports:

import deepchem as dc
from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.metrics import to_one_hot
from deepchem.models.tensorgraph.layers import Feature
from deepchem.models.tensorgraph.layers import Dense, GraphConv, BatchNorm
from deepchem.models.tensorgraph.layers import GraphPool, GraphGather
from deepchem.models.tensorgraph.layers import SoftMax, SoftMaxCrossEntropy, WeightedError, Stack
from deepchem.models.tensorgraph.layers import Label, Weights
import tensorflow as tf
import os

Next, define the function that reads the data from Tr.csv:

batch_size = 7
pad_batches = True

tg = TensorGraph(use_queue=False)  # disable the input queue so we can feed batches ourselves
training_task = ['Activity']

# simple circular fingerprint featurizer; we ask for just 75 features per molecule
def read_data(fname):
  dataset_file = fname
  featurizer = dc.feat.CircularFingerprint(size=75)
  loader = dc.data.CSVLoader(tasks=training_task, smiles_field="Smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)
  transformer = dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  transformed_dataset = transformer.transform(dataset)
  return transformed_dataset
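
To get a feel for what the featurizer produces, here is a minimal sketch that featurizes a single molecule directly (this assumes RDKit is installed; the SMILES string is just an arbitrary example, not from Tr.csv):

from rdkit import Chem

mol = Chem.MolFromSmiles('c1ccccc1O')  # phenol, an arbitrary test molecule
feats = dc.feat.CircularFingerprint(size=75).featurize([mol])
print(feats.shape)  # (1, 75): one fingerprint vector of length 75 per molecule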

And the model that will be trained:

atom_features = Feature(shape=(None, 75))
classification = Dense(out_channels=2, activation_fn=tf.nn.relu, in_layers=[atom_features], name="Dense")
label = Label(shape=(None, 2))
cost = SoftMaxCrossEntropy(in_layers=[label, classification], name="SoftMaxCrossEntropy")

weights = Weights(shape=(None, len(training_task)), name='Weights')
loss = WeightedError(in_layers=[cost, weights], name='WE')
tg.set_loss(loss)
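
To make the loss concrete, here is a small numpy sketch of what SoftMaxCrossEntropy followed by WeightedError computes for one batch. The sum reduction at the end is my reading of the WeightedError layer's source, so treat it as an assumption:

import numpy as np

logits = np.array([[2.0, 0.5], [0.1, 1.2]])  # Dense output for a toy batch of 2
labels = np.array([[1.0, 0.0], [0.0, 1.0]])  # one-hot labels
w = np.array([[1.0], [0.5]])                 # balancing weights, one per task

log_softmax = logits - np.log(np.sum(np.exp(logits), axis=1, keepdims=True))
ce = -np.sum(labels * log_softmax, axis=1)   # per-sample cross entropy
print(np.sum(ce * w[:, 0]))                  # weighted sum over the batch, about 0.35 here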

Next, create a data generator that yields batches of data to be fed into our model:

def data_generator(dataset, epochs=1, pad_batches=True):
  for epoch in range(epochs):
    print('Starting epoch %i' % epoch)
    data_iterator_batch = dataset.iterbatches(batch_size, pad_batches=pad_batches, deterministic=True)
    for ind, (X_b, y_b, w_b, ids_b) in enumerate(data_iterator_batch):
      d = {}  # maps each input layer (Feature/Label/Weights) to its batch of values
      d[atom_features] = X_b
      d[weights] = w_b
      d[label] = to_one_hot(y_b[:, 0])  # convert the 0/1 labels to one-hot vectors
      yield d
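
It is easy to sanity-check the generator by pulling a single batch and inspecting the shapes. A quick sketch; with batch_size = 7 and a single task, these are the shapes I would expect:

d0 = next(data_generator(read_data("Tr.csv")))
print(d0[atom_features].shape)  # (7, 75): one 75-bit fingerprint per molecule
print(d0[label].shape)          # (7, 2): one-hot labels
print(d0[weights].shape)        # (7, 1): one balancing weight per task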

Finally, the feed_dict part, which feeds the batches produced by data_generator into the TensorFlow graph:

train_dataset = read_data("Tr.csv")
N = train_dataset.y.shape[0]

with tg._get_tf("Graph").as_default():
  generator = data_generator(train_dataset, epochs=10) # the generator we defined above
  tg.build() # allocate the tensors for the various layers in our model (classification, label, cost, etc.)
  train_op = tg._get_tf('train_op') # this op performs one optimization step
  with tg.session as sess:
    avg_loss, n_averaged_batches = 0.0, 0.0
    k = 0.0 # number of examples seen so far in the current epoch
    for d in generator:
      feed_dict = dict(d)
      fetched_values = sess.run([train_op,loss], feed_dict=feed_dict)
      avg_loss += fetched_values[1]
      n_averaged_batches += 1
      k += batch_size
      if k >= N:
         avg_loss /= n_averaged_batches
         print('Epoch avg loss = ',avg_loss)
         avg_loss, n_averaged_batches = 0.0, 0.0
         k = 0.0

Running this gives:

Starting epoch 0
Epoch avg loss =  32.24176534016927
Starting epoch 1
Epoch avg loss =  31.82079251607259
Starting epoch 2
Epoch avg loss =  31.52631123860677
Starting epoch 3
Epoch avg loss =  31.251598358154297
Starting epoch 4
Epoch avg loss =  30.985911051432293
Starting epoch 5
Epoch avg loss =  30.726199467976887
Starting epoch 6
Epoch avg loss =  30.471251487731934
Starting epoch 7
Epoch avg loss =  30.220494588216145
Starting epoch 8
Epoch avg loss =  29.97362518310547
Starting epoch 9
Epoch avg loss =  29.737802505493164
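
Although we never called tg.add_output() for this simple model, any layer's tensor can be evaluated once the graph is built. A hedged sketch, relying on layers exposing an out_tensor attribute after tg.build() (as they do in this TensorGraph version); it is meant to sit inside the same "with tg.session as sess:" block, after the training loop:

for d in data_generator(train_dataset, epochs=1):
  logits = sess.run(classification.out_tensor, feed_dict=dict(d))
  print(logits[:3])  # raw class scores for the first three molecules of the batch
  break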

Let's explore the same strategy for the more complicated graph convolutions. We need to use the ConvMolFeaturizer:

def read_data(fname):
  dataset_file = os.path.join('./data/', fname)
  featurizer = dc.feat.ConvMolFeaturizer()
  loader = dc.data.CSVLoader(tasks=training_task, smiles_field="Smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)
  transformer = dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  transformed_dataset = transformer.transform(dataset)
  return transformed_dataset
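
Unlike CircularFingerprint, ConvMolFeaturizer returns ConvMol objects that carry a per-atom feature matrix (75 features per atom by default in this version) plus the graph bookkeeping. A quick sketch to see this (again assuming RDKit; the SMILES is an arbitrary example):

from rdkit import Chem

cm = dc.feat.ConvMolFeaturizer().featurize([Chem.MolFromSmiles('c1ccccc1O')])[0]
print(cm.get_atom_features().shape)  # (n_atoms, 75): one feature vector per atom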

And redefine our predictive model. The data generator must change as well, since the featurization is different (see the sketch after the model); the training loop remains unchanged:

# placeholder for a feature vector of length 75 for each atom
atom_features = Feature(shape=(None, 75))
# an indexing convenience that makes it easy to locate atoms from all molecules with a given degree
degree_slice = Feature(shape=(None, 2), dtype=tf.int32)
# placeholder that determines the membership of atoms in molecules (atom i belongs to molecule membership[i])
membership = Feature(shape=(None,), dtype=tf.int32)
# list that contains adjacency lists grouped by atom degree
deg_adjs = []
for i in range(0, 10 + 1):
  deg_adj = Feature(shape=(None, i + 1), dtype=tf.int32) # placeholder for the adjacency lists of all atoms with i+1 neighbors
  deg_adjs.append(deg_adj)

gc1 = GraphConv(64, activation_fn=tf.nn.relu, in_layers=[atom_features, degree_slice, membership] + deg_adjs)
batch_norm1 = BatchNorm(in_layers=[gc1])
gp1 = GraphPool(in_layers=[batch_norm1, degree_slice, membership] + deg_adjs)

dense = Dense(out_channels=512, activation_fn=tf.nn.relu, in_layers=[gp1])
batch_norm2 = BatchNorm(in_layers=[dense])
readout = GraphGather(batch_size=batch_size, activation_fn=tf.nn.tanh, in_layers=[batch_norm2, degree_slice, membership] + deg_adjs)

classification = Dense(out_channels=2, activation_fn=None, in_layers=[readout], name="Dense")
softmax = SoftMax(in_layers=[classification], name="Softmax")
tg.add_output(softmax)
label = Label(shape=(None, 2))
cost = SoftMaxCrossEntropy(in_layers=[label, classification], name="SoftMaxCrossEntropy")

weights = Weights(shape=(None, len(training_task)))
loss = WeightedError(in_layers=[cost, weights])
tg.set_loss(loss)
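
Because X_b is now an array of ConvMol objects rather than a fingerprint matrix, the data generator must also be adapted: the per-molecule graphs in a batch are merged into one big disconnected graph whose pieces line up with the placeholders above. A sketch following the pattern of DeepChem's own graph convolution tutorial:

from deepchem.feat.mol_graphs import ConvMol

def data_generator(dataset, epochs=1, pad_batches=True):
  for epoch in range(epochs):
    print('Starting epoch %i' % epoch)
    for ind, (X_b, y_b, w_b, ids_b) in enumerate(
        dataset.iterbatches(batch_size, pad_batches=pad_batches, deterministic=True)):
      d = {}
      d[label] = to_one_hot(y_b[:, 0])
      d[weights] = w_b
      multiConvMol = ConvMol.agglomerate_mols(X_b) # merge the batch into one graph
      d[atom_features] = multiConvMol.get_atom_features()
      d[degree_slice] = multiConvMol.deg_slice
      d[membership] = multiConvMol.membership
      deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
      for i in range(1, len(deg_adj_lists)): # index 0 is for degree-0 atoms, which have no adjacency rows
        d[deg_adjs[i - 1]] = deg_adj_lists[i]
      yield d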

Training this gives:

Starting epoch 0
Epoch avg loss =  36.449554443359375
Starting epoch 1
Epoch avg loss =  37.274847666422524
Starting epoch 2
Epoch avg loss =  24.871442794799805
Starting epoch 3
Epoch avg loss =  26.757398923238117
Starting epoch 4
Epoch avg loss =  21.097051938374836
Starting epoch 5
Epoch avg loss =  18.842341423034668
Starting epoch 6
Epoch avg loss =  18.118828455607098
Starting epoch 7
Epoch avg loss =  15.58962615331014
Starting epoch 8
Epoch avg loss =  14.505534172058105
Starting epoch 9
Epoch avg loss =  13.777753194173178
Written on March 22, 2018