Encode a protein to a image using Hilbert curves.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import itertools

# generate elements of the sequence
def element(seq_list):
    list_ = []
    for s in seq_list:
        if s not in list_:
            list_.append(s)
    return list_

# generate mapping dictionary
def combination(elements, seq_length):
    keys = map(''.join, itertools.product(elements, repeat=seq_length))
    n_word = len(keys)
    array_word = np.eye(n_word)
    mapping_dic = {}
    for i in range(n_word):
        mapping_dic[keys[i]] = array_word[i,:]
    return mapping_dic

def hilbert_curve(n):
    # recursion base
    if n == 1:
        return np.zeros((1, 1), np.int32)
    # make (n/2, n/2) index
    t = hilbert_curve(n // 2)
    # flip it four times and add index offsets
    a = np.flipud(np.rot90(t))
    b = t + t.size
    c = t + t.size * 2
    d = np.flipud(np.rot90(t, -1)) + t.size * 3
    # and stack four tiles into resulting array
    return np.vstack(map(np.hstack, [[a, b], [d, c]]))


#one hot encoding
def one_hot(sequence, sub_len, mapping_dic):
    n_ = len(sequence)
    sub_list = []
    for i in range(n_ - sub_len + 1):
        sub_list.append(sequence[i:i + sub_len])
    res_ = []
    for sub in sub_list:
        res_.append(mapping_dic[sub])
    return np.array(res_)

#assign each pixel a one-hot encoding.
def plot_hb_dna(seq, H_curve, sub_length,map_dic):
    r, c = H_curve.shape
    num_A = one_hot(seq, sub_length, map_dic)
    H_dna = np.zeros((r, c, 20 ** sub_length))
    for i in range(len(num_A)):
        x, y = np.where(H_curve == i)
        H_dna[x, y, :] = num_A[i, :]
    return H_dna

#assign each pixel a 1 to check if all 1-mers are present in the image.
def plot_hb_dna_check(seq, H_curve, sub_length,map_dic):
    r, c = H_curve.shape
    num_A = one_hot(seq, sub_length, map_dic)
    H_dna = np.zeros((r,c))
    for i in range(len(num_A)):
        x, y = np.where(H_curve == i)
        H_dna[x,y] = 1
    return H_dna

#input protein.
protein_seq = 'MANMNNTKLNARALPSFIDYFNGIYGFATGIKDIMNMIFKTDTGGNLTLDEILKNQQLLNMMNNPPAARRYYFFEISGKLDGVNGSLNDLIAQGNLNTELSKEILKIANEQNQVLNDVNNKLDAINTMLHIYLPKITSMLSDVMKQNYALSLQVEYLSKQLKEISDKLDVINVNVLINSTLTEITPAYQRIKYVNEKFEELTFATETTLKVKKDSSPADILDELTELTELARSVTRNDMESFEFYIKTFHDVMIGNNLFSRSALKTASELIAKENIHTRGSEIGNVYTFMIVLTSLQAKAFLTLTTCRKLLGLADIDYTQIMNENLDREKEEFRLNILPTLSNDFSNPNYTETLGSDLVDPIVTLEAEPGYALIGFEILNDPLPVLKVFQAKLKQNYQVDKESIMENIYGNIHKLLCPKQREQKYYIKDITFPEGYVITKIVFEKKLNLLGYEVTANLYDPFTGSIDLNKTILESWKEDCCEEDCCEEDCCEENCCEEDYIKLMPLGVISETFLTPIYSFKLIIDKKTKKISLAGKSYLRESLLATDLVNKETNLIPSPNGFISSIVQTWHITSDNIEPWEANNKNAYVDKTDTMVGFSSLYTHKDGEFLQFIGAKLKPKTEYVIQYTVKGKPSIHLKDENTGYILYEDTNNDLEDFQTITKRFTTGTDLMRVYLILKSQSGHEAWGDNFTILEIKPAEALVSPELINPNSWITTQGASISGDKLFISLGTNGTFRQNLSLNSYSTYSISFTASGPFNVTVRNSREVLYERNNLMSSTSHISGEFKTESNNTGLYVELSRRSGGAGHISFENISIK'
print('protein len = ',len(protein_seq))

#20 amino acids
elements= ['M', 'A', 'N', 'T', 'K', 'L', 'R', 'P', 'S', 'F', 'I', 'D', 'Y', 'G', 'E', 'Q', 'V', 'H', 'C', 'W']

#choose kmer size = 1. So each one-hot encoding will be of len 20.  
sub_length = 1 #k-mer size.
mapping_dic = combination(elements, sub_length)

print('\nMapping dictionary:')
#each 1-mer and its encodings
for k,v in mapping_dic.iteritems():
    print(k,v)

Output

protein len =  816

Mapping dictionary:
A [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
C [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
E [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
D [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
G [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
F [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
I [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
H [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
K [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
M [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
L [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
N [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Q [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
P [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
S [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
R [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
T [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
W [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
V [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
Y [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
# The Hilbert curve yields a square image of size 2^n × 2^n = 2^(2n), where n is the order of the curve.
# Choose order of Hilbert curve to accommodate all the k-mers in the image. 
# The count of kmers is roughly the length of the protein sequence.
# So for protein of length 816, we choose order = 5. Then Hilber image size = 2^(10) = 1024 >= 800
# Each pixel in the Hilber image will be one-hot encoding of a k-mer.
# In our case, k=1, so each pixel is one-hot encoded of len = 20. 
# So final image size : 32 x 32 x 20.

order = 32

H = hilbert_curve(order)

print('Hilbert matrix=\n',H)

X = plot_hb_dna_check(seq=protein_seq, H_curve=H, sub_length=sub_length, map_dic=mapping_dic)

seq_image = plot_hb_dna(seq=protein_seq, H_curve=H, sub_length=sub_length, map_dic=mapping_dic)

Output

Hilbert matrix=
 [[   0    1   14 ...  339  340  341]
 [   3    2   13 ...  338  343  342]
 [   4    7    8 ...  349  344  345]
 ...
 [1019 1016 1015 ...  674  679  678]
 [1020 1021 1010 ...  685  680  681]
 [1023 1022 1009 ...  684  683  682]]
#check if all 1-mers in the sequence are assigned to the image.
z = 0
for r in range(len(X)):
    z += sum(X[r,:])
    print(','.join([str(int(x))for x in X[r,:]]))
print("Total no of 1's =",z) #should be same as the protein seq len to ensure all 1-mers are in the image.

Output

1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Total no of 1's = 816.0
print('Final image shape:')
print(seq_image.shape)

Output

Final image shape:
(32, 32, 20)

Full notebook

Notebook

Written on April 25, 2019