Objective

We have to build a model that translates Italian to English.

Basic Information

1. Download the Italian-to-English translation dataset from here.

2. Preprocess the data.

3. Encoder-decoder architecture with:

Encoder   - 1-layer LSTM
Decoder   - 1-layer LSTM
Attention - global attention

4. In global attention, we have 3 types of scoring functions (a minimal sketch of them follows this list).
 As part of this assignment you need to create a model for each scoring function.

    In model 1 you need to implement the "dot" score function
    In model 3 you need to implement the "concat" score function


5. Using the attention weights, we have to plot the attention plots.

6. Use the BLEU score as the metric to evaluate the model and SparseCategoricalCrossentropy as the loss.
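For reference, a minimal sketch of the three global-attention (Luong-style) score functions: dot, general, concat. It assumes a decoder state s of shape (batch, units), encoder outputs h of shape (batch, T, units), and Dense layers Wa, W, v supplied by the caller; the names here are illustrative, not part of the assignment code.

import tensorflow as tf

def dot_score(s, h):
    # score_j = s . h_j  ->  (batch, 1, T)
    return tf.matmul(tf.expand_dims(s, 1), h, transpose_b=True)

def general_score(s, h, Wa):
    # score_j = s . (Wa h_j), with Wa = Dense(units, use_bias=False)
    return tf.matmul(tf.expand_dims(s, 1), Wa(h), transpose_b=True)

def concat_score(s, h, W, v):
    # score_j = v . tanh(W [s; h_j]), with W = Dense(att_units), v = Dense(1)
    s_tiled = tf.tile(tf.expand_dims(s, 1), [1, h.shape[1], 1])
    return v(tf.tanh(W(tf.concat([s_tiled, h], axis=-1))))  # (batch, T, 1)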

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import re
import tensorflow as tf
from tqdm import tqdm
import math
import os
import time
import matplotlib.ticker as ticker
import random
import nltk.translate.bleu_score as bleu
from sklearn.model_selection import train_test_split
import joblib
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding,Flatten,Dense,Concatenate,BatchNormalization,Dropout,Conv2D,Conv1D,MaxPooling1D,LSTM,Softmax,GRU
from tensorflow.keras.models import Model
%load_ext tensorboard
In [2]:
from google.colab import drive
drive.mount('/content/drive/')
Mounted at /content/drive/
In [3]:
txt=open('/content/drive/My Drive/seq2seq/ita-eng/Ita.txt','r')
d=txt.readlines()

PREPROCESSING

In [5]:
def pre_txt(data):
  eng=[]
  ita=[]
  for i in tqdm(data):
    u=i.lower()
    # expand common English contractions
    u=re.sub(r"'m", ' am', u)
    u=re.sub(r"'ll", ' will', u)
    u=re.sub(r"'d", ' had', u)
    u=re.sub(r"'s", ' is', u)
    u=re.sub(r"'ve", ' have', u)
    u=re.sub(r"'re", ' are', u)
    u=re.sub(r"won't", 'will not', u)
    u=re.sub(r"can't", 'can not', u)
    u=re.sub(r"o'clock", '', u)
    u=re.sub(r"n't", ' not ', u)  # e.g. "haven't" -> "have not "
    # pad punctuation with spaces so it becomes its own token
    u=re.sub(r"([?.!,¿])", r" \1 ", u)

    # each line is "english<TAB>italian"
    u=u.split('\t')
    p= re.sub(r"[^a-zA-Z?.!,¿]+", " ", u[0])
    q= re.sub(r"[^a-zA-Z?.!,¿]+", " ", u[1])
    eng_inp='<sos> ' + p + '<eos>'
    ita_inp='<sos> ' + q + '<eos>'
    # if the Italian sentence ends in a letter, keep <eos> as a separate token
    if ita_inp.split('<eos>')[0][-1].isalpha()==True:
        ita_inp=ita_inp.replace('<eos>',' <eos>')

    eng.append(eng_inp)
    ita.append(ita_inp)

  return eng,ita
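A quick sanity check of pre_txt on a single made-up line (a hypothetical sample; Ita.txt stores one english<TAB>italian pair per line):

eng_demo, ita_demo = pre_txt(["I can't answer.\tNon posso rispondere.\n"])
print(eng_demo[0])  # <sos> i can not answer . <eos>
print(ita_demo[0])  # <sos> non posso rispondere . <eos>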
In [6]:
eng_txt,ita_txt=np.array(pre_txt(d))
100%|██████████| 336614/336614 [00:08<00:00, 39534.58it/s]
In [7]:
eng_txt.shape,ita_txt.shape
Out[7]:
((336614,), (336614,))

WORD ANALYSIS

In [9]:
counts, bin_edges = np.histogram([len(i.split(' ')) for i in ita_txt], bins=18,density = True,)
pdf = counts/(sum(counts))
print('pdf : ',pdf,'\n');
print('bin edge : ',bin_edges,'\n')
cdf = np.cumsum(pdf)
plt.figure(figsize=(8,6))
plt.plot(bin_edges[1:],pdf,label='Histogram of Italian Text')
plt.plot(bin_edges[1:], cdf,label='Cumulative distribution of Italian Text')
plt.title('histogram and cumulative distribution of Italian Text')
plt.legend()
plt.grid()
c=0
q=[]
for i in pdf:
 c=c+i
 q.append(c)
print('cdf : ',q)
pdf :  [1.85449803e-01 5.58140778e-01 2.14444438e-01 3.42766492e-02
 5.08297338e-03 1.07541576e-03 5.82269305e-04 2.28748656e-04
 2.31719417e-04 1.18830471e-04 1.66362659e-04 7.72398058e-05
 3.26783794e-05 3.86199029e-05 1.18830471e-05 1.78245706e-05
 2.97076176e-06 2.07953323e-05] 

bin edge :  [ 4.          6.83333333  9.66666667 12.5        15.33333333 18.16666667
 21.         23.83333333 26.66666667 29.5        32.33333333 35.16666667
 38.         40.83333333 43.66666667 46.5        49.33333333 52.16666667
 55.        ] 

cdf :  [0.18544980303849506, 0.7435905814969073, 0.958035019339659, 0.9923116685580515, 0.9973946419340846, 0.9984700576921932, 0.9990523269976885, 0.9992810756534188, 0.9995127950709118, 0.999631625541421, 0.999797988200134, 0.999875228005965, 0.999907906385355, 0.9999465262882705, 0.9999584093353214, 0.9999762339058978, 0.9999792046676605, 0.9999999999999997]
In [11]:
counts, bin_edges = np.histogram([len(i.split(' ')) for i in eng_txt], bins=18,density = True,)
pdf = counts/(sum(counts))
print('pdf : ',pdf,'\n');
print('bin edge : ',bin_edges,'\n')
cdf = np.cumsum(pdf)
plt.figure(figsize=(8,6))
plt.plot(bin_edges[1:],pdf,label='Histogram of English Text')
plt.plot(bin_edges[1:], cdf,label='Cumulative distribution of English Text')
plt.title('histogram and cumulative distribution of English Text')
plt.legend()
plt.grid()
c=0
q=[]
for i in pdf:
 c=c+i
 q.append(c)
print('cdf : ',q)
pdf :  [9.84777817e-02 5.62825670e-01 2.86702276e-01 3.59610711e-02
 1.17612458e-02 2.64397797e-03 6.50596826e-04 8.61520911e-05
 8.31813294e-05 2.64397797e-04 1.72304182e-04 2.19836370e-04
 5.64444735e-05 3.26783794e-05 1.18830471e-05 1.18830471e-05
 2.37660941e-05 1.48538088e-05] 

bin edge :  [ 4.          6.72222222  9.44444444 12.16666667 14.88888889 17.61111111
 20.33333333 23.05555556 25.77777778 28.5        31.22222222 33.94444444
 36.66666667 39.38888889 42.11111111 44.83333333 47.55555556 50.27777778
 53.        ] 

cdf :  [0.09847778167277654, 0.6613034514310161, 0.9480057276286786, 0.9839667987665398, 0.9957280445851926, 0.9983720225540234, 0.9990226193800615, 0.9991087714711807, 0.9991919528005372, 0.9994563505974202, 0.9996286547796587, 0.9998484911501008, 0.9999049356235926, 0.9999376140029826, 0.9999494970500336, 0.9999613800970845, 0.9999851461911863, 0.9999999999999999]

Observation

  1. 99% of the sentences contain 16 words or fewer, so we cap the sentence length at 16.

TOKENISATION

In [13]:
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')
  # vocabulary size, +1 for the padding index 0
  print(len(lang_tokenizer.word_index)+1)

  return tensor, lang_tokenizer
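A toy run of tokenize on two made-up sentences, showing the post-padded id sequences (exact ids depend on word frequencies; the function itself prints the vocabulary size, 6 here):

demo_tensor, demo_tok = tokenize(['<sos> hello . <eos>', '<sos> hello there . <eos>'])
print(demo_tensor)
# e.g. [[1 2 3 4 0]
#       [1 2 5 3 4]]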

REMOVE SENTENCES LONGER THAN 16 WORDS

In [14]:
ita_txt_new=[]
eng_txt_new=[]
for t,e in zip(ita_txt,eng_txt):
  if len(t.split(' '))<=16 and len(e.split(' '))<=16:
    eng_txt_new.append(e)
    ita_txt_new.append(t)
In [15]:
eng_txt=eng_txt_new
ita_txt=ita_txt_new
In [17]:
input_tensor, inp_lang_tokenizer = tokenize(ita_txt)
target_tensor, targ_lang_tokenizer = tokenize(eng_txt)
25397
12656
In [18]:
print('input shape : ',input_tensor.shape)
print('target shape : ',target_tensor.shape)
input shape :  (334096, 16)
target shape :  (334096, 16)

Saving all the files

In [64]:
pickle.dump(input_tensor, open('/content/drive/My Drive/seq2seq/input_tensor', 'wb'))
pickle.dump(target_tensor, open('/content/drive/My Drive/seq2seq/target_tensor', 'wb'))
pickle.dump(inp_lang_tokenizer, open('/content/drive/My Drive/seq2seq/inp_lang_tokenizer', 'wb'))
pickle.dump(targ_lang_tokenizer, open('/content/drive/My Drive/seq2seq/targ_lang_tokenizer', 'wb'))

Loading all the files

In [65]:
input_tensor=pickle.load(open('/content/drive/My Drive/seq2seq/input_tensor', 'rb'))
target_tensor=pickle.load(open('/content/drive/My Drive/seq2seq/target_tensor', 'rb'))
inp_lang_tokenizer=pickle.load(open('/content/drive/My Drive/seq2seq/inp_lang_tokenizer', 'rb'))
targ_lang_tokenizer=pickle.load(open('/content/drive/My Drive/seq2seq/targ_lang_tokenizer', 'rb'))
In [14]:
print('input shape : ',input_tensor.shape)
print('target shape : ',target_tensor.shape)
input shape :  (334096, 16)
target shape :  (334096, 16)
In [19]:
# decoder input = target sentence with <eos> (id 2) removed and a pad appended
decoder_input_target_tensor=[]
for i in target_tensor:
  i=list(i)
  if 1 in i:
    i.remove(2)
    i.append(0) 
  decoder_input_target_tensor.append(i)  
decoder_input_target_tensor=np.array(decoder_input_target_tensor)
In [20]:
# decoder output = target sentence with <sos> (id 1) removed and a pad appended
decoder_output_target_tensor=[]
for i in target_tensor:
  i=list(i)
  if 1 in i:
    i.remove(1)
    i.append(0) 
  decoder_output_target_tensor.append(i)  
decoder_output_target_tensor=np.array(decoder_output_target_tensor)
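A toy illustration of the resulting teacher-forcing shift, assuming <sos>=1, <eos>=2 and pad=0 (these ids match the index-to-word mapping printed below):

row     = [1, 7, 9, 3, 2, 0]   # <sos> w1 w2 . <eos> pad
dec_in  = [1, 7, 9, 3, 0, 0]   # <eos> (id 2) removed, a pad appended
dec_out = [7, 9, 3, 2, 0, 0]   # <sos> (id 1) removed, a pad appended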
In [21]:
(input_tensor_train, input_tensor_val,
 decoder_input_target_tensor_train, decoder_input_target_tensor_val,
 decoder_output_target_tensor_train, decoder_output_target_tensor_val) = train_test_split(
    input_tensor, decoder_input_target_tensor, decoder_output_target_tensor,
    test_size=0.18, random_state=42)
In [22]:
print('train input size : ',input_tensor_train.shape)
print('train input-output size : ',decoder_input_target_tensor_train.shape)
print('train output-output size : ',decoder_output_target_tensor_train.shape)
train input size :  (273958, 16)
train input-output size :  (273958, 16)
train output-output size :  (273958, 16)
In [23]:
print('val input size : ',input_tensor_val.shape)
print('val input-output size : ',decoder_input_target_tensor_val.shape)
print('val output-output size : ',decoder_output_target_tensor_val.shape)
val input size :  (60138, 16)
val input-output size :  (60138, 16)
val output-output size :  (60138, 16)
In [24]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

print ("Input Language; index to word mapping")
convert(inp_lang_tokenizer, input_tensor[10])
print()
print ("Target input Language; index to word mapping")
convert(targ_lang_tokenizer, decoder_input_target_tensor[10])
print()
print ("Target output Language; index to word mapping")
convert(targ_lang_tokenizer, decoder_output_target_tensor[10])
Input Language; index to word mapping
1 ----> <sos>
5601 ----> salti
3 ----> .
2 ----> <eos>

Target input Language; index to word mapping
1 ----> <sos>
1995 ----> jump
3 ----> .

Target output Language; index to word mapping
1995 ----> jump
3 ----> .
2 ----> <eos>
In [25]:
s=273552  # 48*5699: truncate to a multiple of the batch size (48), since the
          # encoder's initial state is created for a fixed batch_size
input_tensor_train=input_tensor_train[:s]
decoder_input_target_tensor_train=decoder_input_target_tensor_train[:s]
decoder_output_target_tensor_train=decoder_output_target_tensor_train[:s]


print('train input size : ',input_tensor_train.shape)
print('train input-output size : ',decoder_input_target_tensor_train.shape)
print('train output-output size : ',decoder_output_target_tensor_train.shape)
train input size :  (273552, 16)
train input-output size :  (273552, 16)
train output-output size :  (273552, 16)
In [27]:
p=60048  # 48*1251: again a multiple of the batch size

input_tensor_val=input_tensor_val[:p]
decoder_input_target_tensor_val=decoder_input_target_tensor_val[:p]
decoder_output_target_tensor_val=decoder_output_target_tensor_val[:p]

print('val input size : ',input_tensor_val.shape)
print('val input-output size : ',decoder_input_target_tensor_val.shape)
print('val output-output size : ',decoder_output_target_tensor_val.shape)
val input size :  (60048, 16)
val input-output size :  (60048, 16)
val output-output size :  (60048, 16)

ENCODER

In [29]:
class Encoder(tf.keras.Model):

    def __init__(self,vocab_size,embedding_size,lstm_size,input_length):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.input_length = input_length
        self.lstm_size= lstm_size
        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_size, input_length=self.input_length)                           
        self.lstm = LSTM(self.lstm_size, return_state=True, return_sequences=True, name="Encoder_LSTM")

    def call(self,input_sequence,states):
        input_embedd  = self.embedding(input_sequence)
        self.lstm_output, self.lstm_state_h,self.lstm_state_c = self.lstm(input_embedd,initial_state = states)
        return self.lstm_output, self.lstm_state_h,self.lstm_state_c
  
    def initialize_states(self,batch_size):
        return tf.zeros((batch_size, self.lstm_size)),tf.zeros((batch_size, self.lstm_size))       
In [30]:
def grader_check_encoder():
    vocab_size=12
    embedding_size=20
    lstm_size=32
    input_length=8
    batch_size=16
    encoder=Encoder(vocab_size,embedding_size,lstm_size,input_length)
    input_sequence=tf.random.uniform(shape=[batch_size,input_length],maxval=vocab_size,minval=0,dtype=tf.int32)
    initial_state=encoder.initialize_states(batch_size)
    encoder_output,state_h,state_c=encoder(input_sequence,initial_state)
    
    assert(encoder_output.shape==(batch_size,input_length,lstm_size) and state_h.shape==(batch_size,lstm_size) and state_c.shape==(batch_size,lstm_size))
    return True
print(grader_check_encoder())
True

ATTENTION

In [36]:
class Attention(tf.keras.Model):
  
  def __init__(self,scoring_function, att_units):
    super().__init__()
    self.scoring_function=scoring_function
    self.att_units=att_units
    self.softmax=Softmax()
    if self.scoring_function=='dot':
      pass
      
    elif scoring_function == 'concat':
      self.W = tf.keras.layers.Dense(att_units,activation='relu',kernel_initializer='he_uniform')
      self.V = tf.keras.layers.Dense(1)
  
  def call(self,decoder_hidden_state,encoder_output):
    '''
      The attention mechanism takes two inputs at the current step: the decoder_hidden_state and all the encoder_outputs.
      * Based on the scoring function we find the score (similarity) between the decoder_hidden_state and each encoder_output.
        The softmax of the scores gives the attention weights; multiplying them with the encoder_outputs and summing over time gives the context vector.
        The function returns the context vector and the attention weights (softmax of the scores).
    '''
       
    if self.scoring_function == 'dot':
        state_h=decoder_hidden_state
        state= tf.expand_dims(state_h, 1)
        prob=[]
        for i in range(encoder_output.shape[0]):
          eo=tf.transpose(encoder_output[i])
          dot=tf.matmul(state[i],eo)
          soft_out=self.softmax(dot[0])
          prob.append(soft_out)
          
        attention_weights=tf.reshape(tf.convert_to_tensor(prob),(encoder_output.shape[0],encoder_output.shape[1],1))
        context_vector=attention_weights * encoder_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector,attention_weights

        
    elif self.scoring_function == 'concat':
        state = tf.expand_dims(decoder_hidden_state, 1)
        state= tf.tile(state,[1,encoder_output.shape[1],1])
        score=self.V(tf.nn.tanh(self.W(tf.concat([encoder_output,state],axis=-1))))
        score=tf.transpose(score,[0,2,1])
        attention_weights = tf.nn.softmax(score,axis=2)
        context_vector = tf.matmul(attention_weights , encoder_output)
        context_vector=tf.reshape(context_vector,shape=(context_vector.shape[0],context_vector.shape[2]))
        attention_weights=tf.reshape(attention_weights,shape=(attention_weights.shape[0],attention_weights.shape[2],attention_weights.shape[1]))
        return context_vector, attention_weights
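Note that the 'dot' branch above loops over the batch in Python. A vectorized sketch with the same output shapes (an equivalent formulation, not the code used for training):

def dot_attention_vectorized(decoder_hidden_state, encoder_output):
    # (batch, 1, units) x (batch, units, T) -> scores of shape (batch, 1, T)
    score = tf.matmul(tf.expand_dims(decoder_hidden_state, 1),
                      encoder_output, transpose_b=True)
    attention_weights = tf.nn.softmax(score, axis=-1)  # softmax over the T steps
    context_vector = tf.squeeze(tf.matmul(attention_weights, encoder_output), 1)
    return context_vector, tf.transpose(attention_weights, [0, 2, 1])  # (batch, T, 1)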
In [37]:
def grader_check_attention(scoring_fun):
  
    input_length=10
    batch_size=16
    att_units=32
    state_h=tf.random.uniform(shape=[batch_size,att_units])
    encoder_output=tf.random.uniform(shape=[batch_size,input_length,att_units])
    attention=Attention(scoring_fun,att_units)
    context_vector,attention_weights=attention(state_h,encoder_output)
    assert(context_vector.shape==(batch_size,att_units) and attention_weights.shape==(batch_size,input_length,1))
    return True
print(grader_check_attention('dot'))
print(grader_check_attention('concat'))
True
True

ONE STEP DECODER

In [38]:
class One_Step_Decoder(tf.keras.Model):
  def __init__(self,tar_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):
      # Initialize decoder embedding layer, LSTM and any other objects needed
        super().__init__()
        self.tar_vocab_size = tar_vocab_size
        self.embedding_dim = embedding_dim
        self.input_length = input_length
        self.dec_units= dec_units
        self.score_fun = score_fun
        self.att_units=att_units
        self.attention=Attention(score_fun,att_units)
        self.softmax=Softmax()
        self.dense=Dense(self.tar_vocab_size)
        self.embedding = Embedding(input_dim=self.tar_vocab_size, output_dim=self.embedding_dim, 
                                   input_length=1)
        self.lstm = LSTM(self.dec_units, return_state=True, return_sequences=True, name="Decoder_LSTM")

  def call(self,input_to_decoder, encoder_output, state_h,state_c):

        #A
        emb=self.embedding(input_to_decoder)
        #B
        context_vector,attention_weights=self.attention(state_h,encoder_output)
        context_vector=tf.expand_dims(context_vector,1)
        #C
        con=Concatenate()([emb,context_vector])
        #D
        decoder_out,hidden_state,cell_state=self.lstm(con,initial_state = [state_h,state_c])
        dense_out=self.dense(decoder_out)
        
        return tf.reshape(dense_out,(dense_out.shape[0],dense_out.shape[2])),hidden_state,cell_state,attention_weights,tf.reshape(context_vector,(context_vector.shape[0],context_vector.shape[2]))
      
      #One step decoder mechanisim step by step:
      #A. Pass the input_to_decoder to the embedding layer and then get the output(1,1,embedding_dim)
      #B. Using the encoder_output and decoder hidden state, compute the context vector.
      #C. Concat the context vector with the step A output
      #D. Pass the Step-C output to LSTM/GRU and get the decoder output and states(hidden and cell state)
      #E. Pass the decoder output to dense layer(vocab size) and store the result into output.
      #F. Return the states from step D, output from Step E, attention weights from Step -B
      
In [39]:
def grader_onestepdecoder(score_fun):
    vocab_size=13 
    embedding_dim=12 
    input_length=10
    dec_units=16 
    att_units=16
    batch_size=32
    onestepdecoder=One_Step_Decoder(vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units)
    input_to_decoder=tf.random.uniform(shape=(batch_size,1),maxval=10,minval=0,dtype=tf.int32)
    encoder_output=tf.random.uniform(shape=[batch_size,input_length,dec_units])
    state_h=tf.random.uniform(shape=[batch_size,dec_units])
    state_c=tf.random.uniform(shape=[batch_size,dec_units])
    output,state_h,state_c,attention_weights,context_vector=onestepdecoder(input_to_decoder,encoder_output,state_h,state_c)
    assert(output.shape==(batch_size,vocab_size))
    assert(state_h.shape==(batch_size,dec_units))
    assert(state_c.shape==(batch_size,dec_units))
    assert(attention_weights.shape==(batch_size,input_length,1))
    assert(context_vector.shape==(batch_size,dec_units))
    
    return True
    

print(grader_onestepdecoder('dot'))
print(grader_onestepdecoder('concat'))    
True
True

DECODER

In [40]:
class Decoder(tf.keras.Model):
    def __init__(self,out_vocab_size, embedding_dim, output_length, dec_units ,score_fun ,att_units,input_length):
      super().__init__()
      self.onestepDecoder=One_Step_Decoder(out_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units)
      

    def call(self, input_to_decoder,encoder_output,decoder_hidden_state,decoder_cell_state ):
        #Initialize an empty Tensor array, that will store the outputs at each and every time step
        all_outputs=tf.TensorArray(tf.float32,size=tf.shape(input_to_decoder)[1], name='output_array')
          
        for timestep in range(0,tf.shape(input_to_decoder)[1]):
            output,decoder_hidden_state,decoder_cell_state,_,_=self.onestepDecoder(input_to_decoder[:,timestep:timestep+1],encoder_output,decoder_hidden_state,decoder_cell_state)
            #storing the one step decoder outputs to the tensor array
            
            all_outputs=all_outputs.write(timestep,output)
        
        all_outputs=tf.transpose(all_outputs.stack(), [1,0,2])
        return all_outputs
In [41]:
def grader_decoder(score_fun):
    out_vocab_size=13 
    embedding_dim=12 
    input_length=10
    output_length=11
    dec_units=16 
    att_units=16
    batch_size=32
    
    target_sentences=tf.random.uniform(shape=(batch_size,output_length),maxval=10,minval=0,dtype=tf.int32)
    encoder_output=tf.random.uniform(shape=[batch_size,input_length,dec_units])
    state_h=tf.random.uniform(shape=[batch_size,dec_units])
    state_c=tf.random.uniform(shape=[batch_size,dec_units])    
    decoder=Decoder(out_vocab_size, embedding_dim, output_length, dec_units ,score_fun ,att_units,input_length)
    output=decoder(target_sentences,encoder_output, state_h,state_c)
    assert(output.shape==(batch_size,output_length,out_vocab_size))#(32,11,13)
    
    return True
print(grader_decoder('dot'))
print(grader_decoder('concat'))
True
True

LOSS

In [42]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  # mask out padded positions (id 0) so they do not contribute to the loss
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)
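A toy check of the masking on made-up logits: the padded position (id 0) is zeroed out before the mean is taken.

real = tf.constant([[5, 3, 0]])        # last position is padding
pred = tf.random.uniform((1, 3, 10))   # logits over a 10-word vocabulary
print(loss_function(real, pred))       # mean over all positions; the padded one adds 0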

ENCODER_DECODER

In [43]:
class encoder_decoder(tf.keras.Model):
  def __init__(self,vocab_inp_size,embedding_size,lstm_units,input_length,batch_size,vocab_tar_size,output_length,scoring_fun):
    super().__init__()
    #Initialize encoder and decoder objects
    #1
    self.encoder=Encoder(vocab_inp_size,embedding_size,lstm_units,input_length)
    self.initial_state=self.encoder.initialize_states(batch_size)
    #2
    self.decoder=Decoder(vocab_tar_size, embedding_size, output_length, lstm_units ,scoring_fun ,lstm_units,input_length)


  def call(self,input):
    input_sequence=input[0]
    target_sentences=input[1]
    encoder_output,state_h,state_c=self.encoder(input_sequence,self.initial_state) 
    
    output=self.decoder(target_sentences,encoder_output, state_h,state_c)
    return output
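A shape check of the combined model on random data, with small made-up dimensions (same pattern as the graders above):

toy = encoder_decoder(vocab_inp_size=20, embedding_size=8, lstm_units=16,
                      input_length=10, batch_size=4, vocab_tar_size=22,
                      output_length=11, scoring_fun='dot')
src = tf.random.uniform((4, 10), maxval=20, dtype=tf.int32)
tgt = tf.random.uniform((4, 11), maxval=22, dtype=tf.int32)
print(toy([src, tgt]).shape)  # (4, 11, 22) = (batch, output_length, vocab_tar_size)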

FUNCTION FOR PREDICTION

In [47]:
max_length_inp=input_tensor.shape[1]
max_length_targ=target_tensor.shape[1]
In [48]:
def preprocess_sentence(data):
    u=data.lower()
    # expand common English contractions (same rules as pre_txt)
    u=re.sub(r"'m", ' am', u)
    u=re.sub(r"'ll", ' will', u)
    u=re.sub(r"'d", ' had', u)
    u=re.sub(r"'s", ' is', u)
    u=re.sub(r"'ve", ' have', u)
    u=re.sub(r"'re", ' are', u)
    u=re.sub(r"won't", 'will not', u)
    u=re.sub(r"can't", 'can not', u)
    u=re.sub(r"o'clock", '', u)
    u=re.sub(r"n't", ' not ', u)  # e.g. "haven't" -> "have not "
    # pad punctuation with spaces so it becomes its own token
    u=re.sub(r"([?.!,¿])", r" \1 ", u)

    q= re.sub(r"[^a-zA-Z?.!,¿]+", " ", u)
    sen='<sos> ' + q + '<eos>'
    return sen
In [49]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  inputs = [inp_lang_tokenizer.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)
  result = ''
  enc_hidden = encoder.initialize_states(1)
  # model.layers[0] is the Encoder, model.layers[1] is the Decoder
  enc_output, enc_hidden,enc_cell = model.layers[0](inputs, enc_hidden)
  dec_hidden = enc_hidden
  dec_cell=enc_cell
  dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<sos>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, dec_cell,attention_weights,context_vector = model.layers[1].onestepDecoder(dec_input,enc_output,dec_hidden,dec_cell,training=False)
    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()
    predicted_id = tf.argmax(predictions[0]).numpy()

    result += targ_lang_tokenizer.index_word[predicted_id] + ' '

    if targ_lang_tokenizer.index_word[predicted_id] == '<eos>':
      return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot
In [50]:
def plot_attention(attention, sentence, predicted_sentence):
  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='gray')

  fontdict = {'fontsize': 14}

  ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()
In [51]:
def translate(sentence):
  result, sentence, attention_plot = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

  attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
  plot_attention(attention_plot, sentence.split(' '), result.split(' '))

FUNCTION FOR BLEU SCORE

In [52]:
def tok2word1(data,tokenizer):
    a=''
    for i in data:
      
      if tokenizer.index_word[i]=='<eos>':
        break
      
      a=a+' '+tokenizer.index_word[i]
    a=a.split('<sos>')[1][1:]+' '
    return a
def tok2word2(data,tokenizer):
    a=''
    for i in data:
      if tokenizer.index_word[i]=='<eos>':
        break

      a=a+' '+tokenizer.index_word[i]
    return a
index=random.sample(range(0,input_tensor_val.shape[0]),1000)  # random sample of 1000 validation sentences

def bleu_score(input_val,target_val):
  score=0
  for i in index: 
    inn=input_val[i]
    out=target_val[i]
    in_sen=tok2word1(inn,inp_lang_tokenizer)
    out_sen=tok2word2(out,targ_lang_tokenizer)
    ref=[out_sen.split(),]
    translation,_,_ = evaluate(in_sen)
    trans=translation.split()[:-1]
    res=bleu.sentence_bleu(ref, trans,)
    score=score+res
  score=score/1000
  print('avg. bleu score : ',score)
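The zero n-gram warnings printed under the SCORE cells below come from short sentences with no 2/3/4-gram overlap; nltk itself suggests smoothing. If desired, the sentence_bleu call inside bleu_score could pass nltk's SmoothingFunction (shown here as an optional variant, not what was run):

smooth = bleu.SmoothingFunction().method1
# inside bleu_score, the scoring line would become:
# res = bleu.sentence_bleu(ref, trans, smoothing_function=smooth)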

DOT

In [43]:
vocab_inp_size = len(inp_lang_tokenizer.word_index)+1
vocab_tar_size = len(targ_lang_tokenizer.word_index)+1
embedding_size=378
lstm_units=470
input_length=input_tensor.shape[1]
output_length=decoder_input_target_tensor_train.shape[1]
batch_size=48

encoder=Encoder(vocab_inp_size,embedding_size,lstm_units,input_length)
initial_state=encoder.initialize_states(batch_size)

scoring_fun='dot'
In [44]:
model  = encoder_decoder(vocab_inp_size,embedding_size,lstm_units,input_length,batch_size,vocab_tar_size,output_length,scoring_fun)
In [45]:
optimizer = tf.keras.optimizers.Adam()

model.compile(optimizer=optimizer,loss=loss_function)
In [46]:
model.fit([input_tensor_train,decoder_input_target_tensor_train], decoder_output_target_tensor_train,epochs=6,batch_size=48,validation_data=([input_tensor_val,decoder_input_target_tensor_val], decoder_output_target_tensor_val))
Epoch 1/6
5699/5699 [==============================] - 1624s 283ms/step - loss: 1.6256 - val_loss: 0.5345
Epoch 2/6
5699/5699 [==============================] - 1617s 284ms/step - loss: 0.4072 - val_loss: 0.2864
Epoch 3/6
5699/5699 [==============================] - 1639s 288ms/step - loss: 0.1912 - val_loss: 0.2244
Epoch 4/6
5699/5699 [==============================] - 1613s 283ms/step - loss: 0.1168 - val_loss: 0.1998
Epoch 5/6
5699/5699 [==============================] - 1612s 283ms/step - loss: 0.0825 - val_loss: 0.1916
Epoch 6/6
5699/5699 [==============================] - 1607s 282ms/step - loss: 0.0638 - val_loss: 0.1892
Out[46]:
<tensorflow.python.keras.callbacks.History at 0x7f9091b33940>
In [49]:
model.save_weights("/content/drive/My Drive/model_dot_1/dot_pos2.hdf5")
Model load
In [50]:
model.load_weights("/content/drive/My Drive/model_dot_1/dot_pos2.hdf5")
TRANSLATION
In [53]:
translate(u'amo la mela ')
print('Actual eng sentence : i love apple')
Input: <sos> amo la mela <eos>
Predicted translation: i love the apple . <eos> 
Actual eng sentence : i love apple
In [54]:
translate('non posso rispondere alla tua domanda ') 
print('Actual eng sentence : i can not answer your question ')
Input: <sos> non posso rispondere alla tua domanda <eos>
Predicted translation: i can not answer your question . <eos> 
Actual eng sentence : i can not answer your question 
In [55]:
translate(u'tom non sembrava essere molto interessato alla scuola ')
print('Actual eng sentence : tom did not seem to be very interested in school ')
Input: <sos> tom non sembrava essere molto interessato alla scuola <eos>
Predicted translation: tom did not seem to be very interested in school ? <eos> 
Actual eng sentence : tom did not seem to be very interested in school 
SCORE
In [58]:
bleu_score(input_tensor_val,decoder_output_target_tensor_val)
/usr/local/lib/python3.6/dist-packages/nltk/translate/bleu_score.py:490: UserWarning: 
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
  warnings.warn(_msg)
/usr/local/lib/python3.6/dist-packages/nltk/translate/bleu_score.py:490: UserWarning: 
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
  warnings.warn(_msg)
/usr/local/lib/python3.6/dist-packages/nltk/translate/bleu_score.py:490: UserWarning: 
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
  warnings.warn(_msg)
avg. bleu score :  0.8502854061401058

CONCAT

In [45]:
vocab_inp_size = len(inp_lang_tokenizer.word_index)+1
vocab_tar_size = len(targ_lang_tokenizer.word_index)+1
embedding_size=378
lstm_units=470
input_length=input_tensor.shape[1]
output_length=decoder_input_target_tensor_train.shape[1]
batch_size=48

steps_per_epoch = ((len(input_tensor_train)+1)//batch_size)+1
scoring_fun='concat'

encoder=Encoder(vocab_inp_size,embedding_size,lstm_units,input_length)
initial_state=encoder.initialize_states(batch_size)

model  = encoder_decoder(vocab_inp_size,embedding_size,lstm_units,input_length,batch_size,vocab_tar_size,output_length,scoring_fun)
In [46]:
optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer,loss=loss_function)
In [49]:
model.fit([input_tensor_train,decoder_input_target_tensor_train], decoder_output_target_tensor_train,epochs=6,batch_size=48,validation_data=([input_tensor_val,decoder_input_target_tensor_val], decoder_output_target_tensor_val))
In [54]:
model.save_weights('/content/drive/My Drive/model_concat_1/con_pos2.hdf5')
Model Load
In [ ]:
model.load_weights("/content/drive/My Drive/model_concat_1/con_pos2.hdf5")
TRANSLATION
In [55]:
translate(u'amo la mela .')
print('Actual eng sentence : i love apple')
Input: <sos> amo la mela . <eos>
Predicted translation: i love apple . <eos> 
Actual eng sentence : i love apple
In [56]:
translate('non posso rispondere alla tua domanda .') 
print('Actual eng sentence : i can not answer your question ')
Input: <sos> non posso rispondere alla tua domanda . <eos>
Predicted translation: i can not answer your question . <eos> 
Actual eng sentence : i can not answer your question 
In [57]:
translate(u'tom non sembrava essere molto interessato alla scuola .')
print('Actual eng sentence : tom did not seem to be very interested in school .')
Input: <sos> tom non sembrava essere molto interessato alla scuola . <eos>
Predicted translation: tom did not seem to be very interested in school . <eos> 
Actual eng sentence : tom did not seem to be very interested in school .
SCORE
In [58]:
bleu_score(input_tensor_val,decoder_output_target_tensor_val)
/usr/local/lib/python3.6/dist-packages/nltk/translate/bleu_score.py:490: UserWarning: 
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
  warnings.warn(_msg)
/usr/local/lib/python3.6/dist-packages/nltk/translate/bleu_score.py:490: UserWarning: 
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
  warnings.warn(_msg)
/usr/local/lib/python3.6/dist-packages/nltk/translate/bleu_score.py:490: UserWarning: 
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
  warnings.warn(_msg)
avg. bleu score :  0.8492603959084802

OBSERVATION

  1. The best BLEU score I am getting is 0.85, from the dot model.
  2. Both scoring functions give nearly the same score (0.8503 vs 0.8493).
  3. If I have to select the best model, I will go with the concat model: its score is almost the same, but its attention plots show better word-to-word alignment than dot.