From 7247d39bbcc233e0a2f00cc9c717b70da916a64d Mon Sep 17 00:00:00 2001 From: t-sagoy Date: Thu, 30 Apr 2020 12:53:28 +0000 Subject: [PATCH 1/6] FastCell Example Fixes --- .../pytorch/FastCells/fastcell_example.py | 10 ++++-- pytorch/edgeml_pytorch/trainer/fastTrainer.py | 32 ++++++++++++------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/examples/pytorch/FastCells/fastcell_example.py b/examples/pytorch/FastCells/fastcell_example.py index 9d55dd9d7..69a759c7e 100644 --- a/examples/pytorch/FastCells/fastcell_example.py +++ b/examples/pytorch/FastCells/fastcell_example.py @@ -11,7 +11,7 @@ def main(): # change cuda:0 to cuda:gpuid for specific allocation - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu") # Fixing seeds for reproducibility torch.manual_seed(42) np.random.seed(42) @@ -43,10 +43,16 @@ def main(): (dataDimension, numClasses, Xtrain, Ytrain, Xtest, Ytest, mean, std) = helpermethods.preProcessData(dataDir) - assert dataDimension % inputDims == 0, "Infeasible per step input, " + \ "Timesteps have to be integer" + timeSteps = int(Xtest.shape[1] / inputDims) + Xtest = np.reshape(Xtest, (-1, timeSteps, inputDims)) + Xtrain = Xtrain.reshape((-1, timeSteps, inputDims)) + if not args.batch_first: + Xtest = np.swapaxes(Xtest, 0, 1) + Xtrain = np.swapaxes(Xtrain, 0, 1) + currDir = helpermethods.createTimeStampDir(dataDir, cell) helpermethods.dumpCommand(sys.argv, currDir) diff --git a/pytorch/edgeml_pytorch/trainer/fastTrainer.py b/pytorch/edgeml_pytorch/trainer/fastTrainer.py index 3f0ebd338..e1aeb1547 100644 --- a/pytorch/edgeml_pytorch/trainer/fastTrainer.py +++ b/pytorch/edgeml_pytorch/trainer/fastTrainer.py @@ -77,9 +77,13 @@ def computeLogits(self, input): logits = self.classifier(feats[-1, :]) else: feats = self.RNN(input) - logits = self.classifier(feats[-1, :]) - - return logits, feats[:, -1] + if self.batch_first: + logits = self.classifier(feats[:, -1]) + feats_n = feats[:,-1] + else: + logits = self.classifier(feats[-1,:]) + feats_n = feats[-1,:] + return logits, feats_n def optimizer(self): ''' @@ -351,7 +355,13 @@ def train(self, batchSize, totalEpochs, Xtrain, Xtest, Ytrain, Ytest, ''' fileName = str(self.FastObj.cellType) + 'Results_pytorch.txt' resultFile = open(os.path.join(dataDir, fileName), 'a+') - numIters = int(np.ceil(float(Xtrain.shape[0]) / float(batchSize))) + if self.batch_first: + self.timeSteps = Xtrain.shape[1] + self.numPoints = Xtrain.shape[0] + else: + self.timeSteps = Xtrain.shape[0] + self.numPoints = Xtrain.shape[1] + numIters = int(np.ceil(float(self.numPoints) / float(batchSize))) totalBatches = numIters * totalEpochs counter = 0 @@ -362,11 +372,6 @@ def train(self, batchSize, totalEpochs, Xtrain, Xtest, Ytrain, Ytest, ihtDone = 1 maxTestAcc = -10000 header = '*' * 20 - self.timeSteps = int(Xtest.shape[1] / self.inputDims) - Xtest = Xtest.reshape((-1, self.timeSteps, self.inputDims)) - Xtest = np.swapaxes(Xtest, 0, 1) - Xtrain = Xtrain.reshape((-1, self.timeSteps, self.inputDims)) - Xtrain = np.swapaxes(Xtrain, 0, 1) for i in range(0, totalEpochs): print("\nEpoch Number: " + str(i), file=self.outFile) @@ -376,7 +381,7 @@ def train(self, batchSize, totalEpochs, Xtrain, Xtest, Ytrain, Ytest, for param_group in self.optimizer.param_groups: param_group['lr'] = self.learningRate - shuffled = list(range(Xtrain.shape[1])) + shuffled = list(range(self.numPoints)) np.random.shuffle(shuffled) trainAcc = 0.0 trainLoss = 0.0 @@ -389,9 +394,12 @@ def 
train(self, batchSize, totalEpochs, Xtrain, Xtest, Ytrain, Ytest, (header, msg, header), file=self.outFile) k = shuffled[j * batchSize:(j + 1) * batchSize] - batchX = Xtrain[:, k, :] + if self.batch_first: + batchX = Xtrain[k, :, :] + else: + batchX = Xtrain[:, k, :] + batchY = Ytrain[k] - self.optimizer.zero_grad() logits, _ = self.computeLogits(batchX.to(self.device)) batchLoss = self.loss(logits, batchY.to(self.device)) From e1958059b2e7de9b3a0489c2123f0ba593447d4e Mon Sep 17 00:00:00 2001 From: SachinG007 Date: Sat, 2 May 2020 23:44:36 +0000 Subject: [PATCH 2/6] LSTM logit fix for batch first --- pytorch/edgeml_pytorch/trainer/fastTrainer.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pytorch/edgeml_pytorch/trainer/fastTrainer.py b/pytorch/edgeml_pytorch/trainer/fastTrainer.py index e1aeb1547..f5d7ad307 100644 --- a/pytorch/edgeml_pytorch/trainer/fastTrainer.py +++ b/pytorch/edgeml_pytorch/trainer/fastTrainer.py @@ -74,16 +74,15 @@ def computeLogits(self, input): ''' if self.FastObj.cellType == "LSTMLR": feats, _ = self.RNN(input) - logits = self.classifier(feats[-1, :]) else: feats = self.RNN(input) - if self.batch_first: + + if self.batch_first: logits = self.classifier(feats[:, -1]) - feats_n = feats[:,-1] - else: - logits = self.classifier(feats[-1,:]) - feats_n = feats[-1,:] - return logits, feats_n + return logits, feats[:, -1] + else: + logits = self.classifier(feats[-1, :]) + return logits, feats[-1, :] def optimizer(self): ''' From 9427308ae2283bb8030d6da2c427cc65a4696d7c Mon Sep 17 00:00:00 2001 From: SachinG007 Date: Sun, 3 May 2020 00:18:55 +0000 Subject: [PATCH 3/6] rnnpool merge --- pytorch/edgeml_pytorch/graph/rnn.py | 201 ++++++++++++++++++++++------ 1 file changed, 157 insertions(+), 44 deletions(-) diff --git a/pytorch/edgeml_pytorch/graph/rnn.py b/pytorch/edgeml_pytorch/graph/rnn.py index 5a292ee00..988f7e495 100644 --- a/pytorch/edgeml_pytorch/graph/rnn.py +++ b/pytorch/edgeml_pytorch/graph/rnn.py @@ -144,8 +144,8 @@ def getVars(self): def get_model_size(self): ''' - Function to get aimed model size - ''' + Function to get aimed model size + ''' mats = self.getVars() endW = self._num_W_matrices endU = endW + self._num_U_matrices @@ -261,7 +261,7 @@ def __init__(self, input_size, hidden_size, gate_nonlinearity="sigmoid", self.zeta = nn.Parameter(self._zetaInit * torch.ones([1, 1])) self.nu = nn.Parameter(self._nuInit * torch.ones([1, 1])) - self.copy_previous_UW() + # self.copy_previous_UW() @property def name(self): @@ -330,7 +330,7 @@ class FastGRNNCUDACell(RNNCell): ''' def __init__(self, input_size, hidden_size, gate_nonlinearity="sigmoid", update_nonlinearity="tanh", wRank=None, uRank=None, zetaInit=1.0, nuInit=-4.0, wSparsity=1.0, uSparsity=1.0, name="FastGRNNCUDACell"): - super(FastGRNNCUDACell, self).__init__(input_size, hidden_size, gate_non_linearity, update_nonlinearity, + super(FastGRNNCUDACell, self).__init__(input_size, hidden_size, gate_nonlinearity, update_nonlinearity, 1, 1, 2, wRank, uRank, wSparsity, uSparsity) if utils.findCUDA() is None: raise Exception('FastGRNNCUDA is supported only on GPU devices.') @@ -967,63 +967,115 @@ class BaseRNN(nn.Module): [batchSize, timeSteps, inputDims] ''' - def __init__(self, cell: RNNCell, batch_first=False): + def __init__(self, cell: RNNCell, batch_first=False, cell_reverse: RNNCell=None, bidirectional=False): super(BaseRNN, self).__init__() - self._RNNCell = cell + self.RNNCell = cell self._batch_first = batch_first + self._bidirectional = bidirectional + if cell_reverse is 
not None: + self.RNNCell_reverse = cell_reverse + elif self._bidirectional: + self.RNNCell_reverse = cell def getVars(self): - return self._RNNCell.getVars() + return self.RNNCell.getVars() def forward(self, input, hiddenState=None, cellState=None): self.device = input.device + self.num_directions = 2 if self._bidirectional else 1 + # hidden + # for i in range(num_directions): hiddenStates = torch.zeros( [input.shape[0], input.shape[1], - self._RNNCell.output_size]).to(self.device) + self.RNNCell.output_size]).to(self.device) + + if self._bidirectional: + hiddenStates_reverse = torch.zeros( + [input.shape[0], input.shape[1], + self.RNNCell_reverse.output_size]).to(self.device) + if hiddenState is None: hiddenState = torch.zeros( - [input.shape[0] if self._batch_first else input.shape[1], - self._RNNCell.output_size]).to(self.device) + [self.num_directions, input.shape[0] if self._batch_first else input.shape[1], + self.RNNCell.output_size]).to(self.device) if self._batch_first is True: - if self._RNNCell.cellType == "LSTMLR": + if self.RNNCell.cellType == "LSTMLR": cellStates = torch.zeros( [input.shape[0], input.shape[1], - self._RNNCell.output_size]).to(self.device) + self.RNNCell.output_size]).to(self.device) + if self._bidirectional: + cellStates_reverse = torch.zeros( + [input.shape[0], input.shape[1], + self.RNNCell_reverse.output_size]).to(self.device) if cellState is None: cellState = torch.zeros( - [input.shape[0], self._RNNCell.output_size]).to(self.device) + [self.num_directions, input.shape[0], self.RNNCell.output_size]).to(self.device) for i in range(0, input.shape[1]): - hiddenState, cellState = self._RNNCell( - input[:, i, :], (hiddenState, cellState)) - hiddenStates[:, i, :] = hiddenState - cellStates[:, i, :] = cellState - return hiddenStates, cellStates + hiddenState[0], cellState[0] = self.RNNCell( + input[:, i, :], (hiddenState[0].clone(), cellState[0].clone())) + hiddenStates[:, i, :] = hiddenState[0] + cellStates[:, i, :] = cellState[0] + if self._bidirectional: + hiddenState[1], cellState[1] = self.RNNCell_reverse( + input[:, input.shape[1]-i-1, :], (hiddenState[1].clone(), cellState[1].clone())) + hiddenStates_reverse[:, i, :] = hiddenState[1] + cellStates_reverse[:, i, :] = cellState[1] + if not self._bidirectional: + return hiddenStates, cellStates + else: + return torch.cat([hiddenStates,hiddenStates_reverse],-1), torch.cat([cellStates,cellStates_reverse],-1) else: for i in range(0, input.shape[1]): - hiddenState = self._RNNCell(input[:, i, :], hiddenState) - hiddenStates[:, i, :] = hiddenState - return hiddenStates + hiddenState[0] = self.RNNCell(input[:, i, :], hiddenState[0].clone()) + hiddenStates[:, i, :] = hiddenState[0] + if self._bidirectional: + hiddenState[1] = self.RNNCell_reverse( + input[:, input.shape[1]-i-1, :], hiddenState[1].clone()) + hiddenStates_reverse[:, i, :] = hiddenState[1] + if not self._bidirectional: + return hiddenStates + else: + return torch.cat([hiddenStates,hiddenStates_reverse],-1) else: - if self._RNNCell.cellType == "LSTMLR": + if self.RNNCell.cellType == "LSTMLR": cellStates = torch.zeros( [input.shape[0], input.shape[1], - self._RNNCell.output_size]).to(self.device) + self.RNNCell.output_size]).to(self.device) + if self._bidirectional: + cellStates_reverse = torch.zeros( + [input.shape[0], input.shape[1], + self.RNNCell_reverse.output_size]).to(self.device) if cellState is None: cellState = torch.zeros( - [input.shape[1], self._RNNCell.output_size]).to(self.device) + [self.num_directions, input.shape[1], 
self.RNNCell.output_size]).to(self.device) for i in range(0, input.shape[0]): - hiddenState, cellState = self._RNNCell( - input[i, :, :], (hiddenState, cellState)) - hiddenStates[i, :, :] = hiddenState - cellStates[i, :, :] = cellState - return hiddenStates, cellStates + hiddenState[0], cellState[0] = self.RNNCell( + input[i, :, :], (hiddenState[0].clone(), cellState[0].clone())) + hiddenStates[i, :, :] = hiddenState[0] + cellStates[i, :, :] = cellState[0] + if self._bidirectional: + hiddenState[1], cellState[1] = self.RNNCell_reverse( + input[input.shape[0]-i-1, :, :], (hiddenState[1].clone(), cellState[1].clone())) + hiddenStates_reverse[i, :, :] = hiddenState[1] + cellStates_reverse[i, :, :] = cellState[1] + if not self._bidirectional: + return hiddenStates, cellStates + else: + return torch.cat([hiddenStates,hiddenStates_reverse],-1), torch.cat([cellStates,cellStates_reverse],-1) else: for i in range(0, input.shape[0]): - hiddenState = self._RNNCell(input[i, :, :], hiddenState) - hiddenStates[i, :, :] = hiddenState - return hiddenStates + hiddenState[0] = self.RNNCell(input[i, :, :], hiddenState[0].clone()) + hiddenStates[i, :, :] = hiddenState[0] + if self._bidirectional: + hiddenState[1] = self.RNNCell_reverse( + input[input.shape[0]-i-1, :, :], hiddenState[1].clone()) + hiddenStates_reverse[i, :, :] = hiddenState[1] + if not self._bidirectional: + return hiddenStates + else: + return torch.cat([hiddenStates,hiddenStates_reverse],-1) class LSTM(nn.Module): @@ -1031,14 +1083,26 @@ class LSTM(nn.Module): def __init__(self, input_size, hidden_size, gate_nonlinearity="sigmoid", update_nonlinearity="tanh", wRank=None, uRank=None, - wSparsity=1.0, uSparsity=1.0, batch_first=False): + wSparsity=1.0, uSparsity=1.0, batch_first=False, + bidirectional=False, is_shared_bidirectional=True): super(LSTM, self).__init__() + self._bidirectional = bidirectional + self._batch_first = batch_first + self._is_shared_bidirectional = is_shared_bidirectional self.cell = LSTMLRCell(input_size, hidden_size, gate_nonlinearity=gate_nonlinearity, update_nonlinearity=update_nonlinearity, wRank=wRank, uRank=uRank, wSparsity=wSparsity, uSparsity=uSparsity) - self.unrollRNN = BaseRNN(self.cell, batch_first=batch_first) + self.unrollRNN = BaseRNN(self.cell, batch_first=self._batch_first, bidirectional=self._bidirectional) + + if self._bidirectional is True and self._is_shared_bidirectional is False: + self.cell_reverse = LSTMLRCell(input_size, hidden_size, + gate_nonlinearity=gate_nonlinearity, + update_nonlinearity=update_nonlinearity, + wRank=wRank, uRank=uRank, + wSparsity=wSparsity, uSparsity=uSparsity) + self.unrollRNN = BaseRNN(self.cell, self.cell_reverse, batch_first=self._batch_first, bidirectional=self._bidirectional) def forward(self, input, hiddenState=None, cellState=None): return self.unrollRNN(input, hiddenState, cellState) @@ -1049,14 +1113,26 @@ class GRU(nn.Module): def __init__(self, input_size, hidden_size, gate_nonlinearity="sigmoid", update_nonlinearity="tanh", wRank=None, uRank=None, - wSparsity=1.0, uSparsity=1.0, batch_first=False): + wSparsity=1.0, uSparsity=1.0, batch_first=False, + bidirectional=False, is_shared_bidirectional=True): super(GRU, self).__init__() + self._bidirectional = bidirectional + self._batch_first = batch_first + self._is_shared_bidirectional = is_shared_bidirectional self.cell = GRULRCell(input_size, hidden_size, gate_nonlinearity=gate_nonlinearity, update_nonlinearity=update_nonlinearity, wRank=wRank, uRank=uRank, wSparsity=wSparsity, uSparsity=uSparsity) - 
self.unrollRNN = BaseRNN(self.cell, batch_first=batch_first) + self.unrollRNN = BaseRNN(self.cell, batch_first=self._batch_first, bidirectional=self._bidirectional) + + if self._bidirectional is True and self._is_shared_bidirectional is False: + self.cell_reverse = GRULRCell(input_size, hidden_size, + gate_nonlinearity=gate_nonlinearity, + update_nonlinearity=update_nonlinearity, + wRank=wRank, uRank=uRank, + wSparsity=wSparsity, uSparsity=uSparsity) + self.unrollRNN = BaseRNN(self.cell, self.cell_reverse, batch_first=self._batch_first, bidirectional=self._bidirectional) def forward(self, input, hiddenState=None, cellState=None): return self.unrollRNN(input, hiddenState, cellState) @@ -1067,14 +1143,26 @@ class UGRNN(nn.Module): def __init__(self, input_size, hidden_size, gate_nonlinearity="sigmoid", update_nonlinearity="tanh", wRank=None, uRank=None, - wSparsity=1.0, uSparsity=1.0, batch_first=False): + wSparsity=1.0, uSparsity=1.0, batch_first=False, + bidirectional=False, is_shared_bidirectional=True): super(UGRNN, self).__init__() + self._bidirectional = bidirectional + self._batch_first = batch_first + self._is_shared_bidirectional = is_shared_bidirectional self.cell = UGRNNLRCell(input_size, hidden_size, gate_nonlinearity=gate_nonlinearity, update_nonlinearity=update_nonlinearity, wRank=wRank, uRank=uRank, wSparsity=wSparsity, uSparsity=uSparsity) - self.unrollRNN = BaseRNN(self.cell, batch_first=batch_first) + self.unrollRNN = BaseRNN(self.cell, batch_first=self._batch_first, bidirectional=self._bidirectional) + + if self._bidirectional is True and self._is_shared_bidirectional is False: + self.cell_reverse = UGRNNLRCell(input_size, hidden_size, + gate_nonlinearity=gate_nonlinearity, + update_nonlinearity=update_nonlinearity, + wRank=wRank, uRank=uRank, + wSparsity=wSparsity, uSparsity=uSparsity) + self.unrollRNN = BaseRNN(self.cell, self.cell_reverse, batch_first=self._batch_first, bidirectional=self._bidirectional) def forward(self, input, hiddenState=None, cellState=None): return self.unrollRNN(input, hiddenState, cellState) @@ -1085,15 +1173,28 @@ class FastRNN(nn.Module): def __init__(self, input_size, hidden_size, gate_nonlinearity="sigmoid", update_nonlinearity="tanh", wRank=None, uRank=None, - wSparsity=1.0, uSparsity=1.0, alphaInit=-3.0, betaInit=3.0, batch_first=False): + wSparsity=1.0, uSparsity=1.0, alphaInit=-3.0, betaInit=3.0, + batch_first=False, bidirectional=False, is_shared_bidirectional=True): super(FastRNN, self).__init__() + self._bidirectional = bidirectional + self._batch_first = batch_first + self._is_shared_bidirectional = is_shared_bidirectional self.cell = FastRNNCell(input_size, hidden_size, gate_nonlinearity=gate_nonlinearity, update_nonlinearity=update_nonlinearity, wRank=wRank, uRank=uRank, wSparsity=wSparsity, uSparsity=uSparsity, alphaInit=alphaInit, betaInit=betaInit) - self.unrollRNN = BaseRNN(self.cell, batch_first=batch_first) + self.unrollRNN = BaseRNN(self.cell, batch_first=self._batch_first, bidirectional=self._bidirectional) + + if self._bidirectional is True and self._is_shared_bidirectional is False: + self.cell_reverse = FastRNNCell(input_size, hidden_size, + gate_nonlinearity=gate_nonlinearity, + update_nonlinearity=update_nonlinearity, + wRank=wRank, uRank=uRank, + wSparsity=wSparsity, uSparsity=uSparsity, + alphaInit=alphaInit, betaInit=betaInit) + self.unrollRNN = BaseRNN(self.cell, self.cell_reverse, batch_first=self._batch_first, bidirectional=self._bidirectional) def forward(self, input, hiddenState=None, cellState=None): return 
self.unrollRNN(input, hiddenState, cellState) @@ -1105,15 +1206,27 @@ class FastGRNN(nn.Module): def __init__(self, input_size, hidden_size, gate_nonlinearity="sigmoid", update_nonlinearity="tanh", wRank=None, uRank=None, wSparsity=1.0, uSparsity=1.0, zetaInit=1.0, nuInit=-4.0, - batch_first=False): + batch_first=False, bidirectional=False, is_shared_bidirectional=True): super(FastGRNN, self).__init__() + self._bidirectional = bidirectional + self._batch_first = batch_first + self._is_shared_bidirectional = is_shared_bidirectional self.cell = FastGRNNCell(input_size, hidden_size, gate_nonlinearity=gate_nonlinearity, update_nonlinearity=update_nonlinearity, wRank=wRank, uRank=uRank, wSparsity=wSparsity, uSparsity=uSparsity, zetaInit=zetaInit, nuInit=nuInit) - self.unrollRNN = BaseRNN(self.cell, batch_first=batch_first) + self.unrollRNN = BaseRNN(self.cell, batch_first=self._batch_first, bidirectional=self._bidirectional) + + if self._bidirectional is True and self._is_shared_bidirectional is False: + self.cell_reverse = FastGRNNCell(input_size, hidden_size, + gate_nonlinearity=gate_nonlinearity, + update_nonlinearity=update_nonlinearity, + wRank=wRank, uRank=uRank, + wSparsity=wSparsity, uSparsity=uSparsity, + zetaInit=zetaInit, nuInit=nuInit) + self.unrollRNN = BaseRNN(self.cell, self.cell_reverse, batch_first=self._batch_first, bidirectional=self._bidirectional) def getVars(self): return self.unrollRNN.getVars() @@ -1222,8 +1335,8 @@ def getVars(self): def get_model_size(self): ''' - Function to get aimed model size - ''' + Function to get aimed model size + ''' mats = self.getVars() endW = self._num_W_matrices endU = endW + self._num_U_matrices From 87c174017655854e37220cc4453c11bd01fb2647 Mon Sep 17 00:00:00 2001 From: SachinG007 Date: Sun, 3 May 2020 14:18:56 +0000 Subject: [PATCH 4/6] cuda gpu number fix --- examples/pytorch/FastCells/fastcell_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/FastCells/fastcell_example.py b/examples/pytorch/FastCells/fastcell_example.py index 69a759c7e..6267fc0ae 100644 --- a/examples/pytorch/FastCells/fastcell_example.py +++ b/examples/pytorch/FastCells/fastcell_example.py @@ -11,7 +11,7 @@ def main(): # change cuda:0 to cuda:gpuid for specific allocation - device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu") + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Fixing seeds for reproducibility torch.manual_seed(42) np.random.seed(42) From fe862302e1b48ea68e5b7265234489c06232767a Mon Sep 17 00:00:00 2001 From: SachinG007 Date: Wed, 6 May 2020 20:44:02 +0000 Subject: [PATCH 5/6] PR173 merged, optimizer changes --- .../pytorch/FastCells/fastcell_example.py | 9 +++--- pytorch/edgeml_pytorch/trainer/fastTrainer.py | 31 ++++++++++--------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/examples/pytorch/FastCells/fastcell_example.py b/examples/pytorch/FastCells/fastcell_example.py index 6267fc0ae..91674894a 100644 --- a/examples/pytorch/FastCells/fastcell_example.py +++ b/examples/pytorch/FastCells/fastcell_example.py @@ -46,12 +46,13 @@ def main(): assert dataDimension % inputDims == 0, "Infeasible per step input, " + \ "Timesteps have to be integer" - timeSteps = int(Xtest.shape[1] / inputDims) - Xtest = np.reshape(Xtest, (-1, timeSteps, inputDims)) + timeSteps = int(dataDimension / inputDims) Xtrain = Xtrain.reshape((-1, timeSteps, inputDims)) - if not args.batch_first: - Xtest = np.swapaxes(Xtest, 0, 1) + Xtest = Xtest.reshape((-1, timeSteps, 
inputDims)) + + if not batch_first: Xtrain = np.swapaxes(Xtrain, 0, 1) + Xtest = np.swapaxes(Xtest, 0, 1) currDir = helpermethods.createTimeStampDir(dataDir, cell) diff --git a/pytorch/edgeml_pytorch/trainer/fastTrainer.py b/pytorch/edgeml_pytorch/trainer/fastTrainer.py index f5d7ad307..32c96b68d 100644 --- a/pytorch/edgeml_pytorch/trainer/fastTrainer.py +++ b/pytorch/edgeml_pytorch/trainer/fastTrainer.py @@ -9,6 +9,14 @@ from edgeml_pytorch.graph.rnn import * import numpy as np +class SimpleFC(nn.Module): + def __init__(self, input_size, num_classes, name="SimpleFC"): + super(SimpleFC, self).__init__() + self.FC = nn.Parameter(torch.randn([input_size, num_classes])) + self.FCbias = nn.Parameter(torch.randn([num_classes])) + + def forward(self, input): + return torch.matmul(input, self.FC) + self.FCbias class FastTrainer: @@ -50,23 +58,17 @@ def __init__(self, FastObj, numClasses, sW=1.0, sU=1.0, self.numMatrices = self.FastObj.num_weight_matrices self.totalMatrices = self.numMatrices[0] + self.numMatrices[1] - self.optimizer = self.optimizer() - self.RNN = BaseRNN(self.FastObj, batch_first=self.batch_first).to(self.device) - - self.FC = nn.Parameter(torch.randn( - [self.FastObj.output_size, self.numClasses])).to(self.device) - self.FCbias = nn.Parameter(torch.randn( - [self.numClasses])).to(self.device) - + self.simpleFC = SimpleFC(self.FastObj.output_size, self.numClasses).to(self.device) self.FastParams = self.FastObj.getVars() + self.optimizer = self.optimizer() def classifier(self, feats): ''' Can be raplaced by any classifier TODO: Make this a separate class if needed ''' - return torch.matmul(feats, self.FC) + self.FCbias + return self.simpleFC(feats) def computeLogits(self, input): ''' @@ -88,8 +90,9 @@ def optimizer(self): ''' Optimizer for FastObj Params ''' + paramList = list(self.FastObj.parameters()) + list(self.simpleFC.parameters()) optimizer = torch.optim.Adam( - self.FastObj.parameters(), lr=self.learningRate) + paramList, lr=self.learningRate) return optimizer @@ -171,12 +174,12 @@ def getModelSize(self): hasSparse = hasSparse or sparseFlag # Replace this with classifier class call - nnz, size, sparseFlag = utils.estimateNNZ(self.FC, 1.0) + nnz, size, sparseFlag = utils.estimateNNZ(self.simpleFC.FC, 1.0) totalnnZ += nnz totalSize += size hasSparse = hasSparse or sparseFlag - nnz, size, sparseFlag = utils.estimateNNZ(self.FCbias, 1.0) + nnz, size, sparseFlag = utils.estimateNNZ(self.simpleFC.FCbias, 1.0) totalnnZ += nnz totalSize += size hasSparse = hasSparse or sparseFlag @@ -344,8 +347,8 @@ def saveParams(self, currDir): np.save(os.path.join(currDir, "Bo.npy"), self.FastParams[self.totalMatrices + 3].data.cpu()) - np.save(os.path.join(currDir, "FC.npy"), self.FC.data.cpu()) - np.save(os.path.join(currDir, "FCbias.npy"), self.FCbias.data.cpu()) + np.save(os.path.join(currDir, "FC.npy"), self.simpleFC.FC.data.cpu()) + np.save(os.path.join(currDir, "FCbias.npy"), self.simpleFC.FCbias.data.cpu()) def train(self, batchSize, totalEpochs, Xtrain, Xtest, Ytrain, Ytest, decayStep, decayRate, dataDir, currDir): From 9b050b81a91d9190a697bbdd3a7078db2e25fd74 Mon Sep 17 00:00:00 2001 From: SachinG007 Date: Sun, 10 May 2020 15:56:19 +0000 Subject: [PATCH 6/6] Resolved few comments by aditya --- pytorch/edgeml_pytorch/graph/rnn.py | 49 +++++++++++++++-------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/pytorch/edgeml_pytorch/graph/rnn.py b/pytorch/edgeml_pytorch/graph/rnn.py index 988f7e495..ceed5a5e1 100644 --- a/pytorch/edgeml_pytorch/graph/rnn.py +++ 
b/pytorch/edgeml_pytorch/graph/rnn.py @@ -969,7 +969,7 @@ class BaseRNN(nn.Module): def __init__(self, cell: RNNCell, batch_first=False, cell_reverse: RNNCell=None, bidirectional=False): super(BaseRNN, self).__init__() - self.RNNCell = cell + self._RNNCell = cell self._batch_first = batch_first self._bidirectional = bidirectional if cell_reverse is not None: @@ -978,47 +978,50 @@ def __init__(self, cell: RNNCell, batch_first=False, cell_reverse: RNNCell=None, self.RNNCell_reverse = cell def getVars(self): - return self.RNNCell.getVars() + return self._RNNCell.getVars() def forward(self, input, hiddenState=None, cellState=None): self.device = input.device self.num_directions = 2 if self._bidirectional else 1 - # hidden - # for i in range(num_directions): + if self._bidirectional: + self.num_directions = 2 + else: + self.num_directions = 1 + hiddenStates = torch.zeros( [input.shape[0], input.shape[1], - self.RNNCell.output_size]).to(self.device) + self._RNNCell.output_size]).to(self.device) if self._bidirectional: hiddenStates_reverse = torch.zeros( [input.shape[0], input.shape[1], - self.RNNCell_reverse.output_size]).to(self.device) + self._RNNCell_reverse.output_size]).to(self.device) if hiddenState is None: hiddenState = torch.zeros( [self.num_directions, input.shape[0] if self._batch_first else input.shape[1], - self.RNNCell.output_size]).to(self.device) + self._RNNCell.output_size]).to(self.device) if self._batch_first is True: - if self.RNNCell.cellType == "LSTMLR": + if self._RNNCell.cellType == "LSTMLR": cellStates = torch.zeros( [input.shape[0], input.shape[1], - self.RNNCell.output_size]).to(self.device) + self._RNNCell.output_size]).to(self.device) if self._bidirectional: cellStates_reverse = torch.zeros( [input.shape[0], input.shape[1], - self.RNNCell_reverse.output_size]).to(self.device) + self._RNNCell_reverse.output_size]).to(self.device) if cellState is None: cellState = torch.zeros( - [self.num_directions, input.shape[0], self.RNNCell.output_size]).to(self.device) + [self.num_directions, input.shape[0], self._RNNCell.output_size]).to(self.device) for i in range(0, input.shape[1]): - hiddenState[0], cellState[0] = self.RNNCell( + hiddenState[0], cellState[0] = self._RNNCell( input[:, i, :], (hiddenState[0].clone(), cellState[0].clone())) hiddenStates[:, i, :] = hiddenState[0] cellStates[:, i, :] = cellState[0] if self._bidirectional: - hiddenState[1], cellState[1] = self.RNNCell_reverse( + hiddenState[1], cellState[1] = self._RNNCell_reverse( input[:, input.shape[1]-i-1, :], (hiddenState[1].clone(), cellState[1].clone())) hiddenStates_reverse[:, i, :] = hiddenState[1] cellStates_reverse[:, i, :] = cellState[1] @@ -1028,10 +1031,10 @@ def forward(self, input, hiddenState=None, return torch.cat([hiddenStates,hiddenStates_reverse],-1), torch.cat([cellStates,cellStates_reverse],-1) else: for i in range(0, input.shape[1]): - hiddenState[0] = self.RNNCell(input[:, i, :], hiddenState[0].clone()) + hiddenState[0] = self._RNNCell(input[:, i, :], hiddenState[0].clone()) hiddenStates[:, i, :] = hiddenState[0] if self._bidirectional: - hiddenState[1] = self.RNNCell_reverse( + hiddenState[1] = self._RNNCell_reverse( input[:, input.shape[1]-i-1, :], hiddenState[1].clone()) hiddenStates_reverse[:, i, :] = hiddenState[1] if not self._bidirectional: @@ -1039,24 +1042,24 @@ def forward(self, input, hiddenState=None, else: return torch.cat([hiddenStates,hiddenStates_reverse],-1) else: - if self.RNNCell.cellType == "LSTMLR": + if self._RNNCell.cellType == "LSTMLR": cellStates = torch.zeros( 
[input.shape[0], input.shape[1], - self.RNNCell.output_size]).to(self.device) + self._RNNCell.output_size]).to(self.device) if self._bidirectional: cellStates_reverse = torch.zeros( [input.shape[0], input.shape[1], - self.RNNCell_reverse.output_size]).to(self.device) + self._RNNCell_reverse.output_size]).to(self.device) if cellState is None: cellState = torch.zeros( - [self.num_directions, input.shape[1], self.RNNCell.output_size]).to(self.device) + [self.num_directions, input.shape[1], self._RNNCell.output_size]).to(self.device) for i in range(0, input.shape[0]): - hiddenState[0], cellState[0] = self.RNNCell( + hiddenState[0], cellState[0] = self._RNNCell( input[i, :, :], (hiddenState[0].clone(), cellState[0].clone())) hiddenStates[i, :, :] = hiddenState[0] cellStates[i, :, :] = cellState[0] if self._bidirectional: - hiddenState[1], cellState[1] = self.RNNCell_reverse( + hiddenState[1], cellState[1] = self._RNNCell_reverse( input[input.shape[0]-i-1, :, :], (hiddenState[1].clone(), cellState[1].clone())) hiddenStates_reverse[i, :, :] = hiddenState[1] cellStates_reverse[i, :, :] = cellState[1] @@ -1066,10 +1069,10 @@ def forward(self, input, hiddenState=None, return torch.cat([hiddenStates,hiddenStates_reverse],-1), torch.cat([cellStates,cellStates_reverse],-1) else: for i in range(0, input.shape[0]): - hiddenState[0] = self.RNNCell(input[i, :, :], hiddenState[0].clone()) + hiddenState[0] = self._RNNCell(input[i, :, :], hiddenState[0].clone()) hiddenStates[i, :, :] = hiddenState[0] if self._bidirectional: - hiddenState[1] = self.RNNCell_reverse( + hiddenState[1] = self._RNNCell_reverse( input[input.shape[0]-i-1, :, :], hiddenState[1].clone()) hiddenStates_reverse[i, :, :] = hiddenState[1] if not self._bidirectional:
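
Usage sketch (not part of the patches above): the series threads a batch_first flag through the FastCell example, FastTrainer and the RNN wrappers in pytorch/edgeml_pytorch/graph/rnn.py, and adds optional bidirectional / is_shared_bidirectional arguments to those wrappers. A minimal illustration of the batch-first path, with tensor sizes chosen arbitrarily for the example:

    import torch
    from edgeml_pytorch.graph.rnn import FastGRNN

    batch, timeSteps, inputDims, hiddenDims = 8, 10, 16, 32

    # batch-first input is [batch, timeSteps, inputDims]; the time-major
    # (batch_first=False) layout used elsewhere swaps the first two axes.
    x = torch.randn(batch, timeSteps, inputDims)

    rnn = FastGRNN(inputDims, hiddenDims, batch_first=True)
    feats = rnn(x)          # hidden state at every step: [batch, timeSteps, hiddenDims]
    last = feats[:, -1]     # last step, as computeLogits reads it when batch_first is set

When bidirectional is enabled, BaseRNN concatenates the forward and reverse hidden states along the last dimension, so a classifier fed from that output should expect 2 * hiddenDims features per step.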