"""
lstm.py
Advanced LSTM implementation with PyTorch.
Forked from shiv08/Advanced-LSTM-Implementation-with-PyTorch.
"""
import torch
import torch.nn as nn
from typing import Tuple, List, Optional


class LSTMCell(nn.Module):
"""
LSTM Cell implementation with layer normalization.
Mathematical formulation of LSTM:
f_t = σ(W_f · [h_{t-1}, x_t] + b_f) # Forget gate
i_t = σ(W_i · [h_{t-1}, x_t] + b_i) # Input gate
g_t = tanh(W_g · [h_{t-1}, x_t] + b_g) # Candidate cell state
o_t = σ(W_o · [h_{t-1}, x_t] + b_o) # Output gate
c_t = f_t ⊙ c_{t-1} + i_t ⊙ g_t # New cell state
h_t = o_t ⊙ tanh(c_t) # New hidden state
where:
- σ is the sigmoid function
- ⊙ is element-wise multiplication
- [h_{t-1}, x_t] represents concatenation
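
    A minimal single-step usage sketch (shapes are illustrative):

        cell = LSTMCell(input_size=8, hidden_size=16)
        x = torch.randn(4, 8)                          # (batch_size, input_size)
        h, c = cell.init_hidden(batch_size=4, device=x.device)
        h, c = cell(x, (h, c))                         # h, c: (4, 16) each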
"""
def __init__(self, input_size: int, hidden_size: int, dropout: float = 0.0):
super().__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.dropout = nn.Dropout(dropout) if dropout > 0 else None
# Combined weight matrices for efficiency
# W_ih combines weights for [i_t, f_t, g_t, o_t] for input x_t
# W_hh combines weights for [i_t, f_t, g_t, o_t] for hidden state h_{t-1}
self.weight_ih = nn.Linear(input_size, 4 * hidden_size)
self.weight_hh = nn.Linear(hidden_size, 4 * hidden_size)
        # Layer normalization for better training stability
        self.layer_norm_x = nn.LayerNorm(4 * hidden_size)  # normalize input-to-hidden gate pre-activations
        self.layer_norm_h = nn.LayerNorm(hidden_size)      # normalize hidden state
        self.layer_norm_c = nn.LayerNorm(hidden_size)      # normalize cell state
self.init_parameters()
    def init_parameters(self) -> None:
        """
        Initialize parameters using best practices:
        1. Orthogonal initialization of the weight matrices for better gradient flow.
        2. Forget gate bias initialized to 1.0 to prevent forgetting at the start
           of training (helps with learning long sequences).
        """
        for weight in [self.weight_ih.weight, self.weight_hh.weight]:
            nn.init.orthogonal_(weight)
        # Set the forget gate bias to 1.0 on the hidden-to-hidden path only:
        # the input-to-hidden pre-activations pass through layer_norm_x, which
        # re-centers them (largely cancelling a constant bias there), and setting
        # the bias on both paths would double the effective forget bias.
        nn.init.constant_(self.weight_hh.bias[self.hidden_size:2 * self.hidden_size], 1.0)
def forward(self, x: torch.Tensor,
hidden_state: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Forward pass of LSTM cell.
Args:
x: Input tensor of shape (batch_size, input_size)
hidden_state: Tuple of (h_{t-1}, c_{t-1}) each of shape (batch_size, hidden_size)
Returns:
Tuple of (h_t, c_t) representing new hidden and cell states
"""
h_prev, c_prev = hidden_state
# Combined matrix multiplication for all gates
# Shape: (batch_size, 4 * hidden_size)
gates_x = self.weight_ih(x) # Transform input
gates_h = self.weight_hh(h_prev) # Transform previous hidden state
        # Normalize the input-to-hidden pre-activations (the hidden-to-hidden
        # path is left un-normalized, so its forget-gate bias is preserved)
        gates_x = self.layer_norm_x(gates_x)
        gates = gates_x + gates_h  # combined gate pre-activations
# Split into individual gates
# Each gate shape: (batch_size, hidden_size)
i_gate, f_gate, g_gate, o_gate = gates.chunk(4, dim=1)
# Apply gate non-linearities
i_t = torch.sigmoid(i_gate) # Input gate
f_t = torch.sigmoid(f_gate) # Forget gate
g_t = torch.tanh(g_gate) # Cell state candidate
o_t = torch.sigmoid(o_gate) # Output gate
# Update cell state: c_t = f_t ⊙ c_{t-1} + i_t ⊙ g_t
c_t = f_t * c_prev + i_t * g_t
c_t = self.layer_norm_c(c_t)
# Update hidden state: h_t = o_t ⊙ tanh(c_t)
h_t = o_t * torch.tanh(c_t)
h_t = self.layer_norm_h(h_t)
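        # Note: dropout below is applied to the hidden state itself, so it
        # affects both this layer's output and the recurrent state carried
        # forward to the next timestep.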
if self.dropout is not None:
h_t = self.dropout(h_t)
return h_t, c_t
def init_hidden(self, batch_size: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
"""Initialize hidden state and cell state with zeros."""
return (torch.zeros(batch_size, self.hidden_size, device=device),
torch.zeros(batch_size, self.hidden_size, device=device))


class StackedLSTM(nn.Module):
"""
Stacked LSTM implementation supporting multiple layers.
Each layer processes the output of the previous layer.
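
    A minimal usage sketch (shapes are illustrative):

        lstm = StackedLSTM(input_size=8, hidden_size=16, num_layers=2)
        x = torch.randn(4, 10, 8)      # (batch, seq_length, features)
        out, states = lstm(x)          # out: (4, 10, 16); states: one (h, c) per layer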
"""
def __init__(self, input_size: int, hidden_size: int, num_layers: int, dropout: float = 0.0):
super().__init__()
self.num_layers = num_layers
self.hidden_size = hidden_size
# Create list of LSTM cells, one for each layer
self.layers = nn.ModuleList([
LSTMCell(input_size if i == 0 else hidden_size, hidden_size,
dropout if i < num_layers - 1 else 0.0) # No dropout on last layer
for i in range(num_layers)
])
def forward(self, x: torch.Tensor,
hidden_states: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
"""
Process input sequence through stacked LSTM layers.
Args:
x: Input tensor of shape (batch_size, seq_length, input_size)
hidden_states: Optional initial hidden states for each layer
Returns:
Tuple of (output, hidden_states) where output has shape (batch_size, seq_length, hidden_size)
"""
batch_size, seq_length, _ = x.size()
device = x.device
if hidden_states is None:
hidden_states = [layer.init_hidden(batch_size, device) for layer in self.layers]
layer_outputs = []
for t in range(seq_length):
input_t = x[:, t, :]
for i, lstm_cell in enumerate(self.layers):
input_t, cell_state = lstm_cell(input_t, hidden_states[i])
hidden_states[i] = (input_t, cell_state)
layer_outputs.append(input_t)
# Stack outputs along sequence dimension
output = torch.stack(layer_outputs, dim=1)
return output, hidden_states


class LSTMNetwork(nn.Module):
"""
Complete LSTM network with bidirectional support.
In bidirectional mode:
- Forward LSTM processes sequence from left to right
- Backward LSTM processes sequence from right to left
- Outputs are concatenated for final prediction
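
    A minimal usage sketch (shapes are illustrative):

        net = LSTMNetwork(input_size=8, hidden_size=16, num_layers=2,
                          output_size=1, bidirectional=True)
        y = net(torch.randn(4, 10, 8))  # y: (4, 1)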
"""
def __init__(self,
input_size: int,
hidden_size: int,
num_layers: int,
output_size: int,
dropout: float = 0.0,
bidirectional: bool = False):
super().__init__()
self.bidirectional = bidirectional
# Forward direction LSTM
self.stacked_lstm = StackedLSTM(input_size, hidden_size, num_layers, dropout)
# Optional backward direction LSTM for bidirectional processing
if bidirectional:
self.reverse_lstm = StackedLSTM(input_size, hidden_size, num_layers, dropout)
hidden_size *= 2 # Double hidden size due to concatenation
self.fc = nn.Linear(hidden_size, output_size)
self.dropout = nn.Dropout(dropout)
def forward(self, x: torch.Tensor,
hidden_states: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None) -> torch.Tensor:
"""
Forward pass of the network.
For bidirectional processing:
1. Process sequence normally with forward LSTM
2. Process reversed sequence with backward LSTM
3. Concatenate both outputs
4. Apply final linear transformation
Args:
x: Input tensor of shape (batch_size, seq_length, input_size)
hidden_states: Optional initial hidden states
Returns:
Output tensor of shape (batch_size, output_size)
"""
# Forward direction
output, hidden_states = self.stacked_lstm(x, hidden_states)
if self.bidirectional:
# Process sequence in reverse direction
reverse_output, _ = self.reverse_lstm(torch.flip(x, [1]))
# Flip back to align with forward sequence
reverse_output = torch.flip(reverse_output, [1])
# Concatenate forward and backward outputs along feature dimension
output = torch.cat([output, reverse_output], dim=-1)
        # Apply dropout before the final layer
        output = self.dropout(output)
        # Use the final timestep for prediction. Note that in bidirectional
        # mode the backward half of this timestep has only processed x_T
        # (outputs were re-aligned to the forward order); taking the backward
        # output at t = 0 instead is a common alternative summary.
        final_output = self.fc(output[:, -1, :])
        return final_output


def create_lstm_model(config: dict) -> LSTMNetwork:
"""
Factory function to create an LSTM model with specified configuration.
Args:
config: Dictionary containing model parameters:
- input_size: Size of input features
- hidden_size: Size of LSTM hidden state
- num_layers: Number of stacked LSTM layers
- output_size: Size of final output
- dropout: Dropout probability (optional)
- bidirectional: Whether to use bidirectional LSTM (optional)
"""
return LSTMNetwork(
input_size=config['input_size'],
hidden_size=config['hidden_size'],
num_layers=config['num_layers'],
output_size=config['output_size'],
dropout=config.get('dropout', 0.0),
bidirectional=config.get('bidirectional', False)
)


# Example usage
if __name__ == "__main__":
# Configuration for a bidirectional LSTM
config = {
'input_size': 3,
'hidden_size': 64,
'num_layers': 2,
'output_size': 1,
'dropout': 0.3,
'bidirectional': True # Enable bidirectional processing
}
# Create model
model = create_lstm_model(config)
# Generate dummy input
batch_size, seq_length = 32, 10
x = torch.randn(batch_size, seq_length, config['input_size'])
# Forward pass
output = model(x)
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")