Skip to content

Commit b43c8d5

Browse files
committed
fix(Dense): correct temporal data handling in forward and backward passes
1 parent 244a6f2 commit b43c8d5

File tree

3 files changed

+99
-85
lines changed

3 files changed

+99
-85
lines changed

examples/classification-regression/mnist_multiclass.ipynb

Lines changed: 37 additions & 39 deletions
Large diffs are not rendered by default.

examples/classification-regression/sentiment_analysis.ipynb

Lines changed: 38 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
"execution_count": 1,
2222
"metadata": {
2323
"ExecuteTime": {
24-
"end_time": "2024-11-06T21:51:28.948615200Z",
25-
"start_time": "2024-11-06T21:51:19.721136Z"
24+
"end_time": "2024-11-09T15:29:05.393532Z",
25+
"start_time": "2024-11-09T15:28:57.267583700Z"
2626
}
2727
},
2828
"outputs": [],
@@ -51,8 +51,8 @@
5151
"execution_count": 2,
5252
"metadata": {
5353
"ExecuteTime": {
54-
"end_time": "2024-11-06T21:51:30.589179800Z",
55-
"start_time": "2024-11-06T21:51:28.950619500Z"
54+
"end_time": "2024-11-09T15:29:06.872553900Z",
55+
"start_time": "2024-11-09T15:29:05.396041700Z"
5656
}
5757
},
5858
"outputs": [],
@@ -72,8 +72,8 @@
7272
"execution_count": 3,
7373
"metadata": {
7474
"ExecuteTime": {
75-
"end_time": "2024-11-06T21:51:30.871205900Z",
76-
"start_time": "2024-11-06T21:51:30.590182500Z"
75+
"end_time": "2024-11-09T15:29:07.138228500Z",
76+
"start_time": "2024-11-09T15:29:06.873553300Z"
7777
}
7878
},
7979
"outputs": [
@@ -150,8 +150,8 @@
150150
"execution_count": 4,
151151
"metadata": {
152152
"ExecuteTime": {
153-
"end_time": "2024-11-06T21:51:30.899961500Z",
154-
"start_time": "2024-11-06T21:51:30.871205900Z"
153+
"end_time": "2024-11-09T15:29:07.182100500Z",
154+
"start_time": "2024-11-09T15:29:07.139267400Z"
155155
}
156156
},
157157
"outputs": [],
@@ -176,8 +176,8 @@
176176
"execution_count": 5,
177177
"metadata": {
178178
"ExecuteTime": {
179-
"end_time": "2024-11-06T21:51:30.904961800Z",
180-
"start_time": "2024-11-06T21:51:30.886456800Z"
179+
"end_time": "2024-11-09T15:29:07.185659500Z",
180+
"start_time": "2024-11-09T15:29:07.154336500Z"
181181
}
182182
},
183183
"outputs": [
@@ -189,8 +189,8 @@
189189
"-------------------------------------------------\n",
190190
"Layer 1: Input(input_shape=(200,))\n",
191191
"Layer 2: Embedding(input_dim=10000, output_dim=100)\n",
192-
"Layer 3: Bidirectional(layer=LSTM(units=32, return_sequences=True, return_state=False, random_state=None))\n",
193-
"Layer 4: Attention(use_scale=True, score_mode=dot)\n",
192+
"Layer 3: Bidirectional(layer=LSTM(units=32, return_sequences=True, return_state=False))\n",
193+
"Layer 4: Attention(use_scale=True, score_mode=dot, return_sequences=False)\n",
194194
"Layer 5: Dense(units=1)\n",
195195
"Layer 6: Activation(Sigmoid)\n",
196196
"-------------------------------------------------\n",
@@ -215,30 +215,37 @@
215215
},
216216
{
217217
"cell_type": "code",
218-
"execution_count": 7,
218+
"execution_count": 6,
219219
"metadata": {
220220
"ExecuteTime": {
221-
"end_time": "2024-11-06T22:17:05.632380200Z",
222-
"start_time": "2024-11-06T22:17:05.625379900Z"
221+
"end_time": "2024-11-09T15:57:58.751713500Z",
222+
"start_time": "2024-11-09T15:29:07.168952900Z"
223223
}
224224
},
225225
"outputs": [
226226
{
227227
"name": "stdout",
228228
"output_type": "stream",
229229
"text": [
230-
"\n",
231-
"[==============================] 100% Epoch 1/10 - loss: 0.6193 - accuracy: 0.7079 - 248.72s - val_accuracy: 0.8013\n",
232-
"[==============================] 100% Epoch 2/10 - loss: 0.4215 - accuracy: 0.8477 - 264.70s - val_accuracy: 0.8504\n",
233-
"[==============================] 100% Epoch 3/10 - loss: 0.3301 - accuracy: 0.8799 - 266.74s - val_accuracy: 0.8624\n",
234-
"[==============================] 100% Epoch 4/10 - loss: 0.2835 - accuracy: 0.8954 - 255.44s - val_accuracy: 0.8677\n",
235-
"[==============================] 100% Epoch 5/10 - loss: 0.2519 - accuracy: 0.9093 - 239.53s - val_accuracy: 0.8710\n",
236-
"[==============================] 100% Epoch 6/10 - loss: 0.2283 - accuracy: 0.9183 - 239.53s - val_accuracy: 0.8728\n",
237-
"[==============================] 100% Epoch 7/10 - loss: 0.2090 - accuracy: 0.9260 - 239.53s - val_accuracy: 0.8802\n",
238-
"[==============================] 100% Epoch 8/10 - loss: 0.1926 - accuracy: 0.9320 - 239.53s - val_accuracy: 0.8884\n",
239-
"[==============================] 100% Epoch 9/10 - loss: 0.1784 - accuracy: 0.9376 - 239.53s - val_accuracy: 0.8902\n",
240-
"[==============================] 100% Epoch 10/10 - loss: 0.1660 - accuracy: 0.9423 - 239.53s - val_accuracy: 0.9000\n"
230+
"[==============================] 100% Epoch 1/10 - loss: 0.4424 - accuracy: 0.7944 - 118.13s - val_accuracy: 0.8490\n",
231+
"[==============================] 100% Epoch 2/10 - loss: 0.2401 - accuracy: 0.9084 - 120.27s - val_accuracy: 0.8170\n",
232+
"[==============================] 100% Epoch 3/10 - loss: 0.1814 - accuracy: 0.9332 - 121.17s - val_accuracy: 0.8602\n",
233+
"[==============================] 100% Epoch 4/10 - loss: 0.1479 - accuracy: 0.9485 - 118.24s - val_accuracy: 0.8509\n",
234+
"[==============================] 100% Epoch 5/10 - loss: 0.1056 - accuracy: 0.9649 - 120.75s - val_accuracy: 0.8637\n",
235+
"[==============================] 100% Epoch 6/10 - loss: 0.0854 - accuracy: 0.9735 - 118.61s - val_accuracy: 0.8549\n",
236+
"[==============================] 100% Epoch 7/10 - loss: 0.0871 - accuracy: 0.9728 - 120.97s - val_accuracy: 0.8567\n",
237+
"[==============================] 100% Epoch 8/10 - loss: 0.0629 - accuracy: 0.9799 - 117.70s - val_accuracy: 0.8515\n",
238+
"[==============================] 100% Epoch 9/10 - loss: 0.0533 - accuracy: 0.9840 - 120.13s - val_accuracy: 0.8463\n",
239+
"[==============================] 100% Epoch 10/10 - loss: 0.0394 - accuracy: 0.9890 - 118.95s - val_accuracy: 0.8444\n"
241240
]
241+
},
242+
{
243+
"data": {
244+
"text/plain": ""
245+
},
246+
"execution_count": 6,
247+
"metadata": {},
248+
"output_type": "execute_result"
242249
}
243250
],
244251
"source": [
@@ -254,20 +261,20 @@
254261
},
255262
{
256263
"cell_type": "code",
257-
"execution_count": null,
264+
"execution_count": 7,
258265
"metadata": {
259266
"ExecuteTime": {
260-
"end_time": "2024-11-06T22:17:25.754433600Z",
261-
"start_time": "2024-11-06T22:17:14.398517800Z"
267+
"end_time": "2024-11-09T15:58:20.883275400Z",
268+
"start_time": "2024-11-09T15:57:58.748498900Z"
262269
}
263270
},
264271
"outputs": [
265272
{
266273
"name": "stdout",
267274
"output_type": "stream",
268275
"text": [
269-
"Loss: 1.4010948021794365\n",
270-
"Accuracy: 0.881\n"
276+
"Loss: 3.0061621606978313\n",
277+
"Accuracy: 0.8592\n"
271278
]
272279
}
273280
],

neuralnetlib/layers.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -136,30 +136,39 @@ def initialize_weights(self, input_size: int):
136136

137137
def forward_pass(self, input_data: np.ndarray) -> np.ndarray:
138138
self.input_shape = input_data.shape
139+
self.input = input_data
139140

140141
if len(input_data.shape) == 3:
141142
batch_size, timesteps, features = input_data.shape
142-
input_data = input_data.mean(axis=1)
143-
143+
input_reshaped = input_data.reshape(-1, features)
144+
145+
if self.weights is None:
146+
self.initialize_weights(features)
147+
148+
output = np.dot(input_reshaped, self.weights) + self.bias
149+
150+
return output.reshape(batch_size, timesteps, self.units)
151+
144152
if self.weights is None:
145153
self.initialize_weights(input_data.shape[1])
146-
147-
self.input = input_data
148-
output = np.dot(self.input, self.weights) + self.bias
149-
return output
154+
155+
return np.dot(input_data, self.weights) + self.bias
150156

151157
def backward_pass(self, output_error: np.ndarray) -> np.ndarray:
152158
if len(output_error.shape) == 3:
153-
output_error = output_error.mean(axis=1)
154-
159+
batch_size, timesteps, _ = output_error.shape
160+
output_error_reshaped = output_error.reshape(-1, output_error.shape[-1])
161+
input_reshaped = self.input.reshape(-1, self.input.shape[-1])
162+
163+
input_error = np.dot(output_error_reshaped, self.weights.T)
164+
self.d_weights = np.dot(input_reshaped.T, output_error_reshaped)
165+
self.d_bias = np.sum(output_error_reshaped, axis=0, keepdims=True)
166+
167+
return input_error.reshape(batch_size, timesteps, -1)
168+
155169
input_error = np.dot(output_error, self.weights.T)
156170
self.d_weights = np.dot(self.input.T, output_error)
157171
self.d_bias = np.sum(output_error, axis=0, keepdims=True)
158-
159-
if len(self.input_shape) == 3:
160-
input_error = np.expand_dims(input_error, 1)
161-
input_error = np.repeat(input_error, self.input_shape[1], axis=1)
162-
163172
return input_error
164173

165174
def get_config(self) -> dict:
@@ -1680,7 +1689,7 @@ def from_config(config: dict):
16801689

16811690

16821691
class Attention(Layer):
1683-
def __init__(self, use_scale: bool = True, score_mode: str = "dot", return_sequences: bool = True):
1692+
def __init__(self, use_scale: bool = True, score_mode: str = "dot", return_sequences: bool = False):
16841693
super().__init__()
16851694
self.use_scale = use_scale
16861695
self.score_mode = score_mode
@@ -1714,7 +1723,7 @@ def forward_pass(self, input_data: np.ndarray) -> np.ndarray:
17141723
context[i] = np.dot(attention_weights[i], input_data[i])
17151724

17161725
if not self.return_sequences:
1717-
return np.mean(context, axis=1)
1726+
context = np.mean(context, axis=1)
17181727
return context
17191728

17201729
def backward_pass(self, output_error: np.ndarray) -> np.ndarray:

0 commit comments

Comments (0)