|
45 | 45 | "with open('dinos.txt', 'r', encoding='utf-8') as f:\n", |
46 | 46 | " names = [line.strip() for line in f]\n", |
47 | 47 | "\n", |
48 | | - "print(names[:5]) # on affiche les 5 premiers noms de la liste pour vois s'ils ont été correctement chargés" |
| 48 | + "print(names[:5]) # display the first 5 names to check that they were loaded correctly" |
49 | 49 | ] |
50 | 50 | }, |
51 | 51 | { |
|
95 | 95 | } |
96 | 96 | ], |
97 | 97 | "source": [ |
98 | | - "# Constantes\n", |
99 | | - "PAD_TOKEN = '' # Token de padding (index 0)\n", |
100 | | - "EOS_TOKEN = '$' # Token de fin de séquence (index 1)\n", |
101 | | - "max_length = 15 # Longueur maximale des séquences\n", |
| 98 | + "# Constants\n", |
| 99 | + "PAD_TOKEN = '' # Padding token (index 0)\n", |
| 100 | + "EOS_TOKEN = '$' # End of sequence token (index 1)\n", |
| 101 | + "max_length = 15 # Maximum sequence length\n", |
102 | 102 | "\n", |
103 | | - "# Dictionnaires de mapping\n", |
| 103 | + "# Mapping dictionaries\n", |
104 | 104 | "char_to_index = {PAD_TOKEN: 0, EOS_TOKEN: 1}\n", |
105 | 105 | "index_to_char = {0: PAD_TOKEN, 1: EOS_TOKEN}\n", |
106 | 106 | "\n", |
107 | | - "# Extraction des caractères uniques et tri\n", |
| 107 | + "# Extract unique characters and sort them\n", |
108 | 108 | "unique_chars = sorted(set(''.join(names)))\n", |
109 | 109 | "\n", |
110 | | - "# Construction des mappings caractère <-> index en commençant à 2\n", |
| 110 | + "# Build character <-> index mappings starting at index 2\n", |
111 | 111 | "for idx, char in enumerate(unique_chars, start=2):\n", |
112 | 112 | " char_to_index[char] = idx\n", |
113 | 113 | " index_to_char[idx] = char\n", |
|
118 | 118 | }, |
119 | 119 | { |
120 | 120 | "cell_type": "code", |
121 | | - "execution_count": 5, |
| 121 | + "execution_count": 8, |
122 | 122 | "id": "1364a6786997a8f5", |
123 | 123 | "metadata": { |
| 124 | + "collapsed": false, |
124 | 125 | "ExecuteTime": { |
125 | | - "end_time": "2024-11-06T23:42:25.101611700Z", |
126 | | - "start_time": "2024-11-06T23:42:25.070594900Z" |
127 | | - }, |
128 | | - "collapsed": false |
| 126 | + "end_time": "2024-11-10T00:46:51.452858700Z", |
| 127 | + "start_time": "2024-11-10T00:46:51.432814400Z" |
| 128 | + } |
129 | 129 | }, |
130 | 130 | "outputs": [ |
131 | 131 | { |
132 | 132 | "name": "stdout", |
133 | 133 | "output_type": "stream", |
134 | 134 | "text": [ |
135 | | - "Taille du vocabulaire: 54\n", |
136 | | - "Forme des données X: (18374, 10)\n", |
137 | | - "Forme des labels y: (18374, 54)\n", |
138 | 135 | "\n", |
139 | | - "Exemple pour Aachenosaurus:\n", |
140 | | - "Séquence d'entrée: [ 0 0 0 0 28 28 30 35 32 41]\n", |
141 | | - "Sortie attendue: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", |
| 136 | + "Vocabulary size: 54\n", |
| 137 | + "X data shape: (18374, 10)\n", |
| 138 | + "y labels shape: (18374, 54)\n", |
| 139 | + "\n", |
| 140 | + "Example for Aachenosaurus:\n", |
| 141 | + "Input sequence: [ 0 0 0 0 28 28 30 35 32 41]\n", |
| 142 | + "Expected output: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", |
142 | 143 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.\n", |
143 | 144 | " 0. 0. 0. 0. 0. 0.]\n", |
144 | 145 | "\n", |
145 | | - "Décodage de la séquence d'exemple:\n", |
| 146 | + "Decoding example sequence:\n", |
146 | 147 | "['', '', '', '', 'a', 'a', 'c', 'h', 'e', 'n']\n", |
147 | | - "Prochain caractère: o\n" |
| 148 | + "Next character: o\n" |
148 | 149 | ] |
149 | 150 | } |
150 | 151 | ], |
151 | 152 | "source": [ |
152 | | - "# Séquences pour le training\n", |
| 153 | + "# Training sequences\n", |
153 | 154 | "sequences = []\n", |
154 | 155 | "next_chars = []\n", |
155 | 156 | "\n", |
156 | | - "# Création des séquences et des caractères suivants\n", |
| 157 | + "# Create sequences and next characters\n", |
157 | 158 | "for name in names:\n", |
158 | 159 | " name = name.lower()\n", |
159 | 160 | " name_chars = list(name) + [EOS_TOKEN]\n", |
160 | 161 | "\n", |
161 | 162 | " for i in range(len(name_chars) - 1):\n", |
162 | | - " # Extraction de la séquence\n", |
| 163 | + " # Extract sequence\n", |
163 | 164 | " seq = name_chars[max(0, i - max_length + 1):i + 1]\n", |
164 | 165 | "\n", |
165 | | - " # Padding et conversion en indices\n", |
| 166 | + " # Padding and conversion to indices\n", |
166 | 167 | " padded_seq = [0] * (max_length - len(seq)) + [char_to_index[char] for char in seq]\n", |
167 | 168 | "\n", |
168 | 169 | " sequences.append(padded_seq)\n", |
169 | 170 | " next_chars.append(char_to_index[name_chars[i + 1]])\n", |
170 | 171 | "\n", |
171 | | - "# Conversion en arrays numpy\n", |
| 172 | + "# Convert to numpy arrays\n", |
172 | 173 | "X = np.array(sequences)\n", |
173 | 174 | "y = one_hot_encode(np.array(next_chars), vocab_size)\n", |
174 | 175 | "\n", |
175 | | - "print(f\"Taille du vocabulaire: {vocab_size}\")\n", |
176 | | - "print(f\"Forme des données X: {X.shape}\")\n", |
177 | | - "print(f\"Forme des labels y: {y.shape}\")\n", |
| 176 | + "print(f\"Vocabulary size: {vocab_size}\")\n", |
| 177 | + "print(f\"X data shape: {X.shape}\")\n", |
| 178 | + "print(f\"y labels shape: {y.shape}\")\n", |
178 | 179 | "\n", |
179 | | - "# Affichage d'un exemple bbpour vérification\n", |
180 | | - "print(f\"\\nExemple pour {names[0]}:\")\n", |
181 | | - "print(f\"Séquence d'entrée: {X[5]}\")\n", |
182 | | - "print(f\"Sortie attendue: {y[5]}\")\n", |
| 180 | + "# Display an example for verification\n", |
| 181 | + "print(f\"\\nExample for {names[0]}:\")\n", |
| 182 | + "print(f\"Input sequence: {X[5]}\")\n", |
| 183 | + "print(f\"Expected output: {y[5]}\")\n", |
183 | 184 | "\n", |
184 | | - "# Visualisation des tokens pour le premier exemple\n", |
185 | | - "print(\"\\nDécodage de la séquence d'exemple:\")\n", |
| 185 | + "# Visualize tokens for the first example\n", |
| 186 | + "print(\"\\nDecoding example sequence:\")\n", |
186 | 187 | "print([index_to_char[idx] for idx in X[5]])\n", |
187 | | - "print(f\"Prochain caractère: {index_to_char[next_chars[5]]}\")" |
| 188 | + "print(f\"Next character: {index_to_char[next_chars[5]]}\")" |
188 | 189 | ] |
189 | 190 | }, |
190 | 191 | { |
|
218 | 219 | } |
219 | 220 | ], |
220 | 221 | "source": [ |
221 | | - "\n", |
222 | | - "# Création du modèle\n", |
| 222 | + "# Model definition\n", |
223 | 223 | "embedding_dim = 32\n", |
224 | 224 | "lstm_units = 128\n", |
225 | 225 | "\n", |
|
339 | 339 | } |
340 | 340 | ], |
341 | 341 | "source": [ |
342 | | - "# Création du callback EarlyStopping\n", |
| 342 | + "# Early stopping callback\n", |
343 | 343 | "early_stopping = EarlyStopping(\n", |
344 | 344 | " monitor='loss',\n", |
345 | 345 | " patience=5,\n", |
346 | 346 | " restore_best_weights=True\n", |
347 | 347 | ")\n", |
348 | 348 | "\n", |
349 | | - "# Entraînement du modèle\n", |
| 349 | + "# Model training\n", |
350 | 350 | "history = model.fit(\n", |
351 | 351 | " X, y,\n", |
352 | 352 | " epochs=100,\n", |
|
358 | 358 | }, |
359 | 359 | { |
360 | 360 | "cell_type": "code", |
361 | | - "execution_count": 24, |
| 361 | + "execution_count": 9, |
362 | 362 | "id": "68ec75af38129a34", |
363 | 363 | "metadata": { |
| 364 | + "collapsed": false, |
364 | 365 | "ExecuteTime": { |
365 | | - "end_time": "2024-11-07T00:02:29.823857Z", |
366 | | - "start_time": "2024-11-07T00:02:29.567804300Z" |
367 | | - }, |
368 | | - "collapsed": false |
| 366 | + "end_time": "2024-11-10T00:48:01.311549Z", |
| 367 | + "start_time": "2024-11-10T00:48:01.292550Z" |
| 368 | + } |
369 | 369 | }, |
370 | 370 | "outputs": [ |
371 | 371 | { |
372 | 372 | "name": "stdout", |
373 | 373 | "output_type": "stream", |
374 | 374 | "text": [ |
375 | 375 | "\n", |
376 | | - "Noms générés:\n", |
| 376 | + "Generated names:\n", |
377 | | - "Ourocosaur (10 caractères)\n", |
378 | | - "Rsholisaur (10 caractères)\n", |
379 | | - "Cosonimus (9 caractères)\n", |
380 | | - "Euceratous (10 caractères)\n", |
381 | | - "Amarcerato (10 caractères)\n", |
| 377 | + "Ourocosaur (10 characters)\n", |
| 378 | + "Rsholisaur (10 characters)\n", |
| 379 | + "Cosonimus (9 characters)\n", |
| 380 | + "Euceratous (10 characters)\n", |
| 381 | + "Amarcerato (10 characters)\n", |
382 | 382 | "\n", |
383 | | - "Tous les noms sont-ils originaux ? True\n", |
| 383 | + "Are all names original? True\n", |
384 | 384 | "\n", |
385 | | - "Longueur moyenne: 9.8 caractères\n", |
386 | | - "Longueur minimale: 9 caractères\n", |
387 | | - "Longueur maximale: 10 caractères\n" |
| 385 | + "Average length: 9.8 characters\n", |
| 386 | + "Minimum length: 9 characters\n", |
| 387 | + "Maximum length: 10 characters\n" |
388 | 388 | ] |
389 | 389 | } |
390 | 390 | ], |
391 | 391 | "source": [ |
392 | | - "# Génération de nouveaux noms\n", |
| 392 | + "# Name generation\n", |
393 | 393 | "def generate_name(model, min_length=5):\n", |
394 | 394 | " current_sequence = [0] * max_length\n", |
395 | 395 | " generated_name = \"\"\n", |
|
398 | 398 | " x = np.array([current_sequence])\n", |
399 | 399 | " preds = model.predict(x)[0]\n", |
400 | 400 | "\n", |
401 | | - " # Sélection du prochain caractère avec random.choices\n", |
| 401 | + " # Select next character using random.choices\n", |
402 | 402 | " next_char_idx = random.choices(range(vocab_size), weights=preds, k=1)[0]\n", |
403 | 403 | " next_char = index_to_char[next_char_idx]\n", |
404 | 404 | "\n", |
405 | | - " # STOP si longueur minimale atteinte et EOS rencontré\n", |
| 405 | + " # STOP if minimum length reached and EOS encountered\n", |
406 | 406 | " if len(generated_name) >= min_length and next_char == EOS_TOKEN:\n", |
407 | 407 | " break\n", |
408 | 408 | "\n", |
409 | | - " # Ajout du caractère si ce n'est ni PAD ni EOS\n", |
| 409 | + " # Add character if it's neither PAD nor EOS\n", |
410 | 410 | " if next_char not in [PAD_TOKEN, EOS_TOKEN]:\n", |
411 | 411 | " generated_name += next_char\n", |
412 | 412 | "\n", |
413 | | - " # Mise à jour de la séquence courante\n", |
| 413 | + " # Update current sequence\n", |
414 | 414 | " current_sequence = current_sequence[1:] + [next_char_idx]\n", |
415 | 415 | "\n", |
416 | 416 | " return generated_name.capitalize() if len(generated_name) >= min_length else None\n", |
417 | 417 | "\n", |
418 | | - "# Génération de plusieurs noms\n", |
| 418 | + "# Generate multiple names\n", |
419 | 419 | "generated_names = []\n", |
420 | 420 | "number_of_names = 5\n", |
421 | 421 | "min_length = 5\n", |
|
425 | 425 | " if name is not None and name not in generated_names:\n", |
426 | 426 | " generated_names.append(name)\n", |
427 | 427 | "\n", |
428 | | - "# Affichage des résultats\n", |
429 | | - "print(\"\\nNoms générés:\")\n", |
| 428 | + "# Display results\n", |
| 429 | + "print(\"\\nGenerated names:\")\n", |
430 | 430 | "for name in generated_names:\n", |
431 | | - " print(f\"{name} ({len(name)} caractères)\")\n", |
| 431 | + " print(f\"{name} ({len(name)} characters)\")\n", |
432 | 432 | "\n", |
433 | | - "# Vérification de l'originalité\n", |
434 | | - "print(\"\\nTous les noms sont-ils originaux ?\", all(name.lower() not in [n.lower() for n in names] for name in generated_names))\n", |
| 433 | + "# Check originality\n", |
| 434 | + "print(\"\\nAre all names original?\", all(name.lower() not in [n.lower() for n in names] for name in generated_names))\n", |
435 | 435 | "\n", |
436 | | - "# Statistiques sur les longueurs\n", |
| 436 | + "# Length statistics\n", |
437 | 437 | "lengths = [len(name) for name in generated_names]\n", |
438 | | - "print(f\"\\nLongueur moyenne: {sum(lengths)/len(lengths):.1f} caractères\")\n", |
439 | | - "print(f\"Longueur minimale: {min(lengths)} caractères\")\n", |
440 | | - "print(f\"Longueur maximale: {max(lengths)} caractères\")" |
| 438 | + "print(f\"\\nAverage length: {sum(lengths)/len(lengths):.1f} characters\")\n", |
| 439 | + "print(f\"Minimum length: {min(lengths)} characters\")\n", |
| 440 | + "print(f\"Maximum length: {max(lengths)} characters\")" |
441 | 441 | ] |
442 | 442 | } |
443 | 443 | ], |
|