Skip to content

Commit 4a4448d

Browse files
committed
docs: update dinosaurs example
1 parent 9b3d806 commit 4a4448d

File tree

1 file changed

+65
-65
lines changed

1 file changed

+65
-65
lines changed

examples/real-life-applications/dinosaurnames/dinosaur_names_generator.ipynb

Lines changed: 65 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
"with open('dinos.txt', 'r', encoding='utf-8') as f:\n",
4646
" names = [line.strip() for line in f]\n",
4747
"\n",
48-
"print(names[:5]) # on affiche les 5 premiers noms de la liste pour vois s'ils ont été correctement chargés"
48+
"print(names[:5]) # display the first 5 names of the list to check if they were loaded correctly"
4949
]
5050
},
5151
{
@@ -95,19 +95,19 @@
9595
}
9696
],
9797
"source": [
98-
"# Constantes\n",
99-
"PAD_TOKEN = '' # Token de padding (index 0)\n",
100-
"EOS_TOKEN = '$' # Token de fin de séquence (index 1)\n",
101-
"max_length = 15 # Longueur maximale des séquences\n",
98+
"# Constants\n",
99+
"PAD_TOKEN = '' # Padding token (index 0)\n",
100+
"EOS_TOKEN = '$' # End of sequence token (index 1)\n",
101+
"max_length = 15 # Maximum sequence length\n",
102102
"\n",
103-
"# Dictionnaires de mapping\n",
103+
"# Mapping dictionaries\n",
104104
"char_to_index = {PAD_TOKEN: 0, EOS_TOKEN: 1}\n",
105105
"index_to_char = {0: PAD_TOKEN, 1: EOS_TOKEN}\n",
106106
"\n",
107-
"# Extraction des caractères uniques et tri\n",
107+
"# Extract unique characters and sort them\n",
108108
"unique_chars = sorted(set(''.join(names)))\n",
109109
"\n",
110-
"# Construction des mappings caractère <-> index en commençant à 2\n",
110+
"# Build character <-> index mappings starting at index 2\n",
111111
"for idx, char in enumerate(unique_chars, start=2):\n",
112112
" char_to_index[char] = idx\n",
113113
" index_to_char[idx] = char\n",
@@ -118,73 +118,74 @@
118118
},
119119
{
120120
"cell_type": "code",
121-
"execution_count": 5,
121+
"execution_count": 8,
122122
"id": "1364a6786997a8f5",
123123
"metadata": {
124+
"collapsed": false,
124125
"ExecuteTime": {
125-
"end_time": "2024-11-06T23:42:25.101611700Z",
126-
"start_time": "2024-11-06T23:42:25.070594900Z"
127-
},
128-
"collapsed": false
126+
"end_time": "2024-11-10T00:46:51.452858700Z",
127+
"start_time": "2024-11-10T00:46:51.432814400Z"
128+
}
129129
},
130130
"outputs": [
131131
{
132132
"name": "stdout",
133133
"output_type": "stream",
134134
"text": [
135-
"Taille du vocabulaire: 54\n",
136-
"Forme des données X: (18374, 10)\n",
137-
"Forme des labels y: (18374, 54)\n",
138135
"\n",
139-
"Exemple pour Aachenosaurus:\n",
140-
"Séquence d'entrée: [ 0 0 0 0 28 28 30 35 32 41]\n",
141-
"Sortie attendue: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
136+
"Vocabulary size: 54\n",
137+
"X data shape: (18374, 10)\n",
138+
"y labels shape: (18374, 54)\n",
139+
"\n",
140+
"Example for Aachenosaurus:\n",
141+
"Input sequence: [ 0 0 0 0 28 28 30 35 32 41]\n",
142+
"Expected output: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
142143
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.\n",
143144
" 0. 0. 0. 0. 0. 0.]\n",
144145
"\n",
145-
"Décodage de la séquence d'exemple:\n",
146+
"Decoding example sequence:\n",
146147
"['', '', '', '', 'a', 'a', 'c', 'h', 'e', 'n']\n",
147-
"Prochain caractère: o\n"
148+
"Next character: o\n"
148149
]
149150
}
150151
],
151152
"source": [
152-
"# Séquences pour le training\n",
153+
"# Training sequences\n",
153154
"sequences = []\n",
154155
"next_chars = []\n",
155156
"\n",
156-
"# Création des séquences et des caractères suivants\n",
157+
"# Create sequences and next characters\n",
157158
"for name in names:\n",
158159
" name = name.lower()\n",
159160
" name_chars = list(name) + [EOS_TOKEN]\n",
160161
"\n",
161162
" for i in range(len(name_chars) - 1):\n",
162-
" # Extraction de la séquence\n",
163+
" # Extract sequence\n",
163164
" seq = name_chars[max(0, i - max_length + 1):i + 1]\n",
164165
"\n",
165-
" # Padding et conversion en indices\n",
166+
" # Padding and conversion to indices\n",
166167
" padded_seq = [0] * (max_length - len(seq)) + [char_to_index[char] for char in seq]\n",
167168
"\n",
168169
" sequences.append(padded_seq)\n",
169170
" next_chars.append(char_to_index[name_chars[i + 1]])\n",
170171
"\n",
171-
"# Conversion en arrays numpy\n",
172+
"# Convert to numpy arrays\n",
172173
"X = np.array(sequences)\n",
173174
"y = one_hot_encode(np.array(next_chars), vocab_size)\n",
174175
"\n",
175-
"print(f\"Taille du vocabulaire: {vocab_size}\")\n",
176-
"print(f\"Forme des données X: {X.shape}\")\n",
177-
"print(f\"Forme des labels y: {y.shape}\")\n",
176+
"print(f\"Vocabulary size: {vocab_size}\")\n",
177+
"print(f\"X data shape: {X.shape}\")\n",
178+
"print(f\"y labels shape: {y.shape}\")\n",
178179
"\n",
179-
"# Affichage d'un exemple bbpour vérification\n",
180-
"print(f\"\\nExemple pour {names[0]}:\")\n",
181-
"print(f\"Séquence d'entrée: {X[5]}\")\n",
182-
"print(f\"Sortie attendue: {y[5]}\")\n",
180+
"# Display an example for verification\n",
181+
"print(f\"\\nExample for {names[0]}:\")\n",
182+
"print(f\"Input sequence: {X[5]}\")\n",
183+
"print(f\"Expected output: {y[5]}\")\n",
183184
"\n",
184-
"# Visualisation des tokens pour le premier exemple\n",
185-
"print(\"\\nDécodage de la séquence d'exemple:\")\n",
185+
"# Visualize tokens for the first example\n",
186+
"print(\"\\nDecoding example sequence:\")\n",
186187
"print([index_to_char[idx] for idx in X[5]])\n",
187-
"print(f\"Prochain caractère: {index_to_char[next_chars[5]]}\")"
188+
"print(f\"Next character: {index_to_char[next_chars[5]]}\")"
188189
]
189190
},
190191
{
@@ -218,8 +219,7 @@
218219
}
219220
],
220221
"source": [
221-
"\n",
222-
"# Création du modèle\n",
222+
"# Model definition\n",
223223
"embedding_dim = 32\n",
224224
"lstm_units = 128\n",
225225
"\n",
@@ -339,14 +339,14 @@
339339
}
340340
],
341341
"source": [
342-
"# Création du callback EarlyStopping\n",
342+
"# Early stopping callback\n",
343343
"early_stopping = EarlyStopping(\n",
344344
" monitor='loss',\n",
345345
" patience=5,\n",
346346
" restore_best_weights=True\n",
347347
")\n",
348348
"\n",
349-
"# Entraînement du modèle\n",
349+
"# Model training\n",
350350
"history = model.fit(\n",
351351
" X, y,\n",
352352
" epochs=100,\n",
@@ -358,38 +358,38 @@
358358
},
359359
{
360360
"cell_type": "code",
361-
"execution_count": 24,
361+
"execution_count": 9,
362362
"id": "68ec75af38129a34",
363363
"metadata": {
364+
"collapsed": false,
364365
"ExecuteTime": {
365-
"end_time": "2024-11-07T00:02:29.823857Z",
366-
"start_time": "2024-11-07T00:02:29.567804300Z"
367-
},
368-
"collapsed": false
366+
"end_time": "2024-11-10T00:48:01.311549Z",
367+
"start_time": "2024-11-10T00:48:01.292550Z"
368+
}
369369
},
370370
"outputs": [
371371
{
372372
"name": "stdout",
373373
"output_type": "stream",
374374
"text": [
375375
"\n",
376-
"Noms générés:\n",
376+
"Generated names:\n",
377377
"Ourocosaur (10 characters)\n",
378378
"Rsholisaur (10 characters)\n",
379379
"Cosonimus (9 characters)\n",
380380
"Euceratous (10 characters)\n",
381381
"Amarcerato (10 characters)\n",
382382
"\n",
383-
"Tous les noms sont-ils originaux ? True\n",
383+
"Are all names original? True\n",
384384
"\n",
385-
"Longueur moyenne: 9.8 caractères\n",
386-
"Longueur minimale: 9 caractères\n",
387-
"Longueur maximale: 10 caractères\n"
385+
"Average length: 9.8 characters\n",
386+
"Minimum length: 9 characters\n",
387+
"Maximum length: 10 characters\n"
388388
]
389389
}
390390
],
391391
"source": [
392-
"# Génération de nouveaux noms\n",
392+
"# Name generation\n",
393393
"def generate_name(model, min_length=5):\n",
394394
" current_sequence = [0] * max_length\n",
395395
" generated_name = \"\"\n",
@@ -398,24 +398,24 @@
398398
" x = np.array([current_sequence])\n",
399399
" preds = model.predict(x)[0]\n",
400400
"\n",
401-
" # Sélection du prochain caractère avec random.choices\n",
401+
" # Select next character using random.choices\n",
402402
" next_char_idx = random.choices(range(vocab_size), weights=preds, k=1)[0]\n",
403403
" next_char = index_to_char[next_char_idx]\n",
404404
"\n",
405-
" # STOP si longueur minimale atteinte et EOS rencontré\n",
405+
" # STOP if minimum length reached and EOS encountered\n",
406406
" if len(generated_name) >= min_length and next_char == EOS_TOKEN:\n",
407407
" break\n",
408408
"\n",
409-
" # Ajout du caractère si ce n'est ni PAD ni EOS\n",
409+
" # Add character if it's neither PAD nor EOS\n",
410410
" if next_char not in [PAD_TOKEN, EOS_TOKEN]:\n",
411411
" generated_name += next_char\n",
412412
"\n",
413-
" # Mise à jour de la séquence courante\n",
413+
" # Update current sequence\n",
414414
" current_sequence = current_sequence[1:] + [next_char_idx]\n",
415415
"\n",
416416
" return generated_name.capitalize() if len(generated_name) >= min_length else None\n",
417417
"\n",
418-
"# Génération de plusieurs noms\n",
418+
"# Generate multiple names\n",
419419
"generated_names = []\n",
420420
"number_of_names = 5\n",
421421
"min_length = 5\n",
@@ -425,19 +425,19 @@
425425
" if name is not None and name not in generated_names:\n",
426426
" generated_names.append(name)\n",
427427
"\n",
428-
"# Affichage des résultats\n",
429-
"print(\"\\nNoms générés:\")\n",
428+
"# Display results\n",
429+
"print(\"\\nGenerated names:\")\n",
430430
"for name in generated_names:\n",
431-
" print(f\"{name} ({len(name)} caractères)\")\n",
431+
" print(f\"{name} ({len(name)} characters)\")\n",
432432
"\n",
433-
"# Vérification de l'originalité\n",
434-
"print(\"\\nTous les noms sont-ils originaux ?\", all(name.lower() not in [n.lower() for n in names] for name in generated_names))\n",
433+
"# Check originality\n",
434+
"print(\"\\nAre all names original?\", all(name.lower() not in [n.lower() for n in names] for name in generated_names))\n",
435435
"\n",
436-
"# Statistiques sur les longueurs\n",
436+
"# Length statistics\n",
437437
"lengths = [len(name) for name in generated_names]\n",
438-
"print(f\"\\nLongueur moyenne: {sum(lengths)/len(lengths):.1f} caractères\")\n",
439-
"print(f\"Longueur minimale: {min(lengths)} caractères\")\n",
440-
"print(f\"Longueur maximale: {max(lengths)} caractères\")"
438+
"print(f\"\\nAverage length: {sum(lengths)/len(lengths):.1f} characters\")\n",
439+
"print(f\"Minimum length: {min(lengths)} characters\")\n",
440+
"print(f\"Maximum length: {max(lengths)} characters\")"
441441
]
442442
}
443443
],

0 commit comments

Comments
 (0)