|
45 | 45 | "with open('dinos.txt', 'r', encoding='utf-8') as f:\n", |
46 | 46 | " names = [line.strip() for line in f]\n", |
47 | 47 | "\n", |
48 | | - "print(names[:5]) # on affiche les 5 premiers noms de la liste pour vois s'ils ont été correctement chargés" |
| 48 | + "print(names[:5]) # display the first 5 names to check that they were loaded correctly" |
49 | 49 | ] |
50 | 50 | }, |
51 | 51 | { |
|
95 | 95 | } |
96 | 96 | ], |
97 | 97 | "source": [ |
98 | | - "# Constantes\n", |
99 | | - "PAD_TOKEN = '' # Token de padding (index 0)\n", |
100 | | - "EOS_TOKEN = '$' # Token de fin de séquence (index 1)\n", |
101 | | - "max_length = 15 # Longueur maximale des séquences\n", |
| 98 | + "# Constants\n", |
| 99 | + "PAD_TOKEN = '' # Padding token (index 0)\n", |
| 100 | + "EOS_TOKEN = '$' # End of sequence token (index 1)\n", |
| 101 | + "max_length = 15 # Maximum sequence length\n", |
102 | 102 | "\n", |
103 | | - "# Dictionnaires de mapping\n", |
| 103 | + "# Mapping dictionaries\n", |
104 | 104 | "char_to_index = {PAD_TOKEN: 0, EOS_TOKEN: 1}\n", |
105 | 105 | "index_to_char = {0: PAD_TOKEN, 1: EOS_TOKEN}\n", |
106 | 106 | "\n", |
107 | | - "# Extraction des caractères uniques et tri\n", |
| 107 | + "# Extract unique characters and sort them\n", |
108 | 108 | "unique_chars = sorted(set(''.join(names)))\n", |
109 | 109 | "\n", |
110 | | - "# Construction des mappings caractère <-> index en commençant à 2\n", |
| 110 | + "# Build character <-> index mappings starting at index 2\n", |
111 | 111 | "for idx, char in enumerate(unique_chars, start=2):\n", |
112 | 112 | " char_to_index[char] = idx\n", |
113 | 113 | " index_to_char[idx] = char\n", |
|
118 | 118 | }, |
119 | 119 | { |
120 | 120 | "cell_type": "code", |
121 | | - "execution_count": 5, |
| 121 | + "execution_count": 8, |
122 | 122 | "id": "1364a6786997a8f5", |
123 | 123 | "metadata": { |
| 124 | + "collapsed": false, |
124 | 125 | "ExecuteTime": { |
125 | | - "end_time": "2024-11-06T23:42:25.101611700Z", |
126 | | - "start_time": "2024-11-06T23:42:25.070594900Z" |
127 | | - }, |
128 | | - "collapsed": false |
| 126 | + "end_time": "2024-11-10T00:46:51.452858700Z", |
| 127 | + "start_time": "2024-11-10T00:46:51.432814400Z" |
| 128 | + } |
129 | 129 | }, |
130 | 130 | "outputs": [ |
131 | 131 | { |
132 | 132 | "name": "stdout", |
133 | 133 | "output_type": "stream", |
134 | 134 | "text": [ |
135 | | - "Taille du vocabulaire: 54\n", |
136 | | - "Forme des données X: (18374, 10)\n", |
137 | | - "Forme des labels y: (18374, 54)\n", |
138 | 135 | "\n", |
139 | | - "Exemple pour Aachenosaurus:\n", |
140 | | - "Séquence d'entrée: [ 0 0 0 0 28 28 30 35 32 41]\n", |
141 | | - "Sortie attendue: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", |
| 136 | + "Vocabulary size: 54\n", |
| 137 | + "X data shape: (18374, 10)\n", |
| 138 | + "y labels shape: (18374, 54)\n", |
| 139 | + "\n", |
| 140 | + "Example for Aachenosaurus:\n", |
| 141 | + "Input sequence: [ 0 0 0 0 28 28 30 35 32 41]\n", |
| 142 | + "Expected output: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", |
142 | 143 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.\n", |
143 | 144 | " 0. 0. 0. 0. 0. 0.]\n", |
144 | 145 | "\n", |
145 | | - "Décodage de la séquence d'exemple:\n", |
| 146 | + "Decoding example sequence:\n", |
146 | 147 | "['', '', '', '', 'a', 'a', 'c', 'h', 'e', 'n']\n", |
147 | | - "Prochain caractère: o\n" |
| 148 | + "Next character: o\n" |
148 | 149 | ] |
149 | 150 | } |
150 | 151 | ], |
151 | 152 | "source": [ |
152 | | - "# Séquences pour le training\n", |
| 153 | + "# Training sequences\n", |
153 | 154 | "sequences = []\n", |
154 | 155 | "next_chars = []\n", |
155 | 156 | "\n", |
156 | | - "# Création des séquences et des caractères suivants\n", |
| 157 | + "# Create sequences and next characters\n", |
157 | 158 | "for name in names:\n", |
158 | 159 | " name = name.lower()\n", |
159 | 160 | " name_chars = list(name) + [EOS_TOKEN]\n", |
160 | 161 | "\n", |
161 | 162 | " for i in range(len(name_chars) - 1):\n", |
162 | | - " # Extraction de la séquence\n", |
| 163 | + " # Extract sequence\n", |
163 | 164 | " seq = name_chars[max(0, i - max_length + 1):i + 1]\n", |
164 | 165 | "\n", |
165 | | - " # Padding et conversion en indices\n", |
| 166 | + " # Padding and conversion to indices\n", |
166 | 167 | " padded_seq = [0] * (max_length - len(seq)) + [char_to_index[char] for char in seq]\n", |
167 | 168 | "\n", |
168 | 169 | " sequences.append(padded_seq)\n", |
169 | 170 | " next_chars.append(char_to_index[name_chars[i + 1]])\n", |
170 | 171 | "\n", |
171 | | - "# Conversion en arrays numpy\n", |
| 172 | + "# Convert to numpy arrays\n", |
172 | 173 | "X = np.array(sequences)\n", |
173 | 174 | "y = one_hot_encode(np.array(next_chars), vocab_size)\n", |
174 | 175 | "\n", |
175 | | - "print(f\"Taille du vocabulaire: {vocab_size}\")\n", |
176 | | - "print(f\"Forme des données X: {X.shape}\")\n", |
177 | | - "print(f\"Forme des labels y: {y.shape}\")\n", |
| 176 | + "print(f\"Vocabulary size: {vocab_size}\")\n", |
| 177 | + "print(f\"X data shape: {X.shape}\")\n", |
| 178 | + "print(f\"y labels shape: {y.shape}\")\n", |
178 | 179 | "\n", |
179 | | - "# Affichage d'un exemple bbpour vérification\n", |
180 | | - "print(f\"\\nExemple pour {names[0]}:\")\n", |
181 | | - "print(f\"Séquence d'entrée: {X[5]}\")\n", |
182 | | - "print(f\"Sortie attendue: {y[5]}\")\n", |
| 180 | + "# Display an example for verification\n", |
| 181 | + "print(f\"\\nExample for {names[0]}:\")\n", |
| 182 | + "print(f\"Input sequence: {X[5]}\")\n", |
| 183 | + "print(f\"Expected output: {y[5]}\")\n", |
183 | 184 | "\n", |
184 | | - "# Visualisation des tokens pour le premier exemple\n", |
185 | | - "print(\"\\nDécodage de la séquence d'exemple:\")\n", |
| 185 | + "# Visualize tokens for the first example\n", |
| 186 | + "print(\"\\nDecoding example sequence:\")\n", |
186 | 187 | "print([index_to_char[idx] for idx in X[5]])\n", |
187 | | - "print(f\"Prochain caractère: {index_to_char[next_chars[5]]}\")" |
| 188 | + "print(f\"Next character: {index_to_char[next_chars[5]]}\")" |
188 | 189 | ] |
189 | 190 | }, |
190 | 191 | { |
|
218 | 219 | } |
219 | 220 | ], |
220 | 221 | "source": [ |
221 | | - "\n", |
222 | | - "# Création du modèle\n", |
| 222 | + "# Model definition\n", |
223 | 223 | "embedding_dim = 32\n", |
224 | 224 | "lstm_units = 128\n", |
225 | 225 | "\n", |
|
339 | 339 | } |
340 | 340 | ], |
341 | 341 | "source": [ |
342 | | - "# Création du callback EarlyStopping\n", |
| 342 | + "# Early stopping callback\n", |
343 | 343 | "early_stopping = EarlyStopping(\n", |
344 | 344 | " monitor='loss',\n", |
345 | 345 | " patience=5,\n", |
346 | 346 | " restore_best_weights=True\n", |
347 | 347 | ")\n", |
348 | 348 | "\n", |
349 | | - "# Entraînement du modèle\n", |
| 349 | + "# Model training\n", |
350 | 350 | "history = model.fit(\n", |
351 | 351 | " X, y,\n", |
352 | 352 | " epochs=100,\n", |
|
358 | 358 | }, |
359 | 359 | { |
360 | 360 | "cell_type": "code", |
361 | | - "execution_count": 24, |
| 361 | + "execution_count": 9, |
362 | 362 | "id": "68ec75af38129a34", |
363 | 363 | "metadata": { |
| 364 | + "collapsed": false, |
364 | 365 | "ExecuteTime": { |
365 | | - "end_time": "2024-11-07T00:02:29.823857Z", |
366 | | - "start_time": "2024-11-07T00:02:29.567804300Z" |
367 | | - }, |
368 | | - "collapsed": false |
| 366 | + "end_time": "2024-11-10T00:48:01.311549Z", |
| 367 | + "start_time": "2024-11-10T00:48:01.292550Z" |
| 368 | + } |
369 | 369 | }, |
370 | 370 | "outputs": [ |
371 | 371 | { |
372 | 372 | "name": "stdout", |
373 | 373 | "output_type": "stream", |
374 | 374 | "text": [ |
375 | 375 | "\n", |
376 | | - "Noms générés:\n", |
| 376 | + "Generated names:\n", |
377 | | - "Ourocosaur (10 caractères)\n", |
378 | | - "Rsholisaur (10 caractères)\n", |
379 | | - "Cosonimus (9 caractères)\n", |
380 | | - "Euceratous (10 caractères)\n", |
381 | | - "Amarcerato (10 caractères)\n", |
| 377 | + "Ourocosaur (10 characters)\n", |
| 378 | + "Rsholisaur (10 characters)\n", |
| 379 | + "Cosonimus (9 characters)\n", |
| 380 | + "Euceratous (10 characters)\n", |
| 381 | + "Amarcerato (10 characters)\n", |
382 | 382 | "\n", |
383 | | - "Tous les noms sont-ils originaux ? True\n", |
| 383 | + "Are all names original? True\n", |
384 | 384 | "\n", |
385 | | - "Longueur moyenne: 9.8 caractères\n", |
386 | | - "Longueur minimale: 9 caractères\n", |
387 | | - "Longueur maximale: 10 caractères\n" |
| 385 | + "Average length: 9.8 characters\n", |
| 386 | + "Minimum length: 9 characters\n", |
| 387 | + "Maximum length: 10 characters\n" |
388 | 388 | ] |
389 | 389 | } |
390 | 390 | ], |
391 | 391 | "source": [ |
392 | | - "# Génération de nouveaux noms\n", |
| 392 | + "# Name generation\n", |
393 | 393 | "def generate_name(model, min_length=5):\n", |
394 | 394 | " current_sequence = [0] * max_length\n", |
395 | 395 | " generated_name = \"\"\n", |
|
398 | 398 | " x = np.array([current_sequence])\n", |
399 | 399 | " preds = model.predict(x)[0]\n", |
400 | 400 | "\n", |
401 | | - " # Sélection du prochain caractère avec random.choices\n", |
| 401 | + " # Select next character using random.choices\n", |
402 | 402 | " next_char_idx = random.choices(range(vocab_size), weights=preds, k=1)[0]\n", |
403 | 403 | " next_char = index_to_char[next_char_idx]\n", |
404 | 404 | "\n", |
405 | | - " # STOP si longueur minimale atteinte et EOS rencontré\n", |
| 405 | + " # STOP if minimum length reached and EOS encountered\n", |
406 | 406 | " if len(generated_name) >= min_length and next_char == EOS_TOKEN:\n", |
407 | 407 | " break\n", |
408 | 408 | "\n", |
409 | | - " # Ajout du caractère si ce n'est ni PAD ni EOS\n", |
| 409 | + " # Add character if it's neither PAD nor EOS\n", |
410 | 410 | " if next_char not in [PAD_TOKEN, EOS_TOKEN]:\n", |
411 | 411 | " generated_name += next_char\n", |
412 | 412 | "\n", |
413 | | - " # Mise à jour de la séquence courante\n", |
| 413 | + " # Update current sequence\n", |
414 | 414 | " current_sequence = current_sequence[1:] + [next_char_idx]\n", |
415 | 415 | "\n", |
416 | 416 | " return generated_name.capitalize() if len(generated_name) >= min_length else None\n", |
417 | 417 | "\n", |
418 | | - "# Génération de plusieurs noms\n", |
| 418 | + "# Generate multiple names\n", |
419 | 419 | "generated_names = []\n", |
420 | 420 | "number_of_names = 5\n", |
421 | 421 | "min_length = 5\n", |
|
425 | 425 | " if name is not None and name not in generated_names:\n", |
426 | 426 | " generated_names.append(name)\n", |
427 | 427 | "\n", |
428 | | - "# Affichage des résultats\n", |
429 | | - "print(\"\\nNoms générés:\")\n", |
| 428 | + "# Display results\n", |
| 429 | + "print(\"\\nGenerated names:\")\n", |
430 | 430 | "for name in generated_names:\n", |
431 | | - " print(f\"{name} ({len(name)} caractères)\")\n", |
| 431 | + " print(f\"{name} ({len(name)} characters)\")\n", |
432 | 432 | "\n", |
433 | | - "# Vérification de l'originalité\n", |
434 | | - "print(\"\\nTous les noms sont-ils originaux ?\", all(name.lower() not in [n.lower() for n in names] for name in generated_names))\n", |
| 433 | + "# Check originality\n", |
| 434 | + "print(\"\\nAre all names original?\", all(name.lower() not in [n.lower() for n in names] for name in generated_names))\n", |
435 | 435 | "\n", |
436 | | - "# Statistiques sur les longueurs\n", |
| 436 | + "# Length statistics\n", |
437 | 437 | "lengths = [len(name) for name in generated_names]\n", |
438 | | - "print(f\"\\nLongueur moyenne: {sum(lengths)/len(lengths):.1f} caractères\")\n", |
439 | | - "print(f\"Longueur minimale: {min(lengths)} caractères\")\n", |
440 | | - "print(f\"Longueur maximale: {max(lengths)} caractères\")" |
| 438 | + "print(f\"\\nAverage length: {sum(lengths)/len(lengths):.1f} characters\")\n", |
| 439 | + "print(f\"Minimum length: {min(lengths)} characters\")\n", |
| 440 | + "print(f\"Maximum length: {max(lengths)} characters\")" |
441 | 441 | ] |
442 | 442 | } |
443 | 443 | ], |
|