Skip to content

Commit fb67279

Browse files
Source code of the MLP project.
1 parent 17a0f7e commit fb67279

14 files changed

+1242
-0
lines changed

MLP/callbacks.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
This module defines a custom callback PrintAndSaveStats to monitor and save various statistics during the training of a
6+
MLP model. The callback logs information such as epoch timings, accuracy, loss, and metrics like precision and recall.
7+
It also computes aggregates like total training time and best accuracy achieved. Additionally, it writes these
8+
statistics to a file and logs them for TensorBoard visualization. The get_callbacks function generates a list of
9+
callbacks including Early Stopping, model checkpointing, the custom PrintAndSaveStats, and TensorBoard logging,
10+
tailored for a specific model with given parameters.
11+
"""
12+
13+
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
14+
import tensorflow as tf
15+
import datetime
16+
import time
17+
from parameters import get_tensorboard_path
18+
19+
20+
class PrintAndSaveStats(tf.keras.callbacks.Callback):
    """Keras callback that prints, persists and aggregates per-epoch statistics.

    At the end of every epoch the callback:

    * prints the epoch number, the raw ``logs`` dict, the elapsed time and the
      validation/training ratios of loss and accuracy;
    * appends one ``;``-separated row to the file returned by
      ``get_history_path(model_name)``;
    * updates running aggregates (total time, best validation accuracy, ...);
    * writes scalar summaries to the directory returned by
      ``get_tensorboard_path()``.

    ``logs`` must contain the keys ``accuracy``, ``val_accuracy``, ``loss`` and
    ``val_loss``. ``val_precision`` and ``val_recall`` are recorded only when
    they are present in ``logs``.
    """

    # Field separator used for the rows appended to the history file.
    _SEPARATOR = ";"

    def __init__(self, model_name):
        """Initialise the aggregates for the model called ``model_name``."""
        # Let the Keras base class set up its own state before adding ours.
        super().__init__()
        self.epoch_time_start = None
        self.model_name = model_name
        self.total_time = 0
        self.last_epoch = 1
        self.best_acc = 0
        self.best_epoch = 1
        self.first_acc = 0
        self.last_acc = 0
        self.last_loss = 0
        self.last_f1_micro = 0
        self.last_f1_macro = 0
        self.last_precision = 0
        self.last_recall = 0
        # Created lazily on the first epoch end and then reused, so that a new
        # writer is not instantiated on every epoch.
        self._summary_writer = None

    def on_epoch_begin(self, batch, logs=None):
        """Record the wall-clock time at which the epoch starts.

        Both arguments are unused; the first one keeps its original name
        ``batch`` for backward compatibility.
        """
        self.epoch_time_start = time.time()

    def on_epoch_end(self, epoch, logs):
        """Print, store and aggregate the statistics of the epoch that just ended.

        Args:
            epoch: index of the finished epoch; it is incremented by one before
                being reported.
            logs: metrics of the epoch, see the class docstring for the keys.

        Raises:
            KeyError: if one of the mandatory metric keys is missing in ``logs``.
        """
        epoch += 1
        now = datetime.datetime.now().time()
        train_acc, val_acc = logs["accuracy"], logs["val_accuracy"]
        train_loss, val_loss = logs["loss"], logs["val_loss"]

        if epoch == 1:
            self.first_acc = val_acc
        print('Epoch {} finished at {}'.format(epoch, now))
        print(f"Printing log object:\n{logs}")
        elapsed_time = int((time.time() - self.epoch_time_start))
        print(f"Elaspsed time: {elapsed_time}")
        # The ratios are skipped when the training value is zero to avoid a
        # division by zero.
        if train_loss != 0:
            print("val/train loss: {:.2f}".format(val_loss / train_loss))
        if train_acc != 0:
            print("val/train acc: {:.2f}".format(val_acc / train_acc))

        row = self._SEPARATOR.join(
            str(value)
            for value in (epoch, now, elapsed_time, train_acc, val_acc, train_loss, val_loss)
        )
        # The context manager closes the file even when the write fails.
        with open(get_history_path(self.model_name), "a") as history_file:
            history_file.write(row + "\n")

        self.compute_aggregates(elapsed_time, val_acc, epoch)

        self.last_acc = val_acc
        self.last_loss = val_loss
        # Precision and recall are optional metrics: keep the previous value
        # when the model does not report them.
        val_precision = logs.get("val_precision")
        val_recall = logs.get("val_recall")
        if val_precision is not None:
            self.last_precision = val_precision
        if val_recall is not None:
            self.last_recall = val_recall

        if self._summary_writer is None:
            self._summary_writer = tf.summary.create_file_writer(get_tensorboard_path())
        with self._summary_writer.as_default():
            tf.summary.scalar("val_accuracy", val_acc, step=epoch)
            tf.summary.scalar("val_loss", val_loss, step=epoch)
            tf.summary.scalar("train_accuracy", train_acc, step=epoch)
            tf.summary.scalar("train_loss", train_loss, step=epoch)
            tf.summary.scalar("time", elapsed_time, step=epoch)
            if val_precision is not None:
                tf.summary.scalar("precision", val_precision, step=epoch)
            if val_recall is not None:
                tf.summary.scalar("recall", val_recall, step=epoch)

    def compute_aggregates(self, elapsed_time: int, val_acc, epoch: int):
        """Update total time, last epoch and the best validation accuracy seen so far."""
        self.total_time += elapsed_time
        self.last_epoch = epoch
        if val_acc > self.best_acc:
            self.best_acc = val_acc
            self.best_epoch = epoch

    def get_stats(self):
        """Return ``[avg_epoch_time, first_acc, best_acc, best_epoch, last_epoch]``.

        The average epoch time is the total time divided by the last epoch
        number, truncated to an integer.
        """
        return [int(self.total_time / self.last_epoch), self.first_acc, self.best_acc, self.best_epoch, self.last_epoch]
87+
88+
89+
def get_history_path(model_name: str):
    """Return the name of the CSV history file that belongs to ``model_name``."""
    suffix = "_history.csv"
    return "".join((model_name, suffix))
91+
92+
93+
def get_best_model_path(model_name: str):
    """Return the name of the HDF5 checkpoint file that belongs to ``model_name``."""
    suffix = "_checkpoint.h5"
    return "".join((model_name, suffix))
95+
96+
97+
def get_callbacks(model_name: str, early_patience: int) -> list:
    """Build the list of training callbacks for the model ``model_name``.

    The returned list contains, in this order: the best-model checkpoint, the
    statistics callback, early stopping and the TensorBoard logger. Early
    stopping and the checkpoint both monitor ``val_loss``.

    Args:
        model_name: name used to derive the checkpoint and history file names.
        early_patience: ``patience`` value given to the early-stopping callback.
    """
    stopper = EarlyStopping(
        monitor="val_loss",
        mode="min",
        patience=early_patience,
        restore_best_weights=True,
        verbose=1,
    )
    checkpointer = ModelCheckpoint(
        get_best_model_path(model_name),
        save_best_only=True,
        monitor="val_loss",
        verbose=1,
    )
    stats_recorder = PrintAndSaveStats(model_name)
    board_logger = TensorBoard(get_tensorboard_path())
    return [checkpointer, stats_recorder, stopper, board_logger]

MLP/class_weight.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
This module calculates class weights for the MLP training by first extracting class labels from the training dataset.
6+
Then, it computes class weights with scikit-learn's compute_class_weight function to address
7+
class imbalance. Finally, it returns a dictionary mapping class indices to their respective weights."""
8+
9+
from lazy_load import load_ds_lazy
10+
from parameters import *
11+
from pickle_load import pickle_to_tensor
12+
import numpy as np
13+
from sklearn.utils.class_weight import compute_class_weight
14+
from typing import Dict, Any
15+
16+
17+
def get_class_weight(ds_train_y) -> Dict[int, Any]:
    """Compute balanced class weights from the training labels.

    The class of each sample is taken as the argmax along axis 1 of
    ``ds_train_y``, so the labels are expected to be a 2-D array-like with one
    row per sample (presumably one-hot encoded - confirm against the caller).

    Args:
        ds_train_y: 2-D array-like of per-sample class scores/indicators.

    Returns:
        A dictionary mapping each class index found in the labels to its
        ``'balanced'`` weight. Classes that never occur in ``ds_train_y`` get
        no entry.
    """
    class_labels = np.argmax(ds_train_y, axis=1)
    present_classes = np.unique(class_labels)
    class_weights = compute_class_weight('balanced', classes=present_classes, y=class_labels)
    # Key each weight by its actual class label rather than by its position in
    # the weight vector: the two differ as soon as a class is missing from the
    # training labels, which would otherwise shift weights onto wrong classes.
    return {int(label): weight for label, weight in zip(present_classes, class_weights)}
24+
25+
26+
if __name__ == '__main__':
    # Script entry point: for every configuration yielded by process_args(), print the class-weight
    # dictionary computed from the training labels of that configuration.
    # NOTE(review): process_args, get_pickle_paths and the upper-case keys are not defined in this
    # module; presumably they come from the star import of `parameters` - confirm.
    # NOTE(review): the indentation of this block could not be recovered from the reviewed text; the
    # layout below (labels loaded and printed for both branches) is the assumed structure - confirm.
    for config in process_args():
        ARG_MAP = config
        if ARG_MAP[LAZY_LOAD]:
            train_ds, val_ds = load_ds_lazy(ARG_MAP[BATCH_SIZE], ARG_MAP[N_LABELS], ARG_MAP[EPOCHS])
        else:
            train_ds, val_ds = get_pickle_paths()
        # NOTE(review): train_ds is concatenated with a string here, so it has to be a path string;
        # verify that load_ds_lazy also returns paths, otherwise the lazy branch fails on this line.
        train_y = pickle_to_tensor(train_ds + "_labels")
        print(f"{get_class_weight(train_y)} vector for {ARG_MAP[N_SAMPLES]} samples")

MLP/files_to_csv_raw_int_lines.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
6+
This module transforms text files into integer vectors mapping each character to its Unicode code point and stores them as CSV
7+
files. It reads the texts in the corpus and splits the different lines, concatenates them with their corresponding
8+
labels, and then batches them for efficient processing. It subsequently iterates through the resulting dataset,
9+
writing batches of lines into separate CSV files. It also handles the dataset stratification."""
10+
11+
import tensorflow as tf
12+
from typing import List
13+
import os
14+
15+
# Value passed as `num_parallel_calls` to the tf.data transformations of this module.
# NOTE: despite its name this is not a core count but the tf.data.AUTOTUNE sentinel.
N_CORES = tf.data.AUTOTUNE
16+
17+
18+
def eff_write(ds, folder, file_counter=0, lines_per_file=1000):
    """Write a dataset of ``(line, label)`` pairs to numbered CSV files.

    Every element becomes one CSV row: the values of ``line`` followed by
    ``label``, separated by commas. ``line`` is concatenated with ``[label]``
    along axis 0, so it has to be a 1-D tensor of the same dtype as ``label``.
    Rows are grouped into files named ``f<counter>.csv`` inside ``folder``.

    Args:
        ds: dataset yielding ``(line, label)`` pairs.
        folder: destination directory; created (with parents) when missing.
        file_counter: number used for the first file name.
        lines_per_file: maximum number of rows written to each file.

    Returns:
        The counter value following the last written file, so that a later
        call can continue the numbering.
    """
    # exist_ok avoids the check-then-create race and also creates missing parents.
    os.makedirs(folder, exist_ok=True)
    csv_rows = ds.map(lambda line, label: tf.strings.reduce_join(
        tf.strings.as_string(tf.concat([line, [label]], axis=0)), separator=","))
    for batch in csv_rows.batch(lines_per_file):
        # The rows only hold formatted numbers and commas, hence plain ASCII.
        file_content = tf.strings.reduce_join(batch, separator="\n").numpy().decode("ascii")
        file_path = os.path.join(folder, f"f{file_counter}.csv")
        with open(file_path, "w", encoding="ascii") as csvfile:
            csvfile.write(file_content)
        file_counter += 1
    return file_counter
32+
33+
34+
def line_to_raw_int_ds(ds: tf.data.Dataset) -> tf.data.Dataset:
    """Turn a dataset of ``(file content, label)`` pairs into ``(decoded line, label)`` pairs.

    The file contents are decoded as UTF-16LE and re-encoded as UTF-8, split into lines, filtered by a
    minimum length and finally decoded with ``tf.strings.unicode_decode`` so that each line becomes an
    integer vector. Every line keeps the label of the file it comes from.

    NOTE(review): presumably the corpus files are stored as UTF-16LE - confirm against the data.
    """
    # Labels are cast to int32, the type declared in the map_fn output signature below.
    ds = ds.map(lambda file, label: (file, tf.cast(label, tf.dtypes.int32)), num_parallel_calls=N_CORES)
    # Transcode the whole file from UTF-16LE to UTF-8 before it is split into lines.
    ds = ds.map(lambda file, label: (tf.strings.unicode_decode(file, "UTF-16LE"), label), num_parallel_calls=N_CORES)
    ds = ds.map(lambda file, label: (tf.strings.unicode_encode(file, "UTF-8"), label), num_parallel_calls=N_CORES)
    # For each file: drop "\r", split on "\n", pair every line with the file label, shuffle these lines
    # with a buffer of 1000, then interleave the per-file datasets one element at a time (block_length=1).
    ds = ds.interleave(lambda file, label: tf.data.Dataset.from_tensor_slices(
        tf.map_fn(lambda line: (line, label), tf.strings.split(tf.strings.regex_replace(file, "\r", ""), "\n"),
                  fn_output_signature=(tf.dtypes.string, tf.int32))).shuffle(1000)
                       , num_parallel_calls=N_CORES,
                       deterministic=True, block_length=1, cycle_length=20_000
                       )
    # Lines shorter than this limit, as measured by tf.strings.length, are discarded.
    # NOTE(review): tf.strings.length is called with its default unit - confirm that this is intended.
    LENGTH_LIMIT: int = 10
    ds = ds.filter(lambda line, _: tf.strings.length(line) >= LENGTH_LIMIT)
    # Final step: each UTF-8 line is decoded into its integer vector.
    return ds.map(lambda line, label: (tf.strings.unicode_decode(line, "UTF-8"), label), num_parallel_calls=N_CORES)
47+
48+
49+
def get_raw_lines_ds(source_folder: str) -> tf.data.Dataset:
    """Load the text files below ``source_folder`` and return them as a dataset of decoded lines.

    The files are read unbatched, shuffled and with integer labels, then
    handed to ``line_to_raw_int_ds``.
    """
    return line_to_raw_int_ds(
        tf.keras.utils.text_dataset_from_directory(
            source_folder,
            shuffle=True,
            batch_size=None,
            label_mode='int',
        )
    )
53+
54+
55+
def stratify_ds(ds: tf.data.Dataset, weights: List[float]):
    """Resample ``ds`` so that labels are drawn according to ``weights``.

    One filtered dataset is built per label index ``0 .. len(weights) - 1`` and
    the result samples from them with the given weights, stopping as soon as
    one of the per-label datasets is exhausted.
    """
    per_label = []
    for i in range(len(weights)):
        per_label.append(ds.filter(lambda _, label: label == i))
    return tf.data.Dataset.sample_from_datasets(per_label, weights, stop_on_empty_dataset=True)
58+
59+
60+
if __name__ == '__main__':
    # Script entry point: read the corpus, balance it across languages and write it as CSV files.
    # The folders are built with os.path.join instead of a hard-coded ".\\" prefix so that they
    # resolve to the same relative directories on Windows and on POSIX systems.
    SNIPPET_SOURCE_FOLDER = os.path.join(".", "comments_V2_TXT_test")
    line_ds: tf.data.Dataset = get_raw_lines_ds(SNIPPET_SOURCE_FOLDER)
    DS_SIZE: int = 1_000_000
    N_LANGS: int = 21
    # Same weight for every language, i.e. a balanced sampling.
    WEIGHTS = [DS_SIZE / N_LANGS] * N_LANGS
    # take() caps the number of written lines at DS_SIZE.
    line_ds = stratify_ds(line_ds, WEIGHTS).take(DS_SIZE)
    DEST_FOLDER: str = os.path.join(".", "raw_int_lines_test_balanced_1M")
    eff_write(line_ds, DEST_FOLDER)

0 commit comments

Comments
 (0)