Skip to content

Commit f843aa6

Browse files
New BRNN directory with the source code used to train, validate and evaluate that model.
1 parent 902be46 commit f843aa6

19 files changed

+1774
-0
lines changed

BRNN/configuration.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Configuration file with global constants
6+
"""
7+
8+
import math
9+
10+
# Data
DATA_PCT = 0.001  # presumably the fraction of each dataset to keep -- TODO confirm against callers
MODELS_DIR = './models/'
LOG_DIR = './logs/'
CSV_DIR = './csvs/'

# For each dataset size: pickle paths of the inputs ("x_*") and labels ("y_*")
# of every available split, plus a "model" tag for that dataset.
PICKLE_FILE_NAMES_7M = {
    "x_train": "pkl/7_000_000_train.pkl",
    "y_train": "pkl/7_000_000_train_labels.pkl",
    "x_val": "pkl/7_000_000_valid.pkl",
    "y_val": "pkl/7_000_000_valid_labels.pkl",
    "model": "7_000_000",
}
PICKLE_FILE_NAMES_70M = {
    "x_train": "pkl/70_000_000_train.pkl",
    "y_train": "pkl/70_000_000_train_labels.pkl",
    "x_val": "pkl/70_000_000_valid.pkl",
    "y_val": "pkl/70_000_000_valid_labels.pkl",
    "model": "70_000_000",
}
PICKLE_FILE_NAMES_400M = {
    "x_train": "pkl/432_180_483_train.pkl",
    "y_train": "pkl/432_180_483_train_labels.pkl",
    "x_val": "pkl/1_000_020_valid.pkl",
    "y_val": "pkl/1_000_020_valid_labels.pkl",
    "x_test": "pkl/1_000_020_test.pkl",
    "y_test": "pkl/1_000_020_test_labels.pkl",
    "model": "400_000_000",
}
PICKLE_FILE_NAMES_700M = {
    "x_train": "pkl/700_000_000_train.pkl",
    "y_train": "pkl/700_000_000_train_labels.pkl",
    "x_val": "pkl/700_000_000_valid.pkl",
    "y_val": "pkl/700_000_000_valid_labels.pkl",
    "x_test": "pkl/1_000_000_test.pkl",
    "y_test": "pkl/1_000_000_test_labels.pkl",
    "model": "700_000_000",
}

# Input
VOCABULARY_SIZE = 97  # Size of the vocabulary
MAX_CHARS_PER_LINE = 40  # Max number of characters per line
NUMBER_OF_CLASSES = 21  # Number of languages (different targets)

# Model hyper-parameters
# Embedding width: the number of bits needed to index the vocabulary, rounded
# up to the next even number, then multiplied by four.
value = math.ceil(math.log2(VOCABULARY_SIZE))
EMBEDDING_DIM = 4 * (value + value % 2)

# Programming languages that can be classified
LANGUAGES = [
    "Assembly", "C", "C++", "C#", "CSS", "Go", "HTML",
    "Java", "JavaScript", "Kotlin", "Matlab", "Perl", "PHP", "Python",
    "R", "Ruby", "Scala", "SQL", "Swift", "TypeScript", "Unix Shell",
]

BRNN/data.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Module that loads datasets from files
6+
"""
7+
8+
import pickle
9+
from typing import Dict
10+
11+
def load_data(file_names: Dict[str, str]):
    """Load the pickled training and validation splits.

    Args:
        file_names: mapping that provides, under the keys 'x_train',
            'y_train', 'x_val' and 'y_val', the path of each pickle file.

    Returns:
        ``((x_train, y_train), (x_val, y_val))`` with the unpickled objects.
    """
    loaded = []
    # The files are read in the same order as the returned values.
    for key in ('x_train', 'y_train', 'x_val', 'y_val'):
        with open(file_names[key], 'rb') as handle:
            loaded.append(pickle.load(handle))
    x_train, y_train, x_val, y_val = loaded
    return (x_train, y_train), (x_val, y_val)
21+
22+
23+
def select_first_in_list(iterable, pencentage: float):
    """Return the leading fraction of a sliceable sequence.

    Args:
        iterable: sequence supporting ``len()`` and slicing.
        pencentage: fraction of elements to keep; any value >= 1 returns
            ``iterable`` itself, unsliced.

    Returns:
        The first ``int(len(iterable) * pencentage)`` elements.
    """
    keep_everything = pencentage >= 1
    if keep_everything:
        return iterable
    cutoff = int(len(iterable) * pencentage)
    return iterable[:cutoff]
27+
28+
29+
def show_data(x_train, y_train, x_val, y_val):
    """Print the ``shape`` attribute of each of the four dataset splits."""
    report = (
        f"x_train shape: {x_train.shape}",
        f"y_train shape: {y_train.shape}",
        f"x_val shape: {x_val.shape}",
        f"y_val shape: {y_val.shape}",
    )
    for line in report:
        print(line)
34+

BRNN/error_analysis.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Performs an error analysis, showing the instances that were misclassified
6+
"""
7+
8+
import pickle
9+
import keras
10+
import numpy as np
11+
from configuration import PICKLE_FILE_NAMES_400M
12+
13+
# Saved model that is loaded for the analysis.
MODEL_FILE_NAME = (
    "models/RNN-432180483-batch_size_2048-n_rnn_layers_8-drop_out_0-"
    "n_neurons_hidden_dense_layer_classifier_512-n_class_layers_2-"
    "learning_rate_0.0001-n_neurons_lstm_out_256-embedding_dim_32-"
    "activation_relu-lstm_True-"
)
# Language names; the position of a name in this list is its class index.
LANGUAGE_LABELS = [
    "Assembly", "C", "C++", "C#", "CSS", "Go", "HTML",
    "Java", "JavaScript", "Kotlin", "Matlab", "Perl", "PHP", "Python",
    "R", "Ruby", "Scala", "SQL", "Swift", "TypeScript", "Unix Shell",
]
# Upper bound on how many (shuffled) instances are analysed.
NUMBER_INSTANCES_TO_PROCESS = 100_000
18+
19+
20+
def load_dataset(x_file_name: str, y_file_name: str):
    """Load a pickled dataset and turn its one-hot labels into class indices.

    Args:
        x_file_name: path of the pickle file holding the inputs.
        y_file_name: path of the pickle file holding the one-hot labels.

    Returns:
        ``(x, y)`` where ``y`` is ``np.argmax`` of the loaded labels along
        axis 1. The shapes of both are printed before returning.
    """
    print("Loading X for validation...")
    with open(x_file_name, 'rb') as x_handle:
        features = pickle.load(x_handle)
    print("Loading Y validation...")
    with open(y_file_name, 'rb') as y_handle:
        one_hot_labels = pickle.load(y_handle)
    # Convert from one-hot to integer values
    labels = np.argmax(one_hot_labels, axis=1)
    for array in (features, labels):
        print(array.shape)
    return features, labels
32+
33+
34+
def shuffle_dataset(x_data, y_data):
    """Shuffle inputs and labels with one shared random permutation.

    Both arguments must have the same length and support indexing with an
    index array (as NumPy arrays do). Returns the reordered ``(x, y)`` pair,
    so that each input stays aligned with its label.
    """
    print("Shuffling dataset...")
    n_instances = len(x_data)
    assert n_instances == len(y_data)
    order = np.random.permutation(n_instances)
    shuffled_x = x_data[order]
    shuffled_y = y_data[order]
    return shuffled_x, shuffled_y
39+
40+
41+
def get_erroneous_predictions(y_test, predicted_y, actual_lang: str, predicted_lang: str):
    """Find the instances of one language that were predicted as another.

    Args:
        y_test: actual class index of each instance.
        predicted_y: per-instance score vectors; the predicted class is the
            ``np.argmax`` of each vector.
        actual_lang: name (in ``LANGUAGE_LABELS``) of the actual language.
        predicted_lang: name (in ``LANGUAGE_LABELS``) of the predicted language.

    Returns:
        List of positions whose actual class is ``actual_lang`` and whose
        predicted class is ``predicted_lang``.
    """
    assert len(y_test) == len(predicted_y)
    actual_lang_index = LANGUAGE_LABELS.index(actual_lang)
    predicted_lang_index = LANGUAGE_LABELS.index(predicted_lang)
    return [
        position
        for position, (actual, scores) in enumerate(zip(y_test, predicted_y))
        if actual == actual_lang_index and np.argmax(scores) == predicted_lang_index
    ]
47+
48+
49+
def convert_vector_to_code(vector) -> str:
    """Decode a sequence of integer codes into text.

    Each code ``c`` becomes the character ``chr(c - 2 + 32)``, so code 2 is
    the space character.
    """
    return "".join(chr(code - 2 + 32) for code in vector)
51+
52+
53+
def show_error_instances(wrong_classification_indexes, x_test):
    """Print, decoded back to text, every instance of ``x_test`` at the given positions."""
    for position in wrong_classification_indexes:
        source_line = convert_vector_to_code(x_test[position])
        print(f'Source code line: "{source_line}".')
56+
57+
58+
def show_miss_classifications(actual_lang: str, predicted_lang: str):
    """Print the test lines of ``actual_lang`` that the model labels as ``predicted_lang``.

    Loads the 400M test split, shuffles it, keeps at most
    ``NUMBER_INSTANCES_TO_PROCESS`` instances, runs the model stored at
    ``MODEL_FILE_NAME`` on them and prints every matching source line.
    """
    # Load dataset
    paths = PICKLE_FILE_NAMES_400M
    samples, labels = load_dataset(paths['x_test'], paths['y_test'])
    samples, labels = shuffle_dataset(samples, labels)
    samples = samples[:NUMBER_INSTANCES_TO_PROCESS]
    labels = labels[:NUMBER_INSTANCES_TO_PROCESS]

    # Load model
    print(f"Loading the model from {MODEL_FILE_NAME} ...")
    classifier = keras.models.load_model(MODEL_FILE_NAME)

    # Predict the languages
    scores = classifier.predict(samples)
    matching_positions = get_erroneous_predictions(labels, scores, actual_lang, predicted_lang)
    print(f"Miss classifications of {actual_lang} (actual) as {predicted_lang} (predicted):")
    show_error_instances(matching_positions, samples)
71+
72+
73+
def main():
    """Show the Swift lines that the model classifies as Kotlin."""
    show_miss_classifications(actual_lang="Swift", predicted_lang="Kotlin")


if __name__ == "__main__":
    main()

BRNN/hyperparams.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Class modeling the hyperparameters of neural networks
6+
"""
7+
8+
from typing import Any, Optional


class HyperParams:
    """Named bag of hyper-parameters for a neural network.

    ``model_name`` is kept as an attribute; every other value lives in the
    ``properties`` dict. ``n_individuals`` and ``batch_size`` are always
    stored; the optional hyper-parameters are stored only when they are not
    ``None``, in the order in which they appear in the constructor signature.
    """

    def __init__(self, model_name: str, n_individuals: int, batch_size: int,
                 n_attention_heads: Optional[int] = None, n_trans_blocks: Optional[int] = None,
                 n_rnn_layers: Optional[int] = None,
                 n_neurons_hidden_dense_layer_encoder: Optional[int] = None,
                 drop_out: Optional[float] = None,
                 n_neurons_hidden_dense_layer_classifier: Optional[int] = None,
                 n_class_layers: Optional[int] = None,
                 learning_rate: Optional[float] = None, n_neurons_lstm_out: Optional[int] = None,
                 embedding_dim: Optional[int] = None, activation: Optional[str] = None,
                 lstm: Optional[bool] = None):
        self.model_name = model_name
        self.properties = {
            'n_individuals': n_individuals,
            'batch_size': batch_size,
        }
        # Order matters: long_name() lists the properties in insertion order.
        optional_properties = (
            ('n_attention_heads', n_attention_heads),
            ('n_trans_blocks', n_trans_blocks),
            ('n_rnn_layers', n_rnn_layers),
            ('n_neurons_hidden_dense_layer_encoder', n_neurons_hidden_dense_layer_encoder),
            ('drop_out', drop_out),
            ('n_neurons_hidden_dense_layer_classifier', n_neurons_hidden_dense_layer_classifier),
            ('n_class_layers', n_class_layers),
            ('learning_rate', learning_rate),
            ('n_neurons_lstm_out', n_neurons_lstm_out),
            ('embedding_dim', embedding_dim),
            ('activation', activation),
            ('lstm', lstm),
        )
        self.properties.update(
            (name, value) for name, value in optional_properties if value is not None
        )

    def get(self, name: str) -> Any:
        """Return the stored value of ``name``; raise ``KeyError`` if it was never set."""
        return self.properties[name]

    def set(self, name: str, value: Any) -> None:
        """Store ``value`` under ``name``, replacing any previous value."""
        self.properties[name] = value

    def long_name(self) -> str:
        """Build a descriptive name from the model name and its properties.

        The result is ``"<model_name>-<n_individuals>-"`` followed by one
        ``"<key>_<value>-"`` chunk per stored property, skipping
        ``n_individuals`` and any property whose value is ``None``.
        """
        chunks = [f"{self.model_name}-{self.properties['n_individuals']}-"]
        chunks.extend(
            f"{key}_{value}-"
            for key, value in self.properties.items()
            if value is not None and key != 'n_individuals'
        )
        return "".join(chunks)

BRNN/infer-from-params.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Example use of model inference, by restoring it from the saved model
6+
"""
7+
import pickle
8+
import keras
9+
from configuration import PICKLE_FILE_NAMES_700M, PICKLE_FILE_NAMES_400M
10+
import os
11+
from data import select_first_in_list
12+
from utils import evaluate_model
13+
14+
# Directory (with trailing slash) that holds the saved models.
MODELS_DIR = './models/'


def main():
    """Evaluate each listed saved model on the 700M test split.

    Loads the test inputs and labels once, then loads every model in turn
    and passes it to ``evaluate_model``.
    """
    file_names = PICKLE_FILE_NAMES_700M

    model_names = [
        "RNN-700000000-batch_size_2048-n_rnn_layers_6-drop_out_0-n_neurons_hidden_dense_layer_classifier_512-n_class_layers_2-learning_rate_0.001-n_neurons_lstm_out_256-embedding_dim_32-activation_relu-lstm_True-",
        "RNN-700000000-batch_size_2048-n_rnn_layers_8-drop_out_0-n_neurons_hidden_dense_layer_classifier_512-n_class_layers_2-learning_rate_0.0005-n_neurons_lstm_out_256-embedding_dim_32-activation_relu-lstm_True-",
        "RNN-700000000-batch_size_2048-n_rnn_layers_10-drop_out_0-n_neurons_hidden_dense_layer_classifier_512-n_class_layers_2-learning_rate_0.0001-n_neurons_lstm_out_256-embedding_dim_32-activation_relu-lstm_True-",
    ]

    print("Loading X for validation...")
    with open(file_names['x_test'], 'rb') as x_handle:
        x_val = pickle.load(x_handle)
    print("Loading Y validation...")
    with open(file_names['y_test'], 'rb') as y_handle:
        y_val = pickle.load(y_handle)

    # A fraction of 1 keeps the whole split.
    kept_fraction = 1
    x_val = select_first_in_list(x_val, kept_fraction)
    y_val = select_first_in_list(y_val, kept_fraction)

    for model_name in model_names:
        model_path = MODELS_DIR + model_name
        if not os.path.exists(model_path):
            print(f"File '{model_path}' not found.")
            # NOTE(review): this stops the whole run, so the remaining models
            # are not evaluated -- confirm that `continue` was not intended.
            return
        print(f"Loading model from '{model_path}'...")
        model = keras.models.load_model(model_path)
        print(f"Evaluating the model {model_path}...")

        evaluate_model(model, x_val, y_val)


if __name__ == "__main__":
    main()

BRNN/infer-from-pickle.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Example use of model inference, by restoring it from a serialized model (using pickle)
6+
"""
7+
8+
import os
import pickle
from typing import Optional

from utils import evaluate_model
11+
12+
# Directory (with trailing slash) that is searched for pickled models.
MODELS_DIR = './models/'
# Pickle paths of the inputs ("x_*") and labels ("y_*") of the 7M dataset.
PICKLE_FILE_NAMES_7M = {
    "x_train": "pkl/7_000_000_train.pkl",
    "y_train": "pkl/7_000_000_train_labels.pkl",
    "x_val": "pkl/7_000_000_valid.pkl",
    "y_val": "pkl/7_000_000_valid_labels.pkl",
}
19+
20+
def get_file_with_highest_accuracy(dir: str) -> Optional[str]:
    """Return the path of the last 'accuracy*' file in ``dir``, by name.

    Args:
        dir: directory to search, with or without a trailing separator.
            (The name shadows the builtin but is kept for existing callers.)

    Returns:
        ``dir`` joined with the lexicographically greatest file name that
        starts with 'accuracy', or ``None`` when there is no such file.
    """
    candidates = [name for name in os.listdir(dir) if name.startswith('accuracy')]
    if not candidates:
        return None
    # NOTE(review): names are compared as strings, so this picks the highest
    # accuracy only if the accuracy is encoded with a fixed width -- confirm.
    # os.path.join adds the separator that plain concatenation would omit.
    return os.path.join(dir, max(candidates))
27+
28+
def main():
    """Evaluate the pickled model chosen by ``get_file_with_highest_accuracy``.

    Does nothing when ``MODELS_DIR`` holds no candidate model; otherwise
    loads the 7M validation split and the model, then calls
    ``evaluate_model``.
    """
    model_path = get_file_with_highest_accuracy(MODELS_DIR)
    if not model_path:
        return

    # Each entry: the progress message to print and the pickle file to load.
    to_load = (
        ("Loading X for validation...", PICKLE_FILE_NAMES_7M['x_val']),
        ("Loading Y validation...", PICKLE_FILE_NAMES_7M['y_val']),
        (f"Loading the model from {model_path} ...", model_path),
    )
    loaded = []
    for message, path in to_load:
        print(message)
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(path, 'rb') as handle:
            loaded.append(pickle.load(handle))
    x_val, y_val, model = loaded

    print("Evaluating the model...")
    evaluate_model(model, x_val, y_val)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)