# eval_utils.py
# The normal SFT trainer doesn't support custom evaluation code; I want to fix that so that I can
# tell how well the model is actually doing.
import torch
from transformers import PreTrainedTokenizer, PreTrainedModel
from datasets import Dataset
from unsloth import FastLanguageModel
from typing import List, Dict, Tuple
from type_dataset_utils import TypeDataset, TypeQADataset
from tqdm import tqdm
# eventually, I'll want to confirm that there aren't other types in the generated text;
# see the contains_only_expected_types sketch after this list
TYPES = [
    'bug',
    'dark',
    'dragon',
    'electric',
    'fairy',
    'fighting',
    'fire',
    'flying',
    'ghost',
    'grass',
    'ground',
    'ice',
    'normal',
    'poison',
    'psychic',
    'rock',
    'steel',
    'water',
]
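

# A minimal sketch of the stricter check mentioned above: a hypothetical helper (not wired into the
# evals yet) that accepts an answer only if it mentions every expected type and no other known type.
# Note it inherits the same caveats as the evals below: the decoded text still contains the question,
# and naive substring matching will also fire on words like 'normally'.
def contains_only_expected_types(text: str, expected_types: List[str]) -> bool:
    """Returns True iff `text` mentions all expected types and none of the other known TYPES."""
    lowered = text.lower()
    expected = {t.lower() for t in expected_types}
    mentioned = {t for t in TYPES if t in lowered}
    return expected <= mentioned and not (mentioned - expected)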
def qa_pipeline(model, tokenizer: PreTrainedTokenizer, questions):
    padding_side = tokenizer.padding_side
    # regardless of what padding was used in training, decoder-only generation needs left padding
    tokenizer.padding_side = 'left'
    question_batch = [[{'role': 'user', 'content': q}] for q in questions]
    inputs = tokenizer.apply_chat_template(
        question_batch,
        add_generation_prompt=True,
        tokenize=False,
    )
    # the special tokens are already added when the chat template is applied
    inputs = tokenizer(inputs, padding='longest', return_tensors='pt', add_special_tokens=False).to('cuda')
    outputs = model.generate(**inputs, max_new_tokens=32, use_cache=True, do_sample=False)  # greedy decoding
    # note: batch_decode returns the full sequences, prompts included, so downstream substring
    # checks also see the question text
    text_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    tokenizer.padding_side = padding_side
    return text_batch
def single_pokemon_qa(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, questions: List[str], types: List[str]) -> int:
    """Runs inference with the model (using the tokenizer for the appropriate prompt template) and returns
    the number of questions the model got right. An answer only counts as correct if it mentions every
    expected type."""
    text_batch = qa_pipeline(model, tokenizer, questions)
    num_correct = 0
    for text in text_batch:
        if all(pokemon_type.lower() in text.lower() for pokemon_type in types):
            num_correct += 1
    return num_correct
def batch_pokemon_qa(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, questions: List[List[str]], types: List[List[str]]) -> List[int]:
    """
    Runs inference with the model for a batch of Pokémon and returns the number of questions
    the model got right for each Pokémon.
    """
    # flatten the per-Pokémon question lists into one generation batch, remembering which
    # Pokémon each question belongs to
    flattened_questions = []
    question_indices = []
    for i, pokemon_questions in enumerate(questions):
        flattened_questions.extend(pokemon_questions)
        question_indices.extend([i] * len(pokemon_questions))
    text_batch = qa_pipeline(model, tokenizer, flattened_questions)
    # Count correct answers for each Pokémon
    num_correct = [0] * len(questions)
    for text, pokemon_index in zip(text_batch, question_indices):
        if all(pokemon_type.lower() in text.lower() for pokemon_type in types[pokemon_index]):
            num_correct[pokemon_index] += 1
    return num_correct
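

# For example, with questions=[['q1', 'q2'], ['q3']] and types=[['grass', 'poison'], ['fire']],
# all three questions run through qa_pipeline in a single batch, and a return value of [2, 0]
# would mean both answers about the first Pokémon mentioned grass and poison, while the answer
# for the second Pokémon never mentioned fire.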
def evaluate_type_qa_dataset(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, question_dataset: Dataset, pokemon_batch_size=4) -> Tuple[float, float]:
    """Evaluates a Pokémon type question-answer dataset and returns the macro and micro accuracy averages."""
    was_training = model.training
    FastLanguageModel.for_inference(model)
    total_questions = 0
    total_correct_answers = 0
    per_pokemon_correct_answers = []
    num_pokemon = len(question_dataset['questions'])
    num_batches = (num_pokemon + pokemon_batch_size - 1) // pokemon_batch_size
    # the effective generation batch size is pokemon_batch_size times the questions per Pokémon;
    # for a small model like Phi-3 we can afford a much bigger batch size, which is a lot faster
    for i in tqdm(range(0, num_pokemon, pokemon_batch_size), desc='Evaluating Model for QA', unit=' Pokemon Batch',
                  total=num_batches):
        batch_questions = question_dataset['questions'][i:i + pokemon_batch_size]
        batch_types = question_dataset['types'][i:i + pokemon_batch_size]
        batch_correct_answers = batch_pokemon_qa(model, tokenizer, batch_questions, batch_types)
        # Update statistics
        for j, correct_answers in enumerate(batch_correct_answers):
            num_questions = len(batch_questions[j])
            total_questions += num_questions
            total_correct_answers += correct_answers
            per_pokemon_correct_answers.append(correct_answers / num_questions)
    # micro accuracy basically tells us how many questions the model got right in general, but it
    # doesn't tell us how robust the model is to different prompts: e.g. does it get 1/5 correct for
    # every Pokémon, or 5/5 for a fifth of the Pokémon and 0/5 for the rest? from this number alone
    # we can't tell, so we also track the per-Pokémon accuracies and their macro average
    micro_accuracy = total_correct_answers / total_questions
    macro_accuracy = sum(per_pokemon_correct_answers) / num_pokemon
    if was_training:
        model.train()
        FastLanguageModel.for_training(model)
    return macro_accuracy, micro_accuracy
def create_compute_metric_fn(
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        train_question_dataset: Dataset = None,
        val_question_dataset: Dataset = None,
        pokemon_batch_size=4):
    """
    Creates a function usable as the trainer's compute_metrics callback. Since that callback is pretty
    rigid in terms of the inputs it receives, the returned function comes pre-packaged with the provided
    model, tokenizer, and datasets. The question datasets are expected to have 'questions' and 'types'
    columns, as produced by TypeQADataset.
    """
    @torch.no_grad()
    def compute_metric_fn(*args, **kwargs) -> Dict[str, float]:
        metrics = {}
        if train_question_dataset is not None:
            train_macro, train_micro = evaluate_type_qa_dataset(model, tokenizer, train_question_dataset, pokemon_batch_size)
            metrics['train_macro_accuracy'] = train_macro
            metrics['train_micro_accuracy'] = train_micro
        if val_question_dataset is not None:
            val_macro, val_micro = evaluate_type_qa_dataset(model, tokenizer, val_question_dataset, pokemon_batch_size)
            metrics['val_macro_accuracy'] = val_macro
            metrics['val_micro_accuracy'] = val_micro
        return metrics
    return compute_metric_fn
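

# A sketch (commented out, with placeholder names) of how this is meant to plug into training: the
# returned function ignores the EvalPrediction the trainer passes it, so it can be handed to the
# trainer's compute_metrics directly. sft_dataset, eval_dataset, training_args, and val_qa_dataset
# are assumed to be defined elsewhere, and the trainer needs an eval_dataset for its evaluation loop
# to run at all.
#
#   from trl import SFTTrainer
#   trainer = SFTTrainer(
#       model=model,
#       tokenizer=tokenizer,
#       train_dataset=sft_dataset,
#       eval_dataset=eval_dataset,
#       args=training_args,
#       compute_metrics=create_compute_metric_fn(model, tokenizer,
#                                                val_question_dataset=val_qa_dataset),
#   )
#   trainer.evaluate()  # reported metrics will include the val macro/micro accuracies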
if __name__ == '__main__':
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Phi-3-mini-4k-instruct",  # the base model used for training
        max_seq_length=256,
        dtype=None,
        load_in_4bit=False
    )
    print(qa_pipeline(model, tokenizer, ['What type of pokemon is Bulbasaur?',
                                         "bulbasaur's pokedex entry shows it as what type?",
                                         "What type or types does bulbasaur have?"]))
    num_correct = single_pokemon_qa(model, tokenizer, ['What type of pokemon is Bulbasaur?',
                                                       "bulbasaur's pokedex entry shows it as what type?",
                                                       "What type or types does bulbasaur have?"],
                                    types=['Grass', 'Poison'])
    print(num_correct)
    type_dataset = TypeDataset()
    all_pokemon_qa_dataset = TypeQADataset(type_dataset, num_questions_per_pokemon=4)
    all_pokemon_qa_dataset = all_pokemon_qa_dataset.train_test_split(test_size=.2)
    compute_metric_fn = create_compute_metric_fn(
        model,
        tokenizer,
        train_question_dataset=all_pokemon_qa_dataset['train'],
        val_question_dataset=all_pokemon_qa_dataset['test'],
        pokemon_batch_size=16
    )
    print(compute_metric_fn())