automatic_prompt_engineer/ape.py (10 changes: 5 additions & 5 deletions)
@@ -18,8 +18,8 @@ def simple_ape(dataset,
                eval_template='Instruction: [PROMPT]\nInput: [INPUT]\nOutput: [OUTPUT]',
                prompt_gen_template=None,
                demos_template='Input: [INPUT]\nOutput: [OUTPUT]',
-               eval_model='text-davinci-002',
-               prompt_gen_model='text-davinci-002',
+               eval_model='gpt-3.5-turbo',
+               prompt_gen_model='gpt-3.5-turbo',
                prompt_gen_mode='forward',
                num_prompts=50,
                eval_rounds=20,
@@ -60,7 +60,7 @@ def simple_eval(dataset,
                 prompts,
                 eval_template='Instruction: [PROMPT]\nInput: [INPUT]\nOutput: [OUTPUT]',
                 demos_template='Input: [INPUT]\nOutput: [OUTPUT]',
-                eval_model='text-davinci-002',
+                eval_model='gpt-3.5-turbo',
                 num_samples=50):
     """
     Function that wraps the evaluate_prompts function to make it easier to use.
@@ -87,8 +87,8 @@ def simple_estimate_cost(dataset,
                          eval_template='Instruction: [PROMPT]\nInput: [INPUT]\nOutput: [OUTPUT]',
                          prompt_gen_template=None,
                          demos_template='Input: [INPUT]\nOutput: [OUTPUT]',
-                         eval_model='text-davinci-002',
-                         prompt_gen_model='text-davinci-002',
+                         eval_model='gpt-3.5-turbo',
+                         prompt_gen_model='gpt-3.5-turbo',
                          prompt_gen_mode='forward',
                          num_prompts=50,
                          eval_rounds=20,
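Note: with these new defaults, callers that never pass eval_model or prompt_gen_model are switched to the chat endpoint silently. A minimal usage sketch under the new defaults; the toy dataset and variable names are illustrative, not from this PR:

    from automatic_prompt_engineer import ape

    # Toy induction data: (inputs, outputs) pairs; illustrative only.
    words = (['sane', 'direct'], [['insane'], ['indirect']])

    result, demo_fn = ape.simple_ape(
        dataset=words,
        num_prompts=5,   # keep the search small for a smoke test
        eval_rounds=2,
    )  # eval_model / prompt_gen_model now default to 'gpt-3.5-turbo'
    print(result)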
automatic_prompt_engineer/configs/bandits.yaml (6 changes: 3 additions & 3 deletions)
@@ -6,7 +6,7 @@ generation:
     name: GPT_forward # the name of the model used for prompt generation
     batch_size: 500 # the maximum batch size used for prompt generation
     gpt_config: # the configuration of the GPT model used for prompt generation (these are fed directly to the openai function)
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.9
       max_tokens: 50
       top_p: 0.9
@@ -27,7 +27,7 @@ evaluation:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
@@ -38,7 +38,7 @@ demo:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
automatic_prompt_engineer/configs/default.yaml (6 changes: 3 additions & 3 deletions)
@@ -6,7 +6,7 @@ generation:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.9
       max_tokens: 50
       top_p: 0.9
@@ -20,7 +20,7 @@ evaluation:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
@@ -31,7 +31,7 @@ demo:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
automatic_prompt_engineer/evaluation/likelihood.py (5 changes: 3 additions & 2 deletions)
@@ -80,9 +80,10 @@ def __init__(self, prompts, log_probs, num_samples):
     def _compute_avg_likelihood(self, prompts, log_probs, num_samples):
         i = 0
         prompt_log_probs = []
-        for prompt in prompts:
+        # TODO: prompts and num_samples should be the loop bounds
+        for log_prob in log_probs:
             prompt_log_probs.append([])
-            for _ in range(num_samples):
+            for _ in range(len(log_probs)):
                 lps = log_probs[i]
                 prompt_log_probs[-1].append(sum(lps) / len(lps))
                 i += 1
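Note: as merged, both loops run over log_probs, so i is incremented len(log_probs)**2 times while log_probs[i] is only valid for the first len(log_probs) of them; any run with more than one entry raises IndexError. The TODO points at the intended shape. A sketch, assuming log_probs is a flat prompt-major list with num_samples entries per prompt:

    def _compute_avg_likelihood(self, prompts, log_probs, num_samples):
        i = 0
        prompt_log_probs = []
        for _ in prompts:                  # one bucket per prompt
            prompt_log_probs.append([])
            for _ in range(num_samples):   # num_samples results per prompt
                lps = log_probs[i]         # token logprobs for one sample
                prompt_log_probs[-1].append(sum(lps) / len(lps))
                i += 1
        return prompt_log_probs            # the real method may aggregate further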
automatic_prompt_engineer/llm.py (100 changes: 63 additions & 37 deletions)
@@ -4,8 +4,12 @@
 import time
 from tqdm import tqdm
 from abc import ABC, abstractmethod

-import openai
+from openai import OpenAI
+
+SYSTEM_PROMPT = 'You are a kick-ass prompt engineer. You are given input variables and the output generated by an LLM. Find the right prompt for this batch of inputs and outputs.'
+
+from automatic_prompt_engineer import utils

 gpt_costs_per_thousand = {
     'davinci': 0.0200,
@@ -18,10 +22,11 @@
 def model_from_config(config, disable_tqdm=True):
     """Returns a model based on the config."""
     model_type = config["name"]
+    client = OpenAI()
     if model_type == "GPT_forward":
-        return GPT_Forward(config, disable_tqdm=disable_tqdm)
+        return GPT_Forward(config, client, disable_tqdm=disable_tqdm)
     elif model_type == "GPT_insert":
-        return GPT_Insert(config, disable_tqdm=disable_tqdm)
+        return GPT_Insert(config, client, disable_tqdm=disable_tqdm)
     raise ValueError(f"Unknown model type: {model_type}")


@@ -54,11 +59,12 @@ def log_probs(self, text, log_prob_range):
 class GPT_Forward(LLM):
     """Wrapper for GPT-3."""

-    def __init__(self, config, needs_confirmation=False, disable_tqdm=True):
+    def __init__(self, config, client, needs_confirmation=False, disable_tqdm=True):
         """Initializes the model."""
         self.config = config
         self.needs_confirmation = needs_confirmation
         self.disable_tqdm = disable_tqdm
+        self.client = client

     def confirm_cost(self, texts, n, max_tokens):
         total_estimated_cost = 0
@@ -155,18 +161,21 @@ def __generate_text(self, prompt, n):
         for i in range(len(prompt)):
             prompt[i] = prompt[i].replace('[APE]', '').strip()
         response = None
+
+        messages = utils.get_messages(prompt)
+
         while response is None:
             try:
-                response = openai.Completion.create(
-                    **config, prompt=prompt)
+                response = self.client.chat.completions.create(
+                    **config, messages=messages)
             except Exception as e:
                 if 'is greater than the maximum' in str(e):
                     raise BatchSizeException()
                 print(e)
                 print('Retrying...')
                 time.sleep(5)

-        return [response['choices'][i]['text'] for i in range(len(response['choices']))]
+        return [response.choices[i].message.content for i in range(len(response.choices))]

     def __complete(self, prompt, n):
         """Generates text from the model and returns the log prob data."""
@@ -178,10 +187,13 @@ def __complete(self, prompt, n):
         for i in range(len(prompt)):
             prompt[i] = prompt[i].replace('[APE]', '').strip()
         response = None
+
+        messages = utils.get_messages(prompt)
+
         while response is None:
             try:
-                response = openai.Completion.create(
-                    **config, prompt=prompt)
+                response = self.client.chat.completions.create(
+                    **config, messages=messages)
             except Exception as e:
                 print(e)
                 print('Retrying...')
@@ -199,42 +211,51 @@ def __log_probs(self, text, log_prob_range=None):
                 assert lower_index >= 0
                 assert upper_index - 1 < len(text[i])
         config = self.config['gpt_config'].copy()
-        config['logprobs'] = 1
-        config['echo'] = True
-        config['max_tokens'] = 0
-        if isinstance(text, list):
-            text = [f'\n{text[i]}' for i in range(len(text))]
-        else:
-            text = f'\n{text}'
+        config['logprobs'] = True
+        config['top_logprobs'] = 1
+        # config['echo'] = True
+        config['max_tokens'] = 50
+        # if isinstance(text, list):
+        #     text = [f'\n{text[i]}' for i in range(len(text))]
+        # else:
+        #     text = f'\n{text}'
         response = None
+        messages = utils.get_messages(text)

         while response is None:
             try:
-                response = openai.Completion.create(
-                    **config, prompt=text)
+                response = self.client.chat.completions.create(
+                    **config, messages=messages)
             except Exception as e:
                 print(e)
                 print('Retrying...')
                 time.sleep(5)
-        log_probs = [response['choices'][i]['logprobs']['token_logprobs'][1:]
-                     for i in range(len(response['choices']))]
-        tokens = [response['choices'][i]['logprobs']['tokens'][1:]
-                  for i in range(len(response['choices']))]
-        offsets = [response['choices'][i]['logprobs']['text_offset'][1:]
-                   for i in range(len(response['choices']))]
-
-        # Subtract 1 from the offsets to account for the newline
-        for i in range(len(offsets)):
-            offsets[i] = [offset - 1 for offset in offsets[i]]
-
-        if log_prob_range is not None:
-            # First, we need to find the indices of the tokens in the log probs
-            # that correspond to the tokens in the log_prob_range
-            for i in range(len(log_probs)):
-                lower_index, upper_index = self.get_token_indices(
-                    offsets[i], log_prob_range[i])
-                log_probs[i] = log_probs[i][lower_index:upper_index]
-                tokens[i] = tokens[i][lower_index:upper_index]
+        log_probs = []
+        tokens = []
+        idx = 0
+        jdx = 0
+
+        try:
+            for i in range(len(response.choices)):
+                idx = i
+                this_log_probs = []
+                this_tokens = []
+                choice = response.choices[i]
+                for j in range(len(choice.logprobs.content)):
+                    jdx = j
+                    this_log_probs.append(choice.logprobs.content[j].logprob)
+                    this_tokens.append(choice.logprobs.content[j].token)
+
+                log_probs.append(this_log_probs)  # keep this choice's logprobs alongside its tokens
+                tokens.append(this_tokens)
+
+        except AttributeError:
+            pass  # choice.logprobs is None when the model returns no logprobs
+        finally:
+            print(f"logprobs {response.choices[idx].logprobs.content[jdx]}")

         return log_probs, tokens

     def get_token_indices(self, offsets, log_prob_range):
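Note: two semantic changes hide in this hunk. First, the old code used echo=True with max_tokens=0 to score the logprobs of the input text itself, while the chat endpoint only returns logprobs for generated tokens, so the method now measures something different. Second, the chat response carries no text_offset field, which leaves the log_prob_range slicing (and get_token_indices below) without offsets to work from. If slicing were still needed, offsets would have to be rebuilt from the token strings, e.g. with a helper like this (hypothetical, not in the PR):

    def offsets_from_tokens(tokens):
        """Start offset of each token in the concatenated text."""
        offsets, pos = [], 0
        for tok in tokens:
            offsets.append(pos)
            pos += len(tok)
        return offsets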
@@ -258,11 +279,12 @@ def get_token_indices(self, offsets, log_prob_range):

 class GPT_Insert(LLM):

-    def __init__(self, config, needs_confirmation=False, disable_tqdm=True):
+    def __init__(self, config, client, needs_confirmation=False, disable_tqdm=True):
         """Initializes the model."""
         self.config = config
         self.needs_confirmation = needs_confirmation
         self.disable_tqdm = disable_tqdm
+        self.client = client

     def confirm_cost(self, texts, n, max_tokens):
         total_estimated_cost = 0
@@ -314,10 +336,14 @@ def __generate_text(self, prompt, n):
         prefix = prompt[0].split('[APE]')[0]
         suffix = prompt[0].split('[APE]')[1]
         response = None
+
+        messages = utils.get_messages(prompt)
+
+
         while response is None:
             try:
-                response = openai.Completion.create(
-                    **config, prompt=prefix, suffix=suffix)
+                response = self.client.chat.completions.create(
+                    **config, messages=messages, suffix=suffix)
             except Exception as e:
                 print(e)
                 print('Retrying...')
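Note: chat.completions.create() does not accept a suffix keyword (that parameter belongs to the legacy completions endpoint), so the insert-mode call above should be rejected by the client. One possible workaround, an assumption rather than this PR's code, is to fold both halves into the user message:

    def insert_messages(prefix, suffix):
        """Hypothetical insert-mode emulation for the chat endpoint."""
        return [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': (
                'Write only the text that belongs between BEFORE and AFTER.\n'
                f'BEFORE:\n{prefix}\n\nAFTER:\n{suffix}')},
        ]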
experiments/configs/instruction_induction.yaml (6 changes: 3 additions & 3 deletions)
@@ -6,7 +6,7 @@ generation:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.9
       max_tokens: 50
       top_p: 0.9
@@ -20,7 +20,7 @@ evaluation:
     name: GPT_forward
     batch_size: 20
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
@@ -31,7 +31,7 @@ demo:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
experiments/configs/truthful_qa.yaml (6 changes: 3 additions & 3 deletions)
@@ -6,7 +6,7 @@ generation:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.9
       max_tokens: 50
       top_p: 0.9
@@ -19,7 +19,7 @@ evaluation:
     name: GPT_forward
     batch_size: 20
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
@@ -30,7 +30,7 @@ demo:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0