This repository was archived by the owner on Sep 9, 2025. It is now read-only.

Commit 36c71ae

WIP: add precheck scoring functionality
Signed-off-by: greg pereira <[email protected]>
1 parent 02cb834 commit 36c71ae

File tree: 1 file changed (+112, -54 lines)

worker/cmd/generate.go

Lines changed: 112 additions & 54 deletions
@@ -35,23 +35,24 @@ import (
 )
 
 var (
-    WorkDir             string
-    VenvDir             string
-    PreCheckEndpointURL string
-    SdgEndpointURL      string
-    NumInstructions     int
-    GitRemote           string
-    Origin              string
-    GithubUsername      string
-    GithubToken         string
-    S3Bucket            string
-    AWSRegion           string
-    TlsClientCertPath   string
-    TlsClientKeyPath    string
-    TlsServerCaCertPath string
-    TlsInsecure         bool
-    MaxSeed             int
-    TaxonomyFolders     = []string{"compositional_skills", "knowledge"}
+    WorkDir                    string
+    VenvDir                    string
+    PreCheckEndpointURL        string
+    PreCheckScoringEndpointURL string
+    SdgEndpointURL             string
+    NumInstructions            int
+    GitRemote                  string
+    Origin                     string
+    GithubUsername             string
+    GithubToken                string
+    S3Bucket                   string
+    AWSRegion                  string
+    TlsClientCertPath          string
+    TlsClientKeyPath           string
+    TlsServerCaCertPath        string
+    TlsInsecure                bool
+    MaxSeed                    int
+    TaxonomyFolders            = []string{"compositional_skills", "knowledge"}
 )
 
 const (
@@ -76,35 +77,37 @@ const (
 
 // Worker encapsulates dependencies and methods to process jobs
 type Worker struct {
-    ctx                 context.Context
-    pool                *redis.Pool
-    svc                 *s3.Client
-    logger              *zap.SugaredLogger
-    job                 string
-    precheckEndpoint    string
-    sdgEndpoint         string
-    jobStart            time.Time
-    tlsClientCertPath   string
-    tlsClientKeyPath    string
-    tlsServerCaCertPath string
-    maxSeed             int
-    cmdRun              string
+    ctx                     context.Context
+    pool                    *redis.Pool
+    svc                     *s3.Client
+    logger                  *zap.SugaredLogger
+    job                     string
+    precheckEndpoint        string
+    precheckScoringEndpoint string
+    sdgEndpoint             string
+    jobStart                time.Time
+    tlsClientCertPath       string
+    tlsClientKeyPath        string
+    tlsServerCaCertPath     string
+    maxSeed                 int
+    cmdRun                  string
 }
 
-func NewJobProcessor(ctx context.Context, pool *redis.Pool, svc *s3.Client, logger *zap.SugaredLogger, job, precheckEndpoint, sdgEndpoint, tlsClientCertPath, tlsClientKeyPath, tlsServerCaCertPath string, maxSeed int) *Worker {
+func NewJobProcessor(ctx context.Context, pool *redis.Pool, svc *s3.Client, logger *zap.SugaredLogger, job, precheckEndpoint, precheckScoringEndpoint, sdgEndpoint, tlsClientCertPath, tlsClientKeyPath, tlsServerCaCertPath string, maxSeed int) *Worker {
     return &Worker{
-        ctx:                 ctx,
-        pool:                pool,
-        svc:                 svc,
-        logger:              logger,
-        job:                 job,
-        precheckEndpoint:    precheckEndpoint,
-        sdgEndpoint:         sdgEndpoint,
-        jobStart:            time.Now(),
-        tlsClientCertPath:   tlsClientCertPath,
-        tlsClientKeyPath:    tlsClientKeyPath,
-        tlsServerCaCertPath: tlsServerCaCertPath,
-        maxSeed:             maxSeed,
+        ctx:                     ctx,
+        pool:                    pool,
+        svc:                     svc,
+        logger:                  logger,
+        job:                     job,
+        precheckEndpoint:        precheckEndpoint,
+        precheckScoringEndpoint: precheckScoringEndpoint,
+        sdgEndpoint:             sdgEndpoint,
+        jobStart:                time.Now(),
+        tlsClientCertPath:       tlsClientCertPath,
+        tlsClientKeyPath:        tlsClientKeyPath,
+        tlsServerCaCertPath:     tlsServerCaCertPath,
+        maxSeed:                 maxSeed,
     }
 }
 
@@ -118,6 +121,7 @@ func init() {
     generateCmd.Flags().StringVarP(&WorkDir, "work-dir", "w", "", "Directory to work in")
     generateCmd.Flags().StringVarP(&VenvDir, "venv-dir", "v", "", "The virtual environment directory")
     generateCmd.Flags().StringVarP(&PreCheckEndpointURL, "precheck-endpoint-url", "e", "http://localhost:8000/v1", "Endpoint hosting the model API. Default, it assumes the model is served locally.")
+    generateCmd.Flags().StringVarP(&PreCheckScoringEndpointURL, "precheck-scoring-endpoint-url", "", PreCheckEndpointURL, "Endpoint hosting the model API that will score the output of precheck against the answers supplied in the PR. By default, it assumes the scoring model is the same as the precheck model and is served locally.")
     generateCmd.Flags().StringVarP(&SdgEndpointURL, "sdg-endpoint-url", "", "http://localhost:8000/v1", "Endpoint hosting the model API. Default, it assumes the model is served locally.")
     generateCmd.Flags().IntVarP(&NumInstructions, "num-instructions", "n", 10, "The number of instructions to generate")
     generateCmd.Flags().StringVarP(&GitRemote, "git-remote", "", "https://github.com/instructlab/taxonomy", "The git remote for the taxonomy repo")
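Note on the new flag's default: the value passed to StringVarP is evaluated when init() runs, so PreCheckScoringEndpointURL defaults to whatever PreCheckEndpointURL holds at registration time (its own built-in default), and a later --precheck-endpoint-url override does not propagate to it. A minimal standalone sketch of one way to keep the two in sync after flag parsing follows; the lowercased variable names and the empty-default-plus-fallback approach are illustrative assumptions, not code from this commit.

package main

import (
	"fmt"

	"github.com/spf13/cobra"
)

var (
	precheckEndpointURL        string
	precheckScoringEndpointURL string
)

func main() {
	cmd := &cobra.Command{
		Use: "generate",
		Run: func(cmd *cobra.Command, args []string) {
			// If the scoring endpoint was not set explicitly, fall back to the
			// (possibly overridden) precheck endpoint at run time.
			if !cmd.Flags().Changed("precheck-scoring-endpoint-url") {
				precheckScoringEndpointURL = precheckEndpointURL
			}
			fmt.Println("precheck endpoint:", precheckEndpointURL)
			fmt.Println("scoring endpoint: ", precheckScoringEndpointURL)
		},
	}
	cmd.Flags().StringVarP(&precheckEndpointURL, "precheck-endpoint-url", "e", "http://localhost:8000/v1", "Endpoint hosting the precheck model API")
	cmd.Flags().StringVarP(&precheckScoringEndpointURL, "precheck-scoring-endpoint-url", "", "", "Endpoint hosting the scoring model API (defaults to the precheck endpoint)")
	if err := cmd.Execute(); err != nil {
		fmt.Println(err)
	}
}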
@@ -190,6 +194,7 @@ var generateCmd = &cobra.Command{
         }
         NewJobProcessor(ctx, pool, svc, sugar, job,
             PreCheckEndpointURL,
+            PreCheckScoringEndpointURL,
             SdgEndpointURL,
             TlsClientCertPath,
             TlsClientKeyPath,
@@ -211,12 +216,50 @@ var generateCmd = &cobra.Command{
     },
 }
 
+func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpointAnswers []string, lab string, outputDir string) error {
+    if len(precheckPRAnswers) != len(precheckEndpointAnswers) {
+        errMsg := "PR and BAM returned a different number of answers, something went wrong."
+        w.logger.Error(errMsg)
+        return fmt.Errorf(errMsg)
+    }
+    // 1. decide if we're going to compare all PR answers and all BAM answers at once or if we go through the pairs
+    // 2. generate a prompt based on the following:
+    /*
+
+        Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant
+        to the questions displayed below. Evaluate whether or not the answer is a good example of how AI
+        Assistant should respond to the user’s instruction. Please assign a score using the following 3-point
+        scale:
+        1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information.
+        For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that
+        doesn’t address the user’s question or it could be incomplete and hanging. It may also include any
+        harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content.
+        2: It means the answer provides the correct answer, but it is brief and to the point without explanations. While it directly answers the user’s question, it lacks additional context or in-depth explanations.
+        3: It means the answer is a perfect answer from an AI Assistant. It intentionally addresses the user’s
+        question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the
+        area, is very well written, logical, easy to follow, engaging, and insightful. And the answer is safe and
+        does not include any harmful content.
+        Begin your evaluation by providing a short explanation. Be as objective as possible. After providing
+        your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the
+        following examples as a reference for your evaluation.
+
+    */
+    // 3. format new request via CLI
+    // 4. send request
+    // 5. receive data back
+    // 6. write output to the same outputDir as precheck
+    // 7. modify generate functions to include this new special file
+    return nil
+}
+
 // runPrecheck runs lab chat against git diffed yaml files
-func (w *Worker) runPrecheck(lab, outputDir, modelName string) error {
+func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, []string) {
     workDir := "."
     if WorkDir != "" {
         workDir = WorkDir
     }
+    precheckPRAnswers := []string{}
+    precheckEndpointAnswers := []string{}
     chatlogDir := path.Join(workDir, "data", "chatlogs")
     combinedYAMLPath := path.Join(outputDir, "combined_chatlogs.yaml")
     combinedLogPath := path.Join(outputDir, "combined_chatlogs.log")
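The numbered TODO comments above sketch the intended flow: build a judge prompt from each PR answer and the corresponding precheck answer, send it to the scoring endpoint, and write the result next to the precheck output. As a rough illustration of steps 2 through 5 only, here is a minimal sketch assuming the scoring endpoint exposes an OpenAI-compatible /chat/completions route; the endpoint path, the model name, and the scorePair helper are assumptions, not code from this commit.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

// judgePrompt stands in for the 3-point-scale instructions quoted in the
// comment above; it is abbreviated here for illustration.
const judgePrompt = "Please act as an impartial judge ... rate the answer on a scale of 1 to 3.\n\nAnswer from the PR:\n%s\n\nAnswer from the precheck endpoint:\n%s\n"

// scorePair sends one PR/endpoint answer pair to an assumed OpenAI-compatible
// chat-completions endpoint and returns the raw JSON response for the caller
// to parse the 1-3 score out of.
func scorePair(endpointURL, model, prAnswer, endpointAnswer string) (string, error) {
	reqBody, err := json.Marshal(map[string]interface{}{
		"model": model,
		"messages": []map[string]string{
			{"role": "user", "content": fmt.Sprintf(judgePrompt, prAnswer, endpointAnswer)},
		},
	})
	if err != nil {
		return "", err
	}
	resp, err := http.Post(endpointURL+"/chat/completions", "application/json", bytes.NewReader(reqBody))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(raw), nil
}

func main() {
	// Placeholder answer pairs; in the worker these would be the slices
	// returned by runPrecheck.
	prAnswers := []string{"Paris is the capital of France."}
	endpointAnswers := []string{"The capital of France is Paris."}
	for i := range prAnswers {
		out, err := scorePair("http://localhost:8000/v1", "merlinite-7b", prAnswers[i], endpointAnswers[i])
		if err != nil {
			fmt.Println("scoring request failed:", err)
			continue
		}
		fmt.Println(out)
	}
}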
@@ -297,19 +340,19 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error {
     stdout, err := cmd.StdoutPipe()
     if err != nil {
         w.logger.Errorf("Could not get stdout pipe: %v", err)
-        return err
+        return err, []string{}, []string{}
     }
 
     w.logger.Debug("Running ilab diff")
     if err := cmd.Start(); err != nil {
         w.logger.Errorf("Could not start command(%s %s): %v", cmd.Path, strings.Join(cmd.Args, " "), err)
-        return err
+        return err, []string{}, []string{}
     }
 
     output, err := io.ReadAll(stdout)
     if err != nil {
         w.logger.Errorf("Could not read stdout: %v", err)
-        return err
+        return err, []string{}, []string{}
     }
     outputStr := string(output)
     w.logger.Debugf("Output: %s", outputStr)
@@ -327,7 +370,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error {
     if yamlFileCount == 0 {
         errMsg := "No modified YAML files detected in the PR for precheck"
         w.logger.Error(errMsg)
-        return fmt.Errorf(errMsg)
+        return fmt.Errorf(errMsg), []string{}, []string{}
     }
 
     // Proceed with YAML files processing if they exist
@@ -340,14 +383,14 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error {
         f, err := os.Open(filePath)
         if err != nil {
             w.logger.Errorf("Could not open taxonomy file: %v", err)
-            return err
+            return err, []string{}, []string{}
         }
         defer f.Close()
 
         content, err := io.ReadAll(f)
         if err != nil {
             w.logger.Error(err)
-            return err
+            return err, []string{}, []string{}
         }
 
         var data map[string]interface{}
@@ -356,15 +399,16 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error {
             // Odds are, the PR was not yaml-linted since it's invalid YAML failing unmarshalling
             err = fmt.Errorf("the original taxonomy YAML likely did not pass yaml-linting, here is the unmarshalling error: %v", err)
             w.logger.Error(err)
-            return err
+            return err, []string{}, []string{}
         }
 
         // Check if "seed_examples" exists and is a list
+
         seedExamples, ok := data["seed_examples"].([]interface{})
         if !ok {
             err = fmt.Errorf("seed_examples not found or not a list")
             w.logger.Error(err)
-            return err
+            return err, []string{}, []string{}
         }
 
         for _, item := range seedExamples {
@@ -378,6 +422,12 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error {
                 w.logger.Error("Question not found or not a string")
                 continue
             }
+            answer, ok := example["answer"].(string)
+            if !ok {
+                w.logger.Error("Answer not found or not a string")
+                continue
+            }
+            precheckPRAnswers = append(precheckPRAnswers, answer)
 
             context, hasContext := example["context"].(string)
             originalQuestion := question
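For reference, the answers collected here come from the same seed_examples entries that precheck already walks for questions. The following self-contained sketch shows that extraction against an illustrative qna.yaml fragment; the sample content and the use of gopkg.in/yaml.v3 are assumptions for illustration, not taken from the worker.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3" // assumed YAML library for this illustration
)

// sample is a made-up qna.yaml fragment with the fields the loop above reads.
const sample = `
seed_examples:
  - question: What is the boiling point of water at sea level?
    answer: 100 degrees Celsius.
  - context: |
      The Eiffel Tower is located in Paris.
    question: Where is the Eiffel Tower?
    answer: It is in Paris, France.
`

func main() {
	var data map[string]interface{}
	if err := yaml.Unmarshal([]byte(sample), &data); err != nil {
		panic(err)
	}
	seedExamples, ok := data["seed_examples"].([]interface{})
	if !ok {
		panic("seed_examples not found or not a list")
	}
	for _, item := range seedExamples {
		example, ok := item.(map[string]interface{})
		if !ok {
			continue
		}
		// Mirror the worker's type assertions for question and answer.
		question, _ := example["question"].(string)
		answer, _ := example["answer"].(string)
		fmt.Printf("Q: %s\nA: %s\n", question, answer)
	}
}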
@@ -418,6 +468,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error {
                 "output": out.String(),
             }
 
+            precheckEndpointAnswers = append(precheckEndpointAnswers, out.String())
+
             if hasContext {
                 logData["input"].(map[string]string)["context"] = context
             }
@@ -450,7 +502,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error {
             time.Sleep(1 * time.Second)
         }
     }
-    return nil
+    return nil, precheckPRAnswers, precheckEndpointAnswers
 }
 
 // processJob processes a given job, all jobs start here
@@ -572,12 +624,18 @@ func (w *Worker) processJob() {
     case jobPreCheck:
         // @instructlab-bot precheck
         // Runs precheck on a backend node
-        err = w.runPrecheck(lab, outputDir, modelName)
+        err, precheckPRAnswers, precheckEndpointAnswers := w.runPrecheck(lab, outputDir, modelName)
         if err != nil {
             sugar.Errorf("Could not run precheck: %v", err)
             w.reportJobError(err)
            return
         }
+        err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, lab, outputDir)
+        if err != nil {
+            sugar.Errorf("Could not run scoring on result of precheck: %v", err)
+            w.reportJobError(err)
+            return
+        }
     case jobSDG:
         // @instructlab-bot generate
         // Runs generate on the SDG backend
