Implement Evaluation Mode for Traces and Logs (#92)

### PR Title: Implement Evaluation Mode for Traces and Logs

### Overview

This pull request introduces the ability to distinguish between logs generated in evaluation mode and those created during normal operation. This differentiation is crucial for ensuring that logs meant for evaluation do not interfere with the learning process and data meant for production use.

### Changes

1. **API Adjustments**: New `EvalMode` methods and fields are introduced in various structures (`LogEntry`, `GenerateTrace`, `ExecuteTrace`, and `BlockLog`) across the API. These changes allow the identification and handling of logs and traces created in evaluation mode.
2. **Analyzer Logic Enhancement**: The analysis process now accounts for the evaluation mode flag when processing and combining log entries. This ensures that logs generated during evaluation carry the evaluation mode flag throughout the analysis pipeline, from individual log entries to compiled traces.
3. **Exclusion from Learning**: Blocks marked as created in evaluation mode are now excluded from the learning process. This is implemented in the learning logic to prevent evaluation data from being used as learning examples, thus maintaining the integrity of the training data.

### Testing

- Extended unit tests to cover the changes in log analysis and learning processes, ensuring that evaluation mode logs are processed and handled as expected.
- Included tests with scenarios that simulate operational and evaluation conditions to verify the robustness of the new logic.
jlewi · May 5, 2024 · 6f19eac · 6f19eac
1 parent 1be7110
commit 6f19eac
Show file tree

Hide file tree

Showing 9 changed files with 202 additions and 13 deletions.
diff --git a/app/api/types.go b/app/api/types.go
@@ -39,6 +39,19 @@ func (L *LogEntry) Request() []byte {
 	return nil
 }
 
+// EvalMode returns (value, ok), where ok is true only if the "evalMode" field is present and holds a bool.
+// When ok is true, value is the value of the field; otherwise value is false.
+func (L *LogEntry) EvalMode() (bool, bool) {
+	v, ok := (*L)["evalMode"]
+	if !ok {
+		return false, false
+	}
+	if val, ok := v.(bool); ok {
+		return val, true
+	}
+	return false, false
+}
+
 func (L *LogEntry) Response() []byte {
 	v, ok := (*L)["response"]
 	if !ok {
@@ -127,6 +140,7 @@ type GenerateTrace struct {
 	EndTime   time.Time                  `json:"endTime"`
 	Request   *v1alpha1.GenerateRequest  `json:"request"`
 	Response  *v1alpha1.GenerateResponse `json:"response"`
+	EvalMode  bool                       `json:"evalMode"`
 }
 
 func (g *GenerateTrace) ID() string {
@@ -145,6 +159,7 @@ type ExecuteTrace struct {
 	EndTime   time.Time                 `json:"endTime"`
 	Request   *v1alpha1.ExecuteRequest  `json:"request"`
 	Response  *v1alpha1.ExecuteResponse `json:"response"`
+	EvalMode  bool                      `json:"evalMode"`
 }
 
 func (e *ExecuteTrace) ID() string {
@@ -177,4 +192,7 @@ type BlockLog struct {
 
 	// ExitCode is the exit code of the executed block
 	ExitCode int `json:"exitCode"`
+
+	// EvalMode is true if the block was generated as part of an evaluation and shouldn't be used for learning
+	EvalMode bool `json:"evalMode"`
 }
diff --git a/app/pkg/analyze/analyzer.go b/app/pkg/analyze/analyzer.go
@@ -296,6 +296,11 @@ func buildBlockLog(ctx context.Context, block *api.BlockLog, traces map[string]a
 			block.Doc = genTrace.Request.GetDoc()
 		}
 
+		// If the block was generated as part of evaluation mode then consider it to be in evaluation mode.
+		if genTrace.EvalMode {
+			block.EvalMode = true
+		}
+
 		// Find the actual block
 		for _, b := range genTrace.Response.GetBlocks() {
 			if b.GetId() == block.ID {
@@ -327,6 +332,10 @@ func buildBlockLog(ctx context.Context, block *api.BlockLog, traces map[string]a
 		}
 	}
 	if lastTrace != nil {
+		// If the block was executed as part of evaluation mode then consider it to be in evaluation mode.
+		if lastTrace.EvalMode {
+			block.EvalMode = true
+		}
 		block.ExecutedBlock = lastTrace.Request.GetBlock()
 		block.ExitCode = unsetExitCode
 		for _, o := range lastTrace.Response.GetOutputs() {
@@ -364,10 +373,21 @@ func combineEntriesForTrace(ctx context.Context, entries []*api.LogEntry) (api.T
 
 func combineGenerateTrace(ctx context.Context, entries []*api.LogEntry) (*api.GenerateTrace, error) {
 	trace := &api.GenerateTrace{}
+	evalMode := false
 	for _, e := range entries {
 		if trace.TraceID == "" {
 			trace.TraceID = e.TraceID()
 		}
+		if mode, present := e.EvalMode(); present {
+			// If any of the entries are marked as true then we will consider the trace to be in eval mode.
+			// We don't want to assume that the evalMode will be set on all log entries in the trace.
+			// So the logic is to assume it's not eval mode by default and then set it to eval mode if we find
+			// one entry that is marked as eval mode.
+			if mode {
+				evalMode = mode
+			}
+		}
+
 		if trace.Request == nil {
 			raw := e.Request()
 			if raw != nil {
@@ -392,16 +412,27 @@ func combineGenerateTrace(ctx context.Context, entries []*api.LogEntry) (*api.Ge
 			}
 		}
 	}
-
+	trace.EvalMode = evalMode
 	return trace, nil
 }
 
 func combineExecuteTrace(ctx context.Context, entries []*api.LogEntry) (*api.ExecuteTrace, error) {
 	trace := &api.ExecuteTrace{}
+	evalMode := false
 	for _, e := range entries {
 		if trace.TraceID == "" {
 			trace.TraceID = e.TraceID()
 		}
+		if mode, present := e.EvalMode(); present {
+			// If any of the entries are marked as true then we will consider the trace to be in eval mode.
+			// We don't want to assume that the evalMode will be set on all log entries in the trace.
+			// So the logic is to assume it's not eval mode by default and then set it to eval mode if we find
+			// one entry that is marked as eval mode.
+			if mode {
+				evalMode = mode
+			}
+		}
+
 		if trace.Request == nil {
 			raw := e.Request()
 			if raw != nil {
@@ -426,6 +457,6 @@ func combineExecuteTrace(ctx context.Context, entries []*api.LogEntry) (*api.Exe
 			}
 		}
 	}
-
+	trace.EvalMode = evalMode
 	return trace, nil
 }
diff --git a/app/pkg/analyze/analyzer_test.go b/app/pkg/analyze/analyzer_test.go
@@ -112,9 +112,60 @@ func Test_BuildBlockLog(t *testing.T) {
 		},
 	}
 
+	// Create a block in evaluation mode
+	const bid2 = "g456output1"
+	genTrace2 := &api.GenerateTrace{
+		TraceID:   "g456",
+		StartTime: timeMustParse(time.RFC3339, "2021-01-01T00:00:00Z"),
+		EndTime:   timeMustParse(time.RFC3339, "2021-01-01T00:01:00Z"),
+		Request: &v1alpha1.GenerateRequest{
+			Doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Contents: "echo hello",
+					},
+				},
+			},
+		},
+		Response: &v1alpha1.GenerateResponse{
+			Blocks: []*v1alpha1.Block{
+				{
+					Id:       bid2,
+					Contents: "outcell",
+				},
+			},
+		},
+		EvalMode: true,
+	}
+
+	execTrace3 := &api.ExecuteTrace{
+		TraceID:   "e912",
+		StartTime: timeMustParse(time.RFC3339, "2021-01-03T00:00:00Z"),
+		EndTime:   timeMustParse(time.RFC3339, "2021-01-03T00:01:00Z"),
+		Request: &v1alpha1.ExecuteRequest{
+			Block: &v1alpha1.Block{
+				Contents: "echo hello",
+				Id:       bid2,
+			},
+		},
+		Response: &v1alpha1.ExecuteResponse{
+			Outputs: []*v1alpha1.BlockOutput{
+				{
+					Items: []*v1alpha1.BlockOutputItem{
+						{
+							TextData: "exitCode: 7",
+						},
+					},
+				},
+			},
+		},
+	}
+
 	traces[genTrace.TraceID] = genTrace
+	traces[genTrace2.TraceID] = genTrace2
 	traces[execTrace1.TraceID] = execTrace1
 	traces[execTrace2.TraceID] = execTrace2
+	traces[execTrace3.TraceID] = execTrace3
 
 	// We shuffle ExecTraceIds to make sure we properly set block log based on the later trace
 	execTraceIds := shuffle([]string{execTrace1.TraceID, execTrace2.TraceID})
@@ -135,6 +186,27 @@ func Test_BuildBlockLog(t *testing.T) {
 				GeneratedBlock: genTrace.Response.Blocks[0],
 				ExecutedBlock:  execTrace2.Request.Block,
 				ExitCode:       7,
+				EvalMode:       false,
+			},
+			traces: traces,
+		},
+		{
+			name: "eval_mode",
+			block: &api.BlockLog{
+				ID:         bid2,
+				GenTraceID: genTrace2.TraceID,
+
+				ExecTraceIDs: []string{execTrace3.TraceID},
+			},
+			expected: &api.BlockLog{
+				ID:             bid2,
+				GenTraceID:     genTrace2.TraceID,
+				ExecTraceIDs:   []string{execTrace3.TraceID},
+				Doc:            genTrace2.Request.Doc,
+				GeneratedBlock: genTrace2.Response.Blocks[0],
+				ExecutedBlock:  execTrace3.Request.Block,
+				ExitCode:       7,
+				EvalMode:       true,
 			},
 			traces: traces,
 		},
@@ -296,14 +368,21 @@ func checkExecuteTracesFiles(t *testing.T, path string) {
 
 func Test_CombineGenerateEntries(t *testing.T) {
 	type testCase struct {
-		name      string
-		linesFile string
+		name             string
+		linesFile        string
+		expectedEvalMode bool
 	}
 
 	cases := []testCase{
 		{
-			name:      "basic",
-			linesFile: "generate_trace_lines.jsonl",
+			name:             "basic",
+			linesFile:        "generate_trace_lines.jsonl",
+			expectedEvalMode: false,
+		},
+		{
+			name:             "evalMode",
+			linesFile:        "generate_trace_lines_eval_mode.jsonl",
+			expectedEvalMode: true,
 		},
 	}
 
@@ -345,20 +424,36 @@ func Test_CombineGenerateEntries(t *testing.T) {
 			if trace.Response == nil {
 				t.Errorf("Expected trace to have a response")
 			}
+
+			if trace.EvalMode != c.expectedEvalMode {
+				t.Errorf("Expected EvalMode to be %v but got %v", c.expectedEvalMode, trace.EvalMode)
+			}
 		})
 	}
 }
 
 func Test_CombineExecuteEntries(t *testing.T) {
 	type testCase struct {
-		name      string
-		linesFile string
+		name             string
+		linesFile        string
+		expectedEvalMode bool
 	}
 
 	cases := []testCase{
 		{
-			name:      "basic",
-			linesFile: "execute_traces_lines.jsonl",
+			name:             "basic",
+			linesFile:        "execute_traces_lines.jsonl",
+			expectedEvalMode: false,
+		},
+		{
+			name:             "eval_mode_true",
+			linesFile:        "execute_traces_lines_eval_mode.jsonl",
+			expectedEvalMode: true,
+		},
+		{
+			name:             "eval_mode_false",
+			linesFile:        "execute_traces_lines_eval_mode_false.jsonl",
+			expectedEvalMode: false,
 		},
 	}
 
@@ -400,6 +495,9 @@ func Test_CombineExecuteEntries(t *testing.T) {
 			if trace.Response == nil {
 				t.Errorf("Expected trace to have a response")
 			}
+			if trace.EvalMode != c.expectedEvalMode {
+				t.Errorf("Expected EvalMode to be %v but got %v", c.expectedEvalMode, trace.EvalMode)
+			}
 		})
 	}
 }
diff --git a/app/pkg/analyze/test_data/execute_traces_lines_eval_mode.jsonl b/app/pkg/analyze/test_data/execute_traces_lines_eval_mode.jsonl
@@ -0,0 +1,2 @@
+{"severity":"info","time":1713303870.400788,"caller":"executor/executor.go:43", "evalMode":true,"function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executor.Execute","traceId":"43281a4e73cd76570e9851589207a8bd","blockId":"","request":{"block":{"kind":"CODE","language":"","contents":"gcloud logging read --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":""}}}
+{"severity":"info","time":1713303872.587626,"caller":"executor/executor.go:61","function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executed instructions","traceId":"43281a4e73cd76570e9851589207a8bd","instructionsError":"json: unsupported type: chan string","response":{"outputs":[{"items":[{"mime":"text/plain","text_data":"exitCode: 0"}]}]}}
diff --git a/app/pkg/analyze/test_data/execute_traces_lines_eval_mode_false.jsonl b/app/pkg/analyze/test_data/execute_traces_lines_eval_mode_false.jsonl
@@ -0,0 +1,2 @@
+{"severity":"info","time":1713303870.400788,"caller":"executor/executor.go:43", "evalMode":false,"function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executor.Execute","traceId":"43281a4e73cd76570e9851589207a8bd","blockId":"","request":{"block":{"kind":"CODE","language":"","contents":"gcloud logging read --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":""}}}
+{"severity":"info","time":1713303872.587626,"caller":"executor/executor.go:61","function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executed instructions","traceId":"43281a4e73cd76570e9851589207a8bd","instructionsError":"json: unsupported type: chan string","response":{"outputs":[{"items":[{"mime":"text/plain","text_data":"exitCode: 0"}]}]}}
diff --git a/app/pkg/analyze/test_data/generate_trace_lines_eval_mode.jsonl b/app/pkg/analyze/test_data/generate_trace_lines_eval_mode.jsonl
@@ -0,0 +1,5 @@
+{"severity":"info","time":1713303852.283372,"caller":"agent/agent.go:61", "evalMode": true,"function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","request":{"doc":{"blocks":[{"kind":"MARKUP","language":"markdown","contents":"Use gcloud to read the logs for the cluster dev in project foyle-dev; assume you aready logged in","outputs":[],"trace_ids":["","","",""],"id":""}]}}}
+{"severity":"info","time":1713303852.283869,"caller":"agent/agent.go:117","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).completeWithRetries","message":"OpenAI:CreateChatCompletion","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","request":{"model":"gpt-3.5-turbo-0125","messages":[{"role":"system","content":"You are a helpful AI assistant for software developers. You are helping software engineers write markdown documents to deploy\nand operate software. Your job is to help users reason about problems and tasks and come up with the appropriate\ncommands to accomplish them. You should never try to execute commands. You should always tell the user\nto execute the commands themselves. To help the user place the commands inside a code block with the language set to\nbash. Users can then execute the commands inside VSCode notebooks. The output will then be appended to the document.\nYou can then use that output to reason about the next steps.\n\nYou are only helping users with tasks related to building, deploying, and operating software. You should interpret\nany questions or commands in that context.\n"},{"role":"user","content":"Please continue writing this markdown document to deal with any tasks or issues listed\nin the document. The document is a markdown document. It will contain a description of the task\nor problem, I need your help with. It will then contain one or more code blocks containing commands\nto be executed to accomplish the task or obtain information needed to figure out the problem.\nIf a command has already been executed the output of the command will be provided in a code block\nwith the language `output`. 
Use the output to help you figure out the problem or complete the task.\nIf you need me to execute a command please provide the command in a code block and I will execute it\nand then add the output to the document.\n\nHere's the document:\n\nUse gcloud to read the logs for the cluster dev in project foyle-dev; assume you aready logged in\n"}],"max_tokens":2000,"temperature":0.9}}
+{"severity":"info","time":1713303855.224336,"caller":"agent/agent.go:132","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).completeWithRetries","message":"OpenAI:CreateChatCompletion response","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","resp":{"id":"chatcmpl-9Ekt2d8y2F8FKBEfPdVj6iFiV4ciL","object":"chat.completion","created":1713303852,"model":"gpt-3.5-turbo-0125","choices":[{"index":0,"message":{"role":"assistant","content":"To read the logs for the cluster `dev` in project `foyle-dev` using `gcloud`, you can use the following command:\n\n```bash\ngcloud container clusters get-credentials dev --project=foyle-dev\n```\n\nThis command will authenticate `kubectl` to the cluster `dev` in the `foyle-dev` project.\n\nAfter this, you can use `kubectl` to read the logs for the cluster. Here's the command to read the logs for a specific pod:\n\n```bash\nkubectl logs \u003cpod_name\u003e\n```\n\nReplace `\u003cpod_name\u003e` with the name of the pod for which you want to read the logs. If you are not sure about the pod name, you can list all pods in the cluster using the following command:\n\n```bash\nkubectl get pods\n```\n\nPlease execute the commands above and provide the output if you need further assistance."},"finish_reason":"stop"}],"usage":{"prompt_tokens":336,"completion_tokens":179,"total_tokens":515},"system_fingerprint":"fp_c2295e73ad"}}
+{"severity":"info","time":1713303855.225972,"caller":"agent/agent.go:76","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate returning blocks","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","blockIds":["10b11f2d-7c8d-4d58-bedc-7e2dd51a85dc","8594d742-39d7-473b-b4ad-901ff362fdb3","f1662328-e884-418c-a084-95dfb1a3f7fc","4feb6219-d050-4630-8ceb-d08ec149b60d","3893a0b6-8c84-49ca-a38c-fbf6d7adfcde","fd276a6f-f379-4f9c-9779-0ed07819d0f5","d507ce35-af59-4f92-8dec-6c37d7b26647"]}
+{"severity":"info","time":1713303855.226055,"caller":"agent/agent.go:83","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate returning response","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","response":{"blocks":[{"kind":"MARKUP","language":"","contents":"To read the logs for the cluster `dev` in project `foyle-dev` using `gcloud`, you can use the following command:","outputs":[],"trace_ids":[],"id":"10b11f2d-7c8d-4d58-bedc-7e2dd51a85dc"},{"kind":"CODE","language":"bash","contents":"gcloud container clusters get-credentials dev --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":"8594d742-39d7-473b-b4ad-901ff362fdb3"},{"kind":"MARKUP","language":"","contents":"\n\nThis command will authenticate `kubectl` to the cluster `dev` in the `foyle-dev` project.\n\nAfter this, you can use `kubectl` to read the logs for the cluster. Here's the command to read the logs for a specific pod:","outputs":[],"trace_ids":[],"id":"f1662328-e884-418c-a084-95dfb1a3f7fc"},{"kind":"CODE","language":"bash","contents":"kubectl logs <pod_name>\n","outputs":[],"trace_ids":[],"id":"4feb6219-d050-4630-8ceb-d08ec149b60d"},{"kind":"MARKUP","language":"","contents":"\n\nReplace `<pod_name>` with the name of the pod for which you want to read the logs. If you are not sure about the pod name, you can list all pods in the cluster using the following command:","outputs":[],"trace_ids":[],"id":"3893a0b6-8c84-49ca-a38c-fbf6d7adfcde"},{"kind":"CODE","language":"bash","contents":"kubectl get pods\n","outputs":[],"trace_ids":[],"id":"fd276a6f-f379-4f9c-9779-0ed07819d0f5"},{"kind":"MARKUP","language":"","contents":"\n\nPlease execute the commands above and provide the output if you need further assistance.","outputs":[],"trace_ids":[],"id":"d507ce35-af59-4f92-8dec-6c37d7b26647"}]}}
diff --git a/app/pkg/learn/learner.go b/app/pkg/learn/learner.go
@@ -46,8 +46,6 @@ func (l *Learner) Reconcile(ctx context.Context) error {
 	// TODO(jeremy): Can we call Analyze to compute the latest logs?
 	log := logs.FromContext(ctx)
 
-	log.Error(errors.New("Not implemented"), "The learning code needs to be updated to filter out examples that are used for evaluation")
-
 	trainDir := l.Config.GetTrainingDir()
 	if _, err := os.Stat(trainDir); err != nil {
 		if os.IsNotExist(err) {
@@ -97,7 +95,14 @@ func (l *Learner) reconcileExamples(ctx context.Context, blocks map[string]api.B
 			// Block wasn't the result of AI generation
 			continue
 		}
-		// TODO(jeremy): Should we use some sort of distance metric? e.g. edit distance?
+
+		if b.EvalMode {
+			log.V(logs.Debug).Info("Skipping block which was created as part of an eval", "id", b.ID)
+			continue
+		}
+
+		// TODO(jeremy): Should we use some sort of distance metric? e.g. edit distance? We could potentially
+		// use the metric used for eval.
 		if strings.TrimSpace(b.ExecutedBlock.GetContents()) == strings.TrimSpace(b.GeneratedBlock.GetContents()) {
 			log.V(logs.Debug).Info("Skipping executed block which matches generated block", "id", b.ID)
 			continue

diff --git a/data/eval/git/pr_description.foyle b/data/eval/git/pr_description.foyle
@@ -0,0 +1,14 @@
+{
+  "blocks": [
+    {
+      "kind": "MARKUP",
+      "language": "markdown",
+      "contents": "Create a PR description"
+    },
+    {
+      "kind": "CODE",
+      "language": "bash",
+      "contents": "git diff origin/main | llm --model=gpt-4-0125-preview -s \"Create a PR description from the following diff\""
+    }
+  ]
+}
diff --git a/data/eval/hydros/image_logs.foyle b/data/eval/hydros/image_logs.foyle
@@ -0,0 +1,14 @@
+{
+  "blocks": [
+    {
+      "kind": "MARKUP",
+      "language": "markdown",
+      "contents": "Get the logs for building the image carabou"
+    },
+    {
+      "kind": "CODE",
+      "language": "bash",
+      "contents": "gcloud logging read 'logName=\"projects/foyle-dev/logs/hydros\" jsonPayload.image=\"carabou\"' --freshness=1d  --project=foyle-dev"
+    }
+  ]
+}