From 6f19eac45ccb88cc176776ea79411f834a12a575 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Sat, 4 May 2024 17:51:54 -0700 Subject: [PATCH] Implement Evaluation Mode for Traces and Logs (#92) ### PR Title: Implement Evaluation Mode for Traces and Logs ### Overview This pull request introduces the ability to distinguish between logs generated in evaluation mode and those created during normal operation. This differentiation is crucial for ensuring that logs meant for evaluation do not interfere with the learning process and data meant for production use. ### Changes 1. **API Adjustments**: New `EvalMode` methods and fields are introduced in various structures (`LogEntry`, `GenerateTrace`, `ExecuteTrace`, and `BlockLog`) across the API. These changes allow the identification and handling of logs and traces created in evaluation mode. 2. **Analyzer Logic Enhancement**: The analysis process now accounts for the evaluation mode flag when processing and combining log entries. This ensures that logs generated during evaluation carry the evaluation mode flag throughout the analysis pipeline, from individual log entries to compiled traces. 3. **Exclusion from Learning**: Blocks marked as created in evaluation mode are now excluded from the learning process. This is implemented in the learning logic to prevent evaluation data from being used as learning examples, thus maintaining the integrity of the training data. ### Testing - Extended unit tests to cover the changes in log analysis and learning processes, ensuring that evaluation mode logs are processed and handled as expected. - Included tests with scenarios that simulate operational and evaluation conditions to verify the robustness of the new logic. 
--- app/api/types.go | 18 +++ app/pkg/analyze/analyzer.go | 35 +++++- app/pkg/analyze/analyzer_test.go | 114 ++++++++++++++++-- .../execute_traces_lines_eval_mode.jsonl | 2 + ...execute_traces_lines_eval_mode_false.jsonl | 2 + .../generate_trace_lines_eval_mode.jsonl | 5 + app/pkg/learn/learner.go | 11 +- data/eval/git/pr_description.foyle | 14 +++ data/eval/hydros/image_logs.foyle | 14 +++ 9 files changed, 202 insertions(+), 13 deletions(-) create mode 100644 app/pkg/analyze/test_data/execute_traces_lines_eval_mode.jsonl create mode 100644 app/pkg/analyze/test_data/execute_traces_lines_eval_mode_false.jsonl create mode 100644 app/pkg/analyze/test_data/generate_trace_lines_eval_mode.jsonl create mode 100644 data/eval/git/pr_description.foyle create mode 100644 data/eval/hydros/image_logs.foyle diff --git a/app/api/types.go b/app/api/types.go index 270c4f16..f2e87b9e 100644 --- a/app/api/types.go +++ b/app/api/types.go @@ -39,6 +39,19 @@ func (L *LogEntry) Request() []byte { return nil } +// EvalMode returns value, ok. Where ok is true if the field was present and false otherwise. +// If the field was present value is the value of the field. 
+func (L *LogEntry) EvalMode() (bool, bool) { + v, ok := (*L)["evalMode"] + if !ok { + return false, false + } + if val, ok := v.(bool); ok { + return val, true + } + return false, false +} + func (L *LogEntry) Response() []byte { v, ok := (*L)["response"] if !ok { @@ -127,6 +140,7 @@ type GenerateTrace struct { EndTime time.Time `json:"endTime"` Request *v1alpha1.GenerateRequest `json:"request"` Response *v1alpha1.GenerateResponse `json:"response"` + EvalMode bool `json:"evalMode"` } func (g *GenerateTrace) ID() string { @@ -145,6 +159,7 @@ type ExecuteTrace struct { EndTime time.Time `json:"endTime"` Request *v1alpha1.ExecuteRequest `json:"request"` Response *v1alpha1.ExecuteResponse `json:"response"` + EvalMode bool `json:"evalMode"` } func (e *ExecuteTrace) ID() string { @@ -177,4 +192,7 @@ type BlockLog struct { // ExitCode is the exit code of the executed block ExitCode int `json:"exitCode"` + + // EvalMode is true if the block was generated as part of an evaluation and shouldn't be used for learning + EvalMode bool `json:"evalMode"` } diff --git a/app/pkg/analyze/analyzer.go b/app/pkg/analyze/analyzer.go index 0ee3384e..e9d97e0f 100644 --- a/app/pkg/analyze/analyzer.go +++ b/app/pkg/analyze/analyzer.go @@ -296,6 +296,11 @@ func buildBlockLog(ctx context.Context, block *api.BlockLog, traces map[string]a block.Doc = genTrace.Request.GetDoc() } + // If the block was generated as part of evaluation mode then consider it to be in evaluation mode. + if genTrace.EvalMode { + block.EvalMode = true + } + // Find the actual block for _, b := range genTrace.Response.GetBlocks() { if b.GetId() == block.ID { @@ -327,6 +332,10 @@ func buildBlockLog(ctx context.Context, block *api.BlockLog, traces map[string]a } } if lastTrace != nil { + // If the block was executed as part of evaluation mode then consider it to be in evaluation mode. 
+ if lastTrace.EvalMode { + block.EvalMode = true + } block.ExecutedBlock = lastTrace.Request.GetBlock() block.ExitCode = unsetExitCode for _, o := range lastTrace.Response.GetOutputs() { @@ -364,10 +373,21 @@ func combineEntriesForTrace(ctx context.Context, entries []*api.LogEntry) (api.T func combineGenerateTrace(ctx context.Context, entries []*api.LogEntry) (*api.GenerateTrace, error) { trace := &api.GenerateTrace{} + evalMode := false for _, e := range entries { if trace.TraceID == "" { trace.TraceID = e.TraceID() } + if mode, present := e.EvalMode(); present { + // If any of the entries are marked as true then we will consider the trace to be in eval mode. + // We don't want to assume that the evalMode will be set on all log entries in the trace. + // So the logic is to assume its not eval mode by default and then set it to eval mode if we find + // One entry that is marked as eval mode. + if mode { + evalMode = mode + } + } + if trace.Request == nil { raw := e.Request() if raw != nil { @@ -392,16 +412,27 @@ func combineGenerateTrace(ctx context.Context, entries []*api.LogEntry) (*api.Ge } } } - + trace.EvalMode = evalMode return trace, nil } func combineExecuteTrace(ctx context.Context, entries []*api.LogEntry) (*api.ExecuteTrace, error) { trace := &api.ExecuteTrace{} + evalMode := false for _, e := range entries { if trace.TraceID == "" { trace.TraceID = e.TraceID() } + if mode, present := e.EvalMode(); present { + // If any of the entries are marked as true then we will consider the trace to be in eval mode. + // We don't want to assume that the evalMode will be set on all log entries in the trace. + // So the logic is to assume its not eval mode by default and then set it to eval mode if we find + // One entry that is marked as eval mode. 
+ if mode { + evalMode = mode + } + } + if trace.Request == nil { raw := e.Request() if raw != nil { @@ -426,6 +457,6 @@ func combineExecuteTrace(ctx context.Context, entries []*api.LogEntry) (*api.Exe } } } - + trace.EvalMode = evalMode return trace, nil } diff --git a/app/pkg/analyze/analyzer_test.go b/app/pkg/analyze/analyzer_test.go index 4624e247..83706b7e 100644 --- a/app/pkg/analyze/analyzer_test.go +++ b/app/pkg/analyze/analyzer_test.go @@ -112,9 +112,60 @@ func Test_BuildBlockLog(t *testing.T) { }, } + // Create a block in evaluation mode + const bid2 = "g456output1" + genTrace2 := &api.GenerateTrace{ + TraceID: "g456", + StartTime: timeMustParse(time.RFC3339, "2021-01-01T00:00:00Z"), + EndTime: timeMustParse(time.RFC3339, "2021-01-01T00:01:00Z"), + Request: &v1alpha1.GenerateRequest{ + Doc: &v1alpha1.Doc{ + Blocks: []*v1alpha1.Block{ + { + Contents: "echo hello", + }, + }, + }, + }, + Response: &v1alpha1.GenerateResponse{ + Blocks: []*v1alpha1.Block{ + { + Id: bid2, + Contents: "outcell", + }, + }, + }, + EvalMode: true, + } + + execTrace3 := &api.ExecuteTrace{ + TraceID: "e912", + StartTime: timeMustParse(time.RFC3339, "2021-01-03T00:00:00Z"), + EndTime: timeMustParse(time.RFC3339, "2021-01-03T00:01:00Z"), + Request: &v1alpha1.ExecuteRequest{ + Block: &v1alpha1.Block{ + Contents: "echo hello", + Id: bid2, + }, + }, + Response: &v1alpha1.ExecuteResponse{ + Outputs: []*v1alpha1.BlockOutput{ + { + Items: []*v1alpha1.BlockOutputItem{ + { + TextData: "exitCode: 7", + }, + }, + }, + }, + }, + } + traces[genTrace.TraceID] = genTrace + traces[genTrace2.TraceID] = genTrace2 traces[execTrace1.TraceID] = execTrace1 traces[execTrace2.TraceID] = execTrace2 + traces[execTrace3.TraceID] = execTrace3 // We shuffle ExecTraceIds to make sure we properly set block log based on the later trace execTraceIds := shuffle([]string{execTrace1.TraceID, execTrace2.TraceID}) @@ -135,6 +186,27 @@ func Test_BuildBlockLog(t *testing.T) { GeneratedBlock: genTrace.Response.Blocks[0], 
ExecutedBlock: execTrace2.Request.Block, ExitCode: 7, + EvalMode: false, + }, + traces: traces, + }, + { + name: "eval_mode", + block: &api.BlockLog{ + ID: bid2, + GenTraceID: genTrace2.TraceID, + + ExecTraceIDs: []string{execTrace3.TraceID}, + }, + expected: &api.BlockLog{ + ID: bid2, + GenTraceID: genTrace2.TraceID, + ExecTraceIDs: []string{execTrace3.TraceID}, + Doc: genTrace2.Request.Doc, + GeneratedBlock: genTrace2.Response.Blocks[0], + ExecutedBlock: execTrace3.Request.Block, + ExitCode: 7, + EvalMode: true, }, traces: traces, }, @@ -296,14 +368,21 @@ func checkExecuteTracesFiles(t *testing.T, path string) { func Test_CombineGenerateEntries(t *testing.T) { type testCase struct { - name string - linesFile string + name string + linesFile string + expectedEvalMode bool } cases := []testCase{ { - name: "basic", - linesFile: "generate_trace_lines.jsonl", + name: "basic", + linesFile: "generate_trace_lines.jsonl", + expectedEvalMode: false, + }, + { + name: "evalMode", + linesFile: "generate_trace_lines_eval_mode.jsonl", + expectedEvalMode: true, }, } @@ -345,20 +424,36 @@ func Test_CombineGenerateEntries(t *testing.T) { if trace.Response == nil { t.Errorf("Expected trace to have a response") } + + if trace.EvalMode != c.expectedEvalMode { + t.Errorf("Expected EvalMode to be %v but got %v", c.expectedEvalMode, trace.EvalMode) + } }) } } func Test_CombineExecuteEntries(t *testing.T) { type testCase struct { - name string - linesFile string + name string + linesFile string + expectedEvalMode bool } cases := []testCase{ { - name: "basic", - linesFile: "execute_traces_lines.jsonl", + name: "basic", + linesFile: "execute_traces_lines.jsonl", + expectedEvalMode: false, + }, + { + name: "eval_mode_true", + linesFile: "execute_traces_lines_eval_mode.jsonl", + expectedEvalMode: true, + }, + { + name: "eval_mode_false", + linesFile: "execute_traces_lines_eval_mode_false.jsonl", + expectedEvalMode: false, }, } @@ -400,6 +495,9 @@ func Test_CombineExecuteEntries(t *testing.T) 
{ if trace.Response == nil { t.Errorf("Expected trace to have a response") } + if trace.EvalMode != c.expectedEvalMode { + t.Errorf("Expected EvalMode to be %v but got %v", c.expectedEvalMode, trace.EvalMode) + } }) } } diff --git a/app/pkg/analyze/test_data/execute_traces_lines_eval_mode.jsonl b/app/pkg/analyze/test_data/execute_traces_lines_eval_mode.jsonl new file mode 100644 index 00000000..d6d7d9af --- /dev/null +++ b/app/pkg/analyze/test_data/execute_traces_lines_eval_mode.jsonl @@ -0,0 +1,2 @@ +{"severity":"info","time":1713303870.400788,"caller":"executor/executor.go:43", "evalMode":true,"function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executor.Execute","traceId":"43281a4e73cd76570e9851589207a8bd","blockId":"","request":{"block":{"kind":"CODE","language":"","contents":"gcloud logging read --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":""}}} +{"severity":"info","time":1713303872.587626,"caller":"executor/executor.go:61","function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executed instructions","traceId":"43281a4e73cd76570e9851589207a8bd","instructionsError":"json: unsupported type: chan string","response":{"outputs":[{"items":[{"mime":"text/plain","text_data":"exitCode: 0"}]}]}} diff --git a/app/pkg/analyze/test_data/execute_traces_lines_eval_mode_false.jsonl b/app/pkg/analyze/test_data/execute_traces_lines_eval_mode_false.jsonl new file mode 100644 index 00000000..9a0eface --- /dev/null +++ b/app/pkg/analyze/test_data/execute_traces_lines_eval_mode_false.jsonl @@ -0,0 +1,2 @@ +{"severity":"info","time":1713303870.400788,"caller":"executor/executor.go:43", "evalMode":false,"function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executor.Execute","traceId":"43281a4e73cd76570e9851589207a8bd","blockId":"","request":{"block":{"kind":"CODE","language":"","contents":"gcloud logging read --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":""}}} 
+{"severity":"info","time":1713303872.587626,"caller":"executor/executor.go:61","function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executed instructions","traceId":"43281a4e73cd76570e9851589207a8bd","instructionsError":"json: unsupported type: chan string","response":{"outputs":[{"items":[{"mime":"text/plain","text_data":"exitCode: 0"}]}]}} diff --git a/app/pkg/analyze/test_data/generate_trace_lines_eval_mode.jsonl b/app/pkg/analyze/test_data/generate_trace_lines_eval_mode.jsonl new file mode 100644 index 00000000..9b80c37d --- /dev/null +++ b/app/pkg/analyze/test_data/generate_trace_lines_eval_mode.jsonl @@ -0,0 +1,5 @@ +{"severity":"info","time":1713303852.283372,"caller":"agent/agent.go:61", "evalMode": true,"function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","request":{"doc":{"blocks":[{"kind":"MARKUP","language":"markdown","contents":"Use gcloud to read the logs for the cluster dev in project foyle-dev; assume you aready logged in","outputs":[],"trace_ids":["","","",""],"id":""}]}}} +{"severity":"info","time":1713303852.283869,"caller":"agent/agent.go:117","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).completeWithRetries","message":"OpenAI:CreateChatCompletion","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","request":{"model":"gpt-3.5-turbo-0125","messages":[{"role":"system","content":"You are a helpful AI assistant for software developers. You are helping software engineers write markdown documents to deploy\nand operate software. Your job is to help users reason about problems and tasks and come up with the appropriate\ncommands to accomplish them. You should never try to execute commands. You should always tell the user\nto execute the commands themselves. To help the user place the commands inside a code block with the language set to\nbash. Users can then execute the commands inside VSCode notebooks. 
The output will then be appended to the document.\nYou can then use that output to reason about the next steps.\n\nYou are only helping users with tasks related to building, deploying, and operating software. You should interpret\nany questions or commands in that context.\n"},{"role":"user","content":"Please continue writing this markdown document to deal with any tasks or issues listed\nin the document. The document is a markdown document. It will contain a description of the task\nor problem, I need your help with. It will then contain one or more code blocks containing commands\nto be executed to accomplish the task or obtain information needed to figure out the problem.\nIf a command has already been executed the output of the command will be provided in a code block\nwith the language `output`. Use the output to help you figure out the problem or complete the task.\nIf you need me to execute a command please provide the command in a code block and I will execute it\nand then add the output to the document.\n\nHere's the document:\n\nUse gcloud to read the logs for the cluster dev in project foyle-dev; assume you aready logged in\n"}],"max_tokens":2000,"temperature":0.9}} +{"severity":"info","time":1713303855.224336,"caller":"agent/agent.go:132","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).completeWithRetries","message":"OpenAI:CreateChatCompletion response","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","resp":{"id":"chatcmpl-9Ekt2d8y2F8FKBEfPdVj6iFiV4ciL","object":"chat.completion","created":1713303852,"model":"gpt-3.5-turbo-0125","choices":[{"index":0,"message":{"role":"assistant","content":"To read the logs for the cluster `dev` in project `foyle-dev` using `gcloud`, you can use the following command:\n\n```bash\ngcloud container clusters get-credentials dev --project=foyle-dev\n```\n\nThis command will authenticate `kubectl` to the cluster `dev` in the `foyle-dev` project.\n\nAfter this, you can use `kubectl` to read the logs for the cluster. 
Here's the command to read the logs for a specific pod:\n\n```bash\nkubectl logs \u003cpod_name\u003e\n```\n\nReplace `\u003cpod_name\u003e` with the name of the pod for which you want to read the logs. If you are not sure about the pod name, you can list all pods in the cluster using the following command:\n\n```bash\nkubectl get pods\n```\n\nPlease execute the commands above and provide the output if you need further assistance."},"finish_reason":"stop"}],"usage":{"prompt_tokens":336,"completion_tokens":179,"total_tokens":515},"system_fingerprint":"fp_c2295e73ad"}} +{"severity":"info","time":1713303855.225972,"caller":"agent/agent.go:76","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate returning blocks","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","blockIds":["10b11f2d-7c8d-4d58-bedc-7e2dd51a85dc","8594d742-39d7-473b-b4ad-901ff362fdb3","f1662328-e884-418c-a084-95dfb1a3f7fc","4feb6219-d050-4630-8ceb-d08ec149b60d","3893a0b6-8c84-49ca-a38c-fbf6d7adfcde","fd276a6f-f379-4f9c-9779-0ed07819d0f5","d507ce35-af59-4f92-8dec-6c37d7b26647"]} +{"severity":"info","time":1713303855.226055,"caller":"agent/agent.go:83","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate returning response","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","response":{"blocks":[{"kind":"MARKUP","language":"","contents":"To read the logs for the cluster `dev` in project `foyle-dev` using `gcloud`, you can use the following command:","outputs":[],"trace_ids":[],"id":"10b11f2d-7c8d-4d58-bedc-7e2dd51a85dc"},{"kind":"CODE","language":"bash","contents":"gcloud container clusters get-credentials dev --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":"8594d742-39d7-473b-b4ad-901ff362fdb3"},{"kind":"MARKUP","language":"","contents":"\n\nThis command will authenticate `kubectl` to the cluster `dev` in the `foyle-dev` project.\n\nAfter this, you can use `kubectl` to read the logs for the cluster. 
Here's the command to read the logs for a specific pod:","outputs":[],"trace_ids":[],"id":"f1662328-e884-418c-a084-95dfb1a3f7fc"},{"kind":"CODE","language":"bash","contents":"kubectl logs \n","outputs":[],"trace_ids":[],"id":"4feb6219-d050-4630-8ceb-d08ec149b60d"},{"kind":"MARKUP","language":"","contents":"\n\nReplace `` with the name of the pod for which you want to read the logs. If you are not sure about the pod name, you can list all pods in the cluster using the following command:","outputs":[],"trace_ids":[],"id":"3893a0b6-8c84-49ca-a38c-fbf6d7adfcde"},{"kind":"CODE","language":"bash","contents":"kubectl get pods\n","outputs":[],"trace_ids":[],"id":"fd276a6f-f379-4f9c-9779-0ed07819d0f5"},{"kind":"MARKUP","language":"","contents":"\n\nPlease execute the commands above and provide the output if you need further assistance.","outputs":[],"trace_ids":[],"id":"d507ce35-af59-4f92-8dec-6c37d7b26647"}]}} \ No newline at end of file diff --git a/app/pkg/learn/learner.go b/app/pkg/learn/learner.go index 4c85513c..91bb27c7 100644 --- a/app/pkg/learn/learner.go +++ b/app/pkg/learn/learner.go @@ -46,8 +46,6 @@ func (l *Learner) Reconcile(ctx context.Context) error { // TODO(jeremy): Can we call Analyze to compute the latest logs? log := logs.FromContext(ctx) - log.Error(errors.New("Not implemented"), "The learning code needs to be updated to filter out examples that are used for evaluation") - trainDir := l.Config.GetTrainingDir() if _, err := os.Stat(trainDir); err != nil { if os.IsNotExist(err) { @@ -97,7 +95,14 @@ func (l *Learner) reconcileExamples(ctx context.Context, blocks map[string]api.B // Block wasn't the result of AI generation continue } - // TODO(jeremy): Should we use some sort of distance metric? e.g. edit distance? + + if b.EvalMode { + log.V(logs.Debug).Info("Skipping block which was created as part of an eval", "id", b.ID) + continue + } + + // TODO(jeremy): Should we use some sort of distance metric? e.g. edit distance? 
We could potentially + // Use the metric used for eval. if strings.TrimSpace(b.ExecutedBlock.GetContents()) == strings.TrimSpace(b.GeneratedBlock.GetContents()) { log.V(logs.Debug).Info("Skipping executed block which matches generated block", "id", b.ID) continue diff --git a/data/eval/git/pr_description.foyle b/data/eval/git/pr_description.foyle new file mode 100644 index 00000000..0100c542 --- /dev/null +++ b/data/eval/git/pr_description.foyle @@ -0,0 +1,14 @@ +{ + "blocks": [ + { + "kind": "MARKUP", + "language": "markdown", + "contents": "Create a PR description" + }, + { + "kind": "CODE", + "language": "bash", + "contents": "git diff origin/main | llm --model=gpt-4-0125-preview -s \"Create a PR description from the following diff\"" + } + ] +} \ No newline at end of file diff --git a/data/eval/hydros/image_logs.foyle b/data/eval/hydros/image_logs.foyle new file mode 100644 index 00000000..ff31c231 --- /dev/null +++ b/data/eval/hydros/image_logs.foyle @@ -0,0 +1,14 @@ +{ + "blocks": [ + { + "kind": "MARKUP", + "language": "markdown", + "contents": "Get the logs for building the image carabou" + }, + { + "kind": "CODE", + "language": "bash", + "contents": "gcloud logging read 'logName=\"projects/foyle-dev/logs/hydros\" jsonPayload.image=\"carabou\"' --freshness=1d --project=foyle-dev" + } + ] +} \ No newline at end of file