Skip to content

Commit

Permalink
Implement Evaluation Mode for Traces and Logs (#92)
Browse files Browse the repository at this point in the history
echo "### PR Title: Implement Evaluation Mode for Traces and Logs

### Overview

This pull request introduces the ability to distinguish between logs
generated in evaluation mode and those created during normal operation.
This distinction ensures that logs produced during evaluation are not
used by the learning process and do not get mixed with data meant for
production use.

### Changes

1. **API Adjustments**: New `EvalMode` methods and fields are introduced
in various structures (`LogEntry`, `GenerateTrace`, `ExecuteTrace`, and
`BlockLog`) across the API. These changes allow the identification and
handling of logs and traces created in evaluation mode.

2. **Analyzer Logic Enhancement**: The analysis process now accounts for
the evaluation mode flag when processing and combining log entries. This
ensures that logs generated during evaluation carry the evaluation mode
flag throughout the analysis pipeline, from individual log entries to
compiled traces.

3. **Exclusion from Learning**: Blocks marked as created in evaluation
mode are now excluded from the learning process. This is implemented in
the learning logic to prevent evaluation data from being used as
learning examples, thus maintaining the integrity of the training data.


### Testing

- Extended unit tests to cover the changes in log analysis and learning
processes, ensuring that evaluation mode logs are processed and handled
as expected.
- Included tests with scenarios that simulate operational and evaluation
conditions to verify the robustness of the new logic.
  • Loading branch information
jlewi authored May 5, 2024
1 parent 1be7110 commit 6f19eac
Show file tree
Hide file tree
Showing 9 changed files with 202 additions and 13 deletions.
18 changes: 18 additions & 0 deletions app/api/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,19 @@ func (L *LogEntry) Request() []byte {
return nil
}

// EvalMode reports the boolean stored under the "evalMode" key of the log entry.
// The second result is true only when the key exists and its value is a bool;
// when the key is missing or holds a value of any other type both results are false.
func (L *LogEntry) EvalMode() (bool, bool) {
	// Indexing a map with a missing key yields a nil interface value, and a
	// failed comma-ok type assertion yields the zero value (false), so a single
	// assertion covers the "absent" and "wrong type" cases alike.
	flag, isBool := (*L)["evalMode"].(bool)
	return flag, isBool
}

func (L *LogEntry) Response() []byte {
v, ok := (*L)["response"]
if !ok {
Expand Down Expand Up @@ -127,6 +140,7 @@ type GenerateTrace struct {
EndTime time.Time `json:"endTime"`
Request *v1alpha1.GenerateRequest `json:"request"`
Response *v1alpha1.GenerateResponse `json:"response"`
EvalMode bool `json:"evalMode"`
}

func (g *GenerateTrace) ID() string {
Expand All @@ -145,6 +159,7 @@ type ExecuteTrace struct {
EndTime time.Time `json:"endTime"`
Request *v1alpha1.ExecuteRequest `json:"request"`
Response *v1alpha1.ExecuteResponse `json:"response"`
EvalMode bool `json:"evalMode"`
}

func (e *ExecuteTrace) ID() string {
Expand Down Expand Up @@ -177,4 +192,7 @@ type BlockLog struct {

// ExitCode is the exit code of the executed block
ExitCode int `json:"exitCode"`

// EvalMode is true if the block was generated as part of an evaluation and shouldn't be used for learning
EvalMode bool `json:"evalMode"`
}
35 changes: 33 additions & 2 deletions app/pkg/analyze/analyzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,11 @@ func buildBlockLog(ctx context.Context, block *api.BlockLog, traces map[string]a
block.Doc = genTrace.Request.GetDoc()
}

// If the block was generated as part of evaluation mode then consider it to be in evaluation mode.
if genTrace.EvalMode {
block.EvalMode = true
}

// Find the actual block
for _, b := range genTrace.Response.GetBlocks() {
if b.GetId() == block.ID {
Expand Down Expand Up @@ -327,6 +332,10 @@ func buildBlockLog(ctx context.Context, block *api.BlockLog, traces map[string]a
}
}
if lastTrace != nil {
// If the block was executed as part of evaluation mode then consider it to be in evaluation mode.
if lastTrace.EvalMode {
block.EvalMode = true
}
block.ExecutedBlock = lastTrace.Request.GetBlock()
block.ExitCode = unsetExitCode
for _, o := range lastTrace.Response.GetOutputs() {
Expand Down Expand Up @@ -364,10 +373,21 @@ func combineEntriesForTrace(ctx context.Context, entries []*api.LogEntry) (api.T

func combineGenerateTrace(ctx context.Context, entries []*api.LogEntry) (*api.GenerateTrace, error) {
trace := &api.GenerateTrace{}
evalMode := false
for _, e := range entries {
if trace.TraceID == "" {
trace.TraceID = e.TraceID()
}
if mode, present := e.EvalMode(); present {
// If any entry is marked as eval mode then we consider the whole trace to be in eval mode.
// We don't want to assume that evalMode will be set on every log entry in the trace,
// so we assume it's not eval mode by default and switch to eval mode as soon as we find
// one entry that is marked as eval mode.
if mode {
evalMode = mode
}
}

if trace.Request == nil {
raw := e.Request()
if raw != nil {
Expand All @@ -392,16 +412,27 @@ func combineGenerateTrace(ctx context.Context, entries []*api.LogEntry) (*api.Ge
}
}
}

trace.EvalMode = evalMode
return trace, nil
}

func combineExecuteTrace(ctx context.Context, entries []*api.LogEntry) (*api.ExecuteTrace, error) {
trace := &api.ExecuteTrace{}
evalMode := false
for _, e := range entries {
if trace.TraceID == "" {
trace.TraceID = e.TraceID()
}
if mode, present := e.EvalMode(); present {
// If any entry is marked as eval mode then we consider the whole trace to be in eval mode.
// We don't want to assume that evalMode will be set on every log entry in the trace,
// so we assume it's not eval mode by default and switch to eval mode as soon as we find
// one entry that is marked as eval mode.
if mode {
evalMode = mode
}
}

if trace.Request == nil {
raw := e.Request()
if raw != nil {
Expand All @@ -426,6 +457,6 @@ func combineExecuteTrace(ctx context.Context, entries []*api.LogEntry) (*api.Exe
}
}
}

trace.EvalMode = evalMode
return trace, nil
}
114 changes: 106 additions & 8 deletions app/pkg/analyze/analyzer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,60 @@ func Test_BuildBlockLog(t *testing.T) {
},
}

// Create a block in evaluation mode
const bid2 = "g456output1"
genTrace2 := &api.GenerateTrace{
TraceID: "g456",
StartTime: timeMustParse(time.RFC3339, "2021-01-01T00:00:00Z"),
EndTime: timeMustParse(time.RFC3339, "2021-01-01T00:01:00Z"),
Request: &v1alpha1.GenerateRequest{
Doc: &v1alpha1.Doc{
Blocks: []*v1alpha1.Block{
{
Contents: "echo hello",
},
},
},
},
Response: &v1alpha1.GenerateResponse{
Blocks: []*v1alpha1.Block{
{
Id: bid2,
Contents: "outcell",
},
},
},
EvalMode: true,
}

execTrace3 := &api.ExecuteTrace{
TraceID: "e912",
StartTime: timeMustParse(time.RFC3339, "2021-01-03T00:00:00Z"),
EndTime: timeMustParse(time.RFC3339, "2021-01-03T00:01:00Z"),
Request: &v1alpha1.ExecuteRequest{
Block: &v1alpha1.Block{
Contents: "echo hello",
Id: bid2,
},
},
Response: &v1alpha1.ExecuteResponse{
Outputs: []*v1alpha1.BlockOutput{
{
Items: []*v1alpha1.BlockOutputItem{
{
TextData: "exitCode: 7",
},
},
},
},
},
}

traces[genTrace.TraceID] = genTrace
traces[genTrace2.TraceID] = genTrace2
traces[execTrace1.TraceID] = execTrace1
traces[execTrace2.TraceID] = execTrace2
traces[execTrace3.TraceID] = execTrace3

// We shuffle ExecTraceIds to make sure we properly set block log based on the later trace
execTraceIds := shuffle([]string{execTrace1.TraceID, execTrace2.TraceID})
Expand All @@ -135,6 +186,27 @@ func Test_BuildBlockLog(t *testing.T) {
GeneratedBlock: genTrace.Response.Blocks[0],
ExecutedBlock: execTrace2.Request.Block,
ExitCode: 7,
EvalMode: false,
},
traces: traces,
},
{
name: "eval_mode",
block: &api.BlockLog{
ID: bid2,
GenTraceID: genTrace2.TraceID,

ExecTraceIDs: []string{execTrace3.TraceID},
},
expected: &api.BlockLog{
ID: bid2,
GenTraceID: genTrace2.TraceID,
ExecTraceIDs: []string{execTrace3.TraceID},
Doc: genTrace2.Request.Doc,
GeneratedBlock: genTrace2.Response.Blocks[0],
ExecutedBlock: execTrace3.Request.Block,
ExitCode: 7,
EvalMode: true,
},
traces: traces,
},
Expand Down Expand Up @@ -296,14 +368,21 @@ func checkExecuteTracesFiles(t *testing.T, path string) {

func Test_CombineGenerateEntries(t *testing.T) {
type testCase struct {
name string
linesFile string
name string
linesFile string
expectedEvalMode bool
}

cases := []testCase{
{
name: "basic",
linesFile: "generate_trace_lines.jsonl",
name: "basic",
linesFile: "generate_trace_lines.jsonl",
expectedEvalMode: false,
},
{
name: "evalMode",
linesFile: "generate_trace_lines_eval_mode.jsonl",
expectedEvalMode: true,
},
}

Expand Down Expand Up @@ -345,20 +424,36 @@ func Test_CombineGenerateEntries(t *testing.T) {
if trace.Response == nil {
t.Errorf("Expected trace to have a response")
}

if trace.EvalMode != c.expectedEvalMode {
t.Errorf("Expected EvalMode to be %v but got %v", c.expectedEvalMode, trace.EvalMode)
}
})
}
}

func Test_CombineExecuteEntries(t *testing.T) {
type testCase struct {
name string
linesFile string
name string
linesFile string
expectedEvalMode bool
}

cases := []testCase{
{
name: "basic",
linesFile: "execute_traces_lines.jsonl",
name: "basic",
linesFile: "execute_traces_lines.jsonl",
expectedEvalMode: false,
},
{
name: "eval_mode_true",
linesFile: "execute_traces_lines_eval_mode.jsonl",
expectedEvalMode: true,
},
{
name: "eval_mode_false",
linesFile: "execute_traces_lines_eval_mode_false.jsonl",
expectedEvalMode: false,
},
}

Expand Down Expand Up @@ -400,6 +495,9 @@ func Test_CombineExecuteEntries(t *testing.T) {
if trace.Response == nil {
t.Errorf("Expected trace to have a response")
}
if trace.EvalMode != c.expectedEvalMode {
t.Errorf("Expected EvalMode to be %v but got %v", c.expectedEvalMode, trace.EvalMode)
}
})
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"severity":"info","time":1713303870.400788,"caller":"executor/executor.go:43", "evalMode":true,"function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executor.Execute","traceId":"43281a4e73cd76570e9851589207a8bd","blockId":"","request":{"block":{"kind":"CODE","language":"","contents":"gcloud logging read --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":""}}}
{"severity":"info","time":1713303872.587626,"caller":"executor/executor.go:61","function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executed instructions","traceId":"43281a4e73cd76570e9851589207a8bd","instructionsError":"json: unsupported type: chan string","response":{"outputs":[{"items":[{"mime":"text/plain","text_data":"exitCode: 0"}]}]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"severity":"info","time":1713303870.400788,"caller":"executor/executor.go:43", "evalMode":false,"function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executor.Execute","traceId":"43281a4e73cd76570e9851589207a8bd","blockId":"","request":{"block":{"kind":"CODE","language":"","contents":"gcloud logging read --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":""}}}
{"severity":"info","time":1713303872.587626,"caller":"executor/executor.go:61","function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executed instructions","traceId":"43281a4e73cd76570e9851589207a8bd","instructionsError":"json: unsupported type: chan string","response":{"outputs":[{"items":[{"mime":"text/plain","text_data":"exitCode: 0"}]}]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"severity":"info","time":1713303852.283372,"caller":"agent/agent.go:61", "evalMode": true,"function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","request":{"doc":{"blocks":[{"kind":"MARKUP","language":"markdown","contents":"Use gcloud to read the logs for the cluster dev in project foyle-dev; assume you aready logged in","outputs":[],"trace_ids":["","","",""],"id":""}]}}}
{"severity":"info","time":1713303852.283869,"caller":"agent/agent.go:117","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).completeWithRetries","message":"OpenAI:CreateChatCompletion","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","request":{"model":"gpt-3.5-turbo-0125","messages":[{"role":"system","content":"You are a helpful AI assistant for software developers. You are helping software engineers write markdown documents to deploy\nand operate software. Your job is to help users reason about problems and tasks and come up with the appropriate\ncommands to accomplish them. You should never try to execute commands. You should always tell the user\nto execute the commands themselves. To help the user place the commands inside a code block with the language set to\nbash. Users can then execute the commands inside VSCode notebooks. The output will then be appended to the document.\nYou can then use that output to reason about the next steps.\n\nYou are only helping users with tasks related to building, deploying, and operating software. You should interpret\nany questions or commands in that context.\n"},{"role":"user","content":"Please continue writing this markdown document to deal with any tasks or issues listed\nin the document. The document is a markdown document. It will contain a description of the task\nor problem, I need your help with. It will then contain one or more code blocks containing commands\nto be executed to accomplish the task or obtain information needed to figure out the problem.\nIf a command has already been executed the output of the command will be provided in a code block\nwith the language `output`. 
Use the output to help you figure out the problem or complete the task.\nIf you need me to execute a command please provide the command in a code block and I will execute it\nand then add the output to the document.\n\nHere's the document:\n\nUse gcloud to read the logs for the cluster dev in project foyle-dev; assume you aready logged in\n"}],"max_tokens":2000,"temperature":0.9}}
{"severity":"info","time":1713303855.224336,"caller":"agent/agent.go:132","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).completeWithRetries","message":"OpenAI:CreateChatCompletion response","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","resp":{"id":"chatcmpl-9Ekt2d8y2F8FKBEfPdVj6iFiV4ciL","object":"chat.completion","created":1713303852,"model":"gpt-3.5-turbo-0125","choices":[{"index":0,"message":{"role":"assistant","content":"To read the logs for the cluster `dev` in project `foyle-dev` using `gcloud`, you can use the following command:\n\n```bash\ngcloud container clusters get-credentials dev --project=foyle-dev\n```\n\nThis command will authenticate `kubectl` to the cluster `dev` in the `foyle-dev` project.\n\nAfter this, you can use `kubectl` to read the logs for the cluster. Here's the command to read the logs for a specific pod:\n\n```bash\nkubectl logs \u003cpod_name\u003e\n```\n\nReplace `\u003cpod_name\u003e` with the name of the pod for which you want to read the logs. If you are not sure about the pod name, you can list all pods in the cluster using the following command:\n\n```bash\nkubectl get pods\n```\n\nPlease execute the commands above and provide the output if you need further assistance."},"finish_reason":"stop"}],"usage":{"prompt_tokens":336,"completion_tokens":179,"total_tokens":515},"system_fingerprint":"fp_c2295e73ad"}}
{"severity":"info","time":1713303855.225972,"caller":"agent/agent.go:76","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate returning blocks","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","blockIds":["10b11f2d-7c8d-4d58-bedc-7e2dd51a85dc","8594d742-39d7-473b-b4ad-901ff362fdb3","f1662328-e884-418c-a084-95dfb1a3f7fc","4feb6219-d050-4630-8ceb-d08ec149b60d","3893a0b6-8c84-49ca-a38c-fbf6d7adfcde","fd276a6f-f379-4f9c-9779-0ed07819d0f5","d507ce35-af59-4f92-8dec-6c37d7b26647"]}
{"severity":"info","time":1713303855.226055,"caller":"agent/agent.go:83","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate returning response","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","response":{"blocks":[{"kind":"MARKUP","language":"","contents":"To read the logs for the cluster `dev` in project `foyle-dev` using `gcloud`, you can use the following command:","outputs":[],"trace_ids":[],"id":"10b11f2d-7c8d-4d58-bedc-7e2dd51a85dc"},{"kind":"CODE","language":"bash","contents":"gcloud container clusters get-credentials dev --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":"8594d742-39d7-473b-b4ad-901ff362fdb3"},{"kind":"MARKUP","language":"","contents":"\n\nThis command will authenticate `kubectl` to the cluster `dev` in the `foyle-dev` project.\n\nAfter this, you can use `kubectl` to read the logs for the cluster. Here's the command to read the logs for a specific pod:","outputs":[],"trace_ids":[],"id":"f1662328-e884-418c-a084-95dfb1a3f7fc"},{"kind":"CODE","language":"bash","contents":"kubectl logs <pod_name>\n","outputs":[],"trace_ids":[],"id":"4feb6219-d050-4630-8ceb-d08ec149b60d"},{"kind":"MARKUP","language":"","contents":"\n\nReplace `<pod_name>` with the name of the pod for which you want to read the logs. If you are not sure about the pod name, you can list all pods in the cluster using the following command:","outputs":[],"trace_ids":[],"id":"3893a0b6-8c84-49ca-a38c-fbf6d7adfcde"},{"kind":"CODE","language":"bash","contents":"kubectl get pods\n","outputs":[],"trace_ids":[],"id":"fd276a6f-f379-4f9c-9779-0ed07819d0f5"},{"kind":"MARKUP","language":"","contents":"\n\nPlease execute the commands above and provide the output if you need further assistance.","outputs":[],"trace_ids":[],"id":"d507ce35-af59-4f92-8dec-6c37d7b26647"}]}}
11 changes: 8 additions & 3 deletions app/pkg/learn/learner.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ func (l *Learner) Reconcile(ctx context.Context) error {
// TODO(jeremy): Can we call Analyze to compute the latest logs?
log := logs.FromContext(ctx)

log.Error(errors.New("Not implemented"), "The learning code needs to be updated to filter out examples that are used for evaluation")

trainDir := l.Config.GetTrainingDir()
if _, err := os.Stat(trainDir); err != nil {
if os.IsNotExist(err) {
Expand Down Expand Up @@ -97,7 +95,14 @@ func (l *Learner) reconcileExamples(ctx context.Context, blocks map[string]api.B
// Block wasn't the result of AI generation
continue
}
// TODO(jeremy): Should we use some sort of distance metric? e.g. edit distance?

if b.EvalMode {
log.V(logs.Debug).Info("Skipping block which was created as part of an eval", "id", b.ID)
continue
}

// TODO(jeremy): Should we use some sort of distance metric, e.g. edit distance? We could potentially
// use the same metric that is used for eval.
if strings.TrimSpace(b.ExecutedBlock.GetContents()) == strings.TrimSpace(b.GeneratedBlock.GetContents()) {
log.V(logs.Debug).Info("Skipping executed block which matches generated block", "id", b.ID)
continue
Expand Down
14 changes: 14 additions & 0 deletions data/eval/git/pr_description.foyle
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"blocks": [
{
"kind": "MARKUP",
"language": "markdown",
"contents": "Create a PR description"
},
{
"kind": "CODE",
"language": "bash",
"contents": "git diff origin/main | llm --model=gpt-4-0125-preview -s \"Create a PR description from the following diff\""
}
]
}
14 changes: 14 additions & 0 deletions data/eval/hydros/image_logs.foyle
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"blocks": [
{
"kind": "MARKUP",
"language": "markdown",
"contents": "Get the logs for building the image carabou"
},
{
"kind": "CODE",
"language": "bash",
"contents": "gcloud logging read 'logName=\"projects/foyle-dev/logs/hydros\" jsonPayload.image=\"carabou\"' --freshness=1d --project=foyle-dev"
}
]
}

0 comments on commit 6f19eac

Please sign in to comment.