From 6f19eac45ccb88cc176776ea79411f834a12a575 Mon Sep 17 00:00:00 2001
From: Jeremy Lewi <jeremy@lewi.us>
Date: Sat, 4 May 2024 17:51:54 -0700
Subject: [PATCH] Implement Evaluation Mode for Traces and Logs (#92)

### PR Title: Implement Evaluation Mode for Traces and Logs

### Overview

This pull request introduces the ability to distinguish between logs
generated in evaluation mode and those created during normal operation.
This differentiation is crucial for ensuring that logs meant for
evaluation do not interfere with the learning process and data meant for
production use.

### Changes

1. **API Adjustments**: New `EvalMode` methods and fields are introduced
in various structures (`LogEntry`, `GenerateTrace`, `ExecuteTrace`, and
`BlockLog`) across the API. These changes allow the identification and
handling of logs and traces created in evaluation mode.

2. **Analyzer Logic Enhancement**: The analysis process now accounts for
the evaluation mode flag when processing and combining log entries. This
ensures that logs generated during evaluation carry the evaluation mode
flag throughout the analysis pipeline, from individual log entries to
compiled traces.

3. **Exclusion from Learning**: Blocks marked as created in evaluation
mode are now excluded from the learning process. This is implemented in
the learning logic to prevent evaluation data from being used as
learning examples, thus maintaining the integrity of the training data.


### Testing

- Extended unit tests to cover the changes in log analysis and learning
processes, ensuring that evaluation mode logs are processed and handled
as expected.
- Included tests with scenarios that simulate operational and evaluation
conditions to verify the robustness of the new logic.
---
 app/api/types.go                              |  18 +++
 app/pkg/analyze/analyzer.go                   |  35 +++++-
 app/pkg/analyze/analyzer_test.go              | 114 ++++++++++++++++--
 .../execute_traces_lines_eval_mode.jsonl      |   2 +
 ...execute_traces_lines_eval_mode_false.jsonl |   2 +
 .../generate_trace_lines_eval_mode.jsonl      |   5 +
 app/pkg/learn/learner.go                      |  11 +-
 data/eval/git/pr_description.foyle            |  14 +++
 data/eval/hydros/image_logs.foyle             |  14 +++
 9 files changed, 202 insertions(+), 13 deletions(-)
 create mode 100644 app/pkg/analyze/test_data/execute_traces_lines_eval_mode.jsonl
 create mode 100644 app/pkg/analyze/test_data/execute_traces_lines_eval_mode_false.jsonl
 create mode 100644 app/pkg/analyze/test_data/generate_trace_lines_eval_mode.jsonl
 create mode 100644 data/eval/git/pr_description.foyle
 create mode 100644 data/eval/hydros/image_logs.foyle

diff --git a/app/api/types.go b/app/api/types.go
index 270c4f16..f2e87b9e 100644
--- a/app/api/types.go
+++ b/app/api/types.go
@@ -39,6 +39,19 @@ func (L *LogEntry) Request() []byte {
 	return nil
 }
 
+// EvalMode returns value, ok, where ok is true only if the "evalMode" field is present and holds a bool.
+// When ok is true, value is the value of the field; otherwise value is false.
+func (L *LogEntry) EvalMode() (bool, bool) {
+	v, ok := (*L)["evalMode"]
+	if !ok {
+		return false, false
+	}
+	if val, ok := v.(bool); ok {
+		return val, true
+	}
+	return false, false
+}
+
 func (L *LogEntry) Response() []byte {
 	v, ok := (*L)["response"]
 	if !ok {
@@ -127,6 +140,7 @@ type GenerateTrace struct {
 	EndTime   time.Time                  `json:"endTime"`
 	Request   *v1alpha1.GenerateRequest  `json:"request"`
 	Response  *v1alpha1.GenerateResponse `json:"response"`
+	EvalMode  bool                       `json:"evalMode"`
 }
 
 func (g *GenerateTrace) ID() string {
@@ -145,6 +159,7 @@ type ExecuteTrace struct {
 	EndTime   time.Time                 `json:"endTime"`
 	Request   *v1alpha1.ExecuteRequest  `json:"request"`
 	Response  *v1alpha1.ExecuteResponse `json:"response"`
+	EvalMode  bool                      `json:"evalMode"`
 }
 
 func (e *ExecuteTrace) ID() string {
@@ -177,4 +192,7 @@ type BlockLog struct {
 
 	// ExitCode is the exit code of the executed block
 	ExitCode int `json:"exitCode"`
+
+	// EvalMode is true if the block was generated as part of an evaluation and shouldn't be used for learning
+	EvalMode bool `json:"evalMode"`
 }
diff --git a/app/pkg/analyze/analyzer.go b/app/pkg/analyze/analyzer.go
index 0ee3384e..e9d97e0f 100644
--- a/app/pkg/analyze/analyzer.go
+++ b/app/pkg/analyze/analyzer.go
@@ -296,6 +296,11 @@ func buildBlockLog(ctx context.Context, block *api.BlockLog, traces map[string]a
 			block.Doc = genTrace.Request.GetDoc()
 		}
 
+		// If the block was generated as part of evaluation mode then consider it to be in evaluation mode.
+		if genTrace.EvalMode {
+			block.EvalMode = true
+		}
+
 		// Find the actual block
 		for _, b := range genTrace.Response.GetBlocks() {
 			if b.GetId() == block.ID {
@@ -327,6 +332,10 @@ func buildBlockLog(ctx context.Context, block *api.BlockLog, traces map[string]a
 		}
 	}
 	if lastTrace != nil {
+		// If the block was executed as part of evaluation mode then consider it to be in evaluation mode.
+		if lastTrace.EvalMode {
+			block.EvalMode = true
+		}
 		block.ExecutedBlock = lastTrace.Request.GetBlock()
 		block.ExitCode = unsetExitCode
 		for _, o := range lastTrace.Response.GetOutputs() {
@@ -364,10 +373,21 @@ func combineEntriesForTrace(ctx context.Context, entries []*api.LogEntry) (api.T
 
 func combineGenerateTrace(ctx context.Context, entries []*api.LogEntry) (*api.GenerateTrace, error) {
 	trace := &api.GenerateTrace{}
+	evalMode := false
 	for _, e := range entries {
 		if trace.TraceID == "" {
 			trace.TraceID = e.TraceID()
 		}
+		if mode, present := e.EvalMode(); present {
+			// If any of the entries are marked as true then we will consider the trace to be in eval mode.
+			// We don't want to assume that the evalMode will be set on all log entries in the trace.
+			// So the logic is to assume it's not eval mode by default and then set it to eval mode if we find
+			// one entry that is marked as eval mode.
+			if mode {
+				evalMode = mode
+			}
+		}
+
 		if trace.Request == nil {
 			raw := e.Request()
 			if raw != nil {
@@ -392,16 +412,27 @@ func combineGenerateTrace(ctx context.Context, entries []*api.LogEntry) (*api.Ge
 			}
 		}
 	}
-
+	trace.EvalMode = evalMode
 	return trace, nil
 }
 
 func combineExecuteTrace(ctx context.Context, entries []*api.LogEntry) (*api.ExecuteTrace, error) {
 	trace := &api.ExecuteTrace{}
+	evalMode := false
 	for _, e := range entries {
 		if trace.TraceID == "" {
 			trace.TraceID = e.TraceID()
 		}
+		if mode, present := e.EvalMode(); present {
+			// If any of the entries are marked as true then we will consider the trace to be in eval mode.
+			// We don't want to assume that the evalMode will be set on all log entries in the trace.
+			// So the logic is to assume it's not eval mode by default and then set it to eval mode if we find
+			// one entry that is marked as eval mode.
+			if mode {
+				evalMode = mode
+			}
+		}
+
 		if trace.Request == nil {
 			raw := e.Request()
 			if raw != nil {
@@ -426,6 +457,6 @@ func combineExecuteTrace(ctx context.Context, entries []*api.LogEntry) (*api.Exe
 			}
 		}
 	}
-
+	trace.EvalMode = evalMode
 	return trace, nil
 }
diff --git a/app/pkg/analyze/analyzer_test.go b/app/pkg/analyze/analyzer_test.go
index 4624e247..83706b7e 100644
--- a/app/pkg/analyze/analyzer_test.go
+++ b/app/pkg/analyze/analyzer_test.go
@@ -112,9 +112,60 @@ func Test_BuildBlockLog(t *testing.T) {
 		},
 	}
 
+	// Create a block in evaluation mode
+	const bid2 = "g456output1"
+	genTrace2 := &api.GenerateTrace{
+		TraceID:   "g456",
+		StartTime: timeMustParse(time.RFC3339, "2021-01-01T00:00:00Z"),
+		EndTime:   timeMustParse(time.RFC3339, "2021-01-01T00:01:00Z"),
+		Request: &v1alpha1.GenerateRequest{
+			Doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Contents: "echo hello",
+					},
+				},
+			},
+		},
+		Response: &v1alpha1.GenerateResponse{
+			Blocks: []*v1alpha1.Block{
+				{
+					Id:       bid2,
+					Contents: "outcell",
+				},
+			},
+		},
+		EvalMode: true,
+	}
+
+	execTrace3 := &api.ExecuteTrace{
+		TraceID:   "e912",
+		StartTime: timeMustParse(time.RFC3339, "2021-01-03T00:00:00Z"),
+		EndTime:   timeMustParse(time.RFC3339, "2021-01-03T00:01:00Z"),
+		Request: &v1alpha1.ExecuteRequest{
+			Block: &v1alpha1.Block{
+				Contents: "echo hello",
+				Id:       bid2,
+			},
+		},
+		Response: &v1alpha1.ExecuteResponse{
+			Outputs: []*v1alpha1.BlockOutput{
+				{
+					Items: []*v1alpha1.BlockOutputItem{
+						{
+							TextData: "exitCode: 7",
+						},
+					},
+				},
+			},
+		},
+	}
+
 	traces[genTrace.TraceID] = genTrace
+	traces[genTrace2.TraceID] = genTrace2
 	traces[execTrace1.TraceID] = execTrace1
 	traces[execTrace2.TraceID] = execTrace2
+	traces[execTrace3.TraceID] = execTrace3
 
 	// We shuffle ExecTraceIds to make sure we properly set block log based on the later trace
 	execTraceIds := shuffle([]string{execTrace1.TraceID, execTrace2.TraceID})
@@ -135,6 +186,27 @@ func Test_BuildBlockLog(t *testing.T) {
 				GeneratedBlock: genTrace.Response.Blocks[0],
 				ExecutedBlock:  execTrace2.Request.Block,
 				ExitCode:       7,
+				EvalMode:       false,
+			},
+			traces: traces,
+		},
+		{
+			name: "eval_mode",
+			block: &api.BlockLog{
+				ID:         bid2,
+				GenTraceID: genTrace2.TraceID,
+
+				ExecTraceIDs: []string{execTrace3.TraceID},
+			},
+			expected: &api.BlockLog{
+				ID:             bid2,
+				GenTraceID:     genTrace2.TraceID,
+				ExecTraceIDs:   []string{execTrace3.TraceID},
+				Doc:            genTrace2.Request.Doc,
+				GeneratedBlock: genTrace2.Response.Blocks[0],
+				ExecutedBlock:  execTrace3.Request.Block,
+				ExitCode:       7,
+				EvalMode:       true,
 			},
 			traces: traces,
 		},
@@ -296,14 +368,21 @@ func checkExecuteTracesFiles(t *testing.T, path string) {
 
 func Test_CombineGenerateEntries(t *testing.T) {
 	type testCase struct {
-		name      string
-		linesFile string
+		name             string
+		linesFile        string
+		expectedEvalMode bool
 	}
 
 	cases := []testCase{
 		{
-			name:      "basic",
-			linesFile: "generate_trace_lines.jsonl",
+			name:             "basic",
+			linesFile:        "generate_trace_lines.jsonl",
+			expectedEvalMode: false,
+		},
+		{
+			name:             "evalMode",
+			linesFile:        "generate_trace_lines_eval_mode.jsonl",
+			expectedEvalMode: true,
 		},
 	}
 
@@ -345,20 +424,36 @@ func Test_CombineGenerateEntries(t *testing.T) {
 			if trace.Response == nil {
 				t.Errorf("Expected trace to have a response")
 			}
+
+			if trace.EvalMode != c.expectedEvalMode {
+				t.Errorf("Expected EvalMode to be %v but got %v", c.expectedEvalMode, trace.EvalMode)
+			}
 		})
 	}
 }
 
 func Test_CombineExecuteEntries(t *testing.T) {
 	type testCase struct {
-		name      string
-		linesFile string
+		name             string
+		linesFile        string
+		expectedEvalMode bool
 	}
 
 	cases := []testCase{
 		{
-			name:      "basic",
-			linesFile: "execute_traces_lines.jsonl",
+			name:             "basic",
+			linesFile:        "execute_traces_lines.jsonl",
+			expectedEvalMode: false,
+		},
+		{
+			name:             "eval_mode_true",
+			linesFile:        "execute_traces_lines_eval_mode.jsonl",
+			expectedEvalMode: true,
+		},
+		{
+			name:             "eval_mode_false",
+			linesFile:        "execute_traces_lines_eval_mode_false.jsonl",
+			expectedEvalMode: false,
 		},
 	}
 
@@ -400,6 +495,9 @@ func Test_CombineExecuteEntries(t *testing.T) {
 			if trace.Response == nil {
 				t.Errorf("Expected trace to have a response")
 			}
+			if trace.EvalMode != c.expectedEvalMode {
+				t.Errorf("Expected EvalMode to be %v but got %v", c.expectedEvalMode, trace.EvalMode)
+			}
 		})
 	}
 }
diff --git a/app/pkg/analyze/test_data/execute_traces_lines_eval_mode.jsonl b/app/pkg/analyze/test_data/execute_traces_lines_eval_mode.jsonl
new file mode 100644
index 00000000..d6d7d9af
--- /dev/null
+++ b/app/pkg/analyze/test_data/execute_traces_lines_eval_mode.jsonl
@@ -0,0 +1,2 @@
+{"severity":"info","time":1713303870.400788,"caller":"executor/executor.go:43", "evalMode":true,"function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executor.Execute","traceId":"43281a4e73cd76570e9851589207a8bd","blockId":"","request":{"block":{"kind":"CODE","language":"","contents":"gcloud logging read --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":""}}}
+{"severity":"info","time":1713303872.587626,"caller":"executor/executor.go:61","function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executed instructions","traceId":"43281a4e73cd76570e9851589207a8bd","instructionsError":"json: unsupported type: chan string","response":{"outputs":[{"items":[{"mime":"text/plain","text_data":"exitCode: 0"}]}]}}
diff --git a/app/pkg/analyze/test_data/execute_traces_lines_eval_mode_false.jsonl b/app/pkg/analyze/test_data/execute_traces_lines_eval_mode_false.jsonl
new file mode 100644
index 00000000..9a0eface
--- /dev/null
+++ b/app/pkg/analyze/test_data/execute_traces_lines_eval_mode_false.jsonl
@@ -0,0 +1,2 @@
+{"severity":"info","time":1713303870.400788,"caller":"executor/executor.go:43", "evalMode":false,"function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executor.Execute","traceId":"43281a4e73cd76570e9851589207a8bd","blockId":"","request":{"block":{"kind":"CODE","language":"","contents":"gcloud logging read --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":""}}}
+{"severity":"info","time":1713303872.587626,"caller":"executor/executor.go:61","function":"github.com/jlewi/foyle/app/pkg/executor.(*Executor).Execute","message":"Executed instructions","traceId":"43281a4e73cd76570e9851589207a8bd","instructionsError":"json: unsupported type: chan string","response":{"outputs":[{"items":[{"mime":"text/plain","text_data":"exitCode: 0"}]}]}}
diff --git a/app/pkg/analyze/test_data/generate_trace_lines_eval_mode.jsonl b/app/pkg/analyze/test_data/generate_trace_lines_eval_mode.jsonl
new file mode 100644
index 00000000..9b80c37d
--- /dev/null
+++ b/app/pkg/analyze/test_data/generate_trace_lines_eval_mode.jsonl
@@ -0,0 +1,5 @@
+{"severity":"info","time":1713303852.283372,"caller":"agent/agent.go:61", "evalMode": true,"function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","request":{"doc":{"blocks":[{"kind":"MARKUP","language":"markdown","contents":"Use gcloud to read the logs for the cluster dev in project foyle-dev; assume you aready logged in","outputs":[],"trace_ids":["","","",""],"id":""}]}}}
+{"severity":"info","time":1713303852.283869,"caller":"agent/agent.go:117","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).completeWithRetries","message":"OpenAI:CreateChatCompletion","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","request":{"model":"gpt-3.5-turbo-0125","messages":[{"role":"system","content":"You are a helpful AI assistant for software developers. You are helping software engineers write markdown documents to deploy\nand operate software. Your job is to help users reason about problems and tasks and come up with the appropriate\ncommands to accomplish them. You should never try to execute commands. You should always tell the user\nto execute the commands themselves. To help the user place the commands inside a code block with the language set to\nbash. Users can then execute the commands inside VSCode notebooks. The output will then be appended to the document.\nYou can then use that output to reason about the next steps.\n\nYou are only helping users with tasks related to building, deploying, and operating software. You should interpret\nany questions or commands in that context.\n"},{"role":"user","content":"Please continue writing this markdown document to deal with any tasks or issues listed\nin the document. The document is a markdown document. It will contain a description of the task\nor problem, I need your help with. It will then contain one or more code blocks containing commands\nto be executed to accomplish the task or obtain information needed to figure out the problem.\nIf a command has already been executed the output of the command will be provided in a code block\nwith the language `output`. 
Use the output to help you figure out the problem or complete the task.\nIf you need me to execute a command please provide the command in a code block and I will execute it\nand then add the output to the document.\n\nHere's the document:\n\nUse gcloud to read the logs for the cluster dev in project foyle-dev; assume you aready logged in\n"}],"max_tokens":2000,"temperature":0.9}}
+{"severity":"info","time":1713303855.224336,"caller":"agent/agent.go:132","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).completeWithRetries","message":"OpenAI:CreateChatCompletion response","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","resp":{"id":"chatcmpl-9Ekt2d8y2F8FKBEfPdVj6iFiV4ciL","object":"chat.completion","created":1713303852,"model":"gpt-3.5-turbo-0125","choices":[{"index":0,"message":{"role":"assistant","content":"To read the logs for the cluster `dev` in project `foyle-dev` using `gcloud`, you can use the following command:\n\n```bash\ngcloud container clusters get-credentials dev --project=foyle-dev\n```\n\nThis command will authenticate `kubectl` to the cluster `dev` in the `foyle-dev` project.\n\nAfter this, you can use `kubectl` to read the logs for the cluster. Here's the command to read the logs for a specific pod:\n\n```bash\nkubectl logs \u003cpod_name\u003e\n```\n\nReplace `\u003cpod_name\u003e` with the name of the pod for which you want to read the logs. If you are not sure about the pod name, you can list all pods in the cluster using the following command:\n\n```bash\nkubectl get pods\n```\n\nPlease execute the commands above and provide the output if you need further assistance."},"finish_reason":"stop"}],"usage":{"prompt_tokens":336,"completion_tokens":179,"total_tokens":515},"system_fingerprint":"fp_c2295e73ad"}}
+{"severity":"info","time":1713303855.225972,"caller":"agent/agent.go:76","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate returning blocks","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","blockIds":["10b11f2d-7c8d-4d58-bedc-7e2dd51a85dc","8594d742-39d7-473b-b4ad-901ff362fdb3","f1662328-e884-418c-a084-95dfb1a3f7fc","4feb6219-d050-4630-8ceb-d08ec149b60d","3893a0b6-8c84-49ca-a38c-fbf6d7adfcde","fd276a6f-f379-4f9c-9779-0ed07819d0f5","d507ce35-af59-4f92-8dec-6c37d7b26647"]}
+{"severity":"info","time":1713303855.226055,"caller":"agent/agent.go:83","function":"github.com/jlewi/foyle/app/pkg/agent.(*Agent).Generate","message":"Agent.Generate returning response","traceId":"2c4efb9372de3bb51d66d3f7e9c3a76a","response":{"blocks":[{"kind":"MARKUP","language":"","contents":"To read the logs for the cluster `dev` in project `foyle-dev` using `gcloud`, you can use the following command:","outputs":[],"trace_ids":[],"id":"10b11f2d-7c8d-4d58-bedc-7e2dd51a85dc"},{"kind":"CODE","language":"bash","contents":"gcloud container clusters get-credentials dev --project=foyle-dev\n","outputs":[],"trace_ids":[],"id":"8594d742-39d7-473b-b4ad-901ff362fdb3"},{"kind":"MARKUP","language":"","contents":"\n\nThis command will authenticate `kubectl` to the cluster `dev` in the `foyle-dev` project.\n\nAfter this, you can use `kubectl` to read the logs for the cluster. Here's the command to read the logs for a specific pod:","outputs":[],"trace_ids":[],"id":"f1662328-e884-418c-a084-95dfb1a3f7fc"},{"kind":"CODE","language":"bash","contents":"kubectl logs <pod_name>\n","outputs":[],"trace_ids":[],"id":"4feb6219-d050-4630-8ceb-d08ec149b60d"},{"kind":"MARKUP","language":"","contents":"\n\nReplace `<pod_name>` with the name of the pod for which you want to read the logs. If you are not sure about the pod name, you can list all pods in the cluster using the following command:","outputs":[],"trace_ids":[],"id":"3893a0b6-8c84-49ca-a38c-fbf6d7adfcde"},{"kind":"CODE","language":"bash","contents":"kubectl get pods\n","outputs":[],"trace_ids":[],"id":"fd276a6f-f379-4f9c-9779-0ed07819d0f5"},{"kind":"MARKUP","language":"","contents":"\n\nPlease execute the commands above and provide the output if you need further assistance.","outputs":[],"trace_ids":[],"id":"d507ce35-af59-4f92-8dec-6c37d7b26647"}]}}
\ No newline at end of file
diff --git a/app/pkg/learn/learner.go b/app/pkg/learn/learner.go
index 4c85513c..91bb27c7 100644
--- a/app/pkg/learn/learner.go
+++ b/app/pkg/learn/learner.go
@@ -46,8 +46,6 @@ func (l *Learner) Reconcile(ctx context.Context) error {
 	// TODO(jeremy): Can we call Analyze to compute the latest logs?
 	log := logs.FromContext(ctx)
 
-	log.Error(errors.New("Not implemented"), "The learning code needs to be updated to filter out examples that are used for evaluation")
-
 	trainDir := l.Config.GetTrainingDir()
 	if _, err := os.Stat(trainDir); err != nil {
 		if os.IsNotExist(err) {
@@ -97,7 +95,14 @@ func (l *Learner) reconcileExamples(ctx context.Context, blocks map[string]api.B
 			// Block wasn't the result of AI generation
 			continue
 		}
-		// TODO(jeremy): Should we use some sort of distance metric? e.g. edit distance?
+
+		if b.EvalMode {
+			log.V(logs.Debug).Info("Skipping block which was created as part of an eval", "id", b.ID)
+			continue
+		}
+
+		// TODO(jeremy): Should we use some sort of distance metric? e.g. edit distance? We could potentially
+		// use the metric used for eval.
 		if strings.TrimSpace(b.ExecutedBlock.GetContents()) == strings.TrimSpace(b.GeneratedBlock.GetContents()) {
 			log.V(logs.Debug).Info("Skipping executed block which matches generated block", "id", b.ID)
 			continue
diff --git a/data/eval/git/pr_description.foyle b/data/eval/git/pr_description.foyle
new file mode 100644
index 00000000..0100c542
--- /dev/null
+++ b/data/eval/git/pr_description.foyle
@@ -0,0 +1,14 @@
+{
+  "blocks": [
+    {
+      "kind": "MARKUP",
+      "language": "markdown",
+      "contents": "Create a PR description"
+    },
+    {
+      "kind": "CODE",
+      "language": "bash",
+      "contents": "git diff origin/main | llm --model=gpt-4-0125-preview -s \"Create a PR description from the following diff\""
+    }
+  ]
+}
\ No newline at end of file
diff --git a/data/eval/hydros/image_logs.foyle b/data/eval/hydros/image_logs.foyle
new file mode 100644
index 00000000..ff31c231
--- /dev/null
+++ b/data/eval/hydros/image_logs.foyle
@@ -0,0 +1,14 @@
+{
+  "blocks": [
+    {
+      "kind": "MARKUP",
+      "language": "markdown",
+      "contents": "Get the logs for building the image carabou"
+    },
+    {
+      "kind": "CODE",
+      "language": "bash",
+      "contents": "gcloud logging read 'logName=\"projects/foyle-dev/logs/hydros\" jsonPayload.image=\"carabou\"' --freshness=1d  --project=foyle-dev"
+    }
+  ]
+}
\ No newline at end of file