Prompt should support generating markup cells (#285)

## Suggest Markup Cells * Now that the frontend can render markup as ghost cells we want the agent to start generating them * This will allow the AI to 1. Reason about the outputs of commands - e.g. interpret whether the output supports or refutes a hypothesis 2. Suggest markup cells containing plans * Related to #284 * Don't restrict the response to a single code block. * Now that we can render markup cells as ghost cells * We should allow multi-block responses that can include markup cells ## Change PostProcessing of responses * We no longer limit the response to a single code block. * We allow at most 2 blocks; one markup and one code cell * We do this because from a UX perspective generating multiple cells is confusing * If there are multiple markup blocks in sequence we merge them into one block * This is less confusing for users * I believe the multiple cells are an artifact of how Runme parses markup into blocks. * Drop any cells after the first code block in the response * Remove the hack which only generated completions if the current cell was a markup cell. We should generate completions even if the cell is a code or output cell * This was a cost saving measure. However, switching to gpt4o-mini should have sufficiently reduced costs that we can afford to generate completions on all cells.
jlewi · Oct 23, 2024 · b8ee786 · b8ee786
1 parent 39a5991
commit b8ee786
Show file tree

Hide file tree

Showing 17 changed files with 789 additions and 346 deletions.
diff --git a/app/pkg/agent/agent.go b/app/pkg/agent/agent.go
@@ -135,6 +135,8 @@ func (a *Agent) Generate(ctx context.Context, req *v1alpha1.GenerateRequest) (*v
 	}
 
 	log.Info("Agent.Generate returning response", zap.Object("response", resp))
+
+	assertRequestResponse(ctx, req, resp)
 	return resp, nil
 }
 
@@ -425,7 +427,7 @@ func (a *Agent) StreamGenerate(ctx context.Context, stream *connect.BidiStream[v
 				continue
 			}
 
-			log.Info("Received request", zap.Object("request", req))
+			log.Info("Received request", logs.ZapProto("request", req))
 			// Serialize the doc and make it available for processing
 			func() {
 				mu.Lock()
@@ -604,19 +606,10 @@ func (a *Agent) LogEvents(ctx context.Context, req *connect.Request[v1alpha1.Log
 
 // postProcessBlocks is a helper function to post process the blocks generated by the agent.
 func postProcessBlocks(blocks []*v1alpha1.Block) ([]*v1alpha1.Block, error) {
-	// Only return a single code block and only the code block.
-	// We do this because
-	// 1. Due https://github.com/jlewi/foyle/issues/168 we can't render markdown as ghostcells
-	//   so we only want to return the code block.
-	// 2. We don't want to return multiple code blocks because that can be confusing. We can potentially relax that
-	//    in the future if everything is working
 	// Post process the blocks
 	results := make([]*v1alpha1.Block, 0, len(blocks))
 	for _, block := range blocks {
-		if block.GetKind() != v1alpha1.BlockKind_CODE {
-			continue
-		}
-		// The model sometimes returns just the "</output>" tag but inside a coude block.
+		// The model sometimes returns just the "</output>" tag but inside a code block.
 		// We want to ignore such blocks.
 		if isOutputTag(block.Contents) {
 			continue
@@ -626,8 +619,24 @@ func postProcessBlocks(blocks []*v1alpha1.Block) ([]*v1alpha1.Block, error) {
 		if strings.TrimSpace(block.Contents) == "" {
 			continue
 		}
+
+		if len(results) > 0 && block.Kind == v1alpha1.BlockKind_MARKUP && results[len(results)-1].Kind == v1alpha1.BlockKind_MARKUP {
+			// If the previous block is a markup block we want to merge this with the previous block.
+			lastBlock := results[len(results)-1]
+			// TODO(jeremy): Do we need to add a newline?
+			lastBlock.Contents += "\n" + block.Contents
+			continue
+		}
+
 		results = append(results, block)
-		return results, nil
+
+		// Once we reach a code block, drop every block that follows it (markup or code). This is because showing
+		// multiple code blocks can be confusing.
+		// TODO(jeremy): Should we make this a configurable option so it's easy to experiment?
+		if block.Kind == v1alpha1.BlockKind_CODE {
+			// TODO(jeremy): log a level 1 assertion here?
+			return results, nil
+		}
 	}
 	return results, nil
 }
@@ -660,14 +669,7 @@ func (s *streamState) getContextID() string {
 
 // shouldTrigger returns true if the agent should trigger a completion for the current document.
 func shouldTrigger(doc *v1alpha1.Doc, selectedIndex int32) bool {
-	// We should trigger if the last cell is a code cell
-	if len(doc.Blocks) == 0 {
-		return false
-	}
-	// N.B. This is a bit of a hack to reduce costs because we are using so many tokens.
-	// For now only trigger completion if the selected cell is a markup cell.
-	selectedCell := doc.Blocks[selectedIndex]
-	return selectedCell.GetKind() == v1alpha1.BlockKind_MARKUP
+	return len(doc.Blocks) != 0
 }
 
 // dropResponse returns true if the response should be dropped rather than being sent to the client.
@@ -690,3 +692,29 @@ func preprocessDoc(req *v1alpha1.GenerateRequest) []*v1alpha1.Block {
 	cells := req.Doc.Blocks[:req.SelectedIndex+1]
 	return cells
 }
+
+// assertRequestResponse runs some assertions that depend on the generateRequest and the response.
+func assertRequestResponse(ctx context.Context, req *v1alpha1.GenerateRequest, resp *v1alpha1.GenerateResponse) {
+	log := logs.FromContext(ctx)
+	assertMarkupAfterCode := &v1alpha1.Assertion{
+		Name:   v1alpha1.Assertion_MARKUP_AFTER_CODE,
+		Result: v1alpha1.AssertResult_SKIPPED,
+		Id:     ulid.GenerateID(),
+	}
+
+	selected := req.Doc.Blocks[req.SelectedIndex]
+	// Assertion only applies if the selected index is a code cell
+	if selected.Kind == v1alpha1.BlockKind_CODE {
+		if len(resp.Blocks) > 0 && resp.Blocks[0].Kind == v1alpha1.BlockKind_MARKUP {
+			assertMarkupAfterCode.Result = v1alpha1.AssertResult_PASSED
+		} else {
+			assertMarkupAfterCode.Result = v1alpha1.AssertResult_FAILED
+		}
+	}
+
+	if len(resp.Blocks) == 0 {
+		assertMarkupAfterCode.Result = v1alpha1.AssertResult_FAILED
+	}
+
+	log.Info(logs.Level1Assertion, "assertion", assertMarkupAfterCode)
+}
diff --git a/app/pkg/agent/agent_test.go b/app/pkg/agent/agent_test.go
@@ -65,6 +65,17 @@ func Test_Generate(t *testing.T) {
 			},
 			maxResults: 0,
 		},
+		{
+			name: "test-gcloud-iam",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Contents: "How do I debug why workload identity isn't working for a deployment in GKE?",
+					},
+				},
+			},
+			maxResults: 0,
+		},
 		{
 			name: "prdiff",
 			doc: &v1alpha1.Doc{
@@ -109,7 +120,7 @@ func Test_Generate(t *testing.T) {
 	}
 
 	cfg.Agent.ModelProvider = api.ModelProviderOpenAI
-	cfg.Agent.Model = openai.GPT3Dot5Turbo0125
+	cfg.Agent.Model = openai.GPT4oMini
 
 	completer, err := oai.NewCompleter(*cfg, client)
 	if err != nil {
@@ -331,7 +342,7 @@ func Test_ShouldTrigger(t *testing.T) {
 				},
 			},
 			selectedIndex: 0,
-			expected:      false,
+			expected:      true,
 		},
 	}
 
@@ -373,6 +384,52 @@ func Test_PostProcessBlocks(t *testing.T) {
 			},
 			expected: []*v1alpha1.Block{},
 		},
+		{
+			name: "merge-markup-blocks",
+			blocks: []*v1alpha1.Block{
+				{
+					Kind:     v1alpha1.BlockKind_MARKUP,
+					Contents: "first block",
+				},
+				{
+					Kind:     v1alpha1.BlockKind_MARKUP,
+					Contents: "second block",
+				},
+			},
+			expected: []*v1alpha1.Block{
+				{
+					Kind:     v1alpha1.BlockKind_MARKUP,
+					Contents: "first block\nsecond block",
+				},
+			},
+		},
+		{
+			name: "stop-at-code-block",
+			blocks: []*v1alpha1.Block{
+				{
+					Kind:     v1alpha1.BlockKind_MARKUP,
+					Contents: "first block",
+				},
+				{
+					Kind:     v1alpha1.BlockKind_CODE,
+					Contents: "echo hello",
+				},
+				{
+					Kind:     v1alpha1.BlockKind_MARKUP,
+					Contents: "last block",
+				},
+			},
+			expected: []*v1alpha1.Block{
+				{
+					Kind:     v1alpha1.BlockKind_MARKUP,
+					Contents: "first block",
+				},
+				{
+					Kind:     v1alpha1.BlockKind_CODE,
+					Contents: "echo hello",
+				},
+			},
+		},
 	}
 
 	for _, c := range cases {
@@ -381,7 +438,7 @@ func Test_PostProcessBlocks(t *testing.T) {
 			if err != nil {
 				t.Fatalf("Error post processing blocks; %v", err)
 			}
-			if d := cmp.Diff(c.expected, actual); d != "" {
+			if d := cmp.Diff(c.expected, actual, cmpopts.IgnoreUnexported(v1alpha1.Block{})); d != "" {
 				t.Errorf("Unexpected diff:\n%s", d)
 			}
 		})

diff --git a/app/pkg/agent/prompt.tmpl b/app/pkg/agent/prompt.tmpl
@@ -1,12 +1,89 @@
-Continue writing the markdown document by adding a code block with the commands a user should execute.
+Continue writing the markdown document by adding markdown and code blocks with the commands a user should execute.
+
 Follow these rules
 
+* If the user is asking a question such as "How do I debug workload identity?" or "Why isn't my pod running?"
+  consider outputting a succinct explanation for how to debug the issue or answer any question
+* For any command that needs to be executed by the user, put it inside a code block
 * Set the language inside the code block to bash
 * Use the text at the end of the document to determine what commands to execute next
 * Use the existing text and code blocks in the document to learn phrases that are predictive of specific commands
-* Only respond with a single code block
 * You can put multiple commands into a code block
 * If the text at the end of the document doesn't clearly describe a command to execute simply respond with the </output> tag
+* If a user executed a command, the output of that command will be included in a code block with the language set to output
+* Use the output of previous commands to determine what to do next
+
+Here's an example:
+
+<example>
+<input>
+# Count users
+* Run a SQL query to count the number of users?
+</input>
+<output>
+1. Fetch the schema for the database
+
+```bash
+sqlite3 /path/to/your/database/db.sqlite ".schema"
+```
+
+1. Run the following sql query to count the number of users
+
+```bash
+sqlite3 /path/to/your/database/db.sqlite "SELECT COUNT(DISTINCT customerId) FROM table_name;"
+```
+</output>
+<reasoning>
+The response intermixes markup and code cells providing the steps to count the number of users in a database.
+</reasoning>
+</example>
+
+* You should look at the document to decide if the user is already in the midst of executing a sequence of steps
+* If the user is in the middle of executing a sequence of steps, you should continue the sequence of steps
+* You should continue the sequence by using the output of the previous command(s) to determine what to do next
+
+* If the document ends with a code block containing the output of a command, look at the markup preceding
+  the code block containing the commands to try to figure out what question/problem the command was trying to solve.
+  * In this case you should respond with markup that gives either an answer to that question based on the
+    output of the commands or a suggestion about what to do next.
+
+Here's an example:
+<example>
+<input>
+1. Check the Kubernetes Service Account Configuration
+   Ensure that the Kubernetes service account is annotated with the correct Google Cloud service account.
+
+```bash
+kubectl get serviceaccount default -n default -o yaml
+```
+
+```output
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  annotations:
+    iam.gke.io/gcp-service-account: developer@foyle-dev.iam.gserviceaccount.com
+  creationTimestamp: "2024-05-30T02:11:21Z"
+  name: default
+  namespace: default
+  resourceVersion: "155079105"
+  uid: 8c8fe74f-b23d-477c-b8b7-7a8937733fa3
+```
+</input>
+<output>
+The annotation `iam.gke.io/gcp-service-account` is correctly set with the Google Cloud service account.
+Since the annotation is correctly set, the next thing to check is the IAM permissions for the
+Google Cloud service account developer@foyle-dev.iam.gserviceaccount.com.
+</output>
+<reasoning>
+* The input ends with the output of the command `kubectl get serviceaccount default -n default -o yaml`
+* The markup preceding the command indicates that we are running this command to check if it's annotated with
+  the correct service account
+* So in this case you respond by analyzing the output to answer the question about the annotations
+* Based on that analysis you suggest the next step to debug the issue
+</reasoning>
+</example>
+
 
 {{if .Examples}}
 Here are a bunch of examples of input documents along with the expected output.

diff --git a/app/pkg/agent/test_data/examples.txt b/app/pkg/agent/test_data/examples.txt
@@ -1,12 +1,89 @@
-Continue writing the markdown document by adding a code block with the commands a user should execute.
+Continue writing the markdown document by adding markdown and code blocks with the commands a user should execute.
+
 Follow these rules
 
+* If the user is asking a question such as "How do I debug workload identity?" or "Why isn't my pod running?"
+  consider outputting a succinct explanation for how to debug the issue or answer any question
+* For any command that needs to be executed by the user, put it inside a code block
 * Set the language inside the code block to bash
 * Use the text at the end of the document to determine what commands to execute next
 * Use the existing text and code blocks in the document to learn phrases that are predictive of specific commands
-* Only respond with a single code block
 * You can put multiple commands into a code block
 * If the text at the end of the document doesn't clearly describe a command to execute simply respond with the </output> tag
+* If a user executed a command, the output of that command will be included in a code block with the language set to output
+* Use the output of previous commands to determine what to do next
+
+Here's an example:
+
+<example>
+<input>
+# Count users
+* Run a SQL query to count the number of users?
+</input>
+<output>
+1. Fetch the schema for the database
+
+```bash
+sqlite3 /path/to/your/database/db.sqlite ".schema"
+```
+
+1. Run the following sql query to count the number of users
+
+```bash
+sqlite3 /path/to/your/database/db.sqlite "SELECT COUNT(DISTINCT customerId) FROM table_name;"
+```
+</output>
+<reasoning>
+The response intermixes markup and code cells providing the steps to count the number of users in a database.
+</reasoning>
+</example>
+
+* You should look at the document to decide if the user is already in the midst of executing a sequence of steps
+* If the user is in the middle of executing a sequence of steps, you should continue the sequence of steps
+* You should continue the sequence by using the output of the previous command(s) to determine what to do next
+
+* If the document ends with a code block containing the output of a command, look at the markup preceding
+  the code block containing the commands to try to figure out what question/problem the command was trying to solve.
+  * In this case you should respond with markup that gives either an answer to that question based on the
+    output of the commands or a suggestion about what to do next.
+
+Here's an example:
+<example>
+<input>
+1. Check the Kubernetes Service Account Configuration
+   Ensure that the Kubernetes service account is annotated with the correct Google Cloud service account.
+
+```bash
+kubectl get serviceaccount default -n default -o yaml
+```
+
+```output
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  annotations:
+    iam.gke.io/gcp-service-account: developer@foyle-dev.iam.gserviceaccount.com
+  creationTimestamp: "2024-05-30T02:11:21Z"
+  name: default
+  namespace: default
+  resourceVersion: "155079105"
+  uid: 8c8fe74f-b23d-477c-b8b7-7a8937733fa3
+```
+</input>
+<output>
+The annotation `iam.gke.io/gcp-service-account` is correctly set with the Google Cloud service account.
+Since the annotation is correctly set, the next thing to check is the IAM permissions for the
+Google Cloud service account developer@foyle-dev.iam.gserviceaccount.com.
+</output>
+<reasoning>
+* The input ends with the output of the command `kubectl get serviceaccount default -n default -o yaml`
+* The markup preceding the command indicates that we are running this command to check if it's annotated with
+  the correct service account
+* So in this case you respond by analyzing the output to answer the question about the annotations
+* Based on that analysis you suggest the next step to debug the issue
+</reasoning>
+</example>
+
 
 
 Here are a bunch of examples of input documents along with the expected output.