From 03f802ea7dce59f8fb186927761ca10a3a289dd0 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 15 Feb 2024 18:27:13 +0100 Subject: [PATCH 1/9] feat(tools): support Tools in the API Co-authored-by: =?UTF-8?q?Stephan=20A=C3=9Fmus?= --- api/openai/request.go | 10 ++++++++++ api/schema/openai.go | 3 +++ pkg/grammar/functions.go | 6 ++++++ 3 files changed, 19 insertions(+) diff --git a/api/openai/request.go b/api/openai/request.go index 382a930e1c7..05e0bc90f71 100644 --- a/api/openai/request.go +++ b/api/openai/request.go @@ -136,6 +136,16 @@ func updateRequestConfig(config *config.Config, input *schema.OpenAIRequest) { } } + if len(input.Tools) > 0 { + for _, tool := range input.Tools { + input.Functions = append(input.Functions, tool.Function) + } + } + + if input.ToolsChoice != nil { + input.FunctionCall = input.ToolsChoice + } + // Decode each request's message content index := 0 for i, m := range input.Messages { diff --git a/api/schema/openai.go b/api/schema/openai.go index 6355ff63d5e..e87f829ccea 100644 --- a/api/schema/openai.go +++ b/api/schema/openai.go @@ -117,6 +117,9 @@ type OpenAIRequest struct { Functions []grammar.Function `json:"functions" yaml:"functions"` FunctionCall interface{} `json:"function_call" yaml:"function_call"` // might be a string or an object + Tools []grammar.Tool `json:"tools,omitempty" yaml:"tools"` + ToolsChoice interface{} `json:"tool_choice,omitempty" yaml:"tool_choice"` + Stream bool `json:"stream"` // Image (not supported by OpenAI) diff --git a/pkg/grammar/functions.go b/pkg/grammar/functions.go index ef56662b7b9..1038f5e6f14 100644 --- a/pkg/grammar/functions.go +++ b/pkg/grammar/functions.go @@ -11,6 +11,12 @@ type Function struct { } type Functions []Function +type Tool struct { + Type string `json:"type"` + Function Function `json:"function,omitempty"` +} +type Tools []Tool + func (f Functions) ToJSONStructure() JSONFunctionStructure { js := JSONFunctionStructure{} for _, function := range f { From ccf5faf175a9265613d2a72693418d9619f2e959 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 15 Feb 2024 19:57:35 +0100 Subject: [PATCH 2/9] feat(tools): support function streaming --- api/openai/chat.go | 320 +++++++++++++++++++++++++++++++------------ api/schema/openai.go | 14 ++ 2 files changed, 246 insertions(+), 88 deletions(-) diff --git a/api/openai/chat.go b/api/openai/chat.go index 819cd6b2d6c..94118178697 100644 --- a/api/openai/chat.go +++ b/api/openai/chat.go @@ -55,6 +55,132 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) }) close(responses) } + + /* + data: + { + "id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk", + "object":"chat.completion.chunk", + "created":1708018287, + "model":"gpt-3.5-turbo-0613", + "system_fingerprint":null, + "choices":[ + { + "index":0, + "delta": { + "role":"assistant", + "content":null, + "tool_calls": + [ + { + "index":0, + "id":"call_kL07suiDkGzYbUCLMZZ5XUIU", + "type":"function", + "function": + { + "name":"get_current_weather", + "arguments":"" + } + } + ] + }, + "logprobs":null, + "finish_reason":null + }] + } + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":"{\n"}}]},"logprobs":null,"finish_reason":null}]} + + data: 
{"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":" "}}]},"logprobs":null,"finish_reason":null}]} + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":" \""}}]},"logprobs":null,"finish_reason":null}]} + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":"location"}}]},"logprobs":null,"finish_reason":null}]} + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":"\":"}}]},"logprobs":null,"finish_reason":null}]} + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":" \""}}]},"logprobs":null,"finish_reason":null}]} + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":"Boston"}}]},"logprobs":null,"finish_reason":null}]} + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":","}}]},"logprobs":null,"finish_reason":null}]} + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":" MA"}}]},"logprobs":null,"finish_reason":null}]} + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":"\"\n"}}]},"logprobs":null,"finish_reason":null}]} + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a + rguments":"}"}}]},"logprobs":null,"finish_reason":null}]} + + data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"tool + _calls"}]} + + data: [DONE] + */ + processTools := func(prompt string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) { + ComputeChoices(req, prompt, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { + ss := 
map[string]interface{}{} + + name, args := parseFunctionCall(s) + ss["name"], ss["arguments"] = name, args + + initialMessage := schema.OpenAIResponse{ + ID: id, + Created: created, + Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. + Choices: []schema.Choice{{ + Delta: &schema.Message{ + Role: "assistant", + ToolCalls: []schema.ToolCall{ + { + Index: 0, + ID: id, + Type: "function", + FunctionCall: schema.FunctionCall{ + Name: name, + }, + }, + }, + }}}, + Object: "chat.completion.chunk", + } + responses <- initialMessage + + responses <- schema.OpenAIResponse{ + ID: id, + Created: created, + Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. + Choices: []schema.Choice{{ + Delta: &schema.Message{ + Role: "assistant", + ToolCalls: []schema.ToolCall{ + { + Index: 0, + ID: id, + Type: "function", + FunctionCall: schema.FunctionCall{ + Arguments: args, + }, + }, + }, + }}}, + Object: "chat.completion.chunk", + } + return true + }) + close(responses) + } + return func(c *fiber.Ctx) error { processFunctions := false funcs := grammar.Functions{} @@ -122,7 +248,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) } // functions are not supported in stream mode (yet?) - toStream := input.Stream && !processFunctions + toStream := input.Stream log.Debug().Msgf("Parameters: %+v", config) @@ -254,10 +380,15 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) log.Debug().Msgf("Grammar: %+v", config.Grammar) } - if toStream { + switch { + case toStream: responses := make(chan schema.OpenAIResponse) - go process(predInput, input, config, o.Loader, responses) + if !processFunctions { + go process(predInput, input, config, o.Loader, responses) + } else { + go processTools(predInput, input, config, o.Loader, responses) + } c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) { @@ -278,13 +409,18 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) w.Flush() } + finishReason := "stop" + if processFunctions { + finishReason = "tool_calls" + } + resp := &schema.OpenAIResponse{ ID: id, Created: created, Model: input.Model, // we have to return what the user sent here, due to OpenAI spec. Choices: []schema.Choice{ { - FinishReason: "stop", + FinishReason: finishReason, Index: 0, Delta: &schema.Message{Content: &emptyMessage}, }}, @@ -298,102 +434,110 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) w.Flush() })) return nil - } - result, tokenUsage, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]schema.Choice) { - if processFunctions { - // As we have to change the result before processing, we can't stream the answer (yet?) 
- ss := map[string]interface{}{} - // This prevent newlines to break JSON parsing for clients - s = utils.EscapeNewLines(s) - json.Unmarshal([]byte(s), &ss) - log.Debug().Msgf("Function return: %s %+v", s, ss) - - // The grammar defines the function name as "function", while OpenAI returns "name" - func_name := ss["function"] - // Similarly, while here arguments is a map[string]interface{}, OpenAI actually want a stringified object - args := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix) - d, _ := json.Marshal(args) - - ss["arguments"] = string(d) - ss["name"] = func_name - - // if do nothing, reply with a message - if func_name == noActionName { - log.Debug().Msgf("nothing to do, computing a reply") - - // If there is a message that the LLM already sends as part of the JSON reply, use it - arguments := map[string]interface{}{} - json.Unmarshal([]byte(d), &arguments) - m, exists := arguments["message"] - if exists { - switch message := m.(type) { - case string: - if message != "" { - log.Debug().Msgf("Reply received from LLM: %s", message) - message = backend.Finetune(*config, predInput, message) - log.Debug().Msgf("Reply received from LLM(finetuned): %s", message) - - *c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}}) - return + default: + result, tokenUsage, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]schema.Choice) { + if processFunctions { + ss := map[string]interface{}{} + + name, args := parseFunctionCall(s) + ss["name"], ss["arguments"] = name, args + + // if do nothing, reply with a message + if name == noActionName { + log.Debug().Msgf("nothing to do, computing a reply") + + // If there is a message that the LLM already sends as part of the JSON reply, use it + arguments := map[string]interface{}{} + json.Unmarshal([]byte(args), &arguments) + m, exists := arguments["message"] + if exists { + switch message := m.(type) { + case string: + if message != "" { + log.Debug().Msgf("Reply received from LLM: %s", message) + message = backend.Finetune(*config, predInput, message) + log.Debug().Msgf("Reply received from LLM(finetuned): %s", message) + + *c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}}) + return + } } } - } - log.Debug().Msgf("No action received from LLM, without a message, computing a reply") - // Otherwise ask the LLM to understand the JSON output and the context, and return a message - // Note: This costs (in term of CPU) another computation - config.Grammar = "" - images := []string{} - for _, m := range input.Messages { - images = append(images, m.StringImages...) - } - predFunc, err := backend.ModelInference(input.Context, predInput, images, o.Loader, *config, o, nil) - if err != nil { - log.Error().Msgf("inference error: %s", err.Error()) - return - } + log.Debug().Msgf("No action received from LLM, without a message, computing a reply") + // Otherwise ask the LLM to understand the JSON output and the context, and return a message + // Note: This costs (in term of CPU) another computation + config.Grammar = "" + images := []string{} + for _, m := range input.Messages { + images = append(images, m.StringImages...) 
+ } + predFunc, err := backend.ModelInference(input.Context, predInput, images, o.Loader, *config, o, nil) + if err != nil { + log.Error().Msgf("inference error: %s", err.Error()) + return + } - prediction, err := predFunc() - if err != nil { - log.Error().Msgf("inference error: %s", err.Error()) - return + prediction, err := predFunc() + if err != nil { + log.Error().Msgf("inference error: %s", err.Error()) + return + } + + fineTunedResponse := backend.Finetune(*config, predInput, prediction.Response) + *c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}}) + } else { + // otherwise reply with the function call + *c = append(*c, schema.Choice{ + FinishReason: "function_call", + Message: &schema.Message{Role: "assistant", FunctionCall: ss}, + }) } - fineTunedResponse := backend.Finetune(*config, predInput, prediction.Response) - *c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}}) - } else { - // otherwise reply with the function call - *c = append(*c, schema.Choice{ - FinishReason: "function_call", - Message: &schema.Message{Role: "assistant", FunctionCall: ss}, - }) + return } + *c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}}) + }, nil) + if err != nil { + return err + } - return + resp := &schema.OpenAIResponse{ + ID: id, + Created: created, + Model: input.Model, // we have to return what the user sent here, due to OpenAI spec. + Choices: result, + Object: "chat.completion", + Usage: schema.OpenAIUsage{ + PromptTokens: tokenUsage.Prompt, + CompletionTokens: tokenUsage.Completion, + TotalTokens: tokenUsage.Prompt + tokenUsage.Completion, + }, } - *c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}}) - }, nil) - if err != nil { - return err - } + respData, _ := json.Marshal(resp) + log.Debug().Msgf("Response: %s", respData) - resp := &schema.OpenAIResponse{ - ID: id, - Created: created, - Model: input.Model, // we have to return what the user sent here, due to OpenAI spec. - Choices: result, - Object: "chat.completion", - Usage: schema.OpenAIUsage{ - PromptTokens: tokenUsage.Prompt, - CompletionTokens: tokenUsage.Completion, - TotalTokens: tokenUsage.Prompt + tokenUsage.Completion, - }, + // Return the prediction in the response body + return c.JSON(resp) } - respData, _ := json.Marshal(resp) - log.Debug().Msgf("Response: %s", respData) - // Return the prediction in the response body - return c.JSON(resp) } } + +func parseFunctionCall(llmresult string) (string, string) { + // As we have to change the result before processing, we can't stream the answer token-by-token (yet?) 
+ ss := map[string]interface{}{} + // This prevent newlines to break JSON parsing for clients + s := utils.EscapeNewLines(llmresult) + json.Unmarshal([]byte(s), &ss) + log.Debug().Msgf("Function return: %s %+v", s, ss) + + // The grammar defines the function name as "function", while OpenAI returns "name" + func_name := ss["function"] + // Similarly, while here arguments is a map[string]interface{}, OpenAI actually want a stringified object + args := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix) + d, _ := json.Marshal(args) + + return func_name.(string), string(d) +} diff --git a/api/schema/openai.go b/api/schema/openai.go index e87f829ccea..dcd11764fba 100644 --- a/api/schema/openai.go +++ b/api/schema/openai.go @@ -76,6 +76,20 @@ type Message struct { // A result of a function call FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"` + + ToolCalls []ToolCall `json:"tool_calls,omitempty" yaml:"tool_call,omitempty"` +} + +type ToolCall struct { + Index int `json:"index"` + ID string `json:"id"` + Type string `json:"type"` + FunctionCall FunctionCall `json:"function"` +} + +type FunctionCall struct { + Name string `json:"name,omitempty"` + Arguments string `json:"arguments"` } type OpenAIModel struct { From dddd67da69eceeb2cc0230afc6beb7866f4715a0 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 15 Feb 2024 20:17:28 +0100 Subject: [PATCH 3/9] Adhere to new return types when using tools instead of functions --- api/openai/chat.go | 107 ++++++++++++------------------------------ api/openai/request.go | 7 ++- 2 files changed, 35 insertions(+), 79 deletions(-) diff --git a/api/openai/chat.go b/api/openai/chat.go index 94118178697..3bde838ea6c 100644 --- a/api/openai/chat.go +++ b/api/openai/chat.go @@ -55,78 +55,6 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) }) close(responses) } - - /* - data: - { - "id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk", - "object":"chat.completion.chunk", - "created":1708018287, - "model":"gpt-3.5-turbo-0613", - "system_fingerprint":null, - "choices":[ - { - "index":0, - "delta": { - "role":"assistant", - "content":null, - "tool_calls": - [ - { - "index":0, - "id":"call_kL07suiDkGzYbUCLMZZ5XUIU", - "type":"function", - "function": - { - "name":"get_current_weather", - "arguments":"" - } - } - ] - }, - "logprobs":null, - "finish_reason":null - }] - } - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":"{\n"}}]},"logprobs":null,"finish_reason":null}]} - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":" "}}]},"logprobs":null,"finish_reason":null}]} - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":" \""}}]},"logprobs":null,"finish_reason":null}]} - - data: 
{"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":"location"}}]},"logprobs":null,"finish_reason":null}]} - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":"\":"}}]},"logprobs":null,"finish_reason":null}]} - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":" \""}}]},"logprobs":null,"finish_reason":null}]} - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":"Boston"}}]},"logprobs":null,"finish_reason":null}]} - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":","}}]},"logprobs":null,"finish_reason":null}]} - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":" MA"}}]},"logprobs":null,"finish_reason":null}]} - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":"\"\n"}}]},"logprobs":null,"finish_reason":null}]} - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"a - rguments":"}"}}]},"logprobs":null,"finish_reason":null}]} - - data: {"id":"chatcmpl-8sZrzBdLsWvnO2lX7Vz6glYAz8JMk","object":"chat.completion.chunk","created":1708018287,"model":"gpt-3.5-turbo-0613","system_fingerprint":null,"choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"tool - _calls"}]} - - data: [DONE] - */ processTools := func(prompt string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) { ComputeChoices(req, prompt, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { ss := map[string]interface{}{} @@ -391,7 +319,6 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) } c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) { - usage := &schema.OpenAIUsage{} for ev := range responses { @@ -488,11 +415,35 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) fineTunedResponse := backend.Finetune(*config, predInput, prediction.Response) *c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}}) } else { - // otherwise reply with the function 
call - *c = append(*c, schema.Choice{ - FinishReason: "function_call", - Message: &schema.Message{Role: "assistant", FunctionCall: ss}, - }) + if len(input.Tools) > 0 { + // Result is different in the case we have a tool call + *c = append(*c, schema.Choice{ + FinishReason: "tool_calls", + Message: &schema.Message{ + Role: "assistant", + ToolCalls: []schema.ToolCall{ + { + ID: id, + Type: "function", + FunctionCall: schema.FunctionCall{ + Name: name, + Arguments: args, + }, + }, + }, + FunctionCall: ss, + }, + }) + } else { + // otherwise reply with the function call + *c = append(*c, schema.Choice{ + FinishReason: "function_call", + Message: &schema.Message{ + Role: "assistant", + FunctionCall: ss, + }, + }) + } } return diff --git a/api/openai/request.go b/api/openai/request.go index 05e0bc90f71..6a7a14e8502 100644 --- a/api/openai/request.go +++ b/api/openai/request.go @@ -13,6 +13,7 @@ import ( fiberContext "github.com/go-skynet/LocalAI/api/ctx" options "github.com/go-skynet/LocalAI/api/options" "github.com/go-skynet/LocalAI/api/schema" + "github.com/go-skynet/LocalAI/pkg/grammar" model "github.com/go-skynet/LocalAI/pkg/model" "github.com/gofiber/fiber/v2" "github.com/rs/zerolog/log" @@ -143,7 +144,11 @@ func updateRequestConfig(config *config.Config, input *schema.OpenAIRequest) { } if input.ToolsChoice != nil { - input.FunctionCall = input.ToolsChoice + var toolChoice grammar.Tool + json.Unmarshal([]byte(input.ToolsChoice.(string)), &toolChoice) + input.FunctionCall = map[string]interface{}{ + "name": toolChoice.Function.Name, + } } // Decode each request's message content From 496374ae3e89c5b051eca440a4c82be1b63e5cf4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 15 Feb 2024 20:20:45 +0100 Subject: [PATCH 4/9] Keep backward compatibility with function calling --- api/openai/chat.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/openai/chat.go b/api/openai/chat.go index 3bde838ea6c..3103a3fb18c 100644 --- a/api/openai/chat.go +++ b/api/openai/chat.go @@ -337,7 +337,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) } finishReason := "stop" - if processFunctions { + if processFunctions && len(input.Tools) > 0 { finishReason = "tool_calls" } From 251045173a1cd304526637b0ae1c7fa118005d43 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 16 Feb 2024 20:41:42 +0100 Subject: [PATCH 5/9] Evaluate function names in chat templates --- api/openai/chat.go | 2 ++ api/schema/openai.go | 4 ++++ pkg/model/loader.go | 1 + 3 files changed, 7 insertions(+) diff --git a/api/openai/chat.go b/api/openai/chat.go index 3103a3fb18c..00a3b8519ec 100644 --- a/api/openai/chat.go +++ b/api/openai/chat.go @@ -199,6 +199,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) } r := config.Roles[role] contentExists := i.Content != nil && i.StringContent != "" + // First attempt to populate content via a chat message specific template if config.TemplateConfig.ChatMessage != "" { chatMessageData := model.ChatMessageTemplateData{ @@ -206,6 +207,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) Role: r, RoleName: role, Content: i.StringContent, + FunctionName: i.Name, MessageIndex: messageIndex, } templatedChatMessage, err := o.Loader.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData) diff --git a/api/schema/openai.go b/api/schema/openai.go index dcd11764fba..12a39b4284d 100644 --- a/api/schema/openai.go +++ b/api/schema/openai.go @@ -68,6 
+68,10 @@ type ContentURL struct { type Message struct { // The message role Role string `json:"role,omitempty" yaml:"role"` + + // The message name (used for tools calls) + Name string `json:"name,omitempty" yaml:"name"` + // The message content Content interface{} `json:"content" yaml:"content"` diff --git a/pkg/model/loader.go b/pkg/model/loader.go index 37c2a603a63..bea32fb72a4 100644 --- a/pkg/model/loader.go +++ b/pkg/model/loader.go @@ -33,6 +33,7 @@ type ChatMessageTemplateData struct { SystemPrompt string Role string RoleName string + FunctionName string Content string MessageIndex int } From f2e803c867123d721dce504cfc8b1132d9d29bb3 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 16 Feb 2024 23:18:53 +0100 Subject: [PATCH 6/9] Disable recovery with --debug --- api/api.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/api/api.go b/api/api.go index 7ec95f1b63a..946204d2b06 100644 --- a/api/api.go +++ b/api/api.go @@ -146,7 +146,11 @@ func App(opts ...options.AppOption) (*fiber.App, error) { } // Default middleware config - app.Use(recover.New()) + + if !options.Debug { + app.Use(recover.New()) + } + if options.Metrics != nil { app.Use(metrics.APIMiddleware(options.Metrics)) } From c6d026e522bddc36e8d82718dbf0550605ebedfd Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 16 Feb 2024 23:19:07 +0100 Subject: [PATCH 7/9] Correctly stream out the entire result --- api/openai/chat.go | 89 ++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/api/openai/chat.go b/api/openai/chat.go index 00a3b8519ec..86db7a2d590 100644 --- a/api/openai/chat.go +++ b/api/openai/chat.go @@ -56,56 +56,61 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) close(responses) } processTools := func(prompt string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) { + + result := "" ComputeChoices(req, prompt, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { - ss := map[string]interface{}{} + result += s + // TODO: Change generated BNF grammar to be compliant with the schema so we can + // stream the result token by token here. + return true + }) - name, args := parseFunctionCall(s) - ss["name"], ss["arguments"] = name, args + ss := map[string]interface{}{} + name, args := parseFunctionCall(result) + ss["name"], ss["arguments"] = name, args - initialMessage := schema.OpenAIResponse{ - ID: id, - Created: created, - Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. - Choices: []schema.Choice{{ - Delta: &schema.Message{ - Role: "assistant", - ToolCalls: []schema.ToolCall{ - { - Index: 0, - ID: id, - Type: "function", - FunctionCall: schema.FunctionCall{ - Name: name, - }, + initialMessage := schema.OpenAIResponse{ + ID: id, + Created: created, + Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. 
+ Choices: []schema.Choice{{ + Delta: &schema.Message{ + Role: "assistant", + ToolCalls: []schema.ToolCall{ + { + Index: 0, + ID: id, + Type: "function", + FunctionCall: schema.FunctionCall{ + Name: name, }, }, - }}}, - Object: "chat.completion.chunk", - } - responses <- initialMessage + }, + }}}, + Object: "chat.completion.chunk", + } + responses <- initialMessage - responses <- schema.OpenAIResponse{ - ID: id, - Created: created, - Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. - Choices: []schema.Choice{{ - Delta: &schema.Message{ - Role: "assistant", - ToolCalls: []schema.ToolCall{ - { - Index: 0, - ID: id, - Type: "function", - FunctionCall: schema.FunctionCall{ - Arguments: args, - }, + responses <- schema.OpenAIResponse{ + ID: id, + Created: created, + Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. + Choices: []schema.Choice{{ + Delta: &schema.Message{ + Role: "assistant", + ToolCalls: []schema.ToolCall{ + { + Index: 0, + ID: id, + Type: "function", + FunctionCall: schema.FunctionCall{ + Arguments: args, }, }, - }}}, - Object: "chat.completion.chunk", - } - return true - }) + }, + }}}, + Object: "chat.completion.chunk", + } close(responses) } From dba9094d6a93e0a79e31230a3c08d5c9e202fd1a Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 16 Feb 2024 23:45:58 +0100 Subject: [PATCH 8/9] Detect when llm chooses to reply and to not perform any action in SSE --- api/openai/chat.go | 136 ++++++++++++++++++++++++++++++--------------- 1 file changed, 91 insertions(+), 45 deletions(-) diff --git a/api/openai/chat.go b/api/openai/chat.go index 86db7a2d590..ac914533e8a 100644 --- a/api/openai/chat.go +++ b/api/openai/chat.go @@ -55,10 +55,9 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) }) close(responses) } - processTools := func(prompt string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) { - + processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) { result := "" - ComputeChoices(req, prompt, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { + _, tokenUsage, _ := ComputeChoices(req, prompt, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { result += s // TODO: Change generated BNF grammar to be compliant with the schema so we can // stream the result token by token here. @@ -69,6 +68,40 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) name, args := parseFunctionCall(result) ss["name"], ss["arguments"] = name, args + if name == noAction { + initialMessage := schema.OpenAIResponse{ + ID: id, + Created: created, + Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. + Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}}, + Object: "chat.completion.chunk", + } + responses <- initialMessage + + result, err := handleQuestion(config, req, o, args, prompt) + if err != nil { + log.Error().Msgf("error handling question: %s", err.Error()) + return + } + + resp := schema.OpenAIResponse{ + ID: id, + Created: created, + Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. 
+ Choices: []schema.Choice{{Delta: &schema.Message{Content: &result}, Index: 0}}, + Object: "chat.completion.chunk", + Usage: schema.OpenAIUsage{ + PromptTokens: tokenUsage.Prompt, + CompletionTokens: tokenUsage.Completion, + TotalTokens: tokenUsage.Prompt + tokenUsage.Completion, + }, + } + + responses <- resp + close(responses) + return + } + initialMessage := schema.OpenAIResponse{ ID: id, Created: created, @@ -322,14 +355,17 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) if !processFunctions { go process(predInput, input, config, o.Loader, responses) } else { - go processTools(predInput, input, config, o.Loader, responses) + go processTools(noActionName, predInput, input, config, o.Loader, responses) } c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) { usage := &schema.OpenAIUsage{} - + toolsCalled := false for ev := range responses { usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it + if len(ev.Choices[0].Delta.ToolCalls) > 0 { + toolsCalled = true + } var buf bytes.Buffer enc := json.NewEncoder(&buf) enc.Encode(ev) @@ -344,8 +380,10 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) } finishReason := "stop" - if processFunctions && len(input.Tools) > 0 { + if toolsCalled { finishReason = "tool_calls" + } else if toolsCalled && len(input.Tools) == 0 { + finishReason = "function_call" } resp := &schema.OpenAIResponse{ @@ -379,48 +417,12 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) // if do nothing, reply with a message if name == noActionName { - log.Debug().Msgf("nothing to do, computing a reply") - - // If there is a message that the LLM already sends as part of the JSON reply, use it - arguments := map[string]interface{}{} - json.Unmarshal([]byte(args), &arguments) - m, exists := arguments["message"] - if exists { - switch message := m.(type) { - case string: - if message != "" { - log.Debug().Msgf("Reply received from LLM: %s", message) - message = backend.Finetune(*config, predInput, message) - log.Debug().Msgf("Reply received from LLM(finetuned): %s", message) - - *c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}}) - return - } - } - } - - log.Debug().Msgf("No action received from LLM, without a message, computing a reply") - // Otherwise ask the LLM to understand the JSON output and the context, and return a message - // Note: This costs (in term of CPU) another computation - config.Grammar = "" - images := []string{} - for _, m := range input.Messages { - images = append(images, m.StringImages...) 
- } - predFunc, err := backend.ModelInference(input.Context, predInput, images, o.Loader, *config, o, nil) - if err != nil { - log.Error().Msgf("inference error: %s", err.Error()) - return - } - - prediction, err := predFunc() + result, err := handleQuestion(config, input, o, args, predInput) if err != nil { - log.Error().Msgf("inference error: %s", err.Error()) + log.Error().Msgf("error handling question: %s", err.Error()) return } - - fineTunedResponse := backend.Finetune(*config, predInput, prediction.Response) - *c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}}) + *c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &result}}) } else { if len(input.Tools) > 0 { // Result is different in the case we have a tool call @@ -455,6 +457,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) return } + *c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}}) }, nil) if err != nil { @@ -483,6 +486,49 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) } } +func handleQuestion(config *config.Config, input *schema.OpenAIRequest, o *options.Option, args, prompt string) (string, error) { + log.Debug().Msgf("nothing to do, computing a reply") + + // If there is a message that the LLM already sends as part of the JSON reply, use it + arguments := map[string]interface{}{} + json.Unmarshal([]byte(args), &arguments) + m, exists := arguments["message"] + if exists { + switch message := m.(type) { + case string: + if message != "" { + log.Debug().Msgf("Reply received from LLM: %s", message) + message = backend.Finetune(*config, prompt, message) + log.Debug().Msgf("Reply received from LLM(finetuned): %s", message) + + return message, nil + } + } + } + + log.Debug().Msgf("No action received from LLM, without a message, computing a reply") + // Otherwise ask the LLM to understand the JSON output and the context, and return a message + // Note: This costs (in term of CPU/GPU) another computation + config.Grammar = "" + images := []string{} + for _, m := range input.Messages { + images = append(images, m.StringImages...) + } + + predFunc, err := backend.ModelInference(input.Context, prompt, images, o.Loader, *config, o, nil) + if err != nil { + log.Error().Msgf("inference error: %s", err.Error()) + return "", err + } + + prediction, err := predFunc() + if err != nil { + log.Error().Msgf("inference error: %s", err.Error()) + return "", err + } + return backend.Finetune(*config, prompt, prediction.Response), nil +} + func parseFunctionCall(llmresult string) (string, string) { // As we have to change the result before processing, we can't stream the answer token-by-token (yet?) ss := map[string]interface{}{} From d44c8bac078a2ecf0af0414bd683c32fcb0da452 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 16 Feb 2024 23:49:45 +0100 Subject: [PATCH 9/9] Feedback from code review --- api/openai/chat.go | 1 - 1 file changed, 1 deletion(-) diff --git a/api/openai/chat.go b/api/openai/chat.go index ac914533e8a..68c3a291a1b 100644 --- a/api/openai/chat.go +++ b/api/openai/chat.go @@ -440,7 +440,6 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) }, }, }, - FunctionCall: ss, }, }) } else {
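
Usage sketch: a minimal client exercising the new "tools" / "tool_choice" request fields and the streamed "tool_calls" deltas added by this series. The endpoint path, model name, and weather-function schema below are assumptions for illustration only; the field names mirror the struct tags and response shapes introduced in the patches above.

    // Illustrative client for the tools API added by this series (a sketch, not
    // part of the patches). It assumes a LocalAI instance on localhost:8080
    // exposing the OpenAI-compatible /v1/chat/completions route; the model name
    // and the weather function schema are placeholders.
    package main

    import (
    	"bufio"
    	"bytes"
    	"fmt"
    	"net/http"
    )

    func main() {
    	// "tools" and "tool_choice" are the request fields added in PATCH 1/9; the
    	// nested "function" object reuses the same schema as the pre-existing
    	// "functions" field.
    	body := []byte(`{
    	  "model": "gpt-3.5-turbo",
    	  "stream": true,
    	  "messages": [{"role": "user", "content": "What is the weather like in Boston, MA?"}],
    	  "tools": [{
    	    "type": "function",
    	    "function": {
    	      "name": "get_current_weather",
    	      "description": "Get the current weather for a location",
    	      "parameters": {
    	        "type": "object",
    	        "properties": {"location": {"type": "string"}},
    	        "required": ["location"]
    	      }
    	    }
    	  }]
    	}`)

    	resp, err := http.Post("http://localhost:8080/v1/chat/completions", "application/json", bytes.NewReader(body))
    	if err != nil {
    		panic(err)
    	}
    	defer resp.Body.Close()

    	// With stream=true the handler first emits a chunk whose delta carries the
    	// tool call name, then one carrying the stringified arguments, and finally
    	// a closing chunk whose finish_reason is "tool_calls".
    	scanner := bufio.NewScanner(resp.Body)
    	for scanner.Scan() {
    		fmt.Println(scanner.Text())
    	}
    }

When the model instead selects the no-action function, PATCH 8/9 streams a plain assistant message produced by handleQuestion and the final chunk keeps finish_reason "stop", since no tool_calls delta was emitted.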