From da548414c0b593aeeda1ce528782255c4006d9a9 Mon Sep 17 00:00:00 2001
From: Chris <66376200+crickman@users.noreply.github.com>
Date: Tue, 15 Aug 2023 11:10:39 -0700
Subject: [PATCH] Tuned: Document chunking parameters and debug output
 (customer input) (#178)

### Motivation and Context

Responding to customer feedback on:

1. Document memory UX
1. Debug output

### Description

- When a debugger is attached locally, Application Insights tracing is not written to the debug output.
- Larger document chunks with overlap provide a more coherent view of memories. 1000 tokens just isn't that much.

**Current:**
![image](https://github.com/microsoft/chat-copilot/assets/66376200/d859b756-3d37-4e2c-8de2-8660b2bb43d3)

**Azure AI Playground:**
![image](https://github.com/microsoft/chat-copilot/assets/66376200/48b7b42a-305f-4d5b-9ad8-2822bf475b46)

**Tuned:**
![image](https://github.com/microsoft/chat-copilot/assets/66376200/0cf428ad-b502-4294-aa58-10ae75ee55b3)

### Contribution Checklist

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [Contribution Guidelines](https://github.com/microsoft/copilot-chat/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/copilot-chat/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone :smile:
---
 webapi/Controllers/DocumentImportController.cs |  2 +-
 webapi/Options/DocumentMemoryOptions.cs        | 10 +++++-----
 webapi/Program.cs                              |  5 ++---
 webapi/appsettings.json                        |  6 +++---
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/webapi/Controllers/DocumentImportController.cs b/webapi/Controllers/DocumentImportController.cs
index e5a352f0d6d4..9f45d526d8b2 100644
--- a/webapi/Controllers/DocumentImportController.cs
+++ b/webapi/Controllers/DocumentImportController.cs
@@ -540,7 +540,7 @@ private async Task ParseDocumentContentToMemoryAsync(
         // Split the document into lines of text and then combine them into paragraphs.
         // Note that this is only one of many strategies to chunk documents. Feel free to experiment with other strategies.
         var lines = TextChunker.SplitPlainTextLines(content, this._options.DocumentLineSplitMaxTokens);
-        var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, this._options.DocumentParagraphSplitMaxLines);
+        var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, this._options.DocumentChunkMaxTokens, this._options.DocumentLineSplitMaxTokens);
 
         // TODO: Perform the save in parallel.
         for (var i = 0; i < paragraphs.Count; i++)
diff --git a/webapi/Options/DocumentMemoryOptions.cs b/webapi/Options/DocumentMemoryOptions.cs
index ce0c22426c55..85630eec741a 100644
--- a/webapi/Options/DocumentMemoryOptions.cs
+++ b/webapi/Options/DocumentMemoryOptions.cs
@@ -24,20 +24,20 @@ public class DocumentMemoryOptions
     public string ChatDocumentCollectionNamePrefix { get; set; } = "chat-documents-";
 
     /// <summary>
-    /// Gets or sets the maximum number of tokens to use when splitting a document into lines.
-    /// Default token limits are suggested by OpenAI:
+    /// Gets or sets the maximum number of tokens to use when splitting a document into "lines".
+    /// For more details on tokens and how to count them, see:
     /// https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
     /// </summary>
     [Range(0, int.MaxValue)]
     public int DocumentLineSplitMaxTokens { get; set; } = 30;
 
     /// <summary>
-    /// Gets or sets the maximum number of lines to use when combining lines into paragraphs.
-    /// Default token limits are suggested by OpenAI:
+    /// Gets or sets the maximum number of tokens to use when splitting documents for embeddings.
+    /// For more details on tokens and how to count them, see:
     /// https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
     /// </summary>
     [Range(0, int.MaxValue)]
-    public int DocumentParagraphSplitMaxLines { get; set; } = 100;
+    public int DocumentChunkMaxTokens { get; set; } = 100;
 
     /// <summary>
     /// Maximum size in bytes of a document to be allowed for importing.
diff --git a/webapi/Program.cs b/webapi/Program.cs
index ec419ff5d80c..cbb0410d8c93 100644
--- a/webapi/Program.cs
+++ b/webapi/Program.cs
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft. All rights reserved.
 
 using System;
+using System.Diagnostics;
 using System.Linq;
 using System.Threading.Tasks;
 using CopilotChat.WebApi.Extensions;
@@ -56,9 +57,7 @@ public static async Task Main(string[] args)
             .AddLogging(logBuilder => logBuilder.AddApplicationInsights())
             .AddSingleton<ITelemetryService, AppInsightsTelemetryService>();
 
-#if DEBUG
-        TelemetryDebugWriter.IsTracingDisabled = false;
-#endif
+        TelemetryDebugWriter.IsTracingDisabled = Debugger.IsAttached;
 
         // Add in the rest of the services.
         builder.Services
diff --git a/webapi/appsettings.json b/webapi/appsettings.json
index 5e671bf6f774..21deb3093c23 100644
--- a/webapi/appsettings.json
+++ b/webapi/appsettings.json
@@ -153,7 +153,7 @@
   // Document import configuration
   // - Global documents are documents that are shared across all users.
   // - User documents are documents that are specific to a user.
-  // - Default token limits are suggested by OpenAI:
+  // - For more details on tokens and how to count them, see:
   //   https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
   // - Prevent large uploads by setting a file size limit (in bytes) as suggested here:
   //   https://learn.microsoft.com/en-us/aspnet/core/mvc/models/file-uploads?view=aspnetcore-6.0
@@ -161,8 +161,8 @@
   "DocumentMemory": {
     "GlobalDocumentCollectionName": "global-documents",
     "ChatDocumentCollectionNamePrefix": "chat-documents-",
-    "DocumentLineSplitMaxTokens": 30,
-    "DocumentParagraphSplitMaxLines": 100,
+    "DocumentLineSplitMaxTokens": 72,
+    "DocumentChunkMaxTokens": 512,
     "FileSizeLimit": 4000000,
     "FileCountLimit": 10
   },
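Note (reviewer sketch, not part of the patch): the updated call in `ParseDocumentContentToMemoryAsync` passes both a chunk budget and an overlap. Below is a minimal sketch of how the tuned values combine, assuming the Semantic Kernel `TextChunker` helpers already referenced in the diff; the `ChunkingSketch` class name and the hard-coded constants (mirroring the new `appsettings.json` defaults) are illustrative only.

```csharp
using System.Collections.Generic;
using Microsoft.SemanticKernel.Text;

// Illustrative sketch of the chunking performed in ParseDocumentContentToMemoryAsync,
// using the tuned appsettings.json values.
public static class ChunkingSketch
{
    public static List<string> Chunk(string content)
    {
        const int lineSplitMaxTokens = 72;  // DocumentLineSplitMaxTokens
        const int chunkMaxTokens = 512;     // DocumentChunkMaxTokens

        // Split the document into short "lines" first...
        var lines = TextChunker.SplitPlainTextLines(content, lineSplitMaxTokens);

        // ...then recombine them into larger chunks. The line budget doubles as the
        // overlap, so each ~512-token chunk shares roughly 72 tokens with its neighbor.
        return TextChunker.SplitPlainTextParagraphs(lines, chunkMaxTokens, lineSplitMaxTokens);
    }
}
```

The previous call capped chunks at 100 tokens with no overlap, which fragmented memories; the larger overlapping chunks are what the "Tuned" screenshot reflects.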