# Tuned: Document chunking parameters and debug output (customer input) (microsoft#178)

### Motivation and Context


Responding to customer feedback on:

1. Document memory UX
2. Debug output

### Description

- When a debugger is attached locally, Application Insights tracing is not written to the debug output.
- Larger document chunks with overlap provide a more coherent view of memories; 1000 tokens just isn't that much (see the chunking sketch below).
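
For context on the chunking change: the import pipeline first splits a document into short, token-bounded "lines" and then recombines them into larger chunks (see the controller diff below). Here is a minimal sketch of that flow, assuming Semantic Kernel's `TextChunker` API; the values mirror the new appsettings defaults, and treating the third argument of `SplitPlainTextParagraphs` as an overlap token count follows this PR's "chunks with overlap" description:

```csharp
// Minimal sketch of chunking with overlap, assuming Semantic Kernel's
// TextChunker (Microsoft.SemanticKernel.Text) as invoked in the diff below.
// Values mirror the new appsettings defaults; reading the third argument of
// SplitPlainTextParagraphs as the overlap token count follows this PR's usage.
using System;
using Microsoft.SemanticKernel.Text;

public static class ChunkingSketch
{
    public static void Main()
    {
        var content = "First sentence. Second sentence. Third sentence.";

        // Pass 1: split into short "lines" of at most ~72 tokens each.
        var lines = TextChunker.SplitPlainTextLines(content, 72);

        // Pass 2: recombine lines into chunks of at most ~512 tokens, with
        // consecutive chunks sharing ~72 tokens of overlap so a memory that
        // straddles a chunk boundary stays coherent.
        var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, 512, 72);

        Console.WriteLine($"{paragraphs.Count} chunk(s) produced.");
    }
}
```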

**Current:**

![image](https://github.com/microsoft/chat-copilot/assets/66376200/d859b756-3d37-4e2c-8de2-8660b2bb43d3)

**Azure AI Playground:**

![image](https://github.com/microsoft/chat-copilot/assets/66376200/48b7b42a-305f-4d5b-9ad8-2822bf475b46)

**Tuned:**

![image](https://github.com/microsoft/chat-copilot/assets/66376200/0cf428ad-b502-4294-aa58-10ae75ee55b3)

### Contribution Checklist


- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [Contribution Guidelines](https://github.com/microsoft/copilot-chat/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/copilot-chat/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄

crickman authored Aug 15, 2023
1 parent 8edfccc commit da54841
Showing 4 changed files with 11 additions and 12 deletions.
**webapi/Controllers/DocumentImportController.cs** (1 addition, 1 deletion)

```diff
@@ -540,7 +540,7 @@ private async Task<ImportResult> ParseDocumentContentToMemoryAsync(
     // Split the document into lines of text and then combine them into paragraphs.
     // Note that this is only one of many strategies to chunk documents. Feel free to experiment with other strategies.
     var lines = TextChunker.SplitPlainTextLines(content, this._options.DocumentLineSplitMaxTokens);
-    var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, this._options.DocumentParagraphSplitMaxLines);
+    var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, this._options.DocumentChunkMaxTokens, this._options.DocumentLineSplitMaxTokens);

     // TODO: Perform the save in parallel.
     for (var i = 0; i < paragraphs.Count; i++)
```
**webapi/Options/DocumentMemoryOptions.cs** (5 additions, 5 deletions)

```diff
@@ -24,20 +24,20 @@ public class DocumentMemoryOptions
     public string ChatDocumentCollectionNamePrefix { get; set; } = "chat-documents-";

     /// <summary>
-    /// Gets or sets the maximum number of tokens to use when splitting a document into lines.
-    /// Default token limits are suggested by OpenAI:
+    /// Gets or sets the maximum number of tokens to use when splitting a document into "lines".
+    /// For more details on tokens and how to count them, see:
     /// https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
     /// </summary>
     [Range(0, int.MaxValue)]
     public int DocumentLineSplitMaxTokens { get; set; } = 30;

     /// <summary>
-    /// Gets or sets the maximum number of lines to use when combining lines into paragraphs.
-    /// Default token limits are suggested by OpenAI:
+    /// Gets or sets the maximum number of tokens to use when splitting documents for embeddings.
+    /// For more details on tokens and how to count them, see:
     /// https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
     /// </summary>
     [Range(0, int.MaxValue)]
-    public int DocumentParagraphSplitMaxLines { get; set; } = 100;
+    public int DocumentChunkMaxTokens { get; set; } = 100;

     /// <summary>
     /// Maximum size in bytes of a document to be allowed for importing.
```
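
These options are populated from the `DocumentMemory` section of appsettings.json, shown further down. The registration code is not part of this diff, so the following is a hypothetical sketch of the standard ASP.NET Core options-binding pattern, not the repo's exact code:

```csharp
// Hypothetical sketch: standard ASP.NET Core options binding for
// DocumentMemoryOptions. The repo's actual registration isn't shown in
// this diff; names and namespaces here are assumed from the repo layout.
using CopilotChat.WebApi.Options;                 // assumed namespace
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;

public static class DocumentMemoryRegistrationSketch
{
    public static IServiceCollection AddDocumentMemoryOptions(
        this IServiceCollection services, IConfiguration configuration)
    {
        services.AddOptions<DocumentMemoryOptions>()
            .Bind(configuration.GetSection("DocumentMemory"))
            .ValidateDataAnnotations(); // enforces [Range(0, int.MaxValue)] above
        return services;
    }
}
```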
**webapi/Program.cs** (2 additions, 3 deletions)

```diff
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft. All rights reserved.

 using System;
+using System.Diagnostics;
 using System.Linq;
 using System.Threading.Tasks;
 using CopilotChat.WebApi.Extensions;
@@ -56,9 +57,7 @@ public static async Task Main(string[] args)
             .AddLogging(logBuilder => logBuilder.AddApplicationInsights())
             .AddSingleton<ITelemetryService, AppInsightsTelemetryService>();

-#if DEBUG
-        TelemetryDebugWriter.IsTracingDisabled = false;
-#endif
+        TelemetryDebugWriter.IsTracingDisabled = Debugger.IsAttached;

         // Add in the rest of the services.
         builder.Services
```
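
The replaced `#if DEBUG` block unconditionally enabled telemetry tracing in debug builds; the new line suppresses it exactly when a debugger is attached. A short annotated restatement of the change follows; the true/false semantics of `IsTracingDisabled` are as I understand them from the Application Insights SDK:

```csharp
// Annotated restatement of the change. TelemetryDebugWriter lives in
// Microsoft.ApplicationInsights.Extensibility.Implementation per the
// Application Insights SDK; verify against the version in use.
using System.Diagnostics;
using Microsoft.ApplicationInsights.Extensibility.Implementation;

public static class TelemetryTracingSketch
{
    public static void Configure()
    {
        // IsTracingDisabled == true  -> telemetry is NOT mirrored to the
        //                               IDE's debug output window.
        // IsTracingDisabled == false -> every telemetry item is echoed
        //                               there, flooding the output.
        //
        // Old: tracing always enabled in DEBUG builds (noisy while debugging).
        // New: tracing disabled precisely when a debugger is attached.
        TelemetryDebugWriter.IsTracingDisabled = Debugger.IsAttached;
    }
}
```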
**webapi/appsettings.json** (3 additions, 3 deletions)

```diff
@@ -153,16 +153,16 @@
   // Document import configuration
   // - Global documents are documents that are shared across all users.
   // - User documents are documents that are specific to a user.
-  // - Default token limits are suggested by OpenAI:
+  // - For more details on tokens and how to count them, see:
   //   https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
   // - Prevent large uploads by setting a file size limit (in bytes) as suggested here:
   //   https://learn.microsoft.com/en-us/aspnet/core/mvc/models/file-uploads?view=aspnetcore-6.0
   //
   "DocumentMemory": {
     "GlobalDocumentCollectionName": "global-documents",
     "ChatDocumentCollectionNamePrefix": "chat-documents-",
-    "DocumentLineSplitMaxTokens": 30,
-    "DocumentParagraphSplitMaxLines": 100,
+    "DocumentLineSplitMaxTokens": 72,
+    "DocumentChunkMaxTokens": 512,
     "FileSizeLimit": 4000000,
     "FileCountLimit": 10
   },
```
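
As a rough sanity check on the new values, using the ~4 characters per English token rule of thumb from the OpenAI article linked in the config comments above: a 512-token chunk spans roughly 2,000 characters, and the 72-token overlap amounts to a few sentences. A back-of-the-envelope sketch:

```csharp
// Back-of-the-envelope estimate only: real counts depend on the tokenizer.
// ~4 characters per English token is the rule of thumb from the OpenAI
// help article linked in the config comments above.
using System;

public static class ChunkSizeEstimate
{
    public static void Main()
    {
        const int charsPerToken = 4;      // rough English average
        const int chunkMaxTokens = 512;   // DocumentChunkMaxTokens
        const int lineMaxTokens = 72;     // DocumentLineSplitMaxTokens,
                                          // also reused as the chunk overlap
        Console.WriteLine($"~{chunkMaxTokens * charsPerToken} chars per chunk"); // ~2048
        Console.WriteLine($"~{lineMaxTokens * charsPerToken} chars of overlap"); // ~288
    }
}
```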
