From da548414c0b593aeeda1ce528782255c4006d9a9 Mon Sep 17 00:00:00 2001
From: Chris <66376200+crickman@users.noreply.github.com>
Date: Tue, 15 Aug 2023 11:10:39 -0700
Subject: [PATCH] Tuned: Document chunking parameters and debug output
 (customer input) (#178)

### Motivation and Context

Responding to customer feedback on:

1. Document memory UX
1. Debug output

### Description

- When a debugger is attached locally, Application Insights tracing is not written to the debug output.
- Larger document chunks with overlap provide a more coherent view of memories. 1000 tokens just isn't that much.

**Current:**
![image](https://github.com/microsoft/chat-copilot/assets/66376200/d859b756-3d37-4e2c-8de2-8660b2bb43d3)

**Azure AI Playground:**
![image](https://github.com/microsoft/chat-copilot/assets/66376200/48b7b42a-305f-4d5b-9ad8-2822bf475b46)

**Tuned:**
![image](https://github.com/microsoft/chat-copilot/assets/66376200/0cf428ad-b502-4294-aa58-10ae75ee55b3)

### Contribution Checklist

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [Contribution Guidelines](https://github.com/microsoft/copilot-chat/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/copilot-chat/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone :smile:
---
 webapi/Controllers/DocumentImportController.cs |  2 +-
 webapi/Options/DocumentMemoryOptions.cs        | 10 +++++-----
 webapi/Program.cs                              |  5 ++---
 webapi/appsettings.json                        |  6 +++---
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/webapi/Controllers/DocumentImportController.cs b/webapi/Controllers/DocumentImportController.cs
index e5a352f0d6d4..9f45d526d8b2 100644
--- a/webapi/Controllers/DocumentImportController.cs
+++ b/webapi/Controllers/DocumentImportController.cs
@@ -540,7 +540,7 @@ private async Task ParseDocumentContentToMemoryAsync(
         // Split the document into lines of text and then combine them into paragraphs.
         // Note that this is only one of many strategies to chunk documents. Feel free to experiment with other strategies.
         var lines = TextChunker.SplitPlainTextLines(content, this._options.DocumentLineSplitMaxTokens);
-        var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, this._options.DocumentParagraphSplitMaxLines);
+        var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, this._options.DocumentChunkMaxTokens, this._options.DocumentLineSplitMaxTokens);
 
         // TODO: Perform the save in parallel.
         for (var i = 0; i < paragraphs.Count; i++)
diff --git a/webapi/Options/DocumentMemoryOptions.cs b/webapi/Options/DocumentMemoryOptions.cs
index ce0c22426c55..85630eec741a 100644
--- a/webapi/Options/DocumentMemoryOptions.cs
+++ b/webapi/Options/DocumentMemoryOptions.cs
@@ -24,20 +24,20 @@ public class DocumentMemoryOptions
     public string ChatDocumentCollectionNamePrefix { get; set; } = "chat-documents-";
 
     /// <summary>
-    /// Gets or sets the maximum number of tokens to use when splitting a document into lines.
-    /// Default token limits are suggested by OpenAI:
+    /// Gets or sets the maximum number of tokens to use when splitting a document into "lines".
+    /// For more details on tokens and how to count them, see:
     /// https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
     /// </summary>
     [Range(0, int.MaxValue)]
     public int DocumentLineSplitMaxTokens { get; set; } = 30;
 
     /// <summary>
-    /// Gets or sets the maximum number of lines to use when combining lines into paragraphs.
-    /// Default token limits are suggested by OpenAI:
+    /// Gets or sets the maximum number of tokens to use when splitting documents for embeddings.
+    /// For more details on tokens and how to count them, see:
     /// https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
     /// </summary>
     [Range(0, int.MaxValue)]
-    public int DocumentParagraphSplitMaxLines { get; set; } = 100;
+    public int DocumentChunkMaxTokens { get; set; } = 100;
 
     /// <summary>
     /// Maximum size in bytes of a document to be allowed for importing.
diff --git a/webapi/Program.cs b/webapi/Program.cs
index ec419ff5d80c..cbb0410d8c93 100644
--- a/webapi/Program.cs
+++ b/webapi/Program.cs
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft. All rights reserved.
 
 using System;
+using System.Diagnostics;
 using System.Linq;
 using System.Threading.Tasks;
 using CopilotChat.WebApi.Extensions;
@@ -56,9 +57,7 @@ public static async Task Main(string[] args)
             .AddLogging(logBuilder => logBuilder.AddApplicationInsights())
             .AddSingleton<ITelemetryService, AppInsightsTelemetryService>();
 
-#if DEBUG
-        TelemetryDebugWriter.IsTracingDisabled = false;
-#endif
+        TelemetryDebugWriter.IsTracingDisabled = Debugger.IsAttached;
 
         // Add in the rest of the services.
         builder.Services
diff --git a/webapi/appsettings.json b/webapi/appsettings.json
index 5e671bf6f774..21deb3093c23 100644
--- a/webapi/appsettings.json
+++ b/webapi/appsettings.json
@@ -153,7 +153,7 @@
   // Document import configuration
   // - Global documents are documents that are shared across all users.
   // - User documents are documents that are specific to a user.
-  // - Default token limits are suggested by OpenAI:
+  // - For more details on tokens and how to count them, see:
   //   https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
   // - Prevent large uploads by setting a file size limit (in bytes) as suggested here:
   //   https://learn.microsoft.com/en-us/aspnet/core/mvc/models/file-uploads?view=aspnetcore-6.0
@@ -161,8 +161,8 @@
   "DocumentMemory": {
     "GlobalDocumentCollectionName": "global-documents",
     "ChatDocumentCollectionNamePrefix": "chat-documents-",
-    "DocumentLineSplitMaxTokens": 30,
-    "DocumentParagraphSplitMaxLines": 100,
+    "DocumentLineSplitMaxTokens": 72,
+    "DocumentChunkMaxTokens": 512,
     "FileSizeLimit": 4000000,
     "FileCountLimit": 10
   },
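Note (reviewer sketch, not part of the patch): the updated call in `ParseDocumentContentToMemoryAsync` passes both a chunk budget and an overlap. Below is a minimal sketch of how the tuned values combine, assuming the Semantic Kernel `TextChunker` helpers already referenced in the diff; the `ChunkingSketch` class name and the hard-coded constants (mirroring the new `appsettings.json` defaults) are illustrative only.

```csharp
using System.Collections.Generic;
using Microsoft.SemanticKernel.Text;

// Illustrative sketch of the chunking performed in ParseDocumentContentToMemoryAsync,
// using the tuned appsettings.json values.
public static class ChunkingSketch
{
    public static List<string> Chunk(string content)
    {
        const int lineSplitMaxTokens = 72;  // DocumentLineSplitMaxTokens
        const int chunkMaxTokens = 512;     // DocumentChunkMaxTokens

        // Split the document into short "lines" first...
        var lines = TextChunker.SplitPlainTextLines(content, lineSplitMaxTokens);

        // ...then recombine them into larger chunks. The line budget doubles as the
        // overlap, so each ~512-token chunk shares roughly 72 tokens with its neighbor.
        return TextChunker.SplitPlainTextParagraphs(lines, chunkMaxTokens, lineSplitMaxTokens);
    }
}
```

The previous call capped chunks at 100 tokens with no overlap, which fragmented memories; the larger overlapping chunks are what the "Tuned" screenshot reflects.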