Skip to content

Commit

Permalink
.Net: Added support for OpenAI image detail level property (#9561)
Browse files Browse the repository at this point in the history
### Motivation and Context

<!-- Thank you for your contribution to the semantic-kernel repo!
Please help reviewers and future users, providing the following
information:
  1. Why is this change required?
  2. What problem does it solve?
  3. What scenario does it contribute to?
  4. If it fixes an open issue, please link to the issue here.
-->

Resolves: #4759

This PR adds support for the `detail` property in the OpenAI image
API. The property can be configured using the `ImageContent.Metadata`
property.

`ImageContent.Metadata` usage:
```csharp
chatHistory.AddUserMessage(
[
    new TextContent("What’s in this image?"),
    new ImageContent(imageBytes, "image/jpg") { Metadata = new Dictionary<string, object?> { ["ChatImageDetailLevel"] = "high" } }
]);
```

### Contribution Checklist

<!-- Before submitting this PR, please make sure: -->

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
  • Loading branch information
dmytrostruk authored Nov 6, 2024
1 parent c613ae4 commit 19fef34
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,28 @@ public async Task LocalImageAsync()

Console.WriteLine(reply.Content);
}

/// <summary>
/// Sends a text question together with a local image to the chat completion service,
/// requesting the "high" image detail level through the image's metadata.
/// </summary>
[Fact]
public async Task LocalImageWithImageDetailInMetadataAsync()
{
    var imageBytes = await EmbeddedResource.ReadAllAsync("sample_image.jpg");

    var builder = Kernel.CreateBuilder();
    var kernel = builder
        .AddOpenAIChatCompletion("gpt-4-vision-preview", TestConfiguration.OpenAI.ApiKey)
        .Build();

    var chatService = kernel.GetRequiredService<IChatCompletionService>();

    // The detail level travels in the metadata under the "ChatImageDetailLevel" key.
    var question = new TextContent("What’s in this image?");
    var image = new ImageContent(imageBytes, "image/jpg")
    {
        Metadata = new Dictionary<string, object?> { ["ChatImageDetailLevel"] = "high" }
    };

    var history = new ChatHistory("You are a friendly assistant.");
    history.AddUserMessage([question, image]);

    var answer = await chatService.GetChatMessageContentAsync(history);

    Console.WriteLine(answer.Content);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,69 @@ public async Task GetChatMessageContentsWithChatMessageContentItemCollectionAndS
Assert.Equal("image_url", contentItems[1].GetProperty("type").GetString());
}

/// <summary>
/// Verifies that the "ChatImageDetailLevel" metadata entry of an image is written to the
/// <c>detail</c> field of the request's <c>image_url</c> object, and that the field is
/// omitted when no detail level is expected.
/// </summary>
/// <param name="detailLevel">Value stored in the image metadata.</param>
/// <param name="expectedDetailLevel">Expected <c>detail</c> value in the request, or null when the field must be absent.</param>
[Theory]
[MemberData(nameof(ImageContentMetadataDetailLevelData))]
public async Task GetChatMessageContentsHandlesImageDetailLevelInMetadataCorrectlyAsync(object? detailLevel, string? expectedDetailLevel)
{
    // Arrange
    var chatCompletion = new OpenAIChatCompletionService(modelId: "gpt-4-vision-preview", apiKey: "NOKEY", httpClient: this._httpClient);

    using var response = new HttpResponseMessage(System.Net.HttpStatusCode.OK) { Content = new StringContent(ChatCompletionResponse) };
    this._messageHandlerStub.ResponseToReturn = response;

    var chatHistory = new ChatHistory();
    chatHistory.AddUserMessage(
    [
        new ImageContent(new Uri("https://image")) { Metadata = new Dictionary<string, object?> { ["ChatImageDetailLevel"] = detailLevel } }
    ]);

    // Act
    await chatCompletion.GetChatMessageContentsAsync(chatHistory);

    // Assert
    // Check the captured bytes before decoding, so a missing request fails as an assertion
    // rather than as an ArgumentNullException from Encoding.GetString.
    var requestBytes = this._messageHandlerStub.RequestContent;
    Assert.NotNull(requestBytes);

    var actualRequestContent = Encoding.UTF8.GetString(requestBytes!);
    var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);

    var messages = optionsJson.GetProperty("messages");
    Assert.Equal(1, messages.GetArrayLength());

    var contentItems = messages[0].GetProperty("content");
    Assert.Equal(1, contentItems.GetArrayLength());

    Assert.Equal("image_url", contentItems[0].GetProperty("type").GetString());

    var imageProperty = contentItems[0].GetProperty("image_url");

    Assert.Equal("https://image/", imageProperty.GetProperty("url").GetString());

    // The expected column of the theory data is authoritative: null means the field must be absent.
    if (expectedDetailLevel is null)
    {
        Assert.False(imageProperty.TryGetProperty("detail", out _));
    }
    else
    {
        Assert.Equal(expectedDetailLevel, imageProperty.GetProperty("detail").GetString());
    }
}

/// <summary>
/// Verifies that an unrecognized "ChatImageDetailLevel" metadata value makes the chat
/// completion call fail with an <see cref="ArgumentException"/>.
/// </summary>
[Fact]
public async Task GetChatMessageContentsThrowsExceptionWithInvalidImageDetailLevelInMetadataAsync()
{
    // Arrange
    var service = new OpenAIChatCompletionService(modelId: "gpt-4-vision-preview", apiKey: "NOKEY", httpClient: this._httpClient);

    var image = new ImageContent(new Uri("https://image"))
    {
        Metadata = new Dictionary<string, object?> { ["ChatImageDetailLevel"] = "invalid_value" }
    };

    var history = new ChatHistory();
    history.AddUserMessage([image]);

    // Act
    Func<Task> act = () => service.GetChatMessageContentsAsync(history);

    // Assert
    await Assert.ThrowsAsync<ArgumentException>(act);
}

[Fact]
public async Task FunctionCallsShouldBePropagatedToCallersViaChatMessageItemsOfTypeFunctionCallContentAsync()
{
Expand Down Expand Up @@ -1558,6 +1621,15 @@ public async Task OnAutoFunctionInvocationAsync(AutoFunctionInvocationContext co
}
""";

/// <summary>
/// Theory rows pairing a "ChatImageDetailLevel" metadata value with the detail level expected
/// in the request; a null expectation is used for the empty-string and null inputs.
/// </summary>
public static TheoryData<object?, string?> ImageContentMetadataDetailLevelData
{
    get
    {
        var rows = new TheoryData<object?, string?>();

        rows.Add("auto", "auto");
        rows.Add("high", "high");
        rows.Add("low", "low");
        rows.Add("", null);
        rows.Add(null, null);

        return rows;
    }
}

#pragma warning disable CS8618, CA1812
private sealed class MathReasoning
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -802,19 +802,44 @@ private static List<ChatMessage> CreateRequestMessages(ChatMessageContent messag

/// <summary>
/// Converts an <see cref="ImageContent"/> into an image content part, passing along the
/// optional detail level resolved from the content's metadata.
/// </summary>
/// <param name="imageContent">Image content carrying either inline data or a URI.</param>
/// <returns>An image part built from the inline data when it is non-empty, otherwise from the URI.</returns>
/// <exception cref="ArgumentException">The content has neither non-empty data nor a URI.</exception>
private static ChatMessageContentPart GetImageContentItem(ImageContent imageContent)
{
    // Resolved first so that a bad metadata value fails before any content part is created.
    ChatImageDetailLevel? detailLevel = GetChatImageDetailLevel(imageContent);

    // Inline bytes are checked before the URI, so they win when both are set.
    if (imageContent.Data is { IsEmpty: false } data)
    {
        return ChatMessageContentPart.CreateImagePart(BinaryData.FromBytes(data), imageContent.MimeType, detailLevel);
    }

    if (imageContent.Uri is not null)
    {
        return ChatMessageContentPart.CreateImagePart(imageContent.Uri, detailLevel);
    }

    throw new ArgumentException($"{nameof(ImageContent)} must have either Data or a Uri.");
}

/// <summary>
/// Reads the optional image detail level from the "ChatImageDetailLevel" entry of the
/// content's metadata.
/// </summary>
/// <param name="imageContent">Image content whose metadata is inspected.</param>
/// <returns>
/// The detail level when the entry holds a <see cref="ChatImageDetailLevel"/>, or a string
/// (plain or as a JSON string element) naming "auto", "low" or "high" in any letter case;
/// null when the metadata or entry is missing, the value is null or whitespace, or the value
/// is of any other type.
/// </returns>
/// <exception cref="ArgumentException">The entry holds a non-blank string that is not a supported level.</exception>
private static ChatImageDetailLevel? GetChatImageDetailLevel(ImageContent imageContent)
{
    const string DetailLevelProperty = "ChatImageDetailLevel";

    if (imageContent.Metadata is null ||
        !imageContent.Metadata.TryGetValue(DetailLevelProperty, out object? detailLevel))
    {
        return null;
    }

    // An already-typed value needs no parsing.
    if (detailLevel is ChatImageDetailLevel typedLevel)
    {
        return typedLevel;
    }

    // After a JSON round-trip, metadata values arrive as JsonElement rather than string,
    // so string-kind elements are unwrapped instead of being silently ignored.
    string? detailLevelString = detailLevel switch
    {
        string text => text,
        System.Text.Json.JsonElement { ValueKind: System.Text.Json.JsonValueKind.String } json => json.GetString(),
        _ => null
    };

    if (detailLevelString is null || string.IsNullOrWhiteSpace(detailLevelString))
    {
        return null;
    }

    return detailLevelString.ToUpperInvariant() switch
    {
        "AUTO" => ChatImageDetailLevel.Auto,
        "LOW" => ChatImageDetailLevel.Low,
        "HIGH" => ChatImageDetailLevel.High,
        _ => throw new ArgumentException($"Unknown image detail level '{detailLevelString}'. Supported values are 'Auto', 'Low' and 'High'.")
    };
}

private OpenAIChatMessageContent CreateChatMessageContent(OpenAIChatCompletion completion, string targetModel)
{
var message = new OpenAIChatMessageContent(completion, targetModel, this.GetChatCompletionMetadata(completion));
Expand Down
20 changes: 20 additions & 0 deletions dotnet/src/SemanticKernel.UnitTests/Contents/ImageContentTests.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.Json;
using Microsoft.SemanticKernel;
Expand Down Expand Up @@ -241,6 +242,25 @@ public void EmptyConstructorSerializationAndDeserializationAsExpected()
Assert.Null(deserialized.Metadata);
}

/// <summary>
/// Verifies that a "ChatImageDetailLevel" metadata entry is still present, with the same
/// textual value, after the image content is serialized to JSON and deserialized again.
/// </summary>
[Fact]
public void MetadataSerializationAndDeserializationWorksCorrectly()
{
    // Arrange
    var metadata = new Dictionary<string, object?> { ["ChatImageDetailLevel"] = "high" };
    var original = new ImageContent() { Metadata = metadata };

    // Act
    var json = JsonSerializer.Serialize(original);
    var roundTripped = JsonSerializer.Deserialize<ImageContent>(json);

    // Assert
    Assert.NotNull(roundTripped?.Metadata);
    Assert.True(roundTripped.Metadata.TryGetValue("ChatImageDetailLevel", out var detail));

    // Compared through ToString() because the deserialized value is not asserted to be a string.
    Assert.Equal("high", detail?.ToString());
}

[Theory]
[InlineData("http://localhost:9090/")]
[InlineData(null)]
Expand Down

0 comments on commit 19fef34

Please sign in to comment.