Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Initial tokenizer.json support #41

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Tiktoken.sln
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tiktoken.Encodings.Abstract
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tiktoken.Encodings.r50k", "src\libs\Tiktoken.Encodings.r50k\Tiktoken.Encodings.r50k.csproj", "{633C9C98-0782-4CFC-9D26-F27E77FA11EC}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tiktoken.Encodings.Tokenizer", "src\libs\Tiktoken.Encodings.Tokenizer\Tiktoken.Encodings.Tokenizer.csproj", "{1C924F98-52DA-4A18-8FDF-E93D12ABD422}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -83,6 +85,10 @@ Global
{633C9C98-0782-4CFC-9D26-F27E77FA11EC}.Debug|Any CPU.Build.0 = Debug|Any CPU
{633C9C98-0782-4CFC-9D26-F27E77FA11EC}.Release|Any CPU.ActiveCfg = Release|Any CPU
{633C9C98-0782-4CFC-9D26-F27E77FA11EC}.Release|Any CPU.Build.0 = Release|Any CPU
{1C924F98-52DA-4A18-8FDF-E93D12ABD422}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{1C924F98-52DA-4A18-8FDF-E93D12ABD422}.Debug|Any CPU.Build.0 = Debug|Any CPU
{1C924F98-52DA-4A18-8FDF-E93D12ABD422}.Release|Any CPU.ActiveCfg = Release|Any CPU
{1C924F98-52DA-4A18-8FDF-E93D12ABD422}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand All @@ -100,5 +106,6 @@ Global
{40F00E7F-84D7-4886-95F4-DC96BDE90965} = {3B092566-9A2F-4C00-BFF1-90C0A6BE8C62}
{3EDC6F81-2C49-4B8D-9AAC-5B7C40D5A1CF} = {3B092566-9A2F-4C00-BFF1-90C0A6BE8C62}
{633C9C98-0782-4CFC-9D26-F27E77FA11EC} = {3B092566-9A2F-4C00-BFF1-90C0A6BE8C62}
{1C924F98-52DA-4A18-8FDF-E93D12ABD422} = {3B092566-9A2F-4C00-BFF1-90C0A6BE8C62}
EndGlobalSection
EndGlobal
1 change: 1 addition & 0 deletions src/Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageVersion>
<PackageVersion Include="SharpToken" Version="2.0.3" />
<PackageVersion Include="System.Text.Json" Version="8.0.4" />
<PackageVersion Include="System.ValueTuple" Version="4.5.0" />
<PackageVersion Include="TiktokenSharp" Version="1.1.4" />
<PackageVersion Include="Verify.MSTest" Version="25.3.0" />
Expand Down
8 changes: 5 additions & 3 deletions src/libs/Tiktoken.Core/CoreBPE.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@ public class CoreBpe
/// <param name="encoder"></param>
/// <param name="specialTokensEncoder"></param>
/// <param name="pattern"></param>
/// <param name="compiled"></param>
public CoreBpe(
IReadOnlyDictionary<byte[], int> encoder,
IReadOnlyDictionary<string, int> specialTokensEncoder,
string pattern)
string pattern,
bool compiled = true)
{
encoder = encoder ?? throw new ArgumentNullException(nameof(encoder));
specialTokensEncoder = specialTokensEncoder ?? throw new ArgumentNullException(nameof(specialTokensEncoder));
Expand All @@ -49,8 +51,8 @@ public CoreBpe(
static x => x.Value);
SpecialTokensEncoder = specialTokensEncoder;

Regex = new Regex(pattern, RegexOptions.Compiled);
SpecialRegex = new Regex("(" + string.Join("|", specialTokensEncoder.Keys.Select(Regex.Escape)) + ")", RegexOptions.Compiled);
Regex = new Regex(pattern, compiled ? RegexOptions.Compiled : RegexOptions.None);
SpecialRegex = new Regex("(" + string.Join("|", specialTokensEncoder.Keys.Select(Regex.Escape)) + ")", compiled ? RegexOptions.Compiled : RegexOptions.None);

Decoder = Encoder
.ToDictionary(
Expand Down
2 changes: 1 addition & 1 deletion src/libs/Tiktoken.Core/Encoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public Encoder(Encoding encoding)
{
encoding = encoding ?? throw new ArgumentNullException(nameof(encoding));

_corePbe = new CoreBpe(encoding.MergeableRanks, encoding.SpecialTokens, encoding.Pattern);
_corePbe = new CoreBpe(encoding.MergeableRanks, encoding.SpecialTokens, encoding.Pattern, encoding.CompiledRegex);
_specialTokensSet = [..encoding.SpecialTokens.Keys];
}

Expand Down
5 changes: 5 additions & 0 deletions src/libs/Tiktoken.Encodings.Abstractions/Encoding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ public class Encoding
/// </summary>
public IReadOnlyDictionary<string, int> SpecialTokens { get; set; }

/// <summary>
///
/// </summary>
public bool CompiledRegex { get; set; } = true;

/// <summary>
///
/// </summary>
Expand Down
51 changes: 51 additions & 0 deletions src/libs/Tiktoken.Encodings.Tokenizer/AddedToken.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
using System.Text.Json.Serialization;

namespace Tiktoken.Encodings;

/// <summary>
/// JSON-serializable model of a single added-token entry. Every property is bound
/// to a lower-case / snake_case JSON field through <see cref="JsonPropertyNameAttribute"/>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably mirrors one element of the <c>added_tokens</c> array of a
/// <c>tokenizer.json</c> file — confirm against the type that consumes this class.
/// </remarks>
public class AddedToken
{
/// <summary>
/// Gets or sets the value of the <c>id</c> JSON field.
/// Presumably the numeric id assigned to the token — TODO confirm.
/// </summary>
[JsonPropertyName("id")]
public int Id { get; set; }

/// <summary>
/// Gets or sets the value of the <c>special</c> JSON field.
/// Presumably marks the token as a special token — TODO confirm.
/// </summary>
[JsonPropertyName("special")]
public bool Special { get; set; }

/// <summary>
/// Gets or sets the value of the <c>content</c> JSON field.
/// Defaults to an empty string. Presumably the literal text of the token — TODO confirm.
/// </summary>
[JsonPropertyName("content")]
public string Content { get; set; } = string.Empty;

/// <summary>
/// Gets or sets the value of the <c>single_word</c> JSON field.
/// </summary>
[JsonPropertyName("single_word")]
public bool SingleWord { get; set; }

/// <summary>
/// Gets or sets the value of the <c>lstrip</c> JSON field.
/// </summary>
[JsonPropertyName("lstrip")]
public bool Lstrip { get; set; }

/// <summary>
/// Gets or sets the value of the <c>rstrip</c> JSON field.
/// </summary>
[JsonPropertyName("rstrip")]
public bool Rstrip { get; set; }

/// <summary>
/// Gets or sets the value of the <c>normalized</c> JSON field.
/// </summary>
[JsonPropertyName("normalized")]
public bool Normalized { get; set; }
}
27 changes: 27 additions & 0 deletions src/libs/Tiktoken.Encodings.Tokenizer/Decoder.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
using System.Text.Json.Serialization;

namespace Tiktoken.Encodings;

/// <summary>
/// JSON-serializable model of a decoder description. Every property is bound to a
/// snake_case JSON field through <see cref="JsonPropertyNameAttribute"/>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably mirrors the <c>decoder</c> section of a
/// <c>tokenizer.json</c> file — confirm against the type that consumes this class.
/// </remarks>
public class Decoder
{
/// <summary>
/// Gets or sets the value of the <c>type</c> JSON field. Defaults to an empty string.
/// </summary>
[JsonPropertyName("type")]
public string Type { get; set; } = string.Empty;

/// <summary>
/// Gets or sets the value of the <c>add_prefix_space</c> JSON field.
/// Stays <see langword="null"/> unless a value is assigned.
/// </summary>
[JsonPropertyName("add_prefix_space")]
public bool? AddPrefixSpace { get; set; }

/// <summary>
/// Gets or sets the value of the <c>trim_offsets</c> JSON field.
/// Stays <see langword="null"/> unless a value is assigned.
/// </summary>
[JsonPropertyName("trim_offsets")]
public bool? TrimOffsets { get; set; }
}
51 changes: 51 additions & 0 deletions src/libs/Tiktoken.Encodings.Tokenizer/Model.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
using System.Text.Json.Serialization;

namespace Tiktoken.Encodings;

/// <summary>
/// JSON-serializable model description. Every property is bound to a snake_case
/// JSON field through <see cref="JsonPropertyNameAttribute"/>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably mirrors the <c>model</c> section of a
/// <c>tokenizer.json</c> file — confirm against the type that consumes this class.
/// </remarks>
public class Model
{
    /// <summary>
    /// Gets or sets the value of the <c>dropout</c> JSON field.
    /// Typed as <see cref="object"/>, so the raw JSON value is kept without interpretation.
    /// </summary>
    [JsonPropertyName("dropout")]
    public object? Dropout { get; set; }

    /// <summary>
    /// Gets or sets the value of the <c>unk_token</c> JSON field.
    /// Typed as <see cref="object"/>, so the raw JSON value is kept without interpretation.
    /// </summary>
    [JsonPropertyName("unk_token")]
    public object? UnkToken { get; set; }

    /// <summary>
    /// Gets or sets the value of the <c>continuing_subword_prefix</c> JSON field.
    /// Defaults to an empty string.
    /// </summary>
    /// <remarks>
    /// NOTE(review): if the JSON carries an explicit <c>null</c> here, the serializer may
    /// assign it despite the non-nullable annotation — confirm callers tolerate that.
    /// </remarks>
    [JsonPropertyName("continuing_subword_prefix")]
    public string ContinuingSubwordPrefix { get; set; } = string.Empty;

    /// <summary>
    /// Gets or sets the value of the <c>end_of_word_suffix</c> JSON field.
    /// Defaults to an empty string.
    /// </summary>
    /// <remarks>
    /// NOTE(review): if the JSON carries an explicit <c>null</c> here, the serializer may
    /// assign it despite the non-nullable annotation — confirm callers tolerate that.
    /// </remarks>
    [JsonPropertyName("end_of_word_suffix")]
    public string EndOfWordSuffix { get; set; } = string.Empty;

    /// <summary>
    /// Gets or sets the value of the <c>fuse_unk</c> JSON field.
    /// Stays <see langword="null"/> unless a value is assigned.
    /// </summary>
    [JsonPropertyName("fuse_unk")]
    public bool? FuseUnk { get; set; }

    /// <summary>
    /// Gets or sets the value of the <c>vocab</c> JSON field: a map from string keys
    /// to integer values. Defaults to an empty dictionary.
    /// </summary>
    [JsonPropertyName("vocab")]
    public IReadOnlyDictionary<string, int> Vocab { get; set; } = new Dictionary<string, int>();

    /// <summary>
    /// Gets or sets the value of the <c>merges</c> JSON field: a list of strings.
    /// Defaults to an empty list.
    /// </summary>
    /// <remarks>
    /// The setter is required: System.Text.Json skips get-only properties when
    /// deserializing, which would leave this list permanently empty.
    /// </remarks>
    [JsonPropertyName("merges")]
    public IReadOnlyList<string> Merges { get; set; } = [];
}
27 changes: 27 additions & 0 deletions src/libs/Tiktoken.Encodings.Tokenizer/PostProcessor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
using System.Text.Json.Serialization;

namespace Tiktoken.Encodings;

/// <summary>
/// JSON-serializable model of a post-processor description. Every property is bound
/// to a snake_case JSON field through <see cref="JsonPropertyNameAttribute"/>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably mirrors the <c>post_processor</c> section of a
/// <c>tokenizer.json</c> file — confirm against the type that consumes this class.
/// </remarks>
public class PostProcessor
{
/// <summary>
/// Gets or sets the value of the <c>type</c> JSON field. Defaults to an empty string.
/// </summary>
[JsonPropertyName("type")]
public string Type { get; set; } = string.Empty;

/// <summary>
/// Gets or sets the value of the <c>add_prefix_space</c> JSON field.
/// Stays <see langword="null"/> unless a value is assigned.
/// </summary>
[JsonPropertyName("add_prefix_space")]
public bool? AddPrefixSpace { get; set; }

/// <summary>
/// Gets or sets the value of the <c>trim_offsets</c> JSON field.
/// Stays <see langword="null"/> unless a value is assigned.
/// </summary>
[JsonPropertyName("trim_offsets")]
public bool? TrimOffsets { get; set; }
}
27 changes: 27 additions & 0 deletions src/libs/Tiktoken.Encodings.Tokenizer/PreTokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
using System.Text.Json.Serialization;

namespace Tiktoken.Encodings;

/// <summary>
/// JSON-serializable model of a pre-tokenizer description. Every property is bound
/// to a snake_case JSON field through <see cref="JsonPropertyNameAttribute"/>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably mirrors the <c>pre_tokenizer</c> section of a
/// <c>tokenizer.json</c> file — confirm against the type that consumes this class.
/// </remarks>
public class PreTokenizer
{
/// <summary>
/// Gets or sets the value of the <c>type</c> JSON field. Defaults to an empty string.
/// </summary>
[JsonPropertyName("type")]
public string Type { get; set; } = string.Empty;

/// <summary>
/// Gets or sets the value of the <c>add_prefix_space</c> JSON field.
/// Stays <see langword="null"/> unless a value is assigned.
/// </summary>
[JsonPropertyName("add_prefix_space")]
public bool? AddPrefixSpace { get; set; }

/// <summary>
/// Gets or sets the value of the <c>trim_offsets</c> JSON field.
/// Stays <see langword="null"/> unless a value is assigned.
/// </summary>
[JsonPropertyName("trim_offsets")]
public bool? TrimOffsets { get; set; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
using System.Text.Json.Serialization;

namespace Tiktoken.Encodings;

/// <summary>
/// System.Text.Json serializer context for this assembly. The
/// <see cref="JsonSerializableAttribute"/> below registers <c>Tokenizer</c> as a type
/// to include in this context; the class is <c>partial</c> so that generated code can
/// supply the rest of the implementation.
/// </summary>
[JsonSerializable(typeof(Tokenizer))]
internal partial class SourceGenerationContext : JsonSerializerContext;
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFrameworks>net4.6.2;netstandard2.0;netstandard2.1;net6.0;net8.0</TargetFrameworks>
<RootNamespace>Tiktoken.Encodings</RootNamespace>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\Tiktoken.Encodings.Abstractions\Tiktoken.Encodings.Abstractions.csproj" />
</ItemGroup>

<ItemGroup>
<PackageReference Include="System.Text.Json" />
</ItemGroup>

</Project>
Loading
Loading