Skip to content

Commit

Permalink
Add latin1 transcoding bindings (#16)
Browse files Browse the repository at this point in the history
* Latin1 codegen

* Generated

* Basic tests

* No utf32 to latin1

* Fix

* Simplify

---------

Co-authored-by: Nugine <nugine@foxmail.com>
  • Loading branch information
nathanwhit and Nugine committed Sep 1, 2024
1 parent db60b0b commit 3aab406
Show file tree
Hide file tree
Showing 7 changed files with 646 additions and 5 deletions.
20 changes: 17 additions & 3 deletions codegen/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ fn codegen_count() {

fn codegen_transcoding_length() {
for_each_transcoding_length(|from, to| {
if from == "latin1" && to == "utf32" {
return;
}
let from_ch = map_rs_char_type(from);
let from_doc_name = map_doc_name(from);
let to_doc_name = map_doc_name(to);
Expand All @@ -174,9 +177,14 @@ fn codegen_transcoding_length() {
g!("#[inline]");
g!("#[must_use]");
g!("pub unsafe fn {to}_length_from_{from}(src: &[{from_ch}]) -> usize {{");
g!("let len = src.len();");
g!("let buf = src.as_ptr();");
g!("crate::bindings::simdutf_{to}_length_from_{from}(buf, len)");
if is_fixed_length_for_latin1(from, to) {
g!("let len = src.len();");
g!("crate::bindings::simdutf_{to}_length_from_{from}(len)");
} else {
g!("let len = src.len();");
g!("let buf = src.as_ptr();");
g!("crate::bindings::simdutf_{to}_length_from_{from}(buf, len)");
}
g!("}}");
g!();
})
Expand Down Expand Up @@ -213,6 +221,9 @@ fn codegen_transcoding_convert() {
});

for_each_transcoding_convert(|from, to| {
if from == "latin1" || (from == "utf32" && to == "latin1") {
return;
}
let from_ch = map_rs_char_type(from);
let to_ch = map_rs_char_type(to);
let from_doc_name = map_doc_name(from);
Expand All @@ -239,6 +250,9 @@ fn codegen_transcoding_convert() {
});

for_each_transcoding_convert(|from, to| {
if from == "latin1" || (from == "utf32" && to == "latin1") {
return;
}
let from_ch = map_rs_char_type(from);
let to_ch = map_rs_char_type(to);
let from_doc_name = map_doc_name(from);
Expand Down
29 changes: 29 additions & 0 deletions codegen/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@ pub fn codegen_cpp() {
});

for_each_transcoding_length(|from, to| {
if from == "latin1" && to == "utf32" {
return;
}
if is_fixed_length_for_latin1(from, to) {
g!("size_t simdutf_{to}_length_from_{from}(size_t len) {{");
g!(" return simdutf::{to}_length_from_{from}(len);");
g!("}}");
g!();
return;
}
let from_ch = map_cpp_char_type(from);
g!("size_t simdutf_{to}_length_from_{from}(const {from_ch}* buf, size_t len) {{");
g!(" return simdutf::{to}_length_from_{from}(buf, len);");
Expand All @@ -56,6 +66,9 @@ pub fn codegen_cpp() {
});

for_each_transcoding_convert(|from, to| {
if from == "latin1" || (from == "utf32" && to == "latin1") {
return;
}
let from_ch = map_cpp_char_type(from);
let to_ch = map_cpp_char_type(to);
g!("simdutfrs_result_t \
Expand All @@ -68,6 +81,9 @@ pub fn codegen_cpp() {
});

for_each_transcoding_convert(|from, to| {
if from == "latin1" || (from == "utf32" && to == "latin1") {
return;
}
let from_ch = map_cpp_char_type(from);
let to_ch = map_cpp_char_type(to);
g!("size_t simdutf_convert_valid_{from}_to_{to}(const {from_ch}* src, size_t len, {to_ch}* dst) {{");
Expand Down Expand Up @@ -125,6 +141,13 @@ pub fn codegen_rust() {
g!();

for_each_transcoding_length(|from, to| {
if from == "latin1" && to == "utf32" {
return;
}
if is_fixed_length_for_latin1(from, to) {
g!("pub fn simdutf_{to}_length_from_{from}(len: usize) -> usize;");
return;
}
let from_ch = map_rs_char_type(from);
g!("pub fn simdutf_{to}_length_from_{from}(buf: *const {from_ch}, len: usize) -> usize;");
});
Expand All @@ -139,6 +162,9 @@ pub fn codegen_rust() {
g!();

for_each_transcoding_convert(|from, to| {
if from == "latin1" || (from == "utf32" && to == "latin1") {
return;
}
let from_ch = map_rs_char_type(from);
let to_ch = map_rs_char_type(to);
g!("pub fn simdutf_convert_{from}_to_{to}_with_errors\
Expand All @@ -147,6 +173,9 @@ pub fn codegen_rust() {
g!();

for_each_transcoding_convert(|from, to| {
if from == "latin1" || (from == "utf32" && to == "latin1") {
return;
}
let from_ch = map_rs_char_type(from);
let to_ch = map_rs_char_type(to);
g!("pub fn simdutf_convert_valid_{from}_to_{to}\
Expand Down
26 changes: 24 additions & 2 deletions codegen/src/common.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pub const ENCODINGS: [&str; 6] = ["ascii", "utf8", "utf16", "utf16be", "utf16le", "utf32"];
pub const ENCODINGS: [&str; 7] = ["ascii", "utf8", "utf16", "utf16be", "utf16le", "utf32", "latin1"];

pub fn map_cpp_char_type(encoding: &str) -> &str {
match encoding {
Expand All @@ -8,6 +8,7 @@ pub fn map_cpp_char_type(encoding: &str) -> &str {
"utf16be" => "char16_t",
"utf16le" => "char16_t",
"utf32" => "char32_t",
"latin1" => "char",
_ => unimplemented!(),
}
}
Expand All @@ -20,6 +21,7 @@ pub fn map_rs_char_type(encoding: &str) -> &str {
"utf16be" => "u16",
"utf16le" => "u16",
"utf32" => "u32",
"latin1" => "u8",
_ => unimplemented!(),
}
}
Expand All @@ -32,19 +34,23 @@ pub fn map_doc_name(encoding: &str) -> &str {
"utf16be" => "UTF-16BE",
"utf16le" => "UTF-16LE",
"utf32" => "UTF-32",
"latin1" => "Latin1",
_ => unimplemented!(),
}
}

/// Invokes `f` once for every entry of `ENCODINGS`, in declaration order,
/// except `"latin1"`, which is filtered out before the callback is called.
pub fn for_each_validate(mut f: impl FnMut(&str)) {
    let validated = ENCODINGS.into_iter().filter(|&name| name != "latin1");
    for name in validated {
        f(name);
    }
}

pub fn for_each_count(mut f: impl FnMut(&str)) {
for encoding in ENCODINGS {
if matches!(encoding, "ascii" | "utf32") {
if matches!(encoding, "ascii" | "utf32" | "latin1") {
continue;
}
f(encoding);
Expand All @@ -57,6 +63,12 @@ pub fn for_each_transcoding_length(mut f: impl FnMut(&str, &str)) {
if from == "ascii" || to == "ascii" {
continue;
}
if from == "latin1" && (to == "utf16le" || to == "utf16be") {
continue;
}
if (from == "utf16le" || from == "utf16be") && to == "latin1" {
continue;
}
if from == to {
continue;
}
Expand Down Expand Up @@ -87,3 +99,13 @@ pub fn for_each_transcoding_convert(mut f: impl FnMut(&str, &str)) {
}
}
}

/// Reports whether the `from` -> `to` pair is a Latin1 conversion whose
/// counterpart encoding is anything other than `"utf8"`.
///
/// Returns `true` when `from` is `"latin1"` and `to` is not `"utf8"`, or when
/// `to` is `"latin1"` and `from` is not `"utf8"`; returns `false` for every
/// other pair. Note that `("latin1", "latin1")` also yields `true`.
pub fn is_fixed_length_for_latin1(from: &str, to: &str) -> bool {
    match (from, to) {
        // Latin1 as the source: only the UTF-8 target is excluded.
        ("latin1", target) => target != "utf8",
        // Latin1 as the target: only the UTF-8 source is excluded.
        (source, "latin1") => source != "utf8",
        _ => false,
    }
}
Loading

0 comments on commit 3aab406

Please sign in to comment.