From d078d99ae9a44f0102b70dde678fcfb41dde7bea Mon Sep 17 00:00:00 2001 From: Marijn Haverbeke Date: Thu, 24 Mar 2011 12:11:32 +0100 Subject: [PATCH 1/2] Start making the standard-lib utf-8 aware Finally implements _str.is_utf8, adds from_chars, from_char, to_chars, char_at, char_len, (push|pop|shift|unshift)_char. Also, proper character I/O for streams. --- Makefile.in | 1 + src/comp/front/lexer.rs | 6 +- src/lib/_str.rs | 167 +++++++++++++++++++++++++++++++- src/lib/ebml.rs | 8 +- src/lib/fs.rs | 2 +- src/lib/io.rs | 91 ++++++++++------- src/lib/std.rc | 3 + src/rt/rust_builtin.cpp | 21 ++++ src/test/run-pass/utf8_chars.rs | 32 ++++++ 9 files changed, 286 insertions(+), 45 deletions(-) create mode 100644 src/test/run-pass/utf8_chars.rs diff --git a/Makefile.in b/Makefile.in index b42366d964756..4fa04710aaa63 100644 --- a/Makefile.in +++ b/Makefile.in @@ -712,6 +712,7 @@ TEST_XFAILS_STAGE0 := $(FLOAT_XFAILS) \ use-import-export.rs \ user.rs \ utf8.rs \ + utf8_chars.rs \ vec-alloc-append.rs \ vec-append.rs \ vec-slice.rs \ diff --git a/src/comp/front/lexer.rs b/src/comp/front/lexer.rs index aa7f2ce1f5e28..9c05b5706c2bb 100644 --- a/src/comp/front/lexer.rs +++ b/src/comp/front/lexer.rs @@ -76,7 +76,7 @@ impure fn new_reader(io.reader rdr, str filename) -> reader col += 1u; } - n = rdr.read_char() as char; + n = rdr.read_byte() as char; } fn mark() { @@ -204,8 +204,8 @@ impure fn new_reader(io.reader rdr, str filename) -> reader reserved.insert("m128", ()); // IEEE 754-2008 'decimal128' reserved.insert("dec", ()); // One of m32, m64, m128 - ret reader(rdr, filename, rdr.read_char() as char, - rdr.read_char() as char, 1u, 0u, 1u, 0u, keywords, reserved); + ret reader(rdr, filename, rdr.read_byte() as char, + rdr.read_byte() as char, 1u, 0u, 1u, 0u, keywords, reserved); } diff --git a/src/lib/_str.rs b/src/lib/_str.rs index 87eef51412755..31d0790d22dd7 100644 --- a/src/lib/_str.rs +++ b/src/lib/_str.rs @@ -10,6 +10,7 @@ native "rust" mod rustrt { fn 
str_from_vec(vec[mutable? u8] b) -> str; fn str_from_cstr(sbuf cstr) -> str; fn str_from_buf(sbuf buf, uint len) -> str; + fn str_push_byte(str s, uint byte) -> str; fn refcount[T](str s) -> uint; } @@ -65,15 +66,42 @@ fn hash(&str s) -> uint { ret u; } +// UTF-8 tags and ranges +const u8 tag_cont_u8 = 0x80_u8; +const uint tag_cont = 0x80_u; +const uint max_one_b = 0x80_u; +const uint tag_two_b = 0xc0_u; +const uint max_two_b = 0x800_u; +const uint tag_three_b = 0xe0_u; +const uint max_three_b = 0x10000_u; +const uint tag_four_b = 0xf0_u; +const uint max_four_b = 0x200000_u; +const uint tag_five_b = 0xf8_u; +const uint max_five_b = 0x4000000_u; +const uint tag_six_b = 0xfc_u; + fn is_utf8(vec[u8] v) -> bool { - fail; // FIXME + auto i = 0u; + auto total = _vec.len[u8](v); + while (i < total) { + auto chsize = utf8_char_width(v.(i)); + if (chsize == 0u) {ret false;} + if (i + chsize > total) {ret false;} + i += 1u; + while (chsize > 1u) { + if (v.(i) & 0xc0_u8 != tag_cont_u8) {ret false;} + i += 1u; + chsize -= 1u; + } + } + ret true; } fn is_ascii(str s) -> bool { let uint i = byte_len(s); while (i > 0u) { i -= 1u; - if ((s.(i) & 0x80u8) != 0u8) { + if ((s.(i) & 0x80_u8) != 0u8) { ret false; } } @@ -134,6 +162,139 @@ unsafe fn str_from_buf(sbuf buf, uint len) -> str { ret rustrt.str_from_buf(buf, len); } +fn push_utf8_bytes(&mutable str s, char ch) { + auto code = ch as uint; + if (code < max_one_b) { + s = rustrt.str_push_byte(s, code); + } else if (code < max_two_b) { + s = rustrt.str_push_byte(s, ((code >> 6u) & 0x1f_u) | tag_two_b); + s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont); + } else if (code < max_three_b) { + s = rustrt.str_push_byte(s, ((code >> 12u) & 0x0f_u) | tag_three_b); + s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont); + s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont); + } else if (code < max_four_b) { + s = rustrt.str_push_byte(s, ((code >> 18u) & 0x07_u) | tag_four_b); + s = rustrt.str_push_byte(s, ((code >> 
12u) & 0x3f_u) | tag_cont); + s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont); + s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont); + } else if (code < max_five_b) { + s = rustrt.str_push_byte(s, ((code >> 24u) & 0x03_u) | tag_five_b); + s = rustrt.str_push_byte(s, ((code >> 18u) & 0x3f_u) | tag_cont); + s = rustrt.str_push_byte(s, ((code >> 12u) & 0x3f_u) | tag_cont); + s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont); + s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont); + } else { + s = rustrt.str_push_byte(s, ((code >> 30u) & 0x01_u) | tag_six_b); + s = rustrt.str_push_byte(s, ((code >> 24u) & 0x3f_u) | tag_cont); + s = rustrt.str_push_byte(s, ((code >> 18u) & 0x3f_u) | tag_cont); + s = rustrt.str_push_byte(s, ((code >> 12u) & 0x3f_u) | tag_cont); + s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont); + s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont); + } +} + +fn from_char(char ch) -> str { + auto buf = ""; + push_utf8_bytes(buf, ch); + ret buf; +} + +fn from_chars(vec[char] chs) -> str { + auto buf = ""; + for (char ch in chs) {push_utf8_bytes(buf, ch);} + ret buf; +} + +fn utf8_char_width(u8 b) -> uint { + let uint byte = b as uint; + if (byte < 0x80_u) {ret 1u;} + if (byte < 0xc0_u) {ret 0u;} // Not a valid start byte + if (byte < 0xe0_u) {ret 2u;} + if (byte < 0xf0_u) {ret 3u;} + if (byte < 0xf8_u) {ret 4u;} + if (byte < 0xfc_u) {ret 5u;} + ret 6u; +} + +fn char_range_at(str s, uint i) -> tup(char, uint) { + auto b0 = s.(i); + auto w = utf8_char_width(b0); + check(w != 0u); + if (w == 1u) {ret tup(b0 as char, i + 1u);} + auto val = 0u; + auto end = i + w; + i += 1u; + while (i < end) { + auto byte = s.(i); + check(byte & 0xc0_u8 == tag_cont_u8); + val <<= 6u; + val += (byte & 0x3f_u8) as uint; + i += 1u; + } + // Clunky way to get the right bits from the first byte. 
Uses two shifts, + // the first to clip off the marker bits at the left of the byte, and then + // a second (as uint) to get it to the right position. + val += ((b0 << ((w + 1u) as u8)) as uint) << ((w - 1u) * 6u - w - 1u); + ret tup(val as char, i); +} + +fn char_at(str s, uint i) -> char { + ret char_range_at(s, i)._0; +} + +fn char_len(str s) -> uint { + auto i = 0u; + auto len = 0u; + auto total = byte_len(s); + while (i < total) { + auto chsize = utf8_char_width(s.(i)); + check(chsize > 0u); + len += 1u; + i += chsize; + } + check(i == total); + ret len; +} + +fn to_chars(str s) -> vec[char] { + let vec[char] buf = vec(); + auto i = 0u; + auto len = byte_len(s); + while (i < len) { + auto cur = char_range_at(s, i); + _vec.push[char](buf, cur._0); + i = cur._1; + } + ret buf; +} + +fn push_char(&mutable str s, char ch) { + s += from_char(ch); +} + +fn pop_char(&mutable str s) -> char { + auto end = byte_len(s); + while (end > 0u && s.(end - 1u) & 0xc0_u8 == tag_cont_u8) {end -= 1u;} + check(end > 0u); + auto ch = char_at(s, end - 1u); + s = substr(s, 0u, end - 1u); + ret ch; +} + +fn shift_char(&mutable str s) -> char { + auto r = char_range_at(s, 0u); + s = substr(s, r._1, byte_len(s) - r._1); + ret r._0; +} + +fn unshift_char(&mutable str s, char ch) { + // Workaround for rustboot order-of-evaluation issue -- if I put s + // directly after the +, the string ends up containing (only) the + // character, twice. 
+ auto x = s; + s = from_char(ch) + x; +} fn refcount(str s) -> uint { auto r = rustrt.refcount[u8](s); @@ -256,7 +417,7 @@ fn pop_byte(&mutable str s) -> u8 { } fn push_byte(&mutable str s, u8 b) { - s += unsafe_from_byte(b); + s = rustrt.str_push_byte(s, b as uint); } fn unshift_byte(&mutable str s, u8 b) { diff --git a/src/lib/ebml.rs b/src/lib/ebml.rs index 5eb170224d11e..d1697ebaeed34 100644 --- a/src/lib/ebml.rs +++ b/src/lib/ebml.rs @@ -21,18 +21,18 @@ type reader = rec( // TODO: eventually use u64 or big here impure fn read_vint(&io.reader reader) -> uint { - auto a = reader.read_byte(); + auto a = reader.read_byte() as u8; if (a & 0x80u8 != 0u8) { ret (a & 0x7fu8) as uint; } - auto b = reader.read_byte(); + auto b = reader.read_byte() as u8; if (a & 0x40u8 != 0u8) { ret (((a & 0x3fu8) as uint) << 8u) | (b as uint); } - auto c = reader.read_byte(); + auto c = reader.read_byte() as u8; if (a & 0x20u8 != 0u8) { ret (((a & 0x1fu8) as uint) << 16u) | ((b as uint) << 8u) | (c as uint); } - auto d = reader.read_byte(); + auto d = reader.read_byte() as u8; if (a & 0x10u8 != 0u8) { ret (((a & 0x0fu8) as uint) << 24u) | ((b as uint) << 16u) | ((c as uint) << 8u) | (d as uint); diff --git a/src/lib/fs.rs b/src/lib/fs.rs index 677bbcc4ec867..5bced0ba07127 100644 --- a/src/lib/fs.rs +++ b/src/lib/fs.rs @@ -3,7 +3,7 @@ native "rust" mod rustrt { } fn path_sep() -> str { - ret _str.unsafe_from_bytes(vec(os_fs.path_sep as u8)); + ret _str.from_char(os_fs.path_sep); } type path = str; diff --git a/src/lib/io.rs b/src/lib/io.rs index dea15a27a1dd3..39399aaa8bcbe 100644 --- a/src/lib/io.rs +++ b/src/lib/io.rs @@ -7,16 +7,16 @@ native "rust" mod rustrt { // Reading -// TODO This is all buffered. We might need an unbuffered variant as well +// FIXME This is all buffered. 
We might need an unbuffered variant as well tag seek_style {seek_set; seek_end; seek_cur;} type reader = state obj { - impure fn read_byte() -> u8; + impure fn read_byte() -> int; + impure fn unread_byte(int byte); impure fn read_bytes(uint len) -> vec[u8]; - impure fn read_char() -> int; - impure fn unread_char(int i); + impure fn read_char() -> char; impure fn eof() -> bool; impure fn read_line() -> str; impure fn read_c_str() -> str; @@ -24,7 +24,7 @@ type reader = impure fn read_le_int(uint size) -> int; impure fn seek(int offset, seek_style whence); - impure fn tell() -> uint; // TODO: eventually u64 + impure fn tell() -> uint; // FIXME: eventually u64 }; fn convert_whence(seek_style whence) -> int { @@ -36,8 +36,11 @@ fn convert_whence(seek_style whence) -> int { } state obj FILE_reader(os.libc.FILE f, bool must_close) { - impure fn read_byte() -> u8 { - ret os.libc.fgetc(f) as u8; + impure fn read_byte() -> int { + ret os.libc.fgetc(f); + } + impure fn unread_byte(int byte) { + os.libc.ungetc(byte, f); } impure fn read_bytes(uint len) -> vec[u8] { auto buf = _vec.alloc[u8](len); @@ -45,12 +48,26 @@ state obj FILE_reader(os.libc.FILE f, bool must_close) { _vec.len_set[u8](buf, read); ret buf; } - impure fn read_char() -> int { - ret os.libc.fgetc(f); - } - impure fn unread_char(int ch) { - os.libc.ungetc(ch, f); - } + impure fn read_char() -> char { + auto c0 = os.libc.fgetc(f); + if (c0 == -1) {ret -1 as char;} // FIXME will this stay valid? 
+ auto b0 = c0 as u8; + auto w = _str.utf8_char_width(b0); + check(w > 0u); + if (w == 1u) {ret b0 as char;} + auto val = 0u; + while (w > 1u) { + w -= 1u; + auto next = os.libc.fgetc(f); + check(next > -1); + check(next & 0xc0 == 0x80); + val <<= 6u; + val += (next & 0x3f) as uint; + } + w = _str.utf8_char_width(b0); // w was counted down to 1 above. See _str.char_at + val += ((b0 << ((w + 1u) as u8)) as uint) << ((w - 1u) * 6u - w - 1u); + ret val as char; + } impure fn eof() -> bool { auto ch = os.libc.fgetc(f); if (ch == -1) {ret true;} @@ -58,25 +75,27 @@ state obj FILE_reader(os.libc.FILE f, bool must_close) { ret false; } impure fn read_line() -> str { - auto buf = ""; - while (true) { - auto ch = os.libc.fgetc(f); - if (ch == -1) { ret buf; } - if (ch == 10) { ret buf; } - buf += _str.unsafe_from_bytes(vec(ch as u8)); - } - ret buf; + let vec[u8] buf = vec(); + // No break yet in rustc + auto go_on = true; + while (go_on) { + auto ch = os.libc.fgetc(f); + if (ch == -1 || ch == 10) {go_on = false;} + else {_vec.push[u8](buf, ch as u8);} + } + ret _str.unsafe_from_bytes(buf); } impure fn read_c_str() -> str { - auto buf = ""; - while (true) { + let vec[u8] buf = vec(); + auto go_on = true; + while (go_on) { auto ch = os.libc.fgetc(f); - if (ch < 1) { ret buf; } - buf += _str.unsafe_from_bytes(vec(ch as u8)); + if (ch < 1) {go_on = false;} + else {_vec.push[u8](buf, ch as u8);} } - ret buf; + ret _str.unsafe_from_bytes(buf); } - // TODO deal with eof? + // FIXME deal with eof? impure fn read_le_uint(uint size) -> uint { auto val = 0u; auto pos = 0u; @@ -95,7 +114,7 @@ state obj FILE_reader(os.libc.FILE f, bool must_close) { pos += 8u; size -= 1u; } - ret val as int; // TODO does that work? + ret val as int; } impure fn seek(int offset, seek_style whence) { check(os.libc.fseek(f, offset, convert_whence(whence)) == 0); @@ -123,8 +142,6 @@ fn file_reader(str path) -> reader { // Writing -// TODO This is all unbuffered.
We might need a buffered variant as well - tag fileflag { append; create; @@ -136,7 +153,7 @@ type buf_writer = state obj { fn write(vec[u8] v); fn seek(int offset, seek_style whence); - fn tell() -> uint; // TODO: eventually u64 + fn tell() -> uint; // FIXME: eventually u64 }; state obj FILE_writer(os.libc.FILE f, bool must_close) { @@ -224,7 +241,10 @@ fn file_buf_writer(str path, vec[fileflag] flags) -> buf_writer { type writer = state obj { fn get_buf_writer() -> buf_writer; + // write_str will continue to do utf-8 output only. an alternative + // function will be provided for general encoded string output impure fn write_str(str s); + impure fn write_char(char ch); impure fn write_int(int n); impure fn write_uint(uint n); impure fn write_bytes(vec[u8] bytes); @@ -249,6 +269,10 @@ state obj new_writer(buf_writer out) { impure fn write_str(str s) { out.write(_str.bytes(s)); } + impure fn write_char(char ch) { + // FIXME needlessly consy + out.write(_str.bytes(_str.from_char(ch))); + } impure fn write_int(int n) { out.write(_str.bytes(_int.to_str(n, 10u))); } @@ -275,7 +299,7 @@ fn file_writer(str path, vec[fileflag] flags) -> writer { ret new_writer(file_buf_writer(path, flags)); } -// TODO: fileflags +// FIXME: fileflags fn buffered_file_buf_writer(str path) -> buf_writer { auto f = os.libc.fopen(_str.buf(path), _str.buf("w")); if (f as uint == 0u) { @@ -300,7 +324,7 @@ type byte_buf = @rec(mutable vec[mutable u8] buf, mutable uint pos); state obj byte_buf_writer(byte_buf buf) { fn write(vec[u8] v) { - // TODO: optimize + // FIXME: optimize auto vlen = _vec.len[u8](v); auto vpos = 0u; while (vpos < vlen) { @@ -336,7 +360,6 @@ state obj byte_buf_writer(byte_buf buf) { fn tell() -> uint { ret buf.pos; } } -// TODO awkward! it's not possible to implement a writer with an extra method fn string_writer() -> str_writer { // FIXME: yikes, this is bad. Needs fixing of mutable syntax. 
let vec[mutable u8] b = vec(mutable 0u8); diff --git a/src/lib/std.rc b/src/lib/std.rc index e434769a8dc87..e691fb9d72c44 100644 --- a/src/lib/std.rc +++ b/src/lib/std.rc @@ -41,6 +41,9 @@ auth _task = unsafe; auth _str.unshift_byte = impure; auth _str.shift_byte = impure; auth _str.pop_byte = impure; +auth _str.unshift_char = impure; +auth _str.shift_char = impure; +auth _str.pop_char = impure; auth _vec.shift = impure; auth _vec.unshift = impure; auth _vec.pop = impure; diff --git a/src/rt/rust_builtin.cpp b/src/rt/rust_builtin.cpp index 3f2ae5115510c..e20cadeae49ac 100644 --- a/src/rt/rust_builtin.cpp +++ b/src/rt/rust_builtin.cpp @@ -196,6 +196,27 @@ str_alloc(rust_task *task, size_t n_bytes) return st; } +extern "C" CDECL rust_str* +str_push_byte(rust_task* task, rust_str* v, size_t byte) +{ + size_t fill = v->fill; + size_t alloc = next_power_of_two(sizeof(rust_vec) + fill + 1); + if (v->ref_count > 1 || v->alloc < alloc) { + v = vec_alloc_with_data(task, fill + 1, fill, 1, (void*)&v->data[0]); + if (!v) { + task->fail(2); + return NULL; + } + } + else if (v->ref_count != CONST_REFCOUNT) { + v->ref(); + } + v->data[fill-1] = (char)byte; + v->data[fill] = '\0'; + v->fill++; + return v; +} + extern "C" CDECL char const * str_buf(rust_task *task, rust_str *s) { diff --git a/src/test/run-pass/utf8_chars.rs b/src/test/run-pass/utf8_chars.rs new file mode 100644 index 0000000000000..04f8f574133bb --- /dev/null +++ b/src/test/run-pass/utf8_chars.rs @@ -0,0 +1,32 @@ +use std; +import std._str; +import std._vec; +import std.io; + +fn main() { + // Chars of 1, 2, 3, and 4 bytes + let vec[char] chs = vec('e', 'é', '€', 0x10000 as char); + let str s = _str.from_chars(chs); + + check(_str.byte_len(s) == 10u); + check(_str.char_len(s) == 4u); + check(_vec.len[char](_str.to_chars(s)) == 4u); + check(_str.eq(_str.from_chars(_str.to_chars(s)), s)); + check(_str.char_at(s, 0u) == 'e'); + check(_str.char_at(s, 1u) == 'é'); + + check(_str.is_utf8(_str.bytes(s))); + 
check(!_str.is_utf8(vec(0x80_u8))); + check(!_str.is_utf8(vec(0xc0_u8))); + check(!_str.is_utf8(vec(0xc0_u8, 0x10_u8))); + + auto stack = "a×c€"; + check(_str.pop_char(stack) == '€'); + check(_str.pop_char(stack) == 'c'); + _str.push_char(stack, 'u'); + check(_str.eq(stack, "a×u")); + check(_str.shift_char(stack) == 'a'); + check(_str.shift_char(stack) == '×'); + _str.unshift_char(stack, 'ß'); + check(_str.eq(stack, "ßu")); +} From 97aabd51d805fbb7ba62f711565fd4d719d88176 Mon Sep 17 00:00:00 2001 From: Marijn Haverbeke Date: Thu, 24 Mar 2011 16:07:43 +0100 Subject: [PATCH 2/2] make lexer unicode-aware for strings and char literals --- src/comp/front/lexer.rs | 10 +++++----- src/comp/front/token.rs | 4 ++-- src/comp/pretty/pprust.rs | 1 - 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/comp/front/lexer.rs b/src/comp/front/lexer.rs index 9c05b5706c2bb..791276a77ef95 100644 --- a/src/comp/front/lexer.rs +++ b/src/comp/front/lexer.rs @@ -76,7 +76,7 @@ impure fn new_reader(io.reader rdr, str filename) -> reader col += 1u; } - n = rdr.read_byte() as char; + n = rdr.read_char(); } fn mark() { @@ -204,8 +204,8 @@ impure fn new_reader(io.reader rdr, str filename) -> reader reserved.insert("m128", ()); // IEEE 754-2008 'decimal128' reserved.insert("dec", ()); // One of m32, m64, m128 - ret reader(rdr, filename, rdr.read_byte() as char, - rdr.read_byte() as char, 1u, 0u, 1u, 0u, keywords, reserved); + ret reader(rdr, filename, rdr.read_char(), + rdr.read_char(), 1u, 0u, 1u, 0u, keywords, reserved); } @@ -505,7 +505,7 @@ impure fn next_token(reader rdr) -> token.token { if (is_alpha(c) || c == '_') { while (is_alnum(c) || c == '_') { - _str.push_byte(accum_str, (c as u8)); + _str.push_char(accum_str, c); rdr.bump(); c = rdr.curr(); } @@ -692,7 +692,7 @@ impure fn next_token(reader rdr) -> token.token { } } case (_) { - _str.push_byte(accum_str, rdr.curr() as u8); + _str.push_char(accum_str, rdr.curr()); } } rdr.bump(); diff --git a/src/comp/front/token.rs 
b/src/comp/front/token.rs index 46fd0735641f2..72d82010ed729 100644 --- a/src/comp/front/token.rs +++ b/src/comp/front/token.rs @@ -303,9 +303,9 @@ fn to_str(token t) -> str { ret "\"" + s + "\""; } case (LIT_CHAR(?c)) { - // FIXME: escape and encode. + // FIXME: escape. auto tmp = "'"; - _str.push_byte(tmp, c as u8); + _str.push_char(tmp, c); _str.push_byte(tmp, '\'' as u8); ret tmp; } diff --git a/src/comp/pretty/pprust.rs b/src/comp/pretty/pprust.rs index 0e30ced131da6..25eb60edd36fb 100644 --- a/src/comp/pretty/pprust.rs +++ b/src/comp/pretty/pprust.rs @@ -718,7 +718,6 @@ impure fn print_maybe_parens(ps s, @ast.expr expr, int outer_prec) { if (add_them) {pclose(s);} } -// TODO non-ascii fn escape_str(str st, char to_escape) -> str { let str out = ""; auto len = _str.byte_len(st);