Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

utf8 basics #284

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,7 @@ TEST_XFAILS_STAGE0 := $(FLOAT_XFAILS) \
use-import-export.rs \
user.rs \
utf8.rs \
utf8_chars.rs \
vec-alloc-append.rs \
vec-append.rs \
vec-slice.rs \
Expand Down
10 changes: 5 additions & 5 deletions src/comp/front/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ impure fn new_reader(io.reader rdr, str filename) -> reader
col += 1u;
}

n = rdr.read_char() as char;
n = rdr.read_char();
}

fn mark() {
Expand Down Expand Up @@ -204,8 +204,8 @@ impure fn new_reader(io.reader rdr, str filename) -> reader
reserved.insert("m128", ()); // IEEE 754-2008 'decimal128'
reserved.insert("dec", ()); // One of m32, m64, m128

ret reader(rdr, filename, rdr.read_char() as char,
rdr.read_char() as char, 1u, 0u, 1u, 0u, keywords, reserved);
ret reader(rdr, filename, rdr.read_char(),
rdr.read_char(), 1u, 0u, 1u, 0u, keywords, reserved);
}


Expand Down Expand Up @@ -505,7 +505,7 @@ impure fn next_token(reader rdr) -> token.token {

if (is_alpha(c) || c == '_') {
while (is_alnum(c) || c == '_') {
_str.push_byte(accum_str, (c as u8));
_str.push_char(accum_str, c);
rdr.bump();
c = rdr.curr();
}
Expand Down Expand Up @@ -692,7 +692,7 @@ impure fn next_token(reader rdr) -> token.token {
}
}
case (_) {
_str.push_byte(accum_str, rdr.curr() as u8);
_str.push_char(accum_str, rdr.curr());
}
}
rdr.bump();
Expand Down
4 changes: 2 additions & 2 deletions src/comp/front/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,9 @@ fn to_str(token t) -> str {
ret "\"" + s + "\"";
}
case (LIT_CHAR(?c)) {
// FIXME: escape and encode.
// FIXME: escape.
auto tmp = "'";
_str.push_byte(tmp, c as u8);
_str.push_char(tmp, c);
_str.push_byte(tmp, '\'' as u8);
ret tmp;
}
Expand Down
1 change: 0 additions & 1 deletion src/comp/pretty/pprust.rs
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,6 @@ impure fn print_maybe_parens(ps s, @ast.expr expr, int outer_prec) {
if (add_them) {pclose(s);}
}

// TODO non-ascii
fn escape_str(str st, char to_escape) -> str {
let str out = "";
auto len = _str.byte_len(st);
Expand Down
167 changes: 164 additions & 3 deletions src/lib/_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ native "rust" mod rustrt {
fn str_from_vec(vec[mutable? u8] b) -> str;
fn str_from_cstr(sbuf cstr) -> str;
fn str_from_buf(sbuf buf, uint len) -> str;
fn str_push_byte(str s, uint byte) -> str;
fn refcount[T](str s) -> uint;
}

Expand Down Expand Up @@ -65,15 +66,42 @@ fn hash(&str s) -> uint {
ret u;
}

// UTF-8 tags and ranges
// The tag_* values are the marker bits OR-ed into an encoded byte, and the
// max_* values are the exclusive upper bounds on the code points that fit in
// each encoded length (see their use in push_utf8_bytes).
// Continuation-byte marker as a u8, for comparing against masked string bytes.
const u8 tag_cont_u8 = 0x80_u8;
// Continuation-byte marker as a uint, for building bytes to push.
const uint tag_cont = 0x80_u;
// Code points below this are pushed as a single byte.
const uint max_one_b = 0x80_u;
// Lead-byte marker of a two-byte sequence.
const uint tag_two_b = 0xc0_u;
// Code points below this fit in two bytes.
const uint max_two_b = 0x800_u;
// Lead-byte marker of a three-byte sequence.
const uint tag_three_b = 0xe0_u;
// Code points below this fit in three bytes.
const uint max_three_b = 0x10000_u;
// Lead-byte marker of a four-byte sequence.
const uint tag_four_b = 0xf0_u;
// Code points below this fit in four bytes.
const uint max_four_b = 0x200000_u;
// Lead-byte marker of a five-byte sequence.
const uint tag_five_b = 0xf8_u;
// Code points below this fit in five bytes; anything larger uses six.
const uint max_five_b = 0x4000000_u;
// Lead-byte marker of a six-byte sequence.
const uint tag_six_b = 0xfc_u;

// Reports whether the byte vector v is structurally well-formed UTF-8: each
// start byte must be followed by as many continuation bytes as
// utf8_char_width announces for it. Only the byte structure is checked; the
// decoded values are not inspected.
fn is_utf8(vec[u8] v) -> bool {
// NOTE(review): this listing is a diff with its +/- markers stripped, and the
// `fail` below looks like the line the change removes. If it were kept,
// nothing after it would run -- confirm against the real file.
fail; // FIXME
auto i = 0u;
auto total = _vec.len[u8](v);
while (i < total) {
auto chsize = utf8_char_width(v.(i));
// A width of 0 means the byte cannot start a sequence.
if (chsize == 0u) {ret false;}
// The announced sequence would run past the end of the vector.
if (i + chsize > total) {ret false;}
i += 1u;
// Every remaining byte of the sequence must carry the continuation marker
// in its top two bits.
while (chsize > 1u) {
if (v.(i) & 0xc0_u8 != tag_cont_u8) {ret false;}
i += 1u;
chsize -= 1u;
}
}
ret true;
}

fn is_ascii(str s) -> bool {
let uint i = byte_len(s);
while (i > 0u) {
i -= 1u;
if ((s.(i) & 0x80u8) != 0u8) {
if ((s.(i) & 0x80_u8) != 0u8) {
ret false;
}
}
Expand Down Expand Up @@ -134,6 +162,139 @@ unsafe fn str_from_buf(sbuf buf, uint len) -> str {
ret rustrt.str_from_buf(buf, len);
}

fn push_utf8_bytes(&mutable str s, char ch) {
auto code = ch as uint;
if (code < max_one_b) {
s = rustrt.str_push_byte(s, code);
} else if (code < max_two_b) {
s = rustrt.str_push_byte(s, ((code >> 6u) & 0x1f_u) | tag_two_b);
s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont);
} else if (code < max_three_b) {
s = rustrt.str_push_byte(s, ((code >> 12u) & 0x0f_u) | tag_three_b);
s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont);
s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont);
} else if (code < max_four_b) {
s = rustrt.str_push_byte(s, ((code >> 18u) & 0x07_u) | tag_four_b);
s = rustrt.str_push_byte(s, ((code >> 12u) & 0x3f_u) | tag_cont);
s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont);
s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont);
} else if (code < max_five_b) {
s = rustrt.str_push_byte(s, ((code >> 24u) & 0x03_u) | tag_five_b);
s = rustrt.str_push_byte(s, ((code >> 18u) & 0x3f_u) | tag_cont);
s = rustrt.str_push_byte(s, ((code >> 12u) & 0x3f_u) | tag_cont);
s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont);
s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont);
} else {
s = rustrt.str_push_byte(s, ((code >> 30u) & 0x01_u) | tag_six_b);
s = rustrt.str_push_byte(s, ((code >> 24u) & 0x3f_u) | tag_cont);
s = rustrt.str_push_byte(s, ((code >> 18u) & 0x3f_u) | tag_cont);
s = rustrt.str_push_byte(s, ((code >> 12u) & 0x3f_u) | tag_cont);
s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont);
s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont);
}
}

fn from_char(char ch) -> str {
auto buf = "";
push_utf8_bytes(buf, ch);
ret buf;
}

fn from_chars(vec[char] chs) -> str {
auto buf = "";
for (char ch in chs) {push_utf8_bytes(buf, ch);}
ret buf;
}

// Returns the length in bytes (1 through 6) of the UTF-8 sequence introduced
// by start byte b, or 0 when b cannot start a sequence. That covers the
// continuation bytes 0x80..0xbf, and 0xfe/0xff, which no encoded length uses
// as a lead byte.
fn utf8_char_width(u8 b) -> uint {
    let uint byte = b as uint;
    if (byte < 0x80_u) {ret 1u;}
    if (byte < 0xc0_u) {ret 0u;} // Continuation byte, not a valid start byte
    if (byte < 0xe0_u) {ret 2u;}
    if (byte < 0xf0_u) {ret 3u;}
    if (byte < 0xf8_u) {ret 4u;}
    if (byte < 0xfc_u) {ret 5u;}
    if (byte < 0xfe_u) {ret 6u;}
    // 0xfe and 0xff: the six-byte lead marker leaves a single payload bit,
    // so the largest possible lead byte is 0xfd.
    ret 0u;
}

// Decodes the UTF-8 sequence that starts at byte index i of s. Returns the
// decoded character together with the index of the first byte after the
// sequence. A check fails if the byte at i cannot start a sequence or if one
// of the following bytes is not a continuation byte.
fn char_range_at(str s, uint i) -> tup(char, uint) {
auto b0 = s.(i);
auto w = utf8_char_width(b0);
check(w != 0u);
// One-byte sequence: the byte itself is the character.
if (w == 1u) {ret tup(b0 as char, i + 1u);}
auto val = 0u;
auto end = i + w;
i += 1u;
// Fold the low six bits of each continuation byte into val, most significant
// group first.
while (i < end) {
auto byte = s.(i);
check(byte & 0xc0_u8 == tag_cont_u8);
val <<= 6u;
val += (byte & 0x3f_u8) as uint;
i += 1u;
}
// Clunky way to get the right bits from the first byte. Uses two shifts,
// the first to clip off the marker bits at the left of the byte, and then
// a second (as uint) to get it to the right position.
// Worked example for w == 2: b0 is shifted left by 3 as a u8, then the
// result is shifted left by (1 * 6 - 2 - 1) = 3 as a uint, which places the
// five payload bits of b0 directly above the six continuation bits.
val += ((b0 << ((w + 1u) as u8)) as uint) << ((w - 1u) * 6u - w - 1u);
ret tup(val as char, i);
}

fn char_at(str s, uint i) -> char {
ret char_range_at(s, i)._0;
}

fn char_len(str s) -> uint {
auto i = 0u;
auto len = 0u;
auto total = byte_len(s);
while (i < total) {
auto chsize = utf8_char_width(s.(i));
check(chsize > 0u);
len += 1u;
i += chsize;
}
check(i == total);
ret len;
}

// Decodes s into a vector of its characters, in order.
fn to_chars(str s) -> vec[char] {
    let vec[char] chars = vec();
    auto total = byte_len(s);
    auto pos = 0u;
    while (pos < total) {
        // Each decode step yields the character and where the next one starts.
        auto decoded = char_range_at(s, pos);
        _vec.push[char](chars, decoded._0);
        pos = decoded._1;
    }
    ret chars;
}

fn push_char(&mutable str s, char ch) {
s += from_char(ch);
}

fn pop_char(&mutable str s) -> char {
auto end = byte_len(s);
while (end > 0u && s.(end - 1u) & 0xc0_u8 == tag_cont_u8) {end -= 1u;}
check(end > 0u);
auto ch = char_at(s, end - 1u);
s = substr(s, 0u, end - 1u);
ret ch;
}

fn shift_char(&mutable str s) -> char {
auto r = char_range_at(s, 0u);
s = substr(s, r._1, byte_len(s) - r._1);
ret r._0;
}

// Inserts the UTF-8 encoding of ch at the front of s.
fn unshift_char(&mutable str s, char ch) {
// Workaround for rustboot order-of-evaluation issue -- if I put s
// directly after the +, the string ends up containing (only) the
// character, twice.
// Do not fold x back into the expression below: the copy taken before the
// concatenation is what keeps the old contents intact.
auto x = s;
s = from_char(ch) + x;
}

fn refcount(str s) -> uint {
auto r = rustrt.refcount[u8](s);
Expand Down Expand Up @@ -256,7 +417,7 @@ fn pop_byte(&mutable str s) -> u8 {
}

// Appends the single byte b to the end of s.
fn push_byte(&mutable str s, u8 b) {
// NOTE(review): this listing is a diff with its +/- markers stripped. The
// first statement looks like the line the change removes and the second like
// its replacement; as written together they would append b twice -- confirm
// against the real file.
s += unsafe_from_byte(b);
s = rustrt.str_push_byte(s, b as uint);
}

fn unshift_byte(&mutable str s, u8 b) {
Expand Down
8 changes: 4 additions & 4 deletions src/lib/ebml.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,18 @@ type reader = rec(

// TODO: eventually use u64 or big here
impure fn read_vint(&io.reader reader) -> uint {
auto a = reader.read_byte();
auto a = reader.read_byte() as u8;
if (a & 0x80u8 != 0u8) { ret (a & 0x7fu8) as uint; }
auto b = reader.read_byte();
auto b = reader.read_byte() as u8;
if (a & 0x40u8 != 0u8) {
ret (((a & 0x3fu8) as uint) << 8u) | (b as uint);
}
auto c = reader.read_byte();
auto c = reader.read_byte() as u8;
if (a & 0x20u8 != 0u8) {
ret (((a & 0x1fu8) as uint) << 16u) | ((b as uint) << 8u) |
(c as uint);
}
auto d = reader.read_byte();
auto d = reader.read_byte() as u8;
if (a & 0x10u8 != 0u8) {
ret (((a & 0x0fu8) as uint) << 24u) | ((b as uint) << 16u) |
((c as uint) << 8u) | (d as uint);
Expand Down
2 changes: 1 addition & 1 deletion src/lib/fs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ native "rust" mod rustrt {
}

// Returns the platform path separator, os_fs.path_sep, as a string.
fn path_sep() -> str {
// NOTE(review): this listing is a diff with its +/- markers stripped. The
// first `ret` looks like the line the change removes and the second like its
// replacement; as written together the second is never reached -- confirm
// against the real file.
ret _str.unsafe_from_bytes(vec(os_fs.path_sep as u8));
ret _str.from_char(os_fs.path_sep);
}

type path = str;
Expand Down
Loading