Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add safety comments #100

Merged
merged 1 commit into from
May 7, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 26 additions & 3 deletions src/normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ where
}

#[inline]
#[allow(unsafe_code)]
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
where
D: Fn(char) -> Option<&'static [char]>,
Expand All @@ -84,7 +85,10 @@ where

// Perform decomposition for Hangul
if is_hangul_syllable(c) {
decompose_hangul(c, emit_char);
// Safety: Hangul Syllables invariant checked by is_hangul_syllable above
unsafe {
decompose_hangul(c, emit_char);
}
return;
}

Expand Down Expand Up @@ -127,27 +131,37 @@ const T_LAST: u32 = T_BASE + T_COUNT - 1;
// i.e. `T_BASE + 1 ..= T_LAST`.
const T_FIRST: u32 = T_BASE + 1;

// Safety-usable invariant: a `true` result guarantees that `c` is a precomposed Hangul
// Syllable, i.e. its scalar value lies in the half-open range `S_BASE..S_BASE + S_COUNT`
// (0xAC00 up to, but not including, 0xD7A4), which is inside the Hangul Syllables block.
pub(crate) fn is_hangul_syllable(c: char) -> bool {
    // Safety: unsafe code relies on this test being exactly the half-open range
    // S_BASE (0xAC00) .. S_BASE + S_COUNT (0xD7A4); do not widen it without re-auditing callers.
    let code_point = c as u32;
    (S_BASE..S_BASE + S_COUNT).contains(&code_point)
}

// Decompose a precomposed Hangul syllable
#[allow(unsafe_code)]
// Safety: `s` MUST be a valid Hangul Syllable character, between U+AC00..U+D7AF
#[allow(unsafe_code, unused_unsafe)]
#[inline(always)]
fn decompose_hangul<F>(s: char, mut emit_char: F)
unsafe fn decompose_hangul<F>(s: char, mut emit_char: F)
where
F: FnMut(char),
{
// This will be at most 0x2baf, the size of the Hangul Syllables block
let s_index = s as u32 - S_BASE;
// This will be at most 0x2baf / (21 * 28), 19
let l_index = s_index / N_COUNT;
unsafe {
// Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
emit_char(char::from_u32_unchecked(L_BASE + l_index));

// Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21
let v_index = (s_index % N_COUNT) / T_COUNT;
// Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
emit_char(char::from_u32_unchecked(V_BASE + v_index));

// Safety: This will be at most T_COUNT - 1 (27)
let t_index = s_index % T_COUNT;
if t_index > 0 {
// Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
emit_char(char::from_u32_unchecked(T_BASE + t_index));
}
}
Expand All @@ -173,14 +187,23 @@ fn compose_hangul(a: char, b: char) -> Option<char> {
match (a, b) {
// Compose a leading consonant and a vowel together into an LV_Syllable
(L_BASE..=L_LAST, V_BASE..=V_LAST) => {
// Safety: based on the above bounds, l_index will be less than L_COUNT (19), i.e. at most 18,
// and v_index will be less than V_COUNT (21), i.e. at most 20 (assuming L_LAST and V_LAST are
// defined like T_LAST, as BASE + COUNT - 1)
let l_index = a - L_BASE;
let v_index = b - V_BASE;
// Safety: This will be <= 18 * (21 * 28) + (20 * 28), which is 11144.
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
// Safety: This is between 0xAC00 and 0xD788, which are in range for Hangul Syllables (U+AC00..U+D7AF) and also in range
// for BMP unicode
let s = S_BASE + lv_index;
// Safety: We've verified this is in-range
Some(unsafe { char::from_u32_unchecked(s) })
}
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
(S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
// Safety: a is between 0xAC00 and (0xAC00 + 19 * 21 * 28), i.e. below 0xD7A4. b - T_BASE is between 1 and 27
// (b is matched against T_FIRST..=T_LAST, which is T_BASE + 1 ..= T_BASE + T_COUNT - 1).
// Adding a number 1 to 27 to a number that is less than 0xD7A4 will not go out of bounds to 0xD800 (where the
// surrogates start), so this is safe.
Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
}
_ => None,
Expand Down
Loading