Skip to content

Commit

Permalink
Auto merge of rust-lang#70499 - Dylan-DPC:rollup-f9je1l8, r=Dylan-DPC
Browse files Browse the repository at this point in the history
Rollup of 5 pull requests

Successful merges:

 - rust-lang#70418 (Add long error explanation for E0703)
 - rust-lang#70448 (Create output dir in rustdoc markdown render)
 - rust-lang#70486 (Shrink Unicode tables (even more))
 - rust-lang#70493 (Fix rustdoc.css CSS tab-size property)
 - rust-lang#70495 (Replace last mention of IRC with Discord)

Failed merges:

r? @ghost
  • Loading branch information
bors committed Mar 28, 2020
2 parents e768d6f + e3ccd5b commit c52cee1
Show file tree
Hide file tree
Showing 14 changed files with 1,183 additions and 657 deletions.
25 changes: 0 additions & 25 deletions src/libcore/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,28 +32,3 @@ pub use unicode_data::lowercase::lookup as Lowercase;
pub use unicode_data::n::lookup as N;
pub use unicode_data::uppercase::lookup as Uppercase;
pub use unicode_data::white_space::lookup as White_Space;

/// Tests whether `needle` is a member of a set stored as a two-level,
/// deduplicated bitset.
///
/// The lookup goes through three tables:
/// * `chunk_idx_map` maps each group of 16 words (1024 values) to a chunk
///   number; the `(last_chunk_idx, last_chunk_mapping)` pair supplies one
///   extra mapping for a single group index beyond the end of that table.
/// * `bitset_chunk_idx` maps a chunk number to the 16 word indices it holds.
/// * `bitset` holds the actual 64-bit words.
///
/// Returns `false` for any needle whose group is covered by neither
/// `chunk_idx_map` nor the extra trailing mapping.
#[inline(always)]
fn range_search<const N: usize, const N1: usize, const N2: usize>(
    needle: u32,
    chunk_idx_map: &[u8; N],
    (last_chunk_idx, last_chunk_mapping): (u16, u8),
    bitset_chunk_idx: &[[u8; 16]; N1],
    bitset: &[u64; N2],
) -> bool {
    // Which 64-bit word the needle lives in, and where that word sits in its
    // 16-word chunk.
    let word_pos = (needle / 64) as usize;
    let map_slot = word_pos / 16;
    let piece = word_pos % 16;

    let chunk = if map_slot < N {
        chunk_idx_map[map_slot]
    } else if map_slot == last_chunk_idx as usize {
        last_chunk_mapping
    } else {
        return false;
    };

    let word_idx = bitset_chunk_idx[chunk as usize][piece] as usize;
    let bit = needle % 64;
    ((bitset[word_idx] >> bit) & 1) != 0
}
957 changes: 443 additions & 514 deletions src/libcore/unicode/unicode_data.rs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/librustc_error_codes/error_codes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ E0698: include_str!("./error_codes/E0698.md"),
E0699: include_str!("./error_codes/E0699.md"),
E0700: include_str!("./error_codes/E0700.md"),
E0701: include_str!("./error_codes/E0701.md"),
E0703: include_str!("./error_codes/E0703.md"),
E0704: include_str!("./error_codes/E0704.md"),
E0705: include_str!("./error_codes/E0705.md"),
E0706: include_str!("./error_codes/E0706.md"),
Expand Down Expand Up @@ -603,7 +604,6 @@ E0751: include_str!("./error_codes/E0751.md"),
// E0694, // an unknown tool name found in scoped attributes
E0696, // `continue` pointing to a labeled block
// E0702, // replaced with a generic attribute input check
E0703, // invalid ABI
// E0707, // multiple elided lifetimes used in arguments of `async fn`
E0708, // `async` non-`move` closures with parameters are not currently
// supported
Expand Down
17 changes: 17 additions & 0 deletions src/librustc_error_codes/error_codes/E0703.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Invalid ABI (Application Binary Interface) used in the code.

Erroneous code example:

```compile_fail,E0703
extern "invalid" fn foo() {} // error!
# fn main() {}
```

At present, only a few predefined ABIs (like Rust, C, system, etc.) can be
used in Rust. Verify that the ABI is one of the predefined ones. For example,
you can replace the given ABI with 'Rust'.

```
extern "Rust" fn foo() {} // ok!
# fn main() { }
```
4 changes: 2 additions & 2 deletions src/librustdoc/html/static/rustdoc.css
Original file line number Diff line number Diff line change
Expand Up @@ -1082,8 +1082,8 @@ h3 > .collapse-toggle, h4 > .collapse-toggle {

pre.rust {
	position: relative;
	/* `tab-size` is the CSS property that sets the rendered width of tab
	   characters; `tab-width` is not a CSS property and was ignored. */
	tab-size: 4;
	-moz-tab-size: 4;
}

.search-failed {
Expand Down
7 changes: 6 additions & 1 deletion src/librustdoc/markdown.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::fs::File;
use std::fs::{create_dir_all, File};
use std::io::prelude::*;
use std::path::PathBuf;

Expand Down Expand Up @@ -40,6 +40,11 @@ pub fn render(
diag: &rustc_errors::Handler,
edition: Edition,
) -> i32 {
if let Err(e) = create_dir_all(&options.output) {
diag.struct_err(&format!("{}: {}", options.output.display(), e)).emit();
return 4;
}

let mut output = options.output;
output.push(input.file_name().unwrap());
output.set_extension("html");
Expand Down
4 changes: 3 additions & 1 deletion src/libstd/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@
//! pull-requests for your suggested changes.
//!
//! Contributions are appreciated! If you see a part of the docs that can be
//! improved, submit a PR, or chat with us first on irc.mozilla.org #rust-docs.
//! improved, submit a PR, or chat with us first on [Discord][rust-discord]
//! #docs.
//!
//! # A Tour of The Rust Standard Library
//!
Expand Down Expand Up @@ -194,6 +195,7 @@
//! [multithreading]: thread/index.html
//! [other]: #what-is-in-the-standard-library-documentation
//! [primitive types]: ../book/ch03-02-data-types.html
//! [rust-discord]: https://discord.gg/rust-lang
#![stable(feature = "rust1", since = "1.0.0")]
#![doc(
Expand Down
1 change: 1 addition & 0 deletions src/test/ui/codemap_tests/unicode.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ LL | extern "路濫狼á́́" fn foo() {}

error: aborting due to previous error

For more information about this error, try `rustc --explain E0703`.
1 change: 1 addition & 0 deletions src/test/ui/parser/issue-8537.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ LL | "invalid-ab_isize"

error: aborting due to previous error

For more information about this error, try `rustc --explain E0703`.
186 changes: 182 additions & 4 deletions src/tools/unicode-table-generator/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,83 @@
//! This implements the core logic of the compression scheme used to compactly
//! encode Unicode properties.
//!
//! We have two primary goals with the encoding: we want to be compact, because
//! these tables often end up in ~every Rust program (especially the
//! grapheme_extend table, used for str debugging), including those for embedded
//! targets (where space is important). We also want to be relatively fast,
//! though this is more of a nice to have rather than a key design constraint.
//! It is expected that libraries/applications which are performance-sensitive
//! to Unicode property lookups are extremely rare, and those that care may find
//! the tradeoff of the raw bitsets worth it. For most applications, a
//! relatively fast but much smaller (and as such less cache-impacting, etc.)
//! data set is likely preferable.
//!
//! We have two separate encoding schemes: a skiplist-like approach, and a
//! compressed bitset. The datasets we consider mostly use the skiplist (it's
//! smaller) but the lowercase and uppercase sets are sufficiently sparse for
//! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
//! Since the bitset is also faster, this seems an obvious choice. (As a
//! historical note, the bitset was also the prior implementation, so its
//! relative complexity had already been paid).
//!
//! ## The bitset
//!
//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
//! roughly 136 kilobytes of data per character set (0x110000 bits, i.e. about
//! 17 thousand 64-bit words) -- way too much for our purposes.
//!
//! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
//! not skipping the small 'gap') is grouped into words (u64) and
//! deduplicated. On random data, this would be useless; on our data, this is
//! incredibly beneficial -- our data sets have (far) less than 256 unique
//! words.
//!
//! This gives us an array that maps `u8 -> word`; the current algorithm does
//! not handle the case of more than 256 unique words, but we are relatively far
//! from coming that close.
//!
//! With that scheme, we now have a single byte for every 64 codepoints.
//!
//! We further chunk these by some constant N (between 1 and 64 per group,
//! dynamically chosen for smallest size), and again deduplicate and store in an
//! array (u8 -> [u8; N]).
//!
//! The bytes of this array map into the words from the bitset above, but we
//! apply another trick here: some of these words are similar enough that they
//! can be represented by some function of another word. The particular
//! functions chosen are rotation, inversion, and shifting (right).
//!
//! ## The skiplist
//!
//! The skip list arose out of the desire for an even smaller encoding than the
//! bitset -- and was the answer to the question "what is the smallest
//! representation we can imagine?". However, it is not necessarily the
//! smallest, and if you have a better proposal, please do suggest it!
//!
//! This is a relatively straightforward encoding. First, we break up all the
//! ranges in the input data into offsets from each other, essentially a gap
//! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
//! store those directly. We make use of the larger gaps (which are nicely
//! interspersed already) throughout the dataset to index this data set.
//!
//! In particular, each run of small gaps (terminating in a large gap) is
//! indexed in a separate dataset. That data set stores an index into the
//! primary offset list and a prefix sum of that offset list. These are packed
//! into a single u32 (11 bits for the offset, 21 bits for the prefix sum).
//!
//! Lookup proceeds via a binary search in the index and then a straightforward
//! linear scan (adding up the offsets) until we reach the needle, and then the
//! index of that offset is utilized as the answer to whether we're in the set
//! or not.
use std::collections::{BTreeMap, HashMap};
use std::ops::Range;
use ucd_parse::Codepoints;

mod case_mapping;
mod raw_emitter;
mod skiplist;
mod unicode_download;

use raw_emitter::{emit_codepoints, RawEmitter};
Expand Down Expand Up @@ -152,9 +226,17 @@ fn main() {
std::process::exit(1);
});

// Optional test path, which is a Rust source file testing that the unicode
// property lookups are correct.
let test_path = std::env::args().nth(2);

let unicode_data = load_data();
let ranges_by_property = &unicode_data.ranges;

if let Some(path) = test_path {
std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
}

let mut total_bytes = 0;
let mut modules = Vec::new();
for (property, ranges) in ranges_by_property {
Expand All @@ -163,7 +245,16 @@ fn main() {
emit_codepoints(&mut emitter, &ranges);

modules.push((property.to_lowercase().to_string(), emitter.file));
println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,);
println!(
"{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}",
property,
emitter.bytes_used,
datapoints,
ranges.len(),
ranges.first().unwrap().start,
ranges.last().unwrap().end,
emitter.desc,
);
total_bytes += emitter.bytes_used;
}

Expand All @@ -173,7 +264,10 @@ fn main() {
"///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
);

table_file.push_str("use super::range_search;\n\n");
// Include the range search function
table_file.push('\n');
table_file.push_str(include_str!("range_search.rs"));
table_file.push('\n');

table_file.push_str(&version());

Expand Down Expand Up @@ -236,26 +330,110 @@ fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
out
}

/// Builds the source text of a standalone Rust program that checks the
/// generated lookup tables against the raw range data.
///
/// `data_path` is emitted as the `#[path]` of the `unicode_data` module the
/// test program includes. For every `(property, ranges)` pair the program
/// gets a `<property>_true` and a `<property>_false` function (names are
/// lowercased), both called from `main`, asserting respectively that each
/// valid `char` inside `ranges` is reported as a member and every other valid
/// `char` is not.
fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
    let mut s = String::new();
    s.push_str("#![allow(incomplete_features, unused)]\n");
    s.push_str("#![feature(const_generics)]\n\n");
    s.push_str("\n#[allow(unused)]\nuse std::hint;\n");
    s.push_str(&format!("#[path = \"{}\"]\n", data_path));
    s.push_str("mod unicode_data;\n\n");

    s.push_str("\nfn main() {\n");

    for (property, ranges) in ranges {
        // The generated module and function names are the lowercased property.
        let module = property.to_lowercase();

        s.push_str(&format!(r#" println!("Testing {}");"#, property));
        s.push('\n');
        s.push_str(&format!(" {}_true();\n", module));
        s.push_str(&format!(" {}_false();\n", module));

        let mut is_true = Vec::new();
        let mut is_false = Vec::new();
        // The range is inclusive so that `char::MAX` (U+10FFFF) itself is
        // covered by the generated assertions.
        for ch_num in 0..=(std::char::MAX as u32) {
            // Surrogates are not valid `char`s and cannot be looked up.
            if std::char::from_u32(ch_num).is_none() {
                continue;
            }
            if ranges.iter().any(|r| r.contains(&ch_num)) {
                is_true.push(ch_num);
            } else {
                is_false.push(ch_num);
            }
        }

        s.push_str(&format!(" fn {}_true() {{\n", module));
        generate_asserts(&mut s, property, &is_true, true);
        s.push_str(" }\n\n");
        s.push_str(&format!(" fn {}_false() {{\n", module));
        generate_asserts(&mut s, property, &is_false, false);
        s.push_str(" }\n\n");
    }

    s.push('}');
    s
}

/// Appends Rust source for `assert!` statements covering `points` to `s`.
///
/// The code points are first collapsed into contiguous ranges. A range
/// holding a single code point becomes one assertion on a `char` literal
/// (with the numeric value as the failure message); any longer range becomes
/// a `for` loop asserting every value in it. When `truthy` is `false` each
/// lookup is negated with `!`. The lookup function called is
/// `unicode_data::<lowercased property>::lookup`.
fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
    let negation = if truthy { "" } else { "!" };
    let module = property.to_lowercase();

    for range in ranges_from_set(points) {
        let is_single = range.end == range.start + 1;
        if !is_single {
            s.push_str(&format!(" for chn in {:?}u32 {{\n", range));
            s.push_str(&format!(
                " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
                negation, module,
            ));
            s.push_str(" }\n");
            continue;
        }

        let ch = std::char::from_u32(range.start).unwrap();
        s.push_str(&format!(
            " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n",
            negation, module, ch, range.start,
        ));
    }
}

/// Turns a list of code points into ranges: every point becomes the
/// one-element range `p..p + 1`, and the resulting list is then handed to
/// `merge_ranges` to be coalesced.
fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
    let mut singletons: Vec<Range<u32>> = Vec::with_capacity(set.len());
    for &point in set {
        singletons.push(point..(point + 1));
    }
    merge_ranges(&mut singletons);
    singletons
}

/// Coalesces touching ranges in place.
///
/// Two neighbouring ranges are joined whenever the first one's `end` equals
/// the second one's `start`, so `[0..1, 1..2, 4..5]` becomes `[0..2, 4..5]`.
/// Chains of any length are collapsed in a single pass. An empty vector is
/// left untouched.
///
/// # Panics
///
/// Panics if, after merging, some range does not start strictly after the
/// `end` of the range before it (for example when the input is unsorted or
/// contains overlapping ranges).
fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    let mut merged: Vec<Range<u32>> = Vec::with_capacity(ranges.len());
    for range in ranges.drain(..) {
        if let Some(prev) = merged.last_mut() {
            if prev.end == range.start {
                // Extend the previous range instead of starting a new one.
                prev.end = range.end;
                continue;
            }
        }
        merged.push(range);
    }
    *ranges = merged;

    // Sanity check: the result must be strictly increasing with a gap
    // between consecutive ranges.
    let mut last_end = None;
    for range in ranges.iter() {
        if let Some(last) = last_end {
            assert!(range.start > last, "{:?}", range);
        }
        last_end = Some(range.end);
    }
}
Loading

0 comments on commit c52cee1

Please sign in to comment.