Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse Strtab only once on Creation #275

Merged
merged 3 commits into from
Jul 2, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/archive/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ impl<'a> Index<'a> {
Ok(Index {
size: sizeof_table,
symbol_indexes: indexes,
strtab: strtab.to_vec()?, // because i'm lazy
strtab: strtab.to_vec(), // because i'm lazy
})
}

Expand Down Expand Up @@ -282,8 +282,8 @@ impl<'a> Index<'a> {
let string_offset: u32 = buffer.pread_with(i * 8 + 4, scroll::LE)?;
let archive_member: u32 = buffer.pread_with(i * 8 + 8, scroll::LE)?;

let string = match strtab.get(string_offset as usize) {
Some(result) => result,
let string = match strtab.get_at(string_offset as usize) {
Some(result) => Ok(result),
None => Err(Error::Malformed(format!(
"{} entry {} has string offset {}, which is out of bounds",
BSD_SYMDEF_NAME, i, string_offset
Expand Down Expand Up @@ -325,7 +325,7 @@ impl<'a> Index<'a> {
Ok(Index {
size: symbols,
symbol_indexes: symbol_offsets,
strtab: strtab.to_vec()?,
strtab: strtab.to_vec(),
})
}
}
Expand Down Expand Up @@ -353,8 +353,8 @@ impl<'a> NameIndex<'a> {
let idx = name.trim_start_matches('/').trim_end();
match usize::from_str_radix(idx, 10) {
Ok(idx) => {
let name = match self.strtab.get(idx + 1) {
Some(result) => result,
let name = match self.strtab.get_at(idx + 1) {
Some(result) => Ok(result),
None => Err(Error::Malformed(format!(
"Name {} is out of range in archive NameIndex",
name
Expand Down
2 changes: 1 addition & 1 deletion src/elf/dynamic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ if_alloc! {
let mut needed = Vec::with_capacity(count);
for dynamic in &self.dyns {
if dynamic.d_tag as u64 == DT_NEEDED {
if let Some(Ok(lib)) = strtab.get(dynamic.d_val as usize) {
if let Some(lib) = strtab.get_at(dynamic.d_val as usize) {
needed.push(lib)
} else {
warn!("Invalid DT_NEEDED {}", dynamic.d_val)
Expand Down
6 changes: 2 additions & 4 deletions src/elf/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,7 @@ if_sylvan! {
continue;
}

if section_name.is_some() && !self.shdr_strtab
.get(sect.sh_name)
.map_or(false, |r| r.ok() == section_name) {
if section_name.is_some() && self.shdr_strtab.get_at(sect.sh_name) != section_name {
continue;
}

Expand Down Expand Up @@ -298,7 +296,7 @@ if_sylvan! {

if dyn_info.soname != 0 {
// FIXME: warn! here
soname = match dynstrtab.get(dyn_info.soname) { Some(Ok(soname)) => Some(soname), _ => None };
soname = dynstrtab.get_at(dyn_info.soname);
}
if dyn_info.needed_count > 0 {
libraries = dynamic.get_libraries(&dynstrtab);
Expand Down
7 changes: 2 additions & 5 deletions src/pe/symbol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,11 +218,8 @@ impl Symbol {
/// a strtab entry.
pub fn name<'a>(&'a self, strtab: &'a strtab::Strtab) -> error::Result<&'a str> {
if let Some(offset) = self.name_offset() {
strtab.get(offset as usize).unwrap_or_else(|| {
Err(error::Error::Malformed(format!(
"Invalid Symbol name offset {:#x}",
offset
)))
strtab.get_at(offset as usize).ok_or_else(|| {
error::Error::Malformed(format!("Invalid Symbol name offset {:#x}", offset))
})
} else {
Ok(self.name.pread(0)?)
Expand Down
167 changes: 108 additions & 59 deletions src/strtab.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

use core::fmt;
use core::ops::Index;
use core::slice;
use core::str;
use scroll::{ctx, Pread};
if_alloc! {
Expand All @@ -15,8 +14,10 @@ if_alloc! {
/// member index). Constructed using [`parse`](#method.parse)
/// with your choice of delimiter. Please be careful.
pub struct Strtab<'a> {
bytes: &'a [u8],
delim: ctx::StrCtx,
bytes: &'a [u8],
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: layout here didn't need to change (bytes after delim in field declarations), this and some of the functions moving around did make this PR a bit hard to review, just fyi

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was for consistency with the constructors (see line 30 of the prev version).

#[cfg(feature = "alloc")]
strings: Vec<(usize, &'a str)>,
}

#[inline(always)]
Expand All @@ -25,31 +26,31 @@ fn get_str(offset: usize, bytes: &[u8], delim: ctx::StrCtx) -> scroll::Result<&s
}

impl<'a> Strtab<'a> {
/// Construct a new strtab with `bytes` as the backing string table, using `delim` as the delimiter between entries
pub fn new(bytes: &'a [u8], delim: u8) -> Self {
Strtab {
/// Creates a `Strtab` directly without bounds check and without parsing it.
pub fn from_slice_unsafe(bytes: &'a [u8], offset: usize, len: usize, delim: u8) -> Self {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

e.g., this function here makes a lot of diff noise; also does this function need to be pub? one can construct a strtable now, but the vec isn't filled, which means any get_at calls would fail, which feels like the strtab is "half constructed"; also it is better to not introduce any new pub apis unless necessary, since once they're pub, we can't take them back

Self {
delim: ctx::StrCtx::Delimiter(delim),
bytes,
bytes: &bytes[offset..offset + len],
#[cfg(feature = "alloc")]
strings: Vec::new(),
}
}
/// Construct a strtab from a `ptr`, and a `size`, using `delim` as the delimiter
/// # Safety
/// Gets a str reference from the backing bytes starting at byte `offset`.
///
/// This function creates a `Strtab` directly from a raw pointer and size
pub unsafe fn from_raw(ptr: *const u8, size: usize, delim: u8) -> Strtab<'a> {
Lichtso marked this conversation as resolved.
Show resolved Hide resolved
Strtab {
delim: ctx::StrCtx::Delimiter(delim),
bytes: slice::from_raw_parts(ptr, size),
/// If the index is out of bounds, `None` is returned. Panics if bytes are invalid UTF-8.
/// Use this method if the `Strtab` was created using `from_slice_unsafe()`.
pub fn get_unsafe(&self, offset: usize) -> Option<&'a str> {
if offset >= self.bytes.len() {
None
} else {
Some(get_str(offset, self.bytes, self.delim).unwrap())
}
}
#[cfg(feature = "alloc")]
/// Parses a strtab from `bytes` at `offset` with `len` size as the backing string table, using `delim` as the delimiter
pub fn parse(
bytes: &'a [u8],
offset: usize,
len: usize,
delim: u8,
) -> error::Result<Strtab<'a>> {
/// Parses a `Strtab` from `bytes` at `offset` with `len` size as the backing string table, using `delim` as the delimiter.
///
/// Errors if bytes are invalid UTF-8.
pub fn parse(bytes: &'a [u8], offset: usize, len: usize, delim: u8) -> error::Result<Self> {
let (end, overflow) = offset.overflowing_add(len);
if overflow || end > bytes.len() {
return Err(error::Error::Malformed(format!(
Expand All @@ -60,44 +61,65 @@ impl<'a> Strtab<'a> {
overflow
)));
}
Ok(Strtab {
bytes: &bytes[offset..end],
delim: ctx::StrCtx::Delimiter(delim),
})
}
#[cfg(feature = "alloc")]
/// Converts the string table to a vector, with the original `delim` used to separate the strings
pub fn to_vec(&self) -> error::Result<Vec<&'a str>> {
let len = self.bytes.len();
let mut strings = Vec::with_capacity(len);
let mut result = Self::from_slice_unsafe(bytes, offset, len, delim);
let mut i = 0;
while i < len {
let string = self.get(i).unwrap()?;
i = i + string.len() + 1;
strings.push(string);
let string = get_str(i, result.bytes, result.delim)?;
result.strings.push((i, string));
i += string.len() + 1;
}
Ok(strings)
Ok(result)
}
#[cfg(feature = "alloc")]
/// Parses a `Strtab` with `bytes` as the backing string table, using `delim` as the delimiter between entries.
pub fn new(bytes: &'a [u8], delim: u8) -> error::Result<Self> {
Self::parse(bytes, 0, bytes.len(), delim)
}
/// Safely parses and gets a str reference from the backing bytes starting at byte `offset`.
#[cfg(feature = "alloc")]
/// Converts the string table to a vector of parsed strings.
pub fn to_vec(&self) -> Vec<&'a str> {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unfortunately this is still a breaking change, since return signature changed. i'm 50/50 on fence whether it's worth it (can just Ok wrap at end and it's non-breaking, but maybe it's fine to break?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ironically, I was on the fence of breaking it even further by including the starting offset of each entry in the result.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok so I've thought about this, and I think we should return Result here, so that we don't have a breaking change, and if we switch to using the enum, it will work without changes; if we get that done i'm ok with merging this and making a minor release asap.

As a bonus, we can perhaps construct the vector on the fly if it's empty (i.e., it was constructed via the unsafe method), but that is probably a niche usecase.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both done, reverted to use the Result return type and implemented a fallback.

self.strings.iter().map(|&(_key, value)| value).collect()
m4b marked this conversation as resolved.
Show resolved Hide resolved
}
#[cfg(feature = "alloc")]
/// Safely gets a str reference from the parsed table starting at byte `offset`.
///
/// If the index is out of bounds, `None` is returned.
/// Requires `feature = "alloc"`
pub fn get_at(&self, offset: usize) -> Option<&'a str> {
match self
.strings
.binary_search_by_key(&offset, |&(key, _value)| key)
{
Ok(index) => Some(self.strings[index].1),
Err(index) => {
if index == 0 {
return None;
}
let (string_begin_offset, entire_string) = self.strings[index - 1];
entire_string.get(offset - string_begin_offset..)
}
}
}
#[deprecated(since = "0.4.2", note = "Use from_slice_unsafe() instead")]
/// Construct a strtab from a `ptr`, and a `size`, using `delim` as the delimiter
///
/// This function creates a `Strtab` directly from a raw pointer and size
pub unsafe fn from_raw(ptr: *const u8, len: usize, delim: u8) -> Strtab<'a> {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah i see now why from_slice_unsafe is pub; so i'm not sure this should be deprecated? i used this in the past, e.g., to construct strtables for a dynamic linker, which doesn't want to allocate (since it's already mapped into memory). i'm not sure what others are doing, but it would force someone to first call from raw_parts on their pointer and len as you do here.

alternatively, it would be feasible to have from_slice_unsafe just get the ptr + len and pass it to this. however, it's probably better to get rid of / deprecate these unsafe apis, since it doesn't add much (the user can just call the unsafe from_raw_parts themselves in their own unsafe context, so we push it onto them, which i think is fine).

so i guess what i'm saying is maybe it might be good to either:

  1. delete this
  2. deprecate as you do with warning it will be removed, then delete next release

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The deprecated marker is a few lines above (the comment) in 103.

Self::from_slice_unsafe(core::slice::from_raw_parts(ptr, len), 0, len, delim)
}
#[deprecated(since = "0.4.2", note = "Bad performance, use get_at() instead")]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

awesome

#[cfg(feature = "alloc")]
/// Parses a str reference from the parsed table starting at byte `offset`.
///
/// If the index is out of bounds, `None` is returned.
/// Requires `feature = "alloc"`
pub fn get(&self, offset: usize) -> Option<error::Result<&'a str>> {
if offset >= self.bytes.len() {
None
} else {
Some(get_str(offset, self.bytes, self.delim).map_err(core::convert::Into::into))
}
}
/// Gets a str reference from the backing bytes starting at byte `offset`.
/// If the index is out of bounds, `None` is returned. Panics if bytes are invalid UTF-8.
pub fn get_unsafe(&self, offset: usize) -> Option<&'a str> {
if offset >= self.bytes.len() {
None
} else {
Some(get_str(offset, self.bytes, self.delim).unwrap())
}
}
}

impl<'a> fmt::Debug for Strtab<'a> {
Expand All @@ -110,10 +132,12 @@ impl<'a> fmt::Debug for Strtab<'a> {
}

impl<'a> Default for Strtab<'a> {
fn default() -> Strtab<'a> {
Strtab {
bytes: &[],
fn default() -> Self {
Self {
delim: ctx::StrCtx::default(),
bytes: &[],
#[cfg(feature = "alloc")]
strings: Vec::new(),
}
}
}
Expand All @@ -133,36 +157,61 @@ impl<'a> Index<usize> for Strtab<'a> {

#[test]
fn as_vec_no_final_null() {
let bytes = b"\0printf\0memmove\0busta";
let strtab = unsafe { Strtab::from_raw(bytes.as_ptr(), bytes.len(), 0x0) };
m4b marked this conversation as resolved.
Show resolved Hide resolved
let vec = strtab.to_vec().unwrap();
let strtab = Strtab::new(b"\0printf\0memmove\0busta", 0x0).unwrap();
let vec = strtab.to_vec();
assert_eq!(vec.len(), 4);
assert_eq!(vec, vec!["", "printf", "memmove", "busta"]);
}

#[test]
fn as_vec_no_first_null_no_final_null() {
let bytes = b"printf\0memmove\0busta";
let strtab = unsafe { Strtab::from_raw(bytes.as_ptr(), bytes.len(), 0x0) };
let vec = strtab.to_vec().unwrap();
let strtab = Strtab::new(b"printf\0memmove\0busta", 0x0).unwrap();
let vec = strtab.to_vec();
assert_eq!(vec.len(), 3);
assert_eq!(vec, vec!["printf", "memmove", "busta"]);
}

#[test]
fn to_vec_final_null() {
let bytes = b"\0printf\0memmove\0busta\0";
let strtab = unsafe { Strtab::from_raw(bytes.as_ptr(), bytes.len(), 0x0) };
let vec = strtab.to_vec().unwrap();
let strtab = Strtab::new(b"\0printf\0memmove\0busta\0", 0x0).unwrap();
let vec = strtab.to_vec();
assert_eq!(vec.len(), 4);
assert_eq!(vec, vec!["", "printf", "memmove", "busta"]);
}

#[test]
fn to_vec_newline_delim() {
let bytes = b"\nprintf\nmemmove\nbusta\n";
let strtab = unsafe { Strtab::from_raw(bytes.as_ptr(), bytes.len(), b'\n') };
let vec = strtab.to_vec().unwrap();
let strtab = Strtab::new(b"\nprintf\nmemmove\nbusta\n", b'\n').unwrap();
let vec = strtab.to_vec();
assert_eq!(vec.len(), 4);
assert_eq!(vec, vec!["", "printf", "memmove", "busta"]);
}

#[test]
fn parse_utf8() {
assert!(match Strtab::new(&[0x80, 0x80], b'\n') {
Err(error::Error::Scroll(scroll::Error::BadInput {
size: 2,
msg: "invalid utf8",
})) => true,
_ => false,
});
assert!(match Strtab::new(&[0xC6, 0x92, 0x6F, 0x6F], b'\n') {
Ok(_) => true,
_ => false,
});
}

#[test]
fn get_at_utf8() {
let strtab = Strtab::new("\nƒoo\nmemmove\n🅱️usta\n".as_bytes(), b'\n').unwrap();
assert_eq!(strtab.get_at(0), Some(""));
assert_eq!(strtab.get_at(5), Some(""));
assert_eq!(strtab.get_at(6), Some("memmove"));
assert_eq!(strtab.get_at(14), Some("\u{1f171}\u{fe0f}usta"));
assert_eq!(strtab.get_at(16), None);
assert_eq!(strtab.get_at(18), Some("\u{fe0f}usta"));
assert_eq!(strtab.get_at(21), Some("usta"));
assert_eq!(strtab.get_at(25), Some(""));
assert_eq!(strtab.get_at(26), None);
}
2 changes: 1 addition & 1 deletion tests/elf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ fn parse_text_section_size_lazy(base: &[u8]) -> Result<u64, &'static str> {
)
.map_err(|_| "parse string table error")?;
for (_, section) in obj.section_headers.iter().enumerate() {
let section_name = strtab.get(section.sh_name).unwrap().unwrap();
let section_name = strtab.get_at(section.sh_name).unwrap();
if section_name == ".text" {
return Ok(section.sh_size);
}
Expand Down