Skip to content

Commit

Permalink
Hold interned strings by reference, not by ID
Browse files Browse the repository at this point in the history
Rather than having a `StringStorage` class which exposes a custom
identifier used as a token for resolving an interned string, expose an
`InternedString` type directly constructible from a `std::string`. This
type handles the interning automatically, and using this type in
signatures allows us to encode whether a particular function needs to be
called with an interned string or could be called with any string.

Signed-off-by: Matt Wozniski <mwozniski@bloomberg.net>
  • Loading branch information
godlygeek committed Oct 2, 2023
1 parent 98032ae commit b08a977
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 104 deletions.
117 changes: 51 additions & 66 deletions src/memray/_memray/native_resolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,53 +14,46 @@ static const logLevel RESOLVE_LIB_LOG_LEVEL = DEBUG;
static const logLevel RESOLVE_LIB_LOG_LEVEL = WARNING;
#endif

StringStorage::StringStorage()
std::unordered_set<std::string> InternedString::s_interned_data = []() {
std::unordered_set<std::string> ret;
ret.reserve(4096);
return ret;
}();

std::mutex InternedString::s_mutex;

InternedString::InternedString(const std::string& orig)
: d_ref(internString(orig))
{
d_interned_data.reserve(4096);
d_interned_data_storage.reserve(4096);
}

size_t
StringStorage::internString(const std::string& str, const char** interned_string)
const std::string&
InternedString::get() const
{
if (str.empty()) {
return 0;
}

const size_t id = d_interned_data.size() + 1;
auto inserted = d_interned_data.insert({str, id});
if (interned_string) {
*interned_string = inserted.first->first.c_str();
}

if (!inserted.second) {
return inserted.first->second;
}
// C++11 standard § 23.2.5/8: Rehashing the elements of an unordered associative container
// invalidates iterators, changes ordering between elements, and changes which buckets elements
// appear in, but does not invalidate pointers or references to elements.
d_interned_data_storage.push_back(&inserted.first->first);
return d_ref.get();
}

return id;
InternedString::operator const std::string&() const
{
return d_ref.get();
}

const std::string&
StringStorage::resolveString(size_t index) const
std::reference_wrapper<const std::string>
InternedString::internString(const std::string& orig)
{
assert(index != 0);
return *d_interned_data_storage.at(index - 1);
std::lock_guard<std::mutex> lock(s_mutex);
auto inserted = s_interned_data.insert(orig);
return *inserted.first;
}

MemorySegment::MemorySegment(
std::string filename,
InternedString filename,
uintptr_t start,
uintptr_t end,
backtrace_state* state,
size_t filename_index)
: d_filename(std::move(filename))
backtrace_state* state)
: d_filename(filename)
, d_start(start)
, d_end(end)
, d_index(filename_index)
, d_state(state)
{
}
Expand Down Expand Up @@ -107,7 +100,7 @@ MemorySegment::resolveFromSymbolTable(uintptr_t address, MemorySegment::Expanded
auto error_callback = [](void* _data, const char* msg, int errnum) {
auto* data = reinterpret_cast<const CallbackData*>(_data);
LOG(ERROR) << "Error getting backtrace for address " << std::hex << data->address << std::dec
<< " in segment " << data->segment->d_filename << " (errno " << errnum
<< " in segment " << data->segment->d_filename.get() << " (errno " << errnum
<< "): " << msg;
};
backtrace_syminfo(d_state, address, callback, error_callback, &data);
Expand Down Expand Up @@ -157,14 +150,15 @@ MemorySegment::resolveIp(uintptr_t address) const
bool
MemorySegment::operator<(const MemorySegment& segment) const
{
return std::tie(d_start, d_end, d_index) < std::tie(segment.d_start, segment.d_end, segment.d_index);
return std::tie(d_start, d_end, d_filename.get())
< std::tie(segment.d_start, segment.d_end, segment.d_filename.get());
}

bool
MemorySegment::operator!=(const MemorySegment& segment) const
{
return std::tie(d_start, d_end, d_index)
!= std::tie(segment.d_start, segment.d_end, segment.d_index);
return std::tie(d_start, d_end, d_filename.get())
!= std::tie(segment.d_start, segment.d_end, segment.d_filename.get());
}

bool
Expand All @@ -185,45 +179,37 @@ MemorySegment::end() const
return d_end;
}

uintptr_t
MemorySegment::filenameIndex() const
{
return d_index;
}

const std::string&
InternedString
MemorySegment::filename() const
{
return d_filename;
}

ResolvedFrame::ResolvedFrame(
const MemorySegment::Frame& frame,
const std::shared_ptr<StringStorage>& d_string_storage)
: d_string_storage(d_string_storage)
, d_symbol_index(d_string_storage->internString(frame.symbol))
, d_file_index(d_string_storage->internString(frame.filename))
, d_line(frame.lineno)
ResolvedFrame::ResolvedFrame(InternedString symbol, InternedString filename, int lineno)
: d_symbol(symbol)
, d_filename(filename)
, d_line(lineno)
{
}

const std::string&
ResolvedFrame::Symbol() const
{
return d_string_storage->resolveString(d_symbol_index);
return d_symbol;
}

const std::string&
ResolvedFrame::File() const
{
return d_string_storage->resolveString(d_file_index);
return d_filename;
}

int
ResolvedFrame::Line() const
{
return d_line;
}

PyObject*
ResolvedFrame::toPythonObject(python_helpers::PyUnicode_Cache& pystring_cache) const
{
Expand Down Expand Up @@ -255,7 +241,7 @@ ResolvedFrame::toPythonObject(python_helpers::PyUnicode_Cache& pystring_cache) c
const std::string&
ResolvedFrames::memoryMap() const
{
return d_string_storage->resolveString(d_memory_map_index);
return d_interned_memory_map_name;
}

const std::vector<ResolvedFrame>&
Expand Down Expand Up @@ -315,27 +301,28 @@ SymbolResolver::resolveFromSegments(uintptr_t ip, size_t generation)
if (expanded_frame.empty()) {
return nullptr;
}
auto segment_index = segment->filenameIndex();
std::transform(
expanded_frame.begin(),
expanded_frame.end(),
std::back_inserter(frames),
[this](const auto& frame) {
return ResolvedFrame{frame, d_string_storage};
[](const auto& frame) {
return ResolvedFrame{
InternedString(frame.symbol),
InternedString(frame.filename),
frame.lineno,
};
});
return std::make_shared<ResolvedFrames>(segment_index, std::move(frames), d_string_storage);
return std::make_shared<ResolvedFrames>(segment->filename(), std::move(frames));
}

void
SymbolResolver::addSegment(
const std::string& filename,
InternedString filename,
backtrace_state* backtrace_state,
const size_t filename_index,
const uintptr_t address_start,
const uintptr_t address_end)
{
currentSegments()
.emplace_back(filename, address_start, address_end, backtrace_state, filename_index);
currentSegments().emplace_back(filename, address_start, address_end, backtrace_state);
d_are_segments_dirty = true;
}

Expand All @@ -347,10 +334,8 @@ SymbolResolver::addSegments(
{
// We use a char* for the filename to reduce the memory footprint and
// because the libbacktrace callback in findBacktraceState operates on char*
const char* interned_filename = nullptr;
auto filename_index = d_string_storage->internString(filename, &interned_filename);

auto state = findBacktraceState(interned_filename, addr);
InternedString interned_filename(filename);
auto state = findBacktraceState(interned_filename.get().c_str(), addr);
if (state == nullptr) {
LOG(RESOLVE_LIB_LOG_LEVEL) << "Failed to prepare a backtrace state for " << filename;
return;
Expand All @@ -359,7 +344,7 @@ SymbolResolver::addSegments(
for (const auto& segment : segments) {
const uintptr_t segment_start = addr + segment.vaddr;
const uintptr_t segment_end = addr + segment.vaddr + segment.memsz;
addSegment(filename, state, filename_index, segment_start, segment_end);
addSegment(interned_filename, state, segment_start, segment_end);
}
}

Expand Down
60 changes: 22 additions & 38 deletions src/memray/_memray/native_resolver.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <tuple>
#include <unistd.h>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

Expand All @@ -25,24 +26,20 @@ namespace memray::native_resolver {
static constexpr int PREALLOCATED_BACKTRACE_STATES = 64;
static constexpr int PREALLOCATED_IPS_CACHE_ITEMS = 32768;

class StringStorage
class InternedString
{
public:
// Constructors
StringStorage();
StringStorage(StringStorage& other) = delete;
StringStorage(StringStorage&& other) = delete;
void operator=(const StringStorage&) = delete;
void operator=(StringStorage&&) = delete;

// Methods
size_t internString(const std::string& str, const char** interned_string = nullptr);
const std::string& resolveString(size_t index) const;
explicit InternedString(const std::string& orig);
const std::string& get() const;
operator const std::string&() const;

private:
// Data members
std::unordered_map<std::string, size_t> d_interned_data;
std::vector<const std::string*> d_interned_data_storage;
static std::reference_wrapper<const std::string> internString(const std::string& orig);

std::reference_wrapper<const std::string> d_ref;

static std::mutex s_mutex;
static std::unordered_set<std::string> s_interned_data;
};

class MemorySegment
Expand All @@ -59,12 +56,8 @@ class MemorySegment
using ExpandedFrame = std::vector<Frame>;

// Constructors
MemorySegment(
std::string filename,
uintptr_t start,
uintptr_t end,
backtrace_state* state,
size_t filename_index);
MemorySegment(InternedString filename, uintptr_t start, uintptr_t end, backtrace_state* state);

ExpandedFrame resolveIp(uintptr_t address) const;
bool operator<(const MemorySegment& segment) const;
bool operator!=(const MemorySegment& segment) const;
Expand All @@ -73,29 +66,25 @@ class MemorySegment
// Getters
uintptr_t start() const;
uintptr_t end() const;
size_t filenameIndex() const;
const std::string& filename() const;
InternedString filename() const;

private:
// Methods
void resolveFromDebugInfo(uintptr_t address, ExpandedFrame& expanded_frame) const;
void resolveFromSymbolTable(uintptr_t address, ExpandedFrame& expanded_frame) const;

// Data members
std::string d_filename;
InternedString d_filename;
uintptr_t d_start;
uintptr_t d_end;
size_t d_index;
backtrace_state* d_state;
};

class ResolvedFrame
{
public:
// Constructors
ResolvedFrame(
const MemorySegment::Frame& frame,
const std::shared_ptr<StringStorage>& string_storage);
ResolvedFrame(InternedString symbol, InternedString filename, int lineno);

// Methods
PyObject* toPythonObject(python_helpers::PyUnicode_Cache& pystring_cache) const;
Expand All @@ -107,9 +96,8 @@ class ResolvedFrame

private:
// Data members
std::shared_ptr<StringStorage> d_string_storage;
size_t d_symbol_index;
size_t d_file_index;
InternedString d_symbol;
InternedString d_filename;
int d_line;
};

Expand All @@ -118,10 +106,9 @@ class ResolvedFrames
public:
// Constructors
template<typename T>
ResolvedFrames(size_t memory_map_index, T&& frames, std::shared_ptr<StringStorage> strings_storage)
: d_memory_map_index(memory_map_index)
ResolvedFrames(InternedString interned_memory_map_name, T&& frames)
: d_interned_memory_map_name(interned_memory_map_name)
, d_frames(std::forward<T>(frames))
, d_string_storage(std::move(strings_storage))
{
}

Expand All @@ -131,9 +118,8 @@ class ResolvedFrames

private:
// Data members
size_t d_memory_map_index{0};
InternedString d_interned_memory_map_name;
std::vector<ResolvedFrame> d_frames{};
std::shared_ptr<StringStorage> d_string_storage{nullptr};
};

class SymbolResolver
Expand Down Expand Up @@ -170,9 +156,8 @@ class SymbolResolver

// Methods
void addSegment(
const std::string& filename,
InternedString filename,
backtrace_state* backtrace_state,
size_t filename_index,
uintptr_t address_start,
uintptr_t address_end);
std::vector<MemorySegment>& currentSegments();
Expand All @@ -182,7 +167,6 @@ class SymbolResolver
std::unordered_map<size_t, std::vector<MemorySegment>> d_segments;
bool d_are_segments_dirty = false;
std::unordered_map<const char*, backtrace_state*> d_backtrace_states;
std::shared_ptr<StringStorage> d_string_storage{std::make_shared<StringStorage>()};
mutable std::unordered_map<ips_cache_pair_t, resolved_frames_t, ips_cache_pair_hash>
d_resolved_ips_cache;
};
Expand Down

0 comments on commit b08a977

Please sign in to comment.