From dd8a078bca50e40f476f16e569468706532b8fe2 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Mon, 25 Mar 2024 13:03:14 +0100 Subject: [PATCH] Implement Hash.new(capacity:) [Feature #19236] When building a large hash, pre-allocating it with enough capacity can save many re-hashes and significantly improve performance. ``` /opt/rubies/3.3.0/bin/ruby --disable=gems -rrubygems -I./benchmark/lib ./benchmark/benchmark-driver/exe/benchmark-driver \ --executables="compare-ruby::../miniruby-master -I.ext/common --disable-gem" \ --executables="built-ruby::./miniruby --disable-gem" \ --output=markdown --output-compare -v $(find ./benchmark -maxdepth 1 -name 'hash_new' -o -name '*hash_new*.yml' -o -name '*hash_new*.rb' | sort) compare-ruby: ruby 3.4.0dev (2024-03-25T11:48:11Z master f53209f023) +YJIT dev [arm64-darwin23] last_commit=[ruby/irb] Cache RDoc::RI::Driver.new (https://github.com/ruby/irb/pull/911) built-ruby: ruby 3.4.0dev (2024-03-25T15:29:40Z hash-new-rb 77652b08a2) +YJIT dev [arm64-darwin23] warming up... | |compare-ruby|built-ruby| |:-------------------|-----------:|---------:| |new | 7.614M| 5.976M| | | 1.27x| -| |new_with_capa_1k | 13.931k| 15.698k| | | -| 1.13x| |new_with_capa_100k | 124.746| 148.283| | | -| 1.19x| ``` --- benchmark/hash_new.yml | 16 +++++++++ common.mk | 2 ++ hash.c | 64 ++++++++++----------------------- hash.rb | 48 +++++++++++++++++++++++++ inits.c | 1 + spec/ruby/core/hash/new_spec.rb | 16 ++++++++- 6 files changed, 100 insertions(+), 47 deletions(-) create mode 100644 benchmark/hash_new.yml create mode 100644 hash.rb diff --git a/benchmark/hash_new.yml b/benchmark/hash_new.yml new file mode 100644 index 00000000000000..c264bed74fb4cd --- /dev/null +++ b/benchmark/hash_new.yml @@ -0,0 +1,16 @@ +prelude: | + has_hash_with_capa = Hash.instance_method(:initialize).parameters.include?([:key, :capacity]) + strings_1k = 1_000.times.map { |i| i.to_s.freeze } + strings_100k = 100_000.times.map { |i| i.to_s.freeze } +benchmark: + new: Hash.new + new_with_capa_1k: | + h = has_hash_with_capa ? Hash.new(capacity: strings_1k.size) : {} + strings_1k.each do |x| + h[x] = true + end + new_with_capa_100k: | + h = has_hash_with_capa ? Hash.new(capacity: strings_100k.size) : {} + strings_100k.each do |x| + h[x] = true + end diff --git a/common.mk b/common.mk index 21ceaa3f01279b..88ea2a2eb35bc2 100644 --- a/common.mk +++ b/common.mk @@ -1200,6 +1200,7 @@ BUILTIN_RB_SRCS = \ $(srcdir)/trace_point.rb \ $(srcdir)/warning.rb \ $(srcdir)/array.rb \ + $(srcdir)/hash.rb \ $(srcdir)/kernel.rb \ $(srcdir)/ractor.rb \ $(srcdir)/symbol.rb \ @@ -7781,6 +7782,7 @@ hash.$(OBJEXT): {$(VPATH)}debug_counter.h hash.$(OBJEXT): {$(VPATH)}defines.h hash.$(OBJEXT): {$(VPATH)}encoding.h hash.$(OBJEXT): {$(VPATH)}hash.c +hash.$(OBJEXT): {$(VPATH)}hash.rbinc hash.$(OBJEXT): {$(VPATH)}id.h hash.$(OBJEXT): {$(VPATH)}id_table.h hash.$(OBJEXT): {$(VPATH)}intern.h diff --git a/hash.c b/hash.c index f34f64065b67d1..83bc1c63850058 100644 --- a/hash.c +++ b/hash.c @@ -48,6 +48,7 @@ #include "ruby/thread_native.h" #include "ruby/ractor.h" #include "vm_sync.h" +#include "builtin.h" /* Flags of RHash * @@ -1762,59 +1763,29 @@ set_proc_default(VALUE hash, VALUE proc) RHASH_SET_IFNONE(hash, proc); } -/* - * call-seq: - * Hash.new(default_value = nil) -> new_hash - * Hash.new {|hash, key| ... } -> new_hash - * - * Returns a new empty +Hash+ object. - * - * The initial default value and initial default proc for the new hash - * depend on which form above was used. See {Default Values}[rdoc-ref:Hash@Default+Values]. - * - * If neither an argument nor a block given, - * initializes both the default value and the default proc to nil: - * h = Hash.new - * h.default # => nil - * h.default_proc # => nil - * - * If argument default_value given but no block given, - * initializes the default value to the given default_value - * and the default proc to nil: - * h = Hash.new(false) - * h.default # => false - * h.default_proc # => nil - * - * If a block given but no argument, stores the block as the default proc - * and sets the default value to nil: - * h = Hash.new {|hash, key| "Default value for #{key}" } - * h.default # => nil - * h.default_proc.class # => Proc - * h[:nosuch] # => "Default value for nosuch" - */ - -static VALUE -rb_hash_initialize(int argc, VALUE *argv, VALUE hash) +static void +rb_hash_init(VALUE hash, long capa, VALUE ifnone_unset, VALUE ifnone, VALUE block) { - rb_hash_modify(hash); + RUBY_ASSERT(RHASH_SIZE(hash) == 0); + RUBY_ASSERT(RHASH_AR_TABLE_P(hash)); - if (rb_block_given_p()) { - rb_check_arity(argc, 0, 0); - SET_PROC_DEFAULT(hash, rb_block_proc()); + if (capa > 0) { + hash_st_table_init(hash, &objhash, capa); } - else { - rb_check_arity(argc, 0, 1); - VALUE options, ifnone; - rb_scan_args(argc, argv, "01:", &ifnone, &options); - if (NIL_P(ifnone) && !NIL_P(options)) { - ifnone = options; - rb_warn_deprecated_to_remove("3.4", "Calling Hash.new with keyword arguments", "Hash.new({ key: value })"); + if (!NIL_P(block)) { + if (ifnone_unset != Qtrue) { + rb_check_arity(1, 0, 0); + } + else { + SET_PROC_DEFAULT(hash, block); } + } + else if (ifnone_unset != Qtrue) { RHASH_SET_IFNONE(hash, ifnone); } - return hash; + hash_verify(hash); } static VALUE rb_hash_to_a(VALUE hash); @@ -7150,7 +7121,6 @@ Init_Hash(void) rb_define_alloc_func(rb_cHash, empty_hash_alloc); rb_define_singleton_method(rb_cHash, "[]", rb_hash_s_create, -1); rb_define_singleton_method(rb_cHash, "try_convert", rb_hash_s_try_convert, 1); - rb_define_method(rb_cHash, "initialize", rb_hash_initialize, -1); rb_define_method(rb_cHash, "initialize_copy", rb_hash_replace, 1); rb_define_method(rb_cHash, "rehash", rb_hash_rehash, 0); @@ -7477,3 +7447,5 @@ Init_Hash(void) HASH_ASSERT(sizeof(ar_hint_t) * RHASH_AR_TABLE_MAX_SIZE == sizeof(VALUE)); } + +#include "hash.rbinc" diff --git a/hash.rb b/hash.rb new file mode 100644 index 00000000000000..e94579b6b5005b --- /dev/null +++ b/hash.rb @@ -0,0 +1,48 @@ +class Hash + # call-seq: + # Hash.new(default_value = nil) -> new_hash + # Hash.new(default_value = nil, capacity: size) -> new_hash + # Hash.new {|hash, key| ... } -> new_hash + # Hash.new(capacity: size) {|hash, key| ... } -> new_hash + # + # Returns a new empty +Hash+ object. + # + # The initial default value and initial default proc for the new hash + # depend on which form above was used. See {Default Values}[rdoc-ref:Hash@Default+Values]. + # + # If neither an argument nor a block is given, + # initializes both the default value and the default proc to nil: + # h = Hash.new + # h.default # => nil + # h.default_proc # => nil + # + # If argument default_value is given but no block is given, + # initializes the default value to the given default_value + # and the default proc to nil: + # h = Hash.new(false) + # h.default # => false + # h.default_proc # => nil + # + # If a block is given but no default_value, stores the block as the default proc + # and sets the default value to nil: + # h = Hash.new {|hash, key| "Default value for #{key}" } + # h.default # => nil + # h.default_proc.class # => Proc + # h[:nosuch] # => "Default value for nosuch" + # + # If both a block and a default_value are given, raises an +ArgumentError+ + # + # If the optional keyword argument +capacity+ is given, the hash will be allocated + # with enough capacity to accomodate this many keys without having to be resized. + def initialize(ifnone = (ifnone_unset = true), capacity: 0, &block) + if Primitive.mandatory_only? + Primitive.attr! :leaf + else + b = block # TODO: builtin bug + Primitive.cstmt!(%{ + rb_hash_init(self, NUM2LONG(capacity), ifnone_unset, ifnone, b); + return Qnil; + }) + end + end +end diff --git a/inits.c b/inits.c index 9ed104f369e02f..c91132becabf6a 100644 --- a/inits.c +++ b/inits.c @@ -95,6 +95,7 @@ rb_call_builtin_inits(void) BUILTIN(pack); BUILTIN(warning); BUILTIN(array); + BUILTIN(hash); BUILTIN(kernel); BUILTIN(symbol); BUILTIN(timev); diff --git a/spec/ruby/core/hash/new_spec.rb b/spec/ruby/core/hash/new_spec.rb index 6279815fd6e660..07054e226ba28f 100644 --- a/spec/ruby/core/hash/new_spec.rb +++ b/spec/ruby/core/hash/new_spec.rb @@ -34,7 +34,7 @@ -> { Hash.new(nil) { 0 } }.should raise_error(ArgumentError) end - ruby_version_is "3.3" do + ruby_version_is "3.3"..."3.4" do it "emits a deprecation warning if keyword arguments are passed" do -> { Hash.new(unknown: true) }.should complain( Regexp.new(Regexp.escape("Calling Hash.new with keyword arguments is deprecated and will be removed in Ruby 3.4; use Hash.new({ key: value }) instead")) @@ -46,4 +46,18 @@ Hash.new({ unknown: true }).default.should == { unknown: true } end end + + ruby_version_is "3.4" do + it "accepts a capacity: argument" do + Hash.new(5, capacity: 42).default.should == 5 + Hash.new(capacity: 42).default.should == nil + (Hash.new(capacity: 42) { 1 }).default_proc.should_not == nil + end + + it "raises an error if unknown keyword arguments are passed" do + -> { Hash.new(unknown: true) }.should raise_error(ArgumentError) + -> { Hash.new(1, unknown: true) }.should raise_error(ArgumentError) + -> { Hash.new(unknown: true) { 0 } }.should raise_error(ArgumentError) + end + end end