# Fix data leakage in KV cache initialization (#1057)
### Description

This PR fixes a data leakage issue that can occur between generators
when buffer sharing is enabled.

### Motivation and Context

Suppose you create generator A with one input id and run one iteration
of the generation loop with it. Now destroy generator A and create
generator B with three input ids. Before generator B runs any
iterations, its input KV caches already contain the values left in
generator A's output KV caches after A's single iteration.

The data leakage is eliminated by always zeroing the KV cache memory
when the caches are initialized.
kunal-vaishnavi authored Nov 12, 2024
1 parent f66e4f5 commit cc4577e
Showing 1 changed file with 9 additions and 0 deletions.
src/models/kv_cache.cpp (9 additions, 0 deletions)
```diff
@@ -166,10 +166,19 @@ KV_Cache::KV_Cache(State& state)
     }
   }
 
+  auto kv_cache_size_bytes = SizeOf(type_) * shape_[0] * shape_[1] * shape_[2] * shape_[3];
   for (int i = 0; i < layer_count_ * 2; ++i) {
     presents_.push_back(
         sb_kv_caches_.empty() ? OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_)
                               : sb_kv_caches_[i]->CreateTensorOnStaticBuffer(shape_, type_));
+#if USE_CUDA
+    if (model_.device_type_ == DeviceType::CUDA) {
+      cudaMemsetAsync(presents_.back()->GetTensorMutableRawData(), 0, kv_cache_size_bytes, model_.cuda_stream_);
+    } else
+#endif
+    {
+      memset(presents_.back()->GetTensorMutableRawData(), 0, kv_cache_size_bytes);
+    }
   }
 }
```

