diff --git a/docs/arithmetics.rst b/docs/arithmetics.rst
index c9515f23c556b..d0a0ebdd4468b 100644
--- a/docs/arithmetics.rst
+++ b/docs/arithmetics.rst
@@ -129,6 +129,11 @@ Random number generator
 
     Generates a random floating point number from the standard normal distribution.
 
+.. note::
+
+  On **CPU** and **CUDA** backends, use the ``random_seed`` argument in ``ti.init()`` to specify the integer seed for random number generation.
+  The random seed is 0 by default.
+
 Element-wise arithmetics for vectors and matrices
 -------------------------------------------------
 
diff --git a/docs/global_settings.rst b/docs/global_settings.rst
index e205815b08a40..ec2a821a74983 100644
--- a/docs/global_settings.rst
+++ b/docs/global_settings.rst
@@ -25,6 +25,7 @@ Runtime
 - Restart the entire Taichi system (destroy all fields and kernels): ``ti.reset()``.
 - To start program in debug mode: ``ti.init(debug=True)`` or ``ti debug your_script.py``.
 - To disable importing torch on start up: ``export TI_ENABLE_TORCH=0``.
+- To change the random seed for random number generation: ``ti.init(random_seed=42)`` (effective on CPU and CUDA backends only).
 
 Logging
 *******
diff --git a/misc/links.md b/misc/links.md
index c17c7fb0a1e58..e8c3ce2c04f78 100644
--- a/misc/links.md
+++ b/misc/links.md
@@ -11,4 +11,4 @@
 - [Taichi GLSL](https://github.com/taichi-dev/taichi_glsl): A Taichi extension library that provides a set of GLSL-style helper functions.
 - [Taichi Blend](https://github.com/taichi-dev/taichi_blend): Taichi Blender intergration for physics-based animations (work in progress)
 - [Taichi.js](https://github.com/taichi-dev/taichi.js): Run compiled Taichi programs in Javascript and WASM (work in progress).
-- [Shadertoy in Taichi](https://github.com/Phonicavi/Shadertoy-taichi): Some shadertoy examples implemented in Taichi, by [Qiu Feng (Phonicavi)](https://github.com/Phonicavi).
\ No newline at end of file
+- [Shadertoy in Taichi](https://github.com/Phonicavi/Shadertoy-taichi): Some shadertoy examples implemented in Taichi, by [Qiu Feng (Phonicavi)](https://github.com/Phonicavi).
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index b0bfe1a05388d..cdd4da23773e5 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -42,6 +42,7 @@ CompileConfig::CompileConfig() {
   saturating_grid_dim = 0;
   max_block_dim = 0;
   cpu_max_num_threads = std::thread::hardware_concurrency();
+  random_seed = 0;
 
   ad_stack_size = 16;
 
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index b6330605359ec..e25aeb34ea927 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -46,6 +46,7 @@ struct CompileConfig {
   int saturating_grid_dim;
   int max_block_dim;
   int cpu_max_num_threads;
+  int random_seed;
 
   // LLVM backend options:
   bool print_struct_llvm_ir;
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 76368ba93e982..cd02b673506bc 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -307,6 +307,11 @@ void Program::initialize_runtime_system(StructCompiler *scomp) {
   auto snodes = scomp->snodes;
   int root_id = snode_root->id;
 
+  // Starting random state derived from the seed. Scaling by 2^20 keeps the
+  // per-thread state ranges of different seeds disjoint for up to 2^20 threads;
+  // the multiply is done unsigned so large seeds wrap instead of overflowing.
+  int starting_rand_state = (int)((unsigned)config.random_seed * 1048576u);
+
   // Number of random states. One per CPU/CUDA thread.
   int num_rand_states = 0;
 
@@ -325,12 +330,12 @@ void Program::initialize_runtime_system(StructCompiler *scomp) {
   TI_TRACE("Allocating data structure of size {} B", scomp->root_size);
   TI_TRACE("Allocating {} random states (used by CUDA only)", num_rand_states);
 
-  runtime->call<void *, void *, std::size_t, std::size_t, void *, int, void *,
-                void *, void *>("runtime_initialize", result_buffer, this,
-                                (std::size_t)scomp->root_size, prealloc_size,
-                                preallocated_device_buffer, num_rand_states,
-                                (void *)&taichi_allocate_aligned,
-                                (void *)std::printf, (void *)std::vsnprintf);
+  runtime->call<void *, void *, std::size_t, std::size_t, void *, int, int,
+                void *, void *, void *>(
+      "runtime_initialize", result_buffer, this, (std::size_t)scomp->root_size,
+      prealloc_size, preallocated_device_buffer, starting_rand_state,
+      num_rand_states, (void *)&taichi_allocate_aligned, (void *)std::printf,
+      (void *)std::vsnprintf);
 
   TI_TRACE("LLVMRuntime initialized");
   llvm_runtime = fetch_result<void *>(taichi_result_buffer_ret_value_id);
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 81979203dc9c6..a6fea501b576b 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -95,9 +95,8 @@ void export_lang(py::module &m) {
       .def(py::self == py::self)
       .def("__hash__", &DataType::hash)
       .def("to_string", &DataType::to_string)
-      .def(
-          "get_ptr", [](DataType *dtype) -> Type * { return *dtype; },
-          py::return_value_policy::reference)
+      .def("get_ptr", [](DataType *dtype) -> Type * { return *dtype; },
+           py::return_value_policy::reference)
       .def(py::pickle(
           [](const DataType &dt) {
             // Note: this only works for primitive types, which is fine for now.
@@ -148,6 +147,7 @@ void export_lang(py::module &m) {
       .def_readwrite("saturating_grid_dim", &CompileConfig::saturating_grid_dim)
       .def_readwrite("max_block_dim", &CompileConfig::max_block_dim)
       .def_readwrite("cpu_max_num_threads", &CompileConfig::cpu_max_num_threads)
+      .def_readwrite("random_seed", &CompileConfig::random_seed)
       .def_readwrite("verbose_kernel_launches",
                      &CompileConfig::verbose_kernel_launches)
       .def_readwrite("verbose", &CompileConfig::verbose)
@@ -195,10 +195,9 @@ void export_lang(py::module &m) {
   m.def("reset_default_compile_config",
         [&]() { default_compile_config = CompileConfig(); });
 
-  m.def(
-      "default_compile_config",
-      [&]() -> CompileConfig & { return default_compile_config; },
-      py::return_value_policy::reference);
+  m.def("default_compile_config",
+        [&]() -> CompileConfig & { return default_compile_config; },
+        py::return_value_policy::reference);
 
   py::class_<Program>(m, "Program")
       .def(py::init<>())
@@ -215,12 +214,11 @@ void export_lang(py::module &m) {
            })
       .def("print_memory_profiler_info", &Program::print_memory_profiler_info)
       .def("finalize", &Program::finalize)
-      .def(
-          "get_root",
-          [&](Program *program) -> SNode * {
-            return program->snode_root.get();
-          },
-          py::return_value_policy::reference)
+      .def("get_root",
+           [&](Program *program) -> SNode * {
+             return program->snode_root.get();
+           },
+           py::return_value_policy::reference)
       .def("get_total_compilation_time", &Program::get_total_compilation_time)
       .def("print_snode_tree", &Program::print_snode_tree)
       .def("get_snode_num_dynamically_allocated",
@@ -235,10 +233,9 @@ void export_lang(py::module &m) {
   m.def("get_current_program", get_current_program,
         py::return_value_policy::reference);
 
-  m.def(
-      "current_compile_config",
-      [&]() -> CompileConfig & { return get_current_program().config; },
-      py::return_value_policy::reference);
+  m.def("current_compile_config",
+        [&]() -> CompileConfig & { return get_current_program().config; },
+        py::return_value_policy::reference);
 
   py::class_<Index>(m, "Index").def(py::init<int>());
   py::class_<SNode>(m, "SNode")
@@ -273,10 +270,9 @@ void export_lang(py::module &m) {
       .def("data_type", [](SNode *snode) { return snode->dt; })
       .def("get_num_ch",
            [](SNode *snode) -> int { return (int)snode->ch.size(); })
-      .def(
-          "get_ch",
-          [](SNode *snode, int i) -> SNode * { return snode->ch[i].get(); },
-          py::return_value_policy::reference)
+      .def("get_ch",
+           [](SNode *snode, int i) -> SNode * { return snode->ch[i].get(); },
+           py::return_value_policy::reference)
       .def("lazy_grad",
            [](SNode *snode) {
              make_lazy_grad(snode,
@@ -376,14 +372,13 @@ void export_lang(py::module &m) {
 
   py::class_<Stmt>(m, "Stmt");
   py::class_<Program::KernelProxy>(m, "KernelProxy")
-      .def(
-          "define",
-          [](Program::KernelProxy *ker,
-             const std::function<void()> &func) -> Kernel & {
-            py::gil_scoped_release release;
-            return ker->def(func);
-          },
-          py::return_value_policy::reference);
+      .def("define",
+           [](Program::KernelProxy *ker,
+              const std::function<void()> &func) -> Kernel & {
+             py::gil_scoped_release release;
+             return ker->def(func);
+           },
+           py::return_value_policy::reference);
 
   m.def("insert_deactivate", [](SNode *snode, const ExprGroup &indices) {
     return Deactivate(snode, indices);
diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp
index 59d43583b3633..8906d1d80ec44 100644
--- a/taichi/runtime/llvm/runtime.cpp
+++ b/taichi/runtime/llvm/runtime.cpp
@@ -833,6 +833,7 @@ void runtime_initialize(
     std::size_t
         preallocated_size,  // Non-zero means use the preallocated buffer
     Ptr preallocated_buffer,
+    i32 starting_rand_state,
     i32 num_rand_states,
     void *_vm_allocator,
     void *_host_printf,
@@ -885,7 +886,7 @@ void runtime_initialize(
   runtime->rand_states = (RandState *)runtime->allocate_aligned(
       sizeof(RandState) * runtime->num_rand_states, taichi_page_size);
   for (int i = 0; i < runtime->num_rand_states; i++)
-    initialize_rand_state(&runtime->rand_states[i], i);
+    initialize_rand_state(&runtime->rand_states[i], starting_rand_state + i);
 }
 
 void runtime_initialize2(LLVMRuntime *runtime, int root_id, int num_snodes) {
diff --git a/taichi/transforms/auto_diff.cpp b/taichi/transforms/auto_diff.cpp
index 04c5a4c00adfb..a3d7d81669360 100644
--- a/taichi/transforms/auto_diff.cpp
+++ b/taichi/transforms/auto_diff.cpp
@@ -192,16 +192,18 @@ class ReplaceLocalVarWithStacks : public BasicStmtVisitor {
 
   void visit(AllocaStmt *alloc) override {
     TI_ASSERT(alloc->width() == 1);
-    bool load_only =
-        irpass::analysis::gather_statements(alloc->parent, [&](Stmt *s) {
-          if (auto store = s->cast<LocalStoreStmt>())
-            return store->dest == alloc;
-          else if (auto atomic = s->cast<AtomicOpStmt>()) {
-            return atomic->dest == alloc;
-          } else {
-            return false;
-          }
-        }).empty();
+    bool load_only = irpass::analysis::gather_statements(
+                         alloc->parent,
+                         [&](Stmt *s) {
+                           if (auto store = s->cast<LocalStoreStmt>())
+                             return store->dest == alloc;
+                           else if (auto atomic = s->cast<AtomicOpStmt>()) {
+                             return atomic->dest == alloc;
+                           } else {
+                             return false;
+                           }
+                         })
+                         .empty();
     if (!load_only) {
       auto dtype = alloc->ret_type;
       auto stack_alloca = Stmt::make<AdStackAllocaStmt>(dtype, ad_stack_size);
diff --git a/tests/python/test_random.py b/tests/python/test_random.py
index fc3015b73b62b..3b93f0ce4baf4 100644
--- a/tests/python/test_random.py
+++ b/tests/python/test_random.py
@@ -107,6 +107,27 @@ def gen(i: ti.i32):
     assert count <= n * 0.15
 
 
+@ti.test(arch=[ti.cpu, ti.cuda])
+def test_random_seed_per_program():  # different seeds => different samples
+    import numpy as np
+    n = 10  # number of random samples drawn per seed
+    result = []  # one numpy array of samples per seed, in loop order
+    for s in [0, 1]:
+        ti.init(random_seed=s)  # NOTE(review): no arch given -- confirm intent
+        x = ti.field(ti.f32, shape=n)
+
+        @ti.kernel
+        def gen():  # fill every element of x with ti.random()
+            for i in x:
+                x[i] = ti.random()
+
+        gen()
+        result.append(x.to_numpy())
+        ti.reset()  # destroy fields and kernels before the next seed
+
+    assert not np.allclose(result[0], result[1])  # seeds 0 and 1 must differ
+
+
+
 @ti.test(arch=[ti.cpu, ti.cuda])
 def test_random_f64():
     '''