diff --git a/paddle/phi/kernels/gpu/lamb_kernel.cu b/paddle/phi/kernels/gpu/lamb_kernel.cu
index 20ad2eb439a46..220fa97a0e107 100644
--- a/paddle/phi/kernels/gpu/lamb_kernel.cu
+++ b/paddle/phi/kernels/gpu/lamb_kernel.cu
@@ -23,6 +23,7 @@ PD_REGISTER_KERNEL(lamb,
                    ALL_LAYOUT,
                    phi::LambKernel,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double) {
   kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND);
diff --git a/test/legacy_test/test_lamb_op.py b/test/legacy_test/test_lamb_op.py
index 35a1c5e010c04..f6ba331573275 100644
--- a/test/legacy_test/test_lamb_op.py
+++ b/test/legacy_test/test_lamb_op.py
@@ -15,7 +15,7 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16
 from op import Operator
 
 import paddle
@@ -69,13 +69,24 @@ def set_attrs(self):
             'always_adapt': False,
         }
 
+    def set_dtype(self):
+        self.dtype = np.float32
+
     def setUp(self):
         '''Test Lamb Op with supplied attributes'''
         self.op_type = "lamb"
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        moment2 = np.random.random((102, 105)).astype("float32")
+        self.set_dtype()
+
+        if self.is_bfloat16_op():
+            param = np.random.uniform(-1, 1, (102, 105)).astype(np.float32)
+            grad = np.random.uniform(-1, 1, (102, 105)).astype(np.float32)
+            moment1 = np.random.uniform(-1, 1, (102, 105)).astype(np.float32)
+            moment2 = np.random.random((102, 105)).astype(np.float32)
+        else:
+            param = np.random.uniform(-1, 1, (102, 105)).astype(self.dtype)
+            grad = np.random.uniform(-1, 1, (102, 105)).astype(self.dtype)
+            moment1 = np.random.uniform(-1, 1, (102, 105)).astype(self.dtype)
+            moment2 = np.random.random((102, 105)).astype(self.dtype)
 
         learning_rate = 0.001
         self.set_attrs()
@@ -86,15 +97,33 @@ def setUp(self):
         beta1_pow = self.attrs['beta1']
         beta2_pow = self.attrs['beta2']
 
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment1': moment1,
-            'Moment2': moment2,
-            'LearningRate': np.array([learning_rate]).astype("float32"),
-            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
-            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
-        }
+        if self.is_bfloat16_op():
+            self.inputs = {
+                'Param': convert_float_to_uint16(param),
+                'Grad': convert_float_to_uint16(grad),
+                'Moment1': convert_float_to_uint16(moment1),
+                'Moment2': convert_float_to_uint16(moment2),
+                'LearningRate': convert_float_to_uint16(
+                    np.array([learning_rate]).astype(self.dtype)
+                ),
+                'Beta1Pow': convert_float_to_uint16(
+                    np.array([beta1_pow]).astype(self.dtype)
+                ),
+                'Beta2Pow': convert_float_to_uint16(
+                    np.array([beta2_pow]).astype(self.dtype)
+                ),
+            }
+
+        else:
+            self.inputs = {
+                'Param': param,
+                'Grad': grad,
+                'Moment1': moment1,
+                'Moment2': moment2,
+                'LearningRate': np.array([learning_rate]).astype(self.dtype),
+                'Beta1Pow': np.array([beta1_pow]).astype(self.dtype),
+                'Beta2Pow': np.array([beta2_pow]).astype(self.dtype),
+            }
 
         (
             param_out,
@@ -104,13 +133,22 @@ def setUp(self):
             beta2_pow_out,
         ) = lamb_step(self.inputs, self.attrs)
 
-        self.outputs = {
-            'Moment1Out': moment1_out,
-            'Moment2Out': moment2_out,
-            'ParamOut': param_out,
-            'Beta1PowOut': beta1_pow_out,
-            'Beta2PowOut': beta2_pow_out,
-        }
+        if self.is_bfloat16_op():
+            self.outputs = {
+                'Moment1Out': convert_float_to_uint16(moment1_out),
+                'Moment2Out': convert_float_to_uint16(moment2_out),
+                'ParamOut': convert_float_to_uint16(param_out),
+                'Beta1PowOut': convert_float_to_uint16(beta1_pow_out),
+                'Beta2PowOut': convert_float_to_uint16(beta2_pow_out),
+            }
+        else:
+            self.outputs = {
+                'Moment1Out': moment1_out,
+                'Moment2Out': moment2_out,
+                'ParamOut': param_out,
+                'Beta1PowOut': beta1_pow_out,
+                'Beta2PowOut': beta2_pow_out,
+            }
 
     def test_check_output(self):
         self.check_output()
@@ -181,7 +219,129 @@ def test_check_output(self):
 
             # Randomize gradient for next step
             self.inputs['Grad'] = np.random.uniform(-1, 1, (102, 105)).astype(
-                "float32"
+                self.dtype
             )
 
 
+class TestLambFP16Op1(TestLambOp1):
+    def set_dtype(self):
+        self.__class__.op_type = "lamb"
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place)
+
+
+class TestLambFP16Op2(TestLambOp2):
+    def set_dtype(self):
+        self.__class__.op_type = "lamb"
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place)
+
+
+class TestLambFP16Op3(TestLambOp3):
+    def set_dtype(self):
+        self.__class__.op_type = "lamb"
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place)
+
+
+class TestLambFP16OpMultipleSteps(TestLambOpMultipleSteps):
+    def set_dtype(self):
+        self.__class__.op_type = "lamb"
+        self.dtype = np.float16
+
+
+class TestLambBF16Op1(TestLambOp1):
+    def set_dtype(self):
+        self.__class__.op_type = "lamb"
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_bfloat16_supported(place):
+                self.check_output_with_place(place)
+
+
+class TestLambBF16Op2(TestLambOp2):
+    def set_dtype(self):
+        self.__class__.op_type = "lamb"
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_bfloat16_supported(place):
+                self.check_output_with_place(place)
+
+
+class TestLambBF16Op3(TestLambOp3):
+    def set_dtype(self):
+        self.__class__.op_type = "lamb"
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_bfloat16_supported(place):
+                self.check_output_with_place(place)
+
+
+class TestLambBF16OpMultipleSteps(TestLambOpMultipleSteps):
+    def set_dtype(self):
+        self.__class__.op_type = "lamb"
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        for i in range(self.num_steps):
+            (
+                param_out,
+                moment1_out,
+                moment2_out,
+                beta1_pow_out,
+                beta2_pow_out,
+            ) = lamb_step(self.inputs, self.attrs)
+
+            self.outputs = {
+                'Moment1Out': convert_float_to_uint16(moment1_out),
+                'Moment2Out': convert_float_to_uint16(moment2_out),
+                'ParamOut': convert_float_to_uint16(param_out),
+                'Beta1PowOut': convert_float_to_uint16(beta1_pow_out),
+                'Beta2PowOut': convert_float_to_uint16(beta2_pow_out),
+            }
+
+            # Verify output for this step
+            if core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+                if core.is_bfloat16_supported(place):
+                    self.check_output_with_place(place)
+
+            # Output of this step becomes input for next step
+            self.inputs['Param'] = convert_float_to_uint16(param_out)
+            self.inputs['Moment1'] = convert_float_to_uint16(moment1_out)
+            self.inputs['Moment2'] = convert_float_to_uint16(moment2_out)
+
+            # Update powers of Beta1 and Beta2 for next time step
+            self.inputs['Beta1Pow'] = convert_float_to_uint16(beta1_pow_out)
+            self.inputs['Beta2Pow'] = convert_float_to_uint16(beta2_pow_out)
+
+            # Randomize gradient for next step
+            self.inputs['Grad'] = np.random.uniform(-1, 1, (102, 105)).astype(
+                np.float32
+            )
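
Note on the BF16 test data flow: the new BF16 cases never hand real bfloat16 arrays to numpy. Every tensor is stored as a uint16 array holding bfloat16 bit patterns (which is what convert_float_to_uint16 produces), while the lamb_step reference math stays in float32. The sketch below is only a hypothetical illustration of that float32 <-> bfloat16 round trip, not part of the change and not Paddle's actual helper; the names float32_to_bf16_bits and bf16_bits_to_float32 are made up, and the real helper may round rather than truncate.

```python
import numpy as np


def float32_to_bf16_bits(x):
    # Reinterpret float32 as uint32 and keep the upper 16 bits
    # (sign, exponent, top 7 mantissa bits) -- the bfloat16 bit pattern.
    # This truncates; a production helper may round to nearest instead.
    x = np.asarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> 16).astype(np.uint16)


def bf16_bits_to_float32(bits):
    # Inverse mapping: place the stored 16 bits back into the high half of a
    # float32 word; the dropped low mantissa bits come back as zeros.
    bits = np.asarray(bits, dtype=np.uint16)
    return (bits.astype(np.uint32) << 16).view(np.float32)


# Same shape as the test tensors above; round-tripping loses only the low
# mantissa bits, so the relative error stays within bfloat16 precision
# (roughly 2-3 significant decimal digits).
param = np.random.uniform(-1, 1, (102, 105)).astype(np.float32)
roundtrip = bf16_bits_to_float32(float32_to_bf16_bits(param))
assert np.allclose(param, roundtrip, rtol=1e-2, atol=0.0)
```

This is also why the BF16 multiple-steps test regenerates 'Grad' as float32 above: lamb_step consumes float32 references, and only the op inputs/outputs are carried as uint16 bit patterns.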