Fix spece spec, etc #63092

Merged 2 commits on Apr 2, 2024

2 changes: 1 addition & 1 deletion python/cinn/auto_schedule/cost_model/xgb_cost_model.py
@@ -94,7 +94,7 @@ def load(self, path):
self.booster = xgb.Booster()
self.booster.load_model(path)
# Should we save/load config parameters? Not now because it is pre-set.
# But we should do that here if that's changable in the future.
# But we should do that here if that's changeable in the future.

def update(self, samples, labels):
# xgb doesn't support incremental training, we leave this method as TODO
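
As context for the save/load question in the comment above, here is a minimal sketch (not part of this PR) of persisting an XGBoost booster together with its parameter configuration. It assumes an xgboost version that provides Booster.save_config() and Booster.load_config(); the helper names and file paths are illustrative.

import xgboost as xgb

def save_cost_model(booster: xgb.Booster, model_path: str, config_path: str) -> None:
    # Persist the trained trees and, separately, the booster's JSON parameter config.
    booster.save_model(model_path)
    with open(config_path, "w") as f:
        f.write(booster.save_config())

def load_cost_model(model_path: str, config_path: str) -> xgb.Booster:
    booster = xgb.Booster()
    booster.load_model(model_path)
    # Restore the parameter configuration saved above.
    with open(config_path) as f:
        booster.load_config(f.read())
    return booster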
@@ -70,7 +70,7 @@ def update_dims_mapping(dist_op):
output_spec = get_dist_tensor_spec(dist_op, output_arg_name, False)

# step2: infer spmd
# TODO reivse me
# TODO revise me
op_type = op_desc.type()
rule = get_phi_spmd_rule(op_type)
fw_results = rule.infer_forward(*input_specs)
@@ -515,7 +515,7 @@ def update_dims_mapping_matmul(dist_op):
trans_x = False
trans_y = False

# TODO (zhangyichen) replace dist tensor spece by dist tensor in future.
# TODO (zhangyichen) replace dist tensor spec by dist tensor in future.
x_spec = get_dist_tensor_spec(dist_op, x_name)
y_spec = get_dist_tensor_spec(dist_op, y_name)
out_spec = get_dist_tensor_spec(dist_op, out_name, False)
@@ -58,7 +58,7 @@ def update_dims_mapping(dist_op):
keep_dim = op_desc.attr('keep_dim')
dims = op_desc.attr('dim')

# TODO (zhangyichen) replace dist tensor spece by dist tensor in future.
# TODO (zhangyichen) replace dist tensor spec by dist tensor in future.
input_spec = get_dist_tensor_spec(dist_op, input_arg_name)
output_spec = get_dist_tensor_spec(dist_op, output_arg_name, False)
# len(dims) == 0 means reduce_all
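
As a side note on the "len(dims) == 0 means reduce_all" convention above, a minimal NumPy sketch (illustrative only, not Paddle's implementation) of how an empty dims list maps to reducing over every axis:

import numpy as np

def reduce_sum(x, dims, keep_dim=False):
    # An empty dims list is treated as "reduce over all axes" (reduce_all).
    axes = tuple(dims) if len(dims) > 0 else tuple(range(x.ndim))
    return np.sum(x, axis=axes, keepdims=keep_dim)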
@@ -122,18 +122,18 @@ def is_partial_reduce(axes, dims_mapping):
register_distributed_operator_impl_container(DistributedReduceSum("reduce_sum"))


class DistributedReduceSumPrimtive(DistributedOperatorImplContainer):
class DistributedReduceSumPrimitive(DistributedOperatorImplContainer):
def __init__(self, op_type):
super().__init__(op_type)


register_distributed_operator_impl_container(
DistributedReduceSumPrimtive("reduce_sum_p")
DistributedReduceSumPrimitive("reduce_sum_p")
)


# Batch Dimension ReduceSum Primitive
class DistributedReduceSumPrimtiveImpl0(DistributedOperatorImpl):
class DistributedReduceSumPrimitiveImpl0(DistributedOperatorImpl):
def __init__(self, name):
super().__init__(name)
self._forward_implemented = True
@@ -243,5 +243,5 @@ def backward(ctx, *args, **kwargs):

register_distributed_operator_impl(
"reduce_sum_p",
DistributedReduceSumPrimtiveImpl0("batch_dimension_reduce_sum_p"),
DistributedReduceSumPrimitiveImpl0("batch_dimension_reduce_sum_p"),
)
10 changes: 5 additions & 5 deletions python/paddle/distributed/fleet/fleet.py
@@ -391,7 +391,7 @@ def allreduce_perf(
)
if perf_threshold_time > -1 and ret > perf_threshold_time:
logger.warning(
f"[Perf Warnning] AllReduce Test Timeout! {ret} > {perf_threshold_time}"
f"[Perf Warning] AllReduce Test Timeout! {ret} > {perf_threshold_time}"
)

# test reduce perf
@@ -412,7 +412,7 @@ def reduce_perf(self, iteration, x, group, perf_size, perf_threshold_time):
)
if perf_threshold_time > -1 and ret > perf_threshold_time:
logger.warning(
f"[Perf Warnning] Reduce Test Timeout! {ret} > {perf_threshold_time}"
f"[Perf Warning] Reduce Test Timeout! {ret} > {perf_threshold_time}"
)

# test broadcast perf
@@ -435,7 +435,7 @@ def broadcast_perf(
)
if perf_threshold_time > -1 and ret > perf_threshold_time:
logger.warning(
f"[Perf Warnning] Broadcast Test Timeout! {ret} > {perf_threshold_time}"
f"[Perf Warning] Broadcast Test Timeout! {ret} > {perf_threshold_time}"
)

# test allgather perf
@@ -459,7 +459,7 @@ def allgather_perf(
)
if perf_threshold_time > -1 and ret > perf_threshold_time:
logger.warning(
f"[Perf Warnning] Allgather Test Timeout! {ret} > {perf_threshold_time}"
f"[Perf Warning] Allgather Test Timeout! {ret} > {perf_threshold_time}"
)

# test reduce_scatter perf
@@ -502,7 +502,7 @@ def reduce_scatter_perf(
)
if perf_threshold_time > -1 and ret > perf_threshold_time:
logger.warning(
f"[Perf Warnning] ReduceScatter Test Timeout! {ret} > {perf_threshold_time}"
f"[Perf Warning] ReduceScatter Test Timeout! {ret} > {perf_threshold_time}"
)

def _collective_perf_impl(self, round=50, context={}, hcg=None):
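
The same threshold check and warning recur in each collective test above. Purely as an illustration (not part of this PR), the repeated pattern could be expressed with a small hypothetical helper like the one below:

import logging

logger = logging.getLogger(__name__)

def warn_if_timeout(op_name: str, elapsed: float, threshold: float) -> None:
    # A negative threshold (e.g. -1) disables the check.
    if threshold > -1 and elapsed > threshold:
        logger.warning(
            f"[Perf Warning] {op_name} Test Timeout! {elapsed} > {threshold}"
        )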
4 changes: 2 additions & 2 deletions python/paddle/distributed/fleet/fleet_executor_utils.py
@@ -119,7 +119,7 @@ def task_node(self):
def set_program(self, program):
assert (
self.lazy_initialize
), "Inside program is unchangable for immediate initialized task node. Set the lazy_initialize to be true if the inside program need to be update. Remember to do all your change before eval node.task_node()."
), "Inside program is unchangeable for immediate initialized task node. Set the lazy_initialize to be true if the inside program need to be update. Remember to do all your change before eval node.task_node()."
self.program = program

def get_program(self):
@@ -423,7 +423,7 @@ def run1f1b(
):
"""
Split the program to support 1f1b pipeline scheduler.
This funct will split the program based on the op_role.
This function will split the program based on the op_role.
The program will be split into four parts: lr_sched, fwd, bwd, opt.
And will create task nodes based on the four parts of the program.
:param program: The origin program.
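
To make the docstring concrete, here is a rough, framework-agnostic sketch of the splitting idea it describes: bucket operators into the four sections by their role attribute. The OpRole enum and the op.role attribute below are illustrative stand-ins, not Paddle's actual API.

from enum import Enum

class OpRole(Enum):  # illustrative stand-in for the framework's op_role values
    LR_SCHED = 0
    FORWARD = 1
    BACKWARD = 2
    OPTIMIZE = 3

def split_ops_by_role(ops):
    # Group operators into the four 1F1B sections: lr_sched, fwd, bwd, opt.
    sections = {role: [] for role in OpRole}
    for op in ops:
        sections[op.role].append(op)
    return sections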
2 changes: 1 addition & 1 deletion python/paddle/distributed/fleet/launch_utils.py
@@ -321,7 +321,7 @@ def get_cluster(


def terminate_local_procs(procs):
# try to terminate process by group, this happend in multiprocess senario in user process
# try to terminate process by group, this happened in multiprocess scenario in user process
if os.name != 'nt':
for p in procs:
if p.proc.poll() is None:
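
As background for the comment about terminating by group on non-Windows platforms, a minimal standalone sketch (illustrative, not the launcher's actual code) of signalling an entire process group on POSIX:

import os
import signal

def terminate_group(pid: int) -> None:
    # On POSIX, signal the whole process group so children spawned by the
    # user process are terminated together; Windows ('nt') has no killpg.
    if os.name != 'nt':
        os.killpg(os.getpgid(pid), signal.SIGTERM)
    else:
        os.kill(pid, signal.SIGTERM)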
32 changes: 16 additions & 16 deletions test/legacy_test/test_merged_momentum_op.py
@@ -24,7 +24,7 @@
def run_momentum_op(
params,
grads,
velocitys,
velocities,
master_params,
learning_rate,
place,
@@ -34,7 +34,7 @@ def run_momentum_op(
use_merged=False,
):
assert len(params) == len(grads)
assert len(params) == len(velocitys)
assert len(params) == len(velocities)
if multi_precision:
assert len(params) == len(master_params)
op_type = 'merged_momentum' if use_merged else 'momentum'
@@ -61,7 +61,7 @@ def run_momentum_op(
helper.create_variable(
persistable=True, shape=v.shape, dtype=v.dtype
)
for v in velocitys
for v in velocities
]
lr_var = helper.create_variable(
persistable=True,
@@ -83,7 +83,7 @@ def run_momentum_op(
OrderedDict(
[
(v_var.name, v_val)
for v_var, v_val in zip(velocity_vars, velocitys)
for v_var, v_val in zip(velocity_vars, velocities)
]
)
)
@@ -162,7 +162,7 @@ def run_momentum_op(
def run_momentum_op2(
params,
grads,
velocitys,
velocities,
master_params,
learning_rate,
place,
@@ -173,7 +173,7 @@ def run_momentum_op2(
use_nesterov=True,
):
assert len(params) == len(grads)
assert len(params) == len(velocitys)
assert len(params) == len(velocities)
if multi_precision:
assert len(params) == len(master_params)
op_type = 'merged_momentum' if use_merged else 'momentum'
@@ -195,7 +195,7 @@ def run_momentum_op2(
helper.create_variable(
persistable=True, shape=v.shape, dtype=v.dtype
)
for v in velocitys
for v in velocities
]
lr_var = helper.create_variable(
persistable=True,
@@ -217,7 +217,7 @@ def run_momentum_op2(
OrderedDict(
[
(v_var.name, v_val)
for v_var, v_val in zip(velocity_vars, velocitys)
for v_var, v_val in zip(velocity_vars, velocities)
]
)
)
@@ -331,19 +331,19 @@ def prepare_data(self, shapes, multi_precision, seed, place):
)
params = self.gen_rand_data(shapes, dtype)
grads = self.gen_rand_data(shapes, dtype)
velocitys = self.gen_rand_data(shapes, mp_dtype)
velocities = self.gen_rand_data(shapes, mp_dtype)
learning_rate = self.gen_rand_data([[1]], mp_dtype)[0]
if multi_precision:
master_params = [p.astype(mp_dtype) for p in params]
else:
master_params = None
return params, grads, velocitys, master_params, learning_rate
return params, grads, velocities, master_params, learning_rate

def check_with_place(self, place, multi_precision):
(
params,
grads,
velocitys,
velocities,
master_params,
learning_rate,
) = self.prepare_data(self.shapes, multi_precision, self.seed, place)
@@ -354,7 +354,7 @@ def run_op(use_merged):
return run_momentum_op(
params,
grads,
velocitys,
velocities,
master_params,
learning_rate,
place,
@@ -403,19 +403,19 @@ def prepare_data(self, shapes, multi_precision, seed, place):
)
params = self.gen_rand_data(shapes, dtype)
grads = self.gen_rand_data(shapes, dtype)
velocitys = self.gen_rand_data(shapes, mp_dtype)
velocities = self.gen_rand_data(shapes, mp_dtype)
learning_rate = self.gen_rand_data([[1]], mp_dtype)[0]
if multi_precision:
master_params = [p.astype(mp_dtype) for p in params]
else:
master_params = None
return params, grads, velocitys, master_params, learning_rate
return params, grads, velocities, master_params, learning_rate

def check_with_place(self, place, multi_precision):
(
params,
grads,
velocitys,
velocities,
master_params,
learning_rate,
) = self.prepare_data(self.shapes, multi_precision, self.seed, place)
@@ -426,7 +426,7 @@ def run_op(use_nesterov, use_merged):
return run_momentum_op2(
params,
grads,
velocitys,
velocities,
master_params,
learning_rate,
place,
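
For reference, these tests compare the fused and unfused ops against the usual momentum update. Below is a rough NumPy sketch of that update as commonly defined, including the Nesterov variant; consult the op documentation for the authoritative formula.

import numpy as np

def momentum_update(param, grad, velocity, lr, mu, use_nesterov=False):
    # velocity_out = mu * velocity + grad
    velocity_out = mu * velocity + grad
    if use_nesterov:
        # Nesterov momentum looks ahead along the updated velocity.
        param_out = param - (grad + mu * velocity_out) * lr
    else:
        param_out = param - lr * velocity_out
    return param_out, velocity_out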
12 changes: 6 additions & 6 deletions test/legacy_test/test_momentum_op.py
@@ -184,7 +184,7 @@ def setUp(self):

params = []
grads = []
velocitys = []
velocities = []
learning_rates = []
master_params = []
param_outs = []
@@ -216,7 +216,7 @@ def setUp(self):

params.append(("SubParam_" + str(i), param))
grads.append(("SubGrad_" + str(i), grad))
velocitys.append(("SubVelocity_" + str(i), velocity))
velocities.append(("SubVelocity_" + str(i), velocity))
learning_rates.append(("SubLearning_rate_" + str(i), learning_rate))
velocity_outs.append(("SubVelocity_out_" + str(i), velocity_out))
param_outs.append(("SubParam_out_" + str(i), param_out))
@@ -228,7 +228,7 @@ def setUp(self):
self.inputs = {
'Param': params,
'Grad': grads,
'Velocity': velocitys,
'Velocity': velocities,
'LearningRate': learning_rates,
'MasterParam': master_params,
}
@@ -268,7 +268,7 @@ def setUp(self):

params = []
grads = []
velocitys = []
velocities = []
param_outs = []
velocity_outs = []
learning_rates = []
@@ -292,15 +292,15 @@

params.append(("SubParam_" + str(i), param))
grads.append(("SubGrad_" + str(i), grad))
velocitys.append(("SubVelocity_" + str(i), velocity))
velocities.append(("SubVelocity_" + str(i), velocity))
learning_rates.append(("SubLearning_rate_" + str(i), learning_rate))
velocity_outs.append(("SubVelocity_out_" + str(i), velocity_out))
param_outs.append(("SubParam_out_" + str(i), param_out))

self.inputs = {
'Param': params,
'Grad': grads,
'Velocity': velocitys,
'Velocity': velocities,
'LearningRate': learning_rates,
}

2 changes: 1 addition & 1 deletion test/legacy_test/test_mul_op.py
@@ -312,7 +312,7 @@ def test_check_grad_ignore_y(self):
)


# TODO: verify the requirments of CUDA ARCH
# TODO: verify the requirements of CUDA ARCH
@unittest.skipIf(
not core.is_compiled_with_cuda() or get_cuda_version() < 11060,
"MatmulInt8 requires CUDA >= 11.6",
10 changes: 5 additions & 5 deletions test/legacy_test/test_multi_label_soft_margin_loss.py
@@ -26,10 +26,10 @@ def call_MultiLabelSoftMarginLoss_layer(
weight=None,
reduction='mean',
):
multilabel_margin_loss = paddle.nn.MultiLabelSoftMarginLoss(
multi_label_margin_loss = paddle.nn.MultiLabelSoftMarginLoss(
weight=weight, reduction=reduction
)
res = multilabel_margin_loss(
res = multi_label_margin_loss(
input=input,
label=label,
)
@@ -115,7 +115,7 @@ def test_dygraph(
return dy_result


def calc_multilabel_margin_loss(
def calc_multi_label_margin_loss(
input,
label,
weight=None,
@@ -151,7 +151,7 @@ def test_MultiLabelSoftMarginLoss(self):
reductions = ['sum', 'mean', 'none']
for place in places:
for reduction in reductions:
expected = calc_multilabel_margin_loss(
expected = calc_multi_label_margin_loss(
input=input, label=label, reduction=reduction
)

@@ -218,7 +218,7 @@ def test_MultiLabelSoftMarginLoss_weights(self):
weight = np.random.randint(0, 2, size=(5, 5)).astype(np.float64)
place = 'cpu'
reduction = 'mean'
expected = calc_multilabel_margin_loss(
expected = calc_multi_label_margin_loss(
input=input, label=label, weight=weight, reduction=reduction
)

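The renamed reference function in these tests computes the multi-label soft margin loss. A compact NumPy sketch of the commonly used definition (sigmoid binary cross-entropy averaged over classes, with optional per-class weights) follows for orientation; it is not the test file's exact implementation.

import numpy as np

def multi_label_soft_margin_loss(logits, labels, weight=None, reduction='mean'):
    # Element-wise binary cross-entropy on sigmoid(logits).
    sig = 1.0 / (1.0 + np.exp(-logits))
    loss = -(labels * np.log(sig) + (1.0 - labels) * np.log(1.0 - sig))
    if weight is not None:
        loss = loss * weight
    loss = loss.mean(axis=-1)  # average over the class dimension
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss  # reduction == 'none'
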
2 changes: 1 addition & 1 deletion test/legacy_test/test_multinomial_op.py
@@ -393,7 +393,7 @@ def test_fixed_random_number(self):
if not paddle.is_compiled_with_cuda():
return

# Different GPU generatte different random value. Only test V100 here.
# Different GPU generate different random value. Only test V100 here.
if "V100" not in paddle.device.cuda.get_device_name():
return
