[Paddle Inference] refactor linear_compress #55490

Merged: 18 commits, Aug 22, 2023
37 changes: 19 additions & 18 deletions paddle/phi/api/yaml/ops.yaml
@@ -1437,15 +1437,15 @@
data_transform :
skip_transform : out_size, size_tensor, scale_tensor

- op : llm_int8_matmul
args : (Tensor x, Tensor weight, Tensor weight_scale, float threshold=6.0)
- op : llm_int8_linear
args : (Tensor x, Tensor weight, Tensor bias, Tensor weight_scale, float threshold=6.0)
output : Tensor(out)
infer_meta :
func : LLMInt8MatmulInferMeta
param : [x, weight]
func : LLMInt8LinearInferMeta
kernel :
func : llm_int8_matmul
func : llm_int8_linear
data_type : x
optional: bias

- op : log
args : (Tensor x)
@@ -2013,15 +2013,6 @@
func : qr
backward : qr_grad

- op : quant_for_compress
args : (Tensor x, int bits = 8, str layout = "weight_only")
output : Tensor(out), Tensor(scale)
infer_meta :
func : QuantForCompressInferMeta
kernel :
func : quant_for_compress
data_type: x

- op : real
args : (Tensor x)
output : Tensor (out)
@@ -2768,14 +2759,24 @@
intermediate: warprnntgrad
backward : warprnnt_grad

- op : weight_only_matmul
args : (Tensor x, Tensor weight, Tensor weight_scale)
- op : weight_only_linear
args : (Tensor x, Tensor weight, Tensor bias, Tensor weight_scale, str weight_dtype)
output : Tensor(out)
infer_meta :
func : WeightOnlyMatmulInferMeta
func : WeightOnlyLinearInferMeta
kernel :
func : weight_only_matmul
func : weight_only_linear
data_type : x
optional: bias

- op : weight_quantize
args : (Tensor x, str algo = "weight_only_int8")
output : Tensor(out), Tensor(scale)
infer_meta :
func : WeightQuantizeInferMeta
kernel :
func : weight_quantize
data_type: x

- op : weighted_sample_neighbors
args : (Tensor row, Tensor colptr, Tensor edge_weight, Tensor input_nodes, Tensor eids, int sample_size, bool return_eids)
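
For orientation, the three ops renamed in ops.yaml above surface as Python-level helpers. The sketch below shows how they are expected to compose; the module path paddle.nn.quant and the argument defaults are assumptions based on Paddle's public quantization API, not something this diff itself shows:

import paddle
from paddle.nn.quant import llm_int8_linear, weight_only_linear, weight_quantize

# Quantize an fp16 weight stored as [in_features, out_features]; per
# WeightQuantizeInferMeta below, in_features must be divisible by 64
# and out_features by 16.
w = paddle.randn([4096, 12288], dtype="float16")
qw, scale = weight_quantize(w, algo="weight_only_int8")  # qw: [12288, 4096] int8, scale: [12288]

x = paddle.randn([2, 16, 4096], dtype="float16")
y = weight_only_linear(x, qw, bias=None, weight_scale=scale, weight_dtype="int8")  # [2, 16, 12288]
z = llm_int8_linear(x, qw, bias=None, weight_scale=scale, threshold=6.0)           # [2, 16, 12288]
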
145 changes: 93 additions & 52 deletions paddle/phi/infermeta/multiary.cc
@@ -2468,6 +2468,53 @@ void LambInferMeta(const MetaTensor& param,
}
}

void LLMInt8LinearInferMeta(const MetaTensor& x,
const MetaTensor& weight,
const MetaTensor& bias,
const MetaTensor& weight_scale,
const float threshold,
MetaTensor* out) {
auto x_dims = x.dims();
auto w_dims = weight.dims();
PADDLE_ENFORCE_EQ(
w_dims.size(),
2UL,
errors::InvalidArgument("The input(weight) must be a 2D Tensor."));
PADDLE_ENFORCE_EQ(
x_dims[x_dims.size() - 1],
w_dims[1],
errors::InvalidArgument(
"Input(X) dim[-1] and Input(Weight) dim[1] should be equal. "
"But received Input(X) dim[-1](%s) != Input(Weight) dim[1](%s)",
x_dims[x_dims.size() - 1],
w_dims[1]));
PADDLE_ENFORCE_EQ(
w_dims[0] % 16,
0,
phi::errors::InvalidArgument(
"The first dimension of input(weight) must be divisible by 16, but got[%d]",
w_dims[0]));
PADDLE_ENFORCE_EQ(
w_dims[1] % 16,
0,
phi::errors::InvalidArgument(
"The second dimension of input(weight) must be divisible by 16, but got[%d]",
w_dims[1]));
PADDLE_ENFORCE_EQ(
weight_scale.dims()[0],
w_dims[0],
errors::InvalidArgument(
"Input(weight_scale) dim[0] and Input(Weight) dim[0] should be equal. "
"But received Input(weight_scale) dim[0](%s) != Input(Weight) "
"dim[0](%s)",
weight_scale.dims()[0],
w_dims[0]));
auto out_dims = x_dims;
out_dims[out_dims.size() - 1] = w_dims[0];
out->set_dims(out_dims);
out->set_dtype(x.dtype());
}
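
The shape contract above: weight is stored transposed as [n, k], x's trailing dimension must equal k = w_dims[1], both weight dimensions must be multiples of 16 (presumably a kernel tiling requirement), and the output keeps x's shape except that the trailing dimension becomes n = w_dims[0]. A pure-Python mirror of the rule, for illustration only (not a Paddle API):

def llm_int8_linear_out_shape(x_shape, w_shape, scale_shape):
    n, k = w_shape                       # weight is stored as [n, k]
    assert x_shape[-1] == k              # x dim[-1] must match weight dim[1]
    assert n % 16 == 0 and k % 16 == 0   # divisibility checks above
    assert scale_shape[0] == n           # one scale per output channel
    return x_shape[:-1] + [n]            # out: trailing dim becomes n

assert llm_int8_linear_out_shape([2, 16, 4096], [12288, 4096], [12288]) == [2, 16, 12288]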

void LogspaceInferMeta(const MetaTensor& start,
const MetaTensor& stop,
const MetaTensor& number,
@@ -3598,6 +3645,52 @@ void WarprnntInferMeta(const MetaTensor& input,
loss->set_dtype(input.dtype());
}

void WeightOnlyLinearInferMeta(const MetaTensor& x,
const MetaTensor& weight,
const MetaTensor& bias,
const MetaTensor& weight_scale,
const std::string& weight_dtype,
MetaTensor* out) {
auto x_dims = x.dims();
auto w_dims = weight.dims();
auto n = weight_scale.dims()[0];
PADDLE_ENFORCE(
weight_dtype == "int8" || weight_dtype == "int4",
errors::InvalidArgument("weight_dtype must be 'int8' or 'int4'."));
PADDLE_ENFORCE_EQ(
w_dims.size(),
2UL,
errors::InvalidArgument("The input(weight) must be a 2D Tensor."));
PADDLE_ENFORCE_EQ(
weight_scale.dims().size(),
1UL,
errors::InvalidArgument("The input(weight_scale) must be a 1D Tensor."));
PADDLE_ENFORCE_EQ(
w_dims[0] % 16,
0,
phi::errors::InvalidArgument(
"The first dimension of input(weight) must be divisible by 16, but got[%d]",
w_dims[0]));
PADDLE_ENFORCE_EQ(
w_dims[1] % 16,
0,
phi::errors::InvalidArgument(
"The second dimension of input(weight) must be divisible by 16, but got[%d]",
w_dims[1]));
PADDLE_ENFORCE_EQ(
x_dims[x_dims.size() - 1],
w_dims[1],
errors::InvalidArgument(
"Input(X) dim[-1] and Input(Weight) dim[1] should be equal. "
"But received Input(X) dim[-1](%s) != Input(Weight) dim[1](%s)",
x_dims[x_dims.size() - 1],
w_dims[1]));
auto out_dims = x_dims;
out_dims[out_dims.size() - 1] = n;
out->set_dims(out_dims);
out->set_dtype(x.dtype());
}
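
Note that the output width n is taken from weight_scale rather than from the weight tensor itself. This matters for weight_dtype == "int4": the quantized weight arrives packed, two 4-bit values per int8 byte ([n/2, k] per WeightQuantizeInferMeta in unary.cc), so only weight_scale, with one entry per output channel, still carries the true n. A shapes-only sketch of the int4 case, with illustrative values:

n, k = 12288, 4096
packed_weight_shape = [n // 2, k]      # what weight_only_linear receives for int4
weight_scale_shape = [n]               # still one scale per output channel
out_last_dim = weight_scale_shape[0]   # so n must come from weight_scale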

void WhereInferMeta(const MetaTensor& condition,
const MetaTensor& x,
const MetaTensor& y,
@@ -3931,58 +4024,6 @@ void WeightedSampleNeighborsInferMeta(const MetaTensor& row,
out_count->set_dtype(DataType::INT32);
}

void LLMInt8MatmulInferMeta(const MetaTensor& x,
const MetaTensor& weight,
MetaTensor* out) {
auto x_dims = x.dims();
auto w_dims = weight.dims();
PADDLE_ENFORCE_EQ(
w_dims.size(),
2UL,
errors::InvalidArgument("The input(weight) must be a 2D Tensor."));
PADDLE_ENFORCE_EQ(
x_dims[x_dims.size() - 1],
w_dims[1],
errors::InvalidArgument(
"Input(X) dim[-1] and Input(Weight) dim[1] should be euqal."
"But received Input(X) dim[-1](%s) != Input(Weight) dim[1](%s)",
x_dims[x_dims.size() - 1],
w_dims[1]));
auto out_dims = x_dims;
out_dims[out_dims.size() - 1] = w_dims[0];
out->set_dims(out_dims);
out->set_dtype(x.dtype());
}

void WeightOnlyMatmulInferMeta(const MetaTensor& x,
const MetaTensor& weight,
const MetaTensor& weight_scale,
MetaTensor* out) {
auto x_dims = x.dims();
auto w_dims = weight.dims();
auto n = weight_scale.dims()[0];
PADDLE_ENFORCE_EQ(
w_dims.size(),
2UL,
errors::InvalidArgument("The input(weight) must be a 2D Tensor."));
PADDLE_ENFORCE_EQ(
weight_scale.dims().size(),
1UL,
errors::InvalidArgument("The input(weight_scale) must be a 1D Tensor."));
PADDLE_ENFORCE_EQ(
x_dims[x_dims.size() - 1],
w_dims[1],
errors::InvalidArgument(
"Input(X) dim[-1] and Input(Weight) dim[1] should be euqal."
"But received Input(X) dim[-1](%s) != Input(Weight) dim[1](%s)",
x_dims[x_dims.size() - 1],
w_dims[1]));
auto out_dims = x_dims;
out_dims[out_dims.size() - 1] = n;
out->set_dims(out_dims);
out->set_dtype(x.dtype());
}

void MaskedMultiheadAttentionInferMeta(const MetaTensor& x,
const MetaTensor& cache_kv,
const MetaTensor& src_mask,
23 changes: 14 additions & 9 deletions paddle/phi/infermeta/multiary.h
@@ -426,6 +426,13 @@ void LambInferMeta(const MetaTensor& param,
MetaTensor* beta2_pow_out,
MetaTensor* master_param_outs);

void LLMInt8LinearInferMeta(const MetaTensor& x,
const MetaTensor& weight,
const MetaTensor& bias,
const MetaTensor& weight_scale,
const float threshold,
MetaTensor* out);

void LogspaceInferMeta(const MetaTensor& start,
const MetaTensor& stop,
const MetaTensor& number,
@@ -656,6 +663,13 @@ void WarprnntInferMeta(const MetaTensor& input,
MetaTensor* loss,
MetaTensor* warpctcgrad);

void WeightOnlyLinearInferMeta(const MetaTensor& x,
const MetaTensor& weight,
const MetaTensor& bias,
const MetaTensor& weight_scale,
const std::string& weight_dtype,
MetaTensor* out);

void WeightedSampleNeighborsInferMeta(const MetaTensor& row,
const MetaTensor& col_ptr,
const MetaTensor& edge_weight,
@@ -755,15 +769,6 @@ void FusedMultiHeadAttentionVariableInferMeta(const MetaTensor& query,
bool causal,
MetaTensor* out);

void LLMInt8MatmulInferMeta(const MetaTensor& x,
const MetaTensor& weight,
MetaTensor* out);

void WeightOnlyMatmulInferMeta(const MetaTensor& x,
const MetaTensor& weight,
const MetaTensor& weight_scale,
MetaTensor* out);

void FusedRopeInferMeta(const MetaTensor& q,
const MetaTensor& k,
const MetaTensor& v,
82 changes: 42 additions & 40 deletions paddle/phi/infermeta/unary.cc
@@ -5030,6 +5030,48 @@ void UnStackInferMeta(const MetaTensor& x,
}
}

void WeightQuantizeInferMeta(const MetaTensor& x,
const std::string& algo,
MetaTensor* out,
MetaTensor* scale) {
auto x_dims = x.dims();
PADDLE_ENFORCE_EQ(
x_dims.size(),
2UL,
phi::errors::InvalidArgument(
"The x tensor of quant op must be 2D, but got[%d]", x_dims.size()));
PADDLE_ENFORCE_EQ(
x_dims[0] % 64,
0,
phi::errors::InvalidArgument(
"The first dimension of input must be divisible by 64, but got[%d]",
x_dims[0]));
PADDLE_ENFORCE_EQ(
x_dims[1] % 16,
0,
phi::errors::InvalidArgument(
"The second dimension of input must be divisible by 16, but got[%d]",
x_dims[1]));
std::vector<int64_t> dim_scale({x_dims[1]});
std::vector<int64_t> dim_out;
if (algo == "weight_only_int8" || algo == "llm.int8") {
dim_out = std::vector<int64_t>({x_dims[1], x_dims[0]});
} else if (algo == "weight_only_int4") {
dim_out = std::vector<int64_t>({x_dims[1] / 2, x_dims[0]});
} else {
PADDLE_THROW(phi::errors::InvalidArgument(
"The algo must be in ['weight_only_int8', 'weight_only_int4', "
"'llm.int8'], but got[%s]",
algo));
}
out->set_dims(phi::make_ddim(dim_out));

out->set_dtype(DataType::INT8);

scale->set_dims(phi::make_ddim(dim_scale));
scale->set_dtype(DataType::FLOAT32);
}
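
In both supported branches the quantized weight comes out transposed relative to the input (a [k, n] input yields an [n, k] output), and the int4 branch additionally halves the leading dimension because two 4-bit values are packed into each int8 byte. A pure-Python mirror of the output shapes, for illustration only (not a Paddle API):

def weight_quantize_out_shapes(x_shape, algo="weight_only_int8"):
    k, n = x_shape                      # x is the fp weight, stored [k, n]
    assert k % 64 == 0 and n % 16 == 0  # checks enforced above
    if algo in ("weight_only_int8", "llm.int8"):
        out = [n, k]                    # one byte per value, transposed
    elif algo == "weight_only_int4":
        out = [n // 2, k]               # two 4-bit values packed per byte
    else:
        raise ValueError(f"unsupported algo: {algo}")
    return out, [n]                     # (quantized weight shape, scale shape)

assert weight_quantize_out_shapes([4096, 12288]) == ([12288, 4096], [12288])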

void ChannelShuffleInferMeta(const MetaTensor& x,
int groups,
const std::string& data_format,
@@ -5090,46 +5132,6 @@ void CheckNumericsInferMeta(const MetaTensor& tensor,
values->set_dims(phi::make_ddim({3}));
}

void QuantForCompressInferMeta(const MetaTensor& x,
int bits,
const std::string& layout,
MetaTensor* out,
MetaTensor* scale) {
auto x_dims = x.dims();
PADDLE_ENFORCE_EQ(
x_dims.size(),
2UL,
phi::errors::InvalidArgument(
"The x tensor of quant op must be 2D, but got[%d]", x_dims.size()));
PADDLE_ENFORCE_GE(
x_dims[0],
64,
phi::errors::OutOfRange("The first dimension of input is out of range "
"(expected at least 64, but got %ld).",
x_dims[0]));
PADDLE_ENFORCE_EQ(
x_dims[0] % 64,
0,
phi::errors::InvalidArgument(
"The first dimension of input must be divisible by 64, but got[%d]",
x_dims[0]));
std::vector<int64_t> dim_scale({x_dims[1]});
std::vector<int64_t> dim_out;
if (bits == 8) {
dim_out = std::vector<int64_t>({x_dims[1], x_dims[0]});
} else if (bits == 4) {
dim_out = std::vector<int64_t>({x_dims[1] / 2, x_dims[0]});
} else {
phi::errors::InvalidArgument("The bit must be 8 or 4, but got %d", bits);
}
out->set_dims(phi::make_ddim(dim_out));

out->set_dtype(DataType::INT8);

scale->set_dims(phi::make_ddim(dim_scale));
scale->set_dtype(DataType::FLOAT32);
}

void StridedUnChangedInferMeta(const MetaTensor& x, MetaTensor* out) {
out->share_meta(x);
out->set_strides(x.strides());
11 changes: 5 additions & 6 deletions paddle/phi/infermeta/unary.h
@@ -436,6 +436,11 @@ void QrInferMeta(const MetaTensor& x,
MetaTensor* q,
MetaTensor* r);

void WeightQuantizeInferMeta(const MetaTensor& x,
const std::string& algo,
MetaTensor* out,
MetaTensor* scale);

void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out);

void ReduceInferMeta(const MetaTensor& x,
@@ -728,12 +733,6 @@ void UnStackInferMeta(const MetaTensor& x,
int num,
std::vector<MetaTensor*> outs);

void QuantForCompressInferMeta(const MetaTensor& x,
int bits,
const std::string& layout,
MetaTensor* out,
MetaTensor* scale);

void StridedUnChangedInferMeta(const MetaTensor& x, MetaTensor* out);

} // namespace phi