some bug fix for sparse matrix #133

Merged: 2 commits merged on Sep 30, 2016
4 changes: 4 additions & 0 deletions paddle/cuda/include/hl_sparse.h
@@ -223,6 +223,7 @@ extern void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
* @param[in] dimK width of op(A) & height of op(B)
* @param[in] alpha scalar used for multiplication.
* @param[in] beta scalar used for multiplication.
* If beta is zero, C does not have to be a valid input.
*
* @note transb is not support HPPL_OP_T.
*
@@ -251,6 +252,7 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
* @param[in] dimK width of op(A) & height of op(B)
* @param[in] alpha scalar used for multiplication.
* @param[in] beta scalar used for multiplication.
* If beta is zero, C does not have to be a valid input.
*
* @note transb is not support HPPL_OP_T.
*
@@ -275,6 +277,7 @@ extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
* @param[in] dimK width of op(A) & height of op(B)
* @param[in] alpha scalar used for multiplication.
* @param[in] beta scalar used for multiplication.
* If beta is zero, C does not have to be a valid input.
*
* @note transa is not support HPPL_OP_T.
*
@@ -327,6 +330,7 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
* @param[in] dimK width of op(A) & height of op(B)
* @param[in] alpha scalar used for multiplication.
* @param[in] beta scalar used for multiplication.
* If beta is zero, C does not have to be a valid input.
*
*
* @note transa is not support HPPL_OP_T.
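The four documentation lines added above all describe the same GEMM-style contract, C = alpha * op(A) * op(B) + beta * C, and make explicit that the kernels never read C when beta is zero. Below is a minimal dense, host-side sketch of that contract (hypothetical reference code using float; not part of the PR and not the HPPL API):

```cuda
/* Reference semantics only; the real routines take hl_sparse_matrix_s
 * operands and run on the GPU. */
void gemm_reference(const float *A, const float *B, float *C,
                    int M, int N, int K, float alpha, float beta) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float sum = 0.0f;
      for (int k = 0; k < K; ++k) {
        sum += A[i * K + k] * B[k * N + j];
      }
      /* When beta == 0, C is write-only, so it may hold uninitialized data;
       * this is the "C does not have to be a valid input" guarantee. */
      C[i * N + j] = (beta == 0.0f)
                         ? alpha * sum
                         : alpha * sum + beta * C[i * N + j];
    }
  }
}
```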
107 changes: 35 additions & 72 deletions paddle/cuda/src/hl_cuda_sparse.cu
@@ -562,6 +562,22 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
}
}

/**
* Calculate beta * C, if beta is zero, C does not have to be a valid input.
*/
static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
if (beta == 0.0) {
hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
Contributor @qingqing01 commented on Sep 29, 2016:
I see that "=" is used in the kernel when beta == 0, namely:

+__device__ __forceinline__
+void _calculate_c(real &c, real sum) {
+  c = sum;
+}

So could this zero-memory operation be removed? Otherwise it will zero C every time, although that may not be time-consuming.

The PR author (Contributor) replied:
_beta_mul_c is called before the kernel KeSMatrixDenseMulCsr; that kernel does not call _calculate_c again (see the sketch contrasting the two kernel styles after this file's diff).

} else {
if (beta != 1.0){
hl_gpu_apply_unary_op(
unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
}
}

return;
}

void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
real *B_d, hl_trans_op_t transb,
real *C_d,
@@ -580,15 +596,8 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
}

if (A_d->nnz == 0) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
} else {
return;
}
_beta_mul_c(C_d, dimM, dimN, beta);
return;
}

/* nnz != 0 */
@@ -633,13 +642,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
beta);
}
} else if (HPPL_OP_T == transa) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
}
_beta_mul_c(C_d, dimM, dimN, beta);

int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) /
CU_CSC_MUL_DENSE_BLOCK_N;
@@ -699,15 +702,8 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
<< "matrix format error!";

if (B_d->nnz == 0) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
} else {
return;
}
_beta_mul_c(C_d, dimM, dimN, beta);
return;
}

/* nnz != 0 */
@@ -750,13 +746,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
beta);
}
} else if (transb == HPPL_OP_T) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
}
_beta_mul_c(C_d, dimM, dimN, beta);
int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X;
int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M;
dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
@@ -813,15 +803,8 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
<< "matrix format error!";

if (B_d->nnz == 0) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
} else {
return;
}
_beta_mul_c(C_d, dimM, dimN, beta);
return;
}

/* nnz != 0 */
@@ -833,14 +816,7 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
}

if (transb == HPPL_OP_N) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
}

_beta_mul_c(C_d, dimM, dimN, beta);
int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X;
int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M;
dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
@@ -925,15 +901,8 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
}

if (A_d->nnz == 0) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
} else {
return;
}
_beta_mul_c(C_d, dimM, dimN, beta);
return;
}

/* nnz != 0 */
@@ -945,13 +914,7 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
}

if (HPPL_OP_N == transa) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
}
_beta_mul_c(C_d, dimM, dimN, beta);

int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N -1)/CU_CSC_MUL_DENSE_BLOCK_N;
int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K -1)/CU_CSC_MUL_DENSE_BLOCK_K;
@@ -1113,7 +1076,7 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
CHECK(!transA) << "Not supported A is trans and B is not trans!";

dim3 block(CU_BLOCK_SIZE, 1);
int avgNnzPerRow = C_d2->nnz_s / dimM;
int avgNnzPerRow = C_d->nnz / dimM;
avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
dim3 grid(gridx, dimM);
@@ -1242,9 +1205,9 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
LOG(FATAL) << "parameter B is null!";
}

if (B_d2->nnz_s == 0) return;
if (B_d->nnz == 0) return;

int nnz = B_d2->nnz_s;
int nnz = B_d->nnz;
int block = 512;
int grid = DIVUP(nnz, 512);
KeSMatrixCsrColumnSum<<<grid, block, 0, STREAM_DEFAULT>>>(
@@ -1273,9 +1236,9 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
LOG(FATAL) << "parameter A_d is null!";
}

if (A_d2->nnz_s == 0) return;
if (A_d->nnz == 0) return;

int nnz = A_d2->nnz_s;
int nnz = A_d->nnz;
int block = 512;
int grid = DIVUP(nnz, 512);
KeSMatrixCsrAddBias<<<grid, block, 0, STREAM_DEFAULT>>>(
@@ -1308,9 +1271,9 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
LOG(FATAL) << "parameter A_d is null!";
}

if (A_d2->nnz_s == 0) return;
if (A_d->nnz == 0) return;

int gridX = DIVUP((A_d2->nnz_s / dimM), 512);
int gridX = DIVUP((A_d->nnz / dimM), 512);
gridX = gridX > 0 ? gridX : 1;
dim3 block(512, 1);
dim3 grid(gridX, dimM);
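The review thread above asks why _beta_mul_c zeroes C at all when beta == 0, given that _calculate_c can simply assign. Per the author's reply, the pre-pass matters for kernels such as KeSMatrixDenseMulCsr that accumulate partial products directly into C rather than computing a private sum and storing it once. A minimal sketch of the two kernel styles (hypothetical names, float instead of real, assuming the accumulate-style kernel scatters into C with atomics):

```cuda
#include <cuda_runtime.h>

// Overwrite style: one thread owns one C element and writes it exactly once,
// so beta can be folded into the final store and no pre-pass is needed.
__global__ void overwriteStyle(float *C, const float *partial, float alpha,
                               float beta, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    float sum = alpha * partial[i];
    C[i] = (beta == 0.0f) ? sum : sum + beta * C[i];
  }
}

// Accumulate style: each thread scatters one partial product into C, and
// several products can target the same C element, so C must already hold
// beta * C (or zeros) before launch -- the job of _beta_mul_c.
__global__ void accumulateStyle(float *C, const float *partial,
                                const int *cIndex, float alpha, int nnz) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < nnz) {
    atomicAdd(&C[cIndex[i]], alpha * partial[i]);
  }
}
```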
48 changes: 37 additions & 11 deletions paddle/cuda/src/hl_cuda_sparse.cuh
@@ -85,6 +85,15 @@ __global__ void KeSMatrixCsc2Dense(real * csc_val,
C_d[row*dimN + col] = sum;
}

__device__ __forceinline__
void _calculate_c(real &c, real sum) {
c = sum;
}
__device__ __forceinline__
void _calculate_c(real &c, real sum, real beta) {
c = sum + beta * c;
}

#define CU_CSRMM_N 4
#define CU_CSRMM_THREAD_X 32
#define CU_CSRMM_THREAD_Y 32
@@ -191,11 +200,19 @@ __global__ void KeSMatrixCsrMulDense(real *C_d,
}

C_d += __mul24(index_m, dimN);
#pragma unroll
for (int n = 0; n < CU_CSRMM_N; n++) {
if (index_n < dimN) {
C_d[index_n] = alpha*sum[n] + beta*C_d[index_n];
index_n += CU_CSRMM_THREAD_X;
if (beta == 0.0) {
for (int n = 0; n < CU_CSRMM_N; n++) {
if (index_n < dimN) {
_calculate_c(C_d[index_n], alpha * sum[n]);
index_n += CU_CSRMM_THREAD_X;
}
}
} else {
for (int n = 0; n < CU_CSRMM_N; n++) {
if (index_n < dimN) {
_calculate_c(C_d[index_n], alpha * sum[n], beta);
index_n += CU_CSRMM_THREAD_X;
}
}
}
}
@@ -544,13 +561,22 @@ TEMP_TEST:
int index_m_c = ibx + idy;
int index_n_c = blockIdx.y*CU_CSCMM_BLOCK_N_BEST + idx;
C_d += index_n_c + __mul24(index_m_c, dimN);
#pragma unroll
for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
if (index_m_c < dimM && index_n_c < dimN) {
C_d[0] = A_s[idy+m*32][idx] + beta*C_d[0];
if (beta == 0.0) {
for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
if (index_m_c < dimM && index_n_c < dimN) {
_calculate_c(C_d[0], A_s[idy + m * 32][idx]);
}
index_m_c += 32;
C_d += dimN*32;
}
} else {
for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
if (index_m_c < dimM && index_n_c < dimN) {
_calculate_c(C_d[0], A_s[idy + m * 32][idx], beta);
}
index_m_c += 32;
C_d += dimN*32;
}
index_m_c += 32;
C_d += dimN*32;
}
}

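In the kernel epilogues above, the beta check is hoisted out of the store loop and the one-argument _calculate_c overload guarantees C is never read when beta == 0. The duplicated loops could also be expressed once; a sketch of such a helper (store_row is hypothetical and not part of the PR, assuming real and the _calculate_c overloads from this file are in scope):

```cuda
template <int N>
__device__ __forceinline__
void store_row(real *C, const real (&sum)[N], real alpha, real beta,
               int col, int dimN, int stride) {
  if (beta == 0.0) {
    // beta == 0: C is write-only.
    for (int n = 0; n < N; ++n, col += stride) {
      if (col < dimN) _calculate_c(C[col], alpha * sum[n]);
    }
  } else {
    // General case: C[col] = alpha * sum[n] + beta * C[col].
    for (int n = 0; n < N; ++n, col += stride) {
      if (col < dimN) _calculate_c(C[col], alpha * sum[n], beta);
    }
  }
}
```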
27 changes: 22 additions & 5 deletions paddle/gserver/layers/Layer.cpp
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/utils/Util.h"

#include "paddle/utils/Logging.h"
#include "paddle/math/SparseMatrix.h"

#include "AddtoLayer.h"
#include "CosSimLayer.h"
@@ -290,14 +291,30 @@ void Layer::showOutputStats() {
<< " is 0, skip to show the statistics";
return;
}
real mean = out->getSum() / out->getElementCnt();
MatrixPtr outSquare = out->clone();
outSquare->copyFrom(*out);
MatrixPtr outSquare;
if (dynamic_cast<GpuSparseMatrix*>(out.get())) {
GpuSparseMatrix *tmp = dynamic_cast<GpuSparseMatrix*>(out.get());
outSquare = std::make_shared<CpuSparseMatrix>(
tmp->getHeight(), tmp->getWidth(), tmp->getElementCnt(),
tmp->getValueType(), tmp->getFormat());
} else {
outSquare = out->clone();
}
outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);

real mean = outSquare->getSum() / out->getElementCnt();
real min;
real max;
if (dynamic_cast<CpuSparseMatrix*>(outSquare.get())) {
auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
min = tmpMat->getMin();
max = tmpMat->getMax();
tmpMat->square();
LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
} else {
min = outSquare->getMin();
max = outSquare->getMax();
outSquare->square();
}
real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
@@ -306,8 +323,8 @@
<< ", "
<< "std=" << std
<< ", "
<< "min=" << out->getMin() << ", "
<< "max=" << out->getMax();
<< "min=" << min << ", "
<< "max=" << max;
}

void Layer::forwardActivation() {
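For context on the Layer.cpp change: a GPU sparse output is first copied to a CpuSparseMatrix, and the reported mean, min, max and "std" (computed as E[x^2] - mean^2) are then taken over the stored non-zero values only. A standalone sketch of that computation (hypothetical helper, with std::vector standing in for the sparse value array):

```cuda
#include <algorithm>
#include <cstdio>
#include <vector>

// Mirrors the statistics showOutputStats() logs for a sparse output.
void showSparseValueStats(const std::vector<float> &values) {
  if (values.empty()) return;  // nnz == 0: nothing to report
  float sum = 0.0f, sumSq = 0.0f;
  for (float v : values) {
    sum += v;
    sumSq += v * v;
  }
  float cnt = static_cast<float>(values.size());
  float mean = sum / cnt;
  float var = sumSq / cnt - mean * mean;  // logged as "std" in Layer.cpp
  float mn = *std::min_element(values.begin(), values.end());
  float mx = *std::max_element(values.begin(), values.end());
  std::printf("mean=%f, std=%f, min=%f, max=%f\n", mean, var, mn, mx);
}
```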