Skip to content

Commit

Permalink
review changes rapidsai#1
Browse files Browse the repository at this point in the history
- indented comments properly
- avoided some implicit compiler issued typecasts
- refactored to adding the correct API declaration of the implementation in metrics.hpp
- refactored into using deviceAllocators wherever possible
- added `CUDA_CHECK(cudaStreamSynchronize(stream))` at suspected places
  • Loading branch information
Ganesh Venkataramana committed Jun 24, 2019
1 parent d1fce7b commit 37d570a
Show file tree
Hide file tree
Showing 3 changed files with 163 additions and 178 deletions.
137 changes: 68 additions & 69 deletions cpp/src/metrics/metrics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,91 +23,90 @@ namespace ML {
namespace Metrics {

/**
* Calculates the "Coefficient of Determination" (R-Squared) score
* normalizing the sum of squared errors by the total sum of squares
* with single precision.
*
* This score indicates the proportion of the variation in an expected
* response variable that is explained by the independent variables
* in a linear regression model. The larger the R-squared value, the
* more variability is explained by the linear regression model.
*
* @param handle: cumlHandle
* @param y: Array of ground-truth response variables
* @param y_hat: Array of predicted response variables
* @param n: Number of elements in y and y_hat
* @return: The R-squared value.
*/
* Calculates the "Coefficient of Determination" (R-Squared) score
* normalizing the sum of squared errors by the total sum of squares
* with single precision.
*
* This score indicates the proportion of the variation in an expected
* response variable that is explained by the independent variables
* in a linear regression model. The larger the R-squared value, the
* more variability is explained by the linear regression model.
*
* @param handle: cumlHandle
* @param y: Array of ground-truth response variables
* @param y_hat: Array of predicted response variables
* @param n: Number of elements in y and y_hat
* @return: The R-squared value.
*/
float r2_score_py(const cumlHandle &handle, float *y, float *y_hat, int n);

/**
* Calculates the "Coefficient of Determination" (R-Squared) score
* normalizing the sum of squared errors by the total sum of squares
* with double precision.
*
* This score indicates the proportion of the variation in an expected
* response variable that is explained by the independent variables
* in a linear regression model. The larger the R-squared value, the
* more variability is explained by the linear regression model.
*
* @param handle: cumlHandle
* @param y: Array of ground-truth response variables
* @param y_hat: Array of predicted response variables
* @param n: Number of elements in y and y_hat
* @return: The R-squared value.
*/
* Calculates the "Coefficient of Determination" (R-Squared) score
* normalizing the sum of squared errors by the total sum of squares
* with double precision.
*
* This score indicates the proportion of the variation in an expected
* response variable that is explained by the independent variables
* in a linear regression model. The larger the R-squared value, the
* more variability is explained by the linear regression model.
*
* @param handle: cumlHandle
* @param y: Array of ground-truth response variables
* @param y_hat: Array of predicted response variables
* @param n: Number of elements in y and y_hat
* @return: The R-squared value.
*/
double r2_score_py(const cumlHandle &handle, double *y, double *y_hat, int n);

/**
* Calculates the "rand index"
*
* This metric is a measure of similarity between two data clusterings.
*
* @param handle: cumlHandle
* @param y: Array of response variables of the first clustering classifications
* @param y_hat: Array of response variables of the second clustering classifications
* @param n: Number of elements in y and y_hat
* @return: The rand index value
*/
* Calculates the "rand index"
*
* This metric is a measure of similarity between two data clusterings.
*
* @param handle: cumlHandle
* @param y: Array of response variables of the first clustering classifications
* @param y_hat: Array of response variables of the second clustering classifications
* @param n: Number of elements in y and y_hat
* @return: The rand index value
*/

double randIndex(const cumlHandle &handle, double *y, double *y_hat, int n);

/**
* Calculates the "adjusted rand index"
*
* This metric is the corrected-for-chance version of the rand index
*
* @param handle: cumlHandle
* @param y: Array of response variables of the first clustering classifications
* @param y_hat: Array of response variables of the second clustering classifications
* @param n: Number of elements in y and y_hat
* @param lower_class_range: the lowest value in the range of classes
* @param upper_class_range: the highest value in the range of classes
* @return: The adjusted rand index value
*/
* Calculates the "adjusted rand index"
*
* This metric is the corrected-for-chance version of the rand index
*
* @param handle: cumlHandle
* @param y: Array of response variables of the first clustering classifications
* @param y_hat: Array of response variables of the second clustering classifications
* @param n: Number of elements in y and y_hat
* @param lower_class_range: the lowest value in the range of classes
* @param upper_class_range: the highest value in the range of classes
* @return: The adjusted rand index value
*/
double adjustedRandIndex(const cumlHandle &handle, const int *y,
const int *y_hat, const int n,
const int lower_class_range,
const int upper_class_range);

/**
* Calculates the "Mutual Information score" between two clusters
*
* Mutual Information is a measure of the similarity between two labels of
* the same data.
*
* @param handle: cumlHandle
* @param y: Array of response variables of the first clustering classifications
* @param y_hat: Array of response variables of the second clustering classifications
* @param n: Number of elements in y and y_hat
* @param lower_class_range: the lowest value in the range of classes
* @param upper_class_range: the highest value in the range of classes
* @return: The mutual information score
*/
double adjustedRandIndex(const cumlHandle &handle, const int *y,
const int *y_hat, const int n,
const int lower_class_range,
const int upper_class_range);
* Calculates the "Mutual Information score" between two clusters
*
* Mutual Information is a measure of the similarity between two labels of
* the same data.
*
* @param handle: cumlHandle
* @param y: Array of response variables of the first clustering classifications
* @param y_hat: Array of response variables of the second clustering classifications
* @param n: Number of elements in y and y_hat
* @param lower_class_range: the lowest value in the range of classes
* @param upper_class_range: the highest value in the range of classes
* @return: The mutual information score
*/
double mutualInfoScore(const cumlHandle &handle, const int *y, const int *y_hat,
const int n, const int lower_class_range,
const int upper_class_range);

} // namespace Metrics
} // namespace ML
24 changes: 14 additions & 10 deletions cpp/src_prims/metrics/mutualInfoScore.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@ __global__ void mutualInfoKernel(const int *dContingencyMatrix, const int *a,
int i = threadIdx.y + blockIdx.y * blockDim.y;

//thread-local variable to count the mutual info
double localMI = 0;
double localMI = 0.0;

if (i < size && j < size && a[i] * b[j] != 0 &&
dContingencyMatrix[i * size + j] != 0) {
localMI += (double(dContingencyMatrix[i * size + j])) *
double(log(double(dContingencyMatrix[i * size + j])) -
log(double(a[i] * b[j])));
(log(double(dContingencyMatrix[i * size + j])) -
log(double(a[i] * b[j])));
}

//specialize blockReduce for a 2D block of 1024 threads of type uint64_t
Expand Down Expand Up @@ -106,16 +106,15 @@ double mutualInfoScore(const T *firstClusterArray, const T *secondClusterArray,
stream));

//workspace allocation
char *pWorkspace = nullptr;
size_t workspaceSz = MLCommon::Metrics::getContingencyMatrixWorkspaceSize(
size, firstClusterArray, stream, lowerLabelRange, upperLabelRange);
if (workspaceSz != 0) MLCommon::allocate(pWorkspace, workspaceSz);
device_buffer<char> pWorkspace(allocator, stream, workspaceSz);

//calculating the contingency matrix
MLCommon::Metrics::contingencyMatrix(
firstClusterArray, secondClusterArray, (int)size,
(int *)dContingencyMatrix.data(), stream, (void *)pWorkspace, workspaceSz,
lowerLabelRange, upperLabelRange);
(int *)dContingencyMatrix.data(), stream, (void *)pWorkspace.data(),
workspaceSz, lowerLabelRange, upperLabelRange);

//creating device buffers for all the parameters involved in the mutual information calculation
//device variables
Expand All @@ -133,6 +132,8 @@ double mutualInfoScore(const T *firstClusterArray, const T *secondClusterArray,
cudaMemsetAsync(b.data(), 0, numUniqueClasses * sizeof(int), stream));
CUDA_CHECK(cudaMemsetAsync(d_MI.data(), 0, sizeof(double), stream));

CUDA_CHECK(cudaStreamSynchronize(stream));

//calculating the row-wise sums
MLCommon::LinAlg::reduce<int, int, int>(a.data(), dContingencyMatrix.data(),
numUniqueClasses, numUniqueClasses, 0,
Expand All @@ -149,19 +150,22 @@ double mutualInfoScore(const T *firstClusterArray, const T *secondClusterArray,
dim3 numBlocks(ceildiv<int>(size, numThreadsPerBlock.x),
ceildiv<int>(size, numThreadsPerBlock.y));

CUDA_CHECK(cudaStreamSynchronize(stream));

//calling the kernel
mutualInfoKernel<T, BLOCK_DIM_X, BLOCK_DIM_Y>
<<<numBlocks, numThreadsPerBlock, 0, stream>>>(
dContingencyMatrix.data(), a.data(), b.data(), numUniqueClasses,
d_MI.data());

CUDA_CHECK(cudaStreamSynchronize(stream));

//updating in the host memory
MLCommon::updateHost(&h_MI, d_MI.data(), 1, stream);

//freeing the memories in the device
if (pWorkspace) CUDA_CHECK(cudaFree(pWorkspace));
CUDA_CHECK(cudaStreamSynchronize(stream));

return h_MI/size;
return h_MI / size;
}

}; //end namespace Metrics
Expand Down
Loading

0 comments on commit 37d570a

Please sign in to comment.