Add get_synthetic_dataset function to util #146

Merged
155 changes: 148 additions & 7 deletions python/mxnet/test_utils.py
@@ -1,3 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""Tools for testing."""
# pylint: disable=too-many-lines
from __future__ import absolute_import, print_function, division
@@ -11,8 +28,8 @@
import os
import errno
import logging
import scipy.sparse as sp
from contextlib import contextmanager
import scipy.sparse as sp
import numpy as np
import numpy.testing as npt
import numpy.random as rnd
@@ -79,11 +96,130 @@ def random_sample(population, k):
    return population_copy[0:k]


def rand_sparse_ndarray(shape, stype, density=None, dtype=None):
"""Generate a random sparse ndarray. Returns the ndarray, value(np) and indices(np) """
def _validate_csr_generation_inputs(num_rows, num_cols, density,
                                    distribution="uniform"):
    """Validates inputs for the csr generation helper functions."""
    total_nnz = int(num_rows * num_cols * density)
    if density < 0 or density > 1:
        raise ValueError("density has to be between 0 and 1")

    if num_rows <= 0 or num_cols <= 0:
        raise ValueError("num_rows and num_cols should be greater than 0")

    if distribution == "powerlaw":
        if total_nnz < 2 * num_rows:
            raise ValueError("not supported for density %s"
                             " with shape (%s, %s);"
                             " please keep"
                             " num_rows * num_cols * density >= 2 * num_rows"
                             % (density, num_rows, num_cols))
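
For the powerlaw case the constraint above reduces to density >= 2 / num_cols. A rough sketch of the check, importing the private helper directly just for illustration:

>>> from mxnet.test_utils import _validate_csr_generation_inputs
>>> # 32 x 10000 at density 0.01 gives int(32 * 10000 * 0.01) = 3200 non-zeros,
>>> # which satisfies 3200 >= 2 * 32, so this call returns silently.
>>> _validate_csr_generation_inputs(32, 10000, 0.01, distribution="powerlaw")
>>> # At density 0.0001 only 32 non-zeros would be available for 32 rows
>>> # (32 < 64), so the same call would raise ValueError.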


def _get_uniform_dataset_csr(num_rows, num_cols, density=0.1, dtype=None):
    """Returns a CSRNDArray with a uniform distribution of non-zeros.
    This generates a csr matrix with total_nnz unique positions randomly
    chosen out of num_rows * num_cols and arranges them in the 2d array as:
    row_index = random_number_generated // num_cols
    col_index = random_number_generated - row_index * num_cols
    """
    _validate_csr_generation_inputs(num_rows, num_cols, density,
                                    distribution="uniform")
    csr = sp.rand(num_rows, num_cols, density, dtype=dtype, format="csr")
    result = mx.nd.csr_matrix(csr.data, csr.indptr, csr.indices,
                              (num_rows, num_cols), dtype=dtype)
    return result
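
A rough sketch of what this helper returns, calling the private helper directly for illustration (the supported path is rand_sparse_ndarray below):

>>> from mxnet.test_utils import _get_uniform_dataset_csr
>>> csr = _get_uniform_dataset_csr(num_rows=20, num_cols=50, density=0.1)
>>> dense = csr.asnumpy()            # densify to inspect the layout
>>> assert dense.shape == (20, 50)
>>> # sp.rand places int(20 * 50 * 0.1) = 100 values at unique positions,
>>> # so roughly 10% of the entries end up non-zero.
>>> assert (dense != 0).sum() <= 100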


def _get_powerlaw_dataset_csr(num_rows, num_cols, density=0.1, dtype=None):
    """Returns a CSRNDArray with a powerlaw distribution, i.e. with an
    exponentially increasing number of non-zeros in each row.
    Not supported for cases where total_nnz < 2 * num_rows, because the
    algorithm first ensures that no row is empty by placing a non-zero at
    the beginning of each row.
    """
    _validate_csr_generation_inputs(num_rows, num_cols, density,
                                    distribution="powerlaw")

    total_nnz = int(num_rows * num_cols * density)

    unused_nnz = total_nnz
    output_arr = np.zeros((num_rows, num_cols), dtype=dtype)
    # Start with one non-zero on each row so that no row is empty
    for row in range(num_rows):
        output_arr[row][0] = 1 + rnd.uniform(0.001, 2)
        unused_nnz = unused_nnz - 1
        if unused_nnz <= 0:
            return mx.nd.array(output_arr).tostype("csr")

    # Populate the rest of the matrix with 2^i items in the ith row:
    # if all of total_nnz has been used, return the sparse matrix;
    # otherwise, once the maximum column size is reached, fill up full
    # columns until all of the nnz budget is used.
    col_max = 2
    for row in range(num_rows):
        col_limit = min(num_cols, col_max)
        # If col_limit has reached num_cols, assign one value to the whole
        # row at once, which is much faster; column 0 was already counted,
        # hence the "+ 1" below.
        if col_limit == num_cols and unused_nnz > col_limit:
            output_arr[row] = 1 + rnd.uniform(0.001, 2)
            unused_nnz = unused_nnz - col_limit + 1
            if unused_nnz <= 0:
                return mx.nd.array(output_arr).tostype("csr")
            else:
                continue
        for col_index in range(1, col_limit):
            output_arr[row][col_index] = 1 + rnd.uniform(0.001, 2)
            unused_nnz = unused_nnz - 1
            if unused_nnz <= 0:
                return mx.nd.array(output_arr).tostype("csr")
        col_max = col_max * 2

    if unused_nnz > 0:
        raise ValueError("not supported for density %s"
                         " with shape (%s, %s)" % (density, num_rows, num_cols))
    else:
        return mx.nd.array(output_arr).tostype("csr")
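
A small sketch of the growth pattern this produces, again calling the private helper directly for illustration:

>>> from mxnet.test_utils import _get_powerlaw_dataset_csr
>>> csr = _get_powerlaw_dataset_csr(num_rows=8, num_cols=100, density=0.1)
>>> indptr = csr.indptr.asnumpy()
>>> row_nnz = [int(indptr[i + 1] - indptr[i]) for i in range(8)]
>>> # The budget of int(8 * 100 * 0.1) = 80 non-zeros is spent as roughly
>>> # [2, 4, 8, 16, 32, ...] per row until it runs out.
>>> assert sum(row_nnz) == 80
>>> assert row_nnz[1] == 2 * row_nnz[0]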


def rand_sparse_ndarray(shape, stype, density=None, distribution="uniform", dtype=None):
    """Generate a random sparse ndarray. Returns the ndarray, value(np) and indices(np).

    Parameters
    ----------
    shape: list or tuple
    stype: str
        valid values: "csr" or "row_sparse"
    density: float, optional
        should be between 0 and 1
    distribution: str, optional
        valid values: "uniform" or "powerlaw"
    dtype: numpy.dtype, optional
        default value is None

    Returns
    -------
    Result of type CSRNDArray or RowSparseNDArray

    Examples
    --------
    Below is an example of a powerlaw distribution with csr as the stype.
    It calculates the nnz from the shape and density and fills the ndarray
    with an exponentially increasing number of elements per row.
    If there are enough unused nnzs, the (n+1)th row has twice as many nnzs
    as the nth row; otherwise the remaining unused nnzs go into the (n+1)th
    row. If the number of columns is too small and the column limit has
    already been reached, all following columns in all following rows are
    filled until the required density is reached.

    >>> csr_arr, _ = rand_sparse_ndarray(shape=(5, 16), stype="csr",
    ...                                  density=0.50, distribution="powerlaw")
    >>> indptr = csr_arr.indptr.asnumpy()
    >>> indices = csr_arr.indices.asnumpy()
    >>> data = csr_arr.data.asnumpy()
    >>> row2nnz = len(data[indptr[1]:indptr[2]])
    >>> row3nnz = len(data[indptr[2]:indptr[3]])
    >>> assert(row3nnz == 2 * row2nnz)
    >>> row4nnz = len(data[indptr[3]:indptr[4]])
    >>> assert(row4nnz == 2 * row3nnz)
    """
    density = rnd.rand() if density is None else density
    dtype = default_dtype() if dtype is None else dtype
    if stype == 'row_sparse':
        assert (distribution == "uniform"), \
            "Distribution %s not supported for row_sparse" % (distribution)
        # sample index
        idx_sample = rnd.rand(shape[0])
        indices = np.argwhere(idx_sample < density).flatten()
@@ -95,10 +231,15 @@ def rand_sparse_ndarray(shape, stype, density=None, dtype=None):
        arr = mx.nd.row_sparse_array(val, indices, shape, indices_type=np.int64, dtype=dtype)
        return arr, (val, indices)
    elif stype == 'csr':
        assert(len(shape) == 2)
        csr = sp.rand(shape[0], shape[1], density=density, format='csr', dtype=dtype)
        result = mx.nd.csr_matrix(csr.data, csr.indptr, csr.indices, shape, dtype=dtype)
        return result, (csr.indptr, csr.indices, csr.data)
        assert len(shape) == 2
        if distribution == "uniform":
            csr = _get_uniform_dataset_csr(shape[0], shape[1], density, dtype=dtype)
            return csr, (csr.indptr, csr.indices, csr.data)
        elif distribution == "powerlaw":
            csr = _get_powerlaw_dataset_csr(shape[0], shape[1], density, dtype=dtype)
            return csr, (csr.indptr, csr.indices, csr.data)
        else:
            assert(False), "Distribution not supported: %s" % (distribution)
    else:
        assert(False), "unknown storage type"

33 changes: 33 additions & 0 deletions tests/python/unittest/test_sparse_ndarray.py
@@ -450,6 +450,39 @@ def test_sparse_nd_empty():
        assert(nd.stype == stype)


def test_synthetic_dataset_generator():
    def test_powerlaw_generator(csr_arr, final_row=1):
        """Test the power law distribution.
        For the (32, 10000) array at density 0.01: total elements 320000,
        number of non-zeros 3200. Every row has 2x the non-zero elements of
        the previous row, and since 2^11 - 1 = 2047 < 3200 < 4095 = 2^12 - 1
        this holds up to roughly the 10th row."""
        indices = csr_arr.indices.asnumpy()
        indptr = csr_arr.indptr.asnumpy()
        for row in range(1, final_row + 1):
            nextrow = row + 1
            # Rows are filled contiguously from column 0, so the last column
            # index of a row plus one equals that row's nnz.
            current_row_nnz = indices[indptr[row] - 1] + 1
            next_row_nnz = indices[indptr[nextrow] - 1] + 1
            assert next_row_nnz == 2 * current_row_nnz

    # Test if density is preserved
    csr_arr_cols, _ = rand_sparse_ndarray(shape=(32, 10000), stype="csr",
                                          density=0.01, distribution="powerlaw")

    csr_arr_small, _ = rand_sparse_ndarray(shape=(5, 5), stype="csr",
                                           density=0.5, distribution="powerlaw")

    csr_arr_big, _ = rand_sparse_ndarray(shape=(32, 1000000), stype="csr",
                                         density=0.4, distribution="powerlaw")

    csr_arr_square, _ = rand_sparse_ndarray(shape=(1600, 1600), stype="csr",
                                            density=0.5, distribution="powerlaw")
    assert len(csr_arr_cols.data) == 3200
    test_powerlaw_generator(csr_arr_cols, final_row=9)
    test_powerlaw_generator(csr_arr_small, final_row=1)
    test_powerlaw_generator(csr_arr_big, final_row=4)
    test_powerlaw_generator(csr_arr_square, final_row=6)


if __name__ == '__main__':
    import nose
    nose.runmodule()