Skip to content

Commit

Permalink
Merge pull request #46 from CNugteren/development
Browse files Browse the repository at this point in the history
Update to version 2.6.0
  • Loading branch information
CNugteren authored Oct 23, 2016
2 parents d0ec5a1 + dc1cb0b commit 35de111
Show file tree
Hide file tree
Showing 16 changed files with 163 additions and 52 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@

Version 2.6.0
- Changed timing measurements to now also include the (varying) kernel launch overhead
- It is now possible to set OpenCL compiler options through the env variable CLTUNE_BUILD_OPTIONS
- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
- Added an option to build a static version of the library

Version 2.5.0
- Updated to version 8.0 of the CLCudaAPI header
- Made it possible to configure the number of times each kernel is run (to average results)
Expand Down
35 changes: 31 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#
# ==================================================================================================

cmake_minimum_required(VERSION 2.8.10)
cmake_minimum_required(VERSION 2.8.11)

# Overrides for MSVC static runtime
set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake)
Expand All @@ -32,10 +32,11 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
# CMake project details
project("cltune" CXX)
set(cltune_VERSION_MAJOR 2)
set(cltune_VERSION_MINOR 5)
set(cltune_VERSION_MINOR 6)
set(cltune_VERSION_PATCH 0)

# Options
# Options and their default values
option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON)
option(SAMPLES "Enable compilation of sample programs" ON)
option(TESTS "Enable compilation of the Google tests" OFF)

Expand Down Expand Up @@ -85,6 +86,13 @@ elseif(MSVC)
endif()
endif()

# DLL settings: when a shared library is built with MSVC, CLTUNE_DLL is defined for the targets in
# this directory. The cltune.h header only emits __declspec(dllexport/dllimport) when this
# definition is present. The portable -D form lets CMake record it as a compile definition.
if(MSVC AND BUILD_SHARED_LIBS)
  add_definitions(-DCLTUNE_DLL)
endif()

# C++ compiler settings
if(MSVC)
set(FLAGS "/Ox")
Expand Down Expand Up @@ -143,13 +151,32 @@ set(TUNER
src/ml_models/neural_network.cc)

# Creates and links the library
add_library(cltune SHARED ${TUNER})
if(BUILD_SHARED_LIBS)
add_library(cltune SHARED ${TUNER})
else(BUILD_SHARED_LIBS)
add_library(cltune STATIC ${TUNER})
endif()
target_link_libraries(cltune ${FRAMEWORK_LIBRARIES})

# Under Visual Studio, flags the library's own sources as "compiling the DLL" whenever a shared
# library is requested, which selects the __declspec(dllexport) keyword while the library is built
if(MSVC AND BUILD_SHARED_LIBS)
  # Note: target_compile_definitions requires at least CMake 2.8.11
  target_compile_definitions(cltune PRIVATE COMPILING_DLL=1)
endif()

# Installs the library
install(TARGETS cltune DESTINATION lib)
install(FILES include/cltune.h DESTINATION include)

# Generates and installs the pkg-config file on UNIX-like systems (the UNIX variable is not limited
# to Linux, it is for example also set on macOS). The @ONLY option restricts substitution to
# @VAR@ references, such that the ${prefix}-style pkg-config variables in the template are kept.
if(UNIX)
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cltune.pc.in"
                 "${CMAKE_CURRENT_BINARY_DIR}/cltune.pc" @ONLY)
  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cltune.pc"
          DESTINATION lib/pkgconfig)
endif()

# ==================================================================================================

# Optional: Enables compilation of sample programs
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ CLTune can be compiled as a shared library using CMake. The pre-requisites are:
- Clang 3.3 or newer
- AppleClang 5.0 or newer
- ICC 14.0 or newer
- MSVC (Visual Studio) 2015 or newer
- MSVC (Visual Studio) 2013 or newer
* An OpenCL library. CLTune has been tested with:
- Apple OpenCL
- NVIDIA CUDA SDK (requires version 7.5 or newer for the CUDA back-end)
Expand All @@ -48,6 +48,8 @@ You can then link your own programs against the CLTune library. An example for a
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/libcltune.so
g++ example.cc -o example -L/path/to/libcltune.so -lcltune -lOpenCL

Furthermore, you can optionally set the environment variable `CLTUNE_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler at run-time.


Example of using the tuner
-------------
Expand Down
10 changes: 10 additions & 0 deletions cltune.pc.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
includedir=${prefix}/include
libdir=${exec_prefix}/lib

Name: CLTune
Description: CLTune: An automatic OpenCL & CUDA kernel tuner
Version: @cltune_VERSION_MAJOR@.@cltune_VERSION_MINOR@.@cltune_VERSION_PATCH@
Libs: -L${libdir} -lcltune
Cflags: -I${includedir}
8 changes: 6 additions & 2 deletions include/cltune.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,12 @@

// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#ifdef _WIN32
#define PUBLIC_API __declspec(dllexport)
#if defined(_WIN32) && defined(CLTUNE_DLL)
#if defined(COMPILING_DLL)
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API __declspec(dllimport)
#endif
#else
#define PUBLIC_API
#endif
Expand Down
19 changes: 10 additions & 9 deletions include/internal/kernel_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#endif

#include "cltune.h"
#include "internal/msvc.h"

namespace cltune {
// =================================================================================================
Expand Down Expand Up @@ -100,7 +101,7 @@ class KernelInfo {
};

// Initializes the class with a given name and a string of kernel source-code
explicit KernelInfo(const std::string name, const std::string source, const Device &device);
explicit PUBLIC_API KernelInfo(const std::string name, const std::string source, const Device &device);

// Accessors (getters)
std::string name() const { return name_; }
Expand All @@ -117,36 +118,36 @@ class KernelInfo {
void set_local_base(IntRange local) { local_base_ = local; local_ = local; }

// Prepend to the source-code
void PrependSource(const std::string &extra_source);
void PUBLIC_API PrependSource(const std::string &extra_source);

// Adds a new parameter with a name and a vector of possible values
void AddParameter(const std::string &name, const std::vector<size_t> &values);
void PUBLIC_API AddParameter(const std::string &name, const std::vector<size_t> &values);

// Checks whether a parameter exists, returns "true" if it does exist
bool ParameterExists(const std::string parameter_name);
bool PUBLIC_API ParameterExists(const std::string parameter_name);

// Specifies a modifier in the form of a StringRange to the global/local thread-sizes. This
// modifier has to contain (per-dimension) the name of a single parameter or an empty string. The
// supported modifiers are given by the ThreadSizeModifierType enumeration.
void AddModifier(const StringRange range, const ThreadSizeModifierType type);
void PUBLIC_API AddModifier(const StringRange range, const ThreadSizeModifierType type);

// Adds a new constraint to the set of parameters (e.g. must be equal or larger than). The
// constraints come in the form of a function object which takes a number of tuning parameters,
// given as a vector of strings (parameter names). Their names are later substituted by actual
// values.
void AddConstraint(ConstraintFunction valid_if, const std::vector<std::string> &parameters);
void PUBLIC_API AddConstraint(ConstraintFunction valid_if, const std::vector<std::string> &parameters);

// As above, but for local memory usage
void SetLocalMemoryUsage(LocalMemoryFunction amount, const std::vector<std::string> &parameters);
void PUBLIC_API SetLocalMemoryUsage(LocalMemoryFunction amount, const std::vector<std::string> &parameters);

// Computes the global/local ranges (in NDRange-form) based on all global/local thread-sizes (in
// StringRange-form) and a single permutation (i.e. a configuration) containing a list of all
// parameter names and their current values.
void ComputeRanges(const Configuration &config);
void PUBLIC_API ComputeRanges(const Configuration &config);

// Computes all permutations based on the parameters and their values (the configuration list).
// The result is stored as a member variable.
void SetConfigurations();
void PUBLIC_API SetConfigurations();

private:
// Called recursively internally by SetConfigurations
Expand Down
38 changes: 38 additions & 0 deletions include/internal/msvc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

// =================================================================================================
// This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
// a tab-size of two spaces and a max-width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides macros and definitions to make compilation work on Microsoft Visual Studio,
// in particular for versions older than 2015 with limited C++11 support.
// MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015)
// MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013)
// MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012)
// MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010)
// MSVC++ 9.0 _MSC_VER == 1500 (Visual Studio 2008)
//
// =================================================================================================

#ifndef CLTUNE_MSVC_H_
#define CLTUNE_MSVC_H_

namespace cltune {
// =================================================================================================

// Visual Studio versions before 2015 (_MSC_VER below 1900) do not support constexpr, so for those
// compilers the keyword is mapped onto const. This only works for constants, not for constexpr
// functions (unused in this project).
#if defined(_MSC_VER) && (_MSC_VER < 1900)
#define constexpr const
#endif

// =================================================================================================
} // namespace cltune

// CLTUNE_MSVC_H_
#endif
4 changes: 2 additions & 2 deletions include/internal/searchers/annealing.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ class Annealing: public Searcher {

// Maximum number of successive visits to already visited states. If this number is exceeded, the
// algorithm ends
static constexpr auto kMaxAlreadyVisitedStates = size_t{10};
static const size_t kMaxAlreadyVisitedStates;

// Maximum number of differences to consider this still a neighbour
static constexpr auto kMaxDifferences = size_t{3};
static const size_t kMaxDifferences;

// Takes additionally a fraction of configurations to consider
Annealing(const Configurations &configurations,
Expand Down
3 changes: 2 additions & 1 deletion include/internal/tuner_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#endif

#include "internal/kernel_info.h"
#include "internal/msvc.h"

// Host data-type for half-precision floating-point (16-bit)
#include "internal/half.h"
Expand Down Expand Up @@ -73,7 +74,7 @@ class TunerImpl {
public:

// Parameters
static constexpr auto kMaxL2Norm = 1e-4; // This is the threshold for 'correctness'
static const double kMaxL2Norm; // This is the threshold for 'correctness'

// Messages printed to stdout (in colours)
static const std::string kMessageFull;
Expand Down
18 changes: 9 additions & 9 deletions samples/conv/conv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,18 @@ bool IsMultiple(size_t a, size_t b) {
};

// Constants
constexpr auto kDefaultDevice = size_t{0};
constexpr auto kDefaultPlatform = size_t{0};
constexpr auto kDefaultSearchMethod = size_t{1};
constexpr auto kDefaultSearchParameter1 = size_t{4};
const auto kDefaultDevice = size_t{0};
const auto kDefaultPlatform = size_t{0};
const auto kDefaultSearchMethod = size_t{1};
const auto kDefaultSearchParameter1 = size_t{4};

// Settings (synchronise these with "conv.cc", "conv.opencl" and "conv_reference.opencl")
#define HFS (3) // Half filter size
#define FS (HFS+HFS+1) // Filter size

// Settings (sizes)
constexpr auto kSizeX = size_t{8192}; // Matrix dimension X
constexpr auto kSizeY = size_t{4096}; // Matrix dimension Y
const auto kSizeX = size_t{8192}; // Matrix dimension X
const auto kSizeY = size_t{4096}; // Matrix dimension Y

// =================================================================================================

Expand Down Expand Up @@ -91,7 +91,7 @@ int main(int argc, char* argv[]) {
}

// Creates data structures
constexpr auto kExtraSize = size_t{FS*8};
const auto kExtraSize = size_t{FS*8};
auto mat_a = std::vector<float>((kExtraSize+kSizeX)*(kExtraSize+kSizeY));
auto mat_b = std::vector<float>(kSizeX*kSizeY);
auto coeff = std::vector<float>(FS*FS);
Expand Down Expand Up @@ -230,8 +230,8 @@ int main(int argc, char* argv[]) {
tuner.PrintJSON("output.json", {{"sample","convolution"}});

// Also prints the performance of the best-case in terms of GB/s and GFLOPS
constexpr auto kMB = (sizeof(float)*2*kSizeX*kSizeY) * 1.0e-6;
constexpr auto kMFLOPS = ((1+2*FS*FS)*kSizeX*kSizeY) * 1.0e-6;
const auto kMB = (sizeof(float)*2*kSizeX*kSizeY) * 1.0e-6;
const auto kMFLOPS = ((1+2*FS*FS)*kSizeX*kSizeY) * 1.0e-6;
if (time_ms != 0.0) {
// Prints the time in ms, the bandwidth in GB/s and the performance in GFLOPS, each with one digit
// after the decimal point
printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n",
       time_ms, kMB/time_ms, kMFLOPS/time_ms);
Expand Down
4 changes: 2 additions & 2 deletions samples/conv_simple/conv_simple.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ int main() {
#endif

// Input/output sizes
constexpr auto kSizeX = size_t{8192}; // Matrix dimension X
constexpr auto kSizeY = size_t{4096}; // Matrix dimension Y
const auto kSizeX = size_t{8192}; // Matrix dimension X
const auto kSizeY = size_t{4096}; // Matrix dimension Y

// Creates the input/output matrices and fills them with some example data
std::vector<float> mat_a(kSizeX*kSizeY, 2.0f);
Expand Down
16 changes: 8 additions & 8 deletions samples/gemm/gemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ bool IsMultiple(size_t a, size_t b) {
};

// Constants
constexpr auto kDefaultDevice = size_t{0};
constexpr auto kDefaultPlatform = size_t{0};
constexpr auto kDefaultSearchMethod = size_t{1};
constexpr auto kDefaultSearchParameter1 = size_t{4};
const auto kDefaultDevice = size_t{0};
const auto kDefaultPlatform = size_t{0};
const auto kDefaultSearchMethod = size_t{1};
const auto kDefaultSearchParameter1 = size_t{4};

// Settings (sizes)
constexpr auto kSizeM = size_t{2048};
constexpr auto kSizeN = size_t{2048};
constexpr auto kSizeK = size_t{2048};
const auto kSizeM = size_t{2048};
const auto kSizeN = size_t{2048};
const auto kSizeK = size_t{2048};

// =================================================================================================

Expand Down Expand Up @@ -203,7 +203,7 @@ int main(int argc, char* argv[]) {
tuner.PrintFormatted();

// Also prints the performance of the best-case in terms of GFLOPS
constexpr auto kMGFLOP = (2*kSizeM*kSizeN*kSizeK) * 1.0e-6;
const auto kMGFLOP = (2*kSizeM*kSizeN*kSizeK) * 1.0e-6;
if (time_ms != 0.0) {
printf("[ -------> ] %.1lf ms or %.3lf GFLOPS\n", time_ms, kMGFLOP/time_ms);
}
Expand Down
4 changes: 2 additions & 2 deletions samples/multiple_kernels/multiple_kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ int main() {
#endif

// Matrix size
constexpr auto kSizeM = size_t{2048};
constexpr auto kSizeN = size_t{4096};
const auto kSizeM = size_t{2048};
const auto kSizeN = size_t{4096};

// Creates data structures
std::vector<float> mat_a(kSizeN*kSizeM); // Assumes matrix A is transposed
Expand Down
2 changes: 1 addition & 1 deletion samples/simple/simple.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ int main() {
#endif

// Vector dimension
constexpr auto kVectorSize = size_t{16*1024*1024};
const auto kVectorSize = size_t{16*1024*1024};

// Creates the vectors and fills them with some example data
std::vector<float> vec_a(kVectorSize, 1.0f);
Expand Down
7 changes: 7 additions & 0 deletions src/searchers/annealing.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@
namespace cltune {
// =================================================================================================

// Upper bound on the number of successive visits to states that were already visited. The
// algorithm ends as soon as this number is exceeded
const size_t Annealing::kMaxAlreadyVisitedStates = 10;

// Largest number of differences for which this is still considered a neighbour
const size_t Annealing::kMaxDifferences = 3;

// Initializes the simulated annealing searcher by specifying the fraction of the total search space
// to consider and the maximum annealing 'temperature'.
Annealing::Annealing(const Configurations &configurations,
Expand Down
Loading

0 comments on commit 35de111

Please sign in to comment.