multithreaded.cpp

This tutorial shows how to use ViennaCL with multiple threads, one thread per GPU. Using one thread per context (host, CUDA, or OpenCL) is supported; using more than one thread per context, however, is not fully covered by the OpenCL standard. It is perfectly fine to use multiple OpenCL contexts simultaneously, each driven by its own thread.
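The key pattern is the pairing of viennacl::ocl::setup_context() and viennacl::ocl::get_context(): a context id is first bound to a device, and the thread responsible for that context later retrieves it by the same id. A minimal sketch (the variable some_device is hypothetical and stands for a previously queried viennacl::ocl::device):

// Bind context id 0 to a device (done once, before any thread starts):
viennacl::ocl::setup_context(0, some_device); // some_device: hypothetical, obtained from a platform query
// Inside the worker thread, retrieve the context by the same id:
viennacl::context ctx(viennacl::ocl::get_context(0));

The tutorial below applies exactly this pattern with context ids 0 and 1, one per thread.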

We start with including the necessary headers:

#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif
// include necessary system headers
#include <iostream>
#include <sstream>

//include basic scalar, vector, and matrix types of ViennaCL
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"

//include the generic inner product, matrix-vector product, and norm functions of ViennaCL
#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/linalg/norm_2.hpp"

#include <boost/thread.hpp>

Each thread runs a separate task, for which we provide the following functor. Each thread carries out vector operations and computes a vector norm.

template<typename NumericT>
class worker
{
public:
  worker(std::size_t tid) : thread_id_(tid) {}

  void operator()()
  {
    std::size_t N = 6;

    // Obtain the ViennaCL context associated with this thread's OpenCL context id:
    viennacl::context ctx(viennacl::ocl::get_context(static_cast<long>(thread_id_)));

    // Work data (assumed setup): two vectors and a matrix, all created within this thread's context:
    viennacl::vector<NumericT> u = viennacl::scalar_vector<NumericT>(N, NumericT(1) * NumericT(thread_id_ + 1), ctx);
    viennacl::vector<NumericT> v = viennacl::scalar_vector<NumericT>(N, NumericT(2) * NumericT(thread_id_ + 1), ctx);
    viennacl::matrix<NumericT> A = viennacl::identity_matrix<NumericT>(N, ctx);

    u += v;
    viennacl::vector<NumericT> x = viennacl::linalg::prod(A, u); // assumed matrix-vector product
    NumericT result = viennacl::linalg::norm_2(x);               // the vector norm reported below

    std::stringstream ss;
    ss << "Result of thread " << thread_id_ << " on device "
       << viennacl::ocl::get_context(static_cast<long>(thread_id_)).devices()[0].name()
       << ": " << result << std::endl;
    ss << " A: " << A << std::endl;
    ss << " x: " << x << std::endl;
    message_ = ss.str();
  }

  std::string message() const { return message_; }

private:
  std::string message_;
  std::size_t thread_id_;
};

In the main routine we create two OpenCL contexts and then use one thread per context to run the operations in the functor defined above.

int main()
{
  // Change this type definition to double if your GPU supports double precision
  typedef float ScalarType;

  // Check that at least one OpenCL platform is available:
  if (viennacl::ocl::get_platforms().size() == 0)
  {
    std::cerr << "Error: No platform found!" << std::endl;
    return EXIT_FAILURE;
  }

Part 1: Set up the first device for the first context and the second device for the second context:

  // Query all devices of the first platform:
  viennacl::ocl::platform pf = viennacl::ocl::get_platforms()[0];
  std::vector<viennacl::ocl::device> const & devices = pf.devices();

  // Set first device to first context:
  viennacl::ocl::setup_context(0, devices[0]);

  // Set second device for second context (use the same device for the second context if only one device is available):
  if (devices.size() > 1)
    viennacl::ocl::setup_context(1, devices[1]);
  else
    viennacl::ocl::setup_context(1, devices[0]);

Part 2: Now let two threads operate on two GPUs in parallel.

  worker<ScalarType> work_functor0(0);
  worker<ScalarType> work_functor1(1);
  boost::thread worker_thread_0(boost::ref(work_functor0));
  boost::thread worker_thread_1(boost::ref(work_functor1));

  worker_thread_0.join();
  worker_thread_1.join();

  std::cout << work_functor0.message() << std::endl;
  std::cout << work_functor1.message() << std::endl;
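Note that the functors are handed to boost::thread via boost::ref. boost::thread copies its callable by default, so without the reference wrapper each thread would write to a private copy of the functor and the message() calls above would return empty strings.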

We're done - print a success message.

std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
return EXIT_SUCCESS;
}
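To build the example, the program has to be linked against Boost.Thread and an OpenCL runtime. A typical command line looks as follows (assumptions: g++ on a Unix-like system; /path/to/ViennaCL is a placeholder for the ViennaCL include directory; library names may differ on your system):

g++ multithreaded.cpp -o multithreaded -I/path/to/ViennaCL -lboost_thread -lboost_system -lOpenCL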

Full Example Code

/* =========================================================================
Copyright (c) 2010-2015, Institute for Microelectronics,
Institute for Analysis and Scientific Computing,
TU Wien.
Portions of this software are copyright by UChicago Argonne, LLC.
-----------------
ViennaCL - The Vienna Computing Library
-----------------
Project Head: Karl Rupp rupp@iue.tuwien.ac.at
(A list of authors and contributors can be found in the PDF manual)
License: MIT (X11), see file LICENSE in the base directory
============================================================================= */
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif
// include necessary system headers
#include <iostream>
#include <sstream>

//include basic scalar, vector, and matrix types of ViennaCL
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"

//include the generic inner product, matrix-vector product, and norm functions of ViennaCL
#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/linalg/norm_2.hpp"

#include <boost/thread.hpp>
template<typename NumericT>
class worker
{
public:
  worker(std::size_t tid) : thread_id_(tid) {}

  void operator()()
  {
    std::size_t N = 6;

    // Obtain the ViennaCL context associated with this thread's OpenCL context id:
    viennacl::context ctx(viennacl::ocl::get_context(static_cast<long>(thread_id_)));

    // Work data (assumed setup): two vectors and a matrix, all created within this thread's context:
    viennacl::vector<NumericT> u = viennacl::scalar_vector<NumericT>(N, NumericT(1) * NumericT(thread_id_ + 1), ctx);
    viennacl::vector<NumericT> v = viennacl::scalar_vector<NumericT>(N, NumericT(2) * NumericT(thread_id_ + 1), ctx);
    viennacl::matrix<NumericT> A = viennacl::identity_matrix<NumericT>(N, ctx);

    u += v;
    viennacl::vector<NumericT> x = viennacl::linalg::prod(A, u); // assumed matrix-vector product
    NumericT result = viennacl::linalg::norm_2(x);               // the vector norm reported below

    std::stringstream ss;
    ss << "Result of thread " << thread_id_ << " on device "
       << viennacl::ocl::get_context(static_cast<long>(thread_id_)).devices()[0].name()
       << ": " << result << std::endl;
    ss << " A: " << A << std::endl;
    ss << " x: " << x << std::endl;
    message_ = ss.str();
  }

  std::string message() const { return message_; }

private:
  std::string message_;
  std::size_t thread_id_;
};
int main()
{
  // Change this type definition to double if your GPU supports double precision
  typedef float ScalarType;

  // Check that at least one OpenCL platform is available:
  if (viennacl::ocl::get_platforms().size() == 0)
  {
    std::cerr << "Error: No platform found!" << std::endl;
    return EXIT_FAILURE;
  }

  // Part 1: Set up the first device for the first context and the second device for the second context.
  viennacl::ocl::platform pf = viennacl::ocl::get_platforms()[0];
  std::vector<viennacl::ocl::device> const & devices = pf.devices();

  // Set first device to first context:
  viennacl::ocl::setup_context(0, devices[0]);

  // Set second device for second context (use the same device for the second context if only one device is available):
  if (devices.size() > 1)
    viennacl::ocl::setup_context(1, devices[1]);
  else
    viennacl::ocl::setup_context(1, devices[0]);

  // Part 2: Let two threads operate on the two contexts in parallel.
  worker<ScalarType> work_functor0(0);
  worker<ScalarType> work_functor1(1);
  boost::thread worker_thread_0(boost::ref(work_functor0));
  boost::thread worker_thread_1(boost::ref(work_functor1));

  worker_thread_0.join();
  worker_thread_1.join();

  std::cout << work_functor0.message() << std::endl;
  std::cout << work_functor1.message() << std::endl;

  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;

  return EXIT_SUCCESS;
}