#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H

namespace Eigen {
// Default device for the machine (typically a single cpu core)
struct DefaultDevice {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return internal::aligned_malloc(num_bytes);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    internal::aligned_free(buffer);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
    ::memcpy(dst, src, n);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    // There is no separate device memory on the default device: a
    // host-to-device copy is a plain memcpy.
    memcpy(dst, src, n);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    memcpy(dst, src, n);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
    ::memset(buffer, c, n);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
#ifndef __CUDA_ARCH__
    // Running single threaded on the host CPU.
    return 1;
#else
    // Running on a CUDA device.
    return 32;
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
    // Running on the host CPU.
    // Should return an enum that encodes the ISA supported by the CPU.
    return 1;
#else
    // Running on a CUDA device.
    return __CUDA_ARCH__ / 100;
#endif
  }
};
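// Illustrative sketch (not part of the original header, using only the API
// declared above): how calling code drives a device object. The same
// allocate / memset / memcpy / deallocate calls exist on ThreadPoolDevice and
// GpuDevice, which is what lets tensor evaluators be templated on the device
// type.
//
//   Eigen::DefaultDevice device;
//   float* buf = static_cast<float*>(device.allocate(128 * sizeof(float)));
//   device.memset(buf, 0, 128 * sizeof(float));
//   device.deallocate(buf);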
#ifdef EIGEN_USE_THREADS

// This defines an interface that ThreadPoolDevice can take to use
// custom thread pools underneath.
class ThreadPoolInterface {
 public:
  virtual void Schedule(std::function<void()> fn) = 0;
  virtual ~ThreadPoolInterface() {}
};

// A simple FIFO pool: functions scheduled from a single thread are run in the
// order in which they were scheduled.
class ThreadPool : public ThreadPoolInterface {
 public:
  // Construct a pool that contains "num_threads" threads.
  explicit ThreadPool(int num_threads) {
    for (int i = 0; i < num_threads; i++) {
      threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
    }
  }

  // Wait until all scheduled work has finished, then destroy the threads.
  ~ThreadPool() {
    {
      // Wait for the pending work to drain, then wake up every waiting
      // thread so it can exit.
      std::unique_lock<std::mutex> l(mu_);
      empty_.wait(l, [this]() { return pending_.empty(); });
      exiting_ = true;
      for (auto w : waiters_) {
        w->ready = true;
        w->work = nullptr;
        w->cv.notify_one();
      }
    }
    for (auto t : threads_) {
      t->join();
      delete t;
    }
  }

  // Schedule fn() for execution in the pool of threads.
  void Schedule(std::function<void()> fn) {
    std::unique_lock<std::mutex> l(mu_);
    if (waiters_.empty()) {
      pending_.push_back(fn);
    } else {
      Waiter* w = waiters_.back();
      waiters_.pop_back();
      w->ready = true;
      w->work = fn;
      w->cv.notify_one();
    }
  }

 protected:
  void WorkerLoop() {
    std::unique_lock<std::mutex> l(mu_);
    Waiter w;
    while (!exiting_) {
      std::function<void()> fn;
      if (pending_.empty()) {
        // Wait for work to be assigned to this thread.
        w.ready = false;
        waiters_.push_back(&w);
        w.cv.wait(l, [&w]() { return w.ready; });
        fn = w.work;
        w.work = nullptr;
      } else {
        // Pick up pending work.
        fn = pending_.front();
        pending_.pop_front();
        if (pending_.empty()) {
          empty_.notify_all();
        }
      }
      if (fn) {
        // Run the function without holding the lock.
        mu_.unlock();
        fn();
        mu_.lock();
      }
    }
  }

 private:
  struct Waiter {
    std::condition_variable cv;
    std::function<void()> work;
    bool ready;
  };

  std::mutex mu_;
  std::vector<std::thread*> threads_;          // All threads.
  std::vector<Waiter*> waiters_;               // Stack of waiting threads.
  std::deque<std::function<void()>> pending_;  // Queue of pending work.
  std::condition_variable empty_;              // Signaled when pending_ becomes empty.
  bool exiting_ = false;
};
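// Illustrative sketch (not part of the original header): ThreadPool above is
// one implementation of ThreadPoolInterface; any scheduler that can run a
// std::function<void()> can be substituted, e.g. an inline scheduler that is
// convenient in unit tests:
//
//   class InlineScheduler : public Eigen::ThreadPoolInterface {
//    public:
//     void Schedule(std::function<void()> fn) { fn(); }
//   };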
// Notification allows one thread to wait for another thread to signal that an
// event has occurred.
class Notification {
 public:
  Notification() : notified_(false) {}

  void Notify() {
    std::unique_lock<std::mutex> l(mu_);
    eigen_assert(!notified_);
    notified_ = true;
    cv_.notify_all();
  }

  void WaitForNotification() {
    std::unique_lock<std::mutex> l(mu_);
    cv_.wait(l, [this]() { return notified_; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  bool notified_;
};

// Runs an arbitrary function and then calls Notify() on the given Notification.
template <typename Function, typename... Args> struct FunctionWrapper
{
  static void run(Notification* n, Function f, Args... args) {
    f(args...);
    n->Notify();
  }
};

static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
  if (n) {
    n->WaitForNotification();
  }
}
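// Illustrative sketch (not part of the original header): the Notification /
// FunctionWrapper pair provides a minimal completion handshake. One thread
// runs the work and calls Notify(); another blocks in WaitForNotification()
// until that happens.
//
//   Eigen::Notification done;
//   std::thread worker([&done]() { /* produce a result */ done.Notify(); });
//   done.WaitForNotification();
//   worker.join();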
// Build a thread pool device on top of an existing pool of threads.
struct ThreadPoolDevice {
  // The ownership of the thread pool remains with the caller.
  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return internal::aligned_malloc(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    internal::aligned_free(buffer);
  }

  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
    ::memcpy(dst, src, n);
  }
  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    memcpy(dst, src, n);
  }
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    memcpy(dst, src, n);
  }

  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
    ::memset(buffer, c, n);
  }

  EIGEN_STRONG_INLINE size_t numThreads() const {
    return num_threads_;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    // Should return an enum that encodes the ISA supported by the CPU.
    return 1;
  }

  template <class Function, class... Args>
  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
    Notification* n = new Notification();
    std::function<void()> func =
        std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...);
    pool_->Schedule(func);
    return n;
  }

  template <class Function, class... Args>
  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
    std::function<void()> func = std::bind(f, args...);
    pool_->Schedule(func);
  }

 private:
  ThreadPoolInterface* pool_;
  size_t num_threads_;
};
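// Illustrative sketch (not part of the original header, using only the API
// declared above): scheduling work on a ThreadPoolDevice and blocking on the
// returned Notification. enqueue() allocates the Notification with new, so
// the caller is responsible for deleting it.
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, 4 /* num_cores */);
//   Eigen::Notification* done = device.enqueue([]() { /* some work */ });
//   Eigen::wait_until_ready(done);
//   delete done;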
#endif  // EIGEN_USE_THREADS


// GPU offloading
#ifdef EIGEN_USE_GPU

// This defines an interface that GpuDevice can take to use
// CUDA streams underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const cudaStream_t& stream() const = 0;
  virtual const cudaDeviceProp& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run.
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;
};

static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;

static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
    if (!m_devicePropInitialized) {
      int num_devices;
      cudaError_t status = cudaGetDeviceCount(&num_devices);
      assert(status == cudaSuccess);
      m_deviceProperties = new cudaDeviceProp[num_devices];
      for (int i = 0; i < num_devices; ++i) {
        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
        assert(status == cudaSuccess);
      }
      m_devicePropInitialized = true;
    }
  }
}

static const cudaStream_t default_stream = cudaStreamDefault;

class CudaStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device.
  CudaStreamDevice() : stream_(&default_stream) {
    cudaGetDevice(&device_);
    initializeDeviceProp();
  }
  // Use the default stream on the specified device.
  CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
    initializeDeviceProp();
  }
  // Use the specified stream. It is the caller's responsibility to ensure
  // that the stream can run on the specified device. If no device is
  // specified, the stream is assumed to be associated with the current device.
  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
      : stream_(stream), device_(device) {
    if (device < 0) {
      cudaGetDevice(&device_);
    } else {
      int num_devices;
      cudaError_t err = cudaGetDeviceCount(&num_devices);
      assert(err == cudaSuccess);
      assert(device < num_devices);
      device_ = device;
    }
    initializeDeviceProp();
  }

  const cudaStream_t& stream() const { return *stream_; }
  const cudaDeviceProp& deviceProperties() const {
    return m_deviceProperties[device_];
  }
  virtual void* allocate(size_t num_bytes) const {
    cudaError_t err = cudaSetDevice(device_);
    assert(err == cudaSuccess);
    void* result;
    err = cudaMalloc(&result, num_bytes);
    assert(err == cudaSuccess);
    assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    cudaError_t err = cudaSetDevice(device_);
    assert(err == cudaSuccess);
    assert(buffer != NULL);
    err = cudaFree(buffer);
    assert(err == cudaSuccess);
  }

 private:
  const cudaStream_t* stream_;
  int device_;
};
struct GpuDevice {
  // The StreamInterface is not owned: the caller is responsible for its
  // initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
    eigen_assert(stream);
  }

  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
#ifndef __CUDA_ARCH__
    return stream_->allocate(num_bytes);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return NULL;
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
#ifndef __CUDA_ARCH__
    stream_->deallocate(buffer);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                      stream_->stream());
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the l2 cache for the time being, and
    // there is no l3 cache on cuda devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaStreamSynchronize(stream_->stream());
    assert(err == cudaSuccess);
#else
    assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  inline int getNumCudaMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  inline int maxCudaThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  inline int maxCudaThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  inline int sharedMemPerBlock() const {
    return stream_->deviceProperties().sharedMemPerBlock;
  }
  inline int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }

  // Checks whether the CUDA runtime recorded an error for the stream.
  inline bool ok() const {
    cudaError_t error = cudaStreamQuery(stream_->stream());
    return (error == cudaSuccess) || (error == cudaErrorNotReady);
  }

 private:
  const StreamInterface* stream_;
};
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)           \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
  assert(cudaGetLastError() == cudaSuccess);
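// Illustrative sketch (not part of the original header; the kernel and launch
// parameters my_kernel, grid, block, arg0 and arg1 are hypothetical):
// constructing a GpuDevice on top of a CudaStreamDevice and launching a
// kernel on its stream via the macro above.
//
//   Eigen::CudaStreamDevice stream;   // default stream on the current device
//   Eigen::GpuDevice device(&stream);
//   LAUNCH_CUDA_KERNEL(my_kernel, grid, block, 0, device, arg0, arg1);
//   device.synchronize();
//   assert(device.ok());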
static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
  assert(status == cudaSuccess);
}

#endif  // EIGEN_USE_GPU
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H