I use this for all my CUDA - Stolen from TF 1DKern.cu

Postby hbyte » Sat Feb 03, 2024 4:37 pm

It's a basic iterator that just takes the hassle out of writing CUDA loops. (It's in the TensorFlow source code.) There's a short usage sketch after the listing.

Code:
#include <algorithm>
#include <complex>
#include <iostream>
#include <math.h>
#include <vector>


/*Begin 1DKern definition */

/*This is a direct copy of TensorFlow's 1DKern code*/

namespace detail {
template <typename T>
class GpuGridRange {

   // Iterator that walks from index_ in steps of delta_. The end() iterator
   // is constructed with delta_ == 0, which operator!= below relies on.
   struct Iterator {
      __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
      __device__ T operator*() const { return index_; }
      __device__ Iterator& operator++() {
         index_ += delta_;
         return *this;
      }

      __device__ bool operator!=(const Iterator& other) const {
         bool greater = index_ > other.index_;
         bool less = index_ < other.index_;
         // Comparing against the end iterator (delta_ == 0): treat anything
         // at or past the end as equal, so the loop stops even when the
         // stride overshoots it.
         if (!other.delta_) {
            return less;
         }
         if (!delta_) {
            return greater;
         }
         return less || greater;
      }

      private:
      T index_;
      const T delta_;
   };   //end Iterator struct


   public:
   __device__ GpuGridRange(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

   __device__ Iterator begin() const { return Iterator(begin_, delta_); }
   __device__ Iterator end() const { return Iterator(end_, 0); }

   private:
   T begin_;
   T delta_;
   T end_;
};   //end GpuGridRange class
}   //end namespace detail

template <typename T>   //Lets you use the GPU iterator with any integer index type
__device__ detail::GpuGridRange<T> GpuGridRangeX(T count) {
   return detail::GpuGridRange<T>(
      /*begin*/ blockIdx.x * blockDim.x + threadIdx.x,
      /*delta*/ gridDim.x * blockDim.x,
      /*end*/ count);
}

template <typename T>   //Lets you use the GPU iterator with any integer index type
__device__ detail::GpuGridRange<T> GpuGridRangeY(T count) {
   return detail::GpuGridRange<T>(
      /*begin*/ blockIdx.y * blockDim.y + threadIdx.y,
      /*delta*/ gridDim.y * blockDim.y,
      /*end*/ count);
}

template <typename T>   //Lets you use the GPU iterator with any integer index type
__device__ detail::GpuGridRange<T> GpuGridRangeZ(T count) {
   return detail::GpuGridRange<T>(
      /*begin*/ blockIdx.z * blockDim.z + threadIdx.z,
      /*delta*/ gridDim.z * blockDim.z,
      /*end*/ count);
}


#define GPU_1D_KERN_LOOP(i, n) \
  for (int i : ::GpuGridRangeX<int>(n))

#define GPU_AXIS_KERNEL_LOOP(i, n, axis) \
  for (int i : ::GpuGridRange##axis<int>(n))


/*End 1DKern definition*/
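
To show how it slots into a kernel, here's a minimal usage sketch (this part isn't from the TF source; the kernel name, values and the 256-thread launch are just placeholder choices). Because the macro gives you a grid-stride loop, the launch doesn't need one thread per element; whatever grid you pick will cover the whole array.

Code:
/* Example: element-wise y = a*x + y using the macro above. Each thread starts
   at its global index and strides by gridDim.x * blockDim.x until it passes n. */
__global__ void saxpy_kern(int n, float a, const float* x, float* y) {
   GPU_1D_KERN_LOOP(i, n) {
      y[i] = a * x[i] + y[i];
   }
}

/* Host-side launch: any grid size works; 256 threads per block is arbitrary.
   d_x and d_y are device pointers you've already allocated and filled. */
// int blocks = (n + 255) / 256;
// saxpy_kern<<<blocks, 256>>>(n, 2.0f, d_x, d_y);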