// It's a basic iterator that takes the hassle out of writing CUDA index loops. (It comes from the TensorFlow source code.)

#include <algorithm>
#include <complex>
#include <iostream>
#include <math.h>
#include <vector>

/*Begin 1DKern definition */

/*This is a direct copy of TensorFlow's 1DKern code*/

namespace detail {

/*
 * Half-open strided integer range [begin, end) with step `delta`, usable in a
 * range-based for loop:
 *
 *    for (int i : detail::GpuGridRange<int>(first, step, count)) { ... }
 *
 * T is the index type. Every member is marked __host__ __device__: nothing in
 * this class touches device-only built-ins, so the range can also be
 * exercised from host code (e.g. in unit tests).
 */
template <typename T>
class GpuGridRange {

   /* Forward iterator over the range. An iterator whose delta_ is 0 is an
      "end" sentinel: it never advances and only marks the upper bound. */
   struct Iterator {
      __host__ __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}

      /* Current index. */
      __host__ __device__ T operator*() const { return index_; }

      /* Advance by one stride. */
      __host__ __device__ Iterator& operator++() {
         index_ += delta_;
         return *this;
      }

      /* Inequality that treats every position at or past an end sentinel as
         equal to it, so a stride that jumps over `end` still stops the loop. */
      __host__ __device__ bool operator!=(const Iterator& other) const {
         bool greater = index_ > other.index_;
         bool less = index_ < other.index_;
         // `other` is an end sentinel: keep looping only while still below it.
         // In a range-based for loop this is the branch that is always taken.
         if (!other.delta_) {
            return less;
         }
         // `this` is the end sentinel: symmetric case.
         if (!delta_) {
            return greater;
         }
         // Two ordinary iterators: plain inequality.
         return less || greater;
      }

    private:
      T index_;
      const T delta_;

   };   //end Iterator struct

 public:
   /* begin: first index, delta: stride between indices, end: exclusive bound. */
   __host__ __device__ GpuGridRange(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

   /* Iterator positioned at the first index, advancing by delta. */
   __host__ __device__ Iterator begin() const { return Iterator(begin_, delta_); }

   /* End sentinel (delta 0) positioned at the exclusive upper bound. */
   __host__ __device__ Iterator end() const { return Iterator(end_, 0); }

 private:
   T begin_;
   T delta_;
   T end_;

};   //end GpuGridRange class
}   //end namespace detail

/*
 * Grid-stride range along the x axis for the calling thread.
 *
 * The range starts at this thread's global x index
 * (blockIdx.x * blockDim.x + threadIdx.x), advances by the total number of
 * threads in the grid along x (gridDim.x * blockDim.x), and stops before
 * `count`. Device-only: it reads the CUDA built-in index variables.
 *
 * T is the index type; `count` is the exclusive upper bound.
 */
template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeX(T count) {
   // Cast to T before multiplying so that index types wider than 32 bits do
   // not wrap around in unsigned int arithmetic.
   return detail::GpuGridRange<T>(
      /*begin*/static_cast<T>(blockIdx.x) * blockDim.x + threadIdx.x,
      /*delta*/static_cast<T>(gridDim.x) * blockDim.x,
      /*end*/count);
}


/*
 * Grid-stride range along the y axis for the calling thread.
 *
 * The range starts at this thread's global y index
 * (blockIdx.y * blockDim.y + threadIdx.y), advances by the total number of
 * threads in the grid along y (gridDim.y * blockDim.y), and stops before
 * `count`. Device-only: it reads the CUDA built-in index variables.
 *
 * T is the index type; `count` is the exclusive upper bound.
 */
template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeY(T count) {
   // Cast to T before multiplying so that index types wider than 32 bits do
   // not wrap around in unsigned int arithmetic.
   return detail::GpuGridRange<T>(
      /*begin*/static_cast<T>(blockIdx.y) * blockDim.y + threadIdx.y,
      /*delta*/static_cast<T>(gridDim.y) * blockDim.y,
      /*end*/count);
}

/*
 * Grid-stride range along the z axis for the calling thread.
 *
 * The range starts at this thread's global z index
 * (blockIdx.z * blockDim.z + threadIdx.z), advances by the total number of
 * threads in the grid along z (gridDim.z * blockDim.z), and stops before
 * `count`. Device-only: it reads the CUDA built-in index variables.
 *
 * T is the index type; `count` is the exclusive upper bound.
 */
template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeZ(T count) {
   // Cast to T before multiplying so that index types wider than 32 bits do
   // not wrap around in unsigned int arithmetic.
   return detail::GpuGridRange<T>(
      /*begin*/static_cast<T>(blockIdx.z) * blockDim.z + threadIdx.z,
      /*delta*/static_cast<T>(gridDim.z) * blockDim.z,
      /*end*/count);
}


// Expands to the header of a range-based for loop that declares `int i` and
// iterates over ::GpuGridRangeX<int>(n). Follow the macro with the loop body:
//    GPU_1D_KERN_LOOP(i, n) { out[i] = in[i]; }
#define GPU_1D_KERN_LOOP(i, n) \
  for (int i : ::GpuGridRangeX<int>(n))

// Same loop header as above, but for a selectable axis: `axis` is token-pasted
// onto `GpuGridRange`, so it must be a bare X, Y or Z to name one of the
// ::GpuGridRangeX / ::GpuGridRangeY / ::GpuGridRangeZ helpers.
#define GPU_AXIS_KERNEL_LOOP(i, n, axis) \
  for (int i : ::GpuGridRange##axis<int>(n))

/*End 1DKern definition*/
