
Scrappy GA on Cuda.

PostPosted: Fri Apr 23, 2021 5:37 pm
by hbyte
Here we are so far. The objective: send three functions to the onboard CUDA device.

We are using a Genetic Algo to search for solutions to some basic problems.

The first function to send is called bin2dec, a binary-to-decimal converter. It takes a set of genes for each population member and outputs a neat set of decimal values corresponding to a solution.

Input = N x L x M

N = population size
L = length of chromosome (number of solution variables)
M = size of bit-coded gene

First off we need to flatten the arrays so we have a single input array

Input[N x L x M]

acceptable to the CUDA device.
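For illustration, here's a minimal sketch of one way to index a flattened genome on the host (the function name and layout here are mine, not the exact layout used in the code below):

Code:
// Sketch: bit m of solution variable l, for individual n,
// in a row-major flattened N*L*M genome.
int bit_at(const int *flat, int L, int M, int n, int l, int m) {
    return flat[n * L * M + l * M + m];
}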

What else is acceptable to CUDA, beyond SAXPY, are atomic add and multiply operations, to be used alongside Y[i] = aX[i] + Y[i].
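For reference, a SAXPY-style kernel with an atomic accumulation might look something like this (just a sketch, not the code used below):

Code:
// Sketch: Y[i] = a*X[i] + Y[i], plus an atomicAdd into a running total.
__global__ void saxpy_atomic(int n, float a, const float *x, float *y, float *total) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] = a * x[i] + y[i];
        atomicAdd(total, y[i]);   // float atomicAdd is a CUDA built-in
    }
}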

1. Bin2dec function to CUDA - The bin2dec function converts a binary string into a decimal number (actually it's not a string but an array of 0/1's). For example, the bit array {1,0,1} reads as binary 101, i.e. decimal 5.

Ok so here is the first function sent to CUDA and the timings:

It took me a while to figure this out, as the atomicAdd function can produce CUDA runtime errors. The main issue, though, was that it seemed to be slower than the CPU. That was because I had included in the timing the transfer back from the CUDA device / GPU to my CPU. When the CUDA bin2dec function was timed exclusively, it took a fraction of the CPU time. Hurrah.

So the take-home message is: only do the transfer at the end of the algo, otherwise keep variables on the device!
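Since kernel launches are asynchronous, the cleanest way to time only the device work is with CUDA events rather than clock(). A sketch (my_kernel is a stand-in name):

Code:
// Sketch: device-side timing with cudaEvent, one transfer at the very end.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
my_kernel<<<grid, block>>>(/* device pointers stay on the GPU */);
cudaEventRecord(stop);
cudaEventSynchronize(stop);              // wait until the kernel has finished
float ms = 0;
cudaEventElapsedTime(&ms, start, stop);  // elapsed GPU time in milliseconds
// ...keep working on-device; do the single DeviceToHost memcpy only at the end.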

Here's the code:
Code:
#include <cstdlib>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <iostream>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
//#include <sm_60_atomic_functions.h>
#include <cuda.h>
using namespace std;


__global__
void bin2dec(int N,int L,int W, double sum_, int *x,double  *y,float *id3,float *id4);
void init_cuda(int size,int lenchrom,int popsize);

int main(){

init_cuda(50,2,300);   //gene width 50 bits, 2 chromosomes, population of 300

}

void init_cuda(int size,int lenchrom,int popsize){

//Bin2dec
float* varr_;
float* darr_;

int*    binary;
double* b2dec;

int N,L,W;
double sum;


clock_t startTime, endTime;

int dgene[popsize*size*lenchrom];      //Genes for all individuals, flattened to one N*L*W array

W = size;
N = popsize;
L = lenchrom;


sum =0;
for(int i=0;i<W;i++){
sum+=pow(2,i);          //sum = 2^W - 1, the largest value a W-bit gene can encode
         }

double b2dout[N*L];        // Output from bin2dec         
      
binary = NULL;
//cwgts  = NULL;
//fitness =NULL;

//Init and Print out genes
for(int i=0;i<N;i++){
for(int j=0;j<L;j++){
cout<<"\n\n";
for(int k=0;k<W;k++){
dgene[i+N*(j+L*k)]=rand()%2; //0 or 1, layout: individual + N*(chromosome + L*bit)
cout<<dgene[i+N*(j+L*k)]<<",";
}}}

//Start Bin2dec testing
///Prep matrix
float *varr,*darr;
float *_varr,*_darr;
varr = new float[L*W];
darr = new float[N*L];
_varr = new float[L*W];
_darr = new float[N*L];


//Create place-value table for Val: entry [bit + W*chromosome] holds 2^bit
float val_=0;

for(int j=0;j<L;j++){
val_=1;
for(int i=0;i<W;i++){
varr[i+W*j]=val_;    //1,2,4,... for bit positions 0..W-1
_varr[i+W*j]=val_;
val_*=2;
}}
               
for(int i=0;i<N;i++){
for(int j=0;j<L;j++){
darr[i+N*j]=0;
_darr[i+N*j]=0;
               }
               }               

   

cudaMalloc(&varr_, L * W *sizeof(float));   //Place-value table (Val) used for bin2dec counting
cudaMalloc(&darr_, N * L *sizeof(float));   //Accumulator (Dec) for the bin2dec values

cudaMemcpy(darr_, darr, N * L *sizeof(float), cudaMemcpyHostToDevice); //Copy zeroed accumulator
cudaMemcpy(varr_, varr, L * W *sizeof(float), cudaMemcpyHostToDevice); //Copy place values

cudaMalloc(&binary, L * N * W *sizeof(int));   //Binary in
cudaMalloc(&b2dec, L * N *sizeof(double));   //Double out

cudaMemcpy(binary, dgene, L*N*W*sizeof(int), cudaMemcpyHostToDevice); //Copy entire population genome to device

int threadlimit = 1024;
int numthreads = N * L * W;   //One thread per gene bit

int requiredblocks = (numthreads + threadlimit - 1)/threadlimit;   //Ceiling division

dim3 grid(requiredblocks, 1, 1);
dim3 block(threadlimit, 1, 1);

//Begin Bin2dec testing

startTime = clock();

bin2dec<<<grid,block>>>(N,L,W, sum, binary, b2dec,varr_,darr_);
cudaDeviceSynchronize();   //Kernel launches are asynchronous; wait for completion before stopping the clock

endTime = clock();

/*This memcpy takes too much time therefore keep everything on device*/
cudaMemcpy(b2dout, b2dec, N*L*sizeof(double), cudaMemcpyDeviceToHost);





cudaFree(binary);
cudaFree(b2dec);
cudaFree(varr_);
cudaFree(darr_);

double gpuTime = endTime-startTime;

 //End Bin2dec testing

 for(int i=0;i<N;i++){
 cout<<"\n";
 for(int j=0;j<L;j++){
 cout<<b2dout[i+N*j]<<",";
 }
 }


 
//Start cpu bin2dec



float dec=0;
float val;
sum=0;


for(int i=0;i<size;i++){
sum+=pow(2,i);
         }
         
         
startTime = clock();

if(1){
         
for(int k=0;k<N;k++){         //Count through population of individuals
         
for(int j=0;j<L;j++){            //Count through number of chromosomes

val =1;

for(int i=0;i<W;++i){         //Count through gene bits, least significant value first

if(dgene[k+N*(j+L*(W-1-i))] == 1){   //W-1-i keeps the bit index in range [0,W)

dec += val;
         }
val *=2;

}

b2dout[k+N*j]=(((dec/sum)*(1-0))/4)+0;   //Feature scale to (0,1) using max=4

dec=0;

                  }   
                  }
}else{   //Unused alternative: accumulate the place values directly

for(int k=0;k<N;k++){
for(int j=0;j<W;j++){
for(int i=0;i<L;i++){

if(dgene[k+N*(i+L*(W-1-j))]){_darr[k+N*i]+=_varr[j+W*i];}

 b2dout[k+N*i] = (((_darr[k+N*i]/sum)*(1))/4)+0;

}}}
   }               
               
               
endTime = clock();

double cpuTime = endTime-startTime;

cout<<"\n\n************\n\n";
 
 for(int i=0;i<N;i++){
 cout<<"\n";
 for(int j=0;j<L;j++){
 cout<<b2dout[i+N*j]<<",";
 }
 }

 
 cout<<"\n\n*******************\n*******************\nNon-CUDA cpu took "<<1000*((cpuTime)/(double)CLOCKS_PER_SEC) << " ms. \n\n*******************\n*******************" << endl;
 cout<<"\n\n*******************\n*******************\nCUDA gpu took "<<1000*((gpuTime)/(double)CLOCKS_PER_SEC) << " ms. \n\n*******************\n*******************" << endl;

delete[] _varr; delete[] _darr; delete[] darr; delete[] varr;   //b2dout lives on the stack

}


#if __CUDA_ARCH__ < 600
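// Note: the kernel below calls the built-in atomicAdd, which only exists for
// double on compute capability 6.0 (sm_60) and newer; on older devices this
// CAS-based fallback (atomicAdd_) would have to be called in its place.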
__device__ double atomicAdd_(double* address, double val)
{
    unsigned long long int* address_as_ull =
                              (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                               __longlong_as_double(assumed)));

    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);

    return __longlong_as_double(old);
}
#endif


__global__
void bin2dec(int N_,int L,int W, double sum_, int *x,double  *y,float *id3,float *id4){


const int pos = blockIdx.x * blockDim.x + threadIdx.x;
   const int size = blockDim.x * gridDim.x;
   double upper=1,lower=0;   //Set bounds
   const int N = N_*L*W;     //Total number of gene bits

//Each thread handles its own slice of the N_*L*W bit positions
for (int n = N * pos / size; n < N * (pos+1) / size; ++n) {

int idx = n;
      const int i1 = ((idx /= 1   ) % L);    //Chromosome index
      const int i2 = ((idx /= L   ) % W);    //Bit index
      const int i3 = ((idx /= W   ) % N_);   //Individual index

if(x[i3+N_*(i1+L*(W-1-i2))]) atomicAdd(&id4[i3+N_*i1],id3[i2+W*i1]);   //Add this bit's place value

    y[i3+N_*i1] = (((id4[i3+N_*i1]/sum_)*(upper-lower))/4)+lower;   //Feature scaled (0,1) using max=4
                                                   //Option calc max-min from binary ??
                                                   //NB: nothing synchronises the atomicAdds with this
                                                   //read, so y can be written before every bit lands

                              }

}







Re: Scrappy GA on Cuda.

PostPosted: Fri Jun 04, 2021 11:08 pm
by hbyte
Compile with nvcc; it currently works on CUDA 7.5.
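The build is just nvcc on the single .cu file, e.g. (file name illustrative):

Code:
nvcc -o ga_bin2dec ga_bin2dec.cu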

So that's good news: it means that with a good enough GPU you can theoretically run a large population in your Genetic Algorithm in parallel. This output is for a population of 300 running Bin2dec. Keep the variables on the GPU, run the fitness test followed by selection and crossover / mutation all on the GPU, and only when done move the solution back, and we're done!
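As a sketch of what one of those on-device steps could look like, here's a hypothetical mutation kernel; it assumes a device array of uniform random floats (e.g. filled by cuRAND) so the genome never leaves the GPU:

Code:
// Sketch: flip each gene bit with probability pmut, entirely on the device.
__global__ void mutate(int total_bits, float pmut, const float *rnd, int *genes) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < total_bits && rnd[i] < pmut)
        genes[i] = 1 - genes[i];   // flip 0 <-> 1
}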

The following shows the time saving (Took a while to get it working!)
Code:

*******************
*******************
Non-CUDA cpu took 0.255 ms.

*******************
*******************


*******************
*******************
CUDA gpu took 0.017 ms.

*******************
*******************


Re: Scrappy GA on Cuda.

PostPosted: Sat Jul 17, 2021 2:42 pm
by hbyte
This method only works on compute_20, so use the appropriate nvcc arch flag!
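Presumably that means targeting the architecture explicitly, something along the lines of (file name illustrative):

Code:
nvcc -arch=compute_20 -code=sm_20 -o ga_bin2dec ga_bin2dec.cu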