Scrappy GA on Cuda.

by **hbyte** » Fri Apr 23, 2021 5:37 pm

Here we are so far. Objective send three functions to the onboard CUDA device.

We are using a Genetic Algo to search for solutions to some basic problems.

The first function to send is call bin2dec, binary to decimal convertor. Takes a set of genes for each population member and outputs a neat set of decimal values corresponding to a solution.

Input = NxLxM

N= population
L= lengthof chromosome (number of solution variables)
M=size of bit coded gene

First off we need to flatten the arrays so we have an input array

Input[ NxLxM]

Acceptable to the Cuda device.

What else is acceptable to Cuda, beyond SAXPY, is ATOMIC add, and multiply functions to be used alongside Y[i] = aX[i] + Y[i]

1. Bin2dec function to CUDA - The bin2dec function is a way to convert a binary string into a decimal
number. (Actually its not a string but an array of 0/1's)

Ok so here is the first function sent to CUDA and the timings:

It took me a while to figure this out as the atomicAdd function produces cuda runtime errors. However the main thing was that it seemed to be slower than the cpu. But that was because I included in the timing the transfer back from the CUDA device / GPU to my CPU . When timed exclusively the CUDA function bin2dec it was a fraction of the CPU time. Hurrah.

So the take home message is only do the transfer at the end of the algo, otherwise keep variables on the device!

Heres the code:

Code: Select all: #include <cstdlib> #include <vector> #include <memory> #include <cublas_v2.h> #include <cuda_runtime_api.h> //#include <sm_60_atomic_functions.h> #include <cuda.h> #include <cstdio> #include <iostream> #include <ctime> #include <time.h> using namespace std; __global__ void bin2dec(int N,int L,int W, double sum_, int *x,double *y,float *id3,float *id4); void init_cuda(int size,int lenchrom,int popsize); int main(){ init_cuda(50,2,300); } void init_cuda(int size,int lenchrom,int popsize){ //Bin2dec float* varr_; float* darr_; int* binary; double* b2dec; int N,L,W; double sum; clock_t startTime, endTime; int dgene[popsize*size*lenchrom]; //Pointer to genes for all individuals W = size; N = popsize; L = lenchrom; sum =0; for(int i=0;i<W;i++){ sum+=pow(2,i); } double b2dout[N*L]; // Output from bin2dec binary = NULL; //cwgts = NULL; //fitness =NULL; //Init and Print out genes for(int i=0;i<N;i++){ for(int j=0;j<L;j++){ cout<<"\n\n"; for(int k=0;k<W;k++){ dgene[i+W*(j+L*k)]=rand()%2; //0 or 1 cout<<dgene[i+W*(j+L*k)]<<","; }}} //Start Bin2dec testing ///Prep matrix float *varr,*darr; float *_varr,*_darr; varr = new float[L*W]; darr = new float[N*L]; _varr = new float[L*W]; _darr = new float[N*L]; //Create Id mat for Val float val_=0; for(int j=0;j<L;j++){ val_=1; for(int i=W;i>0;i--){ val_*=2; varr[i+W*j]=val_; _varr[i+W*j]=val_; }} for(int i=0;i<N;i++){ for(int j=0;j<L;j++){ darr[i+L*j]=0; _darr[i+L*j]=0; } } cudaMalloc(&varr_, L * W *sizeof(float)); //Identity matrix for Val //Id matrix Used for bin2dec counting cudaMalloc(&darr_, N * L *sizeof(float)); //Identity matrix for Dec //Id matrix Bin2dec value cudaMemcpy(darr_, darr, N * L *sizeof(float), cudaMemcpyHostToDevice); //Copy Id matrix cudaMemcpy(varr_, varr, L * W *sizeof(float), cudaMemcpyHostToDevice); //Copy Id matrix cudaMalloc(&binary, L * N * W *sizeof(int)); //Binary in cudaMalloc(&b2dec, L * N *sizeof(double)); //Double out cudaMemcpy(binary, &dgene, L*N*W*sizeof(int), cudaMemcpyHostToDevice); //Copy entire population genome to device int threadlimit = 1024; int numthreads = N * L * W; int requiredblocks = (numthreads/threadlimit) +1; dim3 grid(requiredblocks, 1, 1); dim3 block(threadlimit, 1, 1); //Begin Bin2dec testing startTime = clock(); bin2dec<<<grid,block>>>(N,L,W, sum, binary, b2dec,varr_,darr_); endTime = clock(); /*This memcpy takes too much time therefore keep everything on device*/ cudaMemcpy(b2dout, b2dec, N*L*sizeof(double), cudaMemcpyDeviceToHost); cudaFree(binary); cudaFree(b2dec); cudaFree(varr_); cudaFree(darr_); double gpuTime = endTime-startTime; //End Bin2dec testing for(int i=0;i<N;i++){ cout<<"\n"; for(int j=0;j<L;j++){ cout<<b2dout[i+L*j]<<","; } } //Start cpu bin2dec float dec=0; float val; sum=0; for(int i=0;i<size;i++){ sum+=pow(2,i); } startTime = clock(); if(1){ for(int k=0;k<N;k++){ //Count through population of individuals for(int j=0;j<L;j++){ //Count through number of chromosomes val =1; for(int i=0;i<W;++i){ //Count down genes if(dgene[k+W*(j+L*(W-i))] == 1){ dec += val; } val *=2; } b2dout[k+L*j]=(((dec/sum)*(1-0))/4)+0; //Set max=4 feature scaling to 0,1 dec=0; } } }else{ for(int k=0;k<N;k++){ for(int j=0;j<W;j++){ for(int i=0;i<L;i++){ //(dgene[k+W*(j+L*(W-i))]) ? _darr[k+L*i]+=_varr[j+W*i] : 0 ; if(dgene[k+W*(j+L*(W-i))]){_darr[k+L*i]+=_varr[j+W*i];} //if(1){_darr[k+L*i]+=_varr[j+W*i];} b2dout[k+L*i] = (((_darr[k+L*i]/sum)*(1))/4)+0; }}} } endTime = clock(); double cpuTime = endTime-startTime; cout<<"\n\n************\n\n"; for(int i=0;i<N;i++){ cout<<"\n"; for(int j=0;j<L;j++){ cout<<b2dout[i+L*j]<<","; } } cout<<"\n\n*******************\n*******************\nNon-CUDA cpu took "<<1000*((cpuTime)/(double)CLOCKS_PER_SEC) << " ms. \n\n*******************\n*******************" << endl; cout<<"\n\n*******************\n*******************\nCUDA gpu took "<<1000*((gpuTime)/(double)CLOCKS_PER_SEC) << " ms. \n\n*******************\n*******************" << endl; delete _varr,_darr,darr,varr,b2dout; } #if __CUDA_ARCH__ < 600 __device__ double atomicAdd_(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); return __longlong_as_double(old); } #endif __global__ void bin2dec(int N_,int L,int W, double sum_, int *x,double *y,float *id3,float *id4){ const int pos = blockIdx.x * blockDim.x + threadIdx.x; const int size = blockDim.x * gridDim.x; double upper=1,lower=0; //Set bounds const int N = N_*L*W; for (int n = N * pos / size; n < N * (pos+1) / size; ++n) { int idx = n; const int i1 = ((idx /= 1 ) % L); const int i2 = ((idx /= L ) % W); const int i3 = ((idx /= W ) % N_); (x[i3+W*(i1+L*(W-i2))]) ? atomicAdd(&id4[i3+L*i1],id3[i2+W*i1]) : NULL ; y[i3+L*i1] = (((id4[i3+L*i1]/sum_)*(upper-lower))/4)+lower; //Feature scaled (0,1) using max=4 //Option calc max-min from binary ?? } }

by **hbyte** » Fri Jun 04, 2021 11:08 pm

Compile with nvcc, currently works on CUDA 7.5

So thats good news means that with a good enough GPU you can theoretically run a large population in your Genetic Algorithm in paralell. This output is for 300 population running Bin2dec, keep the variables on the GPU and then run fitness test followed by selection crossover / mutation all on the GPU. Then when done at the end move back the solution, and we're done!

The following shows the time saving (Took a while to get it working!)

Code: Select all: ******************* ******************* Non-CUDA cpu took 0.255 ms. ******************* ******************* ******************* ******************* CUDA gpu took 0.017 ms. ******************* *******************

by **hbyte** » Sat Jul 17, 2021 2:42 pm

This method only works on compute_20 , use the nvcc flag!

Scrappy GA on Cuda.

Scrappy GA on Cuda.

Re: Scrappy GA on Cuda.

Re: Scrappy GA on Cuda.

Who is online