Scrappy GA on Cuda.
Posted: Fri Apr 23, 2021 5:37 pm
Here we are so far. Objective: send three functions to the onboard CUDA device.
We are using a Genetic Algo to search for solutions to some basic problems.
The first function to send is called bin2dec, a binary-to-decimal converter. It takes a set of genes for each population member and outputs a neat set of decimal values corresponding to a solution.
Input = NxLxM
N= population
L = length of chromosome (number of solution variables)
M=size of bit coded gene
First off we need to flatten the arrays so we have an input array
Input[ NxLxM]
Acceptable to the Cuda device.
What else is acceptable to Cuda, beyond SAXPY, is ATOMIC add, and multiply functions to be used alongside Y[i] = aX[i] + Y[i]
1. Bin2dec function to CUDA - The bin2dec function is a way to convert a binary string into a decimal
number. (Actually its not a string but an array of 0/1's)
Ok so here is the first function sent to CUDA and the timings:
It took me a while to figure this out, as the atomicAdd function produces CUDA runtime errors. However, the main thing was that it seemed to be slower than the CPU — but that was because I included in the timing the transfer back from the CUDA device/GPU to my CPU. When I timed the CUDA bin2dec function exclusively, it took a fraction of the CPU time. Hurrah.
So the take home message is only do the transfer at the end of the algo, otherwise keep variables on the device!
Here's the code:
- Code: Select all
#include <cstdlib>
#include <vector>
#include <memory>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
//#include <sm_60_atomic_functions.h>
#include <cuda.h>
#include <cstdio>
#include <iostream>
#include <ctime>
#include <time.h>
using namespace std;
// Device kernel: decodes each individual's bit-coded genes into
// feature-scaled decimal values (definition at the bottom of the file).
__global__
void bin2dec(int N,int L,int W, double sum_, int *x,double *y,float *id3,float *id4);
// Host driver: allocates device buffers, runs the bin2dec kernel and a
// CPU reference decode, and prints both results and their timings.
void init_cuda(int size,int lenchrom,int popsize);
// Entry point: run the bin2dec GPU-vs-CPU timing experiment with a
// 50-bit gene width, 2 chromosomes per individual, and a population of 300.
int main() {
    init_cuda(50, 2, 300);
    return 0;
}
// Host driver for the bin2dec experiment.
//
// Builds a random bit-coded genome for the whole population, decodes it
// once on the GPU (bin2dec kernel) and once on the CPU, prints both
// result sets and the elapsed time of each path.
//
//   size     = W, bits per gene
//   lenchrom = L, genes (solution variables) per individual
//   popsize  = N, individuals in the population
void init_cuda(int size,int lenchrom,int popsize){
    const int W = size;
    const int N = popsize;
    const int L = lenchrom;

    // Minimal error reporting: the original ignored every CUDA status,
    // which makes later failures mysterious (CUDA errors are sticky).
    auto check = [](cudaError_t e, const char* what){
        if (e != cudaSuccess)
            cerr << "CUDA error (" << what << "): " << cudaGetErrorString(e) << "\n";
    };

    // sum = 2^0 + ... + 2^(W-1): maximum raw decoded value, passed to the
    // kernel for feature scaling.
    double sum = 0;
    for (int i = 0; i < W; i++) sum += pow(2, i);

    // Host buffers. std::vector replaces the original variable-length
    // arrays, which are non-standard C++ and can overflow the stack for
    // large populations; it also removes the need for manual delete.
    vector<int> dgene(N * L * W);   // flattened genome, one 0/1 per int
    vector<double> b2dout(N * L);   // decoded output

    // Initialise the genome with random bits and echo it.
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < L; j++) {
            cout << "\n\n";
            for (int k = 0; k < W; k++) {
                // NOTE(review): this i+W*(j+L*k) flattening uses stride W
                // for the population index i, which collides for i >= W;
                // verify the intended memory layout.
                dgene[i + W * (j + L * k)] = rand() % 2; // 0 or 1
                cout << dgene[i + W * (j + L * k)] << ",";
            }
        }
    }

    // Per-bit weight table (id3 in the kernel) and zeroed accumulator
    // (id4 in the kernel).
    vector<float> varr(L * W);
    vector<float> darr(N * L, 0.0f);

    // Fill the weights with doubling powers of two. The original loop ran
    // i from W down to 1, writing one element past the end of the array
    // and leaving index 0 uninitialised; iterate over the valid range
    // [0, W) instead, keeping the same "largest weight at index 0" order.
    for (int j = 0; j < L; j++) {
        float w = 1.0f;
        for (int i = W - 1; i >= 0; i--) {
            w *= 2.0f;                  // 2, 4, ..., 2^W
            varr[i + W * j] = w;
        }
    }

    // Device buffers.
    float *varr_ = nullptr;   // weight table
    float *darr_ = nullptr;   // per-(individual, gene) accumulator, zeroed
    int *binary = nullptr;    // flattened genome
    double *b2dec = nullptr;  // decoded output
    check(cudaMalloc(&varr_, L * W * sizeof(float)), "malloc varr_");
    check(cudaMalloc(&darr_, N * L * sizeof(float)), "malloc darr_");
    check(cudaMalloc(&binary, L * N * W * sizeof(int)), "malloc binary");
    check(cudaMalloc(&b2dec, L * N * sizeof(double)), "malloc b2dec");
    check(cudaMemcpy(varr_, varr.data(), L * W * sizeof(float), cudaMemcpyHostToDevice), "copy weights");
    check(cudaMemcpy(darr_, darr.data(), N * L * sizeof(float), cudaMemcpyHostToDevice), "copy accumulator");
    check(cudaMemcpy(binary, dgene.data(), L * N * W * sizeof(int), cudaMemcpyHostToDevice), "copy genome");

    // Launch configuration: one thread per (individual, gene, bit),
    // rounded up with a ceiling division (the original always added one
    // extra block, even when the count divided evenly).
    const int threadlimit = 1024;
    const int numthreads = N * L * W;
    const int requiredblocks = (numthreads + threadlimit - 1) / threadlimit;
    dim3 grid(requiredblocks, 1, 1);
    dim3 block(threadlimit, 1, 1);

    // --- GPU timing ---
    clock_t startTime = clock();
    bin2dec<<<grid, block>>>(N, L, W, sum, binary, b2dec, varr_, darr_);
    check(cudaGetLastError(), "bin2dec launch");
    // Kernel launches are asynchronous: without this synchronisation the
    // timer measures only the launch overhead, not the kernel itself.
    check(cudaDeviceSynchronize(), "bin2dec sync");
    clock_t endTime = clock();
    double gpuTime = endTime - startTime;

    // Copy the result back once, at the end — device-to-host transfers
    // dominate the cost, so keep intermediates on the device.
    check(cudaMemcpy(b2dout.data(), b2dec, N * L * sizeof(double), cudaMemcpyDeviceToHost), "copy result");
    cudaFree(binary);
    cudaFree(b2dec);
    cudaFree(varr_);
    cudaFree(darr_);

    // Print the GPU result.
    for (int i = 0; i < N; i++) {
        cout << "\n";
        for (int j = 0; j < L; j++) {
            cout << b2dout[i + L * j] << ",";
        }
    }

    // --- CPU reference decode ---
    // (The original also carried an unreachable alternative decode behind
    //  `if(1) ... else`; the dead branch and its `_varr`/`_darr` scratch
    //  buffers have been removed.)
    float dec = 0;
    sum = 0;
    for (int i = 0; i < size; i++) sum += pow(2, i);

    startTime = clock();
    for (int k = 0; k < N; k++) {          // individuals
        for (int j = 0; j < L; j++) {      // chromosomes
            float val = 1;
            for (int i = 0; i < W; ++i) {  // bits, doubling weight per step
                // NOTE(review): W-i runs over [1, W], which does not match
                // the k index range [0, W) used when the genome was
                // stored — confirm the intended bit order against the
                // kernel's indexing.
                if (dgene[k + W * (j + L * (W - i))] == 1) {
                    dec += val;
                }
                val *= 2;
            }
            // Feature-scale to [0, 1] assuming max = 4.
            b2dout[k + L * j] = (((dec / sum) * (1 - 0)) / 4) + 0;
            dec = 0;
        }
    }
    endTime = clock();
    double cpuTime = endTime - startTime;

    // Print the CPU result and the timings.
    cout << "\n\n************\n\n";
    for (int i = 0; i < N; i++) {
        cout << "\n";
        for (int j = 0; j < L; j++) {
            cout << b2dout[i + L * j] << ",";
        }
    }
    cout<<"\n\n*******************\n*******************\nNon-CUDA cpu took "<<1000*((cpuTime)/(double)CLOCKS_PER_SEC) << " ms. \n\n*******************\n*******************" << endl;
    cout<<"\n\n*******************\n*******************\nCUDA gpu took "<<1000*((gpuTime)/(double)CLOCKS_PER_SEC) << " ms. \n\n*******************\n*******************" << endl;
    // No manual delete needed: all host buffers are std::vector. The
    // original `delete _varr,_darr,darr,varr,b2dout;` freed only the
    // first pointer (comma operator), used `delete` on `new[]` memory,
    // and deleted a stack array — all undefined behaviour.
}
// Software fallback for double-precision atomicAdd on devices older than
// compute capability 6.0 (SM60), where the hardware atomicAdd(double*)
// overload does not exist. This is the canonical compare-and-swap loop
// from the CUDA C++ Programming Guide: reinterpret the double as a 64-bit
// integer so atomicCAS can be used, and retry until no other thread has
// changed the value in between.
// NOTE(review): the trailing underscore avoids clashing with the SM60+
// built-in, but nothing in this file actually calls atomicAdd_ — the
// kernel below uses the float overload of atomicAdd instead.
#if __CUDA_ARCH__ < 600
__device__ double atomicAdd_(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
// Kernel: decode each individual's bit-coded genes into feature-scaled
// decimal values.
//   N_   = population size, L = genes per individual, W = bits per gene
//   sum_ = 2^0 + ... + 2^(W-1), used for scaling
//   x    = flattened genome (0/1 per element)
//   y    = decoded, scaled output (one value per individual per gene)
//   id3  = per-bit weight table
//   id4  = per-(individual, gene) accumulator; must be zeroed by the
//          host before launch (the kernel only adds into it)
__global__
void bin2dec(int N_,int L,int W, double sum_, int *x,double *y,float *id3,float *id4){
// Flat global thread index and total number of launched threads.
const int pos = blockIdx.x * blockDim.x + threadIdx.x;
const int size = blockDim.x * gridDim.x;
double upper=1,lower=0; //Set bounds
// Total number of (gene, bit, individual) work items.
// NOTE(review): reusing the name N for N_*L*W shadows the usual meaning
// of N (population size); i3 below is taken modulo N_, not N.
const int N = N_*L*W;
// Partition the N work items contiguously across all threads.
// NOTE(review): N * pos can overflow int for large problems; a
// grid-stride loop with a bounds check would be safer.
for (int n = N * pos / size; n < N * (pos+1) / size; ++n) {
int idx = n;
// Unflatten n into (gene i1, bit i2, individual i3).
const int i1 = ((idx /= 1 ) % L);
const int i2 = ((idx /= L ) % W);
const int i3 = ((idx /= W ) % N_);
// If the bit is set, atomically add its weight into the accumulator.
// (The `: NULL` arm just evaluates to 0; the atomicAdd return value is
// discarded either way.)
(x[i3+W*(i1+L*(W-i2))]) ? atomicAdd(&id4[i3+L*i1],id3[i2+W*i1]) : NULL ;
// NOTE(review): other threads may still be adding into id4[i3+L*i1]
// when this read happens, so the value written to y can miss
// contributions (data race). Finishing the accumulation first and
// scaling in a second kernel would make the result deterministic.
y[i3+L*i1] = (((id4[i3+L*i1]/sum_)*(upper-lower))/4)+lower; //Feature scaled (0,1) using max=4
//Option calc max-min from binary ??
}
}