Swarm-NG  1.1
utilities.cu
Go to the documentation of this file.
1 /*************************************************************************
2  * Copyright (C) 2008-2010 by Mario Juric & Swarm-NG Development Team *
3  * *
4  * This program is free software; you can redistribute it and/or modify *
5  * it under the terms of the GNU General Public License as published by *
6  * the Free Software Foundation; either version 3 of the License. *
7  * *
8  * This program is distributed in the hope that it will be useful, *
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
11  * GNU General Public License for more details. *
12  * *
13  * You should have received a copy of the GNU General Public License *
14  * along with this program; if not, write to the *
15  * Free Software Foundation, Inc., *
16  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
17  ************************************************************************/
18 
24 #include "../common.hpp"
25 #include "utilities.hpp"
26 
27 
28 namespace swarm {
29 
//! Host-side record copied to the device so the counting kernel can
//! inspect the ensemble and accumulate a result in a single allocation.
struct count_systems_t {
	ensemble ens;      // shallow copy of the ensemble to scan (presumably wraps device pointers -- TODO confirm against ensemble's definition)
	int count_running; // number of active systems; incremented atomically by number_of_active_systems_kernel
	count_systems_t(const ensemble &ens):ens(ens),count_running(0){}
};
35 
//! Kernel: one thread per system; atomically bumps csys->count_running
//! for every system whose state is active. Expects a 1D block layout
//! (blockDim.x threads) over a possibly 3D grid.
__global__
void number_of_active_systems_kernel( count_systems_t* csys){
	// Flatten the (up to 3D) grid plus the thread index into one system id.
	const int flat_block = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
	const int sysid = flat_block * blockDim.x + threadIdx.x;
	// Guard the padded tail of the grid, then count active systems.
	if( sysid < csys->ens.nsys() && csys->ens[sysid].is_active() )
		atomicAdd(&csys->count_running, 1);
}
43 
/**
 * Count the systems in \c ens that are still active.
 *
 * Copies a small counter record to the device, launches
 * number_of_active_systems_kernel over one thread per system, and copies
 * the record back. The blocking device-to-host memcpy also synchronizes
 * with the kernel, so the returned count is final.
 *
 * NOTE(review): the original signature line was lost in extraction; it is
 * reconstructed here from the body (takes the ensemble, returns the count)
 * and from the parallel reactivate_systems(ensemble ens) below -- confirm
 * against utilities.hpp.
 *
 * @param ens ensemble to scan
 * @return number of active systems
 */
int number_of_active_systems(ensemble ens) {
	const int system_per_block = 16*16; // need not be the same as used by the integrator
	const int nblocks = ( ens.nsys() + system_per_block - 1 ) / system_per_block;
	dim3 gD; gD.z = 1;
	find_best_factorization(gD.x,gD.y,nblocks);
	dim3 tD; tD.x = system_per_block; tD.y = 1;

	count_systems_t count_systems(ens), *pcount_systems ;

	cudaErrCheck ( cudaMalloc(&pcount_systems,sizeof(count_systems_t)) );
	cudaErrCheck ( cudaMemcpy(pcount_systems,&count_systems,sizeof(count_systems_t),cudaMemcpyHostToDevice) );

	number_of_active_systems_kernel<<< gD, tD >>>( pcount_systems );
	cudaErrCheck ( cudaGetLastError() ); // catch launch-configuration errors

	// Blocking copy: synchronizes with the kernel and fetches the count.
	cudaErrCheck ( cudaMemcpy(&count_systems,pcount_systems,sizeof(count_systems_t),cudaMemcpyDeviceToHost) );
	cudaErrCheck ( cudaFree(pcount_systems) );

	return count_systems.count_running;
}
65 
/**
 * Compute a grid configuration that covers \c nthreads threads with
 * \c threadsPerBlock threads per block.
 *
 * On success, gridDim.x/gridDim.y are set via find_best_factorization so
 * that gridDim.x * gridDim.y >= ceil(nthreads / threadsPerBlock), and
 * gridDim.z is set to 1.
 *
 * @param gridDim              (out) grid dimensions to use for the launch
 * @param threadsPerBlock      threads per block the caller intends to launch
 * @param nthreads             total number of threads required
 * @param dynShmemPerThread    dynamic shared memory needed per thread (bytes)
 * @param staticShmemPerBlock  static shared memory needed per block (bytes)
 * @return false if the shared-memory requirement exceeds the per-MP limit,
 *         true otherwise (gridDim is only valid when true is returned)
 */
bool configure_grid(dim3 &gridDim, int threadsPerBlock, int nthreads, int dynShmemPerThread, int staticShmemPerBlock)
{
	// NOTE(review): 16 KB is the shared-memory size of compute-capability 1.x
	// multiprocessors; newer devices have more -- confirm the intended target.
	const int shmemPerMP = 16384;

	int dyn_shared_mem_required = dynShmemPerThread*threadsPerBlock;
	int shared_mem_required = staticShmemPerBlock + dyn_shared_mem_required;
	if(shared_mem_required > shmemPerMP) { return false; }

	// Number of blocks needed, rounding up so every thread is covered.
	// (The original rounded nthreads up to a multiple of threadsPerBlock and
	// then re-tested divisibility; that second test was dead code.)
	int nblocks = (nthreads + threadsPerBlock - 1) / threadsPerBlock;

	// calculate block dimensions so that there are as close to nblocks blocks as possible
	find_best_factorization(gridDim.x, gridDim.y, nblocks);
	gridDim.z = 1;

	return true;
}
89 
90 
91 
/**
 * Factor \c nblocks into grid dimensions bx x by such that bx*by >= nblocks,
 * preferring the candidate by with the smallest remainder (exact division
 * wins and stops the search). Both dimensions are kept under the 65535
 * grid-dimension limit. Exits the process if no factorization is found.
 *
 * @param bx       (out) grid x dimension
 * @param by       (out) grid y dimension
 * @param nblocks  total number of blocks to cover
 */
void find_best_factorization(unsigned int &bx, unsigned int &by, int nblocks)
{
	const unsigned int NOT_FOUND = (unsigned int)-1; // sentinel: no candidate accepted yet
	bx = NOT_FOUND;
	int best_r = 100000;
	for(int candidate = 1; candidate < 65536; candidate++)
	{
		int r = nblocks % candidate;
		// Reject candidates that don't improve the remainder or whose
		// quotient would exceed the grid-dimension limit.
		if(r >= best_r || nblocks / candidate >= 65535)
			continue;
		by = candidate;
		bx = nblocks / candidate;
		best_r = r;
		if(r == 0)
			break;
		bx++; // inexact division: round up so bx*by still covers nblocks
	}
	if(bx == NOT_FOUND) { std::cerr << "Unfactorizable?!\n"; exit(-1); }
}
111 
112 
//! Kernel: one thread per system; flips every inactive system back to
//! active. Expects a 1D block layout (blockDim.x threads) over a possibly
//! 3D grid.
__global__
void reactivate_systems_kernel( ensemble ens ){
	// Flatten the (up to 3D) grid plus the thread index into one system id.
	const int flat_block = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
	const int sysid = flat_block * blockDim.x + threadIdx.x;
	// Guard the padded tail of the grid, then reactivate inactive systems.
	if( sysid < ens.nsys() && ens[sysid].is_inactive() )
		ens[sysid].set_active();
}
120 
121 
122 
/**
 * Mark every inactive system in \c ens as active again by launching
 * reactivate_systems_kernel over one thread per system.
 *
 * NOTE(review): the launch is asynchronous -- callers that need the
 * reactivation to be complete must synchronize (e.g. a later blocking
 * memcpy or cudaDeviceSynchronize).
 *
 * @param ens ensemble whose inactive systems are reactivated
 */
void reactivate_systems(ensemble ens) {
	const int system_per_block = 16*16; // need not be the same as used by the integrator
	const int nblocks = ( ens.nsys() + system_per_block - 1 ) / system_per_block;
	dim3 gD; gD.z = 1;
	find_best_factorization(gD.x,gD.y,nblocks);
	dim3 tD; tD.x = system_per_block; tD.y = 1;

	reactivate_systems_kernel<<< gD, tD >>>( ens );
	cudaErrCheck ( cudaGetLastError() ); // catch launch-configuration errors
}
132 
133 
134 }