Swarm-NG  1.1
gpulog_log.h
/***************************************************************************
 *   Copyright (C) 2010 by Mario Juric                                      *
 *   mjuric@cfa.harvard.EDU                                                 *
 *                                                                          *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 3 of the License, or     *
 *   (at your option) any later version.                                    *
 *                                                                          *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                          *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                        *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/

#ifndef bits_gpulog_log_h__
#define bits_gpulog_log_h__


/*
   atomicAdd wrapper: under nvcc this maps to the device atomicAdd();
   the host fallback exists only to keep the code compilable and must never be called.
*/
#ifdef __CUDACC__
__device__ static inline int global_atomicAdd(int *x, int add) {
    return atomicAdd(x, add);
}
#else
__host__ static inline int global_atomicAdd(int *x, int add) {
    assert(0); // this must not be called from host code.
    return 0;
}
#endif

namespace gpulog
{

    namespace internal
    {

        /*! Allocator/accessor policy for logs that live in GPU device memory */
        struct dev_internals
        {
            /*! Allocate 'num' elements of type T in device memory */
            template<typename T>
            __host__ static void alloc(T* &ret, int num = 1)
            {
                cudaMalloc((void **)&ret, num*sizeof(T));
            }

            /*! Fetch a value of type T from device memory */
            template<typename T>
            __host__ static const T get(T *ptr)
            {
                T ret;
                cudaMemcpy(&ret, ptr, sizeof(ret), cudaMemcpyDeviceToHost);
                return ret;
            }

            /*! Store a value of type T to device memory */
            template<typename T>
            __host__ static void set(T *ptr, const T& val)
            {
                cudaMemcpy(ptr, &val, sizeof(*ptr), cudaMemcpyHostToDevice);
            }

            /*! Free device memory allocated with alloc() */
            template<typename T>
            __host__ static void dealloc(T* p, int num = 1)
            {
                cudaFree(p);
            }

            /*! Global (grid-wide) index of the calling thread */
            __device__ static inline int threadId()
            {
            #ifdef __CUDACC__
                return ((blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
            #else
                assert(0); // this must not be called from host code.
                return 0;
            #endif
            }

            /*! Atomically add 'add' to *x and return the previous value */
#ifdef __CUDACC__
            __device__ static inline int atomicAdd(int *x, int add) {
                return global_atomicAdd(x, add);
            }
#else
            __host__ static inline int atomicAdd(int *x, int add) {
                return global_atomicAdd(x, add);
            }
#endif
        };

        /*! Allocator/accessor policy for logs that live in host memory */
        struct host_internals
        {
            /*! Allocate 'num' elements of type T on the host */
            template<typename T>
            static void alloc(T* &ret, int num = 1)
            {
                ret = num == 1 ? new T : new T[num];
            }

            /*! Fetch a value of type T */
            template<typename T>
            static const T get(T *ptr)
            {
                return *ptr;
            }

            /*! Store a value of type T */
            template<typename T>
            static void set(T *ptr, const T& val)
            {
                *ptr = val;
            }

            /*! Free memory allocated with alloc() */
            template<typename T>
            static void dealloc(T* p, int num = 1)
            {
                if(num == 1) delete p;
                else delete [] p;
            }

            /*! Atomically add 'add' to *x and return the previous value (serialized with an OpenMP critical section) */
            static inline int atomicAdd(int *x, int add) {
                int tmp;
                #pragma omp critical
                {
                    tmp = *x;
                    *x += add;
                }
                return tmp;
            }

            /*! Thread ids are meaningless on the host; always returns -1 */
            static int threadId() { return -1; }
        };

        /*
           workaround for CUDA 2.2 template parsing bug -- CUDA 2.2 tries to compile a template
           function as __host__ if it returns a derived type T* (or T&, etc.)
        */
        template<typename T> struct ptr_t
        {
            T* ptr;
            __host__ __device__ inline ptr_t(T *p) : ptr(p) {}
            __host__ __device__ operator T*() const { return ptr; }
        };

        /* CUDA 2.2 compatible version */
        #define PTR_T(T) gpulog::internal::ptr_t<T>
        /* CUDA 2.3 and beyond */
        // #define PTR_T(T) T*

        /*! Log object: holds a byte buffer, its capacity, and the current write position */
        template<typename A>
        struct log_base
        {
        protected:
            char *buffer;       // the log buffer
            int *at;            // current write position (byte offset into the buffer)
            int buf_len;        // buffer capacity, in bytes

        public: /* manipulation from host */

            /*! Allocate a buffer of 'len' bytes and reset the write position */
            __host__ void alloc(size_t len)
            {
                buf_len = len;

                A::alloc(at, 1);
                A::set(at, 0);
                A::alloc(buffer, len);

                // DHOST() is gpulog's debug-output macro, defined in another gpulog header
                DHOST( std::cerr << "Allocated " << len << " bytes.\n"; )
            }

            /*! Free the buffer */
            __host__ void free()
            {
                A::dealloc(buffer, buf_len);
                A::dealloc(at);

                buffer = NULL; buf_len = 0;
                at = NULL;
            }

            /*! Discard the log's contents by resetting the write position to zero */
            __host__ void clear()
            {
                A::set(at, 0);
            }

            /*! Number of bytes written so far (read through the allocator policy, so it works from the host for device logs) */
            __host__ int fetch_size() const
            {
                return A::get(at);
            }

            /*! Set the write position (through the allocator policy) */
            __host__ void set_size(int pos) const
            {
                A::set(at, pos);
            }

        public:
            /*! Buffer capacity, in bytes */
            __host__ __device__ int capacity() const
            {
                return buf_len;
            }

            /*! Number of bytes written so far (direct read of the write position) */
            __host__ __device__ int size() const
            {
                return *at;
            }

            /*! Set the write position (direct write) */
            __host__ __device__ void seek(int pos) const
            {
                *at = pos;
            }

            /*! Raw pointer to the underlying buffer */
            __host__ __device__ char* internal_buffer() const
            {
                return buffer;
            }

            /*! True if a record ending at byte offset 'idx' does not fit into the buffer */
            __host__ __device__ inline bool has_overflowed(int idx)
            {
                return idx > buf_len;
            }

            /*
               Reference implementation of write(), disabled below; the active write()
               overloads are brought in by the #include of "gpulog_write.h".
            */
        #if 0
            template<typename T1, typename T2, typename T3>
            __device__ inline PTR_T(SCALAR(T3)) write(const int msgid, const T1 &v1, const T2 &v2, const T3 &v3)
            {
                P::dump();

                // allocate and test for end-of-buffer
                int len = P::len_with_padding(v3);
                int at = A::atomicAdd(this->at, len);
                if(has_overflowed(at + len)) { return NULL; }
                char *ptr = buffer + at;

                // write
                header v0(msgid, len);
                P::IO0::put(ptr, v0, P::begin0, P::len0);
                P::IO1::put(ptr, v1, P::begin1, P::len1);
                P::IO2::put(ptr, v2, P::begin2, P::len2);
                P::IO3::put(ptr, v3, P::begin3, P::len3);

        #if ARGINFO
                P::store_arginfo(ptr, v3);
        #endif

                DHOST( std::cerr << "Total packet len = " << len << "\n"; )
                return (SCALAR(T3)*)(ptr + P::begin3);
            }
        #else
        #include "gpulog_write.h"
        #endif

        };

        /*! A log whose buffer and write position live in device memory */
        typedef log_base<dev_internals> device_log;

        /*! A log whose buffer and write position live in host memory */
        struct host_log : public log_base<host_internals>
        {
            host_log(size_t len = 0)
            {
                alloc(len);
            }

            ~host_log()
            {
                free();
            }
        };


        /*
           Log memory management and copying API
        */

        /*! Download the device_log object (not its buffer) from the device memory pointed to by 'dlog' */
        inline void download_device_log(device_log &log, device_log *dlog)
        {
            cudaMemcpy(&log, dlog, sizeof(log), cudaMemcpyDeviceToHost);
        }

        /*! Download the device_log object from the __device__ symbol named 'name' */
        inline void download_device_log(device_log &log, const char *name)
        {
            cudaMemcpyFromSymbol(&log, name, sizeof(log), 0, cudaMemcpyDeviceToHost);
        }

        /*! Upload the device_log object 'log' to the __device__ symbol named 'name' */
        inline void upload_device_log(const char *name, device_log &log)
        {
            cudaMemcpyToSymbol(name, &log, sizeof(log), 0, cudaMemcpyHostToDevice);
        }

        /*! Upload the device_log object 'log' to newly allocated device memory and return its device pointer */
        inline device_log* upload_device_log(device_log &log)
        {
            void* pdlog;
            cudaMalloc(&pdlog, sizeof(log));
            cudaMemcpy(pdlog, &log, sizeof(log), cudaMemcpyHostToDevice);
            return (device_log*) pdlog;
        }

        /*!
           Copy the contents of device log 'from' into host log 'to', resizing 'to'
           if its capacity differs; if LOG_DEVCLEAR is set in 'flags', the device
           log is cleared after the copy.
        */
        __host__ inline void copy(host_log &to, device_log &from, int flags = 0)
        {
            // clear host log
            to.clear();

            // fetch the number of bytes used in the device log
            int size = from.fetch_size();
            if(size == 0) { return; }

            // resize the host log if its capacity differs from the device log's
            if(to.capacity() != from.capacity())
            {
                to.free();
                to.alloc(from.capacity());
            }

            // memcpy the data
            cudaMemcpy(to.internal_buffer(), from.internal_buffer(), size, cudaMemcpyDeviceToHost);
            to.set_size(size);

            // clear device log if asked for
            if(flags & LOG_DEVCLEAR)
            {
                from.clear();
            }
        }

        /*! Copy the contents of the device log stored at __device__ symbol 'from' into host log 'to' */
        inline void copy(host_log &to, const char *from, int flags = 0)
        {
            device_log dlog;
            download_device_log(dlog, from);
            copy(to, dlog, flags);
        }

        /*! Copy the contents of the device log pointed to by 'from' into host log 'to' */
        inline void copy(host_log &to, device_log *from, int flags = 0)
        {
            device_log dlog;
            download_device_log(dlog, from);
            copy(to, dlog, flags);
        }

        /*! Allocate a device log with a buffer of 'len' bytes and store the log object in __device__ symbol 'symbol' */
        inline device_log alloc_device_log(const char *symbol, size_t len)
        {
            device_log dlog;
            dlog.alloc(len);
            upload_device_log(symbol, dlog);
            return dlog;
        }

        /*! Allocate a device log with a buffer of 'len' bytes and return a device pointer to the uploaded log object */
        inline device_log* alloc_device_log(size_t len)
        {
            device_log dlog;
            dlog.alloc(len);
            return upload_device_log(dlog);
        }

        /*! Free the buffers of the device log stored at __device__ symbol 'symbol' */
        inline void free_device_log(const char *symbol)
        {
            device_log dlog;
            download_device_log(dlog, symbol);
            dlog.free();
        }


    } // namespace internal
} // namespace gpulog

#endif // bits_gpulog_log_h__
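
For reference, a minimal host-side sketch of how the copying API above fits together. This is not part of gpulog_log.h: the include path, the __device__ symbol dlog, the kernel write_kernel, and the 16 MB buffer size are illustrative assumptions, and error checking is omitted.

// sketch only -- names `dlog`, `write_kernel` and the include path are hypothetical
#include "gpulog/gpulog.h"   // assumed master header that pulls in gpulog_log.h

// device-side log object; kernels append records through its write() member
__device__ gpulog::internal::device_log dlog;

void run_and_collect()
{
    using namespace gpulog::internal;

    // allocate a 16 MB log buffer on the device and publish the log object in the "dlog" symbol
    alloc_device_log("dlog", 16*1024*1024);

    // ... launch kernels that write records into dlog ...
    // write_kernel<<<grid, block>>>();

    // pull the written bytes back into a host-side log, clearing the device log afterwards
    host_log hlog;
    copy(hlog, "dlog", LOG_DEVCLEAR);

    // ... iterate over the records in hlog here ...

    // release the device-side buffers
    free_device_log("dlog");
}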