// RUN: %libomp-cxx-compile-and-run
// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
// XFAIL: gcc-4
#include <cstdio>
#include <cstdlib> // malloc/free
#include <cmath>
#include <cassert>
#include <omp.h>

// Total number of loop iterations; should be a multiple of T for this test
#define N 10000

// Flag to request lazy (1) or eager (0) allocation of reduction objects
#ifndef FLG
#define FLG 0
#endif

/*
  // initial user's code that corresponds to pseudo code of the test
  #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
  {
    for( int l = 0; l < N; ++l ) {
      #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
      {
        i += l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
    }

    #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
    {
      for( int l = 0; l < N; ++l ) {
        #pragma omp task firstprivate(l) in_reduction(+:j,y) \
            in_reduction(*:x) in_reduction(-:k)
        {
          j += l;
          k -= l;
          y += (double)l;
          if( l%2 )
            x *= 1.0 / (l + 1);
          else
            x *= (l + 1);
        }
        #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
        {
          i -= l;
          k -= l;
          y += (double)l;
        }
        #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
        {
          j += l;
          if( l%2 )
            x *= 1.0 / (l + 1);
          else
            x *= (l + 1);
        }
      }
    } // inner reduction

    for( int l = 0; l < N; ++l ) {
      #pragma omp task firstprivate(l) in_reduction(+:j)
        j += l;
    }
  } // outer reduction
*/

//------------------------------------------------
// OpenMP runtime library routines
#ifdef __cplusplus
extern "C" {
#endif
extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
extern int __kmpc_global_thread_num(void*);
#ifdef __cplusplus
}
#endif
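// The test calls these libomp entry points directly, emulating the code a
// compiler would generate for the task_reduction/in_reduction clauses shown
// in the pseudo code above.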

//------------------------------------------------
// Compiler-generated code

typedef struct _task_red_item {
    void       *shar; // shared reduction item
    size_t      size; // size of data item
    void       *f_init; // data initialization routine
    void       *f_fini; // data finalization routine
    void       *f_comb; // data combiner routine
    unsigned    flags;
} _task_red_item_t;
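// An array of these descriptors (one element per reduction variable) is what
// the test passes to __kmpc_task_reduction_init below.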

// int:+   no init/fini callbacks needed; combiner also valid for subtraction
void __red_int_add_comb(void *lhs, void *rhs) // combiner
{ *(int*)lhs += *(int*)rhs; }

// long long:+   no init/fini callbacks needed; combiner also valid for subtraction
void __red_llong_add_comb(void *lhs, void *rhs) // combiner
{ *(long long*)lhs += *(long long*)rhs; }

// double:*   no fini callback needed
void __red_dbl_mul_init(void *data) // initializer
{ *(double*)data = 1.0; }
void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
{ *(double*)lhs *= *(double*)rhs; }

// double:+   no init/fini callbacks needed
void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
{ *(double*)lhs += *(double*)rhs; }
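// For the + and - reductions no initializer is given: the runtime zeroes the
// thread-specific objects, which is the correct identity value. The *
// reduction needs __red_dbl_mul_init to set its identity value, 1.0.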

// ==============================

void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
{
    for( int l = 0; l < N; ++l ) {
        *pi += l;
        if( l%2 )
          *px *= 1.0 / (l + 1);
        else
          *px *= (l + 1);
    }
    for( int l = 0; l < N; ++l ) {
        *pj += l;
        *pk -= l;
        *py += (double)l;
        if( l%2 )
            *px *= 1.0 / (l + 1);
        else
            *px *= (l + 1);

        *pi -= l;
        *pk -= l;
        *py += (double)l;

        *pj += l;
        if( l%2 )
            *px *= 1.0 / (l + 1);
        else
            *px *= (l + 1);
    }
    for( int l = 0; l < N; ++l ) {
        *pj += l;
    }
}
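// calc_serial computes the reference values sequentially, applying the same
// updates as the tasks below so the parallel results can be compared to it.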

//------------------------------------------------
// Test case
int main()
{
  int nthreads = omp_get_max_threads();
  int err = 0;
  void** ptrs = (void**)malloc(nthreads*sizeof(void*));
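  // ptrs[i] records the address of thread i's private copy of xp; it is filled
  // by the first batch of tasks and later used to test lookups by another
  // thread's private address.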

  // user's code ======================================
  // variables for serial calculations:
  int is = 3;
  long long js = -9999999;
  double xs = 99999.0;
  long long ks = 99999999;
  double ys = -99999999.0;
  // variables for parallel calculations:
  int ip = 3;
  long long jp = -9999999;
  double xp = 99999.0;
  long long kp = 99999999;
  double yp = -99999999.0;

  calc_serial(&is, &js, &xs, &ks, &ys);
  // ==================================================
  for (int i = 0; i < nthreads; ++i)
    ptrs[i] = NULL;
  #pragma omp parallel
  {
    #pragma omp single nowait
    {
      // outer taskgroup reduces (i,j,x)
      #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
      {
        _task_red_item_t red_data[3];
        red_data[0].shar = &ip;
        red_data[0].size = sizeof(ip);
        red_data[0].f_init = NULL; // RTL will zero thread-specific objects
        red_data[0].f_fini = NULL; // no destructors needed
        red_data[0].f_comb = (void*)&__red_int_add_comb;
        red_data[0].flags = FLG;
        red_data[1].shar = &jp;
        red_data[1].size = sizeof(jp);
        red_data[1].f_init = NULL; // RTL will zero thread-specific objects
        red_data[1].f_fini = NULL; // no destructors needed
        red_data[1].f_comb = (void*)&__red_llong_add_comb;
        red_data[1].flags = FLG;
        red_data[2].shar = &xp;
        red_data[2].size = sizeof(xp);
        red_data[2].f_init = (void*)&__red_dbl_mul_init;
        red_data[2].f_fini = NULL; // no destructors needed
        red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
        red_data[2].flags = FLG;
        int gtid = __kmpc_global_thread_num(NULL);
        void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
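        // tg1 is the opaque taskgroup handle returned by the runtime; tasks
        // pass it back to __kmpc_task_reduction_get_th_data together with the
        // address of the shared variable to obtain their thread-specific copy.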

        for( int l = 0; l < N; l += 2 ) {
          // 2 iterations per task to get correct x value; actually any even
          // number of iters per task will work, otherwise x loses precision
          #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
            double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                        gtid, tg1, &xp);
            if (!ptrs[gtid]) ptrs[gtid] = p_xp;

            // user's pseudo-code ==============================
            *p_ip += l;
            *p_xp *= (l + 1);

            *p_ip += l + 1;
            *p_xp *= 1.0 / (l + 2);
            // ==================================================
          }
        }
        // inner taskgroup reduces (i,k,y), i is same object as in outer one
        #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
        {
          _task_red_item_t red_data[3];
          red_data[0].shar = &ip;
          red_data[0].size = sizeof(ip);
          red_data[0].f_init = NULL; // RTL will zero thread-specific objects
          red_data[0].f_fini = NULL; // no destructors needed
          red_data[0].f_comb = (void*)&__red_int_add_comb;
          red_data[0].flags = FLG;
          red_data[1].shar = &kp;
          red_data[1].size = sizeof(kp);
          red_data[1].f_init = NULL; // RTL will zero thread-specific objects
          red_data[1].f_fini = NULL; // no destructors needed
          red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
          red_data[1].flags = FLG;
          red_data[2].shar = &yp;
          red_data[2].size = sizeof(yp);
          red_data[2].f_init = NULL; // RTL will zero thread-specific objects
          red_data[2].f_fini = NULL; // no destructors needed
          red_data[2].f_comb = (void*)&__red_dbl_add_comb;
          red_data[2].flags = FLG;
          int gtid = __kmpc_global_thread_num(NULL);
          void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
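          // ip takes part in both taskgroups, so inner tasks reducing into i
          // look it up through tg2, while j and x (registered only with the
          // outer taskgroup) are still looked up through tg1.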

          for( int l = 0; l < N; l += 2 ) {
            #pragma omp task firstprivate(l)
            // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                                gtid, tg1, &jp);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                                                gtid, tg2, &kp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                          gtid, tg1, &xp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                                          gtid, tg2, &yp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_kp -= l;
              *p_yp += (double)l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              *p_xp *= 1.0 / (l + 2);
              // =================================================
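// Sanity check: looking xp up by its shared address and then by the returned
// private address must give the same thread-specific object; a lookup by
// another thread's private copy (saved in ptrs[]) must also resolve to the
// calling thread's own copy.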
{
  // the following code is here just to check __kmpc_task_reduction_get_th_data:
  int tid = omp_get_thread_num();
  void *addr1;
  void *addr2;
  addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
  addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
  if (addr1 != addr2) {
    #pragma omp atomic
      ++err;
    printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
  }
  // from neighbour w/o taskgroup (should start lookup from current tg2)
  if (tid > 0) {
    if (ptrs[tid-1]) {
      addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
      if (addr1 != addr2) {
        #pragma omp atomic
          ++err;
        printf("Wrong thread-specific addresses %d s:%p n:%p\n",
               tid, addr1, addr2);
      }
    }
  } else {
    if (ptrs[nthreads-1]) {
      addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
      if (addr1 != addr2) {
        #pragma omp atomic
          ++err;
        printf("Wrong thread-specific addresses %d s:%p n:%p\n",
               tid, addr1, addr2);
      }
    }
  }
  // ----------------------------------------------
}
            }
            #pragma omp task firstprivate(l)
            // in_reduction(+:y) in_reduction(-:i,k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
                                    gtid, tg2, &ip);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                                                gtid, tg2, &kp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                                          gtid, tg2, &yp);

              // user's pseudo-code ==============================
              *p_ip -= l;
              *p_kp -= l;
              *p_yp += (double)l;

              *p_ip -= l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              // =================================================
            }
            #pragma omp task firstprivate(l)
            // in_reduction(+:j) in_reduction(*:x)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                                gtid, tg1, &jp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                          gtid, tg1, &xp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_xp *= 1.0 / (l + 2);
              // =================================================
            }
          }
        } // inner reduction
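        // At the end of the inner taskgroup the runtime has waited for all of
        // its tasks and combined their private copies of i, k and y into the
        // shared variables.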

        for( int l = 0; l < N; l += 2 ) {
          #pragma omp task firstprivate(l) // in_reduction(+:j)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                              gtid, tg1, &jp);
            // user's pseudo-code ==============================
            *p_jp += l;
            *p_jp += l + 1;
            // =================================================
          }
        }
      } // outer reduction
    } // end single
  } // end parallel
  // check results
#if _DEBUG
  printf("reduction flags = %u\n", FLG);
#endif
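  // x and y are compared with a tolerance: the tasks combine partial results
  // in a different order than the serial loop, so rounding may differ slightly.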
  if (err == 0 && ip == is && jp == js && ks == kp &&
      fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
    printf("passed\n");
  else {
    printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
      is, js, xs, ks, ys,
      ip, jp, xp, kp, yp);
    err = 1;
  }
  free(ptrs);
  return err; // nonzero exit status reports the failure to the test harness
}