/*****************************************************************************/
/*                                                                           */
/*  Copyright (c) 2008, 2009, 2010                                           */
/*    Computer Architecture Group (CAG)                                      */
/*    University of A Coruña, Spain                                          */
/*    (http://gac.des.udc.es)                                                */
/*    Galicia Supercomputing Center (CESGA)                                  */
/*    (http://www.cesga.es)                                                  */
/*    Hewlett-Packard Spain (HP)                                             */
/*    (http://www.hp.es)                                                     */
/*                                                                           */
/*  This file is part of UPC Operations Microbenchmarking Suite (UOMS).      */
/*                                                                           */
/*  UOMS is free software: you can redistribute it and/or modify             */
/*  it under the terms of the GNU Lesser General Public License as published */
/*  by the Free Software Foundation, either version 3 of the License, or     */
/*  (at your option) any later version.                                      */
/*                                                                           */
/*  UOMS is distributed in the hope that it will be useful,                  */
/*  but WITHOUT ANY WARRANTY; without even the implied warranty of           */
/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the             */
/*  GNU Lesser General Public License for more details.                      */
/*                                                                           */
/*  You should have received a copy of the GNU Lesser General Public License */
/*  along with UOMS. If not, see <http://www.gnu.org/licenses/>.             */
/*                                                                           */
/*****************************************************************************/

/*****************************************************************************/
/*                                                                           */
/*  FUNDING: This development has been funded by Hewlett-Packard Spain       */
/*                                                                           */
/*  Project Name:                                                            */
/*    UPCHACO (2008-2011)                                                    */
/*  Subproject:                                                              */
/*    Improving UPC Usability and Performance in Constellation Systems:      */
/*    Implementation/Extensions of UPC Libraries.
*/ /* (UPCPU­Project -> UPC Performance and Usability Project) */ /* */ /*****************************************************************************/ /***************************************************************************** For further documentation, see [1] Files under doc/ ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include /* UPC */ #include #include #include /* Timers */ #include "timers.h" #include "defines.h" #include "headers.h" FILE* unit; /* Distributed array used in various benchmarks */ shared char *distArr; /* Broadcast array */ shared [] char *broadcastArr; /* Scatter array */ shared [] char *scatterArr; /* Gather array */ shared [] char *gatherArr; /* Gatherall array */ shared char *gatherallArr; /* Exchange array */ shared char *exchangeArr; /* Permute array */ shared char *permuteArr; /* Reduce array (will store only 1 element) */ shared [] char *reduceArr; /* Prefix reduce array */ shared char *prefixReduceArr; /* Pointer for memory allocation and freeing test */ shared char *mem_alloc_tests_pointer; /* Arrays for p2p benchmarking */ shared char *p2pDistArr; shared char *p2pDistArr2; char *p2pLocalArr; char *p2pLocalArr2; shared double b;// for reduction result shared int perm[THREADS]; //for permutation shared long times[THREADS]; //for timing long *sizes; int num_sizes = 0; int *bm_list; int num_bms = 0; char * valid_bms[NUM_BMS]; int cache_invalidation = 0; upc_op_t reduce_op = UPC_ADD; char * char_reduce_op = "UPC_ADD"; int warmup; upc_flag_t sync_mode = UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC; char * char_sync_mode = "UPC_IN_ALLSYNC|UPC_OUT_ALLSYNC"; int main(int argc, char **argv) /* Input variables: -argc (type int) Number of command line arguments -argv (type char **) List of command line arguments Return value (type int) 0 always */ { init(argc, argv); if (!MYTHREAD) UOMS_general_info(); for(int i = 0; i < num_bms; 
i++){ if(p2poperation(bm_list[i]) == 0){ bench(bm_list[i]); } else{ if (THREADS == 2){ affinity_bench(bm_list[i]); } else{ if(MYTHREAD == 0){ fprintf(unit,"\n#---------------------------------------------------\n"); fprintf(unit,"# Skipping benchmark %s. To run it use only 2 threads.\n",valid_bms[bm_list[i]]); fprintf(unit,"#---------------------------------------------------\n\n"); } } } } if(unit != stdout && unit != stderr){ fclose(unit); } return 0; } int p2poperation(int operation_code) { switch (operation_code) { case LMEMCPY: case MEMCPY: case LMEMGET: case MEMGET: case LMEMPUT: case MEMPUT: #ifdef ASYNC_MEM_TEST case ALMEMCPY: case AMEMCPY: case ALMEMGET: case AMEMGET: case ALMEMPUT: case AMEMPUT: #endif case SMEMCPY: case MEMMOVE: return 1; default: break; } return 0; } /* Generic bench function */ void bench(int operation_code) { long int cursize; long int niterations, iter; uint64_t start, end; uint64_t minTime, totalTime, maxTime, prevMinTime; if (!MYTHREAD) UOMS_function_info(operation_code,THREADS,operation_header(operation_code)); int global_iter; int initial_iter = (warmup)?0:1; int mem_is_ok; for(global_iter = initial_iter; global_iter <2; global_iter++) { prevMinTime = 0; /* Benchmarking coll with cursize-size and niterations-iterations */ for(int cursize_index=0;cursize_indextmax) tmax=times[i]; } } end=tmax; totalTime+=end; //in order to avoid irregular latencies for short messages if (endmaxTime) maxTime = end; } upc_barrier; free_arrays(operation_code); if(global_iter) print_performance_data(operation_code,cursize,niterations,minTime,maxTime,totalTime); prevMinTime = minTime; upc_barrier; if(operation_code == BARRIER){ break; } } }//fi global_iter return; } /* Call the corresponding function */ void function(int operation_code, long int cursize,long int offset){ switch (operation_code) { case BROADCAST: upc_all_broadcast(&(distArr[THREADS*offset]),&(broadcastArr[offset]), cursize, sync_mode); break; case SCATTER: 
upc_all_scatter(&(distArr[THREADS*offset]),&(scatterArr[THREADS*offset]), cursize, sync_mode); break; case GATHER: upc_all_gather( &(gatherArr[THREADS*offset]),&(distArr[THREADS*offset]), cursize, sync_mode); break; case GATHERALL: upc_all_gather_all( &(gatherallArr[THREADS*THREADS*offset]),&(distArr[THREADS*offset]), cursize, sync_mode); break; case EXCHANGE: upc_all_exchange(&(exchangeArr[THREADS*THREADS*offset]), &(distArr[THREADS*THREADS*offset]), cursize, sync_mode ); break; case PERMUTE: upc_all_permute(&(permuteArr[THREADS*offset]), &(distArr[THREADS*offset]), perm, cursize, sync_mode ); break; case REDUCE_C: upc_all_reduceC((shared char *)reduceArr, (shared char*)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(char))*THREADS, cursize/sizeof(char), NULL, sync_mode ); break; case PREFIX_REDUCE_C: upc_all_prefix_reduceC((shared char *)&(distArr[THREADS*offset]), (shared char *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(char))*THREADS, cursize/sizeof(char), NULL, sync_mode ); break; case REDUCE_UC: upc_all_reduceUC((shared unsigned char *)reduceArr, (shared unsigned char *)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(unsigned char))*THREADS, cursize/sizeof(unsigned char), NULL, sync_mode ); break; case PREFIX_REDUCE_UC: upc_all_prefix_reduceUC((shared unsigned char *)&(distArr[THREADS*offset]), (shared unsigned char *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(unsigned char))*THREADS, cursize/sizeof(unsigned char), NULL, sync_mode ); break; case REDUCE_S: upc_all_reduceS((shared short *)reduceArr, (shared short *)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(short))*THREADS, cursize/sizeof(short), NULL, sync_mode ); break; case PREFIX_REDUCE_S: upc_all_prefix_reduceS((shared short *)&(distArr[THREADS*offset]), (shared short *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(short))*THREADS, cursize/sizeof(short), NULL, sync_mode ); break; case REDUCE_US: upc_all_reduceUS((shared 
unsigned short *)reduceArr, (shared unsigned short *)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(unsigned short))*THREADS, cursize/sizeof(unsigned short), NULL, sync_mode ); break; case PREFIX_REDUCE_US: upc_all_prefix_reduceUS((shared unsigned short *)&(distArr[THREADS*offset]), (shared unsigned short *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(unsigned short))*THREADS, cursize/sizeof(unsigned short), NULL, sync_mode ); break; case REDUCE_I: upc_all_reduceI((shared int *)reduceArr, (shared int *)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(int))*THREADS, cursize/sizeof(int), NULL, sync_mode ); break; case PREFIX_REDUCE_I: upc_all_prefix_reduceI((shared int *)&(distArr[THREADS*offset]), (shared int *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(int))*THREADS, cursize/sizeof(int), NULL, sync_mode ); break; case REDUCE_UI: upc_all_reduceUI((shared unsigned int *)reduceArr, (shared unsigned int *)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(unsigned int))*THREADS, cursize/sizeof(unsigned int), NULL, sync_mode ); break; case PREFIX_REDUCE_UI: upc_all_prefix_reduceUI((shared unsigned int *)&(distArr[THREADS*offset]), (shared unsigned int *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(unsigned int))*THREADS, cursize/sizeof(unsigned int), NULL, sync_mode ); break; case REDUCE_L: upc_all_reduceL((shared long *)reduceArr, (shared long *)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(long))*THREADS, cursize/sizeof(long), NULL, sync_mode ); break; case PREFIX_REDUCE_L: upc_all_prefix_reduceL((shared long *)&(distArr[THREADS*offset]), (shared long *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(long))*THREADS, cursize/sizeof(long), NULL, sync_mode ); break; case REDUCE_UL: upc_all_reduceUL((shared unsigned long *)reduceArr, (shared unsigned long *)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(unsigned long))*THREADS, cursize/sizeof(unsigned long), NULL, 
sync_mode ); break; case PREFIX_REDUCE_UL: upc_all_prefix_reduceUL((shared unsigned long *)&(distArr[THREADS*offset]), (shared unsigned long *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(unsigned long))*THREADS, cursize/sizeof(unsigned long), NULL, sync_mode ); break; case REDUCE_F: upc_all_reduceF((shared float *)reduceArr, (shared float *)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(float))*THREADS, cursize/sizeof(float), NULL, sync_mode ); break; case PREFIX_REDUCE_F: upc_all_prefix_reduceF((shared float *)&(distArr[THREADS*offset]), (shared float *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(float))*THREADS, cursize/sizeof(float), NULL, sync_mode ); break; case REDUCE_D: upc_all_reduceD((shared double *)reduceArr, (shared double *)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(double))*THREADS, cursize/sizeof(double), NULL, sync_mode ); break; case PREFIX_REDUCE_D: upc_all_prefix_reduceD((shared double *)&(distArr[THREADS*offset]), (shared double *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(double))*THREADS, cursize/sizeof(double), NULL, sync_mode ); break; case REDUCE_LD: upc_all_reduceLD((shared long double *)reduceArr, (shared long double *)&(distArr[THREADS*offset]), reduce_op, (cursize/sizeof(long double))*THREADS, cursize/sizeof(long double), NULL, sync_mode ); break; case PREFIX_REDUCE_LD: upc_all_prefix_reduceLD((shared long double *)&(distArr[THREADS*offset]),(shared long double *)&(prefixReduceArr[THREADS*offset]), reduce_op, (cursize/sizeof(long double))*THREADS, cursize/sizeof(long double), NULL, sync_mode ); break; case BARRIER: upc_barrier; break; case MEMGET: if (!MYTHREAD) upc_memget((p2pLocalArr+offset),&(p2pDistArr[1+THREADS*offset]),cursize); break; case LMEMGET: if (!MYTHREAD) upc_memget((p2pLocalArr+offset),&(p2pDistArr[THREADS*offset]),cursize); break; #ifdef ASYNC_MEM_TEST case AMEMGET: if (!MYTHREAD){ 
upc_memget_asynci((p2pLocalArr+offset),&(p2pDistArr[1+THREADS*offset]),cursize); upc_waitsynci(); } break; case ALMEMGET: if (!MYTHREAD){ upc_memget_asynci((p2pLocalArr+offset),&(p2pDistArr[THREADS*offset]),cursize); upc_waitsynci(); } break; #endif case MEMPUT: if (!MYTHREAD) upc_memput(&(p2pDistArr[1+THREADS*offset]),p2pLocalArr+offset,cursize); break; case LMEMPUT: if (!MYTHREAD) upc_memput(&(p2pDistArr[THREADS*offset]),p2pLocalArr+offset,cursize); break; #ifdef ASYNC_MEM_TEST case AMEMPUT: if (!MYTHREAD){ upc_memput_asynci(&(p2pDistArr[1+THREADS*offset]),p2pLocalArr+offset,cursize); upc_waitsynci(); } break; case ALMEMPUT: if (!MYTHREAD){ upc_memput_asynci(&(p2pDistArr[THREADS*offset]),p2pLocalArr+offset,cursize); upc_waitsynci(); } break; #endif case MEMCPY: if (!MYTHREAD) upc_memcpy(&(p2pDistArr[1+THREADS*offset]),&(p2pDistArr2[THREADS*offset]),cursize); break; case LMEMCPY: if (!MYTHREAD) upc_memcpy(&(p2pDistArr[THREADS*offset]),&(p2pDistArr2[THREADS*offset]),cursize); break; #ifdef ASYNC_MEM_TEST case AMEMCPY: if (!MYTHREAD){ upc_memcpy_asynci(&(p2pDistArr[1+THREADS*offset]),&(p2pDistArr2[THREADS*offset]),cursize); upc_waitsynci(); } break; case ALMEMCPY: if (!MYTHREAD){ upc_memcpy_asynci(&(p2pDistArr[THREADS*offset]),&(p2pDistArr2[THREADS*offset]),cursize); upc_waitsynci(); } break; #endif case SMEMCPY: if (!MYTHREAD) memcpy(p2pLocalArr2+offset,p2pLocalArr+offset,cursize); break; case MEMMOVE: if (!MYTHREAD) memmove(p2pLocalArr2+offset,p2pLocalArr+offset,cursize); break; case ALLALLOC: mem_alloc_tests_pointer = upc_all_alloc(THREADS,cursize); break; case FREE: if(!MYTHREAD) upc_free(mem_alloc_tests_pointer); break; default: break; } return; }