/*****************************************************************************/
/* */
/* Copyright (c) 2008, 2009, 2010 */
/* Computer Architecture Group (CAG) */
/* University of A Coruña, Spain */
/* (http://gac.des.udc.es) */
/* Galicia Supercomputing Center (CESGA) */
/* (http://www.cesga.es) */
/* Hewlett-Packard Spain (HP) */
/* (http://www.hp.es) */
/* */
/* This file is part of UPC Operations Microbenchmarking Suite (UOMS). */
/* */
/* UOMS is free software: you can redistribute it and/or modify */
/* it under the terms of the GNU Lesser General Public License as published */
/* by the Free Software Foundation, either version 3 of the License, or */
/* (at your option) any later version. */
/* */
/* UOMS is distributed in the hope that it will be useful, */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
/* GNU Lesser General Public License for more details. */
/* */
/* You should have received a copy of the GNU Lesser General Public License */
/* along with UOMS. If not, see <http://www.gnu.org/licenses/>. */
/* */
/*****************************************************************************/
/*****************************************************************************/
/* */
/* FUNDING: This development has been funded by Hewlett-Packard Spain */
/* */
/* Project Name: */
/* UPCHACO (2008-2011) */
/* Subproject: */
/* Improving UPC Usability and Performance in Constellation Systems: */
/* Implementation/Extensions of UPC Libraries. */
/* (UPCPUProject -> UPC Performance and Usability Project) */
/* */
/*****************************************************************************/
/*****************************************************************************
For further documentation, see
[1] Files under doc/
******************************************************************************/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* UPC */
#include
#include
#include
/* Timers */
#include "timers.h"
#include "defines.h"
#include "headers.h"
FILE* unit;
/*
Distributed array used in various benchmarks
*/
shared char *distArr;
/*
Broadcast array
*/
shared [] char *broadcastArr;
/*
Scatter array
*/
shared [] char *scatterArr;
/*
Gather array
*/
shared [] char *gatherArr;
/*
Gatherall array
*/
shared char *gatherallArr;
/*
Exchange array
*/
shared char *exchangeArr;
/*
Permute array
*/
shared char *permuteArr;
/*
Reduce array (will store only 1 element)
*/
shared [] char *reduceArr;
/*
Prefix reduce array
*/
shared char *prefixReduceArr;
/*
Pointer for memory allocation and freeing test
*/
shared char *mem_alloc_tests_pointer;
/*
Arrays for p2p benchmarking
*/
shared char *p2pDistArr;
shared char *p2pDistArr2;
char *p2pLocalArr;
char *p2pLocalArr2;
shared double b;// for reduction result
shared int perm[THREADS]; //for permutation
shared long times[THREADS]; //for timing
long *sizes;
int num_sizes = 0;
int *bm_list;
int num_bms = 0;
char * valid_bms[NUM_BMS];
int cache_invalidation = 0;
upc_op_t reduce_op = UPC_ADD;
char * char_reduce_op = "UPC_ADD";
int warmup;
upc_flag_t sync_mode = UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC;
char * char_sync_mode = "UPC_IN_ALLSYNC|UPC_OUT_ALLSYNC";
/*
Program entry point.

Input variables:
-argc (type int)
Number of command line arguments
-argv (type char **)
List of command line arguments
Return value (type int)
0 always
*/
int main(int argc, char **argv)
{
    /* Parse command line, set up buffers and the output stream. */
    init(argc, argv);

    /* Thread 0 prints the global header exactly once. */
    if (MYTHREAD == 0) {
        UOMS_general_info();
    }

    /* Run every benchmark the user selected. */
    for (int idx = 0; idx < num_bms; idx++) {
        const int code = bm_list[idx];

        if (p2poperation(code)) {
            /* Point-to-point benchmarks require exactly two threads. */
            if (THREADS == 2) {
                affinity_bench(code);
            } else if (MYTHREAD == 0) {
                fprintf(unit,"\n#---------------------------------------------------\n");
                fprintf(unit,"# Skipping benchmark %s. To run it use only 2 threads.\n",valid_bms[code]);
                fprintf(unit,"#---------------------------------------------------\n\n");
            }
        } else {
            bench(code);
        }
    }

    /* Close the report stream unless it is one of the standard streams. */
    if (unit != stdout && unit != stderr) {
        fclose(unit);
    }

    return 0;
}
/*
Tell whether a benchmark code denotes a point-to-point / memory-transfer
operation (returns 1) as opposed to a collective one (returns 0).
Used by main() to choose between bench() and affinity_bench().
*/
int p2poperation(int operation_code) {
    /* Table of every p2p operation code; the async variants exist only when
       the compiler provides asynchronous memory-transfer extensions. */
    static const int p2p_codes[] = {
        LMEMCPY, MEMCPY,
        LMEMGET, MEMGET,
        LMEMPUT, MEMPUT,
#ifdef ASYNC_MEM_TEST
        ALMEMCPY, AMEMCPY,
        ALMEMGET, AMEMGET,
        ALMEMPUT, AMEMPUT,
#endif
        SMEMCPY, MEMMOVE
    };
    const size_t count = sizeof p2p_codes / sizeof p2p_codes[0];

    for (size_t i = 0; i < count; i++) {
        if (p2p_codes[i] == operation_code) {
            return 1;
        }
    }
    return 0;
}
/*
Generic bench function
*/
void bench(int operation_code) {
long int cursize;
long int niterations, iter;
uint64_t start, end;
uint64_t minTime, totalTime, maxTime, prevMinTime;
if (!MYTHREAD)
UOMS_function_info(operation_code,THREADS,operation_header(operation_code));
int global_iter;
int initial_iter = (warmup)?0:1;
int mem_is_ok;
for(global_iter = initial_iter; global_iter <2; global_iter++) {
prevMinTime = 0;
/*
Benchmarking coll with cursize-size and niterations-iterations
*/
for(int cursize_index=0;cursize_indextmax)
tmax=times[i];
}
}
end=tmax;
totalTime+=end;
//in order to avoid irregular latencies for short messages
if (endmaxTime)
maxTime = end;
}
upc_barrier;
free_arrays(operation_code);
if(global_iter)
print_performance_data(operation_code,cursize,niterations,minTime,maxTime,totalTime);
prevMinTime = minTime;
upc_barrier;
if(operation_code == BARRIER){
break;
}
}
}//fi global_iter
return;
}
/*
Call the corresponding function
*/
void function(int operation_code, long int cursize,long int offset){
switch (operation_code) {
case BROADCAST:
upc_all_broadcast(&(distArr[THREADS*offset]),&(broadcastArr[offset]), cursize, sync_mode);
break;
case SCATTER:
upc_all_scatter(&(distArr[THREADS*offset]),&(scatterArr[THREADS*offset]), cursize, sync_mode);
break;
case GATHER:
upc_all_gather( &(gatherArr[THREADS*offset]),&(distArr[THREADS*offset]), cursize, sync_mode);
break;
case GATHERALL:
upc_all_gather_all( &(gatherallArr[THREADS*THREADS*offset]),&(distArr[THREADS*offset]), cursize, sync_mode);
break;
case EXCHANGE:
upc_all_exchange(&(exchangeArr[THREADS*THREADS*offset]), &(distArr[THREADS*THREADS*offset]), cursize, sync_mode );
break;
case PERMUTE:
upc_all_permute(&(permuteArr[THREADS*offset]), &(distArr[THREADS*offset]), perm, cursize, sync_mode );
break;
case REDUCE_C:
upc_all_reduceC((shared char *)reduceArr, (shared char*)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(char))*THREADS, cursize/sizeof(char), NULL, sync_mode );
break;
case PREFIX_REDUCE_C:
upc_all_prefix_reduceC((shared char *)&(distArr[THREADS*offset]), (shared char *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(char))*THREADS, cursize/sizeof(char), NULL, sync_mode );
break;
case REDUCE_UC:
upc_all_reduceUC((shared unsigned char *)reduceArr, (shared unsigned char *)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(unsigned char))*THREADS, cursize/sizeof(unsigned char), NULL, sync_mode );
break;
case PREFIX_REDUCE_UC:
upc_all_prefix_reduceUC((shared unsigned char *)&(distArr[THREADS*offset]), (shared unsigned char *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(unsigned char))*THREADS, cursize/sizeof(unsigned char), NULL, sync_mode );
break;
case REDUCE_S:
upc_all_reduceS((shared short *)reduceArr, (shared short *)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(short))*THREADS, cursize/sizeof(short), NULL, sync_mode );
break;
case PREFIX_REDUCE_S:
upc_all_prefix_reduceS((shared short *)&(distArr[THREADS*offset]), (shared short *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(short))*THREADS, cursize/sizeof(short), NULL, sync_mode );
break;
case REDUCE_US:
upc_all_reduceUS((shared unsigned short *)reduceArr, (shared unsigned short *)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(unsigned short))*THREADS, cursize/sizeof(unsigned short), NULL, sync_mode );
break;
case PREFIX_REDUCE_US:
upc_all_prefix_reduceUS((shared unsigned short *)&(distArr[THREADS*offset]), (shared unsigned short *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(unsigned short))*THREADS, cursize/sizeof(unsigned short), NULL, sync_mode );
break;
case REDUCE_I:
upc_all_reduceI((shared int *)reduceArr, (shared int *)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(int))*THREADS, cursize/sizeof(int), NULL, sync_mode );
break;
case PREFIX_REDUCE_I:
upc_all_prefix_reduceI((shared int *)&(distArr[THREADS*offset]), (shared int *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(int))*THREADS, cursize/sizeof(int), NULL, sync_mode );
break;
case REDUCE_UI:
upc_all_reduceUI((shared unsigned int *)reduceArr, (shared unsigned int *)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(unsigned int))*THREADS, cursize/sizeof(unsigned int), NULL, sync_mode );
break;
case PREFIX_REDUCE_UI:
upc_all_prefix_reduceUI((shared unsigned int *)&(distArr[THREADS*offset]), (shared unsigned int *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(unsigned int))*THREADS, cursize/sizeof(unsigned int), NULL, sync_mode );
break;
case REDUCE_L:
upc_all_reduceL((shared long *)reduceArr, (shared long *)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(long))*THREADS, cursize/sizeof(long), NULL, sync_mode );
break;
case PREFIX_REDUCE_L:
upc_all_prefix_reduceL((shared long *)&(distArr[THREADS*offset]), (shared long *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(long))*THREADS, cursize/sizeof(long), NULL, sync_mode );
break;
case REDUCE_UL:
upc_all_reduceUL((shared unsigned long *)reduceArr, (shared unsigned long *)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(unsigned long))*THREADS, cursize/sizeof(unsigned long), NULL, sync_mode );
break;
case PREFIX_REDUCE_UL:
upc_all_prefix_reduceUL((shared unsigned long *)&(distArr[THREADS*offset]), (shared unsigned long *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(unsigned long))*THREADS, cursize/sizeof(unsigned long), NULL, sync_mode );
break;
case REDUCE_F:
upc_all_reduceF((shared float *)reduceArr, (shared float *)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(float))*THREADS, cursize/sizeof(float), NULL, sync_mode );
break;
case PREFIX_REDUCE_F:
upc_all_prefix_reduceF((shared float *)&(distArr[THREADS*offset]), (shared float *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(float))*THREADS, cursize/sizeof(float), NULL, sync_mode );
break;
case REDUCE_D:
upc_all_reduceD((shared double *)reduceArr, (shared double *)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(double))*THREADS, cursize/sizeof(double), NULL, sync_mode );
break;
case PREFIX_REDUCE_D:
upc_all_prefix_reduceD((shared double *)&(distArr[THREADS*offset]), (shared double *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(double))*THREADS, cursize/sizeof(double), NULL, sync_mode );
break;
case REDUCE_LD:
upc_all_reduceLD((shared long double *)reduceArr, (shared long double *)&(distArr[THREADS*offset]),
reduce_op, (cursize/sizeof(long double))*THREADS, cursize/sizeof(long double), NULL, sync_mode );
break;
case PREFIX_REDUCE_LD:
upc_all_prefix_reduceLD((shared long double *)&(distArr[THREADS*offset]),(shared long double *)&(prefixReduceArr[THREADS*offset]),
reduce_op, (cursize/sizeof(long double))*THREADS, cursize/sizeof(long double), NULL, sync_mode );
break;
case BARRIER:
upc_barrier;
break;
case MEMGET:
if (!MYTHREAD)
upc_memget((p2pLocalArr+offset),&(p2pDistArr[1+THREADS*offset]),cursize);
break;
case LMEMGET:
if (!MYTHREAD)
upc_memget((p2pLocalArr+offset),&(p2pDistArr[THREADS*offset]),cursize);
break;
#ifdef ASYNC_MEM_TEST
case AMEMGET:
if (!MYTHREAD){
upc_memget_asynci((p2pLocalArr+offset),&(p2pDistArr[1+THREADS*offset]),cursize);
upc_waitsynci();
}
break;
case ALMEMGET:
if (!MYTHREAD){
upc_memget_asynci((p2pLocalArr+offset),&(p2pDistArr[THREADS*offset]),cursize);
upc_waitsynci();
}
break;
#endif
case MEMPUT:
if (!MYTHREAD)
upc_memput(&(p2pDistArr[1+THREADS*offset]),p2pLocalArr+offset,cursize);
break;
case LMEMPUT:
if (!MYTHREAD)
upc_memput(&(p2pDistArr[THREADS*offset]),p2pLocalArr+offset,cursize);
break;
#ifdef ASYNC_MEM_TEST
case AMEMPUT:
if (!MYTHREAD){
upc_memput_asynci(&(p2pDistArr[1+THREADS*offset]),p2pLocalArr+offset,cursize);
upc_waitsynci();
}
break;
case ALMEMPUT:
if (!MYTHREAD){
upc_memput_asynci(&(p2pDistArr[THREADS*offset]),p2pLocalArr+offset,cursize);
upc_waitsynci();
}
break;
#endif
case MEMCPY:
if (!MYTHREAD)
upc_memcpy(&(p2pDistArr[1+THREADS*offset]),&(p2pDistArr2[THREADS*offset]),cursize);
break;
case LMEMCPY:
if (!MYTHREAD)
upc_memcpy(&(p2pDistArr[THREADS*offset]),&(p2pDistArr2[THREADS*offset]),cursize);
break;
#ifdef ASYNC_MEM_TEST
case AMEMCPY:
if (!MYTHREAD){
upc_memcpy_asynci(&(p2pDistArr[1+THREADS*offset]),&(p2pDistArr2[THREADS*offset]),cursize);
upc_waitsynci();
}
break;
case ALMEMCPY:
if (!MYTHREAD){
upc_memcpy_asynci(&(p2pDistArr[THREADS*offset]),&(p2pDistArr2[THREADS*offset]),cursize);
upc_waitsynci();
}
break;
#endif
case SMEMCPY:
if (!MYTHREAD)
memcpy(p2pLocalArr2+offset,p2pLocalArr+offset,cursize);
break;
case MEMMOVE:
if (!MYTHREAD)
memmove(p2pLocalArr2+offset,p2pLocalArr+offset,cursize);
break;
case ALLALLOC:
mem_alloc_tests_pointer = upc_all_alloc(THREADS,cursize);
break;
case FREE:
if(!MYTHREAD)
upc_free(mem_alloc_tests_pointer);
break;
default:
break;
}
return;
}