/************************************************************************************ * HWDD Bandwidth tool * * bandwidth.c: Interacts with memory module and gets the memory bandwidth details ************************************************************************************/ /*-----------------------------------------------------------------------*/ /* Program: Stream */ /* Revision: $Id: stream.c,v 5.9 2009/04/11 16:35:00 mccalpin Exp $ */ /* Original code developed by John D. McCalpin */ /* Programmers: John D. McCalpin */ /* Joe R. Zagar */ /* */ /* This program measures memory transfer rates in MB/s for simple */ /* computational kernels coded in C. */ /*-----------------------------------------------------------------------*/ /* Copyright 1991-2005: John D. McCalpin */ /*-----------------------------------------------------------------------*/ /* License: */ /* 1. You are free to use this program and/or to redistribute */ /* this program. */ /* 2. You are free to modify this program for your own use, */ /* including commercial use, subject to the publication */ /* restrictions in item 3. */ /* 3. You are free to publish results obtained from running this */ /* program, or from works that you derive from this program, */ /* with the following limitations: */ /* 3a. In order to be referred to as "STREAM benchmark results", */ /* published results must be in conformance to the STREAM */ /* Run Rules, (briefly reviewed below) published at */ /* http://www.cs.virginia.edu/stream/ref.html */ /* and incorporated herein by reference. */ /* As the copyright holder, John McCalpin retains the */ /* right to determine conformity with the Run Rules. */ /* 3b. Results based on modified source code or on runs not in */ /* accordance with the STREAM Run Rules must be clearly */ /* labelled whenever they are published. 
Examples of */
/* proper labelling include: */
/* "tuned STREAM benchmark results" */
/* "based on a variant of the STREAM benchmark code" */
/* Other comparable, clear and reasonable labelling is */
/* acceptable. */
/* 3c. Submission of results to the STREAM benchmark web site */
/* is encouraged, but not required. */
/* 4. Use of this program or creation of derived works based on this */
/* program constitutes acceptance of these licensing restrictions. */
/* 5. Absolutely no warranty is expressed or implied. */
/*-----------------------------------------------------------------------*/
/* NOTE(review): the header names that should follow each "# include" below
 * are absent in this copy of the file; restore them from the project's
 * original before building. */
# include # include # include
/* INSTRUCTIONS: * * * * 1) Stream requires a good bit of memory to run. Adjust the * * value of 'N' (below) to give a 'timing calibration' of * * at least 20 clock-ticks. This will provide rate estimates * * that should be good to about 5% precision. * */
/* N: element count used to size the static arrays a[], b[] and c[]
 * (each is declared with N+OFFSET elements) and to compute bytes[].
 * May be overridden on the compiler command line. */
#ifndef N
# define N 2000000
#endif
/* NTIMES: second dimension of the times[4][NTIMES] table in
 * get_memory_bandwidth(); the main-loop comment there says the test
 * cases are repeated NTIMES times. */
#ifndef NTIMES
# define NTIMES 10
#endif
/* OFFSET: extra elements added to the declared length of a[], b[], c[]. */
#ifndef OFFSET
# define OFFSET 0
#endif
/* U64_MAX: initial value stored in every mintime[] slot. */
#define U64_MAX 0xffffffffffffffff
/* PASS / FAIL: status codes returned by checkSTREAMresults() and used as
 * the initial value of 'ret' in get_memory_bandwidth().
 * NOTE(review): FAIL expands to an unparenthesized -1; wrapping it as (-1)
 * would make it safe inside larger expressions. */
#define PASS 0
#define FAIL -1
/* * * 3) Compile the code with full optimization. Many compilers * * generate unreasonably bad code before the optimizer tightens * * things up. If the results are unreasonably good, on the * * other hand, the optimizer might be too smart for me! * * * * Try compiling with: * * cc -O stream_omp.c -o stream_omp * * * * This is known to work on Cray, SGI, IBM, and Sun machines. * * * * * * 4) Mail the results to mccalpin@cs.virginia.edu * * Be sure to include: * * a) computer hardware model number and software revision * * b) the compiler flags * * c) all of the output from the test case. * * Thanks! 
* * * */
/* MIN / MAX: smaller / larger of two values. Defined only if the build
 * environment has not already provided them. Each argument can be
 * evaluated twice, so do not pass expressions with side effects. */
# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif

/* Label strings for the four kernels, in the order Copy, Scale, Add, Triad.
 * NOTE(review): the code that prints these is not visible in this copy;
 * presumably the index matches avgtime[]/mintime[]/maxtime[]/bytes[]. */
static char *label[4] = {"Copy: ", "Scale: ","Add: ", "Triad: "};
/* Statically allocated working arrays, N+OFFSET u64 elements each. */
static u64 a[N+OFFSET],b[N+OFFSET],c[N+OFFSET];
/* Timing statistics, one slot per kernel. avgtime[] and maxtime[] start at
 * zero and mintime[] starts at U64_MAX; bandwidth_reset_data_sets() restores
 * exactly these values. */
static u64 avgtime[4] = {0}, maxtime[4] = {0}, mintime[4] = {U64_MAX,U64_MAX,U64_MAX,U64_MAX};
/* bytes[k]: byte count attributed to kernel k, computed as two u64 words
 * per element for Copy and Scale and three for Add and Triad, times N
 * (see the inline comments in the initializer). The remainder of this
 * physical line is left exactly as found. */
static u64 bytes[4] = { 2 * sizeof(u64) * N, // copy operation 2 * sizeof(u64) * N, // Scale Operation 3 * sizeof(u64) * N, // Add Operation 3 * sizeof(u64) * N // triad operation }; static void bandwidth_reset_data_sets(void); static u64 mysecond(void); static int checkSTREAMresults(void *); static int checktick(void); #ifdef TUNED void tuned_STREAM_Copy(void); void tuned_STREAM_Scale(u64 scalar); void tuned_STREAM_Add(void); void tuned_STREAM_Triad(u64 scalar); #endif //#ifdef _OPENMP //extern int omp_get_num_threads(); //#endif int get_memory_bandwidth(void *log_ptr) { u32 quantum; u64 j, k; u64 scalar, t, times[4][NTIMES]; s16 ret = FAIL, BytesPerWord; hwdd_printk(HWDD_DEBUG, log_ptr," STREAM version $Revision: 5.9 $\n"); //resets the data sets bandwidth_reset_data_sets(); BytesPerWord = sizeof(u64); hwdd_printk(HWDD_DEBUG, log_ptr, " Total memory required = %Lx MB.\n", (u64)((3 * BytesPerWord) * ( (u64) N / 1048576))); /* printk("Each test is run %d times, but only\n", NTIMES); printk("the *best* time for each is used.\n"); #ifdef _OPENMP printk(HLINE); #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); printk ("Number of Threads requested = %i\n",k); } } #endif printk(HLINE); #pragma omp parallel { printk ("Printing one line per active thread....\n"); } */ /* Get initial value for system clock. 
*/ //#pragma omp parallel for for (j=0; j= 1) hwdd_printk(HWDD_DEBUG,log_ptr, " Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { hwdd_printk(HWDD_DEBUG,log_ptr," Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } t = mysecond(); //#pragma omp parallel for for (j = 0; j < N; j++) { a[j] = 2 * a[j]; } t = (mysecond() - t); hwdd_printk(HWDD_DEBUG,log_ptr, "\n Each test below will take on the order" "of %Ld microseconds.\n", t); hwdd_printk(HWDD_DEBUG,log_ptr," (= %d clock ticks)\n", (int) (t/quantum) ); //hwdd_printk("Increase the size of the arrays if this shows that\n"); // printk("you are not getting at least 20 clock ticks per test.\n"); // printk(HLINE); // printk("WARNING -- The above is only a rough guideline.\n"); // printk("For best results, please be sure you know the\n"); // printk("precision of your system timer.\n"); // printk(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3; for (k=0; k= 0 ? 
(a) : -(a)) #endif epsilon = 1000000; if (aj != asum) { if (asum/abs(aj-asum) < epsilon) { hwdd_printk (HWDD_ERROR, log_ptr, " Failed Validation on array a[]\n"); hwdd_printk(HWDD_ERROR, log_ptr," Expected : %Lx \n",aj); hwdd_printk (HWDD_ERROR, log_ptr," Observed : %Lx \n",asum); goto final; } } if (bj != bsum) { if (bsum/abs(bj-bsum) < epsilon) { hwdd_printk(HWDD_ERROR, log_ptr," Failed Validation on array b[]\n"); hwdd_printk(HWDD_ERROR, log_ptr," Expected : %Lx \n",bj); hwdd_printk(HWDD_ERROR, log_ptr," Observed : %Lx \n",bsum); goto final; } } if (cj != csum) { if (csum/abs(cj-csum) < epsilon) { hwdd_printk(HWDD_ERROR, log_ptr," Failed Validation on array c[]\n"); hwdd_printk(HWDD_ERROR, log_ptr," Expected : %Lx \n",cj); hwdd_printk(HWDD_ERROR, log_ptr," Observed : %Lx \n",csum); goto final; } } hwdd_printk (HWDD_DEBUG, log_ptr, " Solution Validates\n"); return PASS; final: return FAIL; }

/*
 * bandwidth_reset_data_sets - restore the per-kernel timing statistics to
 * their initial values.
 *
 * Sets every avgtime[] and maxtime[] slot to 0 and every mintime[] slot to
 * U64_MAX, which are the same values the static initializers give them.
 * get_memory_bandwidth() calls this before it starts measuring, presumably
 * so that figures from an earlier invocation are not carried over.
 *
 * Takes no arguments and returns nothing; it only writes the three
 * file-scope statistics arrays.
 */
static void bandwidth_reset_data_sets(void)
{
	u8 i;
	/* 4 is the declared length of avgtime[], maxtime[] and mintime[]. */
	for(i=0;i<4;i++)
	{
		avgtime[i] = maxtime[i] = 0;
		mintime[i]= U64_MAX;
	}
}

void tuned_STREAM_Copy(void) { u64 j; //#pragma omp parallel for for (j=0; j