IOR
ior.c
Go to the documentation of this file.
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  */
4 /******************************************************************************\
5 * *
6 * Copyright (c) 2003, The Regents of the University of California *
7 * See the file COPYRIGHT for a complete copyright notice and license. *
8 * *
9 \******************************************************************************/
10 
11 #ifdef HAVE_CONFIG_H
12 # include "config.h"
13 #endif
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <unistd.h>
18 #include <ctype.h> /* tolower() */
19 #include <errno.h>
20 #include <math.h>
21 #include <mpi.h>
22 #include <string.h>
23 
24 #if defined(HAVE_STRINGS_H)
25 #include <strings.h>
26 #endif
27 
28 #include <sys/stat.h> /* struct stat */
29 #include <time.h>
30 
31 #ifndef _WIN32
32 # include <sys/time.h> /* gettimeofday() */
33 # include <sys/utsname.h> /* uname() */
34 #endif
35 
36 #ifdef HAVE_CUDA
37 #include <cuda_runtime.h>
38 #endif
39 
40 #include <assert.h>
41 
42 #include "ior.h"
43 #include "ior-internal.h"
44 #include "aiori.h"
45 #include "utilities.h"
46 #include "parse_options.h"
47 
48 #define IOR_NB_TIMERS 6
49 
50 /* file scope globals */
51 extern char **environ;
52 static int totalErrorCount;
53 static const ior_aiori_t *backend;
54 
55 static void DestroyTests(IOR_test_t *tests_head);
56 static char *PrependDir(IOR_param_t *, char *);
57 static char **ParseFileName(char *, int *);
58 static void InitTests(IOR_test_t *);
59 static void TestIoSys(IOR_test_t *);
60 static void ValidateTests(IOR_param_t * params, MPI_Comm com);
62  aiori_fd_t *fd, const int access,
63  IOR_io_buffers *ioBuffers);
64 
66  aiori_xfer_hint_t * hints = & p->hints;
67  hints->dryRun = p->dryRun;
68  hints->filePerProc = p->filePerProc;
69  hints->collective = p->collective;
70  hints->numTasks = p->numTasks;
71  hints->numNodes = p->numNodes;
72  hints->randomOffset = p->randomOffset;
73  hints->fsyncPerWrite = p->fsyncPerWrite;
74  hints->segmentCount = p->segmentCount;
75  hints->blockSize = p->blockSize;
76  hints->transferSize = p->transferSize;
79 
80  if(backend->xfer_hints){
81  backend->xfer_hints(hints);
82  }
83 }
84 
86 
87 /*
88  Returns 1 if the process participates in the test
89  */
90 static int test_initialize(IOR_test_t * test){
91  int range[3];
92  IOR_param_t *params = &test->params;
93  MPI_Group orig_group, new_group;
94 
95  /* set up communicator for test */
96  MPI_CHECK(MPI_Comm_group(params->mpi_comm_world, &orig_group),
97  "MPI_Comm_group() error");
98  range[0] = 0; /* first rank */
99  range[1] = params->numTasks - 1; /* last rank */
100  range[2] = 1; /* stride */
101  MPI_CHECK(MPI_Group_range_incl(orig_group, 1, &range, &new_group),
102  "MPI_Group_range_incl() error");
103  MPI_CHECK(MPI_Comm_create(params->mpi_comm_world, new_group, & params->testComm),
104  "MPI_Comm_create() error");
105  MPI_CHECK(MPI_Group_free(&orig_group), "MPI_Group_Free() error");
106  MPI_CHECK(MPI_Group_free(&new_group), "MPI_Group_Free() error");
107 
108 
109  if (params->testComm == MPI_COMM_NULL) {
110  /* tasks not in the group do not participate in this test, this matches the proceses in test_finalize() that participate */
111  MPI_CHECK(MPI_Barrier(params->mpi_comm_world), "barrier error");
112  return 0;
113  }
114 
115  /* Setup global variables */
116  testComm = params->testComm;
117  verbose = test->params.verbose;
118  backend = test->params.backend;
119 
120 #ifdef HAVE_CUDA
121  cudaError_t cret = cudaSetDevice(test->params.gpuID);
122  if(cret != cudaSuccess){
123  EWARNF("cudaSetDevice(%d) error: %s", test->params.gpuID, cudaGetErrorString(cret));
124  }
125 #endif
126 
127  if(backend->initialize){
128  backend->initialize(test->params.backend_options);
129  }
130  ior_set_xfer_hints(& test->params);
132 
133  if (rank == 0 && verbose >= VERBOSE_0) {
134  ShowTestStart(& test->params);
135  }
136  return 1;
137 }
138 
139 static void test_finalize(IOR_test_t * test){
140  backend = test->params.backend;
141  if(backend->finalize){
142  backend->finalize(test->params.backend_options);
143  }
144  MPI_CHECK(MPI_Barrier(test->params.mpi_comm_world), "barrier error");
145  MPI_CHECK(MPI_Comm_free(& testComm), "MPI_Comm_free() error");
146 }
147 
148 
149 IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out){
150  IOR_test_t *tests_head;
151  IOR_test_t *tptr;
152  out_logfile = world_out;
153  out_resultfile = world_out;
154 
155  MPI_CHECK(MPI_Comm_rank(world_com, &rank), "cannot get rank");
156 
157  /* setup tests, and validate parameters */
158  tests_head = ParseCommandLine(argc, argv, world_com);
159  InitTests(tests_head);
160 
161  PrintHeader(argc, argv);
162 
163  /* perform each test */
164  for (tptr = tests_head; tptr != NULL; tptr = tptr->next) {
165  int participate = test_initialize(tptr);
166  if( ! participate ) continue;
167  totalErrorCount = 0;
168  TestIoSys(tptr);
169  tptr->results->errors = totalErrorCount;
170  ShowTestEnd(tptr);
171  test_finalize(tptr);
172  }
173 
174  PrintLongSummaryAllTests(tests_head);
175 
176  /* display finish time */
177  PrintTestEnds();
178  return tests_head;
179 }
180 
181 
182 
183 int ior_main(int argc, char **argv)
184 {
185  IOR_test_t *tests_head;
186  IOR_test_t *tptr;
187 
188  out_logfile = stdout;
189  out_resultfile = stdout;
190 
191  /*
192  * check -h option from commandline without starting MPI;
193  */
194  tests_head = ParseCommandLine(argc, argv, MPI_COMM_WORLD);
195 
196  /* start the MPI code */
197  MPI_CHECK(MPI_Init(&argc, &argv), "cannot initialize MPI");
198 
199  MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank), "cannot get rank");
200 
201  /* set error-handling */
202  /*MPI_CHECK(MPI_Errhandler_set(mpi_comm_world, MPI_ERRORS_RETURN),
203  "cannot set errhandler"); */
204 
205  /* setup tests, and validate parameters */
206  InitTests(tests_head);
207 
208  PrintHeader(argc, argv);
209 
210  /* perform each test */
211  for (tptr = tests_head; tptr != NULL; tptr = tptr->next) {
212  int participate = test_initialize(tptr);
213  if( ! participate ) continue;
214 
215  // This is useful for trapping a running MPI process. While
216  // this is sleeping, run the script 'testing/hdfs/gdb.attach'
217  if (verbose >= VERBOSE_4) {
218  fprintf(out_logfile, "\trank %d: sleeping\n", rank);
219  sleep(5);
220  fprintf(out_logfile, "\trank %d: awake.\n", rank);
221  }
222 
223  TestIoSys(tptr);
224  ShowTestEnd(tptr);
225  test_finalize(tptr);
226  }
227 
228  if (verbose <= VERBOSE_0)
229  /* always print final summary */
230  verbose = VERBOSE_1;
231  PrintLongSummaryAllTests(tests_head);
232 
233  /* display finish time */
234  PrintTestEnds();
235 
236  MPI_CHECK(MPI_Finalize(), "cannot finalize MPI");
237 
238  DestroyTests(tests_head);
239 
240  return totalErrorCount;
241 }
242 
243 /***************************** F U N C T I O N S ******************************/
244 
245 /*
246  * Initialize an IOR_param_t structure to the defaults
247  */
248 void init_IOR_Param_t(IOR_param_t * p, MPI_Comm com)
249 {
250  const char *default_aiori = aiori_default ();
251  assert (NULL != default_aiori);
252 
253  memset(p, 0, sizeof(IOR_param_t));
254  p->api = strdup(default_aiori);
255  p->platform = strdup("HOST(OSTYPE)");
256  p->testFileName = strdup("testFile");
257 
258  p->writeFile = p->readFile = FALSE;
259  p->checkWrite = p->checkRead = FALSE;
260 
261  /*
262  * These can be overridden from the command-line but otherwise will be
263  * set from MPI.
264  */
265  p->numTasks = -1;
266  p->numNodes = -1;
267  p->numTasksOnNode0 = -1;
268 
269  p->repetitions = 1;
270  p->repCounter = -1;
271  p->open = WRITE;
272  p->taskPerNodeOffset = 1;
273  p->segmentCount = 1;
274  p->blockSize = 1048576;
275  p->transferSize = 262144;
276  p->randomSeed = -1;
277  p->incompressibleSeed = 573;
278  p->testComm = com; // this com might change for smaller tests
279  p->mpi_comm_world = com;
280 
281  p->URI = NULL;
282 }
283 
284 static void
286  double timerVal,
287  char *timeString, int access, int outlierThreshold)
288 {
289  char accessString[MAX_STR];
290  double sum, mean, sqrDiff, var, sd;
291 
292  /* for local timerVal, don't compensate for wall clock delta */
293  //timerVal += wall_clock_delta;
294 
295  MPI_CHECK(MPI_Allreduce
296  (&timerVal, &sum, 1, MPI_DOUBLE, MPI_SUM, testComm),
297  "MPI_Allreduce()");
298  mean = sum / numTasks;
299  sqrDiff = pow((mean - timerVal), 2);
300  MPI_CHECK(MPI_Allreduce
301  (&sqrDiff, &var, 1, MPI_DOUBLE, MPI_SUM, testComm),
302  "MPI_Allreduce()");
303  var = var / numTasks;
304  sd = sqrt(var);
305 
306  if (access == WRITE) {
307  strcpy(accessString, "write");
308  } else { /* READ */
309  strcpy(accessString, "read");
310  }
311  if (fabs(timerVal - mean) > (double)outlierThreshold) {
312  char hostname[MAX_STR];
313  int ret = gethostname(hostname, MAX_STR);
314  if (ret != 0)
315  strcpy(hostname, "unknown");
316 
317  EWARNF("for %s, task %d, %s %s is %f (mean=%f, stddev=%f)\n",
318  hostname, rank, accessString, timeString, timerVal, mean, sd);
319  }
320 }
321 
322 /*
323  * Check for outliers in start/end times and elapsed create/xfer/close times.
324  */
325 static void
326 CheckForOutliers(IOR_param_t *test, const double *timer, const int access)
327 {
328  DisplayOutliers(test->numTasks, timer[0],
329  "start time", access, test->outlierThreshold);
331  timer[1] - timer[0],
332  "elapsed create time", access, test->outlierThreshold);
334  timer[3] - timer[2],
335  "elapsed transfer time", access,
336  test->outlierThreshold);
338  timer[5] - timer[4],
339  "elapsed close time", access, test->outlierThreshold);
340  DisplayOutliers(test->numTasks, timer[5], "end time",
341  access, test->outlierThreshold);
342 }
343 
344 /*
345  * Check if actual file size equals expected size; if not use actual for
346  * calculating performance rate.
347  */
348 static void CheckFileSize(IOR_test_t *test, char * testFilename, IOR_offset_t dataMoved, int rep, const int access)
349 {
350  IOR_param_t *params = &test->params;
351  IOR_results_t *results = test->results;
352  IOR_point_t *point = (access == WRITE) ? &results[rep].write :
353  &results[rep].read;
354 
355  /* get the size of the file */
356  IOR_offset_t aggFileSizeFromStat, tmpMin, tmpMax, tmpSum;
357  aggFileSizeFromStat = backend->get_file_size(params->backend_options, testFilename);
358 
359  if (params->hints.filePerProc == TRUE) {
360  MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpSum, 1,
361  MPI_LONG_LONG_INT, MPI_SUM, testComm),
362  "cannot reduce total data moved");
363  aggFileSizeFromStat = tmpSum;
364  } else {
365  MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMin, 1,
366  MPI_LONG_LONG_INT, MPI_MIN, testComm),
367  "cannot reduce total data moved");
368  MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMax, 1,
369  MPI_LONG_LONG_INT, MPI_MAX, testComm),
370  "cannot reduce total data moved");
371  if (tmpMin != tmpMax) {
372  if (rank == 0) {
373  WARN("inconsistent file size by different tasks");
374  }
375  /* incorrect, but now consistent across tasks */
376  aggFileSizeFromStat = tmpMin;
377  }
378  }
379  point->aggFileSizeFromStat = aggFileSizeFromStat;
380 
381  MPI_CHECK(MPI_Allreduce(&dataMoved, &point->aggFileSizeFromXfer,
382  1, MPI_LONG_LONG_INT, MPI_SUM, testComm),
383  "cannot total data moved");
384 
385  if (strcasecmp(params->api, "HDF5") != 0 && strcasecmp(params->api, "NCMPI") != 0) {
386  if (verbose >= VERBOSE_0 && rank == 0) {
387  if ((params->expectedAggFileSize
388  != point->aggFileSizeFromXfer)
389  || (point->aggFileSizeFromStat
390  != point->aggFileSizeFromXfer)) {
391  EWARNF("Expected aggregate file size = %lld", (long long) params->expectedAggFileSize);
392  EWARNF("Stat() of aggregate file size = %lld", (long long) point->aggFileSizeFromStat);
393  EWARNF("Using actual aggregate bytes moved = %lld", (long long) point->aggFileSizeFromXfer);
394  if(params->deadlineForStonewalling){
395  EWARN("Maybe caused by deadlineForStonewalling");
396  }
397  }
398  }
399  }
400 
401  point->aggFileSizeForBW = point->aggFileSizeFromXfer;
402 }
403 
404 /*
405  * Compare buffers after reading/writing each transfer. Displays only first
406  * difference in buffers and returns total errors counted.
407  */
408 static size_t
409 CompareData(void *expectedBuffer, size_t size, IOR_offset_t transferCount, IOR_param_t *test, IOR_offset_t offset, int fillrank, int access)
410 {
411  assert(access == WRITECHECK || access == READCHECK);
412 
413  char testFileName[MAX_PATHLEN];
414  char * bufferLabel1 = "Expected: ";
415  char * bufferLabel2 = "Actual: ";
416  size_t i, j, length;
417  size_t errorCount = 0;
418 
419  IOR_offset_t offsetSignature = 0;
420  unsigned long long hi, lo, val; // for data verification
421  hi = ((unsigned long long)fillrank) << 32;
422  lo = (unsigned long long)test->timeStampSignatureValue;
423  if (test->storeFileOffset){
424  offsetSignature = offset;
425  }
426 
427  unsigned long long *testbuf = (unsigned long long *)expectedBuffer;
428 
429  length = size / sizeof(IOR_size_t);
430  if (verbose >= VERBOSE_3) {
431  fprintf(out_logfile,
432  "[%d] At file byte offset %lld, comparing %llu-byte transfer\n",
433  rank, (long long) offset, (long long)size);
434  }
435 
436  int incompressibleSeed = test->setTimeStampSignature + fillrank;
437  for (i = 0; i < length; i++) {
438  if(test->dataPacketType == incompressible ) {
439  /* same logic as in FillIncompressibleBuffer() */
440  /* WARNING: make sure that both functions are changed at the same time */
441  hi = ((unsigned long long) rand_r(& incompressibleSeed) << 32);
442  lo = (unsigned long long) rand_r(& incompressibleSeed);
443  val = hi | lo;
444  }else{
445  if ((i % 2) == 0) {
446  /* evens contain MPI rank and time in seconds */
447  val = hi | lo;
448  } else {
449  /* odds contain offset */
450  val = offsetSignature + (i * sizeof(unsigned long long));
451  }
452  }
453  if (testbuf[i] != val) {
454  errorCount++;
455  if (verbose >= VERBOSE_2) {
456  fprintf(out_logfile,
457  "[%d] At transfer buffer #%lld, index #%lld (file byte offset %lld):\n",
458  rank, transferCount - 1, (long long)i,
459  (long long) offset +
460  (IOR_size_t) (i * sizeof(IOR_size_t)));
461  fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel1);
462  fprintf(out_logfile, "%016llx\n", val);
463  fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel2);
464  fprintf(out_logfile, "%016llx\n", testbuf[i]);
465  }
466 
467  } else if (verbose >= VERBOSE_5) {
468  fprintf(out_logfile,
469  "[%d] PASSED offset = %llu bytes, transfer %lld\n",
470  rank, ((i * sizeof(unsigned long long)) + offset), transferCount);
471  fprintf(out_logfile, "[%d] GOOD %s0x", rank, bufferLabel1);
472  fprintf(out_logfile, "%016llx ", val);
473  fprintf(out_logfile, "\n[%d] GOOD %s0x", rank, bufferLabel2);
474  fprintf(out_logfile, "%016llx ", testbuf[i]);
475  fprintf(out_logfile, "\n");
476  }
477  }
478  if (errorCount > 0 && verbose >= VERBOSE_1) {
479  GetTestFileName(testFileName, test);
480  EWARNF("[%d] FAILED comparison of buffer in file %s during transfer %lld offset %lld containing %d-byte ints (%zd errors)",
481  rank, testFileName, transferCount, offset, (int)sizeof(unsigned long long int),errorCount);
482  }else if(verbose >= VERBOSE_2){
483  fprintf(out_logfile, "[%d] comparison successful during transfer %lld offset %lld\n", rank, transferCount, offset);
484  }
485 
486  return (errorCount);
487 }
488 
489 /*
490  * Count all errors across all tasks; report errors found.
491  */
492 static int CountErrors(IOR_param_t * test, int access, int errors)
493 {
494  int allErrors = 0;
495 
496  if (test->checkWrite || test->checkRead) {
497  MPI_CHECK(MPI_Reduce(&errors, &allErrors, 1, MPI_INT, MPI_SUM,
498  0, testComm), "cannot reduce errors");
499  MPI_CHECK(MPI_Bcast(&allErrors, 1, MPI_INT, 0, testComm),
500  "cannot broadcast allErrors value");
501  if (allErrors != 0) {
502  totalErrorCount += allErrors;
503  test->errorFound = TRUE;
504  }
505  if (rank == 0 && allErrors != 0) {
506  if (allErrors < 0) {
507  WARN("overflow in errors counted");
508  allErrors = -1;
509  }
510  EWARNF("Incorrect data on %s (%d errors found).\n",
511  access == WRITECHECK ? "write" : "read", allErrors);
512  fprintf(out_logfile,
513  "Used Time Stamp %u (0x%x) for Data Signature\n",
516  }
517  }
518  return (allErrors);
519 }
520 
522 {
523  int reps;
524  if (test->results != NULL)
525  return;
526 
527  reps = test->params.repetitions;
528  test->results = (IOR_results_t *) safeMalloc(sizeof(IOR_results_t) * reps);
529 }
530 
532 {
533  if (test->results != NULL) {
534  free(test->results);
535  }
536 }
537 
538 
542 IOR_test_t *CreateTest(IOR_param_t *init_params, int test_num)
543 {
544  IOR_test_t *newTest = NULL;
545 
546  newTest = (IOR_test_t *) malloc(sizeof(IOR_test_t));
547  if (newTest == NULL)
548  ERR("malloc() of IOR_test_t failed");
549  newTest->params = *init_params;
550  newTest->params.platform = GetPlatformName();
551  newTest->params.id = test_num;
552  newTest->next = NULL;
553  newTest->results = NULL;
554 
555  return newTest;
556 }
557 
558 static void DestroyTest(IOR_test_t *test)
559 {
560  FreeResults(test);
561  free(test);
562 }
563 
564 static void DestroyTests(IOR_test_t *tests_head)
565 {
566  IOR_test_t *tptr, *next;
567 
568  for (tptr = tests_head; tptr != NULL; tptr = next) {
569  next = tptr->next;
570  DestroyTest(tptr);
571  }
572 }
573 
574 /*
575  * Distribute IOR_HINTs to all tasks' environments.
576  */
577 static void DistributeHints(MPI_Comm com)
578 {
579  char hint[MAX_HINTS][MAX_STR], fullHint[MAX_STR], hintVariable[MAX_STR];
580  int hintCount = 0, i;
581 
582  if (rank == 0) {
583  for (i = 0; environ[i] != NULL; i++) {
584  if (strncmp(environ[i], "IOR_HINT", strlen("IOR_HINT"))
585  == 0) {
586  hintCount++;
587  if (hintCount == MAX_HINTS) {
588  WARN("exceeded max hints; reset MAX_HINTS and recompile");
589  hintCount = MAX_HINTS;
590  break;
591  }
592  /* assume no IOR_HINT is greater than MAX_STR in length */
593  strncpy(hint[hintCount - 1], environ[i],
594  MAX_STR - 1);
595  }
596  }
597  }
598 
599  MPI_CHECK(MPI_Bcast(&hintCount, sizeof(hintCount), MPI_BYTE, 0, com), "cannot broadcast hints");
600  for (i = 0; i < hintCount; i++) {
601  MPI_CHECK(MPI_Bcast(&hint[i], MAX_STR, MPI_BYTE, 0, com),
602  "cannot broadcast hints");
603  strcpy(fullHint, hint[i]);
604  strcpy(hintVariable, strtok(fullHint, "="));
605  if (getenv(hintVariable) == NULL) {
606  /* doesn't exist in this task's environment; better set it */
607  if (putenv(hint[i]) != 0)
608  WARN("cannot set environment variable");
609  }
610  }
611 }
612 
613 /*
614  * Fill buffer, which is transfer size bytes long, with known 8-byte long long
615  * int values. In even-numbered 8-byte long long ints, store MPI task in high
616  * bits and timestamp signature in low bits. In odd-numbered 8-byte long long
617  * ints, store transfer offset. If storeFileOffset option is used, the file
618  * (not transfer) offset is stored instead.
619  */
620 static unsigned int reseed_incompressible_prng = TRUE;
621 
622 static void
624 {
625  size_t i;
626  unsigned long long hi, lo;
627  unsigned long long *buf = (unsigned long long *)buffer;
628 
629  /* In order for write checks to work, we have to restart the pseudo random sequence */
630  /* This function has the same logic as CompareData() */
631  /* WARNING: make sure that both functions are changed at the same time */
633  test->incompressibleSeed = test->setTimeStampSignature + rank; /* We copied seed into timestampSignature at initialization, also add the rank to add randomness between processes */
635  }
636  for (i = 0; i < test->transferSize / sizeof(unsigned long long); i++) {
637  hi = ((unsigned long long) rand_r(&test->incompressibleSeed) << 32);
638  lo = (unsigned long long) rand_r(&test->incompressibleSeed);
639  buf[i] = hi | lo;
640  }
641 }
642 
643 static void
644 FillBuffer(void *buffer,
645  IOR_param_t * test, unsigned long long offset, int fillrank)
646 {
647  size_t i;
648  unsigned long long hi, lo;
649  unsigned long long *buf = (unsigned long long *)buffer;
650 
651  if(test->dataPacketType == incompressible ) { /* Make for some non compressible buffers with randomish data */
652  FillIncompressibleBuffer(buffer, test);
653  } else {
654  hi = ((unsigned long long)fillrank) << 32;
655  lo = (unsigned long long)test->timeStampSignatureValue;
656  for (i = 0; i < test->transferSize / sizeof(unsigned long long); i++) {
657  if ((i % 2) == 0) {
658  /* evens contain MPI rank and time in seconds */
659  buf[i] = hi | lo;
660  } else {
661  /* odds contain offset */
662  buf[i] = offset + (i * sizeof(unsigned long long));
663  }
664  }
665  }
666 }
667 
668 /*
669  * Return string describing machine name and type.
670  */
672 {
673  char nodeName[MAX_STR], *p, *start, sysName[MAX_STR];
674  char platformName[MAX_STR];
675  struct utsname name;
676 
677  if (uname(&name) != 0) {
678  EWARN("cannot get platform name");
679  sprintf(sysName, "%s", "Unknown");
680  sprintf(nodeName, "%s", "Unknown");
681  } else {
682  sprintf(sysName, "%s", name.sysname);
683  sprintf(nodeName, "%s", name.nodename);
684  }
685 
686  start = nodeName;
687  if (strlen(nodeName) == 0) {
688  p = start;
689  } else {
690  /* point to one character back from '\0' */
691  p = start + strlen(nodeName) - 1;
692  }
693  /*
694  * to cut off trailing node number, search backwards
695  * for the first non-numeric character
696  */
697  while (p != start) {
698  if (*p < '0' || *p > '9') {
699  *(p + 1) = '\0';
700  break;
701  } else {
702  p--;
703  }
704  }
705 
706  sprintf(platformName, "%s(%s)", nodeName, sysName);
707  return strdup(platformName);
708 }
709 
710 
711 
712 /*
713  * Parse file name.
714  */
715 static char **ParseFileName(char *name, int *count)
716 {
717  char **fileNames, *tmp, *token;
718  char delimiterString[3] = { FILENAME_DELIMITER, '\n', '\0' };
719  int i = 0;
720 
721  *count = 0;
722  tmp = name;
723 
724  /* pass one */
725  /* if something there, count the first item */
726  if (*tmp != '\0') {
727  (*count)++;
728  }
729  /* count the rest of the filenames */
730  while (*tmp != '\0') {
731  if (*tmp == FILENAME_DELIMITER) {
732  (*count)++;
733  }
734  tmp++;
735  }
736 
737  fileNames = (char **)malloc((*count) * sizeof(char **));
738  if (fileNames == NULL)
739  ERR("out of memory");
740 
741  /* pass two */
742  token = strtok(name, delimiterString);
743  while (token != NULL) {
744  fileNames[i] = token;
745  token = strtok(NULL, delimiterString);
746  i++;
747  }
748  return (fileNames);
749 }
750 
751 
752 /*
753  * Return test file name to access.
754  * for single shared file, fileNames[0] is returned in testFileName
755  */
756 void GetTestFileName(char *testFileName, IOR_param_t * test)
757 {
758  char **fileNames;
759  char initialTestFileName[MAX_PATHLEN];
760  char testFileNameRoot[MAX_STR];
761  char tmpString[MAX_STR];
762  int count;
763  int socket, core;
764 
765  /* parse filename for multiple file systems */
766  strcpy(initialTestFileName, test->testFileName);
767  if(test->dualMount){
768  GetProcessorAndCore(&socket, &core);
769  sprintf(tmpString, "%s%d/%s",initialTestFileName, socket, "data");
770  strcpy(initialTestFileName, tmpString);
771  }
772  fileNames = ParseFileName(initialTestFileName, &count);
773  if (count > 1 && test->uniqueDir == TRUE)
774  ERR("cannot use multiple file names with unique directories");
775  if (test->filePerProc) {
776  strcpy(testFileNameRoot,
777  fileNames[((rank +
778  rankOffset) % test->numTasks) % count]);
779  } else {
780  strcpy(testFileNameRoot, fileNames[0]);
781  }
782 
783  /* give unique name if using multiple files */
784  if (test->filePerProc) {
785  /*
786  * prepend rank subdirectory before filename
787  * e.g., /dir/file => /dir/<rank>/file
788  */
789  if (test->uniqueDir == TRUE) {
790  strcpy(testFileNameRoot,
791  PrependDir(test, testFileNameRoot));
792  }
793  sprintf(testFileName, "%s.%08d", testFileNameRoot,
794  (rank + rankOffset) % test->numTasks);
795  } else {
796  strcpy(testFileName, testFileNameRoot);
797  }
798 
799  /* add suffix for multiple files */
800  if (test->repCounter > -1) {
801  sprintf(tmpString, ".%d", test->repCounter);
802  strcat(testFileName, tmpString);
803  }
804  free (fileNames);
805 }
806 
807 /*
808  * From absolute directory, insert rank as subdirectory. Allows each task
809  * to write to its own directory. E.g., /dir/file => /dir/<rank>/file.
810  */
811 static char *PrependDir(IOR_param_t * test, char *rootDir)
812 {
813  char *dir;
814  char *fname;
815  int i;
816 
817  dir = (char *)malloc(MAX_STR + 1);
818  if (dir == NULL)
819  ERR("out of memory");
820 
821  /* get dir name */
822  strcpy(dir, rootDir);
823  i = strlen(dir) - 1;
824  while (i > 0) {
825  if (dir[i] == '\0' || dir[i] == '/') {
826  dir[i] = '/';
827  dir[i + 1] = '\0';
828  break;
829  }
830  i--;
831  }
832 
833  /* get file name */
834  fname = rootDir + i + 1;
835 
836  /* create directory with rank as subdirectory */
837  sprintf(dir + i + 1, "%d", (rank + rankOffset) % test->numTasks);
838 
839  /* dir doesn't exist, so create */
840  if (backend->access(dir, F_OK, test->backend_options) != 0) {
841  if (backend->mkdir(dir, S_IRWXU, test->backend_options) < 0) {
842  ERRF("cannot create directory: %s", dir);
843  }
844 
845  /* check if correct permissions */
846  } else if (backend->access(dir, R_OK, test->backend_options) != 0 ||
847  backend->access(dir, W_OK, test->backend_options) != 0 ||
848  backend->access(dir, X_OK, test->backend_options) != 0) {
849  ERRF("invalid directory permissions: %s", dir);
850  }
851 
852  /* concatenate dir and file names */
853  strcat(dir, "/");
854  strcat(dir, fname);
855 
856  return dir;
857 }
858 
859 /******************************************************************************/
860 /*
861  * Reduce test results, and show if verbose set.
862  */
863 static void
864 ReduceIterResults(IOR_test_t *test, double *timer, const int rep, const int access)
865 {
866  double reduced[IOR_NB_TIMERS] = { 0 };
867  double diff[IOR_NB_TIMERS / 2 + 1];
868  double totalTime, accessTime;
869  IOR_param_t *params = &test->params;
870  double bw, iops, latency, minlatency;
871  int i;
872  MPI_Op op;
873 
874  assert(access == WRITE || access == READ);
875 
876  /* Find the minimum start time of the even numbered timers, and the
877  maximum finish time for the odd numbered timers */
878  for (i = 0; i < IOR_NB_TIMERS; i++) {
879  op = i % 2 ? MPI_MAX : MPI_MIN;
880  MPI_CHECK(MPI_Reduce(&timer[i], &reduced[i], 1, MPI_DOUBLE,
881  op, 0, testComm), "MPI_Reduce()");
882  }
883 
884  /* Calculate elapsed times and throughput numbers */
885  for (i = 0; i < IOR_NB_TIMERS / 2; i++)
886  diff[i] = reduced[2 * i + 1] - reduced[2 * i];
887 
888  totalTime = reduced[5] - reduced[0];
889  accessTime = reduced[3] - reduced[2];
890 
891  IOR_point_t *point = (access == WRITE) ? &test->results[rep].write :
892  &test->results[rep].read;
893 
894  point->time = totalTime;
895 
896  if (verbose < VERBOSE_0)
897  return;
898 
899  bw = (double)point->aggFileSizeForBW / totalTime;
900 
901  /* For IOPS in this iteration, we divide the total amount of IOs from
902  * all ranks over the entire access time (first start -> last end). */
903  iops = (point->aggFileSizeForBW / params->transferSize) / accessTime;
904 
905  /* For Latency, we divide the total access time for each task over the
906  * number of I/Os issued from that task; then reduce and display the
907  * minimum (best) latency achieved. So what is reported is the average
908  * latency of all ops from a single task, then taking the minimum of
909  * that between all tasks. */
910  latency = (timer[3] - timer[2]) / (params->blockSize / params->transferSize);
911  MPI_CHECK(MPI_Reduce(&latency, &minlatency, 1, MPI_DOUBLE,
912  MPI_MIN, 0, testComm), "MPI_Reduce()");
913 
914  /* Only rank 0 tallies and prints the results. */
915  if (rank != 0)
916  return;
917 
918  PrintReducedResult(test, access, bw, iops, latency, diff, totalTime, rep);
919 }
920 
921 /*
922  * Check for file(s), then remove all files if file-per-proc, else single file.
923  *
924  */
925 static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t * test)
926 {
927  int tmpRankOffset = 0;
928  if (filePerProc) {
929  /* in random tasks, delete own file */
930  if (test->reorderTasksRandom == TRUE) {
931  tmpRankOffset = rankOffset;
932  rankOffset = 0;
933  GetTestFileName(testFileName, test);
934  }
935  if (backend->access(testFileName, F_OK, test->backend_options) == 0) {
936  if (verbose >= VERBOSE_3) {
937  fprintf(out_logfile, "task %d removing %s\n", rank,
938  testFileName);
939  }
940  backend->delete(testFileName, test->backend_options);
941  }
942  if (test->reorderTasksRandom == TRUE) {
943  rankOffset = tmpRankOffset;
944  GetTestFileName(testFileName, test);
945  }
946  } else {
947  if ((rank == 0) && (backend->access(testFileName, F_OK, test->backend_options) == 0)) {
948  if (verbose >= VERBOSE_3) {
949  fprintf(out_logfile, "task %d removing %s\n", rank,
950  testFileName);
951  }
952  backend->delete(testFileName, test->backend_options);
953  }
954  }
955 }
956 
957 /*
958  * Setup tests by parsing commandline and creating test script.
959  * Perform a sanity-check on the configured parameters.
960  */
961 static void InitTests(IOR_test_t *tests)
962 {
963  if(tests == NULL){
964  return;
965  }
966  MPI_Comm com = tests->params.mpi_comm_world;
967  int mpiNumNodes = 0;
968  int mpiNumTasks = 0;
969  int mpiNumTasksOnNode0 = 0;
970 
971  verbose = tests->params.verbose;
973 
974  /*
975  * These default values are the same for every test and expensive to
976  * retrieve so just do it once.
977  */
978  mpiNumNodes = GetNumNodes(com);
979  mpiNumTasks = GetNumTasks(com);
980  mpiNumTasksOnNode0 = GetNumTasksOnNode0(com);
981 
982  /*
983  * Since there is no guarantee that anyone other than
984  * task 0 has the environment settings for the hints, pass
985  * the hint=value pair to everyone else in mpi_comm_world
986  */
987  DistributeHints(com);
988 
989  /* check validity of tests and create test queue */
990  while (tests != NULL) {
991  IOR_param_t *params = & tests->params;
992  params->testComm = com;
993 
994  /* use MPI values if not overridden on command-line */
995  if (params->numNodes == -1) {
996  params->numNodes = mpiNumNodes;
997  }
998  if (params->numTasks == -1) {
999  params->numTasks = mpiNumTasks;
1000  } else if (params->numTasks > mpiNumTasks) {
1001  if (rank == 0) {
1002  EWARNF("More tasks requested (%d) than available (%d),",
1003  params->numTasks, mpiNumTasks);
1004  EWARNF(" running with %d tasks.\n", mpiNumTasks);
1005  }
1006  params->numTasks = mpiNumTasks;
1007  }
1008  if (params->numTasksOnNode0 == -1) {
1009  params->numTasksOnNode0 = mpiNumTasksOnNode0;
1010  }
1011 
1012  params->tasksBlockMapping = QueryNodeMapping(com,false);
1013  params->expectedAggFileSize =
1014  params->blockSize * params->segmentCount * params->numTasks;
1015 
1016  ValidateTests(&tests->params, com);
1017  tests = tests->next;
1018  }
1019 
1020  init_clock(com);
1021 }
1022 
1023 /*
1024  * Setup transfer buffers, creating and filling as needed.
1025  */
1026 static void XferBuffersSetup(IOR_io_buffers* ioBuffers, IOR_param_t* test,
1027  int pretendRank)
1028 {
1029  ioBuffers->buffer = aligned_buffer_alloc(test->transferSize, test->gpuMemoryFlags);
1030 }
1031 
1032 /*
1033  * Free transfer buffers.
1034  */
1035 static void XferBuffersFree(IOR_io_buffers* ioBuffers, IOR_param_t* test)
1036 
1037 {
1038  aligned_buffer_free(ioBuffers->buffer, test->gpuMemoryFlags);
1039 }
1040 
1041 
1042 
/*
 * malloc a buffer, touching every page in an attempt to defeat lazy allocation.
 *
 * One byte is written at offset 0 and then at every multiple of the page
 * size below `size`.
 *
 * Returns NULL when size is 0 or the allocation fails; otherwise a buffer
 * the caller releases with free().
 */
static void *malloc_and_touch(size_t size)
{
        long reported;
        size_t page_size;
        size_t off;
        char *buf;

        if (size == 0)
                return NULL;

        /* sysconf() returns -1 on error; fall back to a common page size */
        reported = sysconf(_SC_PAGESIZE);
        page_size = (reported > 0) ? (size_t)reported : 4096;

        buf = malloc(size);
        if (buf == NULL)
                return NULL;

        /*
         * Walk by index rather than by pointer: advancing a pointer beyond
         * one-past-the-end of the allocation is undefined behaviour.
         */
        for (off = 0; off < size; off += page_size) {
                buf[off] = (char)1;
                /* stop before `off` could wrap around */
                if (size - off <= page_size)
                        break;
        }

        return (void *)buf;
}
1067 
1068 static void file_hits_histogram(IOR_param_t *params)
1069 {
1070  int *rankoffs = NULL;
1071  int *filecont = NULL;
1072  int *filehits = NULL;
1073  int ifile;
1074  int jfile;
1075 
1076  if (rank == 0) {
1077  rankoffs = (int *)malloc(params->numTasks * sizeof(int));
1078  filecont = (int *)malloc(params->numTasks * sizeof(int));
1079  filehits = (int *)malloc(params->numTasks * sizeof(int));
1080  }
1081 
1082  MPI_CHECK(MPI_Gather(&rankOffset, 1, MPI_INT, rankoffs,
1083  1, MPI_INT, 0, params->testComm),
1084  "MPI_Gather error");
1085 
1086  if (rank != 0)
1087  return;
1088 
1089  memset((void *)filecont, 0, params->numTasks * sizeof(int));
1090  for (ifile = 0; ifile < params->numTasks; ifile++) {
1091  filecont[(ifile + rankoffs[ifile]) % params->numTasks]++;
1092  }
1093  memset((void *)filehits, 0, params->numTasks * sizeof(int));
1094  for (ifile = 0; ifile < params->numTasks; ifile++)
1095  for (jfile = 0; jfile < params->numTasks; jfile++) {
1096  if (ifile == filecont[jfile])
1097  filehits[ifile]++;
1098  }
1099  fprintf(out_logfile, "#File Hits Dist:");
1100  jfile = 0;
1101  ifile = 0;
1102  while (jfile < params->numTasks && ifile < params->numTasks) {
1103  fprintf(out_logfile, " %d", filehits[ifile]);
1104  jfile += filehits[ifile], ifile++;
1105  }
1106  fprintf(out_logfile, "\n");
1107  free(rankoffs);
1108  free(filecont);
1109  free(filehits);
1110 }
1111 
1112 
1113 int test_time_elapsed(IOR_param_t *params, double startTime)
1114 {
1115  double endTime;
1116 
1117  if (params->maxTimeDuration == 0)
1118  return 0;
1119 
1120  endTime = startTime + (params->maxTimeDuration * 60);
1121 
1122  return GetTimeStamp() >= endTime;
1123 }
1124 
1125 /*
1126  * hog some memory as a rough simulation of a real application's memory use
1127  */
1128 static void *HogMemory(IOR_param_t *params)
1129 {
1130  size_t size;
1131  void *buf;
1132 
1133  if (params->memoryPerTask != 0) {
1134  size = params->memoryPerTask;
1135  } else if (params->memoryPerNode != 0) {
1136  if (verbose >= VERBOSE_3)
1137  fprintf(out_logfile, "This node hogging %ld bytes of memory\n",
1138  params->memoryPerNode);
1139  size = params->memoryPerNode / params->numTasksOnNode0;
1140  } else {
1141  return NULL;
1142  }
1143 
1144  if (verbose >= VERBOSE_3)
1145  fprintf(out_logfile, "This task hogging %ld bytes of memory\n", size);
1146 
1147  buf = malloc_and_touch(size);
1148  if (buf == NULL)
1149  ERR("malloc of simulated applciation buffer failed");
1150 
1151  return buf;
1152 }
1153 /*
1154  * Write times taken during each iteration of the test.
1155  */
1156 static void
1157 WriteTimes(IOR_param_t *test, const double *timer, const int iteration,
1158  const int access)
1159 {
1160  char timerName[MAX_STR];
1161 
1162  for (int i = 0; i < IOR_NB_TIMERS; i++) {
1163 
1164  if (access == WRITE) {
1165  switch (i) {
1166  case 0:
1167  strcpy(timerName, "write open start");
1168  break;
1169  case 1:
1170  strcpy(timerName, "write open stop");
1171  break;
1172  case 2:
1173  strcpy(timerName, "write start");
1174  break;
1175  case 3:
1176  strcpy(timerName, "write stop");
1177  break;
1178  case 4:
1179  strcpy(timerName, "write close start");
1180  break;
1181  case 5:
1182  strcpy(timerName, "write close stop");
1183  break;
1184  default:
1185  strcpy(timerName, "invalid timer");
1186  break;
1187  }
1188  }
1189  else {
1190  switch (i) {
1191  case 0:
1192  strcpy(timerName, "read open start");
1193  break;
1194  case 1:
1195  strcpy(timerName, "read open stop");
1196  break;
1197  case 2:
1198  strcpy(timerName, "read start");
1199  break;
1200  case 3:
1201  strcpy(timerName, "read stop");
1202  break;
1203  case 4:
1204  strcpy(timerName, "read close start");
1205  break;
1206  case 5:
1207  strcpy(timerName, "read close stop");
1208  break;
1209  default:
1210  strcpy(timerName, "invalid timer");
1211  break;
1212  }
1213  }
1214  fprintf(out_logfile, "Test %d: Iter=%d, Task=%d, Time=%f, %s\n",
1215  test->id, iteration, (int)rank, timer[i],
1216  timerName);
1217  }
1218 }
1219 
1220 static void StoreRankInformation(IOR_test_t *test, double *timer, const int rep, const int access){
1221  IOR_param_t *params = &test->params;
1222  double totalTime = timer[5] - timer[0];
1223  double accessTime = timer[3] - timer[2];
1224  double times[] = {totalTime, accessTime};
1225 
1226  if(rank == 0){
1227  FILE* fd = fopen(params->saveRankDetailsCSV, "a");
1228  if (fd == NULL){
1229  FAIL("Cannot open saveRankPerformanceDetailsCSV file for writes!");
1230  }
1231  int size;
1232  MPI_Comm_size(params->testComm, & size);
1233  double *all_times = malloc(2* size * sizeof(double));
1234  MPI_Gather(times, 2, MPI_DOUBLE, all_times, 2, MPI_DOUBLE, 0, params->testComm);
1235  IOR_point_t *point = (access == WRITE) ? &test->results[rep].write : &test->results[rep].read;
1236  double file_size = ((double) point->aggFileSizeForBW) / size;
1237 
1238  for(int i=0; i < size; i++){
1239  char buff[1024];
1240  sprintf(buff, "%s,%d,%.10e,%.10e,%.10e,%.10e\n", access==WRITE ? "write" : "read", i, all_times[i*2], all_times[i*2+1], file_size/all_times[i*2], file_size/all_times[i*2+1] );
1241  int ret = fwrite(buff, strlen(buff), 1, fd);
1242  if(ret != 1){
1243  WARN("Couln't append to saveRankPerformanceDetailsCSV file\n");
1244  break;
1245  }
1246  }
1247  fclose(fd);
1248  }else{
1249  MPI_Gather(& times, 2, MPI_DOUBLE, NULL, 2, MPI_DOUBLE, 0, testComm);
1250  }
1251 }
1252 
1253 static void ProcessIterResults(IOR_test_t *test, double *timer, const int rep, const int access){
1254  IOR_param_t *params = &test->params;
1255 
1256  if (verbose >= VERBOSE_3)
1257  WriteTimes(params, timer, rep, access);
1258  ReduceIterResults(test, timer, rep, access);
1259  if (params->outlierThreshold) {
1260  CheckForOutliers(params, timer, access);
1261  }
1262 
1263  if(params->saveRankDetailsCSV){
1264  StoreRankInformation(test, timer, rep, access);
1265  }
1266 }
1267 
1268 /*
1269  * Using the test parameters, run iteration(s) of single test.
 *
 * Per repetition the function runs, in this order and each only when
 * enabled and while test_time_elapsed() is false: a timed write phase
 * (params->writeFile), a write-check phase (params->checkWrite), and a
 * timed read / read-check phase (params->readFile or params->checkRead).
 * Afterwards the test file is removed unless params->keepFile is set (or
 * an error was found and params->keepFileWithError is set).
 *
 * timer[] slots, matching the labels printed by WriteTimes():
 *   0 open start, 1 open stop, 2 transfer start, 3 transfer stop,
 *   4 close start, 5 close stop.
 *
 * NOTE(review): barriers and the time-stamp broadcast use the global
 * testComm, while other helpers in this file use params->testComm;
 * confirm both refer to the same communicator.
1270  */
1271 static void TestIoSys(IOR_test_t *test)
1272 {
1273  IOR_param_t *params = &test->params;
1274  IOR_results_t *results = test->results;
1275  char testFileName[MAX_STR];
1276  double timer[IOR_NB_TIMERS];
1277  double startTime;
1278  int pretendRank;
1279  int rep;
1280  aiori_fd_t *fd;
1281  IOR_offset_t dataMoved; /* for data rate calculation */
1282  void *hog_buf;
1283  IOR_io_buffers ioBuffers;
1284
1285  if (rank == 0 && verbose >= VERBOSE_1) {
1286  fprintf(out_logfile, "Participating tasks : %d\n", params->numTasks);
1287  fflush(out_logfile);
1288  }
1289  if (rank == 0 && params->reorderTasks == TRUE && verbose >= VERBOSE_1) {
1290  fprintf(out_logfile,
1291  "Using reorderTasks '-C' (useful to avoid read cache in client)\n");
1292  fflush(out_logfile);
1293  }
1294  /* show test setup */
1295  if (rank == 0 && verbose >= VERBOSE_0)
1296  ShowSetup(params);
1297
 /* optional memory hog; released at the end of this function */
1298  hog_buf = HogMemory(params);
1299
 /* rank used for buffer contents and offsets, shifted by rankOffset */
1300  pretendRank = (rank + rankOffset) % params->numTasks;
1301
1302  /* IO Buffer Setup */
1303
1304  if (params->setTimeStampSignature) { // initialize the buffer properly
1305  params->timeStampSignatureValue = (unsigned int) params->setTimeStampSignature;
1306  }
1307  XferBuffersSetup(&ioBuffers, params, pretendRank);
1308  reseed_incompressible_prng = TRUE; // reset pseudo random generator, necessary to guarantee the next call to FillBuffer produces the same value as it is right now
1309
1310  /* Initial time stamp */
1311  startTime = GetTimeStamp();
1312
1313  /* loop over test iterations */
 /* remember the configured wear-out count; it is restored before every write phase */
1314  uint64_t params_saved_wearout = params->stoneWallingWearOutIterations;
1315
1316  /* Check if the file exists and warn users */
1317  if((params->writeFile || params->checkWrite) && (params->hints.filePerProc || rank == 0)){
1318  struct stat sb;
1319  GetTestFileName(testFileName, params);
1320  int ret = backend->stat(testFileName, & sb, params->backend_options);
1321  if(ret == 0) {
1322  EWARNF("The file \"%s\" exists already and will be overwritten", testFileName);
1323  }
1324  }
1325
1326  for (rep = 0; rep < params->repetitions; rep++) {
1327  /* Get iteration start time in seconds in task 0 and broadcast to
1328  all tasks */
1329  if (rank == 0) {
1330  if (! params->setTimeStampSignature) {
1331  time_t currentTime;
1332  if ((currentTime = time(NULL)) == -1) {
1333  ERR("cannot get current time");
1334  }
1335  params->timeStampSignatureValue =
1336  (unsigned int)currentTime;
1337  }
1338  if (verbose >= VERBOSE_2) {
1339  fprintf(out_logfile,
1340  "Using Time Stamp %u (0x%x) for Data Signature\n",
1341  params->timeStampSignatureValue,
1342  params->timeStampSignatureValue);
1343  }
1344  if (rep == 0 && verbose >= VERBOSE_0) {
1345  PrintTableHeader();
1346  }
1347  }
1348  MPI_CHECK(MPI_Bcast
1349  (&params->timeStampSignatureValue, 1, MPI_UNSIGNED, 0,
1350  testComm), "cannot broadcast start time value");
1351
1352  FillBuffer(ioBuffers.buffer, params, 0, pretendRank);
1353  /* use repetition count for number of multiple files */
1354  if (params->multiFile)
1355  params->repCounter = rep;
1356
1357  /*
1358  * write the file(s), getting timing between I/O calls
1359  */
1360
1361  if (params->writeFile && !test_time_elapsed(params, startTime)) {
1362  GetTestFileName(testFileName, params);
1363  if (verbose >= VERBOSE_3) {
1364  fprintf(out_logfile, "task %d writing %s\n", rank,
1365  testFileName);
1366  }
1367  DelaySecs(params->interTestDelay);
1368  if (params->useExistingTestFile == FALSE) {
1369  RemoveFile(testFileName, params->filePerProc,
1370  params);
1371  }
1372
1373  params->stoneWallingWearOutIterations = params_saved_wearout;
1374  MPI_CHECK(MPI_Barrier(testComm), "barrier error");
1375  params->open = WRITE;
1376  timer[0] = GetTimeStamp();
1377  fd = backend->create(testFileName, IOR_WRONLY | IOR_CREAT | IOR_TRUNC, params->backend_options);
1378  if(fd == NULL) FAIL("Cannot create file");
1379  timer[1] = GetTimeStamp();
1380  if (params->intraTestBarriers)
1381  MPI_CHECK(MPI_Barrier(testComm),
1382  "barrier error");
1383  if (rank == 0 && verbose >= VERBOSE_1) {
1384  fprintf(out_logfile,
1385  "Commencing write performance test: %s",
1386  CurrentTimeString());
1387  }
1388  timer[2] = GetTimeStamp();
1389  dataMoved = WriteOrRead(params, &results[rep], fd, WRITE, &ioBuffers);
 /* NOTE(review): "%llu" is used for dataMoved (IOR_offset_t); confirm the type is unsigned long long */
1390  if (params->verbose >= VERBOSE_4) {
1391  fprintf(out_logfile, "* data moved = %llu\n", dataMoved);
1392  fflush(out_logfile);
1393  }
1394  timer[3] = GetTimeStamp();
1395  if (params->intraTestBarriers)
1396  MPI_CHECK(MPI_Barrier(testComm),
1397  "barrier error");
1398  timer[4] = GetTimeStamp();
1399  backend->close(fd, params->backend_options);
1400
1401  timer[5] = GetTimeStamp();
1402  MPI_CHECK(MPI_Barrier(testComm), "barrier error");
1403
1404  /* check if stat() of file doesn't equal expected file size,
1405  use actual amount of byte moved */
1406  CheckFileSize(test, testFileName, dataMoved, rep, WRITE);
1407
1408  ProcessIterResults(test, timer, rep, WRITE);
1409
1410  /* check if in this round we run write with stonewalling */
 /* carry the number of pairs written forward as the wear-out count for later phases */
1411  if(params->deadlineForStonewalling > 0){
1412  params->stoneWallingWearOutIterations = results[rep].write.pairs_accessed;
1413  }
1414  }
1415
1416  /*
1417  * perform a check of data, reading back data and comparing
1418  * against what was expected to be written
1419  */
1420  if (params->checkWrite && !test_time_elapsed(params, startTime)) {
1421  MPI_CHECK(MPI_Barrier(testComm), "barrier error");
1422  if (rank == 0 && verbose >= VERBOSE_1) {
1423  fprintf(out_logfile,
1424  "Verifying contents of the file(s) just written.\n");
1425  fprintf(out_logfile, "%s\n", CurrentTimeString());
1426  }
1427  if (params->reorderTasks) {
1428  /* move two nodes away from writing node */
1429  int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */
1430  if (params->tasksBlockMapping) {
1431  shift = params->numTasksOnNode0; /* switch to by-slot (contiguous block) mapping */
1432  }
1433  rankOffset = (2 * shift) % params->numTasks;
1434  }
1435  reseed_incompressible_prng = TRUE; /* Re-Seed the PRNG to get same sequence back, if random */
1436
1437  GetTestFileName(testFileName, params);
1438  params->open = WRITECHECK;
1439  fd = backend->open(testFileName, IOR_RDONLY, params->backend_options);
1440  if(fd == NULL) FAIL("Cannot open file");
1441  dataMoved = WriteOrRead(params, &results[rep], fd, WRITECHECK, &ioBuffers);
1442  backend->close(fd, params->backend_options);
1443  rankOffset = 0;
1444  }
1445  /*
1446  * read the file(s), getting timing between I/O calls
1447  */
1448  if ((params->readFile || params->checkRead ) && !test_time_elapsed(params, startTime)) {
1449  /* check for stonewall */
 /* NOTE(review): source line 1451 is absent from this listing; the statement that sets stoneWallingWearOutIterations before the -1 test below is not visible here */
1450  if(params->stoneWallingStatusFile){
1452  if(params->stoneWallingWearOutIterations == -1 && rank == 0){
1453  WARN("Could not read back the stonewalling status from the file!");
1454  params->stoneWallingWearOutIterations = 0;
1455  }
1456  }
1457  int operation_flag = READ;
1458  if ( params->checkRead ){
1459  // actually read and then compare the buffer
1460  operation_flag = READCHECK;
1461  }
1462  /* Get rankOffset [file offset] for this process to read, based on -C,-Z,-Q,-X options */
1463  /* Constant process offset reading */
1464  if (params->reorderTasks) {
1465  /* move one node away from writing node */
1466  int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */
1467  if (params->tasksBlockMapping) {
1468  shift=params->numTasksOnNode0; /* switch to a by-slot (contiguous block) mapping */
1469  }
1470  rankOffset = (params->taskPerNodeOffset * shift) % params->numTasks;
1471  }
1472  /* random process offset reading */
1473  if (params->reorderTasksRandom) {
1474  /* this should not intefere with randomOffset within a file because GetOffsetArrayRandom */
1475  /* seeds every rand() call */
1476  int nodeoffset;
1477  unsigned int iseed0;
1478  nodeoffset = params->taskPerNodeOffset;
1479  nodeoffset = (nodeoffset < params->numNodes) ? nodeoffset : params->numNodes - 1;
1480  if (params->reorderTasksRandomSeed < 0)
1481  iseed0 = -1 * params->reorderTasksRandomSeed + rep;
1482  else
1483  iseed0 = params->reorderTasksRandomSeed;
1484  srand(rank + iseed0);
1485  {
1486  rankOffset = rand() % params->numTasks;
1487  }
 /* redraw until the offset is at least nodeoffset * numTasksOnNode0 */
1488  while (rankOffset <
1489  (nodeoffset * params->numTasksOnNode0)) {
1490  rankOffset = rand() % params->numTasks;
1491  }
1492  /* Get more detailed stats if requested by verbose level */
1493  if (verbose >= VERBOSE_2) {
1494  file_hits_histogram(params);
1495  }
1496  }
1497  /* Using globally passed rankOffset, following function generates testFileName to read */
1498  GetTestFileName(testFileName, params);
1499
1500  if (verbose >= VERBOSE_3) {
1501  fprintf(out_logfile, "task %d reading %s\n", rank,
1502  testFileName);
1503  }
1504  DelaySecs(params->interTestDelay);
1505  MPI_CHECK(MPI_Barrier(testComm), "barrier error");
1506  params->open = READ;
1507  timer[0] = GetTimeStamp();
1508  fd = backend->open(testFileName, IOR_RDONLY, params->backend_options);
1509  if(fd == NULL) FAIL("Cannot open file");
1510  timer[1] = GetTimeStamp();
1511  if (params->intraTestBarriers)
1512  MPI_CHECK(MPI_Barrier(testComm),
1513  "barrier error");
1514  if (rank == 0 && verbose >= VERBOSE_1) {
1515  fprintf(out_logfile,
1516  "Commencing read performance test: %s\n",
1517  CurrentTimeString());
1518  }
1519  timer[2] = GetTimeStamp();
1520  dataMoved = WriteOrRead(params, &results[rep], fd, operation_flag, &ioBuffers);
1521  timer[3] = GetTimeStamp();
1522  if (params->intraTestBarriers)
1523  MPI_CHECK(MPI_Barrier(testComm),
1524  "barrier error");
1525  timer[4] = GetTimeStamp();
1526  backend->close(fd, params->backend_options);
1527  timer[5] = GetTimeStamp();
1528
1529  /* check if stat() of file doesn't equal expected file size,
1530  use actual amount of byte moved */
1531  CheckFileSize(test, testFileName, dataMoved, rep, READ);
1532
1533  ProcessIterResults(test, timer, rep, READ);
1534  }
1535
 /* clean up the test file of this repetition and time the removal */
1536  if (!params->keepFile
1537  && !(params->errorFound && params->keepFileWithError)) {
1538  double start, finish;
1539  start = GetTimeStamp();
1540  MPI_CHECK(MPI_Barrier(testComm), "barrier error");
1541  RemoveFile(testFileName, params->filePerProc, params);
1542  MPI_CHECK(MPI_Barrier(testComm), "barrier error");
1543  finish = GetTimeStamp();
1544  PrintRemoveTiming(start, finish, rep);
1545  } else {
1546  MPI_CHECK(MPI_Barrier(testComm), "barrier error");
1547  }
 /* reset per-repetition state */
1548  params->errorFound = FALSE;
1549  rankOffset = 0;
1550
1551  }
1552  PrintRepeatEnd();
1553
 /* NOTE(review): source lines 1555-1556 are absent from this listing, so the summary_every_test branch appears empty here */
1554  if (params->summary_every_test) {
1557  } else {
1558  PrintShortSummary(test);
1559  }
1560
1561  XferBuffersFree(&ioBuffers, params);
1562
1563  if (hog_buf != NULL)
1564  free(hog_buf);
1565 }
1566 
1567 /*
1568  * Determine if valid tests from parameters.
1569  */
1570 static void ValidateTests(IOR_param_t * test, MPI_Comm com)
1571 {
1572  IOR_param_t defaults;
1573  init_IOR_Param_t(&defaults, com);
1574 
1575  if (test->repetitions <= 0)
1576  WARN_RESET("too few test repetitions",
1577  test, &defaults, repetitions);
1578  if (test->numTasks <= 0)
1579  ERR("too few tasks for testing");
1580  if (test->interTestDelay < 0)
1581  WARN_RESET("inter-test delay must be nonnegative value",
1582  test, &defaults, interTestDelay);
1583  if (test->readFile != TRUE && test->writeFile != TRUE
1584  && test->checkRead != TRUE && test->checkWrite != TRUE)
1585  ERR("test must write, read, or check read/write file");
1586  if(! test->setTimeStampSignature && test->writeFile != TRUE && test->checkRead == TRUE)
1587  ERR("using readCheck only requires to write a timeStampSignature -- use -G");
1588  if (test->segmentCount < 0)
1589  ERR("segment count must be positive value");
1590  if ((test->blockSize % sizeof(IOR_size_t)) != 0)
1591  ERR("block size must be a multiple of access size");
1592  if (test->blockSize < 0)
1593  ERR("block size must be non-negative integer");
1594  if ((test->transferSize % sizeof(IOR_size_t)) != 0)
1595  ERR("transfer size must be a multiple of access size");
1596  if (test->transferSize < 0)
1597  ERR("transfer size must be non-negative integer");
1598  if (test->transferSize == 0) {
1599  ERR("test will not complete with zero transfer size");
1600  } else {
1601  if ((test->blockSize % test->transferSize) != 0)
1602  ERR("block size must be a multiple of transfer size");
1603  }
1604  if (test->blockSize < test->transferSize)
1605  ERR("block size must not be smaller than transfer size");
1606  if (test->randomOffset && test->blockSize == test->transferSize)
1607  ERR("IOR will randomize access within a block and repeats the same pattern for all segments, therefore choose blocksize > transferSize");
1608  if (! test->randomOffset && test->randomPrefillBlocksize)
1609  ERR("Setting the randomPrefill option without using random is not useful");
1610  if (test->randomPrefillBlocksize && (test->blockSize % test->randomPrefillBlocksize != 0))
1611  ERR("The randomPrefill option must divide the blockSize");
1612  /* specific APIs */
1613  if ((strcasecmp(test->api, "MPIIO") == 0)
1614  && (test->blockSize < sizeof(IOR_size_t)
1615  || test->transferSize < sizeof(IOR_size_t)))
1616  ERR("block/transfer size may not be smaller than IOR_size_t for MPIIO");
1617  if ((strcasecmp(test->api, "HDF5") == 0)
1618  && (test->blockSize < sizeof(IOR_size_t)
1619  || test->transferSize < sizeof(IOR_size_t)))
1620  ERR("block/transfer size may not be smaller than IOR_size_t for HDF5");
1621  if ((strcasecmp(test->api, "NCMPI") == 0)
1622  && (test->blockSize < sizeof(IOR_size_t)
1623  || test->transferSize < sizeof(IOR_size_t)))
1624  ERR("block/transfer size may not be smaller than IOR_size_t for NCMPI");
1625  if (((strcasecmp(test->api, "POSIX") != 0)
1626  && (strcasecmp(test->api, "MPIIO") != 0)
1627  && (strcasecmp(test->api, "MMAP") != 0)
1628  && (strcasecmp(test->api, "HDFS") != 0)
1629  && (strcasecmp(test->api, "DFS") != 0)
1630  && (strcasecmp(test->api, "Gfarm") != 0)
1631  && (strcasecmp(test->api, "RADOS") != 0)
1632  && (strcasecmp(test->api, "CEPHFS") != 0)) && test->fsync)
1633  WARN_RESET("fsync() not supported in selected backend",
1634  test, &defaults, fsync);
1635  /* parameter consistency */
1636  if (test->reorderTasks == TRUE && test->reorderTasksRandom == TRUE)
1637  ERR("Both Constant and Random task re-ordering specified. Choose one and resubmit");
1638  if (test->randomOffset && test->reorderTasksRandom
1639  && test->filePerProc == FALSE)
1640  ERR("random offset and random reorder tasks specified with single-shared-file. Choose one and resubmit");
1641  if (test->randomOffset && test->reorderTasks
1642  && test->filePerProc == FALSE)
1643  ERR("random offset and constant reorder tasks specified with single-shared-file. Choose one and resubmit");
1644  if (test->randomOffset && test->checkRead && test->randomSeed == -1)
1645  ERR("random offset with read check option requires to set the random seed");
1646  if (test->randomOffset && test->storeFileOffset)
1647  ERR("random offset not available with store file offset option)");
1648  if ((strcasecmp(test->api, "HDF5") == 0) && test->randomOffset)
1649  ERR("random offset not available with HDF5");
1650  if ((strcasecmp(test->api, "NCMPI") == 0) && test->randomOffset)
1651  ERR("random offset not available with NCMPI");
1652  if ((strcasecmp(test->api, "NCMPI") == 0) && test->filePerProc)
1653  ERR("file-per-proc not available in current NCMPI");
1654 
1655  backend = test->backend;
1656  ior_set_xfer_hints(test);
1657  /* allow the backend to validate the options */
1658  if(test->backend->check_params){
1659  int check = test->backend->check_params(test->backend_options);
1660  if (check){
1661  ERR("The backend returned that the test parameters are invalid.");
1662  }
1663  }
1664 }
1665 
1679 IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, IOR_offset_t * out_count)
1680 {
1681  int seed;
1682  IOR_offset_t i;
1683  IOR_offset_t offsets;
1684  IOR_offset_t offsetCnt = 0;
1685  IOR_offset_t *offsetArray;
1686 
1687  if (test->filePerProc) {
1688  /* set up seed, each process can determine which regions to access individually */
1689  if (test->randomSeed == -1) {
1690  seed = time(NULL);
1691  test->randomSeed = seed;
1692  } else {
1693  seed = test->randomSeed + pretendRank;
1694  }
1695  }else{
1696  /* Shared file requires that the seed is synchronized */
1697  if (test->randomSeed == -1) {
1698  // all processes need to have the same seed.
1699  if(rank == 0){
1700  seed = time(NULL);
1701  }
1702  MPI_CHECK(MPI_Bcast(& seed, 1, MPI_INT, 0, test->testComm), "cannot broadcast random seed value");
1703  test->randomSeed = seed;
1704  }else{
1705  seed = test->randomSeed;
1706  }
1707  }
1708  srandom(seed);
1709 
1710  /* count needed offsets (pass 1) */
1711  if (test->filePerProc) {
1712  offsets = test->blockSize / test->transferSize;
1713  }else{
1714  offsets = 0;
1715  for (i = 0; i < test->blockSize * test->numTasks; i += test->transferSize) {
1716  // this counts which process get how many transferes in the shared file
1717  if ((rand() % test->numTasks) == pretendRank) {
1718  offsets++;
1719  }
1720  }
1721  }
1722 
1723  /* setup empty array */
1724  offsetArray = (IOR_offset_t *) safeMalloc(offsets * sizeof(IOR_offset_t));
1725 
1726  *out_count = offsets;
1727 
1728  if (test->filePerProc) {
1729  /* fill array */
1730  for (i = 0; i < offsets; i++) {
1731  offsetArray[i] = i * test->transferSize;
1732  }
1733  } else {
1734  /* fill with offsets (pass 2) */
1735  srandom(seed); /* need same seed to get same transfers as counted in the beginning*/
1736  for (i = 0; i < test->blockSize * test->numTasks; i += test->transferSize) {
1737  if ((rand() % test->numTasks) == pretendRank) {
1738  offsetArray[offsetCnt] = i;
1739  offsetCnt++;
1740  }
1741  }
1742  }
1743  /* reorder array */
1744  for (i = 0; i < offsets; i++) {
1745  IOR_offset_t value, tmp;
1746  value = rand() % offsets;
1747  tmp = offsetArray[value];
1748  offsetArray[value] = offsetArray[i];
1749  offsetArray[i] = tmp;
1750  }
1751 
1752  return (offsetArray);
1753 }
1754 
1755 static IOR_offset_t WriteOrReadSingle(IOR_offset_t offset, int pretendRank, IOR_offset_t transfer, IOR_offset_t * transferCount, int * errors, IOR_param_t * test, aiori_fd_t * fd, IOR_io_buffers* ioBuffers, int access){
1756  IOR_offset_t amtXferred = 0;
1757 
1758  void *buffer = ioBuffers->buffer;
1759  if (access == WRITE) {
1760  /* fills each transfer with a unique pattern
1761  * containing the offset into the file */
1762  if (test->storeFileOffset == TRUE) {
1763  FillBuffer(buffer, test, offset, pretendRank);
1764  }
1765  amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options);
1766  if (amtXferred != transfer)
1767  ERR("cannot write to file");
1768  if (test->fsyncPerWrite)
1769  backend->fsync(fd, test->backend_options);
1770  if (test->interIODelay > 0){
1771  struct timespec wait = {test->interIODelay / 1000 / 1000, 1000l * (test->interIODelay % 1000000)};
1772  nanosleep( & wait, NULL);
1773  }
1774  } else if (access == READ) {
1775  amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options);
1776  if (amtXferred != transfer)
1777  ERR("cannot read from file");
1778  if (test->interIODelay > 0){
1779  struct timespec wait = {test->interIODelay / 1000 / 1000, 1000l * (test->interIODelay % 1000000)};
1780  nanosleep( & wait, NULL);
1781  }
1782  } else if (access == WRITECHECK) {
1783  ((long long int*) buffer)[0] = ~((long long int*) buffer)[0]; // changes the buffer, no memset to reduce the memory pressure
1784  amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options);
1785  if (amtXferred != transfer)
1786  ERR("cannot read from file write check");
1787  (*transferCount)++;
1788  *errors += CompareData(buffer, transfer, *transferCount, test, offset, pretendRank, WRITECHECK);
1789  } else if (access == READCHECK) {
1790  ((long long int*) buffer)[0] = ~((long long int*) buffer)[0]; // changes the buffer, no memset to reduce the memory pressure
1791  amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options);
1792  if (amtXferred != transfer){
1793  ERR("cannot read from file");
1794  }
1795  *errors += CompareData(buffer, transfer, *transferCount, test, offset, pretendRank, READCHECK);
1796  }
1797  return amtXferred;
1798 }
1799 
/*
 * Write the prefill buffer over every block of the segments
 * [startSegment, endSegment) that belong to pretendRank, in chunks of
 * test->randomPrefillBlocksize bytes.
 *
 * ioBuffers->buffer is temporarily replaced by randomPrefillBuffer so that
 * WriteOrReadSingle() writes the prefill data, and is restored on return.
 *
 * NOTE(review): source line 1809 (the declaration and initial value of
 * 'offset') is absent from this listing.
 */
1800 static void prefillSegment(IOR_param_t *test, void * randomPrefillBuffer, int pretendRank, aiori_fd_t *fd, IOR_io_buffers *ioBuffers, int startSegment, int endSegment){
1801  // prefill the whole file already with an invalid pattern
 /* number of prefill-sized chunks per block */
1802  int offsets = test->blockSize / test->randomPrefillBlocksize;
1803  void * oldBuffer = ioBuffers->buffer;
 /* passed by address below but left uninitialized; the WRITE path of WriteOrReadSingle() does not dereference them */
1804  IOR_offset_t transferCount;
1805  int errors;
1806  ioBuffers->buffer = randomPrefillBuffer;
1807  for (int i = startSegment; i < endSegment; i++){
1808  for (int j = 0; j < offsets; j++) {
 /* file-per-process: segments follow each other; shared file: each segment holds one block per task */
1810  if (test->filePerProc) {
1811  offset += i * test->blockSize;
1812  } else {
1813  offset += (i * test->numTasks * test->blockSize) + (pretendRank * test->blockSize);
1814  }
1815  WriteOrReadSingle(offset, pretendRank, test->randomPrefillBlocksize, & transferCount, & errors, test, fd, ioBuffers, WRITE);
1816  }
1817  }
1818  ioBuffers->buffer = oldBuffer;
1819 }
1820 
1821 /*
1822  * Write or Read data to file(s). This loops through the strides, writing
1823  * out the data to each block in transfer sizes, until the remainder left is 0.
 *
 * For every segment and every transfer inside a block the file offset is
 * computed and handed to WriteOrReadSingle(). The loops end early once the
 * stonewalling deadline has passed or the configured number of wear-out
 * iterations has been reached. With stoneWallingWearOut set, all tasks
 * afterwards continue until each has performed the maximum number of
 * transfers any task reached.
 *
 * Returns the sum of the amounts returned by WriteOrReadSingle().
 *
 * NOTE(review): the first line of the signature (source line 1825) and
 * source lines 1884, 1931 and 1945 are absent from this listing; the
 * declaration of 'offset' and the assignment of
 * stonewall_avg_data_accessed are therefore not visible here.
 * NOTE(review): no free() of offsets_rnd is visible in this function;
 * confirm who releases the array returned by GetOffsetArrayRandom().
1824  */
1826  aiori_fd_t *fd, const int access, IOR_io_buffers *ioBuffers)
1827 {
1828  int errors = 0;
1829  IOR_offset_t transferCount = 0;
1830  uint64_t pairCnt = 0;
1831  int pretendRank;
1832  IOR_offset_t dataMoved = 0; /* for data rate calculation */
1833  double startForStonewall;
1834  int hitStonewall;
1835  int i, j;
 /* write and write-check results go to results->write, everything else to results->read */
1836  IOR_point_t *point = ((access == WRITE) || (access == WRITECHECK)) ?
1837  &results->write : &results->read;
1838
1839  /* initialize values */
1840  pretendRank = (rank + rankOffset) % test->numTasks;
1841
1842  // offsetArray = GetOffsetArraySequential(test, pretendRank);
1843
 /* offsets: number of transfers per block; offsets_rnd is only set (and only read) when randomOffset is enabled */
1844  IOR_offset_t offsets;
1845  IOR_offset_t * offsets_rnd;
1846  if (test->randomOffset) {
1847  offsets_rnd = GetOffsetArrayRandom(test, pretendRank, & offsets);
1848  }else{
1849  offsets = (test->blockSize / test->transferSize);
1850  }
1851
1852  void * randomPrefillBuffer = NULL;
1853  if(test->randomPrefillBlocksize && (access == WRITE || access == WRITECHECK)){
1854  randomPrefillBuffer = aligned_buffer_alloc(test->randomPrefillBlocksize, test->gpuMemoryFlags);
1855  // store invalid data into the buffer
1856  memset(randomPrefillBuffer, -1, test->randomPrefillBlocksize);
1857  }
1858
1859  // start timer after random offset was generated
1860  startForStonewall = GetTimeStamp();
1861  hitStonewall = 0;
1862
 /* without a stonewalling deadline all segments are prefilled up front */
1863  if(randomPrefillBuffer && test->deadlineForStonewalling == 0){
1864  double t_start = GetTimeStamp();
1865  prefillSegment(test, randomPrefillBuffer, pretendRank, fd, ioBuffers, 0, test->segmentCount);
1866  if(rank == 0 && verbose > VERBOSE_1){
1867  fprintf(out_logfile, "Random prefill took: %fs\n", GetTimeStamp() - t_start);
1868  }
1869  // must synchronize processes to ensure they are not running ahead
1870  MPI_Barrier(test->testComm);
1871  }
1872
1873  for (i = 0; i < test->segmentCount && !hitStonewall; i++) {
1874  if(randomPrefillBuffer && test->deadlineForStonewalling != 0){
1875  // prefill the whole segment with data, this needs to be done collectively
1876  double t_start = GetTimeStamp();
1877  prefillSegment(test, randomPrefillBuffer, pretendRank, fd, ioBuffers, i, i+1);
1878  MPI_Barrier(test->testComm);
1879  if(rank == 0 && verbose > VERBOSE_1){
1880  fprintf(out_logfile, "Random: synchronizing segment count with barrier and prefill took: %fs\n", GetTimeStamp() - t_start);
1881  }
1882  }
1883  for (j = 0; j < offsets && !hitStonewall ; j++) {
 /* offset = position inside the block + start of this task's block in segment i */
1885  if (test->randomOffset) {
1886  if(test->filePerProc){
1887  offset = offsets_rnd[j] + (i * test->blockSize);
1888  }else{
1889  offset = offsets_rnd[j] + (i * test->numTasks * test->blockSize);
1890  }
1891  }else{
1892  offset = j * test->transferSize;
1893  if (test->filePerProc) {
1894  offset += i * test->blockSize;
1895  } else {
1896  offset += (i * test->numTasks * test->blockSize) + (pretendRank * test->blockSize);
1897  }
1898  }
1899  dataMoved += WriteOrReadSingle(offset, pretendRank, test->transferSize, & transferCount, & errors, test, fd, ioBuffers, access);
1900  pairCnt++;
1901
 /* stop when the deadline has passed or the wear-out iteration count is reached */
1902  hitStonewall = ((test->deadlineForStonewalling != 0
1903  && (GetTimeStamp() - startForStonewall) > test->deadlineForStonewalling))
1904  || (test->stoneWallingWearOutIterations != 0 && pairCnt == test->stoneWallingWearOutIterations) ;
1905
1906  if ( test->collective && test->deadlineForStonewalling ) {
1907  // if collective-mode, you'll get a HANG, if some rank 'accidentally' leave this loop
1908  // it absolutely must be an 'all or none':
1909  MPI_CHECK(MPI_Bcast(&hitStonewall, 1, MPI_INT, 0, testComm), "hitStonewall broadcast failed");
1910  }
1911  }
1912  }
1913  if (test->stoneWallingWearOut){
1914  if (verbose >= VERBOSE_1){
1915  fprintf(out_logfile, "%d: stonewalling pairs accessed: %lld\n", rank, (long long) pairCnt);
1916  }
1917  long long data_moved_ll = (long long) dataMoved;
1918  long long pairs_accessed_min = 0;
 /* NOTE(review): pairCnt is uint64_t and is reduced as MPI_LONG_LONG_INT; confirm the receiving field has a matching width */
1919  MPI_CHECK(MPI_Allreduce(& pairCnt, &point->pairs_accessed,
1920  1, MPI_LONG_LONG_INT, MPI_MAX, testComm), "cannot reduce pairs moved");
1921  double stonewall_runtime = GetTimeStamp() - startForStonewall;
1922  point->stonewall_time = stonewall_runtime;
1923  MPI_CHECK(MPI_Reduce(& pairCnt, & pairs_accessed_min,
1924  1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm), "cannot reduce pairs moved");
1925  MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_min_data_accessed,
1926  1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm), "cannot reduce pairs moved");
1927  MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_total_data_accessed,
1928  1, MPI_LONG_LONG_INT, MPI_SUM, 0, testComm), "cannot reduce pairs moved");
1929
1930  if(rank == 0){
1932  fprintf(out_logfile, "stonewalling pairs accessed min: %lld max: %zu -- min data: %.1f GiB mean data: %.1f GiB time: %.1fs\n",
1933  pairs_accessed_min, point->pairs_accessed,
1934  point->stonewall_min_data_accessed /1024.0 / 1024 / 1024, point->stonewall_avg_data_accessed / 1024.0 / 1024 / 1024 , point->stonewall_time);
1935  }
 /* this task did fewer transfers than the maximum: resume at (i, j) where the main loops stopped */
1936  if(pairCnt != point->pairs_accessed){
1937  // some work needs still to be done, complete the current block !
1938  i--;
1939  if(j == offsets){
1940  j = 0; // current block is completed
1941  i++;
1942  }
1943  for ( ; pairCnt < point->pairs_accessed; i++) {
1944  for ( ; j < offsets && pairCnt < point->pairs_accessed ; j++) {
1946  if (test->randomOffset) {
1947  if(test->filePerProc){
1948  offset = offsets_rnd[j] + (i * test->blockSize);
1949  }else{
1950  offset = offsets_rnd[j] + (i * test->numTasks * test->blockSize);
1951  }
1952  }else{
1953  offset = j * test->transferSize;
1954  if (test->filePerProc) {
1955  offset += i * test->blockSize;
1956  } else {
1957  offset += (i * test->numTasks * test->blockSize) + (pretendRank * test->blockSize);
1958  }
1959  }
1960  dataMoved += WriteOrReadSingle(offset, pretendRank, test->transferSize, & transferCount, & errors, test, fd, ioBuffers, access);
1961  pairCnt++;
1962  }
1963  j = 0;
1964  }
1965  }
1966  }else{
1967  point->pairs_accessed = pairCnt;
1968  }
1969
1970  totalErrorCount += CountErrors(test, access, errors);
1971
1972  if (access == WRITE && test->fsync == TRUE) {
1973  backend->fsync(fd, test->backend_options); /*fsync after all accesses */
1974  }
1975  if(randomPrefillBuffer){
1976  aligned_buffer_free(randomPrefillBuffer, test->gpuMemoryFlags);
1977  }
1978
1979  return (dataMoved);
1980 }
int reorderTasks
Definition: ior.h:126
int uniqueDir
Definition: ior.h:143
#define ERRF(FORMAT,...)
Definition: aiori-debug.h:77
void init_IOR_Param_t(IOR_param_t *p, MPI_Comm com)
Definition: ior.c:248
int GetNumTasks(MPI_Comm comm)
Definition: utilities.c:368
IOR_test_t * ParseCommandLine(int argc, char **argv, MPI_Comm com)
int reorderTasksRandomSeed
Definition: ior.h:129
int ior_main(int argc, char **argv)
Definition: ior.c:183
size_t pairs_accessed
Definition: ior.h:189
int warningAsErrors
Definition: ior.h:181
long long stonewall_avg_data_accessed
Definition: ior.h:193
unsigned long GetProcessorAndCore(int *chip, int *core)
Definition: utilities.c:911
MPI_Comm mpi_comm_world
Definition: ior.h:107
int errors
Definition: ior.h:202
int multiFile
Definition: ior.h:119
static void file_hits_histogram(IOR_param_t *params)
Definition: ior.c:1068
static void DisplayOutliers(int numTasks, double timerVal, char *timeString, int access, int outlierThreshold)
Definition: ior.c:285
void PrintTestEnds()
Definition: ior-output.c:212
IOR_offset_t segmentCount
Definition: aiori.h:71
unsigned int incompressibleSeed
Definition: ior.h:157
#define VERBOSE_0
Definition: iordef.h:92
static void ValidateTests(IOR_param_t *params, MPI_Comm com)
Definition: ior.c:1570
char * GetPlatformName()
Definition: ior.c:671
IOR_offset_t aggFileSizeFromStat
Definition: ior.h:196
unsigned int timeStampSignatureValue
Definition: ior.h:155
int filePerProc
Definition: ior.h:125
FILE * out_logfile
Definition: utilities.c:72
int gpuID
Definition: ior.h:112
#define VERBOSE_3
Definition: iordef.h:95
double stonewall_time
Definition: ior.h:191
int repetitions
Definition: ior.h:117
int64_t ReadStoneWallingIterations(char *const filename, MPI_Comm com)
Definition: utilities.c:791
IOR_offset_t segmentCount
Definition: ior.h:135
IOR_offset_t blockSize
Definition: aiori.h:72
int keepFile
Definition: ior.h:132
void PrintHeader(int argc, char **argv)
Definition: ior-output.c:269
char ** environ
static void XferBuffersFree(IOR_io_buffers *ioBuffers, IOR_param_t *test)
Definition: ior.c:1035
int checkRead
Definition: ior.h:131
void PrintLongSummaryOneTest(IOR_test_t *test)
Definition: ior-output.c:632
int test_time_elapsed(IOR_param_t *params, double startTime)
Definition: ior.c:1113
int numTasksOnNode0
Definition: ior.h:115
void(* delete)(char *, aiori_mod_opt_t *module_options)
Definition: aiori.h:100
void FreeResults(IOR_test_t *test)
Definition: ior.c:531
IOR_offset_t transferSize
Definition: ior.h:137
size_t memoryPerNode
Definition: ior.h:160
#define WRITECHECK
Definition: iordef.h:87
IOR_param_t params
Definition: ior.h:209
void PrintLongSummaryHeader()
Definition: ior-output.c:642
ior_memory_flags gpuMemoryFlags
Definition: ior.h:110
#define READCHECK
Definition: iordef.h:89
int(* mkdir)(const char *path, mode_t mode, aiori_mod_opt_t *module_options)
Definition: aiori.h:105
static size_t CompareData(void *expectedBuffer, size_t size, IOR_offset_t transferCount, IOR_param_t *test, IOR_offset_t offset, int fillrank, int access)
Definition: ior.c:409
int storeFileOffset
Definition: ior.h:145
int errorFound
Definition: ior.h:134
IOR_offset_t aggFileSizeFromXfer
Definition: ior.h:197
double sd
Definition: ior-internal.h:35
int QueryNodeMapping(MPI_Comm comm, int print_nodemap)
Definition: utilities.c:287
static int totalErrorCount
Definition: ior.c:52
#define FAIL(...)
Definition: aiori-debug.h:12
int numTasks
static void * HogMemory(IOR_param_t *params)
Definition: ior.c:1128
int summary_every_test
Definition: ior.h:142
IOR_offset_t * GetOffsetArrayRandom(IOR_param_t *test, int pretendRank, IOR_offset_t *out_count)
Definition: ior.c:1679
static void DestroyTest(IOR_test_t *test)
Definition: ior.c:558
static void ReduceIterResults(IOR_test_t *test, double *timer, const int rep, const int access)
Definition: ior.c:864
int numNodes
Definition: ior.h:114
int setTimeStampSignature
Definition: ior.h:154
IOR_offset_t expectedAggFileSize
Definition: aiori.h:74
#define IOR_RDONLY
Definition: aiori.h:28
int(* access)(const char *path, int mode, aiori_mod_opt_t *module_options)
Definition: aiori.h:107
#define WARN_RESET(MSG, TO_STRUCT_PTR, FROM_STRUCT_PTR, MEMBER)
Definition: aiori-debug.h:21
int fsyncPerWrite
Definition: ior.h:170
int interTestDelay
Definition: ior.h:120
#define MPI_CHECK(MPI_STATUS, MSG)
Definition: aiori-debug.h:127
#define WRITE
Definition: iordef.h:86
IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE *world_out)
Definition: ior.c:149
int maxTimeDuration
Definition: ior.h:151
char * testFileName
Definition: ior.h:102
static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results, aiori_fd_t *fd, const int access, IOR_io_buffers *ioBuffers)
Definition: ior.c:1825
#define VERBOSE_5
Definition: iordef.h:97
char * stoneWallingStatusFile
Definition: ior.h:149
void ShowTestStart(IOR_param_t *params)
Definition: ior-output.c:323
#define READ
Definition: iordef.h:88
double * val
Definition: ior-internal.h:37
void * backend_options
Definition: ior.h:166
MPI_Comm testComm
Definition: ior.h:106
int taskPerNodeOffset
Definition: ior.h:127
#define IOR_CREAT
Definition: aiori.h:32
static char ** ParseFileName(char *, int *)
Definition: ior.c:715
double sum
Definition: ior-internal.h:36
int fsync
Definition: ior.h:171
double var
Definition: ior-internal.h:34
struct IOR_test_t * next
Definition: ior.h:211
int outlierThreshold
Definition: ior.h:152
static void ProcessIterResults(IOR_test_t *test, double *timer, const int rep, const int access)
Definition: ior.c:1253
int intraTestBarriers
Definition: ior.h:180
void GetTestFileName(char *testFileName, IOR_param_t *test)
Definition: ior.c:756
MPI_Comm testComm
Definition: utilities.c:71
int reorderTasksRandom
Definition: ior.h:128
int(* check_params)(aiori_mod_opt_t *)
Definition: aiori.h:113
int checkWrite
Definition: ior.h:130
IOR_point_t write
Definition: ior.h:203
#define IOR_TRUNC
Definition: aiori.h:33
static void DistributeHints(MPI_Comm com)
Definition: ior.c:577
static unsigned int reseed_incompressible_prng
Definition: ior.c:620
void ShowSetup(IOR_param_t *params)
Definition: ior-output.c:410
static aiori_xfer_hint_t * hints
Definition: aiori-aio.c:75
Definition: ior.h:72
void init_clock(MPI_Comm com)
Definition: utilities.c:772
IOR_offset_t aggFileSizeForBW
Definition: ior.h:198
void(* initialize)(aiori_mod_opt_t *options)
Definition: aiori.h:109
int verbose
Definition: ior.h:153
static void XferBuffersSetup(IOR_io_buffers *ioBuffers, IOR_param_t *test, int pretendRank)
Definition: ior.c:1026
char * CurrentTimeString(void)
Definition: utilities.c:241
void PrintRemoveTiming(double start, double finish, int rep)
Definition: ior-output.c:732
#define WARN(MSG)
Definition: aiori-debug.h:32
void(* fsync)(aiori_fd_t *, aiori_mod_opt_t *module_options)
Definition: aiori.h:102
int collective
Definition: aiori.h:66
static void FillBuffer(void *buffer, IOR_param_t *test, unsigned long long offset, int fillrank)
Definition: ior.c:644
static int test_initialize(IOR_test_t *test)
Definition: ior.c:90
double time
Definition: ior.h:188
IOR_point_t read
Definition: ior.h:204
static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t *test)
Definition: ior.c:925
static void CheckForOutliers(IOR_param_t *test, const double *timer, const int access)
Definition: ior.c:326
IOR_offset_t(* get_file_size)(aiori_mod_opt_t *module_options, char *filename)
Definition: aiori.h:103
int dryRun
Definition: ior.h:108
IOR_offset_t expectedAggFileSize
Definition: ior.h:138
char * platform
Definition: ior.h:101
int singleXferAttempt
Definition: aiori.h:75
int GetNumNodes(MPI_Comm comm)
Definition: utilities.c:331
int singleXferAttempt
Definition: ior.h:169
static void DestroyTests(IOR_test_t *tests_head)
Definition: ior.c:564
void(* xfer_hints)(aiori_xfer_hint_t *params)
Definition: aiori.h:96
Definition: ior.h:56
void(* close)(aiori_fd_t *, aiori_mod_opt_t *module_options)
Definition: aiori.h:99
int aiori_warning_as_errors
Definition: ior.c:85
int(* stat)(const char *path, struct stat *buf, aiori_mod_opt_t *module_options)
Definition: aiori.h:108
int interIODelay
Definition: ior.h:121
#define EWARN(MSG)
Definition: aiori-debug.h:59
static void CheckFileSize(IOR_test_t *test, char *testFilename, IOR_offset_t dataMoved, int rep, const int access)
Definition: ior.c:348
char * saveRankDetailsCSV
Definition: ior.h:141
FILE * out_resultfile
Definition: utilities.c:73
double GetTimeStamp(void)
Definition: utilities.c:731
static void WriteTimes(IOR_param_t *test, const double *timer, const int iteration, const int access)
Definition: ior.c:1157
#define EWARNF(FORMAT,...)
Definition: aiori-debug.h:45
static void prefillSegment(IOR_param_t *test, void *randomPrefillBuffer, int pretendRank, aiori_fd_t *fd, IOR_io_buffers *ioBuffers, int startSegment, int endSegment)
Definition: ior.c:1800
IOR_offset_t transferSize
Definition: aiori.h:73
void PrintShortSummary(IOR_test_t *test)
Definition: ior-output.c:685
int stoneWallingWearOut
Definition: ior.h:147
static const ior_aiori_t * backend
Definition: ior.c:53
void PrintRepeatEnd()
Definition: ior-output.c:197
aiori_fd_t *(* create)(char *, int iorflags, aiori_mod_opt_t *)
Definition: aiori.h:90
long long stonewall_min_data_accessed
Definition: ior.h:192
IOR_offset_t(* xfer)(int access, aiori_fd_t *, IOR_size_t *, IOR_offset_t size, IOR_offset_t offset, aiori_mod_opt_t *module_options)
Definition: aiori.h:97
IOR_test_t * CreateTest(IOR_param_t *init_params, int test_num)
Definition: ior.c:542
aiori_xfer_hint_t hints
Definition: ior.h:183
char * URI
Definition: ior.h:173
static void TestIoSys(IOR_test_t *)
Definition: ior.c:1271
void * buffer
Definition: ior.h:74
void PrintTableHeader()
Definition: ior-output.c:18
void PrintLongSummaryAllTests(IOR_test_t *tests_head)
Definition: ior-output.c:661
#define IOR_WRONLY
Definition: aiori.h:29
void PrintReducedResult(IOR_test_t *test, int access, double bw, double iops, double latency, double *diff_subset, double totalTime, int rep)
Definition: ior-output.c:225
int keepFileWithError
Definition: ior.h:133
int randomSeed
Definition: ior.h:156
#define FALSE
Definition: iordef.h:62
int rankOffset
Definition: utilities.c:69
int useExistingTestFile
Definition: ior.h:144
enum PACKET_TYPE dataPacketType
Definition: ior.h:164
static void StoreRankInformation(IOR_test_t *test, double *timer, const int rep, const int access)
Definition: ior.c:1220
int readFile
Definition: ior.h:123
long long int IOR_size_t
Definition: iordef.h:110
int tasksBlockMapping
Definition: ior.h:116
int randomOffset
Definition: ior.h:158
int numTasks
Definition: ior.h:113
size_t memoryPerTask
Definition: ior.h:159
void(* finalize)(aiori_mod_opt_t *options)
Definition: aiori.h:110
const char * aiori_default(void)
Definition: aiori.c:296
#define VERBOSE_2
Definition: iordef.h:94
#define IOR_NB_TIMERS
Definition: ior.c:48
int writeFile
Definition: ior.h:124
uint64_t stoneWallingWearOutIterations
Definition: ior.h:148
int verbose
Definition: utilities.c:70
#define MAX_STR
Definition: iordef.h:99
#define MAX_HINTS
Definition: iordef.h:100
int collective
Definition: ior.h:105
int randomOffset
Definition: aiori.h:69
static int CountErrors(IOR_param_t *test, int access, int errors)
Definition: ior.c:492
#define VERBOSE_4
Definition: iordef.h:96
IOR_offset_t randomPrefillBlocksize
Definition: ior.h:139
#define MAX_PATHLEN
Definition: utilities.h:31
double mean
Definition: ior-internal.h:33
static void * malloc_and_touch(size_t size)
Definition: ior.c:1046
int open
Definition: ior.h:122
aiori_fd_t *(* open)(char *, int iorflags, aiori_mod_opt_t *)
Definition: aiori.h:92
const struct ior_aiori * backend
Definition: ior.h:96
static void FillIncompressibleBuffer(void *buffer, IOR_param_t *test)
Definition: ior.c:623
int dualMount
Definition: ior.h:109
#define ERR(MSG)
Definition: aiori-debug.h:92
static char * PrependDir(IOR_param_t *, char *)
Definition: ior.c:811
void DelaySecs(int delay)
Definition: utilities.c:828
#define VERBOSE_1
Definition: iordef.h:93
IOR_results_t * results
Definition: ior.h:210
int deadlineForStonewalling
Definition: ior.h:146
char * api
Definition: ior.h:99
#define FILENAME_DELIMITER
Definition: iordef.h:107
static void test_finalize(IOR_test_t *test)
Definition: ior.c:139
int repCounter
Definition: ior.h:118
long long stonewall_total_data_accessed
Definition: ior.h:194
int fsyncPerWrite
Definition: aiori.h:70
void aligned_buffer_free(void *buf, ior_memory_flags gpu)
Definition: utilities.c:973
int filePerProc
Definition: aiori.h:65
static void ior_set_xfer_hints(IOR_param_t *p)
Definition: ior.c:65
static void InitTests(IOR_test_t *)
Definition: ior.c:961
long long int IOR_offset_t
Definition: iordef.h:109
int rank
Definition: utilities.c:68
IOR_offset_t blockSize
Definition: ior.h:136
int GetNumTasksOnNode0(MPI_Comm comm)
Definition: utilities.c:406
#define TRUE
Definition: iordef.h:66
static IOR_offset_t WriteOrReadSingle(IOR_offset_t offset, int pretendRank, IOR_offset_t transfer, IOR_offset_t *transferCount, int *errors, IOR_param_t *test, aiori_fd_t *fd, IOR_io_buffers *ioBuffers, int access)
Definition: ior.c:1755
void ShowTestEnd(IOR_test_t *tptr)
Definition: ior-output.c:394
void * safeMalloc(uint64_t size)
Definition: utilities.c:125
#define NULL
Definition: iordef.h:70
int id
Definition: ior.h:179
void AllocResults(IOR_test_t *test)
Definition: ior.c:521
void * aligned_buffer_alloc(size_t size, ior_memory_flags type)
Definition: utilities.c:924