IOR
utilities.c
Go to the documentation of this file.
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  */
4 /******************************************************************************\
5 * *
6 * Copyright (c) 2003, The Regents of the University of California *
7 * See the file COPYRIGHT for a complete copyright notice and license. *
8 * *
9 ********************************************************************************
10 *
11 * Additional utilities
12 *
13 \******************************************************************************/
14 
15 #ifdef HAVE_CONFIG_H
16 # include "config.h"
17 #endif
18 
19 #ifdef HAVE_GETCPU_SYSCALL
20 # define _GNU_SOURCE
21 # include <unistd.h>
22 # include <sys/syscall.h>
23 #endif
24 
25 #ifdef __linux__
26 # define _GNU_SOURCE /* Needed for O_DIRECT in fcntl */
27 #endif /* __linux__ */
28 
29 #include <stdarg.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <errno.h>
33 #include <fcntl.h>
34 #include <math.h> /* pow() */
35 #include <string.h>
36 #include <sys/stat.h>
37 #include <sys/types.h>
38 #include <time.h>
39 
40 #ifdef HAVE_CUDA
41 #include <cuda_runtime.h>
42 #endif
43 
44 #ifndef _WIN32
45 # include <regex.h>
46 # ifdef __sun /* SunOS does not support statfs(), instead uses statvfs() */
47 # include <sys/statvfs.h>
48 # elif (defined __APPLE__)
49 # include <sys/param.h>
50 # include <sys/mount.h>
51 # else /* ! __sun or __APPLE__ */
52 # include <sys/statfs.h>
53 # endif /* __sun */
54 # include <sys/time.h> /* gettimeofday() */
55 #endif
56 
57 #include "utilities.h"
58 #include "aiori.h"
59 #include "ior.h"
60 #include "ior-internal.h"
61 
62 /************************** D E C L A R A T I O N S ***************************/
63 
64 extern int errno;
65 extern int numTasks;
66 
67 /* globals used by other files, also defined "extern" in utilities.h */
68 int rank = 0;
69 int rankOffset = 0;
70 int verbose = VERBOSE_0; /* verbose output */
71 MPI_Comm testComm;
72 FILE * out_logfile = NULL;
75 
76 /***************************** F U N C T I O N S ******************************/
77 
/*
 * Stamp the item number into the beginning of a transfer buffer.
 *
 * item:        value to store
 * buf:         buffer to stamp
 * bytes:       number of valid bytes in buf
 * buff_offset: unused by this function
 * rank:        unused by this function
 *
 * With at least 8 bytes available the full 64-bit item is stored in the
 * first 8 bytes; with 1..7 bytes only the low byte of item is stored in
 * buf[0]; an empty buffer is left untouched.
 */
void update_write_memory_pattern(uint64_t item, char * buf, size_t bytes, int buff_offset, int rank){
        (void) buff_offset;
        (void) rank;

        if(bytes >= sizeof(item)){
                /* memcpy keeps the store valid even when buf is not 8-byte aligned */
                memcpy(buf, &item, sizeof(item));
        }else if(bytes > 0){
                buf[0] = (char) item;
        }
}
85 
/*
 * Fill a transfer buffer with a pattern derived from rank and offset.
 *
 * buf:         buffer to fill
 * bytes:       number of bytes in buf
 * buff_offset: added to the low part of every 64-bit word
 * rank:        rank + 1 is placed in the upper 32 bits of every word
 *
 * The first 8 bytes are left untouched (word index 0 is skipped).
 * Every following complete 64-bit word i receives (i + 1) plus the
 * rank/offset value; bytes beyond the last complete word receive the low
 * byte of their own index.
 */
void generate_memory_pattern(char * buf, size_t bytes, int buff_offset, int rank){
        /* upper 32 bits carry rank + 1, buff_offset is added on top */
        const uint64_t ranki = ((uint64_t)(rank + 1) << 32) + buff_offset;
        const size_t words = bytes / sizeof(uint64_t);

        for(size_t i = 1; i < words; i++){
                const uint64_t word = (i + 1) + ranki;
                /* memcpy avoids a misaligned 64-bit store into a char buffer */
                memcpy(buf + i * sizeof(word), &word, sizeof(word));
        }
        for(size_t i = words * sizeof(uint64_t); i < bytes; i++){
                buf[i] = (char) i;
        }
}
99 
/*
 * Check a buffer against the expected item/rank/offset pattern.
 *
 * item:        expected item number at the start of the buffer
 * buffer:      data to check
 * bytes:       number of bytes in buffer
 * buff_offset: offset value that is part of every expected 64-bit word
 * pretendRank: pretendRank + 1 is expected in the upper 32 bits of each word
 *
 * Returns 0 when the whole buffer matches and 1 otherwise. An empty buffer
 * trivially matches.
 *
 * Layout that is checked:
 *   - bytes >= 8: first 8 bytes hold item, every further complete word i
 *     holds (i + 1) + rank/offset value, trailing bytes hold (char) index;
 *   - 1..7 bytes: buffer[0] holds the low byte of item, the remaining bytes
 *     hold (char) index.
 */
int verify_memory_pattern(int item, char * buffer, size_t bytes, int buff_offset, int pretendRank){
        int error = 0;
        const size_t words = bytes / sizeof(uint64_t);
        /* first half of 64 bits use the rank, here need to apply rank shifting */
        const uint64_t rank_mod = ((uint64_t)(pretendRank + 1) << 32) + buff_offset;
        size_t tail;

        /* always read all data to ensure that performance numbers stay the same */
        if(words >= 1){
                uint64_t first;
                /* memcpy keeps the load valid for unaligned buffers */
                memcpy(&first, buffer, sizeof(first));
                if(first != (uint64_t) item){
                        error = 1;
                }
                tail = words * sizeof(uint64_t);
        }else{
                if(bytes > 0 && buffer[0] != (char) item){
                        error = 1;
                }
                /* byte 0 carries the item, so index checks start at byte 1 */
                tail = 1;
        }

        for(size_t i = 1; i < words; i++){
                uint64_t seen;
                memcpy(&seen, buffer + i * sizeof(seen), sizeof(seen));
                if(seen != (i + 1) + rank_mod){
                        error = 1;
                }
        }
        for(size_t i = tail; i < bytes; i++){
                if(buffer[i] != (char) i){
                        error = 1;
                }
        }
        return error;
}
124 
/*
 * Allocate a zero-initialised array of 'size' bytes.
 *
 * Reports the failure through ERR() when the memory cannot be obtained or
 * when 'size' cannot be represented as a size_t on this platform.
 * NOTE(review): ERR is defined elsewhere and presumably aborts the run;
 * should it return, this function returns NULL instead of touching the
 * missing buffer.
 */
void* safeMalloc(uint64_t size){
        /* reject sizes that would be silently truncated by the conversion */
        if ((uint64_t)(size_t) size != size){
                ERR("Could not malloc an array");
                return NULL;
        }
        /* calloc hands back zeroed memory, no separate memset needed */
        void * d = calloc(1, (size_t) size);
        if (d == NULL){
                ERR("Could not malloc an array");
        }
        return d;
}
133 
134 void FailMessage(int rank, const char *location, char *format, ...) {
135  char msg[4096];
136  va_list args;
137  va_start(args, format);
138  vsnprintf(msg, 4096, format, args);
139  va_end(args);
140  fprintf(out_logfile, "%s: Process %d: FAILED in %s, %s\n",
141  PrintTimestamp(), rank, location, msg);
142  fflush(out_logfile);
143  MPI_Abort(testComm, 1);
144 }
145 
/*
 * Convert a memory specification into a number of bytes.
 *
 * size_str is either a percentage of the physical node memory, written as
 * an integer followed by '%' (e.g. "50%" or " 50 % "), or any other string,
 * which is handed unchanged to string_to_bytes().
 *
 * Only strings in which the number is really followed by '%' are treated as
 * a percentage; a value such as "2g" is therefore parsed as a size and not
 * as "2 percent".
 *
 * A percentage outside 0..100, or a platform without
 * sysconf(_SC_PHYS_PAGES), is reported through ERR().
 */
size_t NodeMemoryStringToBytes(char *size_str)
{
        int percent = 0;
        char suffix = '\0';
        int rc;
        long page_size;
        long num_pages;
        long long mem;

        /* both the number and the trailing '%' have to be present */
        rc = sscanf(size_str, " %d %c", &percent, &suffix);
        if (rc != 2 || suffix != '%')
                return (size_t) string_to_bytes(size_str);
        if (percent > 100 || percent < 0)
                ERR("percentage must be between 0 and 100");

#ifdef HAVE_SYSCONF
        page_size = sysconf(_SC_PAGESIZE);
#else
        page_size = getpagesize();
#endif

#ifdef _SC_PHYS_PAGES
        num_pages = sysconf(_SC_PHYS_PAGES);
        if (num_pages == -1)
                ERR("sysconf(_SC_PHYS_PAGES) is not supported");
#else
        num_pages = 0;
        ERR("sysconf(_SC_PHYS_PAGES) is not supported");
#endif
        /* widen before multiplying so the product cannot overflow a 32-bit long */
        mem = (long long) page_size * num_pages;

        return mem / 100 * percent;
}
177 
179  if (options->setTimeStampSignature){
180  options->incompressibleSeed = options->setTimeStampSignature;
181  }
182 
183  if (options->buffer_type && options->buffer_type[0] != 0){
184  switch(options->buffer_type[0]) {
185  case 'i': /* Incompressible */
186  options->dataPacketType = incompressible;
187  break;
188  case 't': /* timestamp */
189  options->dataPacketType = timestamp;
190  break;
191  case 'o': /* offset packet */
192  options->storeFileOffset = TRUE;
193  options->dataPacketType = offset;
194  break;
195  default:
196  fprintf(out_logfile,
197  "Unknown argument for -l %s; generic assumed\n", options->buffer_type);
198  break;
199  }
200  }
201  if (options->memoryPerNodeStr){
203  }
204  const ior_aiori_t * backend = aiori_select(options->api);
205  if (backend == NULL)
206  ERR("Unrecognized I/O API");
207 
208  options->backend = backend;
209  /* copy the actual module options into the test */
210  options->backend_options = airoi_update_module_options(backend, global_options);
211  options->apiVersion = backend->get_version();
212 }
213 
/*
 * Add the platform's direct-I/O open flag to *flag.
 * Used in aiori-POSIX.c and aiori-PLFS.c
 *
 * Note that TRU64 needs O_DIRECTIO, SunOS uses directio(), and everyone
 * else needs O_DIRECT. When neither O_DIRECT nor O_DIRECTIO is available a
 * warning is issued and O_DIRECT is defined as 0, which leaves *flag
 * unchanged.
 */
void set_o_direct_flag(int *flag)
{
#if !defined(O_DIRECT) && !defined(O_DIRECTIO)
        WARN("cannot use O_DIRECT");
# define O_DIRECT 000000
#elif !defined(O_DIRECT)
# define O_DIRECT O_DIRECTIO
#endif

        *flag |= O_DIRECT;
}
232 
233 
/*
 * Return the current time as a string.
 *
 * NOTE: On some systems, MPI jobs hang while ctime() waits for a lock.
 * This is true even though CurrentTimeString() is only called for rank==0.
 * ctime_r() fixes this, so it is used whenever one of the feature macros
 * tested below is set.
 *
 * The returned text ends with '\n' and lives in static storage (either the
 * local static buffer or the one owned by ctime()).
 */
char *CurrentTimeString(void)
{
        static time_t currentTime;
        char *timeText;

        currentTime = time(NULL);
        if (currentTime == -1)
                ERR("cannot get current time");

#if (_POSIX_C_SOURCE >= 1 || _XOPEN_SOURCE || _BSD_SOURCE || _SVID_SOURCE || _POSIX_SOURCE)
        static char threadSafeBuff[32]; /* "must be at least 26 characters long" */
        timeText = ctime_r(&currentTime, threadSafeBuff);
#else
        timeText = ctime(&currentTime);
#endif
        if (timeText == NULL) {
                ERR("cannot read current time");
        }

        /* ctime string ends in \n */
        return timeText;
}
262 
263 /*
264  * Dump transfer buffer.
265  */
266 void DumpBuffer(void *buffer,
267  size_t size) /* <size> in bytes */
268 {
269  size_t i, j;
270  IOR_size_t *dumpBuf = (IOR_size_t *)buffer;
271 
272  /* Turns out, IOR_size_t is unsigned long long, but we don't want
273  to assume that it must always be */
274  for (i = 0; i < ((size / sizeof(IOR_size_t)) / 4); i++) {
275  for (j = 0; j < 4; j++) {
276  fprintf(out_logfile, IOR_format" ", dumpBuf[4 * i + j]);
277  }
278  fprintf(out_logfile, "\n");
279  }
280  return;
281 } /* DumpBuffer() */
282 
283 /* a function that prints an int array where each index corresponds to a rank
284  and the value is whether that rank is on the same host as root.
285  Also returns 1 if rank 1 is on same host and 0 otherwise
286 */
287 int QueryNodeMapping(MPI_Comm comm, int print_nodemap) {
288  char localhost[MAX_PATHLEN], roothost[MAX_PATHLEN];
289  int num_ranks;
290  MPI_Comm_size(comm, &num_ranks);
291  int *node_map = (int*)malloc(sizeof(int) * num_ranks);
292  if ( ! node_map ) {
293  FAIL("malloc");
294  }
295  if (gethostname(localhost, MAX_PATHLEN) != 0) {
296  FAIL("gethostname()");
297  }
298  if (rank==0) {
299  strncpy(roothost,localhost,MAX_PATHLEN);
300  }
301 
302  /* have rank 0 broadcast out its hostname */
303  MPI_Bcast(roothost, MAX_PATHLEN, MPI_CHAR, 0, comm);
304  //printf("Rank %d received root host as %s\n", rank, roothost);
305  /* then every rank figures out whether it is same host as root and then gathers that */
306  int same_as_root = strcmp(roothost,localhost) == 0;
307  MPI_Gather( &same_as_root, 1, MPI_INT, node_map, 1, MPI_INT, 0, comm);
308  if ( print_nodemap && rank==0) {
309  fprintf( out_logfile, "Nodemap: " );
310  for ( int i = 0; i < num_ranks; i++ ) {
311  fprintf( out_logfile, "%d", node_map[i] );
312  }
313  fprintf( out_logfile, "\n" );
314  }
315  int ret = 1;
316  if(num_ranks>1)
317  ret = node_map[1] == 1;
318  MPI_Bcast(&ret, 1, MPI_INT, 0, comm);
319  free(node_map);
320  return ret;
321 }
322 
323 /*
324  * There is a more direct way to determine the node count in modern MPI
325  * versions so we use that if possible.
326  *
327  * For older versions we use a method which should still provide accurate
328  * results even if the total number of tasks is not evenly divisible by the
329  * tasks on node rank 0.
330  */
331 int GetNumNodes(MPI_Comm comm) {
332  if (getenv("IOR_FAKE_NODES")){
333  int numNodes = atoi(getenv("IOR_FAKE_NODES"));
334  int rank;
335  MPI_Comm_rank(comm, & rank);
336  if(rank == 0){
337  printf("Fake number of node: using %d\n", numNodes);
338  }
339  return numNodes;
340  }
341 #if MPI_VERSION >= 3
342  MPI_Comm shared_comm;
343  int shared_rank = 0;
344  int local_result = 0;
345  int numNodes = 0;
346 
347  MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
348  "MPI_Comm_split_type() error");
349  MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
350  local_result = shared_rank == 0? 1 : 0;
351  MPI_CHECK(MPI_Allreduce(&local_result, &numNodes, 1, MPI_INT, MPI_SUM, comm),
352  "MPI_Allreduce() error");
353  MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
354 
355  return numNodes;
356 #else
357  int numTasks = 0;
358  int numTasksOnNode0 = 0;
359 
360  numTasks = GetNumTasks(comm);
361  numTasksOnNode0 = GetNumTasksOnNode0(comm);
362 
363  return ((numTasks - 1) / numTasksOnNode0) + 1;
364 #endif
365 }
366 
367 
368 int GetNumTasks(MPI_Comm comm) {
369  int numTasks = 0;
370 
371  MPI_CHECK(MPI_Comm_size(comm, &numTasks), "cannot get number of tasks");
372 
373  return numTasks;
374 }
375 
376 
377 /*
378  * It's very important that this method provide the same result to every
379  * process as it's used for redistributing which jobs read from which files.
380  * It was renamed accordingly.
381  *
382  * If different nodes get different results from this method then jobs get
383  * redistributed unevenly and you no longer have a 1:1 relationship with some
384  * nodes reading multiple files while others read none.
385  *
386  * In the common case the number of tasks on each node (MPI_Comm_size on an
387  * MPI_COMM_TYPE_SHARED communicator) will be the same. However, there is
388  * nothing which guarantees this. It's valid to have, for example, 64 jobs
389  * across 4 systems which can run 20 jobs each. In that scenario you end up
390  * with 3 MPI_COMM_TYPE_SHARED groups of 20, and one group of 4.
391  *
392  * In the (MPI_VERSION < 3) implementation of this method consistency is
393  * ensured by asking specifically about the number of tasks on the node with
394  * rank 0. In the original implementation for (MPI_VERSION >= 3) this was
395  * broken by using the LOCAL process count which differed depending on which
396  * node you were on.
397  *
398  * This was corrected below by first splitting the comm into groups by node
399  * (MPI_COMM_TYPE_SHARED) and then having only the node with world rank 0 and
400  * shared rank 0 return the MPI_Comm_size of its shared subgroup. This yields
401  * the original consistent behavior no matter which node asks.
402  *
403  * In the common case where every node has the same number of tasks this
404  * method will return the same value it always has.
405  */
406 int GetNumTasksOnNode0(MPI_Comm comm) {
407  if (getenv("IOR_FAKE_TASK_PER_NODES")){
408  int tasksPerNode = atoi(getenv("IOR_FAKE_TASK_PER_NODES"));
409  int rank;
410  MPI_Comm_rank(comm, & rank);
411  if(rank == 0){
412  printf("Fake tasks per node: using %d\n", tasksPerNode);
413  }
414  return tasksPerNode;
415  }
416 #if MPI_VERSION >= 3
417  MPI_Comm shared_comm;
418  int shared_rank = 0;
419  int tasks_on_node_rank0 = 0;
420  int local_result = 0;
421 
422  MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
423  "MPI_Comm_split_type() error");
424  MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
425  if (rank == 0 && shared_rank == 0) {
426  MPI_CHECK(MPI_Comm_size(shared_comm, &local_result), "MPI_Comm_size() error");
427  }
428  MPI_CHECK(MPI_Allreduce(&local_result, &tasks_on_node_rank0, 1, MPI_INT, MPI_SUM, comm),
429  "MPI_Allreduce() error");
430  MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
431 
432  return tasks_on_node_rank0;
433 #else
434 /*
435  * This version employs the gethostname() call, rather than using
436  * MPI_Get_processor_name(). We are interested in knowing the number
437  * of tasks that share a file system client (I/O node, compute node,
438  * whatever that may be). However on machines like BlueGene/Q,
439  * MPI_Get_processor_name() uniquely identifies a cpu in a compute node,
440  * not the node where the I/O is function shipped to. gethostname()
441  * is assumed to identify the shared filesystem client in more situations.
442  */
443  int size;
444  MPI_Comm_size(comm, & size);
445  /* for debugging and testing */
446  char localhost[MAX_PATHLEN],
447  hostname[MAX_PATHLEN];
448  int count = 1,
449  i;
450  MPI_Status status;
451 
452  if (( rank == 0 ) && ( verbose >= 1 )) {
453  fprintf( out_logfile, "V-1: Entering count_tasks_per_node...\n" );
454  fflush( out_logfile );
455  }
456 
457  if (gethostname(localhost, MAX_PATHLEN) != 0) {
458  FAIL("gethostname()");
459  }
460  if (rank == 0) {
461  /* MPI_receive all hostnames, and compares them to the local hostname */
462  for (i = 0; i < size-1; i++) {
463  MPI_Recv(hostname, MAX_PATHLEN, MPI_CHAR, MPI_ANY_SOURCE,
464  MPI_ANY_TAG, comm, &status);
465  if (strcmp(hostname, localhost) == 0) {
466  count++;
467  }
468  }
469  } else {
470  /* MPI_send hostname to root node */
471  MPI_Send(localhost, MAX_PATHLEN, MPI_CHAR, 0, 0, comm);
472  }
473  MPI_Bcast(&count, 1, MPI_INT, 0, comm);
474 
475  return(count);
476 #endif
477 }
478 
479 
480 /*
481  * Extract key/value pair from hint string.
482  */
483 void ExtractHint(char *settingVal, char *valueVal, char *hintString)
484 {
485  char *settingPtr, *valuePtr, *tmpPtr2;
486 
487  /* find the value */
488  settingPtr = (char *)strtok(hintString, " =");
489  valuePtr = (char *)strtok(NULL, " =\t\r\n");
490  /* is this an MPI hint? */
491  tmpPtr2 = (char *) strstr(settingPtr, "IOR_HINT__MPI__");
492  if (settingPtr == tmpPtr2) {
493  settingPtr += strlen("IOR_HINT__MPI__");
494  } else {
495  tmpPtr2 = (char *) strstr(hintString, "IOR_HINT__GPFS__");
496  /* is it an GPFS hint? */
497  if (settingPtr == tmpPtr2) {
498  settingPtr += strlen("IOR_HINT__GPFS__");
499  }else{
500  fprintf(out_logfile, "WARNING: Unable to set unknown hint type (not implemented.)\n");
501  return;
502  }
503  }
504  strcpy(settingVal, settingPtr);
505  strcpy(valueVal, valuePtr);
506 }
507 
508 /*
509  * Set hints for MPIIO, HDF5, or NCMPI.
510  */
511 void SetHints(MPI_Info * mpiHints, char *hintsFileName)
512 {
513  char hintString[MAX_STR];
514  char settingVal[MAX_STR];
515  char valueVal[MAX_STR];
516  extern char **environ;
517  int i;
518  FILE *fd;
519 
520  /*
521  * This routine checks for hints from the environment and/or from the
522  * hints files. The hints are of the form:
523  * 'IOR_HINT__<layer>__<hint>=<value>', where <layer> is either 'MPI'
524  * or 'GPFS', <hint> is the full name of the hint to be set, and <value>
525  * is the hint value. E.g., 'setenv IOR_HINT__MPI__IBM_largeblock_io true'
526  * or 'IOR_HINT__GPFS__hint=value' in the hints file.
527  */
528  MPI_CHECK(MPI_Info_create(mpiHints), "cannot create info object");
529 
530  /* get hints from environment */
531  for (i = 0; environ[i] != NULL; i++) {
532  /* if this is an IOR_HINT, pass the hint to the info object */
533  if (strncmp(environ[i], "IOR_HINT", strlen("IOR_HINT")) == 0) {
534  strcpy(hintString, environ[i]);
535  ExtractHint(settingVal, valueVal, hintString);
536  MPI_CHECK(MPI_Info_set(*mpiHints, settingVal, valueVal),
537  "cannot set info object");
538  }
539  }
540 
541  /* get hints from hints file */
542  if (hintsFileName != NULL && strcmp(hintsFileName, "") != 0) {
543 
544  /* open the hint file */
545  fd = fopen(hintsFileName, "r");
546  if (fd == NULL) {
547  WARN("cannot open hints file");
548  } else {
549  /* iterate over hints file */
550  while (fgets(hintString, MAX_STR, fd) != NULL) {
551  if (strncmp
552  (hintString, "IOR_HINT",
553  strlen("IOR_HINT")) == 0) {
554  ExtractHint(settingVal, valueVal,
555  hintString);
556  MPI_CHECK(MPI_Info_set
557  (*mpiHints, settingVal,
558  valueVal),
559  "cannot set info object");
560  }
561  }
562  /* close the hints files */
563  if (fclose(fd) != 0)
564  ERR("cannot close hints file");
565  }
566  }
567 }
568 
569 /*
570  * Show all hints (key/value pairs) in an MPI_Info object.
571  */
572 void ShowHints(MPI_Info * mpiHints)
573 {
574  char key[MPI_MAX_INFO_VAL];
575  char value[MPI_MAX_INFO_VAL];
576  int flag, i, nkeys;
577 
578  MPI_CHECK(MPI_Info_get_nkeys(*mpiHints, &nkeys),
579  "cannot get info object keys");
580 
581  for (i = 0; i < nkeys; i++) {
582  MPI_CHECK(MPI_Info_get_nthkey(*mpiHints, i, key),
583  "cannot get info object key");
584  MPI_CHECK(MPI_Info_get(*mpiHints, key, MPI_MAX_INFO_VAL - 1,
585  value, &flag),
586  "cannot get info object value");
587  fprintf(out_logfile, "\t%s = %s\n", key, value);
588  }
589 }
590 
591 /*
592  * Takes a string of the form 64, 8m, 128k, 4g, etc. and converts to bytes.
593  */
595 {
596  IOR_offset_t size = 0;
597  char range;
598  int rc;
599 
600  rc = sscanf(size_str, "%lld%c", &size, &range);
601  if (rc == 2) {
602  switch ((int)range) {
603  case 'k':
604  case 'K':
605  size <<= 10;
606  break;
607  case 'm':
608  case 'M':
609  size <<= 20;
610  break;
611  case 'g':
612  case 'G':
613  size <<= 30;
614  break;
615  }
616  } else if (rc == 0) {
617  size = -1;
618  }
619  return (size);
620 }
621 
622 /*
623  * Displays size of file system and percent of data blocks and inodes used.
624  */
625 void ShowFileSystemSize(char * filename, const struct ior_aiori * backend, void * backend_options) // this might be converted to an AIORI call
626 {
627  ior_aiori_statfs_t stat;
628  if(! backend->statfs){
629  WARN("Backend doesn't implement statfs");
630  return;
631  }
632  int ret = backend->statfs(filename, & stat, backend_options);
633  if( ret != 0 ){
634  WARN("Backend returned error during statfs");
635  return;
636  }
637  long long int totalFileSystemSize;
638  long long int freeFileSystemSize;
639  long long int totalInodes;
640  long long int freeInodes;
641  double totalFileSystemSizeHR;
642  double usedFileSystemPercentage;
643  double usedInodePercentage;
644  char *fileSystemUnitStr;
645 
646  totalFileSystemSize = stat.f_blocks * stat.f_bsize;
647  freeFileSystemSize = stat.f_bfree * stat.f_bsize;
648  usedFileSystemPercentage = (1 - ((double)freeFileSystemSize / (double)totalFileSystemSize)) * 100;
649  totalFileSystemSizeHR = (double)totalFileSystemSize / (double)(1<<30);
650 
651  /* inodes */
652  totalInodes = stat.f_files;
653  freeInodes = stat.f_ffree;
654  usedInodePercentage = (1 - ((double)freeInodes / (double)totalInodes)) * 100;
655 
656  fileSystemUnitStr = "GiB";
657  if (totalFileSystemSizeHR > 1024) {
658  totalFileSystemSizeHR = (double)totalFileSystemSize / (double)((long long)1<<40);
659  fileSystemUnitStr = "TiB";
660  }
662  fprintf(out_resultfile, "%-20s: %s\n", "Path", filename);
663  fprintf(out_resultfile, "%-20s: %.1f %s Used FS: %2.1f%% ",
664  "FS", totalFileSystemSizeHR, fileSystemUnitStr,
665  usedFileSystemPercentage);
666  fprintf(out_resultfile, "Inodes: %.1f Mi Used Inodes: %2.1f%%\n",
667  (double)totalInodes / (double)(1<<20),
668  usedInodePercentage);
669  fflush(out_logfile);
670  }else if(outputFormat == OUTPUT_JSON){
671  fprintf(out_resultfile, " , \"Path\": \"%s\",", filename);
672  fprintf(out_resultfile, "\"Capacity\": \"%.1f %s\", \"Used Capacity\": \"%2.1f%%\",",
673  totalFileSystemSizeHR, fileSystemUnitStr,
674  usedFileSystemPercentage);
675  fprintf(out_resultfile, "\"Inodes\": \"%.1f Mi\", \"Used Inodes\" : \"%2.1f%%\"\n",
676  (double)totalInodes / (double)(1<<20),
677  usedInodePercentage);
678  }else if(outputFormat == OUTPUT_CSV){
679 
680  }
681 
682  return;
683 }
684 
/*
 * Return match of regular expression -- 0 is failure, 1 is success.
 *
 * 'pattern' is compiled as a POSIX extended regular expression and matched
 * against 'string'. A pattern that cannot be compiled counts as "no match".
 * On Windows the function never matches.
 */
int Regex(char *string, char *pattern)
{
        int retValue = 0;
#ifndef _WIN32                  /* Okay to always not match */
        regex_t regEx;

        /* only the yes/no answer is needed, so no sub-match bookkeeping */
        if (regcomp(&regEx, pattern, REG_EXTENDED | REG_NOSUB) != 0) {
                /* nothing was compiled, so there is nothing to free either */
                return 0;
        }
        if (regexec(&regEx, string, 0, NULL, 0) == 0) {
                retValue = 1;
        }
        regfree(&regEx);
#endif

        return (retValue);
}
704 
/*
 * System info for Windows.
 *
 * Fills *name with the fully qualified DNS computer name as node name,
 * "Windows" as system name and "-" for release, version and machine.
 * Always returns 0; a failing GetComputerNameEx() is reported via ERR().
 */
#ifdef _WIN32
int uname(struct utsname *name)
{
        DWORD nodeNameSize;

        /* zero everything first so each copied field stays terminated */
        memset(name, 0, sizeof(*name));

        nodeNameSize = sizeof(name->nodename) - 1;
        if (!GetComputerNameEx(ComputerNameDnsFullyQualified,
                               name->nodename, &nodeNameSize))
                ERR("GetComputerNameEx failed");

        strncpy(name->sysname, "Windows", sizeof(name->sysname) - 1);
        /* FIXME - these should be easy to fetch */
        strncpy(name->release, "-", sizeof(name->release) - 1);
        strncpy(name->version, "-", sizeof(name->version) - 1);
        strncpy(name->machine, "-", sizeof(name->machine) - 1);

        return 0;
}
#endif /* _WIN32 */
726 
727 /*
728  * Get time stamp. Use MPI_Timer() unless _NO_MPI_TIMER is defined,
729  * in which case use gettimeofday().
730  */
731 double GetTimeStamp(void)
732 {
733  double timeVal;
734  struct timeval timer;
735 
736  if (gettimeofday(&timer, (struct timezone *)NULL) != 0)
737  ERR("cannot use gettimeofday()");
738  timeVal = (double)timer.tv_sec + ((double)timer.tv_usec / 1000000);
739 
740  return (timeVal);
741 }
742 
743 /*
744  * Determine any spread (range) between node times.
745  * Obsolete
746  */
747 static double TimeDeviation(MPI_Comm com)
748 {
749  double timestamp;
750  double min = 0;
751  double max = 0;
752  double roottimestamp;
753 
754  MPI_CHECK(MPI_Barrier(com), "barrier error");
755  timestamp = GetTimeStamp();
756  MPI_CHECK(MPI_Reduce(&timestamp, &min, 1, MPI_DOUBLE,
757  MPI_MIN, 0, com),
758  "cannot reduce tasks' times");
759  MPI_CHECK(MPI_Reduce(&timestamp, &max, 1, MPI_DOUBLE,
760  MPI_MAX, 0, com),
761  "cannot reduce tasks' times");
762 
763  /* delta between individual nodes' time and root node's time */
764  roottimestamp = timestamp;
765  MPI_CHECK(MPI_Bcast(&roottimestamp, 1, MPI_DOUBLE, 0, com),
766  "cannot broadcast root's time");
767  // wall_clock_delta = timestamp - roottimestamp;
768 
769  return max - min;
770 }
771 
772 void init_clock(MPI_Comm com){
773 
774 }
775 
776 char * PrintTimestamp() {
777  static char datestring[80];
778  time_t cur_timestamp;
779 
780  if (( rank == 0 ) && ( verbose >= 1 )) {
781  fprintf( out_logfile, "V-1: Entering PrintTimestamp...\n" );
782  }
783 
784  fflush(out_logfile);
785  cur_timestamp = time(NULL);
786  strftime(datestring, 80, "%m/%d/%Y %T", localtime(&cur_timestamp));
787 
788  return datestring;
789 }
790 
791 int64_t ReadStoneWallingIterations(char * const filename, MPI_Comm com){
792  long long data;
793  if(rank != 0){
794  MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, com);
795  return data;
796  }else{
797  FILE * out = fopen(filename, "r");
798  if (out == NULL){
799  data = -1;
800  MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, com);
801  return data;
802  }
803  int ret = fscanf(out, "%lld", & data);
804  if (ret != 1){
805  return -1;
806  }
807  fclose(out);
808  MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, com);
809  return data;
810  }
811 }
812 
813 void StoreStoneWallingIterations(char * const filename, int64_t count){
814  if(rank != 0){
815  return;
816  }
817  FILE * out = fopen(filename, "w");
818  if (out == NULL){
819  FAIL("Cannot write to the stonewalling file!");
820  }
821  fprintf(out, "%lld", (long long) count);
822  fclose(out);
823 }
824 
825 /*
826  * Sleep for 'delay' seconds.
827  */
828 void DelaySecs(int delay){
829  if (rank == 0 && delay > 0) {
830  if (verbose >= VERBOSE_1)
831  fprintf(out_logfile, "delaying %d seconds . . .\n", delay);
832  sleep(delay);
833  }
834 }
835 
836 
837 /*
838  * Convert IOR_offset_t value to human readable string. This routine uses a
839  * statically-allocated buffer internally and so is not re-entrant.
840  */
841 char *HumanReadable(IOR_offset_t value, int base)
842 {
843  static char valueStr[MAX_STR];
844  IOR_offset_t m = 0, g = 0, t = 0;
845  char m_str[8], g_str[8], t_str[8];
846 
847  if (base == BASE_TWO) {
848  m = MEBIBYTE;
849  g = GIBIBYTE;
850  t = GIBIBYTE * 1024llu;
851  strcpy(m_str, "MiB");
852  strcpy(g_str, "GiB");
853  strcpy(t_str, "TiB");
854  } else if (base == BASE_TEN) {
855  m = MEGABYTE;
856  g = GIGABYTE;
857  t = GIGABYTE * 1000llu;
858  strcpy(m_str, "MB");
859  strcpy(g_str, "GB");
860  strcpy(t_str, "TB");
861  }
862 
863  if (value >= t) {
864  if (value % t) {
865  snprintf(valueStr, MAX_STR-1, "%.2f %s",
866  (double)((double)value / t), t_str);
867  } else {
868  snprintf(valueStr, MAX_STR-1, "%d %s", (int)(value / t), t_str);
869  }
870  }else if (value >= g) {
871  if (value % g) {
872  snprintf(valueStr, MAX_STR-1, "%.2f %s",
873  (double)((double)value / g), g_str);
874  } else {
875  snprintf(valueStr, MAX_STR-1, "%d %s", (int)(value / g), g_str);
876  }
877  } else if (value >= m) {
878  if (value % m) {
879  snprintf(valueStr, MAX_STR-1, "%.2f %s",
880  (double)((double)value / m), m_str);
881  } else {
882  snprintf(valueStr, MAX_STR-1, "%d %s", (int)(value / m), m_str);
883  }
884  } else if (value >= 0) {
885  snprintf(valueStr, MAX_STR-1, "%d bytes", (int)value);
886  } else {
887  snprintf(valueStr, MAX_STR-1, "-");
888  }
889  return valueStr;
890 }
891 
/*
 * Report the chip and core the caller is currently running on.
 *
 * Assume we aren't worried about thread/process migration.
 * Test on Intel systems and see if we can get rid of the architecture
 * specificity of the code.
 *
 * Three build variants exist:
 *  - HAVE_GETCPU_SYSCALL: the getcpu system call fills *core and *chip and
 *    its result is returned;
 *  - HAVE_RDTSCP_ASM: the rdtscp instruction is used; *chip is taken from
 *    bits 12..23 and *core from bits 0..11 of the value delivered in the
 *    third output register, and the two other registers are combined into
 *    the return value;
 *  - otherwise: a dummy that sets both outputs to 0 and returns 1.
 */
unsigned long GetProcessorAndCore(int *chip, int *core){
#if defined(HAVE_GETCPU_SYSCALL)
        return syscall(SYS_getcpu, core, chip, NULL);
#elif defined(HAVE_RDTSCP_ASM)
        /* We're on an intel processor and use the rdtscp instruction. */
        unsigned long lo, hi, aux;

        __asm__ volatile("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
        *chip = (aux & 0xFFF000)>>12;
        *core = aux & 0xFFF;
        return ((unsigned long)lo) | (((unsigned long)hi) << 32);
#else
        // TODO: Add in AMD function
#warning GetProcessorAndCore is implemented as a dummy
        *chip = 0;
        *core = 0;
        return 1;
#endif
}
918 
919 
920 
921 /*
922  * Allocate a page-aligned (required by O_DIRECT) buffer.
923  */
924 void *aligned_buffer_alloc(size_t size, ior_memory_flags type)
925 {
926  size_t pageMask;
927  char *buf, *tmp;
928  char *aligned;
929 
930  if(type == IOR_MEMORY_TYPE_GPU_MANAGED){
931 #ifdef HAVE_CUDA
932  // use unified memory here to allow drop-in-replacement
933  if (cudaMallocManaged((void**) & buf, size, cudaMemAttachGlobal) != cudaSuccess){
934  ERR("Cannot allocate buffer on GPU");
935  }
936  return buf;
937 #else
938  ERR("No CUDA supported, cannot allocate on the GPU");
939 #endif
940  }else if(type == IOR_MEMORY_TYPE_GPU_DEVICE_ONLY){
941 #ifdef HAVE_GPU_DIRECT
942  if (cudaMalloc((void**) & buf, size) != cudaSuccess){
943  ERR("Cannot allocate buffer on GPU");
944  }
945  return buf;
946 #else
947  ERR("No GPUDirect supported, cannot allocate on the GPU");
948 #endif
949  }
950 
951 #ifdef HAVE_SYSCONF
952  long pageSize = sysconf(_SC_PAGESIZE);
953 #else
954  size_t pageSize = getpagesize();
955 #endif
956 
957  pageMask = pageSize - 1;
958  buf = safeMalloc(size + pageSize + sizeof(void *));
959  /* find the alinged buffer */
960  tmp = buf + sizeof(char *);
961  aligned = tmp + pageSize - ((size_t) tmp & pageMask);
962  /* write a pointer to the original malloc()ed buffer into the bytes
963  preceding "aligned", so that the aligned buffer can later be free()ed */
964  tmp = aligned - sizeof(void *);
965  *(void **)tmp = buf;
966 
967  return (void *)aligned;
968 }
969 
970 /*
971  * Free a buffer allocated by aligned_buffer_alloc().
972  */
974 {
975  if(gpu){
976 #ifdef HAVE_CUDA
977  if (cudaFree(buf) != cudaSuccess){
978  WARN("Cannot free buffer on GPU");
979  }
980  return;
981 #else
982  ERR("No CUDA supported, cannot free on the GPU");
983 #endif
984  }
985  free(*(void **)((char *)buf - sizeof(char *)));
986 }
char * HumanReadable(IOR_offset_t value, int base)
Definition: utilities.c:841
MPI_Comm testComm
Definition: utilities.c:71
ior_memory_flags
Definition: ior.h:61
int GetNumTasks(MPI_Comm comm)
Definition: utilities.c:368
#define MEBIBYTE
Definition: iordef.h:78
uint64_t f_blocks
Definition: aiori.h:53
unsigned long GetProcessorAndCore(int *chip, int *core)
Definition: utilities.c:911
uint64_t f_bfree
Definition: aiori.h:54
void ShowHints(MPI_Info *mpiHints)
Definition: utilities.c:572
unsigned int incompressibleSeed
Definition: ior.h:157
#define VERBOSE_0
Definition: iordef.h:92
void * airoi_update_module_options(const ior_aiori_t *backend, options_all_t *opt)
Definition: aiori.c:93
CURLcode rc
Definition: aiori-S3-4c.c:111
char *(* get_version)(void)
Definition: aiori.h:101
int rankOffset
Definition: utilities.c:69
OutputFormat_t
Definition: iordef.h:55
int64_t ReadStoneWallingIterations(char *const filename, MPI_Comm com)
Definition: utilities.c:791
enum OutputFormat_t outputFormat
Definition: utilities.c:74
int(* statfs)(const char *, ior_aiori_statfs_t *, aiori_mod_opt_t *module_options)
Definition: aiori.h:104
size_t memoryPerNode
Definition: ior.h:160
int storeFileOffset
Definition: ior.h:145
#define min(a, b)
Definition: md-workbench.c:26
int QueryNodeMapping(MPI_Comm comm, int print_nodemap)
Definition: utilities.c:287
uint64_t f_ffree
Definition: aiori.h:57
#define FAIL(...)
Definition: aiori-debug.h:12
char * apiVersion
Definition: ior.h:100
int numTasks
int setTimeStampSignature
Definition: ior.h:154
IOR_offset_t StringToBytes(char *size_str)
Definition: utilities.c:594
#define GIBIBYTE
Definition: iordef.h:79
#define MPI_CHECK(MPI_STATUS, MSG)
Definition: aiori-debug.h:127
int verbose
Definition: utilities.c:70
void * backend_options
Definition: ior.h:166
char * PrintTimestamp()
Definition: utilities.c:776
const ior_aiori_t * aiori_select(const char *api)
Definition: aiori.c:237
uint64_t f_files
Definition: aiori.h:56
static double TimeDeviation(MPI_Comm com)
Definition: utilities.c:747
static option_help options[]
Definition: aiori-CEPHFS.c:54
uint64_t f_bsize
Definition: aiori.h:52
int verify_memory_pattern(int item, char *buffer, size_t bytes, int buff_offset, int pretendRank)
Definition: utilities.c:100
void init_clock(MPI_Comm com)
Definition: utilities.c:772
char * CurrentTimeString(void)
Definition: utilities.c:241
#define WARN(MSG)
Definition: aiori-debug.h:32
void updateParsedOptions(IOR_param_t *options, options_all_t *global_options)
Definition: utilities.c:178
int GetNumNodes(MPI_Comm comm)
Definition: utilities.c:331
int rank
Definition: utilities.c:68
Definition: ior.h:56
FILE * out_resultfile
Definition: utilities.c:73
double GetTimeStamp(void)
Definition: utilities.c:731
void generate_memory_pattern(char *buf, size_t bytes, int buff_offset, int rank)
Definition: utilities.c:86
static const ior_aiori_t * backend
Definition: ior.c:53
char ** environ
void StoreStoneWallingIterations(char *const filename, int64_t count)
Definition: utilities.c:813
enum PACKET_TYPE dataPacketType
Definition: ior.h:164
static options_all_t * global_options
Definition: parse_options.c:41
#define GIGABYTE
Definition: iordef.h:75
long long int IOR_size_t
Definition: iordef.h:110
char * buffer_type
Definition: ior.h:163
int64_t string_to_bytes(char *size_str)
Definition: option.c:30
Definition: ior.h:55
#define BASE_TWO
Definition: iordef.h:82
#define MAX_STR
Definition: iordef.h:99
void ExtractHint(char *settingVal, char *valueVal, char *hintString)
Definition: utilities.c:483
#define MAX_PATHLEN
Definition: utilities.h:31
#define O_DIRECT
void ShowFileSystemSize(char *filename, const struct ior_aiori *backend, void *backend_options)
Definition: utilities.c:625
int errno
const struct ior_aiori * backend
Definition: ior.h:96
void SetHints(MPI_Info *mpiHints, char *hintsFileName)
Definition: utilities.c:511
void set_o_direct_flag(int *flag)
Definition: utilities.c:217
#define MEGABYTE
Definition: iordef.h:74
void FailMessage(int rank, const char *location, char *format,...)
Definition: utilities.c:134
#define ERR(MSG)
Definition: aiori-debug.h:92
#define BASE_TEN
Definition: iordef.h:83
void DelaySecs(int delay)
Definition: utilities.c:828
#define VERBOSE_1
Definition: iordef.h:93
int Regex(char *string, char *pattern)
Definition: utilities.c:688
char * api
Definition: ior.h:99
size_t NodeMemoryStringToBytes(char *size_str)
Definition: utilities.c:146
void aligned_buffer_free(void *buf, ior_memory_flags gpu)
Definition: utilities.c:973
#define IOR_format
Definition: iordef.h:112
void DumpBuffer(void *buffer, size_t size)
Definition: utilities.c:266
char * memoryPerNodeStr
Definition: ior.h:161
FILE * out_logfile
Definition: utilities.c:72
long long int IOR_offset_t
Definition: iordef.h:109
int GetNumTasksOnNode0(MPI_Comm comm)
Definition: utilities.c:406
#define TRUE
Definition: iordef.h:66
void update_write_memory_pattern(uint64_t item, char *buf, size_t bytes, int buff_offset, int rank)
Definition: utilities.c:78
void * safeMalloc(uint64_t size)
Definition: utilities.c:125
#define NULL
Definition: iordef.h:70
void * aligned_buffer_alloc(size_t size, ior_memory_flags type)
Definition: utilities.c:924