22 #define DIRMODE S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IXOTH 24 #define CHECK_MPI_RET(ret) if (ret != MPI_SUCCESS){ printf("Unexpected error in MPI on Line %d\n", __LINE__);} 25 #define LLU (long long unsigned) 26 #define min(a,b) (a < b ? a : b) 28 #define oprintf(...) do { fprintf(o.logfile, __VA_ARGS__); fflush(o.logfile); } while(0); 131 sprintf(out_name,
"%s/%d_%d",
o.
prefix, n, d);
135 sprintf(out_name,
"%s/%d_%d/file-%d",
o.
prefix, n, d, i);
143 .random_buffer_offset = -1,
149 .run_info_file =
"md-workbench.status"};
159 double end = cur + waittime;
165 w.tv_sec = (time_t) (waittime);
166 w.tv_nsec = (long) ((waittime - w.tv_sec) * 1000 * 1000 * 1000);
167 nanosleep(& w,
NULL);
182 float curtime = start - phase_start_timer;
184 results[pos].
runtime = (float) op_time;
186 if (op_time > *max_time){
189 *out_op_time = op_time;
194 printf(
"phase\t\td name\tcreate\tdelete\tob nam\tcreate\tread\tstat\tdelete\tt_inc_b\tt_no_bar\tthp\tmax_t\n");
203 for(
int i=0; i <
o.
size; i++){
212 for(
int i=0; i <
o.
size; i++){
213 sum += (mean - arr[i])*(mean - arr[i]);
215 return sqrt(sum / (
o.
size-1));
221 for(
int i=0; i <
o.
size; i++){
222 min = (arr[i] <
min) ? arr[i] : min;
223 max = (arr[i] > max) ? arr[i] : max;
246 sprintf(buff,
"%s \t%d\t%d\t%d\t%d\t%d\t%d\t%.3fs\t%.3fs\t%.2f MiB/s %.4e", name, p->
dset_create.
suc, p->
dset_delete.
suc, p->
obj_create.
suc, p->
obj_read.
suc, p->
obj_stat.
suc, p->
obj_delete.
suc, p->
t, t, tp, p->
max_op_time);
254 pos += sprintf(buff,
"%s process max:%.2fs ", name, t);
256 pos += sprintf(buff + pos,
"min:%.2fs mean: %.2fs balance:%.1f stddev:%.1f ", r_min, r_mean, r_min/r_max * 100.0, r_std);
258 int ioops_per_iter = 4;
268 pos += sprintf(buff + pos,
"rate:%.1f iops/s objects:%d rate:%.1f obj/s tp:%.1f MiB/s op-max:%.4es",
281 pos += sprintf(buff + pos,
"rate:%.1f iops/s dsets: %d objects:%d rate:%.3f dset/s rate:%.1f obj/s tp:%.1f MiB/s op-max:%.4es",
292 pos += sprintf(buff + pos,
"rate:%.1f iops/s objects:%d dsets: %d rate:%.1f obj/s rate:%.3f dset/s op-max:%.4es",
301 pos = sprintf(buff,
"%s: unknown phase", name);
316 pos += sprintf(buff + pos,
" (%d errs", errs);
318 pos += sprintf(buff + pos,
"!!!)" );
320 pos += sprintf(buff + pos,
")" );
329 pos += sprintf(buff + pos,
" read(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.
min, stat.
q1, stat.
median, stat.
q3, stat.
q90, stat.
q99, stat.
max);
333 pos += sprintf(buff + pos,
" stat(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.
min, stat.
q1, stat.
median, stat.
q3, stat.
q90, stat.
q99, stat.
max);
337 pos += sprintf(buff + pos,
" create(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.
min, stat.
q1, stat.
median, stat.
q3, stat.
q90, stat.
q99, stat.
max);
341 pos += sprintf(buff + pos,
" delete(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.
min, stat.
q1, stat.
median, stat.
q3, stat.
q90, stat.
q99, stat.
max);
351 int pos = round(quantile * (repeats - 1) + 0.49);
352 assert(pos < repeats);
362 memcpy(global_times, times, repeats * 2 *
sizeof(
float));
364 for(
int i=1; i <
o.
size; i++){
366 ret = MPI_Recv(& global_times[count], max_repeats*2, MPI_FLOAT, i, 888,
o.
com, & status);
368 MPI_Get_count(& status, MPI_FLOAT, & cnt);
372 ret = MPI_Send(times, repeats * 2, MPI_FLOAT, 0, 888,
o.
com);
383 FILE * f = fopen(file,
"w+");
385 ERRF(
"%d: Error writing to latency file: %s",
o.
rank, file);
388 fprintf(f,
"time,runtime\n");
389 for(
size_t i = 0; i < repeats; i++){
390 fprintf(f,
"%.7f,%.4e\n", times[i].time_since_app_start, times[i].runtime);
398 if(repeats % 2 == 0){
417 if(strcmp(name,
"benchmark") == 0){
425 ret = MPI_Reduce(& p->
t, & g_stat.
t, 2, MPI_DOUBLE, MPI_MAX, 0,
o.
com);
428 g_stat.
t_all = (
double*) malloc(
sizeof(
double) *
o.
size);
430 ret = MPI_Gather(& p->
t, 1, MPI_DOUBLE, g_stat.
t_all, 1, MPI_DOUBLE, 0,
o.
com);
443 if(strcmp(name,
"precreate") == 0){
449 }
else if(strcmp(name,
"cleanup") == 0){
455 }
else if(strcmp(name,
"benchmark") == 0){
493 for(
int i=1; i <
o.
size; i++){
494 MPI_Recv(buff,
MAX_PATHLEN, MPI_CHAR, i, 4711,
o.
com, MPI_STATUS_IGNORE);
550 ERRF(
"%d: Error while creating the dset: %s",
o.
rank, dset);
562 for(
int f=current_index; f <
o.
precreate; f++){
569 if (
NULL == aiori_fh){
570 FAIL(
"Unable to open file %s", obj_name);
578 ERRF(
"%d: Error while creating the obj: %s",
o.
rank, obj_name);
586 oprintf(
"%d: write %s:%s (%d) pretend: %d\n",
o.
rank, dset, obj_name, ret,
o.
rank);
601 int start_index = *current_index_p;
602 int total_num =
o.
num;
605 double phase_allreduce_time = 0;
608 for(f=0; f < total_num; f++){
609 float bench_runtime = 0;
612 struct stat stat_buf;
613 const int prevFile = f + start_index;
617 readRank = readRank < 0 ? readRank +
o.
size : readRank;
636 ERRF(
"%d: Error while stating the obj: %s",
o.
rank, obj_name);
643 oprintf(
"%d: read %s pretend: %d\n",
o.
rank, obj_name, readRank);
648 if (
NULL == aiori_fh){
649 FAIL(
"Unable to open file %s", obj_name);
663 EWARNF(
"%d: Error while reading the obj: %s",
o.
rank, obj_name);
688 const int newFileIndex =
o.
precreate + prevFile;
693 if (
NULL != aiori_fh){
702 ERRF(
"%d: Error while creating the obj: %s\n",
o.
rank, obj_name);
708 ERRF(
"%d: Error while creating the obj: %s",
o.
rank, obj_name);
710 EWARNF(
"Unable to open file %s", obj_name);
719 oprintf(
"%d: write %s (%d) pretend: %d\n",
o.
rank, obj_name, ret, writeRank);
731 armed_stone_wall = 0;
735 int ret = MPI_Allreduce(& cur_pos, & total_num, 1, MPI_INT, MPI_MAX,
o.
com);
740 oprintf(
"stonewall wear out %fs (%d iter)\n", bench_runtime, total_num);
750 int ret = MPI_Allreduce(& f, & total_num, 1, MPI_INT, MPI_MAX,
o.
com);
762 *current_index_p += f;
794 oprintf(
"Unable to remove directory %s\n", dset);
804 {
'O',
"offset",
"Offset in o.ranks between writers and readers. Writers and readers should be located on different nodes.",
OPTION_OPTIONAL_ARGUMENT,
'd', &
o.
offset},
823 {
'w',
"stonewall-timer",
"Stop each benchmark iteration after the specified seconds (if not used with -W this leads to process-specific progress!)",
OPTION_OPTIONAL_ARGUMENT,
'd', &
o.
stonewall_timer},
824 {
'W',
"stonewall-wear-out",
"Stop with stonewall after specified time and use a soft wear-out phase -- all processes perform the same number of iterations",
OPTION_FLAG,
'd', &
o.
stonewall_timer_wear_out},
829 {0,
"read-only",
"Run read-only during benchmarking phase (no deletes/writes), probably use with -2",
OPTION_FLAG,
'd', &
o.
read_only},
839 time_t now = time(0);
840 strftime (buff, 100,
"%Y-%m-%d %H:%M:%S", localtime (&now));
852 ret = fscanf(f,
"pos: %d", & position);
859 ret = MPI_Bcast( & position, 1, MPI_INT, 0,
o.
com );
872 fprintf(f,
"pos: %d\n", position);
879 char * limit_memory_P =
NULL;
891 for(
int i=1; i < argc; i++){
902 ERR(
"Unrecognized I/O API");
905 ERR(
"Backend doesn't support MDWorbench");
915 ERR(
"Invalid options, if running only the benchmark phase using -2 with stonewall option then use stonewall wear-out");
933 int current_index = 0;
949 oprintf(
"WARNING: num > precreate, this may cause the situation that no objects are available to read\n");
966 double t_bench_start;
1008 for(
int r=0; r <= 6; r++){
1042 oprintf(
"Total runtime: %.0fs time: ", t_all);
mdworkbench_results_t * md_workbench_run(int argc, char **argv, MPI_Comm world_com, FILE *out_logfile)
void run_benchmark(phase_stat_t *s, int *current_index_p)
static void def_dset_name(char *out_name, int n, int d)
time_statistics_t stats_read
time_statistics_t stats_create
static int sum_err(phase_stat_t *p)
static float add_timed_result(double start, double phase_start_timer, time_result_t *results, size_t pos, double *max_time, double *out_op_time)
void * airoi_update_module_options(const ior_aiori_t *backend, options_all_t *opt)
int option_parse(int argc, char **argv, options_all_t *opt_all)
float relative_waiting_factor
struct benchmark_options o
time_statistics_t stats_delete
static void statistics_minmax(int count, double *arr, double *out_min, double *out_max)
time_result_t * time_create
void run_precreate(phase_stat_t *s, int current_index)
void(* delete)(char *, aiori_mod_opt_t *module_options)
int(* mkdir)(const char *path, mode_t mode, aiori_mod_opt_t *module_options)
static uint64_t aggregate_timers(int repeats, int max_repeats, time_result_t *times, time_result_t *global_times)
time_statistics_t stats_stat
static void compute_histogram(const char *name, time_result_t *times, time_statistics_t *stats, size_t repeats, int writeLatencyFile)
int stonewall_timer_wear_out
static option_help options[]
static int compare_floats(time_result_t *x, time_result_t *y)
int(* rmdir)(const char *path, aiori_mod_opt_t *module_options)
static double statistics_mean(int count, double *arr)
const ior_aiori_t * aiori_select(const char *api)
time_result_t * time_stat
int(* check_params)(aiori_mod_opt_t *)
static double statistics_std_dev(int count, double *arr)
void run_cleanup(phase_stat_t *s, int start_index)
int verify_memory_pattern(int item, char *buffer, size_t bytes, int buff_offset, int pretendRank)
void init_clock(MPI_Comm com)
void(* initialize)(aiori_mod_opt_t *options)
float time_since_app_start
time_statistics_t stats_create
void(* xfer_hints)(aiori_xfer_hint_t *params)
void(* close)(aiori_fd_t *, aiori_mod_opt_t *module_options)
time_result_t * time_read
int(* stat)(const char *path, struct stat *buf, aiori_mod_opt_t *module_options)
mdworkbench_results_t * results
options_all_t * airoi_create_all_module_options(option_help *global_options)
double GetTimeStamp(void)
#define EWARNF(FORMAT,...)
static int return_position()
int adaptive_waiting_mode
static void store_position(int position)
#define CHECK_MPI_RET(ret)
ior_aiori_t const * backend
void generate_memory_pattern(char *buf, size_t bytes, int buff_offset, int rank)
aiori_fd_t *(* create)(char *, int iorflags, aiori_mod_opt_t *)
IOR_offset_t(* xfer)(int access, aiori_fd_t *, IOR_size_t *, IOR_offset_t size, IOR_offset_t offset, aiori_mod_opt_t *module_options)
static void def_obj_name(char *out_name, int n, int d, int i)
static void end_phase(const char *name, phase_stat_t *p)
static options_all_t * global_options
time_statistics_t stats_delete
time_result_t * time_delete
mdworkbench_result_t result[]
void(* finalize)(aiori_mod_opt_t *options)
static double runtime_quantile(int repeats, time_result_t *times, float quantile)
static void print_detailed_stat_header()
static void mdw_wait(double runtime)
time_statistics_t stats_stat
aiori_fd_t *(* open)(char *, int iorflags, aiori_mod_opt_t *)
int ignore_precreate_errors
static void print_p_stat(char *buff, const char *name, phase_stat_t *p, double t, int print_global)
void aligned_buffer_free(void *buf, ior_memory_flags gpu)
uint64_t start_item_number
static void init_stats(phase_stat_t *p, size_t repeats)
void update_write_memory_pattern(uint64_t item, char *buf, size_t bytes, int buff_offset, int rank)
time_statistics_t stats_read
char * latency_file_prefix
void * aligned_buffer_alloc(size_t size, ior_memory_flags type)