/****************************************************************************
*                                                                           *
* Copyright (C) 2024 Intel Corporation                                      *
*                                                                           *
*****************************************************************************

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ***************************************************************************

For more documentation than found here, see

[1] doc/ReadMe_IMB.txt 

[2] Intel(R) MPI Benchmarks
    Users Guide and Methodology Description
    In 
    doc/IMB_Users_Guide.pdf
    
 File: IMB_rma_get.c

 Implemented functions: 

 IMB_rma_single_get;
 IMB_rma_get_all;
 IMB_rma_get_local;
 IMB_rma_get_all_local;
 IMB_rma_exchange_get;

 ***************************************************************************/

#include "IMB_declare.h"
#include "IMB_benchmark.h"
#include "IMB_prototypes.h"


/* Implements the unidirectional and bidirectional get benchmarks.
 * Communication is done between two processes only: c_info->pair0 always
 * acts as origin and reads from pair1; pair1 acts as origin (reading from
 * pair0) only when run_mode->BIDIR is set. Any other rank with a
 * non-negative rank number just takes part in the barriers.
 *
 * c_info     - communicator, RMA window, receive buffer, data types, pair ranks
 * size       - message size in bytes
 * iterations - number of samples (n_sample) and the per-sample buffer
 *              offsets (r_offs/s_offs, cycled every r_cache_iter/s_cache_iter
 *              samples)
 * run_mode   - AGGREGATE: one MPI_Win_flush after all gets have been issued;
 *              otherwise one MPI_Win_flush after every single get
 * time       - out: elapsed time divided by n_sample on origin ranks,
 *              -1. on every other rank
 * */
void IMB_rma_single_get(struct comm_info* c_info, int size,
                        struct iter_schedule* iterations,
                        MODES run_mode, double* time) {
    double res_time = -1.;
    int target = -1;
    int receiver = 0;
    Type_Size r_size;
    int r_num = 0;
    int i;
    char *recv = (char *)c_info->r_buffer;
#ifdef CHECK 
    int asize = (int) sizeof(assign_type);
    defect = 0;
#endif

    if (c_info->rank == c_info->pair0) {
        target = c_info->pair1;
        receiver = 1;
    } else if (c_info->rank == c_info->pair1) {
        target = c_info->pair0;
        if (run_mode->BIDIR) {
            /* pair1 acts as origin
             * in bidirectional mode only */
            receiver = 1;
        }
    } else if (c_info->rank < 0) {
        /* Not part of the communicator: nothing to do, not even barriers */
        *time = res_time;
        return;
    }

    /* Get size: recv and send sizes are equal, so just use one set of vars.
     * The count is derived from the receive type because r_num describes the
     * local receive buffer, exactly as in the other get benchmarks. */
    MPI_Type_size(c_info->r_data_type, &r_size);
    r_num = size / r_size;

    for (i = 0; i < N_BARR; i++)
        MPI_Barrier(c_info->communicator);

    /* in case of MPI_Get sender is target */
    if (receiver) {
        MPI_Win_lock(MPI_LOCK_SHARED, target, 0, c_info->WIN);
        if (run_mode->AGGREGATE) {
            /* Issue all gets first, complete them with a single flush */
            res_time = MPI_Wtime();
            for (i = 0; i < iterations->n_sample; i++) {
                MPI_ERRHAND(MPI_Get((void*)(recv + i%iterations->r_cache_iter*iterations->r_offs),
                                    r_num, c_info->r_data_type, target,
                                    i%iterations->s_cache_iter*iterations->s_offs,
                                    r_num, c_info->s_data_type, c_info->WIN));
            }
            MPI_ERRHAND(MPI_Win_flush(target, c_info->WIN));
            res_time = (MPI_Wtime() - res_time) / iterations->n_sample;
        } else {
            /* Complete every get individually before issuing the next one */
            res_time = MPI_Wtime();
            for (i = 0; i < iterations->n_sample; i++) {
                MPI_ERRHAND(MPI_Get((void*)(recv + i%iterations->r_cache_iter*iterations->r_offs),
                                    r_num, c_info->r_data_type, target,
                                    i%iterations->s_cache_iter*iterations->s_offs,
                                    r_num, c_info->s_data_type, c_info->WIN));

                MPI_ERRHAND(MPI_Win_flush(target, c_info->WIN));
            }
            res_time = (MPI_Wtime() - res_time) / iterations->n_sample;
        }
        MPI_Win_unlock(target, c_info->WIN);
    }

    /* Synchronize target and origin processes */
    MPI_Barrier(c_info->communicator);

#ifdef CHECK
    /* Only ranks that actually issued MPI_Get have received data to verify.
     * In bidirectional mode pair1 already has receiver set, so no extra
     * BIDIR test is needed; adding one would also check ranks outside the
     * pair, which have no valid target and an untouched receive buffer. */
    if (receiver) {
        for (i = 0; i < ITER_MIN(iterations); i++) {
            CHK_DIFF("MPI_Get", c_info, (void*)(recv + i%iterations->r_cache_iter*iterations->r_offs),
                     0, size, size, asize, get, 0, iterations->n_sample, i, target, &defect);
        }
    }
#endif     
    *time = res_time;
    return;
}


/* Implements the "One_get_all" and "All_get_all" benchmarks.
 *
 * With run_mode->type == Collective every rank is an origin ("All_get_all");
 * otherwise only rank 0 is ("One_get_all"). An origin issues, per sample,
 * one MPI_Get to every other rank of the communicator and completes all of
 * them with a single MPI_Win_flush_all after the last sample.
 *
 * c_info     - communicator, RMA window, receive buffer and data types
 * size       - message size in bytes
 * iterations - number of samples and per-sample buffer offsets
 * run_mode   - selects between the two benchmark flavours (see above)
 * time       - out: elapsed time divided by n_sample on origins,
 *              -1. on all other ranks
 * */
void IMB_rma_get_all(struct comm_info* c_info, int size,
                     struct iter_schedule* iterations,
                     MODES run_mode, double* time) {
    double elapsed = -1.;
    Type_Size elem_size;
    int count = 0;
    int is_origin;
    int iter;
    int shift;
    char *rbuf = (char *)c_info->r_buffer;

    if (c_info->rank < 0) {
        *time = elapsed;
        return;
    }

    is_origin = (c_info->rank == 0 || run_mode->type == Collective);

    MPI_Type_size(c_info->r_data_type, &elem_size);
    count = size / elem_size;

    for (iter = 0; iter < N_BARR; iter++)
        MPI_Barrier(c_info->communicator);

    if (is_origin) {
        MPI_Win_lock_all(0, c_info->WIN);

        elapsed = MPI_Wtime();
        for (iter = 0; iter < iterations->n_sample; iter++) {
            for (shift = 0; shift < c_info->num_procs; shift++) {
                /* Start the sweep at a rank-dependent position so that the
                 * origins do not all hit the same target at once */
                int src_rank = (shift + c_info->rank) % c_info->num_procs;

                /* never read from the own window */
                if (src_rank != c_info->rank) {
                    char *dst = rbuf + iter%iterations->r_cache_iter*iterations->r_offs;

                    MPI_ERRHAND(MPI_Get((void*)dst,
                                        count, c_info->r_data_type, src_rank,
                                        iter%iterations->s_cache_iter*iterations->s_offs,
                                        count, c_info->s_data_type, c_info->WIN));
                }
            }
        }
        MPI_ERRHAND(MPI_Win_flush_all(c_info->WIN));
        elapsed = (MPI_Wtime() - elapsed) / iterations->n_sample;

        MPI_Win_unlock_all(c_info->WIN);
    }

    /* Origins and targets meet again before the result is reported */
    MPI_Barrier(c_info->communicator);

    *time = elapsed;
}


/* Implements "Get_local" benchmark. The process c_info->pair0 gets data
 * from c_info->pair1 and makes sure of local completion by
 * MPI_Win_flush_local calls. All other ranks of the communicator only take
 * part in the barriers.
 *
 * c_info     - communicator, RMA window, receive buffer, data types, pair ranks
 * size       - message size in bytes
 * iterations - number of samples (n_sample) and per-sample buffer offsets
 * run_mode   - AGGREGATE: one MPI_Win_flush_local after all gets have been
 *              issued; otherwise one MPI_Win_flush_local after every get
 * time       - out: elapsed time divided by n_sample on pair0,
 *              -1. on every other rank
 * */
void IMB_rma_get_local(struct comm_info* c_info, int size,
                       struct iter_schedule* iterations,
                       MODES run_mode, double* time) {
    double res_time = -1.;
    Type_Size r_size;
    int r_num = 0;
    int i;
    char *recv = (char *)c_info->r_buffer;
#ifdef CHECK
    int asize = (int) sizeof(assign_type);
    /* Start from a clean defect value, as the other checked get benchmarks
     * in this file do, so a previous run cannot leak into this result */
    defect = 0;
#endif

    if (c_info->rank < 0) {
        *time = res_time;
        return;
    }

    MPI_Type_size(c_info->r_data_type, &r_size);
    r_num = size / r_size;

    for (i = 0; i < N_BARR; i++)
        MPI_Barrier(c_info->communicator);

    if (c_info->rank == c_info->pair0) {
        MPI_Win_lock(MPI_LOCK_SHARED, c_info->pair1, 0, c_info->WIN);
        if (run_mode->AGGREGATE) {
            /* Issue all gets first, complete them locally with one flush */
            res_time = MPI_Wtime();
            for (i = 0; i < iterations->n_sample; i++) {
                MPI_ERRHAND(MPI_Get((void*)(recv + i%iterations->r_cache_iter*iterations->r_offs),
                                    r_num, c_info->r_data_type, c_info->pair1,
                                    i%iterations->s_cache_iter*iterations->s_offs,
                                    r_num, c_info->s_data_type, c_info->WIN));
            }
            MPI_ERRHAND(MPI_Win_flush_local(c_info->pair1, c_info->WIN));
            res_time = (MPI_Wtime() - res_time) / iterations->n_sample;
        } else {
            /* Complete every get locally before issuing the next one */
            res_time = MPI_Wtime();
            for (i = 0; i < iterations->n_sample; i++) {
                MPI_ERRHAND(MPI_Get((void*)(recv + i%iterations->r_cache_iter*iterations->r_offs),
                                    r_num, c_info->r_data_type, c_info->pair1,
                                    i%iterations->s_cache_iter*iterations->s_offs,
                                    r_num, c_info->s_data_type, c_info->WIN));
                MPI_ERRHAND(MPI_Win_flush_local(c_info->pair1, c_info->WIN));
            }
            res_time = (MPI_Wtime() - res_time) / iterations->n_sample;
        }
        MPI_Win_unlock(c_info->pair1, c_info->WIN);
    }
    /* Synchronize target and origin processes */
    MPI_Barrier(c_info->communicator);
#ifdef CHECK
    if (c_info->rank == c_info->pair0) {
        /* Local completion of MPI_Get guarantees that recv buffer already contains target data,
         * so let's check the result */
        for (i = 0; i < ITER_MIN(iterations); i++) {
            CHK_DIFF("MPI_Get", c_info, (void*)(recv + i%iterations->r_cache_iter*iterations->r_offs),
                     0, size, size, asize, get, 0, iterations->n_sample, i, c_info->pair1, &defect);
        }
    }
#endif     

    *time = res_time;
    return;
}

/* Implements the "get_all_local" benchmark.
 *
 * Rank 0 issues, per sample, one MPI_Get to every other rank of the
 * communicator and completes all of them locally with a single
 * MPI_Win_flush_local_all after the last sample. The remaining ranks only
 * take part in the barriers.
 *
 * c_info     - communicator, RMA window, receive buffer and data types
 * size       - message size in bytes
 * iterations - number of samples and per-sample buffer offsets
 * run_mode   - not evaluated by this benchmark
 * time       - out: elapsed time divided by n_sample on rank 0,
 *              -1. on all other ranks
 * */
void IMB_rma_get_all_local(struct comm_info* c_info, int size,
                           struct iter_schedule* iterations,
                           MODES run_mode, double* time) {
    double elapsed = -1.;
    Type_Size elem_size;
    int count = 0;
    int iter;
    int shift;
    char *rbuf = (char *)c_info->r_buffer;

    if (c_info->rank < 0) {
        *time = elapsed;
        return;
    }

    MPI_Type_size(c_info->r_data_type, &elem_size);
    count = size / elem_size;

    for (iter = 0; iter < N_BARR; iter++)
        MPI_Barrier(c_info->communicator);

    if (c_info->rank == 0) {
        MPI_Win_lock_all(0, c_info->WIN);

        elapsed = MPI_Wtime();
        for (iter = 0; iter < iterations->n_sample; iter++) {
            for (shift = 0; shift < c_info->num_procs; shift++) {
                /* Target sweep starts at a rank-dependent position */
                int src_rank = (shift + c_info->rank) % c_info->num_procs;

                /* never read from the own window */
                if (src_rank != c_info->rank) {
                    char *dst = rbuf + iter%iterations->r_cache_iter*iterations->r_offs;

                    MPI_ERRHAND(MPI_Get((void*)dst,
                                        count, c_info->r_data_type, src_rank,
                                        iter%iterations->s_cache_iter*iterations->s_offs,
                                        count, c_info->s_data_type, c_info->WIN));
                }
            }
        }
        MPI_ERRHAND(MPI_Win_flush_local_all(c_info->WIN));
        elapsed = (MPI_Wtime() - elapsed) / iterations->n_sample;

        MPI_Win_unlock_all(c_info->WIN);
    }
    /* Origin and targets meet again before the result is reported */
    MPI_Barrier(c_info->communicator);

    *time = elapsed;
}

/* Implements the "Exchange_get" benchmark.
 *
 * Every rank reads from its two ring neighbours (rank - 1 and rank + 1,
 * wrapping around at the ends of the communicator). Per sample, one MPI_Get
 * goes to the right neighbour and one to the left neighbour, and both are
 * completed with MPI_Win_flush before the next sample starts. The data from
 * the left neighbour is stored "size" bytes behind the data from the right
 * neighbour.
 *
 * c_info     - communicator, RMA window, receive buffer and data types
 * size       - message size in bytes (per neighbour)
 * iterations - number of samples and per-sample buffer offsets
 * run_mode   - not evaluated by this benchmark
 * time       - out: elapsed time divided by n_sample; -1. if rank < 0
 * */
void IMB_rma_exchange_get(struct comm_info* c_info, int size,
                          struct iter_schedule* iterations,
                          MODES run_mode, double* time) {
    double elapsed = -1.;
    int prev = -1;
    int next = -1;
    int distinct_peers;
    Type_Size elem_size;
    int count = 0;
    int iter;
    char *rbuf = (char *)c_info->r_buffer;
#ifdef CHECK
    int asize = (int) sizeof(assign_type);
    defect = 0;
#endif

    if (c_info->rank < 0) {
        *time = elapsed;
        return;
    }

    MPI_Type_size(c_info->r_data_type, &elem_size);
    count = size / elem_size;

    /* Ring neighbours */
    if (c_info->rank == 0)
        prev = c_info->num_procs - 1;
    else
        prev = c_info->rank - 1;
    next = (c_info->rank + 1) % c_info->num_procs;

    /* With just two processes in the chain both neighbours are the same
     * peer; it must then be locked, flushed and unlocked only once */
    distinct_peers = (prev != next);

    for (iter = 0; iter < N_BARR; iter++)
        MPI_Barrier(c_info->communicator);

    MPI_Win_lock(MPI_LOCK_SHARED, prev, 0, c_info->WIN);
    if (distinct_peers)
        MPI_Win_lock(MPI_LOCK_SHARED, next, 0, c_info->WIN);

    elapsed = MPI_Wtime();
    for (iter = 0; iter < iterations->n_sample; iter++) {
        char *from_next = rbuf + iter%iterations->r_cache_iter*iterations->r_offs;
        char *from_prev = rbuf + size + iter%iterations->r_cache_iter*iterations->r_offs;

        MPI_ERRHAND(MPI_Get((void*)from_next,
                            count, c_info->r_data_type, next,
                            iter%iterations->s_cache_iter*iterations->s_offs,
                            count, c_info->s_data_type, c_info->WIN));

        MPI_ERRHAND(MPI_Get((void*)from_prev,
                            count, c_info->r_data_type, prev,
                            size + iter%iterations->s_cache_iter*iterations->s_offs,
                            count, c_info->s_data_type, c_info->WIN));

        MPI_ERRHAND(MPI_Win_flush(prev, c_info->WIN));
        if (distinct_peers) {
            MPI_ERRHAND(MPI_Win_flush(next, c_info->WIN));
        }
    }
    elapsed = (MPI_Wtime() - elapsed) / iterations->n_sample;

    MPI_Win_unlock(prev, c_info->WIN);
    if (distinct_peers)
        MPI_Win_unlock(next, c_info->WIN);

    /* Origins and targets meet again before results are checked/reported */
    MPI_Barrier(c_info->communicator);

#ifdef CHECK
    /* Verify both halves of the receive buffer: first the data read from
     * the left neighbour, then the data read from the right neighbour */
    for (iter = 0; iter < ITER_MIN(iterations); iter++) {
        CHK_DIFF("MPI_Get", c_info, (void*)(rbuf + size + iter%iterations->r_cache_iter*iterations->r_offs),
                 size, size, size, asize, get, 0, iterations->n_sample, iter, prev, &defect);

        CHK_DIFF("MPI_Get", c_info, (void*)(rbuf + iter%iterations->r_cache_iter*iterations->r_offs),
                 0, size, size, asize, get, 0, iterations->n_sample, iter, next, &defect);
    }
#endif

    *time = elapsed;
}
