/*******************************************************************************
* Copyright 2014-2020 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
 @file ComputeResidual.cpp

 HPCG routine
 */
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif

#include "Vector.hpp"

#ifdef HPCG_DETAILED_DEBUG
#include <fstream>
#include "hpcg.hpp"
#endif

#include <cmath>  // needed for fabs
#include "ComputeResidual.hpp"
#ifdef HPCG_DETAILED_DEBUG
#include <iostream>
#endif

/*!
  Routine to compute the inf-norm difference between two vectors where:

  @param[in]  n          number of vector elements (local to this processor)
  @param[in]  v1, v2     input vectors; their values arrays are dereferenced inside a
                         SYCL kernel submitted to main_queue
  @param[out] residual   on exit, contains the result: the inf-norm difference
                         max_i |v1[i] - v2[i]|, maximised over all MPI ranks unless
                         HPCG_NO_MPI is defined
  @param[out] ierr       set to zero on success and to a non-zero value if the USM
                         scratch buffers could not be allocated
  @param[in]  main_queue SYCL queue used for the scratch allocations, the reduction
                         kernel and the device-to-host copy
  @param[in]  deps       events that must complete before the vectors are read

  @return A default-constructed sycl::event. The routine blocks until the result has
          been copied back to the host, so there is no outstanding work to wait on.
*/
sycl::event ComputeResidual(const local_int_t n, const Vector & v1, const Vector & v2,
                            double & residual, int & ierr, sycl::queue & main_queue,
                            const std::vector<sycl::event> & deps) {

  // Always give the caller a defined status; the error paths below overwrite it.
  ierr = 0;

  const double * v1v = v1.values;
  const double * v2v = v2.values;

  // Every term is an absolute value, so 0.0 is the natural result for an empty vector.
  double local_residual = 0.0;
  bool deps_completed = false;

  if (n > 0) {
    // One-element scratch buffers: the reduction writes to device memory and the
    // result is then staged through host USM.
    double * device_max = sycl::malloc_device<double>(1, main_queue);
    double * host_max = sycl::malloc_host<double>(1, main_queue);

    if (device_max != nullptr && host_max != nullptr) {
      try {
        auto event = main_queue.submit([&](sycl::handler &cgh) {
          cgh.depends_on(deps);
          // initialize_to_identity: the scratch value is uninitialised, so the
          // reduction must not combine with whatever it currently holds.
          auto reductionMax = sycl::reduction(device_max, sycl::maximum<>(),
                  sycl::property::reduction::initialize_to_identity());
          auto kernel = [=](sycl::id<1> id, auto &maxResidual) {
            const auto i = id[0];  // keep the native index type; no narrowing to int
            maxResidual.combine(std::fabs(v1v[i] - v2v[i]));
          };
          cgh.parallel_for<class reduceResidual>(sycl::range<1>(n), reductionMax, kernel);
        });

        // Blocking copy: on return both the kernel and its dependencies have finished.
        main_queue.memcpy(host_max, device_max, sizeof(double), {event}).wait();
        local_residual = host_max[0];
        deps_completed = true;
      } catch (...) {
        // Do not leak the scratch buffers when a SYCL call throws.
        sycl::free(device_max, main_queue);
        sycl::free(host_max, main_queue);
        throw;
      }
    } else {
      ierr = 1;
    }

    if (device_max != nullptr) {
      sycl::free(device_max, main_queue);
    }
    if (host_max != nullptr) {
      sycl::free(host_max, main_queue);
    }
  }

  // When no kernel was run (empty vector or failed allocation) still honour the
  // contract that all dependencies have completed by the time we return.
  if (!deps_completed) {
    sycl::event::wait(deps);
  }

#ifndef HPCG_NO_MPI
  // Use MPI's reduce function to take the maximum of the per-rank residuals.
  // This is a collective call, so it is entered even after a local allocation
  // failure (reported through ierr) to avoid deadlocking the other ranks.
  double global_residual = 0;
  MPI_Allreduce(&local_residual, &global_residual, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  residual = global_residual;
#else
  residual = local_residual;
#endif

  return sycl::event();
}
