#!/usr/bin/env python
# coding: utf-8
# /*##########################################################################
#
# Copyright (c) 2018-2019 European Synchrotron Radiation Facility
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ###########################################################################*/
import numpy as np

from .basefft import BaseFFT, check_version
try:
    import pyopencl as cl
    import pyopencl.array as parray
    import gpyfft
    from gpyfft.fft import FFT as cl_fft
    from ...opencl.common import ocl
    __have_clfft__ = True
except ImportError:
    __have_clfft__ = False


# Check gpyfft version
__required_gpyfft_version__ = "0.3.0"
if __have_clfft__:
    __have_clfft__ = check_version(gpyfft, __required_gpyfft_version__)


class CLFFT(BaseFFT):
    """Initialize a clfft plan.

    Please see FFT class for parameters help.

    CLFFT-specific parameters
    --------------------------

    :param pyopencl.Context ctx:
        If set to other than None, an existing pyopencl context is used.
    :param bool fast_math:
        If set to True, computations will be done with "fast math" mode,
        i.e., more speed but less accuracy.
    :param bool choose_best_device:
        Whether to automatically choose the best available OpenCL device.
    """
    def __init__(
        self,
        shape=None,
        dtype=None,
        template=None,
        shape_out=None,
        axes=None,
        normalize="rescale",
        ctx=None,
        fast_math=False,
        choose_best_device=True,
    ):
        if not(__have_clfft__) or not(__have_clfft__):
            raise ImportError("Please install pyopencl and gpyfft >= %s to use the OpenCL back-end" % __required_gpyfft_version__)

        super(CLFFT, self).__init__(
            shape=shape,
            dtype=dtype,
            template=template,
            shape_out=shape_out,
            axes=axes,
            normalize=normalize,
        )
        self.ctx = ctx
        self.choose_best_device = choose_best_device
        self.fast_math = fast_math
        self.backend = "clfft"

        self.fix_axes()
        self.init_context_queue()
        self.allocate_arrays()
        self.real_transform = np.isrealobj(self.data_in)
        self.compute_forward_plan()
        self.compute_inverse_plan()
        self.refs = {
            "data_in": self.data_in,
            "data_out": self.data_out,
        }
        # TODO
        #  Either pyopencl ElementWiseKernel, or built-in clfft callbacks
        if self.normalize != "rescale":
            raise NotImplementedError(
                "Normalization modes other than rescale are not implemented with OpenCL backend yet."
            )

    def fix_axes(self):
        """
        "Fix" axes.

        clfft does not have the same convention as FFTW/cuda/numpy.
        """
        self.axes = self.axes[::-1]

    def _allocate(self, shape, dtype):
        return parray.zeros(self.queue, shape, dtype=dtype)


    def check_array(self, array, shape, dtype, copy=True):
        if array.shape != shape:
            raise ValueError("Invalid data shape: expected %s, got %s" %
                (shape, array.shape)
            )
        if array.dtype != dtype:
            raise ValueError("Invalid data type: expected %s, got %s" %
                (dtype, array.dtype)
            )


    def set_data(self, dst, src, shape, dtype, copy=True, name=None):
        """
        dst is a device array owned by the current instance
        (either self.data_in or self.data_out).

        copy is ignored for device<-> arrays.
        """
        self.check_array(src, shape, dtype)
        if isinstance(src, np.ndarray):
            if name == "data_out":
                # Makes little sense to provide output=numpy_array
                return dst
            if not(src.flags["C_CONTIGUOUS"]):
                src = np.ascontiguousarray(src, dtype=dtype)
            # working on underlying buffer is notably faster
            #~ dst[:] = src[:]
            evt = cl.enqueue_copy(self.queue, dst.data, src)
            evt.wait()
        elif isinstance(src, parray.Array):
            # No copy, use the data as self.d_input or self.d_output
            # (this prevents the use of in-place transforms, however).
            # We have to keep their old references.
            if name is None:
                # This should not happen
                raise ValueError("Please provide either copy=True or name != None")
            assert id(self.refs[name]) == id(dst) # DEBUG
            setattr(self, name, src)
            return src
        else:
            raise ValueError(
                "Invalid array type %s, expected numpy.ndarray or pyopencl.array" %
                type(src)
            )
        return dst


    def recover_array_references(self):
        self.data_in = self.refs["data_in"]
        self.data_out = self.refs["data_out"]


    def init_context_queue(self):
        if self.ctx is None:
            if self.choose_best_device:
                self.ctx = ocl.create_context()
            else:
                self.ctx = cl.create_some_context()
        self.queue = cl.CommandQueue(self.ctx)


    def compute_forward_plan(self):
        self.plan_forward = cl_fft(
            self.ctx,
            self.queue,
            self.data_in,
            out_array=self.data_out,
            axes=self.axes,
            fast_math=self.fast_math,
            real=self.real_transform,
        )


    def compute_inverse_plan(self):
        self.plan_inverse = cl_fft(
            self.ctx,
            self.queue,
            self.data_out,
            out_array=self.data_in,
            axes=self.axes,
            fast_math=self.fast_math,
            real=self.real_transform,
        )


    def update_forward_plan_arrays(self):
        self.plan_forward.data = self.data_in
        self.plan_forward.result = self.data_out


    def update_inverse_plan_arrays(self):
        self.plan_inverse.data = self.data_out
        self.plan_inverse.result = self.data_in


    def copy_output_if_numpy(self, dst, src):
        if isinstance(dst, parray.Array):
            return
        # working on underlying buffer is notably faster
        #~ dst[:] = src[:]
        evt = cl.enqueue_copy(self.queue, dst, src.data)
        evt.wait()


    def fft(self, array, output=None, do_async=False):
        """
        Perform a (forward) Fast Fourier Transform.

        :param Union[numpy.ndarray,pyopencl.array] array:
            Input data. Must be consistent with the current context.
        :param Union[numpy.ndarray,pyopencl.array] output:
            Output data. By default, output is a numpy.ndarray.
        :param bool do_async:
            Whether to perform operation in asynchronous mode.
            Default is False, meaning that we wait for transform to complete.
        """
        data_in = self.set_input_data(array, copy=False)
        data_out = self.set_output_data(output, copy=False)
        self.update_forward_plan_arrays()
        event, = self.plan_forward.enqueue()
        if not(do_async):
            event.wait()
        if output is not None:
            self.copy_output_if_numpy(output, self.data_out)
            res = output
        else:
            res = self.data_out.get()
        self.recover_array_references()
        return res


    def ifft(self, array, output=None, do_async=False):
        """
        Perform a (inverse) Fast Fourier Transform.

        :param Union[numpy.ndarray,pyopencl.array] array:
            Input data. Must be consistent with the current context.
        :param Union[numpy.ndarray,pyopencl.array] output:
            Output data. By default, output is a numpy.ndarray.
        :param bool do_async:
            Whether to perform operation in asynchronous mode.
            Default is False, meaning that we wait for transform to complete.
        """
        data_in = self.set_output_data(array, copy=False)
        data_out = self.set_input_data(output, copy=False)
        self.update_inverse_plan_arrays()
        event, = self.plan_inverse.enqueue(forward=False)
        if not(do_async):
            event.wait()
        if output is not None:
            self.copy_output_if_numpy(output, self.data_in)
            res = output
        else:
            res = self.data_in.get()
        self.recover_array_references()
        return res


    def __del__(self):
        # It seems that gpyfft underlying clFFT destructors are not called.
        # This results in the following warning:
        #   Warning:  Program terminating, but clFFT resources not freed.
        #   Please consider explicitly calling clfftTeardown( )
        del self.plan_forward
        del self.plan_inverse