#!/usr/bin/env python
# coding: utf-8
# /*##########################################################################
#
# Copyright (c) 2018-2019 European Synchrotron Radiation Facility
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ###########################################################################*/
"""CUDA FFT back-end built on pycuda and scikit-cuda (cufft)."""

import numpy as np
from .basefft import BaseFFT

try:
    import pycuda.gpuarray as gpuarray
    from skcuda.fft import Plan
    from skcuda.fft import fft as cu_fft
    from skcuda.fft import ifft as cu_ifft
    __have_cufft__ = True
except ImportError:
    __have_cufft__ = False


class CUFFT(BaseFFT):
    """Initialize a cufft plan

    Please see FFT class for parameters help.

    CUFFT-specific parameters
    --------------------------

    :param pycuda.driver.Stream stream:
        Stream with which to associate the plan. If no stream is specified,
        the default stream is used.
    """
    def __init__(
        self,
        shape=None,
        dtype=None,
        template=None,
        shape_out=None,
        axes=None,
        normalize="rescale",
        stream=None,
    ):
        # Fix: the original guard read
        # "not(__have_cufft__) or not(__have_cufft__)" — a duplicated
        # operand; a single test is the intended check.
        if not __have_cufft__:
            raise ImportError("Please install pycuda and scikit-cuda to use the CUDA back-end")
        super(CUFFT, self).__init__(
            shape=shape,
            dtype=dtype,
            template=template,
            shape_out=shape_out,
            axes=axes,
            normalize=normalize,
        )
        self.cufft_stream = stream
        self.backend = "cufft"

        self.configure_batched_transform()
        self.allocate_arrays()
        self.real_transform = np.isrealobj(self.data_in)
        self.compute_forward_plan()
        self.compute_inverse_plan()
        # Keep the original device buffers so that set_data() can temporarily
        # swap in user-provided GPU arrays and we can restore them afterwards
        # (see recover_array_references()).
        self.refs = {
            "data_in": self.data_in,
            "data_out": self.data_out,
        }
        self.configure_normalization()

    def _allocate(self, shape, dtype):
        """Return a zero-initialized device array of the given shape/dtype."""
        return gpuarray.zeros(shape, dtype)

    # TODO support batched transform where batch is other than dimension 0
    def configure_batched_transform(self):
        """Determine the cufft transform shape and batch size.

        cufft batched transforms are configured here only for transforms
        along the fastest dimension(s); other layouts would need the cufft
        "advanced memory layout" API, which is not implemented.
        """
        self.cufft_batch_size = 1
        self.cufft_shape = self.shape
        if (self.axes is not None) and (len(self.axes) < len(self.shape)):
            # In the easiest case, the transform is computed along the fastest dimensions:
            #  - 1D transforms of lines of 2D data
            #  - 2D transforms of images of 3D data (stacked along slow dim)
            #  - 1D transforms of 3D data along fastest dim
            # Otherwise, we have to configure cuda "advanced memory layout",
            # which is not implemented yet.
            data_ndims = len(self.shape)
            supported_axes = {
                2: [(1,)],
                3: [(1, 2), (2, 1), (1,), (2,)],
            }
            # Fix: use .get() so that an unsupported number of dimensions
            # raises NotImplementedError below instead of a bare KeyError.
            if self.axes not in supported_axes.get(data_ndims, []):
                raise NotImplementedError("With the CUDA backend, batched transform is only supported along fastest dimensions")
            self.cufft_batch_size = self.shape[0]
            self.cufft_shape = self.shape[1:]
            if data_ndims == 3 and len(self.axes) == 1:
                # 1D transform on 3D data: here only supported along fast dim,
                # so batch_size is Nx*Ny
                self.cufft_batch_size = np.prod(self.shape[:2])
                self.cufft_shape = (self.shape[-1],)
        if len(self.cufft_shape) == 1:
            # skcuda Plan takes a scalar for 1D transforms
            self.cufft_shape = self.cufft_shape[0]

    def configure_normalization(self):
        """Set up normalization of the inverse transform.

        Only "rescale" (divide by N on inverse) is supported; "ortho" is
        not implemented for this back-end.
        """
        # TODO
        if self.normalize == "ortho":
            raise NotImplementedError(
                "Normalization mode 'ortho' is not implemented with CUDA backend yet."
            )
        # When True, cu_ifft divides the result by the transform size.
        self.cufft_scale_inverse = (self.normalize == "rescale")

    def check_array(self, array, shape, dtype, copy=True):
        """Validate that `array` matches the expected shape and dtype.

        :param array: numpy array or GPUArray to check.
        :param shape: expected shape.
        :param dtype: expected dtype.
        :param copy: unused, kept for interface compatibility.
        :raises ValueError: on shape or dtype mismatch.
        """
        if array.shape != shape:
            raise ValueError("Invalid data shape: expected %s, got %s" % (shape, array.shape))
        if array.dtype != dtype:
            raise ValueError("Invalid data type: expected %s, got %s" % (dtype, array.dtype))

    def set_data(self, dst, src, shape, dtype, copy=True, name=None):
        """
        Bind `src` as the transform input or output buffer.

        `dst` is a device array owned by the current instance
        (either self.data_in or self.data_out).
        `copy` is ignored for device-to-device transfers.

        :return: the device array actually used for the transform.
        :raises ValueError: if `src` has a wrong shape/dtype or an
            unsupported type.
        """
        self.check_array(src, shape, dtype)
        if isinstance(src, np.ndarray):
            if name == "data_out":
                # Makes little sense to provide output=numpy_array
                return dst
            if not(src.flags["C_CONTIGUOUS"]):
                src = np.ascontiguousarray(src, dtype=dtype)
            # Host -> device copy into the instance-owned buffer
            dst[:] = src[:]
        elif isinstance(src, gpuarray.GPUArray):
            # No copy, use the data as self.d_input or self.d_output
            # (this prevents the use of in-place transforms, however).
            # We have to keep their old references.
            if name is None:
                # This should not happen
                raise ValueError("Please provide either copy=True or name != None")
            assert id(self.refs[name]) == id(dst)  # DEBUG
            setattr(self, name, src)
            return src
        else:
            raise ValueError(
                "Invalid array type %s, expected numpy.ndarray or pycuda.gpuarray" % type(src)
            )
        return dst

    def recover_array_references(self):
        """Restore the instance-owned device buffers after a user GPUArray
        was temporarily swapped in by set_data()."""
        self.data_in = self.refs["data_in"]
        self.data_out = self.refs["data_out"]

    def compute_forward_plan(self):
        """Create the cufft plan for the forward transform."""
        self.plan_forward = Plan(
            self.cufft_shape,
            self.dtype,
            self.dtype_out,
            batch=self.cufft_batch_size,
            stream=self.cufft_stream,
            # cufft extensible plan API is only supported after 0.5.1
            # (commit 65288d28ca0b93e1234133f8d460dc6becb65121)
            # but there is still no official 0.5.2
            #~ auto_allocate=True # cufft extensible plan API
        )

    def compute_inverse_plan(self):
        """Create the cufft plan for the inverse transform."""
        self.plan_inverse = Plan(
            self.cufft_shape,  # not shape_out
            self.dtype_out,
            self.dtype,
            batch=self.cufft_batch_size,
            stream=self.cufft_stream,
            # cufft extensible plan API is only supported after 0.5.1
            # (commit 65288d28ca0b93e1234133f8d460dc6becb65121)
            # but there is still no official 0.5.2
            #~ auto_allocate=True
        )

    def copy_output_if_numpy(self, dst, src):
        """Copy the device result `src` into `dst` when `dst` is a host array.

        No-op when `dst` is already a GPUArray (the caller owns device data).
        """
        if isinstance(dst, gpuarray.GPUArray):
            return
        # Fix: the original did "dst[:] = src[:]", assigning a device
        # GPUArray slice into a numpy array, which is not a valid
        # device->host transfer. Use the explicit pycuda copy instead.
        src.get(ary=dst)

    def fft(self, array, output=None):
        """
        Perform a (forward) Fast Fourier Transform.

        :param Union[numpy.ndarray,pycuda.gpuarray] array:
            Input data. Must be consistent with the current context.
        :param Union[numpy.ndarray,pycuda.gpuarray] output:
            Output data. By default, output is a numpy.ndarray.
        """
        data_in = self.set_input_data(array, copy=False)
        data_out = self.set_output_data(output, copy=False)
        cu_fft(
            data_in,
            data_out,
            self.plan_forward,
            scale=False  # forward transform is never scaled
        )
        if output is not None:
            self.copy_output_if_numpy(output, self.data_out)
            res = output
        else:
            res = self.data_out.get()
        self.recover_array_references()
        return res

    def ifft(self, array, output=None):
        """
        Perform a (inverse) Fast Fourier Transform.

        :param Union[numpy.ndarray,pycuda.gpuarray] array:
            Input data. Must be consistent with the current context.
        :param Union[numpy.ndarray,pycuda.gpuarray] output:
            Output data. By default, output is a numpy.ndarray.
        """
        # Note the swap: the inverse transform's input has the shape/dtype
        # of the forward output buffer, hence set_output_data for `array`.
        data_in = self.set_output_data(array, copy=False)
        data_out = self.set_input_data(output, copy=False)
        cu_ifft(
            data_in,
            data_out,
            self.plan_inverse,
            scale=self.cufft_scale_inverse,
        )
        if output is not None:
            self.copy_output_if_numpy(output, self.data_in)
            res = output
        else:
            res = self.data_in.get()
        self.recover_array_references()
        return res