#!/usr/bin/env python
# coding: utf-8
# /*##########################################################################
#
# Copyright (c) 2016 European Synchrotron Radiation Facility
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ###########################################################################*/
"""Module for (filtered) backprojection on the GPU"""

from __future__ import absolute_import, print_function, with_statement, division

__authors__ = ["A. Mirone, P. Paleo"]
__license__ = "MIT"
__date__ = "19/01/2018"

import logging
import numpy

from .common import pyopencl
from .processing import EventDescription, OpenclProcessing, BufferDescription
from .utils import nextpower as nextpow2

if pyopencl:
    mf = pyopencl.mem_flags
    import pyopencl.array as parray
else:
    raise ImportError("pyopencl is not installed")
logger = logging.getLogger(__name__)

# put in .common ?
try:
    from pyfft.cl import Plan as pyfft_Plan
    _has_pyfft = True
except ImportError:
    _has_pyfft = False
# For silx v0.6 we disable the use FFT on GPU.
_has_pyfft = False


def _sizeof(Type):
    """
    return the size (in bytes) of a scalar type, like the C behavior
    """
    return numpy.dtype(Type).itemsize


def _idivup(a, b):
    """
    return the integer division, plus one if `a` is not a multiple of `b`
    """
    return (a + (b - 1)) // b


def fourier_filter(sino, filter_=None, fft_size=None):
    """Simple numpy based implementation of fourier space filter
    
    :param sino: of shape shape = (num_projs, num_bins)
    :param filter: filter function to apply in fourier space
    :fft_size: size on which perform the fft. May be larger than the sino array 
    :return: filtered sinogram
    """
    assert sino.ndim == 2
    num_projs, num_bins = sino.shape
    if fft_size is None:
        fft_size = nextpow2(num_bins * 2 - 1)
    else:
        assert fft_size >= num_bins
    if fft_size == num_bins:
        sino_zeropadded = sino.astype(numpy.float32)
    else:
        sino_zeropadded = numpy.zeros((num_projs, fft_size),
                                      dtype=numpy.complex64)
        sino_zeropadded[:, :num_bins] = sino.astype(numpy.float32)

    if filter_ is None:
        h = numpy.zeros(fft_size, dtype=numpy.float32)
        L2 = fft_size // 2 + 1
        h[0] = 1 / 4.
        j = numpy.linspace(1, L2, L2 // 2, False)
        h[1:L2:2] = -1. / (numpy.pi ** 2 * j ** 2)
        h[L2:] = numpy.copy(h[1:L2 - 1][::-1])
        filter_ = numpy.fft.fft(h).astype(numpy.complex64)

    # Linear convolution
    sino_f = numpy.fft.fft(sino, fft_size)
    sino_f = sino_f * filter_
    sino_filtered = numpy.fft.ifft(sino_f)[:, :num_bins].real
    # Send the filtered sinogram to device
    return numpy.ascontiguousarray(sino_filtered.real, dtype=numpy.float32)


class Backprojection(OpenclProcessing):
    """A class for performing the backprojection using OpenCL"""
    kernel_files = ["backproj.cl", "array_utils.cl"]
    if _has_pyfft:
        kernel_files.append("backproj_helper.cl")

    def __init__(self, sino_shape, slice_shape=None, axis_position=None,
                 angles=None, filter_name=None, ctx=None, devicetype="all",
                 platformid=None, deviceid=None, profile=False):
        """Constructor of the OpenCL (filtered) backprojection

        :param sino_shape: shape of the sinogram. The sinogram is in the format
                           (n_b, n_a) where n_b is the number of detector bins
                           and n_a is the number of angles.
        :param slice_shape: Optional, shape of the reconstructed slice. By
                            default, it is a square slice where the dimension
                            is the "x dimension" of the sinogram (number of
                            bins).
        :param axis_position: Optional, axis position. Default is
                              `(shape[1]-1)/2.0`.
        :param angles: Optional, a list of custom angles in radian.
        :param filter_name: Optional, name of the filter for FBP. Default is
                            the Ram-Lak filter.
        :param ctx: actual working context, left to None for automatic
                    initialization from device type or platformid/deviceid
        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
        :param platformid: integer with the platform_identifier, as given by
                           clinfo
        :param deviceid: Integer with the device identifier, as given by clinfo
        :param profile: switch on profiling to be able to profile at the kernel
                        level, store profiling elements (makes code slightly
                        slower)
        """
        # OS X enforces a workgroup size of 1 when the kernel has
        # synchronization barriers if sys.platform.startswith('darwin'):
        #  assuming no discrete GPU
        #    raise NotImplementedError("Backprojection is not implemented on CPU for OS X yet")

        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                                  platformid=platformid, deviceid=deviceid,
                                  profile=profile)
        self.shape = sino_shape

        self.num_bins = numpy.int32(sino_shape[1])
        self.num_projs = numpy.int32(sino_shape[0])
        self.angles = angles
        if slice_shape is None:
            self.slice_shape = (self.num_bins, self.num_bins)
        else:
            self.slice_shape = slice_shape
        self.dimrec_shape = (
            _idivup(self.slice_shape[0], 32) * 32,
            _idivup(self.slice_shape[1], 32) * 32
        )
        self.slice = numpy.zeros(self.dimrec_shape, dtype=numpy.float32)
        self.filter_name = filter_name if filter_name else "Ram-Lak"
        if axis_position:
            self.axis_pos = numpy.float32(axis_position)
        else:
            self.axis_pos = numpy.float32((sino_shape[1] - 1.) / 2)
        self.axis_array = None  # TODO: add axis correction front-end

        self.is_cpu = False
        if self.device.type == "CPU":
            self.is_cpu = True

        self.compute_fft_plans()
        self.buffers = [
                       BufferDescription("_d_slice", numpy.prod(self.dimrec_shape), numpy.float32, mf.READ_WRITE),
                       BufferDescription("d_sino", self.num_projs * self.num_bins, numpy.float32, mf.READ_WRITE),  # before transferring to texture (if available)
                       BufferDescription("d_cos", self.num_projs, numpy.float32, mf.READ_ONLY),
                       BufferDescription("d_sin", self.num_projs, numpy.float32, mf.READ_ONLY),
                       BufferDescription("d_axes", self.num_projs, numpy.float32, mf.READ_ONLY),
                      ]
        self.allocate_buffers()
        if not(self.is_cpu):
            self.allocate_textures()
        self.compute_filter()
        if self.pyfft_plan:
            self.add_to_cl_mem({
                "d_filter": self.d_filter,
                "d_sino_z": self.d_sino_z
            })
        self.d_sino = self.cl_mem["d_sino"]  # shorthand
        self.compute_angles()

        self.local_mem = 256 * 3 * _sizeof(numpy.float32)  # constant for all image sizes
        OpenclProcessing.compile_kernels(self, self.kernel_files)
        # check that workgroup can actually be (16, 16)
        self.compiletime_workgroup_size = self.kernels.max_workgroup_size("backproj_cpu_kernel")
        # Workgroup and ndrange sizes are always the same
        self.wg = (16, 16)
        self.ndrange = (
            _idivup(int(self.dimrec_shape[1]), 32) * self.wg[0],  # int(): pyopencl <= 2015.1
            _idivup(int(self.dimrec_shape[0]), 32) * self.wg[1]  # int(): pyopencl <= 2015.1
        )

    def compute_angles(self):
        if self.angles is None:
            self.angles = numpy.linspace(0, numpy.pi, self.num_projs, False)
        h_cos = numpy.cos(self.angles).astype(numpy.float32)
        h_sin = numpy.sin(self.angles).astype(numpy.float32)
        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_cos"], h_cos)
        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_sin"], h_sin)
        if self.axis_array:
            pyopencl.enqueue_copy(self.queue,
                                  self.cl_mem["d_axes"],
                                  self.axis_array.astype(numpy.float32))
        else:
            pyopencl.enqueue_copy(self.queue,
                                  self.cl_mem["d_axes"],
                                  numpy.ones(self.num_projs, dtype=numpy.float32) * self.axis_pos)

    def allocate_textures(self):
        """
        Allocate the texture for the sinogram.
        """
        self.d_sino_tex = pyopencl.Image(
                                        self.ctx,
                                        mf.READ_ONLY | mf.USE_HOST_PTR,
                                        pyopencl.ImageFormat(
                                                             pyopencl.channel_order.INTENSITY,
                                                             pyopencl.channel_type.FLOAT
                                                            ),
                                        hostbuf=numpy.zeros(self.shape[::-1], dtype=numpy.float32)
                                        )

    def compute_fft_plans(self):
        """
        If pyfft is installed, prepare a batched 1D FFT plan for the filtering
        of FBP

        """
        self.fft_size = nextpow2(self.num_bins * 2 - 1)
        if _has_pyfft:
            logger.debug("pyfft is available. Computing FFT plans...")
            # batched 1D transform
            self.pyfft_plan = pyfft_Plan(self.fft_size, queue=self.queue,
                                         wait_for_finish=True)
            self.d_sino_z = parray.zeros(self.queue,
                                         (self.num_projs, self.fft_size),
                                         dtype=numpy.complex64)
            logger.debug("... done")
        else:
            logger.debug("pyfft not available, using numpy.fft")
            self.pyfft_plan = None
            # TODO: fall-back to fftw if present ?

    def compute_filter(self):
        """
        Compute the filter for FBP
        """
        if self.filter_name == "Ram-Lak":
            L = self.fft_size
            h = numpy.zeros(L, dtype=numpy.float32)
            L2 = L // 2 + 1
            h[0] = 1 / 4.
            j = numpy.linspace(1, L2, L2 // 2, False)
            h[1:L2:2] = -1. / (numpy.pi ** 2 * j ** 2)
            h[L2:] = numpy.copy(h[1:L2 - 1][::-1])
        else:
            # TODO: other filters
            raise ValueError("Filter %s is not available" % self.filter_name)
        self.filter = h
        if self.pyfft_plan:
            self.d_filter = parray.to_device(self.queue, h.astype(numpy.complex64))
            self.pyfft_plan.execute(self.d_filter.data)
        else:
            self.filter = numpy.fft.fft(h).astype(numpy.complex64)
            self.d_filter = None

    def _get_local_mem(self):
        return pyopencl.LocalMemory(self.local_mem)  # constant for all image sizes

    def cpy2d_to_slice(self, dst):
        ndrange = (int(self.slice_shape[1]), int(self.slice_shape[0]))  # pyopencl < 2015.2
        slice_shape_ocl = numpy.int32(ndrange)
        wg = None
        kernel_args = (
            dst.data,
            self.cl_mem["_d_slice"],
            numpy.int32(self.slice_shape[1]),
            numpy.int32(self.dimrec_shape[1]),
            numpy.int32((0, 0)),
            numpy.int32((0, 0)),
            slice_shape_ocl
        )
        return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args)

    def transfer_to_texture(self, sino):
        sino2 = sino
        if not(sino.flags["C_CONTIGUOUS"] and sino.dtype == numpy.float32):
            sino2 = numpy.ascontiguousarray(sino, dtype=numpy.float32)
        if self.is_cpu:
            ev = pyopencl.enqueue_copy(
                                        self.queue,
                                        self.d_sino,
                                        sino2
                                        )
            what = "transfer filtered sino H->D buffer"
        else:
            ev = pyopencl.enqueue_copy(
                                       self.queue,
                                       self.d_sino_tex,
                                       sino2,
                                       origin=(0, 0),
                                       region=self.shape[::-1]
                                       )
            what = "transfer filtered sino H->D texture"
        return EventDescription(what, ev)

    def transfer_device_to_texture(self, d_sino):
        if self.is_cpu:
            if id(self.d_sino) == id(d_sino):
                return
            ev = pyopencl.enqueue_copy(
                                       self.queue,
                                       self.d_sino,
                                       d_sino
                                       )
            what = "transfer filtered sino D->D buffer"
        else:
            ev = pyopencl.enqueue_copy(
                                       self.queue,
                                       self.d_sino_tex,
                                       d_sino,
                                       offset=0,
                                       origin=(0, 0),
                                       region=self.shape[::-1]
                                       )
            what = "transfer filtered sino D->D texture"
        return EventDescription(what, ev)

    def backprojection(self, sino=None, dst=None):
        """Perform the backprojection on an input sinogram

        :param sino: sinogram. If provided, it returns the plain backprojection.
        :param dst: destination (pyopencl.Array). If provided, the result will be written in this array.
        :return: backprojection of sinogram
        """
        events = []
        with self.sem:

            if sino is not None:  # assuming numpy.ndarray
                events.append(self.transfer_to_texture(sino))
            # Prepare arguments for the kernel call
            if self.is_cpu:
                d_sino_ref = self.d_sino
            else:
                d_sino_ref = self.d_sino_tex
            kernel_args = (
                self.num_projs,  # num of projections (int32)
                self.num_bins,  # num of bins (int32)
                self.axis_pos,  # axis position (float32)
                self.cl_mem["_d_slice"],  # d_slice (__global float32*)
                d_sino_ref,  # d_sino (__read_only image2d_t or float*)
                numpy.float32(0),  # gpu_offset_x (float32)
                numpy.float32(0),  # gpu_offset_y (float32)
                self.cl_mem["d_cos"],  # d_cos (__global float32*)
                self.cl_mem["d_sin"],  # d_sin (__global float32*)
                self.cl_mem["d_axes"],  # d_axis  (__global float32*)
                self._get_local_mem()  # shared mem (__local float32*)
            )
            # Call the kernel
            if self.is_cpu:
                kernel_to_call = self.kernels.backproj_cpu_kernel
            else:
                kernel_to_call = self.kernels.backproj_kernel
            event_bpj = kernel_to_call(
                self.queue,
                self.ndrange,
                self.wg,
                *kernel_args
            )
            if dst is None:
                self.slice[:] = 0
                events.append(EventDescription("backprojection", event_bpj))
                ev = pyopencl.enqueue_copy(self.queue, self.slice,
                                           self.cl_mem["_d_slice"])
                events.append(EventDescription("copy D->H result", ev))
                ev.wait()
                res = numpy.copy(self.slice)
                if self.dimrec_shape[0] > self.slice_shape[0] or self.dimrec_shape[1] > self.slice_shape[1]:
                    res = res[:self.slice_shape[0], :self.slice_shape[1]]
                # if the slice is backprojected onto a bigger grid
                if self.slice_shape[1] > self.num_bins:
                    res = res[:self.slice_shape[0], :self.slice_shape[1]]
            else:
                ev = self.cpy2d_to_slice(dst)
                events.append(EventDescription("copy D->D result", ev))
                ev.wait()
                res = dst

        # /with self.sem
        if self.profile:
            self.events += events

        return res

    def filter_projections(self, sino, rescale=True):
        """
        Performs the FBP on a given sinogram.

        :param sinogram: sinogram to (filter-)backproject
        :param rescale: if True (default), the sinogram is multiplied with
                        (pi/n_projs)
        """
        if sino.shape[0] != self.num_projs or sino.shape[1] != self.num_bins:
            raise ValueError("Expected sinogram with (projs, bins) = (%d, %d)" % (self.num_projs, self.num_bins))
        if rescale:
            sino = sino * numpy.pi / self.num_projs
        events = []
        # if pyfft is available, all can be done on the device
        if self.d_filter is not None:

            # Zero-pad the sinogram.
            # TODO: this can be done on GPU with a "Memcpy2D":
            #  cl.enqueue_copy(queue, dst, src, host_origin=(0,0), buffer_origin=(0,0), region=shape, host_pitches=(sino.shape[1],), buffer_pitches=(self.fft_size,))
            # However it does not work properly, and raises an error for pyopencl < 2017.1
            sino_zeropadded = numpy.zeros((sino.shape[0], self.fft_size), dtype=numpy.complex64)
            sino_zeropadded[:, :self.num_bins] = sino.astype(numpy.float32)
            sino_zeropadded = numpy.ascontiguousarray(sino_zeropadded, dtype=numpy.complex64)
            with self.sem:
                # send to GPU
                ev = pyopencl.enqueue_copy(self.queue, self.d_sino_z.data, sino_zeropadded)
                events.append(EventDescription("Send sino H->D", ev))

                # FFT (in-place)
                self.pyfft_plan.execute(self.d_sino_z.data, batch=self.num_projs)

                # Multiply (complex-wise) with the the filter
                ev = self.kernels.mult(self.queue,
                                       tuple(int(i) for i in self.d_sino_z.shape[::-1]),
                                       None,
                                       self.d_sino_z.data,
                                       self.d_filter.data,
                                       numpy.int32(self.fft_size),
                                       self.num_projs
                                       )
                events.append(EventDescription("complex 2D-1D multiplication", ev))
                # Inverse FFT (in-place)
                self.pyfft_plan.execute(self.d_sino_z.data, batch=self.num_projs, inverse=True)
                # Copy the real part of d_sino_z[:, :self.num_bins] (complex64) to d_sino (float32)
                ev = self.kernels.cpy2d_c2r(self.queue, self.shape[::-1], None,
                                            self.d_sino,
                                            self.d_sino_z.data,
                                            self.num_bins,
                                            self.num_projs,
                                            numpy.int32(self.fft_size)
                                            )
                events.append(EventDescription("conversion from complex padded sinogram to sinogram", ev))
                # debug
#                 ev.wait()
#                 h_sino = numpy.zeros(sino.shape, dtype=numpy.float32)
#                 ev = pyopencl.enqueue_copy(self.queue, h_sino, self.d_sino)
#                 ev.wait()
#                 numpy.save("/tmp/filtered_sinogram_%s.npy" % self.ctx.devices[0].platform.name.split()[0], h_sino)
                events.append(self.transfer_device_to_texture(self.d_sino))
            # ------
        else:  # no pyfft
            sino_filtered = fourier_filter(sino, filter_=self.filter, fft_size=self.fft_size)
            with self.sem:
                events.append(self.transfer_to_texture(sino_filtered))
        if self.profile:
            self.events += events

    def filtered_backprojection(self, sino):
        """
        Compute the filtered backprojection (FBP) on a sinogram.

        :param sino: sinogram (`numpy.ndarray`) in the format (projections,
                     bins)
        """

        self.filter_projections(sino)
        res = self.backprojection()
        return res

    __call__ = filtered_backprojection