#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#    Project: S I L X project
#             https://github.com/silx-kit/silx
#
#    Copyright (C) 2012-2018 European Synchrotron Radiation Facility, Grenoble, France
#
#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#

__author__ = "Jerome Kieffer"
__contact__ = "Jerome.Kieffer@ESRF.eu"
__license__ = "MIT"
__copyright__ = "2012-2017 European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "30/11/2020"
__status__ = "stable"
__all__ = ["ocl", "pyopencl", "mf", "release_cl_buffers", "allocate_cl_buffers",
           "measure_workgroup_size", "kernel_workgroup_size"]

import os
import logging

import numpy

from .utils import get_opencl_code

logger = logging.getLogger(__name__)

if os.environ.get("SILX_OPENCL") in ["0", "False"]:
    logger.info("Use of OpenCL has been disabled from environment variable: SILX_OPENCL=0")
    pyopencl = None
else:
    try:
        import pyopencl
    except ImportError:
        logger.warning("Unable to import pyOpenCl. Please install it from: http://pypi.python.org/pypi/pyopencl")
        pyopencl = None
    else:
        try:
            pyopencl.get_platforms()
        except pyopencl.LogicError:
            logger.warning("The module pyOpenCL has been imported but can't be used here")
            pyopencl = None
        else:
            import pyopencl.array as array
            mf = pyopencl.mem_flags

if pyopencl is None:

    # Define default mem flags
    class mf(object):
        WRITE_ONLY = 1
        READ_ONLY = 1
        READ_WRITE = 1
else:
    mf = pyopencl.mem_flags

FLOP_PER_CORE = {"GPU": 64,  # GPU, Fermi at least perform 64 flops per cycle/multicore, G80 were at 24 or 48 ...
                 "CPU": 4,  # CPU, at least intel's have 4 operation per cycle
                 "ACC": 8}  # ACC: the Xeon-phi (MIC) appears to be able to process 8 Flops per hyperthreaded-core

# Sources : https://en.wikipedia.org/wiki/CUDA
NVIDIA_FLOP_PER_CORE = {(1, 0): 24,  # Guessed !
                        (1, 1): 24,  # Measured on G98 [Quadro NVS 295]
                        (1, 2): 24,  # Guessed !
                        (1, 3): 24,  # measured on a GT285 (GT200)
                        (2, 0): 64,  # Measured on a 580 (GF110)
                        (2, 1): 96,  # Measured on Quadro2000 GF106GL
                        (3, 0): 384,  # Guessed!
                        (3, 5): 384,  # Measured on K20
                        (3, 7): 384,  # K80: Guessed!
                        (5, 0): 256,  # Maxwell 4 warps/SM 2 flops/ CU
                        (5, 2): 256,  # Titan-X
                        (5, 3): 256,  # TX1
                        (6, 0): 128,  # GP100
                        (6, 1): 128,  # GP104
                        (6, 2): 128,  # ?
                        (7, 0): 128,  # Volta # measured on Telsa V100
                        (7, 1): 128,  # Volta ?
                        }

AMD_FLOP_PER_CORE = 160  # Measured on a M7820 10 core, 700MHz 1120GFlops


class Device(object):
    """
    Simple class that contains the structure of an OpenCL device
    """

    def __init__(self, name="None", dtype=None, version=None, driver_version=None,
                 extensions="", memory=None, available=None,
                 cores=None, frequency=None, flop_core=None, idx=0, workgroup=1):
        """
        Simple container with some important data for the OpenCL device description.

        :param name: name of the device
        :param dtype: device type: CPU/GPU/ACC...
        :param version: driver version
        :param driver_version:
        :param extensions: List of opencl extensions
        :param memory: maximum memory available on the device
        :param available: is the device deactivated or not
        :param cores: number of SM/cores
        :param frequency: frequency of the device
        :param flop_core: Flopating Point operation per core per cycle
        :param idx: index of the device within the platform
        :param workgroup: max workgroup size
        """
        self.name = name.strip()
        self.type = dtype
        self.version = version
        self.driver_version = driver_version
        self.extensions = extensions.split()
        self.memory = memory
        self.available = available
        self.cores = cores
        self.frequency = frequency
        self.id = idx
        self.max_work_group_size = workgroup
        if not flop_core:
            flop_core = FLOP_PER_CORE.get(dtype, 1)
        if cores and frequency:
            self.flops = cores * frequency * flop_core
        else:
            self.flops = flop_core

    def __repr__(self):
        return "%s" % self.name

    def pretty_print(self):
        """
        Complete device description

        :return: string
        """
        lst = ["Name\t\t:\t%s" % self.name,
               "Type\t\t:\t%s" % self.type,
               "Memory\t\t:\t%.3f MB" % (self.memory / 2.0 ** 20),
               "Cores\t\t:\t%s CU" % self.cores,
               "Frequency\t:\t%s MHz" % self.frequency,
               "Speed\t\t:\t%.3f GFLOPS" % (self.flops / 1000.),
               "Version\t\t:\t%s" % self.version,
               "Available\t:\t%s" % self.available]
        return os.linesep.join(lst)

    def set_unavailable(self):
        """Use this method to flag a faulty device
        """
        self.available = False


class Platform(object):
    """
    Simple class that contains the structure of an OpenCL platform
    """

    def __init__(self, name="None", vendor="None", version=None, extensions=None, idx=0):
        """
        Class containing all descriptions of a platform and all devices description within that platform.

        :param name: platform name
        :param vendor: name of the brand/vendor
        :param version:
        :param extensions: list of the extension provided by the platform to all of its devices
        :param idx: index of the platform
        """
        self.name = name.strip()
        self.vendor = vendor.strip()
        self.version = version
        self.extensions = extensions.split()
        self.devices = []
        self.id = idx

    def __repr__(self):
        return "%s" % self.name

    def add_device(self, device):
        """
        Add new device to the platform

        :param device: Device instance
        """
        self.devices.append(device)

    def get_device(self, key):
        """
        Return a device according to key

        :param key: identifier for a device, either it's id (int) or it's name
        :type key: int or str
        """
        out = None
        try:
            devid = int(key)
        except ValueError:
            for a_dev in self.devices:
                if a_dev.name == key:
                    out = a_dev
        else:
            if len(self.devices) > devid > 0:
                out = self.devices[devid]
        return out


def _measure_workgroup_size(device_or_context, fast=False):
    """Mesure the maximal work group size of the given device

    DEPRECATED since not perfectly correct !

    :param device_or_context: instance of pyopencl.Device or pyopencl.Context
                    or 2-tuple (platformid,deviceid)
    :param fast: ask the kernel the valid value, don't probe it
    :return: maximum size for the workgroup
    """
    if isinstance(device_or_context, pyopencl.Device):
        try:
            ctx = pyopencl.Context(devices=[device_or_context])
        except pyopencl._cl.LogicError as error:
            platform = device_or_context.platform
            platformid = pyopencl.get_platforms().index(platform)
            deviceid = platform.get_devices().index(device_or_context)
            ocl.platforms[platformid].devices[deviceid].set_unavailable()
            raise RuntimeError("Unable to create context on %s/%s: %s" % (platform, device_or_context, error))
        else:
            device = device_or_context
    elif isinstance(device_or_context, pyopencl.Context):
        ctx = device_or_context
        device = device_or_context.devices[0]
    elif isinstance(device_or_context, (tuple, list)) and len(device_or_context) == 2:
        ctx = ocl.create_context(platformid=device_or_context[0],
                                 deviceid=device_or_context[1])
        device = ctx.devices[0]
    else:
        raise RuntimeError("""given parameter device_or_context is not an
            instanciation of a device or a context""")
    shape = device.max_work_group_size
    # get the context

    assert ctx is not None
    queue = pyopencl.CommandQueue(ctx)

    max_valid_wg = 1
    data = numpy.random.random(shape).astype(numpy.float32)
    d_data = pyopencl.array.to_device(queue, data)
    d_data_1 = pyopencl.array.empty_like(d_data)
    d_data_1.fill(numpy.float32(1.0))

    program = pyopencl.Program(ctx, get_opencl_code("addition")).build()
    if fast:
        max_valid_wg = program.addition.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, device)
    else:
        maxi = int(round(numpy.log2(shape)))
        for i in range(maxi + 1):
            d_res = pyopencl.array.empty_like(d_data)
            wg = 1 << i
            try:
                evt = program.addition(
                    queue, (shape,), (wg,),
                    d_data.data, d_data_1.data, d_res.data, numpy.int32(shape))
                evt.wait()
            except Exception as error:
                logger.info("%s on device %s for WG=%s/%s", error, device.name, wg, shape)
                program = queue = d_res = d_data_1 = d_data = None
                break
            else:
                res = d_res.get()
                good = numpy.allclose(res, data + 1)
                if good:
                    if wg > max_valid_wg:
                        max_valid_wg = wg
                else:
                    logger.warning("ArithmeticError on %s for WG=%s/%s", wg, device.name, shape)

    return max_valid_wg


def _is_nvidia_gpu(vendor, devtype):
    return (vendor == "NVIDIA Corporation") and (devtype == "GPU")


class OpenCL(object):
    """
    Simple class that wraps the structure ocl_tools_extended.h

    This is a static class.
    ocl should be the only instance and shared among all python modules.
    """

    platforms = []
    nb_devices = 0
    context_cache = {}  # key: 2-tuple of int, value: context
    if pyopencl:
        platform = device = pypl = devtype = extensions = pydev = None
        for idx, platform in enumerate(pyopencl.get_platforms()):
            pypl = Platform(platform.name, platform.vendor, platform.version, platform.extensions, idx)
            for idd, device in enumerate(platform.get_devices()):
                ####################################################
                # Nvidia does not report int64 atomics (we are using) ...
                # this is a hack around as any nvidia GPU with double-precision supports int64 atomics
                ####################################################
                extensions = device.extensions
                if (pypl.vendor == "NVIDIA Corporation") and ('cl_khr_fp64' in extensions):
                    extensions += ' cl_khr_int64_base_atomics cl_khr_int64_extended_atomics'
                try:
                    devtype = pyopencl.device_type.to_string(device.type).upper()
                except ValueError:
                    # pocl does not describe itself as a CPU !
                    devtype = "CPU"
                if len(devtype) > 3:
                    if "GPU" in devtype:
                        devtype = "GPU"
                    elif "ACC" in devtype:
                        devtype = "ACC"
                    elif "CPU" in devtype:
                        devtype = "CPU"
                    else:
                        devtype = devtype[:3]
                if _is_nvidia_gpu(device.vendor, devtype) and ("compute_capability_major_nv" in dir(device)):
                    try:
                        comput_cap = device.compute_capability_major_nv, device.compute_capability_minor_nv
                    except pyopencl.LogicError:
                        flop_core = FLOP_PER_CORE["GPU"]
                    else:
                        flop_core = NVIDIA_FLOP_PER_CORE.get(comput_cap, FLOP_PER_CORE["GPU"])
                elif (pypl.vendor == "Advanced Micro Devices, Inc.") and (devtype == "GPU"):
                    flop_core = AMD_FLOP_PER_CORE
                elif devtype == "CPU":
                    flop_core = FLOP_PER_CORE.get(devtype, 1)
                else:
                    flop_core = 1
                workgroup = device.max_work_group_size
                if (devtype == "CPU") and (pypl.vendor == "Apple"):
                    logger.info("For Apple's OpenCL on CPU: Measuring actual valid max_work_goup_size.")
                    workgroup = _measure_workgroup_size(device, fast=True)
                if (devtype == "GPU") and os.environ.get("GPU") == "False":
                    # Environment variable to disable GPU devices
                    continue
                pydev = Device(device.name, devtype, device.version, device.driver_version, extensions,
                               device.global_mem_size, bool(device.available), device.max_compute_units,
                               device.max_clock_frequency, flop_core, idd, workgroup)
                pypl.add_device(pydev)
                nb_devices += 1
            platforms.append(pypl)
        del platform, device, pypl, devtype, extensions, pydev

    def __repr__(self):
        out = ["OpenCL devices:"]
        for platformid, platform in enumerate(self.platforms):
            deviceids = ["(%s,%s) %s" % (platformid, deviceid, dev.name)
                         for deviceid, dev in enumerate(platform.devices)]
            out.append("[%s] %s: " % (platformid, platform.name) + ", ".join(deviceids))
        return os.linesep.join(out)

    def get_platform(self, key):
        """
        Return a platform according

        :param key: identifier for a platform, either an Id (int) or it's name
        :type key: int or str
        """
        out = None
        try:
            platid = int(key)
        except ValueError:
            for a_plat in self.platforms:
                if a_plat.name == key:
                    out = a_plat
        else:
            if len(self.platforms) > platid > 0:
                out = self.platforms[platid]
        return out

    def select_device(self, dtype="ALL", memory=None, extensions=None, best=True, **kwargs):
        """
        Select a device based on few parameters (at the end, keep the one with most memory)

        :param dtype: "gpu" or "cpu" or "all" ....
        :param memory: minimum amount of memory (int)
        :param extensions: list of extensions to be present
        :param best: shall we look for the
        :returns: A tuple of plateform ID and device ID, else None if nothing
            found
        """
        if extensions is None:
            extensions = []
        if "type" in kwargs:
            dtype = kwargs["type"].upper()
        else:
            dtype = dtype.upper()
        if len(dtype) > 3:
            dtype = dtype[:3]
        best_found = None
        for platformid, platform in enumerate(self.platforms):
            for deviceid, device in enumerate(platform.devices):
                if not device.available:
                    continue
                if (dtype in ["ALL", "DEF"]) or (device.type == dtype):
                    if (memory is None) or (memory <= device.memory):
                        found = True
                        for ext in extensions:
                            if ext not in device.extensions:
                                found = False
                        if found:
                            if not best:
                                return platformid, deviceid
                            else:
                                if not best_found:
                                    best_found = platformid, deviceid, device.flops
                                elif best_found[2] < device.flops:
                                    best_found = platformid, deviceid, device.flops
        if best_found:
            return best_found[0], best_found[1]

        # Nothing found
        return None

    def create_context(self, devicetype="ALL", useFp64=False, platformid=None,
                       deviceid=None, cached=True, memory=None, extensions=None):
        """
        Choose a device and initiate a context.

        Devicetypes can be GPU,gpu,CPU,cpu,DEF,ACC,ALL.
        Suggested are GPU,CPU.
        For each setting to work there must be such an OpenCL device and properly installed.
        E.g.: If Nvidia driver is installed, GPU will succeed but CPU will fail.
              The AMD SDK kit is required for CPU via OpenCL.
        :param devicetype: string in ["cpu","gpu", "all", "acc"]
        :param useFp64: boolean specifying if double precision will be used: deprecated use extensions=["cl_khr_fp64"]
        :param platformid: integer
        :param deviceid: integer
        :param cached: True if we want to cache the context
        :param memory: minimum amount of memory of the device
        :param extensions: list of extensions to be present
        :return: OpenCL context on the selected device
        """
        if extensions is None:
            extensions = []
        if useFp64:
            logger.warning("Deprecation: please select your device using the extension name!, i.e. extensions=['cl_khr_fp64']")
            extensions.append('cl_khr_fp64')

        if (platformid is not None) and (deviceid is not None):
            platformid = int(platformid)
            deviceid = int(deviceid)
        elif "PYOPENCL_CTX" in os.environ:
            pyopencl_ctx = [int(i) if i.isdigit() else 0 for i in os.environ["PYOPENCL_CTX"].split(":")]
            pyopencl_ctx += [0] * (2 - len(pyopencl_ctx))  # pad with 0
            platformid, deviceid = pyopencl_ctx
        else:
            ids = ocl.select_device(type=devicetype, extensions=extensions)
            if ids:
                platformid, deviceid = ids
        ctx = None
        if (platformid is not None) and (deviceid is not None):
            if (platformid, deviceid) in self.context_cache:
                ctx = self.context_cache[(platformid, deviceid)]
            else:
                try:
                    ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]])
                except pyopencl._cl.LogicError as error:
                    self.platforms[platformid].devices[deviceid].set_unavailable()
                    logger.warning("Unable to create context on %s/%s: %s", platformid, deviceid, error)
                    ctx = None
                else:
                    if cached:
                        self.context_cache[(platformid, deviceid)] = ctx
        if ctx is None:
            logger.warning("Last chance to get an OpenCL device ... probably not the one requested")
            ctx = pyopencl.create_some_context(interactive=False)
        return ctx

    def device_from_context(self, context):
        """
        Retrieves the Device from the context

        :param context: OpenCL context
        :return: instance of Device
        """
        odevice = context.devices[0]
        oplat = odevice.platform
        device_id = oplat.get_devices().index(odevice)
        platform_id = pyopencl.get_platforms().index(oplat)
        return self.platforms[platform_id].devices[device_id]


if pyopencl:
    ocl = OpenCL()
    if ocl.nb_devices == 0:
        ocl = None
else:
    ocl = None


def release_cl_buffers(cl_buffers):
    """
    :param cl_buffers: the buffer you want to release
    :type cl_buffers: dict(str, pyopencl.Buffer)

    This method release the memory of the buffers store in the dict
    """
    for key, buffer_ in cl_buffers.items():
        if buffer_ is not None:
            if isinstance(buffer_, pyopencl.array.Array):
                try:
                    buffer_.data.release()
                except pyopencl.LogicError:
                    logger.error("Error while freeing buffer %s", key)
            else:
                try:
                    buffer_.release()
                except pyopencl.LogicError:
                    logger.error("Error while freeing buffer %s", key)
            cl_buffers[key] = None
    return cl_buffers


def allocate_cl_buffers(buffers, device=None, context=None):
    """
    :param buffers: the buffers info use to create the pyopencl.Buffer
    :type buffers: list(std, flag, numpy.dtype, int)
    :param device: one of the context device
    :param context: opencl contextdevice
    :return: a dict containing the instanciated pyopencl.Buffer
    :rtype: dict(str, pyopencl.Buffer)

    This method instanciate the pyopencl.Buffer from the buffers
    description.
    """
    mem = {}
    if device is None:
        device = ocl.device_from_context(context)

    # check if enough memory is available on the device
    ualloc = 0
    for _, _, dtype, size in buffers:
        ualloc += numpy.dtype(dtype).itemsize * size
    memory = device.memory
    logger.info("%.3fMB are needed on device which has %.3fMB",
                ualloc / 1.0e6, memory / 1.0e6)
    if ualloc >= memory:
        memError = "Fatal error in allocate_buffers."
        memError += "Not enough device memory for buffers"
        memError += "(%lu requested, %lu available)" % (ualloc, memory)
        raise MemoryError(memError)  # noqa

    # do the allocation
    try:
        for name, flag, dtype, size in buffers:
            mem[name] = pyopencl.Buffer(context, flag,
                                        numpy.dtype(dtype).itemsize * size)
    except pyopencl.MemoryError as error:
        release_cl_buffers(mem)
        raise MemoryError(error)

    return mem


def allocate_texture(ctx, shape, hostbuf=None, support_1D=False):
    """
    Allocate an OpenCL image ("texture").

    :param ctx: OpenCL context
    :param shape: Shape of the image. Note that pyopencl and OpenCL < 1.2
        do not support 1D images, so 1D images are handled as 2D with one row
    :param support_1D: force the image to be 1D if the shape has only one dim
    """
    if len(shape) == 1 and not(support_1D):
        shape = (1,) + shape
    return pyopencl.Image(
        ctx,
        pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.USE_HOST_PTR,
        pyopencl.ImageFormat(
            pyopencl.channel_order.INTENSITY,
            pyopencl.channel_type.FLOAT
        ),
        hostbuf=numpy.zeros(shape[::-1], dtype=numpy.float32)
    )


def check_textures_availability(ctx):
    """
    Check whether textures are supported on the current OpenCL context.

    :param ctx: OpenCL context
    """
    try:
        dummy_texture = allocate_texture(ctx, (16, 16))
        # Need to further access some attributes (pocl)
        dummy_height = dummy_texture.height
        textures_available = True
        del dummy_texture, dummy_height
    except (pyopencl.RuntimeError, pyopencl.LogicError):
        textures_available = False
    # Nvidia Fermi GPUs (compute capability 2.X) do not support opencl read_imagef
    # There is no way to detect this until a kernel is compiled
    try:
        cc = ctx.devices[0].compute_capability_major_nv
        textures_available &= (cc >= 3)
    except (pyopencl.LogicError, AttributeError):  # probably not a Nvidia GPU
        pass
    #
    return textures_available


def measure_workgroup_size(device):
    """Measure the actual size of the workgroup

    :param device: device or context or 2-tuple with indexes
    :return: the actual measured workgroup size

    if device is "all", returns a dict with all devices with their ids as keys.
    """
    if (ocl is None) or (device is None):
        return None

    if isinstance(device, tuple) and (len(device) == 2):
        # this is probably a tuple (platformid, deviceid)
        device = ocl.create_context(platformid=device[0], deviceid=device[1])

    if device == "all":
        res = {}
        for pid, platform in enumerate(ocl.platforms):
            for did, _devices in enumerate(platform.devices):
                tup = (pid, did)
                res[tup] = measure_workgroup_size(tup)
    else:
        res = _measure_workgroup_size(device)
    return res


def query_kernel_info(program, kernel, what="WORK_GROUP_SIZE"):
    """Extract the compile time information from a kernel

    :param program: OpenCL program
    :param kernel: kernel or name of the kernel
    :param what: what is the query about ?
    :return: int or 3-int for the workgroup size.
    
    Possible information available are:
    * 'COMPILE_WORK_GROUP_SIZE': Returns the work-group size specified inside the kernel (__attribute__((reqd_work_gr oup_size(X, Y, Z))))
    * 'GLOBAL_WORK_SIZE': maximum global size that can be used to execute a kernel  #OCL2.1!
    * 'LOCAL_MEM_SIZE': amount of local memory in bytes being used by the kernel
    * 'PREFERRED_WORK_GROUP_SIZE_MULTIPLE': preferred multiple of workgroup size for launch. This is a performance hint.
    * 'PRIVATE_MEM_SIZE' Returns the minimum amount of private memory, in bytes, used by each workitem in the kernel
    * 'WORK_GROUP_SIZE': maximum work-group size that can be used to execute a kernel on a specific device given by device
    
    Further information on:
    https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
    
    """
    assert isinstance(program, pyopencl.Program)
    if not isinstance(kernel, pyopencl.Kernel):
        kernel_name = kernel
        assert kernel in (k.function_name for k in program.all_kernels()), "the kernel exists"
        kernel = program.__getattr__(kernel_name)

    device = program.devices[0]
    query_wg = getattr(pyopencl.kernel_work_group_info, what)
    return kernel.get_work_group_info(query_wg, device)


def kernel_workgroup_size(program, kernel):
    """Extract the compile time maximum workgroup size

    :param program: OpenCL program
    :param kernel: kernel or name of the kernel
    :return: the maximum acceptable workgroup size for the given kernel
    """
    return query_kernel_info(program, kernel, what="WORK_GROUP_SIZE")