1 files changed, 488 insertions, 0 deletions
diff --git a/silx/opencl/backprojection.py b/silx/opencl/backprojection.py
new file mode 100644
index 0000000..15a03b9
--- /dev/null
+++ b/silx/opencl/backprojection.py
@@ -0,0 +1,488 @@
+#!/usr/bin/env python
+# coding: utf-8
+# /*##########################################################################
+#
+# Copyright (c) 2016 European Synchrotron Radiation Facility
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ###########################################################################*/
+"""Module for (filtered) backprojection on the GPU"""
+
+from __future__ import absolute_import, print_function, with_statement, division
+
+__authors__ = ["A. Mirone, P. Paleo"]
+__license__ = "MIT"
+__date__ = "05/10/2017"
+
+import logging
+import numpy
+
+from .common import pyopencl
+from .processing import EventDescription, OpenclProcessing, BufferDescription
+from .utils import nextpower as nextpow2
+
+if pyopencl:
+    mf = pyopencl.mem_flags
+    import pyopencl.array as parray
+else:
+    raise ImportError("pyopencl is not installed")
+logger = logging.getLogger(__name__)
+
+# put in .common ?
+try:
+    from pyfft.cl import Plan as pyfft_Plan
+    _has_pyfft = True
+except ImportError:
+    _has_pyfft = False
+# For silx v0.6 we disable the use FFT on GPU.
+_has_pyfft = False
+
+
+def _sizeof(Type):
+    """
+    return the size (in bytes) of a scalar type, like the C behavior
+    """
+    return numpy.dtype(Type).itemsize
+
+
+def _idivup(a, b):
+    """
+    return the integer division, plus one if `a` is not a multiple of `b`
+    """
+    return (a + (b - 1)) // b
+
+
+def fourier_filter(sino, filter_=None, fft_size=None):
+    """Simple numpy based implementation of fourier space filter
+    
+    :param sino: of shape shape = (num_projs, num_bins)
+    :param filter: filter function to apply in fourier space
+    :fft_size: size on which perform the fft. May be larger than the sino array 
+    :return: filtered sinogram
+    """
+    assert sino.ndim == 2
+    num_projs, num_bins = sino.shape
+    if fft_size is None:
+        fft_size = nextpow2(num_bins * 2 - 1)
+    else:
+        assert fft_size >= num_bins
+    if fft_size == num_bins:
+        sino_zeropadded = sino.astype(numpy.float32)
+    else:
+        sino_zeropadded = numpy.zeros((num_projs, fft_size),
+                                      dtype=numpy.complex64)
+        sino_zeropadded[:, :num_bins] = sino.astype(numpy.float32)
+
+    if filter_ is None:
+        h = numpy.zeros(fft_size, dtype=numpy.float32)
+        L2 = fft_size // 2 + 1
+        h[0] = 1 / 4.
+        j = numpy.linspace(1, L2, L2 // 2, False)
+        h[1:L2:2] = -1. / (numpy.pi ** 2 * j ** 2)
+        h[L2:] = numpy.copy(h[1:L2 - 1][::-1])
+        filter_ = numpy.fft.fft(h).astype(numpy.complex64)
+
+    # Linear convolution
+    sino_f = numpy.fft.fft(sino, fft_size)
+    sino_f = sino_f * filter_
+    sino_filtered = numpy.fft.ifft(sino_f)[:, :num_bins].real
+    # Send the filtered sinogram to device
+    return numpy.ascontiguousarray(sino_filtered.real, dtype=numpy.float32)
+
+
+class Backprojection(OpenclProcessing):
+    """A class for performing the backprojection using OpenCL"""
+    kernel_files = ["backproj.cl", "array_utils.cl"]
+    if _has_pyfft:
+        kernel_files.append("backproj_helper.cl")
+
+    def __init__(self, sino_shape, slice_shape=None, axis_position=None,
+                 angles=None, filter_name=None, ctx=None, devicetype="all",
+                 platformid=None, deviceid=None, profile=False):
+        """Constructor of the OpenCL (filtered) backprojection
+
+        :param sino_shape: shape of the sinogram. The sinogram is in the format
+                           (n_b, n_a) where n_b is the number of detector bins
+                           and n_a is the number of angles.
+        :param slice_shape: Optional, shape of the reconstructed slice. By
+                            default, it is a square slice where the dimension
+                            is the "x dimension" of the sinogram (number of
+                            bins).
+        :param axis_position: Optional, axis position. Default is
+                              `(shape[1]-1)/2.0`.
+        :param angles: Optional, a list of custom angles in radian.
+        :param filter_name: Optional, name of the filter for FBP. Default is
+                            the Ram-Lak filter.
+        :param ctx: actual working context, left to None for automatic
+                    initialization from device type or platformid/deviceid
+        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
+        :param platformid: integer with the platform_identifier, as given by
+                           clinfo
+        :param deviceid: Integer with the device identifier, as given by clinfo
+        :param profile: switch on profiling to be able to profile at the kernel
+                        level, store profiling elements (makes code slightly
+                        slower)
+        """
+        # OS X enforces a workgroup size of 1 when the kernel has
+        # synchronization barriers if sys.platform.startswith('darwin'):
+        #  assuming no discrete GPU
+        #    raise NotImplementedError("Backprojection is not implemented on CPU for OS X yet")
+
+        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
+                                  platformid=platformid, deviceid=deviceid,
+                                  profile=profile)
+        self.shape = sino_shape
+
+        self.num_bins = numpy.int32(sino_shape[1])
+        self.num_projs = numpy.int32(sino_shape[0])
+        self.angles = angles
+        if slice_shape is None:
+            self.slice_shape = (self.num_bins, self.num_bins)
+        else:
+            self.slice_shape = slice_shape
+        self.dimrec_shape = (
+            _idivup(self.slice_shape[0], 32) * 32,
+            _idivup(self.slice_shape[1], 32) * 32
+        )
+        self.slice = numpy.zeros(self.dimrec_shape, dtype=numpy.float32)
+        self.filter_name = filter_name if filter_name else "Ram-Lak"
+        if axis_position:
+            self.axis_pos = numpy.float32(axis_position)
+        else:
+            self.axis_pos = numpy.float32((sino_shape[1] - 1.) / 2)
+        self.axis_array = None  # TODO: add axis correction front-end
+
+        self.is_cpu = False
+        if self.device.type == "CPU":
+            self.is_cpu = True
+
+        self.compute_fft_plans()
+        self.buffers = [
+                       BufferDescription("_d_slice", numpy.prod(self.dimrec_shape), numpy.float32, mf.READ_WRITE),
+                       BufferDescription("d_sino", self.num_projs * self.num_bins, numpy.float32, mf.READ_WRITE),  # before transferring to texture (if available)
+                       BufferDescription("d_cos", self.num_projs, numpy.float32, mf.READ_ONLY),
+                       BufferDescription("d_sin", self.num_projs, numpy.float32, mf.READ_ONLY),
+                       BufferDescription("d_axes", self.num_projs, numpy.float32, mf.READ_ONLY),
+                      ]
+        self.allocate_buffers()
+        if not(self.is_cpu):
+            self.allocate_textures()
+        self.compute_filter()
+        if self.pyfft_plan:
+            self.add_to_cl_mem({
+                "d_filter": self.d_filter,
+                "d_sino_z": self.d_sino_z
+            })
+        self.d_sino = self.cl_mem["d_sino"]  # shorthand
+        self.compute_angles()
+
+        self.local_mem = 256 * 3 * _sizeof(numpy.float32)  # constant for all image sizes
+        OpenclProcessing.compile_kernels(self, self.kernel_files)
+        # check that workgroup can actually be (16, 16)
+        self.check_workgroup_size("backproj_cpu_kernel")
+        # Workgroup and ndrange sizes are always the same
+        self.wg = (16, 16)
+        self.ndrange = (
+            _idivup(int(self.dimrec_shape[1]), 32) * self.wg[0],  # int(): pyopencl <= 2015.1
+            _idivup(int(self.dimrec_shape[0]), 32) * self.wg[1]  # int(): pyopencl <= 2015.1
+        )
+
+    def compute_angles(self):
+        if self.angles is None:
+            self.angles = numpy.linspace(0, numpy.pi, self.num_projs, False)
+        h_cos = numpy.cos(self.angles).astype(numpy.float32)
+        h_sin = numpy.sin(self.angles).astype(numpy.float32)
+        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_cos"], h_cos)
+        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_sin"], h_sin)
+        if self.axis_array:
+            pyopencl.enqueue_copy(self.queue,
+                                  self.cl_mem["d_axes"],
+                                  self.axis_array.astype(numpy.float32))
+        else:
+            pyopencl.enqueue_copy(self.queue,
+                                  self.cl_mem["d_axes"],
+                                  numpy.ones(self.num_projs, dtype=numpy.float32) * self.axis_pos)
+
+    def allocate_textures(self):
+        """
+        Allocate the texture for the sinogram.
+        """
+        self.d_sino_tex = pyopencl.Image(
+                                        self.ctx,
+                                        mf.READ_ONLY | mf.USE_HOST_PTR,
+                                        pyopencl.ImageFormat(
+                                                             pyopencl.channel_order.INTENSITY,
+                                                             pyopencl.channel_type.FLOAT
+                                                            ),
+                                        hostbuf=numpy.zeros(self.shape[::-1], dtype=numpy.float32)
+                                        )
+
+    def compute_fft_plans(self):
+        """
+        If pyfft is installed, prepare a batched 1D FFT plan for the filtering
+        of FBP
+
+        """
+        self.fft_size = nextpow2(self.num_bins * 2 - 1)
+        if _has_pyfft:
+            logger.debug("pyfft is available. Computing FFT plans...")
+            # batched 1D transform
+            self.pyfft_plan = pyfft_Plan(self.fft_size, queue=self.queue,
+                                         wait_for_finish=True)
+            self.d_sino_z = parray.zeros(self.queue,
+                                         (self.num_projs, self.fft_size),
+                                         dtype=numpy.complex64)
+            logger.debug("... done")
+        else:
+            logger.debug("pyfft not available, using numpy.fft")
+            self.pyfft_plan = None
+            # TODO: fall-back to fftw if present ?
+
+    def compute_filter(self):
+        """
+        Compute the filter for FBP
+        """
+        if self.filter_name == "Ram-Lak":
+            L = self.fft_size
+            h = numpy.zeros(L, dtype=numpy.float32)
+            L2 = L // 2 + 1
+            h[0] = 1 / 4.
+            j = numpy.linspace(1, L2, L2 // 2, False)
+            h[1:L2:2] = -1. / (numpy.pi ** 2 * j ** 2)
+            h[L2:] = numpy.copy(h[1:L2 - 1][::-1])
+        else:
+            # TODO: other filters
+            raise ValueError("Filter %s is not available" % self.filter_name)
+        self.filter = h
+        if self.pyfft_plan:
+            self.d_filter = parray.to_device(self.queue, h.astype(numpy.complex64))
+            self.pyfft_plan.execute(self.d_filter.data)
+        else:
+            self.filter = numpy.fft.fft(h).astype(numpy.complex64)
+            self.d_filter = None
+
+    def _get_local_mem(self):
+        return pyopencl.LocalMemory(self.local_mem)  # constant for all image sizes
+
+    def cpy2d_to_slice(self, dst):
+        ndrange = (int(self.slice_shape[1]), int(self.slice_shape[0]))  # pyopencl < 2015.2
+        slice_shape_ocl = numpy.int32(ndrange)
+        wg = None
+        kernel_args = (
+            dst.data,
+            self.cl_mem["_d_slice"],
+            numpy.int32(self.slice_shape[1]),
+            numpy.int32(self.dimrec_shape[1]),
+            numpy.int32((0, 0)),
+            numpy.int32((0, 0)),
+            slice_shape_ocl
+        )
+        return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args)
+
+    def transfer_to_texture(self, sino):
+        sino2 = sino
+        if not(sino.flags["C_CONTIGUOUS"] and sino.dtype == numpy.float32):
+            sino2 = numpy.ascontiguousarray(sino, dtype=numpy.float32)
+        if self.is_cpu:
+            ev = pyopencl.enqueue_copy(
+                                        self.queue,
+                                        self.d_sino,
+                                        sino2
+                                        )
+            what = "transfer filtered sino H->D buffer"
+        else:
+            ev = pyopencl.enqueue_copy(
+                                       self.queue,
+                                       self.d_sino_tex,
+                                       sino2,
+                                       origin=(0, 0),
+                                       region=self.shape[::-1]
+                                       )
+            what = "transfer filtered sino H->D texture"
+        return EventDescription(what, ev)
+
+    def transfer_device_to_texture(self, d_sino):
+        if self.is_cpu:
+            if id(self.d_sino) == id(d_sino):
+                return
+            ev = pyopencl.enqueue_copy(
+                                       self.queue,
+                                       self.d_sino,
+                                       d_sino
+                                       )
+            what = "transfer filtered sino D->D buffer"
+        else:
+            ev = pyopencl.enqueue_copy(
+                                       self.queue,
+                                       self.d_sino_tex,
+                                       d_sino,
+                                       offset=0,
+                                       origin=(0, 0),
+                                       region=self.shape[::-1]
+                                       )
+            what = "transfer filtered sino D->D texture"
+        return EventDescription(what, ev)
+
+    def backprojection(self, sino=None, dst=None):
+        """Perform the backprojection on an input sinogram
+
+        :param sino: sinogram. If provided, it returns the plain backprojection.
+        :param dst: destination (pyopencl.Array). If provided, the result will be written in this array.
+        :return: backprojection of sinogram
+        """
+        events = []
+        with self.sem:
+
+            if sino is not None:  # assuming numpy.ndarray
+                events.append(self.transfer_to_texture(sino))
+            # Prepare arguments for the kernel call
+            if self.is_cpu:
+                d_sino_ref = self.d_sino
+            else:
+                d_sino_ref = self.d_sino_tex
+            kernel_args = (
+                self.num_projs,  # num of projections (int32)
+                self.num_bins,  # num of bins (int32)
+                self.axis_pos,  # axis position (float32)
+                self.cl_mem["_d_slice"],  # d_slice (__global float32*)
+                d_sino_ref,  # d_sino (__read_only image2d_t or float*)
+                numpy.float32(0),  # gpu_offset_x (float32)
+                numpy.float32(0),  # gpu_offset_y (float32)
+                self.cl_mem["d_cos"],  # d_cos (__global float32*)
+                self.cl_mem["d_sin"],  # d_sin (__global float32*)
+                self.cl_mem["d_axes"],  # d_axis  (__global float32*)
+                self._get_local_mem()  # shared mem (__local float32*)
+            )
+            # Call the kernel
+            if self.is_cpu:
+                kernel_to_call = self.kernels.backproj_cpu_kernel
+            else:
+                kernel_to_call = self.kernels.backproj_kernel
+            event_bpj = kernel_to_call(
+                self.queue,
+                self.ndrange,
+                self.wg,
+                *kernel_args
+            )
+            if dst is None:
+                self.slice[:] = 0
+                events.append(EventDescription("backprojection", event_bpj))
+                ev = pyopencl.enqueue_copy(self.queue, self.slice,
+                                           self.cl_mem["_d_slice"])
+                events.append(EventDescription("copy D->H result", ev))
+                ev.wait()
+                res = numpy.copy(self.slice)
+                if self.dimrec_shape[0] > self.slice_shape[0] or self.dimrec_shape[1] > self.slice_shape[1]:
+                    res = res[:self.slice_shape[0], :self.slice_shape[1]]
+                # if the slice is backprojected onto a bigger grid
+                if self.slice_shape[1] > self.num_bins:
+                    res = res[:self.slice_shape[0], :self.slice_shape[1]]
+            else:
+                ev = self.cpy2d_to_slice(dst)
+                events.append(EventDescription("copy D->D result", ev))
+                ev.wait()
+                res = dst
+
+        # /with self.sem
+        if self.profile:
+            self.events += events
+
+        return res
+
+    def filter_projections(self, sino, rescale=True):
+        """
+        Performs the FBP on a given sinogram.
+
+        :param sinogram: sinogram to (filter-)backproject
+        :param rescale: if True (default), the sinogram is multiplied with
+                        (pi/n_projs)
+        """
+        if sino.shape[0] != self.num_projs or sino.shape[1] != self.num_bins:
+            raise ValueError("Expected sinogram with (projs, bins) = (%d, %d)" % (self.num_projs, self.num_bins))
+        if rescale:
+            sino = sino * numpy.pi / self.num_projs
+        events = []
+        # if pyfft is available, all can be done on the device
+        if self.d_filter is not None:
+
+            # Zero-pad the sinogram.
+            # TODO: this can be done on GPU with a "Memcpy2D":
+            #  cl.enqueue_copy(queue, dst, src, host_origin=(0,0), buffer_origin=(0,0), region=shape, host_pitches=(sino.shape[1],), buffer_pitches=(self.fft_size,))
+            # However it does not work properly, and raises an error for pyopencl < 2017.1
+            sino_zeropadded = numpy.zeros((sino.shape[0], self.fft_size), dtype=numpy.complex64)
+            sino_zeropadded[:, :self.num_bins] = sino.astype(numpy.float32)
+            sino_zeropadded = numpy.ascontiguousarray(sino_zeropadded, dtype=numpy.complex64)
+            with self.sem:
+                # send to GPU
+                ev = pyopencl.enqueue_copy(self.queue, self.d_sino_z.data, sino_zeropadded)
+                events.append(EventDescription("Send sino H->D", ev))
+
+                # FFT (in-place)
+                self.pyfft_plan.execute(self.d_sino_z.data, batch=self.num_projs)
+
+                # Multiply (complex-wise) with the the filter
+                ev = self.kernels.mult(self.queue,
+                                       tuple(int(i) for i in self.d_sino_z.shape[::-1]),
+                                       None,
+                                       self.d_sino_z.data,
+                                       self.d_filter.data,
+                                       numpy.int32(self.fft_size),
+                                       self.num_projs
+                                       )
+                events.append(EventDescription("complex 2D-1D multiplication", ev))
+                # Inverse FFT (in-place)
+                self.pyfft_plan.execute(self.d_sino_z.data, batch=self.num_projs, inverse=True)
+                # Copy the real part of d_sino_z[:, :self.num_bins] (complex64) to d_sino (float32)
+                ev = self.kernels.cpy2d_c2r(self.queue, self.shape[::-1], None,
+                                            self.d_sino,
+                                            self.d_sino_z.data,
+                                            self.num_bins,
+                                            self.num_projs,
+                                            numpy.int32(self.fft_size)
+                                            )
+                events.append(EventDescription("conversion from complex padded sinogram to sinogram", ev))
+                # debug
+#                 ev.wait()
+#                 h_sino = numpy.zeros(sino.shape, dtype=numpy.float32)
+#                 ev = pyopencl.enqueue_copy(self.queue, h_sino, self.d_sino)
+#                 ev.wait()
+#                 numpy.save("/tmp/filtered_sinogram_%s.npy" % self.ctx.devices[0].platform.name.split()[0], h_sino)
+                events.append(self.transfer_device_to_texture(self.d_sino))
+            # ------
+        else:  # no pyfft
+            sino_filtered = fourier_filter(sino, filter_=self.filter, fft_size=self.fft_size)
+            with self.sem:
+                events.append(self.transfer_to_texture(sino_filtered))
+        if self.profile:
+            self.events += events
+
+    def filtered_backprojection(self, sino):
+        """
+        Compute the filtered backprojection (FBP) on a sinogram.
+
+        :param sino: sinogram (`numpy.ndarray`) in the format (projections,
+                     bins)
+        """
+
+        self.filter_projections(sino)
+        res = self.backprojection()
+        return res
+
+    __call__ = filtered_backprojection