Source code for skcuda.misc

#!/usr/bin/env python

"""
Miscellaneous PyCUDA functions.
"""

from __future__ import absolute_import, division

import atexit
import numbers
from string import Template

import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import pycuda.elementwise as elementwise
import pycuda.reduction as reduction
import pycuda.scan as scan
import pycuda.tools as tools
from pycuda.tools import context_dependent_memoize, dtype_to_ctype
from pycuda.compiler import SourceModule
from pytools import memoize
import numpy as np

from . import cuda
from . import cublas

import sys
if sys.version_info < (3,):
    range = xrange

try:
    from . import cula
    _has_cula = True
except (ImportError, OSError):
    _has_cula = False

try:
    from . import cusolver
    _has_cusolver = True
except (ImportError, OSError):
    _has_cusolver = False

try:
    from . import magma
    _has_magma = True
except (ImportError, OSError):
    _has_magma = False

isdoubletype = lambda x : True if x == np.float64 or \
               x == np.complex128 else False
isdoubletype.__doc__ = """
Check whether a type has double precision.

Parameters
----------
t : numpy float type
    Type to test.

Returns
-------
result : bool
    Result.
"""

iscomplextype = lambda x : True if x == np.complex64 or \
                x == np.complex128 else False
iscomplextype.__doc__ = """
Check whether a type is complex.

Parameters
----------
t : numpy float type
    Type to test.

Returns
-------
result : bool
    Result.

"""

[docs]def init_device(n=0): """ Initialize a GPU device. Initialize a specified GPU device rather than the default device found by `pycuda.autoinit`. Parameters ---------- n : int Device number. Returns ------- dev : pycuda.driver.Device Initialized device. """ drv.init() dev = drv.Device(n) return dev
[docs]def init_context(dev): """ Create a context that will be cleaned up properly. Create a context on the specified device and register its pop() method with atexit. Parameters ---------- dev : pycuda.driver.Device GPU device. Returns ------- ctx : pycuda.driver.Context Created context. """ ctx = dev.make_context() atexit.register(ctx.pop) return ctx
[docs]def done_context(ctx): """ Detach from a context cleanly. Detach from a context and remove its pop() from atexit. Parameters ---------- ctx : pycuda.driver.Context Context from which to detach. """ for i in range(len(atexit._exithandlers)): if atexit._exithandlers[i][0] == ctx.pop: del atexit._exithandlers[i] break ctx.detach()
global _global_cublas_handle _global_cublas_handle = None global _global_cusolver_handle _global_cusolver_handle = None global _global_cublas_allocator _global_cublas_allocator = None
[docs]def init(allocator=drv.mem_alloc): """ Initialize libraries used by scikit-cuda. Initialize the CUBLAS, CULA, CUSOLVER, and MAGMA libraries used by high-level functions provided by scikit-cuda. Parameters ---------- allocator : an allocator used internally by some of the high-level functions. Notes ----- This function does not initialize PyCUDA; it uses whatever device and context were initialized in the current host thread. """ # CUBLAS uses whatever device is being used by the host thread: global _global_cublas_handle, _global_cublas_allocator if not _global_cublas_handle: from . import cublas # nest to avoid requiring cublas e.g. for FFT _global_cublas_handle = cublas.cublasCreate() if _global_cublas_allocator is None: _global_cublas_allocator = allocator # Initializing MAGMA after CUSOLVER causes some functions in the latter to # fail with internal errors: if _has_magma: magma.magma_init() global _global_cusolver_handle if not _global_cusolver_handle: from . import cusolver _global_cusolver_handle = cusolver.cusolverDnCreate() # culaSelectDevice() need not (and, in fact, cannot) be called # here because the host thread has already been bound to a GPU # device: if _has_cula: cula.culaInitialize()
[docs]def shutdown(): """ Shutdown libraries used by scikit-cuda. Shutdown the CUBLAS, CULA, CUSOLVER, and MAGMA libraries used by high-level functions provided by scikits-cuda. Notes ----- This function does not shutdown PyCUDA. """ global _global_cublas_handle if _global_cublas_handle: from . import cublas # nest to avoid requiring cublas e.g. for FFT cublas.cublasDestroy(_global_cublas_handle) _global_cublas_handle = None global _global_cusolver_handle if _global_cusolver_handle: from . import cusolver cusolver.cusolverDnDestroy(_global_cusolver_handle) _global_cusolver_handle = None if _has_magma: magma.magma_finalize() if _has_cula: cula.culaShutdown()
[docs]def get_compute_capability(dev): """ Get the compute capability of the specified device. Retrieve the compute capability of the specified CUDA device and return it as a floating point value. Parameters ---------- d : pycuda.driver.Device Device object to examine. Returns ------- c : float Compute capability. """ return np.float('.'.join([str(i) for i in dev.compute_capability()]))
[docs]def get_current_device(): """ Get the device in use by the current context. Returns ------- d : pycuda.driver.Device Device in use by current context. """ return drv.Device(cuda.cudaGetDevice())
@memoize def get_dev_attrs(dev): """ Get select CUDA device attributes. Retrieve select attributes of the specified CUDA device that relate to maximum thread block and grid sizes. Parameters ---------- d : pycuda.driver.Device Device object to examine. Returns ------- attrs : list List containing [MAX_THREADS_PER_BLOCK, (MAX_BLOCK_DIM_X, MAX_BLOCK_DIM_Y, MAX_BLOCK_DIM_Z), (MAX_GRID_DIM_X, MAX_GRID_DIM_Y, MAX_GRID_DIM_Z)] """ attrs = dev.get_attributes() return [attrs[drv.device_attribute.MAX_THREADS_PER_BLOCK], (attrs[drv.device_attribute.MAX_BLOCK_DIM_X], attrs[drv.device_attribute.MAX_BLOCK_DIM_Y], attrs[drv.device_attribute.MAX_BLOCK_DIM_Z]), (attrs[drv.device_attribute.MAX_GRID_DIM_X], attrs[drv.device_attribute.MAX_GRID_DIM_Y], attrs[drv.device_attribute.MAX_GRID_DIM_Z])] iceil = lambda n: int(np.ceil(n)) @memoize def select_block_grid_sizes(dev, data_shape, threads_per_block=None): """ Determine CUDA block and grid dimensions given device constraints. Determine the CUDA block and grid dimensions allowed by a GPU device that are sufficient for processing every element of an array in a separate thread. Parameters ---------- d : pycuda.driver.Device Device object to be used. data_shape : tuple Shape of input data array. Must be of length 2. threads_per_block : int, optional Number of threads to execute in each block. If this is None, the maximum number of threads per block allowed by device `d` is used. Returns ------- block_dim : tuple X, Y, and Z dimensions of minimal required thread block. grid_dim : tuple X and Y dimensions of minimal required block grid. Notes ----- Using the scheme in this function, all of the threads in the grid can be enumerated as `i = blockIdx.y*max_threads_per_block*max_blocks_per_grid+ blockIdx.x*max_threads_per_block+threadIdx.x`. For 2D shapes, the subscripts of the element `data[a, b]` where `data.shape == (A, B)` can be computed as `a = i/B` `b = mod(i,B)`. For 3D shapes, the subscripts of the element `data[a, b, c]` where `data.shape == (A, B, C)` can be computed as `a = i/(B*C)` `b = mod(i, B*C)/C` `c = mod(mod(i, B*C), C)`. For 4D shapes, the subscripts of the element `data[a, b, c, d]` where `data.shape == (A, B, C, D)` can be computed as `a = i/(B*C*D)` `b = mod(i, B*C*D)/(C*D)` `c = mod(mod(i, B*C*D)%(C*D))/D` `d = mod(mod(mod(i, B*C*D)%(C*D)), D)` It is advisable that the number of threads per block be a multiple of the warp size to fully utilize a device's computing resources. """ # Sanity checks: if np.isscalar(data_shape): data_shape = (data_shape,) # Number of elements to process; we need to cast the result of # np.prod to a Python int to prevent PyCUDA's kernel execution # framework from getting confused when N = int(np.prod(data_shape)) # Get device constraints: max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev) if threads_per_block is not None: if threads_per_block > max_threads_per_block: raise ValueError('threads per block exceeds device maximum') else: max_threads_per_block = threads_per_block # Actual number of thread blocks needed: blocks_needed = iceil(N/float(max_threads_per_block)) if blocks_needed <= max_grid_dim[0]: return (max_threads_per_block, 1, 1), (blocks_needed, 1, 1) elif blocks_needed > max_grid_dim[0] and \ blocks_needed <= max_grid_dim[0]*max_grid_dim[1]: return (max_threads_per_block, 1, 1), \ (max_grid_dim[0], iceil(blocks_needed/float(max_grid_dim[0])), 1) elif blocks_needed > max_grid_dim[0]*max_grid_dim[1] and \ blocks_needed <= max_grid_dim[0]*max_grid_dim[1]*max_grid_dim[2]: return (max_threads_per_block, 1, 1), \ (max_grid_dim[0], max_grid_dim[1], iceil(blocks_needed/float(max_grid_dim[0]*max_grid_dim[1]))) else: raise ValueError('array size too large')
[docs]def zeros(shape, dtype, order='C', allocator=drv.mem_alloc): """ Return an array of the given shape and dtype filled with zeros. Parameters ---------- shape : tuple Array shape. dtype : data-type Data type for the array. order : {'C', 'F'}, optional Create array using row-major or column-major format. allocator : callable, optional Returns an object that represents the memory allocated for the requested array. Returns ------- out : pycuda.gpuarray.GPUArray Array of zeros with the given shape, dtype, and order. Notes ----- This function exists to work around the following numpy bug that prevents pycuda.gpuarray.zeros() from working properly with complex types in pycuda 2011.1.2: http://projects.scipy.org/numpy/ticket/1898 """ out = gpuarray.GPUArray(shape, dtype, allocator, order=order) z = np.zeros((), dtype) out.fill(z) return out
[docs]def zeros_like(a): """ Return an array of zeros with the same shape and type as a given array. Parameters ---------- a : array_like The shape and data type of `a` determine the corresponding attributes of the returned array. Returns ------- out : pycuda.gpuarray.GPUArray Array of zeros with the shape, dtype, and strides of `a`. """ out = gpuarray.GPUArray(a.shape, a.dtype, drv.mem_alloc, strides=a.strides) z = np.zeros((), a.dtype) out.fill(z) return out
[docs]def ones(shape, dtype, order='C', allocator=drv.mem_alloc): """ Return an array of the given shape and dtype filled with ones. Parameters ---------- shape : tuple Array shape. dtype : data-type Data type for the array. order : {'C', 'F'}, optional Create array using row-major or column-major format. allocator : callable, optional Returns an object that represents the memory allocated for the requested array. Returns ------- out : pycuda.gpuarray.GPUArray Array of ones with the given shape, dtype, and order. """ out = gpuarray.GPUArray(shape, dtype, allocator, order=order) o = np.ones((), dtype) out.fill(o) return out
[docs]def ones_like(a): """ Return an array of ones with the same shape and type as a given array. Parameters ---------- a : array_like The shape and data type of `a` determine the corresponding attributes of the returned array. Returns ------- out : pycuda.gpuarray.GPUArray Array of ones with the shape, dtype, and strides of `other`. """ out = gpuarray.GPUArray(a.shape, a.dtype, a.allocator, strides=a.strides) o = np.ones((), a.dtype) out.fill(o) return out
[docs]def inf(shape, dtype, order='C', allocator=drv.mem_alloc): """ Return an array of the given shape and dtype filled with infs. Parameters ---------- shape : tuple Array shape. dtype : data-type Data type for the array. order : {'C', 'F'}, optional Create array using row-major or column-major format. allocator : callable, optional Returns an object that represents the memory allocated for the requested array. Returns ------- out : pycuda.gpuarray.GPUArray Array of infs with the given shape, dtype, and order. """ out = gpuarray.GPUArray(shape, dtype, allocator, order=order) i = np.array(np.inf, dtype) out.fill(i) return out
[docs]def maxabs(x_gpu): """ Get maximum absolute value. Find maximum absolute value in the specified array. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array. Returns ------- m_gpu : pycuda.gpuarray.GPUArray Array containing maximum absolute value in `x_gpu`. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import misc >>> x_gpu = gpuarray.to_gpu(np.array([-1, 2, -3], np.float32)) >>> m_gpu = misc.maxabs(x_gpu) >>> np.allclose(m_gpu.get(), 3.0) True """ try: func = maxabs.cache[x_gpu.dtype] except KeyError: ctype = tools.dtype_to_ctype(x_gpu.dtype) use_double = int(x_gpu.dtype in [np.float64, np.complex128]) ret_type = np.float64 if use_double else np.float32 func = reduction.ReductionKernel(ret_type, neutral="0", reduce_expr="max(a,b)", map_expr="abs(x[i])", arguments="{ctype} *x".format(ctype=ctype)) maxabs.cache[x_gpu.dtype] = func return func(x_gpu)
maxabs.cache = {}
[docs]def cumsum(x_gpu): """ Cumulative sum. Return the cumulative sum of the elements in the specified array. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array. Returns ------- c_gpu : pycuda.gpuarray.GPUArray Output array containing cumulative sum of `x_gpu`. Notes ----- Higher dimensional arrays are implicitly flattened row-wise by this function. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import misc >>> x_gpu = gpuarray.to_gpu(np.random.rand(5).astype(np.float32)) >>> c_gpu = misc.cumsum(x_gpu) >>> np.allclose(c_gpu.get(), np.cumsum(x_gpu.get())) True """ try: func = cumsum.cache[x_gpu.dtype] except KeyError: func = scan.InclusiveScanKernel(x_gpu.dtype, 'a+b', preamble='#include <pycuda-complex.hpp>') cumsum.cache[x_gpu.dtype] = func return func(x_gpu)
cumsum.cache = {}
[docs]def diff(x_gpu): """ Calculate the discrete difference. Calculates the first order difference between the successive entries of a vector. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input vector. Returns ------- y_gpu : pycuda.gpuarray.GPUArray Discrete difference. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import misc >>> x = np.asarray(np.random.rand(5), np.float32) >>> x_gpu = gpuarray.to_gpu(x) >>> y_gpu = misc.diff(x_gpu) >>> np.allclose(np.diff(x), y_gpu.get()) True """ y_gpu = gpuarray.empty(len(x_gpu)-1, x_gpu.dtype) try: func = diff.cache[x_gpu.dtype] except KeyError: ctype = tools.dtype_to_ctype(x_gpu.dtype) func = elementwise.ElementwiseKernel("{ctype} *a, {ctype} *b".format(ctype=ctype), "b[i] = a[i+1]-a[i]") diff.cache[x_gpu.dtype] = func func(x_gpu, y_gpu) return y_gpu
diff.cache = {} # List of available numerical types provided by numpy: num_types = [np.typeDict[t] for t in \ np.typecodes['AllInteger']+np.typecodes['AllFloat']] # Numbers of bytes occupied by each numerical type: num_nbytes = dict((np.dtype(t),t(1).nbytes) for t in num_types)
[docs]def set_realloc(x_gpu, data): """ Transfer data into a GPUArray instance. Copies the contents of a numpy array into a GPUArray instance. If the array has a different type or dimensions than the instance, the GPU memory used by the instance is reallocated and the instance updated appropriately. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray GPUArray instance to modify. data : numpy.ndarray Array of data to transfer to the GPU. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import misc >>> x = np.asarray(np.random.rand(5), np.float32) >>> x_gpu = gpuarray.to_gpu(x) >>> x = np.asarray(np.random.rand(10, 1), np.float64) >>> set_realloc(x_gpu, x) >>> np.allclose(x, x_gpu.get()) True """ # Only reallocate if absolutely necessary: if x_gpu.shape != data.shape or x_gpu.size != data.size or \ x_gpu.strides != data.strides or x_gpu.dtype != data.dtype: # Free old memory: x_gpu.gpudata.free() # Allocate new memory: nbytes = num_nbytes[data.dtype] x_gpu.gpudata = drv.mem_alloc(nbytes*data.size) # Set array attributes: x_gpu.shape = data.shape x_gpu.size = data.size x_gpu.strides = data.strides x_gpu.dtype = data.dtype # Update the GPU memory: x_gpu.set(data)
[docs]def get_by_index(src_gpu, ind): """ Get values in a GPUArray by index. Parameters ---------- src_gpu : pycuda.gpuarray.GPUArray GPUArray instance from which to extract values. ind : pycuda.gpuarray.GPUArray or numpy.ndarray Array of element indices to set. Must have an integer dtype. Returns ------- res_gpu : pycuda.gpuarray.GPUArray GPUArray with length of `ind` and dtype of `src_gpu` containing selected values. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import misc >>> src = np.random.rand(5).astype(np.float32) >>> src_gpu = gpuarray.to_gpu(src) >>> ind = gpuarray.to_gpu(np.array([0, 2, 4])) >>> res_gpu = misc.get_by_index(src_gpu, ind) >>> np.allclose(res_gpu.get(), src[[0, 2, 4]]) True Notes ----- Only supports 1D index arrays. May not be efficient for certain index patterns because of lack of inability to coalesce memory operations. """ # Only support 1D index arrays: assert len(np.shape(ind)) == 1 assert issubclass(ind.dtype.type, numbers.Integral) N = len(ind) if not isinstance(ind, gpuarray.GPUArray): ind = gpuarray.to_gpu(ind) dest_gpu = gpuarray.empty(N, dtype=src_gpu.dtype) # Manually handle empty index array because it will cause the kernel to # fail if processed: if N == 0: return dest_gpu try: func = get_by_index.cache[(src_gpu.dtype, ind.dtype)] except KeyError: data_ctype = tools.dtype_to_ctype(src_gpu.dtype) ind_ctype = tools.dtype_to_ctype(ind.dtype) v = "{data_ctype} *dest, {ind_ctype} *ind, {data_ctype} *src".format(data_ctype=data_ctype, ind_ctype=ind_ctype) func = elementwise.ElementwiseKernel(v, "dest[i] = src[ind[i]]") get_by_index.cache[(src_gpu.dtype, ind.dtype)] = func func(dest_gpu, ind, src_gpu, range=slice(0, N, 1)) return dest_gpu
get_by_index.cache = {}
[docs]def set_by_index(dest_gpu, ind, src_gpu, ind_which='dest'): """ Set values in a GPUArray by index. Parameters ---------- dest_gpu : pycuda.gpuarray.GPUArray GPUArray instance to modify. ind : pycuda.gpuarray.GPUArray or numpy.ndarray 1D array of element indices to set. Must have an integer dtype. src_gpu : pycuda.gpuarray.GPUArray GPUArray instance from which to set values. ind_which : str If set to 'dest', set the elements in `dest_gpu` with indices `ind` to the successive values in `src_gpu`; the lengths of `ind` and `src_gpu` must be equal. If set to 'src', set the successive values in `dest_gpu` to the values in `src_gpu` with indices `ind`; the lengths of `ind` and `dest_gpu` must be equal. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import misc >>> dest_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32)) >>> ind = gpuarray.to_gpu(np.array([0, 2, 4])) >>> src_gpu = gpuarray.to_gpu(np.array([1, 1, 1], dtype=np.float32)) >>> misc.set_by_index(dest_gpu, ind, src_gpu, 'dest') >>> np.allclose(dest_gpu.get(), np.array([1, 1, 1, 3, 1], dtype=np.float32)) True >>> dest_gpu = gpuarray.to_gpu(np.zeros(3, dtype=np.float32)) >>> ind = gpuarray.to_gpu(np.array([0, 2, 4])) >>> src_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32)) >>> misc.set_by_index(dest_gpu, ind, src_gpu) >>> np.allclose(dest_gpu.get(), np.array([0, 2, 4], dtype=np.float32)) True Notes ----- Only supports 1D index arrays. May not be efficient for certain index patterns because of lack of inability to coalesce memory operations. """ # Only support 1D index arrays: assert len(np.shape(ind)) == 1 assert dest_gpu.dtype == src_gpu.dtype assert issubclass(ind.dtype.type, numbers.Integral) N = len(ind) # Manually handle empty index array because it will cause the kernel to # fail if processed: if N == 0: return if ind_which == 'dest': assert N == len(src_gpu) elif ind_which == 'src': assert N == len(dest_gpu) else: raise ValueError('invalid value for `ind_which`') if not isinstance(ind, gpuarray.GPUArray): ind = gpuarray.to_gpu(ind) try: func = set_by_index.cache[(dest_gpu.dtype, ind.dtype, ind_which)] except KeyError: data_ctype = tools.dtype_to_ctype(dest_gpu.dtype) ind_ctype = tools.dtype_to_ctype(ind.dtype) v = "{data_ctype} *dest, {ind_ctype} *ind, {data_ctype} *src".format(data_ctype=data_ctype, ind_ctype=ind_ctype) if ind_which == 'dest': func = elementwise.ElementwiseKernel(v, "dest[ind[i]] = src[i]") else: func = elementwise.ElementwiseKernel(v, "dest[i] = src[ind[i]]") set_by_index.cache[(dest_gpu.dtype, ind.dtype, ind_which)] = func func(dest_gpu, ind, src_gpu, range=slice(0, N, 1))
set_by_index.cache = {} @context_dependent_memoize def _get_binaryop_vecmat_kernel(dtype, binary_op): template = Template(""" #include <pycuda-complex.hpp> __global__ void opColVecToMat(const ${type} *mat, const ${type} *vec, ${type} *out, const int n, const int m){ const int tx = threadIdx.x; const int ty = threadIdx.y; const int tidx = blockIdx.x * blockDim.x + threadIdx.x; const int tidy = blockIdx.y * blockDim.y + threadIdx.y; extern __shared__ ${type} shared_vec[]; if ((ty == 0) & (tidx < n)) shared_vec[tx] = vec[tidx]; __syncthreads(); if ((tidy < m) & (tidx < n)) { out[tidx*m+tidy] = mat[tidx*m+tidy] ${binary_op} shared_vec[tx]; } } __global__ void opRowVecToMat(const ${type}* mat, const ${type}* vec, ${type}* out, const int n, const int m){ const int tx = threadIdx.x; const int ty = threadIdx.y; const int tidx = blockIdx.x * blockDim.x + threadIdx.x; const int tidy = blockIdx.y * blockDim.y + threadIdx.y; extern __shared__ ${type} shared_vec[]; if ((tx == 0) & (tidy < m)) shared_vec[ty] = vec[tidy]; __syncthreads(); if ((tidy < m) & (tidx < n)) { out[tidx*m+tidy] = mat[tidx*m+tidy] ${binary_op} shared_vec[ty]; } }""") cache_dir=None ctype = dtype_to_ctype(dtype) tmpl = template.substitute(type=ctype, binary_op=binary_op) mod = SourceModule(tmpl) add_row_vec_kernel = mod.get_function('opRowVecToMat') add_col_vec_kernel = mod.get_function('opColVecToMat') return add_row_vec_kernel, add_col_vec_kernel def binaryop_matvec(binary_op, x_gpu, a_gpu, axis=None, out=None, stream=None): """ Applies a binary operation to a vector and each column/row of a matrix. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` op `a_gpu.get()` in host-code. Parameters ---------- binary_op : string, ['+', '-', '/', '*' '%'] The operator to apply x_gpu : pycuda.gpuarray.GPUArray Matrix to which to add the vector. a_gpu : pycuda.gpuarray.GPUArray Vector to add to `x_gpu`. axis : int (optional) The axis onto which the vector is added. By default this is determined automatically by using the first axis with the correct dimensionality. out : pycuda.gpuarray.GPUArray (optional) Optional destination matrix. stream : pycuda.driver.Stream (optional) Optional Stream in which to perform this calculation. Returns ------- out : pycuda.gpuarray.GPUArray result of `x_gpu` + `a_gpu` """ if axis is None: if len(a_gpu.shape) == 1: if a_gpu.shape[0] == x_gpu.shape[1]: axis = 1 else: raise ValueError( "operands could not be broadcast together " "with shapes %s %s" % (x_gpu.shape, a_gpu.shape)) elif a_gpu.shape[1] == x_gpu.shape[1]: # numpy matches inner axes first axis = 1 elif a_gpu.shape[0] == x_gpu.shape[0]: axis = 0 else: raise ValueError( "operands could not be broadcast together " "with shapes %s %s" % (x_gpu.shape, a_gpu.shape)) else: if axis < 0: axis += 2 if axis > 1: raise ValueError('invalid axis') if binary_op not in ['+', '-', '/', '*', '%']: raise ValueError('invalid operator') row_kernel, col_kernel = _get_binaryop_vecmat_kernel(x_gpu.dtype, binary_op) n, m = np.int32(x_gpu.shape[0]), np.int32(x_gpu.shape[1]) block = (24, 24, 1) gridx = int(n // block[0] + 1 * (n % block[0] != 0)) gridy = int(m // block[1] + 1 * (m % block[1] != 0)) grid = (gridx, gridy, 1) if out is None: alloc = _global_cublas_allocator out = gpuarray.empty_like(x_gpu) else: assert out.dtype == x_gpu.dtype assert out.shape == x_gpu.shape if x_gpu.flags.c_contiguous: if axis == 0: col_kernel(x_gpu, a_gpu, out, n, m, block=block, grid=grid, stream=stream, shared=24*x_gpu.dtype.itemsize) elif axis == 1: row_kernel(x_gpu, a_gpu, out, n, m, block=block, grid=grid, stream=stream, shared=24*x_gpu.dtype.itemsize) else: if axis == 0: row_kernel(x_gpu, a_gpu, out, m, n, block=block, grid=grid, stream=stream, shared=24*x_gpu.dtype.itemsize) elif axis == 1: col_kernel(x_gpu, a_gpu, out, m, n, block=block, grid=grid, stream=stream, shared=24*x_gpu.dtype.itemsize) return out import operator def binaryop_2d(c_op, py_op, commutative, x_gpu, y_gpu): if x_gpu.flags.c_contiguous != y_gpu.flags.c_contiguous: raise ValueError('unsupported combination of input order') if x_gpu.shape == y_gpu.shape: return py_op(x_gpu, y_gpu) elif x_gpu.size == 1: return py_op(x_gpu.get().reshape(()), y_gpu) elif y_gpu.size == 1: return py_op(x_gpu, y_gpu.get().reshape(())) if len(x_gpu.shape) == 2: m, n = x_gpu.shape if y_gpu.shape == (n,): return binaryop_matvec(c_op, x_gpu, y_gpu, axis=1) elif y_gpu.shape == (1, n): return binaryop_matvec(c_op, x_gpu, y_gpu[0], axis=1) elif y_gpu.shape == (m, 1): return binaryop_matvec(c_op, x_gpu, y_gpu.ravel(), axis=0) if len(y_gpu.shape) == 2 and commutative: m, n = y_gpu.shape if x_gpu.shape == (n,): return binaryop_matvec(c_op, y_gpu, x_gpu, axis=1) elif x_gpu.shape == (1, n): return binaryop_matvec(c_op, y_gpu, x_gpu[0], axis=1) elif x_gpu.shape == (m, 1): return binaryop_matvec(c_op, y_gpu, x_gpu.ravel(), axis=0) raise TypeError("unsupported combination of shapes")
[docs]def add(x_gpu, y_gpu): """ Adds two scalars, vectors, or matrices. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` + `y_gpu.get()` in host code. Parameters ---------- x_gpu, y_gpu : pycuda.gpuarray.GPUArray The arrays to be added. Returns ------- out : pycuda.gpuarray.GPUArray Equivalent to `x_gpu.get()` + `y_gpu.get()`. Notes ----- The `out` and `stream` options are not supported because `GPUArray.__add__` doesn't provide them. """ return binaryop_2d("+", operator.add, True, x_gpu, y_gpu)
[docs]def subtract(x_gpu, y_gpu): """ Subtracts two scalars, vectors, or matrices with broadcasting. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` - `y_gpu.get()` in host code. Parameters ---------- x_gpu, y_gpu : pycuda.gpuarray.GPUArray The arrays to be subtracted. Returns ------- out : pycuda.gpuarray.GPUArray Equivalent to `x_gpu.get()` - `y_gpu.get()`. Notes ----- The `out` and `stream` options are not supported because `GPUArray.__sub__` doesn't provide them. """ return binaryop_2d("-", operator.sub, False, x_gpu, y_gpu)
[docs]def multiply(x_gpu, y_gpu): """ Multiplies two scalars, vectors, or matrices with broadcasting. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` * `y_gpu.get()` in host code. Parameters ---------- x_gpu, y_gpu : pycuda.gpuarray.GPUArray The arrays to be multiplied. Returns ------- out : pycuda.gpuarray.GPUArray Equivalent to `x_gpu.get()` * `y_gpu.get()`. Notes ----- The `out` and `stream` options are not supported because `GPUArray.__mul__` doesn't provide them. """ return binaryop_2d("*", operator.mul, True, x_gpu, y_gpu)
[docs]def divide(x_gpu, y_gpu): """ Divides two scalars, vectors, or matrices with broadcasting. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` / `y_gpu.get()` in host code. Parameters ---------- x_gpu, y_gpu : pycuda.gpuarray.GPUArray The arrays to be divided. Returns ------- out : pycuda.gpuarray.GPUArray Equivalent to `x_gpu.get()` / `y_gpu.get()`. Notes ----- The `out` and `stream` options are not supported because `GPUArray.__div__` doesn't provide them. """ return binaryop_2d("/", operator.truediv, False, x_gpu, y_gpu)
[docs]def add_matvec(x_gpu, a_gpu, axis=None, out=None, stream=None): """ Adds a vector to each column/row of the matrix. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` + `a_gpu.get()` in host-code. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Matrix to which to add the vector. a_gpu : pycuda.gpuarray.GPUArray Vector to add to `x_gpu`. axis : int (optional) The axis onto which the vector is added. By default this is determined automatically by using the first axis with the correct dimensionality. out : pycuda.gpuarray.GPUArray (optional) Optional destination matrix. stream : pycuda.driver.Stream (optional) Optional Stream in which to perform this calculation. Returns ------- out : pycuda.gpuarray.GPUArray Result of `x_gpu` + `a_gpu` """ return binaryop_matvec('+', x_gpu, a_gpu, axis, out, stream)
[docs]def div_matvec(x_gpu, a_gpu, axis=None, out=None, stream=None): """ Divides each column/row of a matrix by a vector. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` / `a_gpu.get()` in host-code. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Matrix to divide by the vector `a_gpu`. a_gpu : pycuda.gpuarray.GPUArray The matrix `x_gpu` will be divided by this vector. axis : int (optional) The axis on which division occurs. By default this is determined automatically by using the first axis with the correct dimensionality. out : pycuda.gpuarray.GPUArray (optional) Optional destination matrix. stream : pycuda.driver.Stream (optional) Optional Stream in which to perform this calculation. Returns ------- out : pycuda.gpuarray.GPUArray result of `x_gpu` / `a_gpu` """ return binaryop_matvec('/', x_gpu, a_gpu, axis, out, stream)
[docs]def mult_matvec(x_gpu, a_gpu, axis=None, out=None, stream=None): """ Multiplies a vector elementwise with each column/row of the matrix. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` * `a_gpu.get()` in host-code. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Matrix to multiply by the vector `a_gpu`. a_gpu : pycuda.gpuarray.GPUArray The matrix `x_gpu` will be multiplied by this vector. axis : int (optional) The axis on which multiplication occurs. By default this is determined automatically by using the first axis with the correct dimensionality. out : pycuda.gpuarray.GPUArray (optional) Optional destination matrix. stream : pycuda.driver.Stream (optional) Optional Stream in which to perform this calculation. Returns ------- out : pycuda.gpuarray.GPUArray result of `x_gpu` * `a_gpu` """ return binaryop_matvec('*', x_gpu, a_gpu, axis, out, stream)
def _sum_axis(x_gpu, axis=None, out=None, calc_mean=False, ddof=0, keepdims=False): global _global_cublas_allocator assert isinstance(ddof, numbers.Integral) if axis is None or len(x_gpu.shape) <= 1: out_shape = (1,)*len(x_gpu.shape) if keepdims else () if calc_mean == False: return gpuarray.sum(x_gpu).reshape(out_shape) else: return gpuarray.sum(x_gpu).reshape(out_shape) / (x_gpu.dtype.type(x_gpu.size-ddof)) if axis < 0: axis += 2 if axis > 1: raise ValueError('invalid axis') if x_gpu.flags.c_contiguous: n, m = x_gpu.shape[1], x_gpu.shape[0] lda = x_gpu.shape[1] trans = "n" if axis == 0 else "t" sum_axis, out_axis = (m, n) if axis == 0 else (n, m) else: n, m = x_gpu.shape[0], x_gpu.shape[1] lda = x_gpu.shape[0] trans = "t" if axis == 0 else "n" sum_axis, out_axis = (n, m) if axis == 0 else (m, n) if calc_mean: alpha = (1.0 / (sum_axis-ddof)) else: alpha = 1.0 if (x_gpu.dtype == np.complex64): gemv = cublas.cublasCgemv elif (x_gpu.dtype == np.float32): gemv = cublas.cublasSgemv elif (x_gpu.dtype == np.complex128): gemv = cublas.cublasZgemv elif (x_gpu.dtype == np.float64): gemv = cublas.cublasDgemv alloc = _global_cublas_allocator ons = ones((sum_axis, ), x_gpu.dtype, allocator=alloc) if keepdims: out_shape = (1, out_axis) if axis == 0 else (out_axis, 1) else: out_shape = (out_axis,) if out is None: out = gpuarray.empty(out_shape, x_gpu.dtype, alloc) else: assert out.dtype == x_gpu.dtype assert out.size >= out_axis gemv(_global_cublas_handle, trans, n, m, alpha, x_gpu.gpudata, lda, ons.gpudata, 1, 0.0, out.gpudata, 1) return out
[docs]def sum(x_gpu, axis=None, out=None, keepdims=False): """ Compute the sum along the specified axis. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Array containing numbers whose sum is desired. axis : int (optional) Axis along which the sums are computed. The default is to compute the sum of the flattened array. out : pycuda.gpuarray.GPUArray (optional) Output array in which to place the result. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray sum of elements, or sums of elements along the desired axis. """ return _sum_axis(x_gpu, axis, out=out, keepdims=keepdims)
[docs]def mean(x_gpu, axis=None, out=None, keepdims=False): """ Compute the arithmetic means along the specified axis. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Array containing numbers whose mean is desired. axis : int (optional) Axis along which the means are computed. The default is to compute the mean of the flattened array. out : pycuda.gpuarray.GPUArray (optional) Output array in which to place the result. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray mean of elements, or means of elements along the desired axis. """ return _sum_axis(x_gpu, axis, calc_mean=True, out=out, keepdims=keepdims)
[docs]def var(x_gpu, ddof=0, axis=None, stream=None, keepdims=False): """ Compute the variance along the specified axis. Returns the variance of the array elements, a measure of the spread of a distribution. The variance is computed for the flattened array by default, otherwise over the specified axis. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Array containing numbers whose variance is desired. ddof : int (optional) "Delta Degrees of Freedom": the divisor used in computing the variance is ``N - ddof``, where ``N`` is the number of elements. Setting ``ddof = 1`` is equivalent to applying Bessel's correction. axis : int (optional) Axis along which the variance are computed. The default is to compute the variance of the flattened array. stream : pycuda.driver.Stream (optional) Optional CUDA stream in which to perform this calculation keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray variance of elements, or variances of elements along the desired axis. """ def _inplace_pow(x_gpu, p, stream): func = elementwise.get_pow_kernel(x_gpu.dtype) func.prepared_async_call(x_gpu._grid, x_gpu._block, stream, p, x_gpu.gpudata, x_gpu.gpudata, x_gpu.mem_size) if axis is None: m = mean(x_gpu).get() out = x_gpu - m out **= 2 out = _sum_axis(out, axis=None, calc_mean=True, ddof=ddof, out=None, keepdims=keepdims) else: if axis < 0: axis += 2 m = mean(x_gpu, axis=axis) out = add_matvec(x_gpu, -m, axis=1-axis, stream=stream) _inplace_pow(out, 2, stream) out = _sum_axis(out, axis=axis, calc_mean=True, ddof=ddof, out=None, keepdims=keepdims) return out
[docs]def std(x_gpu, ddof=0, axis=None, stream=None, keepdims=False): """ Compute the standard deviation along the specified axis. Returns the standard deviation of the array elements, a measure of the spread of a distribution. The standard deviation is computed for the flattened array by default, otherwise over the specified axis. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Array containing numbers whose std is desired. ddof : int (optional) "Delta Degrees of Freedom": the divisor used in computing the variance is ``N - ddof``, where ``N`` is the number of elements. Setting ``ddof = 1`` is equivalent to applying Bessel's correction. axis : int (optional) Axis along which the std are computed. The default is to compute the std of the flattened array. stream : pycuda.driver.Stream (optional) Optional CUDA stream in which to perform this calculation keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray or float std of elements, or stds of elements along the desired axis. """ def _inplace_pow(x_gpu, p, stream): func = elementwise.get_pow_kernel(x_gpu.dtype) func.prepared_async_call(x_gpu._grid, x_gpu._block, stream, p, x_gpu.gpudata, x_gpu.gpudata, x_gpu.mem_size) if axis is None: return var(x_gpu, ddof=ddof, stream=stream, keepdims=keepdims) ** 0.5 else: out = var(x_gpu, ddof=ddof, axis=axis, stream=stream, keepdims=keepdims) _inplace_pow(out, 0.5, stream) return out
@context_dependent_memoize def _get_minmax_kernel(dtype, min_or_max): template = Template(""" #include <pycuda-complex.hpp> __global__ void minmax_column_kernel(${type}* mat, ${type}* target, unsigned int *idx_target, unsigned int width, unsigned int height) { __shared__ ${type} max_vals[32]; __shared__ unsigned int max_idxs[32]; ${type} cur_max = ${init_value}; unsigned int cur_idx = 0; ${type} val = 0; for (unsigned int i = threadIdx.x; i < height; i += 32) { val = mat[blockIdx.x + i * width]; if (val ${cmp_op} cur_max) { cur_max = val; cur_idx = i; } } max_vals[threadIdx.x] = cur_max; max_idxs[threadIdx.x] = cur_idx; __syncthreads(); if (threadIdx.x == 0) { cur_max = ${init_value}; cur_idx = 0; for (unsigned int i = 0; i < 32; i++) if (max_vals[i] ${cmp_op} cur_max) { cur_max = max_vals[i]; cur_idx = max_idxs[i]; } target[blockIdx.x] = cur_max; idx_target[blockIdx.x] = cur_idx; } } __global__ void minmax_row_kernel(${type}* mat, ${type}* target, unsigned int* idx_target, unsigned int width, unsigned int height) { __shared__ ${type} max_vals[32]; __shared__ unsigned int max_idxs[32]; ${type} cur_max = ${init_value}; unsigned int cur_idx = 0; ${type} val = 0; for (unsigned int i = threadIdx.x; i < width; i += 32) { val = mat[blockIdx.x * width + i]; if (val ${cmp_op} cur_max) { cur_max = val; cur_idx = i; } } max_vals[threadIdx.x] = cur_max; max_idxs[threadIdx.x] = cur_idx; __syncthreads(); if (threadIdx.x == 0) { cur_max = ${init_value}; cur_idx = 0; for (unsigned int i = 0; i < 32; i++) if (max_vals[i] ${cmp_op} cur_max) { cur_max = max_vals[i]; cur_idx = max_idxs[i]; } target[blockIdx.x] = cur_max; idx_target[blockIdx.x] = cur_idx; } } """) cache_dir=None ctype = dtype_to_ctype(dtype) if min_or_max=='max': iv = str(np.finfo(dtype).min) tmpl = template.substitute(type=ctype, cmp_op='>', init_value=iv) elif min_or_max=='min': iv = str(np.finfo(dtype).max) tmpl = template.substitute(type=ctype, cmp_op='<', init_value=iv) else: raise ValueError('invalid argument') mod = SourceModule(tmpl) minmax_col_kernel = mod.get_function('minmax_column_kernel') minmax_row_kernel = mod.get_function('minmax_row_kernel') return minmax_col_kernel, minmax_row_kernel def _minmax_impl(a_gpu, axis, min_or_max, stream=None, keepdims=False): ''' Returns both max and argmax (min/argmin) along an axis.''' assert len(a_gpu.shape) < 3 if iscomplextype(a_gpu.dtype): raise ValueError("Cannot compute min/max of complex values") if axis is None or len(a_gpu.shape) <= 1: ## Note: PyCUDA doesn't have an overall argmax/argmin! out_shape = (1,) * len(a_gpu.shape) if min_or_max == 'max': return gpuarray.max(a_gpu).reshape(out_shape), None else: return gpuarray.min(a_gpu).reshape(out_shape), None else: if axis < 0: axis += 2 assert axis in (0, 1) global _global_cublas_allocator alloc = _global_cublas_allocator n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0]) col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max) if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous): if keepdims: out_shape = (1, m) if axis == 0 else (m, 1) else: out_shape = (m,) target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc) idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc) col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n), block=(32, 1, 1), grid=(m, 1, 1), stream=stream) else: if keepdims: out_shape = (1, n) if axis == 0 else (n, 1) else: out_shape = (n,) target = gpuarray.empty(out_shape, dtype=a_gpu, allocator=alloc) idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc) row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n), block=(32, 1, 1), grid=(n, 1, 1), stream=stream) return target, idx
[docs]def max(a_gpu, axis=None, keepdims=False): ''' Return the maximum of an array or maximum along an axis. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input array axis : int (optional) Axis along which the maxima are computed. The default is to compute the maximum of the flattened array. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray or float maximum of elements, or maxima of elements along the desired axis. ''' return _minmax_impl(a_gpu, axis, "max", keepdims=keepdims)[0]
[docs]def min(a_gpu, axis=None, keepdims=False): ''' Return the minimum of an array or minimum along an axis. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input array axis : int (optional) Axis along which the minima are computed. The default is to compute the minimum of the flattened array. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray or float minimum of elements, or minima of elements along the desired axis. ''' return _minmax_impl(a_gpu, axis, "min", keepdims=keepdims)[0]
[docs]def argmax(a_gpu, axis, keepdims=False): ''' Indices of the maximum values along an axis. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input array axis : int Axis along which the maxima are computed. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray Array of indices into the array. ''' if axis is None: raise NotImplementedError("Can't compute global argmax") return _minmax_impl(a_gpu, axis, "max", keepdims=keepdims)[1]
[docs]def argmin(a_gpu, axis, keepdims=False): ''' Indices of the minimum values along an axis. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input array axis : int Axis along which the minima are computed. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray Array of indices into the array. ''' if axis is None: raise NotImplementedError("Can't compute global argmax") return _minmax_impl(a_gpu, axis, "min", keepdims=keepdims)[1]
if __name__ == "__main__": import doctest doctest.testmod()