cuda

Package Contents

cuda.msg = could not find 'cuda' support
cuda.version
cuda.copyright
cuda.license()
class cuda.DeviceManager(**kwds)

The singleton that provides access to what is known about CUDA capable hardware

count = 0
devices = []
current_device
device(self, did=0)

Set {did} as the default device

cuda.manager
cuda.devices
cuda.device
class cuda.vector(shape=1, source=None, dtype='float64', **kwds)

cuda vector (a Python wrapper for the C/C++ cuda_vector struct)

typedef struct {
    size_t size;    // length
    char *data;     // pointer to gpu memory
    size_t nbytes;  // total bytes
    int dtype;      // use numpy type_num
} cuda_vector;

data
copy_to_host(self, target=None, type='gsl')

copy cuda vector to a host (gsl or numpy) vector; gsl.vector is double precision only, numpy.ndarray can be any type

copy_from_host(self, source)

copy from a host (gsl or numpy) vector

copy(self, other)

copy data from another vector

clone(self)

clone to a new vector

zero(self)

initialize all elements to 0

fill(self, value)

set all elements to a given value

print(self)

print elements by converting to numpy ndarray

sum(self)

summation

amin(self)

minimum value

amax(self)

maximum value

mean(self)

mean value

std(self, mean=None, ddof=1)

standard deviation :param mean: mean value :param ddof: delta degrees of freedom, or the dividing factor(n-ddof)

free(self)

force releasing gpu memory :return:

bcast(self, communicator=None, source=0)

Broadcast the given {vector} from {source} to all tasks in {communicator}

__len__(self)
__iadd__(self, other)

In-place addition with the elements of {other}

__isub__(self, other)

In-place subtraction with the elements of {other}

__imul__(self, other)

In-place scale with a factor {other}

__getitem__(self, index)

Get the value of v[index] :param index: index of the vector :return: float value (in cpu)

class cuda.matrix(shape=(1, 1), source=None, dtype='float64', **kwds)

cuda matrix ( a python wrapper for c/c++ cuda_matrix )

typedef struct {
    size_t size1;   // shape[0]
    size_t size2;   // shape[1]
    size_t size;    // total size
    char *data;     // pointer to gpu memory
    size_t nbytes;  // total bytes
    int dtype;      // use numpy type_num
} cuda_matrix;

properties:
shape[2]: shape (size1, size2)
data: PyCapsule for the C/C++ cuda_matrix object
dtype: data type as in numpy
size: shape[0]*shape[1]
submatrix
data
copy_to_host(self, target=None, type='gsl')

copy cuda matrix to host (gsl or numpy); gsl.matrix is double precision only, numpy.ndarray can be any type

copy_from_host(self, source)

copy from a gsl(host) matrix

copy(self, other)

copy data from another matrix

copy_to_device(self, out=None, dtype=None)

Copy a matrix to another gpu matrix with type conversion support :param out: pre-allocated output matrix :param dtype: output matrix data type if out is none :return: out

clone(self)

clone to a new matrix

view(self, out=None, start=(0, 0), size=None)

copy a submatrix with size=(m,n) from start=(ms, ns)

tovector(self, start=(0, 0), size=None, out=None)

view a contiguous part or the whole matrix as a vector (without data copying) :param start: tuple (row, col) as starting element :param size: number of elements in vector :return: a cuda vector of size size

get_row(self, row=0, out=None)

get one row :param row: row index :return: a cuda vector of size=columns

set_row(self, src, row=0)

set one row from a vector :param src: cuda vector :param row: row index :return: self

insert(self, src, start=(0, 0), shape=None)

insert (copy) a matrix from position start

copytile(self, src, start=(0, 0), src_start=(0, 0), shape=None)

copy a tile of matrix from src

copycols(self, dst, indices, batch=None)

copy one or more columns to another matrix, the columns to be copied are specified by indices :param dst: :param indices: :param batch: :return:

duplicateVector(self, src, size=None, incx=1)

Copy a vector to first one or few rows to this matrix :param src: cuda vector :param size: :param incx: :return:

copy_triangle(self, fill=1)

for nxn triangular matrix, copy upper triangle (fill=1) to lower, or vice versa(fill=0) :param fill: 0,1 for lower/upper filled matrix :return: self

zero(self)

initialize all elements to 0

fill(self, value)

set all elements to a given value

print(self)

print elements by first converting to a gsl (host) matrix

transpose(self, out=None)

transpose M(m,n)-> MT(n,m)

inverse_cholesky(self, out=None, uplo=1)

Matrix inverse (in place if out is not provided) for symmetric matrices only; only the lower (uplo=0) or upper (uplo=1) part is used

inverse(self, out=None)

Matrix inverse with LU :param out: output matrix if different from input :param uplo: inverse matrix store mode :return: self or out

Cholesky(self, out=None, uplo=1)

Cholesky decomposition :param out: output matrix :param uplo: store mode for output, 0/1 = lower/upper triangle :return: self or out

determinant(self, triangular=False)

matrix determinant for real symmetric matrix

log_det(self, triangular=False)

matrix log determinant for real symmetric matrix

amin(self)

minimum value

amax(self)

maximum value

mean(self, axis=None, out=None)

mean values along axis=0(row), 1(column), or all elements (None) :param axis: int or None, axis along which the means are computed. None for all elements :param out: output vector for axis=0,1 vector size = columns(axis=0), rows(axis=1) :return: mean value(s) as a vector for axis=0 or 1, or as a float for axis=None

mean_sd(self, axis=0, out=None, ddof=1)

mean and standard deviations along row or column :param axis: int or None, axis along which the means are computed. None for all elements :param out: tuple of two vectors (mean, sd), vector size is 1 (axis=None), columns(axis=0), rows(axis=1) :param ddof: delta degrees of freedom :return: tuple of two vectors

free(self)

force releasing gpu memory :return:

bcast(self, communicator=None, source=0)

Broadcast the given {matrix} from {source} to all tasks in {communicator}

__iadd__(self, other)

In-place addition with the elements of {other}

__isub__(self, other)

In-place subtraction with the elements of {other}

__imul__(self, other)

In-place scale with a factor {other}

class cuda.curand

CURAND lib utilities

CURAND_RNG_TEST = [0]
CURAND_RNG_PSEUDO_DEFAULT = 100
CURAND_RNG_PSEUDO_XORWOW = 101
CURAND_RNG_PSEUDO_MRG32K3A = 121
CURAND_RNG_PSEUDO_MTGP32 = 141
CURAND_RNG_PSEUDO_MT19937 = 142
CURAND_RNG_PSEUDO_PHILOX4_32_10 = 161
CURAND_RNG_QUASI_DEFAULT = 200
CURAND_RNG_QUASI_SOBOL32 = 201
CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202
CURAND_RNG_QUASI_SOBOL64 = 203
CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204
create_generator(gentype=None, seed=None)

allocate a curand generator

set_seed(gen, seed)

Set seed for curand generator

get_current_generator()

Find the curand generator from current device

gaussian(gen=None, out=None, dtype='float64', loc=0, scale=1, size=1)

generate Gaussian(Normal) distribution random numbers

uniform(gen=None, out=None, dtype='float64', size=1)

generate uniformly distributed random numbers in the interval (0, 1]

class cuda.cublas

Wrapper for cublas lib utilities

CUBLAS_FILL_MODE_LOWER = 0
CUBLAS_FILL_MODE_UPPER = 1
CUBLAS_FILL_MODE_FULL = 2
CUBLAS_DIAG_NON_UNIT = 0
CUBLAS_DIAG_UNIT = 1
CUBLAS_SIDE_LEFT = 0
CUBLAS_SIDE_RIGHT = 1
CUBLAS_OP_N = 0
CUBLAS_OP_T = 1
CUBLAS_OP_C = 2
CUBLAS_OP_HERMITAN = 2
CUBLAS_OP_CONJG = 3
FillModeLower = 0
FillModeUpper = 1
DiagNonUnit = 0
DiagUnit = 1
SideLeft = 0
SideRight = 1
OpNoTrans = 0
OpTrans = 1
create_handle()

create a cublas handle

get_current_handle()
axpy(alpha, x, y, handle=None, batch=None, incx=1, incy=1)

axpy : y = alpha x + y

gemm(A, B, handle=None, out=None, alpha=1.0, beta=0.0, rows=None, transa=0, transb=0)

Matrix-matrix multiplication (no complex support yet): C = alpha op(A) op(B) + beta C

Args: op(A) with shape (m, k), op(B) with shape (k, n), in row-major order
op(A) = A if transa=0, else A^T
rows - only the first rows are calculated (rows <= m)
Returns: out (C) with shape (m, n)
gemv(A, x, handle=None, out=None, trans=0, alpha=1.0, beta=0.0)

y(out) = alpha op(A) x + beta y :param A: matrix (m, n) :param x: vector with size= n/m if trans=0/1 (notrans/transpose) :param handle: cublas handle :param out: vector y with size = m/n if trans=0/1 :param trans: :param alpha: :param beta: :return: y

trmv(A, x, handle=None, uplo=1, transa=0, diag=0, incx=1, n=None)

triangular matrix-vector multiplication x = op(A) x Args: A triangular nxn, x vector n Return: x

trmm(A, B, handle=None, out=None, alpha=1.0, uplo=1, side=0, transa=0, diag=0)

triangular matrix-matrix multiplication C = A B (note: in BLAS the result overwrites B, i.e. B = A B) Args: if SideLeft, A triangular mxm, B mxn

if SideRight, A triangular nxn, B mxn

Return: out(C) m x n

symv(A, x, handle=None, uplo=1, n=None, alpha=1.0, beta=0.0, out=None)

symmetric matrix-vector multiplication y = alpha A x + beta y Args: A symmetric nxn, x vector n Return: y (out)

symm(A, B, handle=None, out=None, alpha=1.0, beta=0.0, uplo=1, side=0)

symmetric matrix-matrix multiplication C= A B (Note in blas B = A B) Args: if SideLeft A symmetric mxm, B mxn

if SideRight, A symmetric nxn B mxn

Return: out(C) m x n

class cuda.cusolverdn

Wrapper for cusolverDn lib utilities

create_handle()

create a cusolverDn handle

get_current_handle()
class cuda.timer(**kwds)

A cuda timer using cudaEvent

capsule
start(self)
stop(self)
profile(self, process, *args, **kwargs)

Profile a process :param process: :param args: :return:

cuda.current_device()

return current device