cuda¶
Submodules¶
Package Contents¶
-
cuda.msg= could not find 'cuda' support¶
-
cuda.version¶
-
cuda.copyright¶
-
cuda.license()¶
-
class
cuda.DeviceManager(**kwds)¶ The singleton that provides access to what is known about CUDA capable hardware
-
count= 0¶
-
devices= []¶
-
current_device¶
-
device(self, did=0)¶ Set {did} as the default device
-
-
cuda.manager¶
-
cuda.devices¶
-
cuda.device¶
-
class
cuda.vector(shape=1, source=None, dtype='float64', **kwds)¶ cuda vector (a python wrapper for c/c++ cuda_vector): typedef struct {
size_t size; /* length */ char *data; /* pointer to gpu memory */ size_t nbytes; /* total bytes */ int dtype; /* use numpy type_num */ } cuda_vector;
-
data¶
-
copy_to_host(self, target=None, type='gsl')¶ copy cuda vector to a host (gsl or numpy) vector; gsl.vector is double precision only, numpy.ndarray can be any type
-
copy_from_host(self, source)¶ copy from a host (gsl or numpy) vector
-
copy(self, other)¶ copy data from another vector
-
clone(self)¶ clone to a new vector
-
zero(self)¶ initialize all elements to 0
-
fill(self, value)¶ set all elements to a given value
-
print(self)¶ print elements by converting to numpy ndarray
-
sum(self)¶ summation
-
amin(self)¶ minimum value
-
amax(self)¶ maximum value
-
mean(self)¶ mean value
-
std(self, mean=None, ddof=1)¶ standard deviation :param mean: mean value :param ddof: delta degrees of freedom, or the dividing factor(n-ddof)
-
free(self)¶ force releasing gpu memory :return:
-
bcast(self, communicator=None, source=0)¶ Broadcast the given {vector} from {source} to all tasks in {communicator}
-
__len__(self)¶
-
__iadd__(self, other)¶ In-place addition with the elements of {other}
-
__isub__(self, other)¶ In-place subtraction with the elements of {other}
-
__imul__(self, other)¶ In-place scale with a factor {other}
-
__getitem__(self, index)¶ Get the value of v[index] :param index: index of the vector :return: float value (in cpu)
-
-
class
cuda.matrix(shape=(1, 1), source=None, dtype='float64', **kwds)¶ cuda matrix ( a python wrapper for c/c++ cuda_matrix )
- typedef struct {
- size_t size1; /* shape[0] */ size_t size2; /* shape[1] */ size_t size; /* total size */ char *data; /* pointer to gpu memory */ size_t nbytes; /* total bytes */ int dtype; /* use numpy type_num */
} cuda_matrix;
- properties:
- shape[2]: shape (size1, size2); data: PyCapsule for c/c++ cuda_matrix object; dtype: data type as in numpy; size: shape[0]*shape[1]
-
submatrix¶
-
data¶
-
copy_to_host(self, target=None, type='gsl')¶ copy cuda matrix to a host (gsl or numpy) matrix; gsl.matrix is double precision only, numpy.ndarray can be any type
-
copy_from_host(self, source)¶ copy from a gsl(host) matrix
-
copy(self, other)¶ copy data from another matrix
-
copy_to_device(self, out=None, dtype=None)¶ Copy a matrix to another gpu matrix with type conversion support :param out: pre-allocated output matrix :param dtype: output matrix data type if out is none :return: out
-
clone(self)¶ clone to a new matrix
-
view(self, out=None, start=(0, 0), size=None)¶ copy a submatrix with size=(m,n) from start=(ms, ns)
-
tovector(self, start=(0, 0), size=None, out=None)¶ view a contiguous part of the matrix, or the whole matrix, as a vector (without data copying) :param start: tuple (row, col) as starting element :param size: number of elements in vector :return: a cuda vector of size size
-
get_row(self, row=0, out=None)¶ get one row :param row: row index :return: a cuda vector of size=columns
-
set_row(self, src, row=0)¶ set one row from a vector :param src: cuda vector :param row: row index :return: self
-
insert(self, src, start=(0, 0), shape=None)¶ insert (copy) a matrix from position start
-
copytile(self, src, start=(0, 0), src_start=(0, 0), shape=None)¶ copy a tile of matrix from src
-
copycols(self, dst, indices, batch=None)¶ copy one or more columns to another matrix, the columns to be copied are specified by indices :param dst: :param indices: :param batch: :return:
-
duplicateVector(self, src, size=None, incx=1)¶ Copy a vector to first one or few rows to this matrix :param src: cuda vector :param size: :param incx: :return:
-
copy_triangle(self, fill=1)¶ for nxn triangular matrix, copy upper triangle (fill=1) to lower, or vice versa(fill=0) :param fill: 0,1 for lower/upper filled matrix :return: self
-
zero(self)¶ initialize all elements to 0
-
fill(self, value)¶ set all elements to a given value
-
print(self)¶ print elements by converting to gsl(host) matrix at first
-
transpose(self, out=None)¶ transpose M(m,n)-> MT(n,m)
-
inverse_cholesky(self, out=None, uplo=1)¶ Matrix inverse (in place if out is not provided), for symmetric matrices only; only the lower/upper part is used for uplo=0/1
-
inverse(self, out=None)¶ Matrix inverse with LU :param out: output matrix if different from input :param uplo: inverse matrix store mode :return: self or out
-
Cholesky(self, out=None, uplo=1)¶ Cholesky decomposition :param out: output matrix :param uplo: store mode for output, 0/1 = lower/upper triangle :return: self or out
-
determinant(self, triangular=False)¶ matrix determinant for real symmetric matrix
-
log_det(self, triangular=False)¶ matrix log determinant for real symmetric matrix
-
amin(self)¶ minimum value
-
amax(self)¶ maximum value
-
mean(self, axis=None, out=None)¶ mean values along axis=0(row), 1(column), or all elements (None) :param axis: int or None, axis along which the means are computed. None for all elements :param out: output vector for axis=0,1; vector size = columns(axis=0), rows(axis=1) :return: mean value(s) as a vector for axis=0 or 1, or as a float for axis=None
-
mean_sd(self, axis=0, out=None, ddof=1)¶ mean and standard deviations along row or column :param axis: int or None, axis along which the means are computed. None for all elements :param out: tuple of two vectors (mean, sd), vector size is 1 (axis=None), columns(axis=0), rows(axis=1) :param ddof: delta degrees of freedom :return: tuple of two vectors
-
free(self)¶ force releasing gpu memory :return:
-
bcast(self, communicator=None, source=0)¶ Broadcast the given {matrix} from {source} to all tasks in {communicator}
-
__iadd__(self, other)¶ In-place addition with the elements of {other}
-
__isub__(self, other)¶ In-place subtraction with the elements of {other}
-
__imul__(self, other)¶ In-place scale with a factor {other}
-
class
cuda.curand¶ CURAND lib utilities
-
CURAND_RNG_TEST= [0]¶
-
CURAND_RNG_PSEUDO_DEFAULT= 100¶
-
CURAND_RNG_PSEUDO_XORWOW= 101¶
-
CURAND_RNG_PSEUDO_MRG32K3A= 121¶
-
CURAND_RNG_PSEUDO_MTGP32= 141¶
-
CURAND_RNG_PSEUDO_MT19937= 142¶
-
CURAND_RNG_PSEUDO_PHILOX4_32_10= 161¶
-
CURAND_RNG_QUASI_DEFAULT= 200¶
-
CURAND_RNG_QUASI_SOBOL32= 201¶
-
CURAND_RNG_QUASI_SCRAMBLED_SOBOL32= 202¶
-
CURAND_RNG_QUASI_SOBOL64= 203¶
-
CURAND_RNG_QUASI_SCRAMBLED_SOBOL64= 204¶
-
create_generator(gentype=None, seed=None)¶ allocate a curand generator
-
set_seed(gen, seed)¶ Set seed for curand generator
-
get_current_generator()¶ Find the curand generator from current device
-
gaussian(gen=None, out=None, dtype='float64', loc=0, scale=1, size=1)¶ generate Gaussian(Normal) distribution random numbers
-
uniform(gen=None, out=None, dtype='float64', size=1)¶ generate uniform distribution random numbers (0,1]
-
-
class
cuda.cublas¶ Wrapper for cublas lib utilities
-
CUBLAS_FILL_MODE_LOWER= 0¶
-
CUBLAS_FILL_MODE_UPPER= 1¶
-
CUBLAS_FILL_MODE_FULL= 2¶
-
CUBLAS_DIAG_NON_UNIT= 0¶
-
CUBLAS_DIAG_UNIT= 1¶
-
CUBLAS_SIDE_LEFT= 0¶
-
CUBLAS_SIDE_RIGHT= 1¶
-
CUBLAS_OP_N= 0¶
-
CUBLAS_OP_T= 1¶
-
CUBLAS_OP_C= 2¶
-
CUBLAS_OP_HERMITAN= 2¶
-
CUBLAS_OP_CONJG= 3¶
-
FillModeLower= 0¶
-
FillModeUpper= 1¶
-
DiagNonUnit= 0¶
-
DiagUnit= 1¶
-
SideLeft= 0¶
-
SideRight= 1¶
-
OpNoTrans= 0¶
-
OpTrans= 1¶
-
create_handle()¶ create a cublas handle
-
get_current_handle()¶
-
axpy(alpha, x, y, handle=None, batch=None, incx=1, incy=1)¶ axpy : y = alpha x + y
-
gemm(A, B, handle=None, out=None, alpha=1.0, beta=0.0, rows=None, transa=0, transb=0)¶ Matrix-matrix multiplication (no complex support yet). Args: op(A) with shape (m, k), op(B) with shape (k, n), in row major;
op(A) = A if transa=0, else A^T; rows - only the first rows are calculated (rows <= m). Returns: out (C) with shape (m, n)
- C = alpha A B + beta C
-
gemv(A, x, handle=None, out=None, trans=0, alpha=1.0, beta=0.0)¶ y(out) = alpha op(A) x + beta y :param A: matrix (m, n) :param x: vector with size= n/m if trans=0/1 (notrans/transpose) :param handle: cublas handle :param out: vector y with size = m/n if trans=0/1 :param trans: :param alpha: :param beta: :return: y
-
trmv(A, x, handle=None, uplo=1, transa=0, diag=0, incx=1, n=None)¶ triangular matrix-vector multiplication x = op(A) x. Args: A triangular nxn, x vector n. Return: x
-
trmm(A, B, handle=None, out=None, alpha=1.0, uplo=1, side=0, transa=0, diag=0)¶ triangular matrix-matrix multiplication C = A B (Note: in blas, B = A B). Args: if SideLeft, A triangular mxm, B mxn;
if SideRight, A triangular nxn, B mxn. Return: out (C) m x n
-
symv(A, x, handle=None, uplo=1, n=None, alpha=1.0, beta=0.0, out=None)¶ symmetric matrix-vector multiplication y = alpha A x + beta y Args: A symmetric nxn, x vector n Return: x
-
symm(A, B, handle=None, out=None, alpha=1.0, beta=0.0, uplo=1, side=0)¶ symmetric matrix-matrix multiplication C = A B (Note in blas B = A B). Args: if SideLeft, A symmetric mxm, B mxn;
if SideRight, A symmetric nxn, B mxn. Return: out (C) m x n
-
-
class
cuda.cusolverdn¶ Wrapper for cusolverDn lib utilities
-
create_handle()¶ create a cusolverDn handle
-
get_current_handle()¶
-
-
class
cuda.timer(**kwds)¶ A cuda timer using cudaEvent
-
capsule¶
-
start(self)¶
-
stop(self)¶
-
profile(self, process, *args, **kwargs)¶ Profile a process :param process: :param args: :return:
-
-
cuda.current_device()¶ return current device