cg_raw_8cu-example.html

#include <cusp/csr_matrix.h>

#include <cusp/monitor.h>

#include <cusp/multiply.h>

#include <cusp/print.h>

#include <cusp/krylov/cg.h>


#ifdef __CUDACC__

#include <cuda_runtime_api.h>

#else

#include <cstdlib>

#include <cstring>

#endif


// This example shows how to solve a linear system A * x = b

// where the matrix A and vectors x and b are stored in

// "raw" memory on the device.  The matrix A will be wrapped

// with a coo_matrix_view while the vectors x and b will be

// wrapped with array1d_views.

//

// Views allow you to interface Cusp's solvers with

// data that is managed externally without needing to

// copy the data into a Cusp matrix container.

//

//  Example Matrix:

//   [ 2 -1  0  0]

//   [-1  2 -1  0]

//   [ 0 -1  2 -1]

//   [ 0  0 -1  2]


int main(void)

{

    // COO format in host memory

    int   host_I[10] = { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; // COO row indices

    int   host_J[10] = { 0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; // COO column indices

    float host_V[10] = { 2,-1,-1, 2,-1,-1, 2,-1,-1, 2}; // COO values


    // x and b arrays in host memory

    float host_x[4] = {0,0,0,0};

    float host_b[4] = {1,2,2,1};


    // allocate memory

    int   * device_I;

    int   * device_J;

    float * device_V;

    float * device_x;

    float * device_b;

#ifdef __CUDACC__

    cudaMalloc(&device_I, 10 * sizeof(int));

    cudaMalloc(&device_J, 10 * sizeof(int));

    cudaMalloc(&device_V, 10 * sizeof(float));

    cudaMalloc(&device_x,  4 * sizeof(float));

    cudaMalloc(&device_b,  4 * sizeof(float));


    cudaMemcpy(device_I, host_I, 10 * sizeof(int),   cudaMemcpyHostToDevice);

    cudaMemcpy(device_J, host_J, 10 * sizeof(int),   cudaMemcpyHostToDevice);

    cudaMemcpy(device_V, host_V, 10 * sizeof(float), cudaMemcpyHostToDevice);

    cudaMemcpy(device_x, host_x,  4 * sizeof(float), cudaMemcpyHostToDevice);

    cudaMemcpy(device_b, host_b,  4 * sizeof(float), cudaMemcpyHostToDevice);


    thrust::device_ptr<int>   wrapped_I(device_I);

    thrust::device_ptr<int>   wrapped_J(device_J);

    thrust::device_ptr<float> wrapped_V(device_V);

    thrust::device_ptr<float> wrapped_x(device_x);

    thrust::device_ptr<float> wrapped_b(device_b);


    typedef typename cusp::array1d_view< thrust::device_ptr<int>   > IndexArrayView;

    typedef typename cusp::array1d_view< thrust::device_ptr<float> > ValueArrayView;

#else

    device_I = (int*)  malloc(10 * sizeof(int));

    device_J = (int*)  malloc(10 * sizeof(int));

    device_V = (float*)malloc(10 * sizeof(float));

    device_x = (float*)malloc( 4 * sizeof(float));

    device_b = (float*)malloc( 4 * sizeof(float));


    memcpy(device_I, host_I, 10 * sizeof(int));

    memcpy(device_J, host_J, 10 * sizeof(int));

    memcpy(device_V, host_V, 10 * sizeof(float));

    memcpy(device_x, host_x,  4 * sizeof(float));

    memcpy(device_b, host_b,  4 * sizeof(float));


    int*   wrapped_I = device_I;

    int*   wrapped_J = device_J;

    float* wrapped_V = device_V;

    float* wrapped_x = device_x;

    float* wrapped_b = device_b;


    typedef typename cusp::array1d_view<int*>   IndexArrayView;

    typedef typename cusp::array1d_view<float*> ValueArrayView;

#endif


    IndexArrayView row_indices   (wrapped_I, wrapped_I + 10);

    IndexArrayView column_indices(wrapped_J, wrapped_J + 10);

    ValueArrayView values        (wrapped_V, wrapped_V + 10);

    ValueArrayView x             (wrapped_x, wrapped_x + 4);

    ValueArrayView b             (wrapped_b, wrapped_b + 4);


    typedef cusp::coo_matrix_view<IndexArrayView, IndexArrayView, ValueArrayView> View;


    View A(4, 4, 10, row_indices, column_indices, values);


    cusp::monitor<float> monitor(b, 100, 1e-5, 0, true);


    cusp::krylov::cg(A, x, b, monitor);


    // copy the solution back to the host

#ifdef __CUDACC__

    cudaMemcpy(host_x, device_x, 4 * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(device_I);

    cudaFree(device_J);

    cudaFree(device_V);

    cudaFree(device_x);

    cudaFree(device_b);

#else

    memcpy(host_x, device_x, 4 * sizeof(float));

    free(device_I);

    free(device_J);

    free(device_V);

    free(device_x);

    free(device_b);

#endif


    return 0;

}