以下是上述Numba CUDA代码示例翻译为C++ CUDA编程语言的对应例子:
- 向量加法
#include <cfloat>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>
// Element-wise sum: c[idx] = a[idx] + b[idx] for every idx in [0, n).
// Expects a 1D launch covering at least n threads; surplus tail threads
// simply return without touching memory.
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // guard: grid size rarely divides n exactly
    c[idx] = a[idx] + b[idx];
}
// Host driver for vector_add: allocates device buffers, copies the inputs,
// launches the kernel, and prints the element-wise sum of a and b.
// Returns 0 on success; aborts with a diagnostic on any CUDA failure.
int main() {
    // Minimal error-check helper: every CUDA API call returns cudaError_t,
    // and ignoring it hides failures until a later, unrelated call.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };
    const int n = 3;
    float a[] = {1.0f, 2.0f, 3.0f};  // float literals: avoid double promotion
    float b[] = {4.0f, 5.0f, 6.0f};
    float c[n];
    float *d_a, *d_b, *d_c;
    check(cudaMalloc(&d_a, n * sizeof(float)), "cudaMalloc d_a");
    check(cudaMalloc(&d_b, n * sizeof(float)), "cudaMalloc d_b");
    check(cudaMalloc(&d_c, n * sizeof(float)), "cudaMalloc d_c");
    check(cudaMemcpy(d_a, a, n * sizeof(float), cudaMemcpyHostToDevice), "copy a to device");
    check(cudaMemcpy(d_b, b, n * sizeof(float), cudaMemcpyHostToDevice), "copy b to device");
    int threadsPerBlock = 32;
    // Ceil-division so the grid covers n even when it is not a multiple
    // of the block size.
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    vector_add<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    // Kernel launches do not return errors directly; bad launch configs
    // surface only through cudaGetLastError().
    check(cudaGetLastError(), "vector_add launch");
    // A blocking cudaMemcpy also synchronizes with the preceding kernel.
    check(cudaMemcpy(c, d_c, n * sizeof(float), cudaMemcpyDeviceToHost), "copy c to host");
    for (int i = 0; i < n; ++i) {
        std::cout << c[i] << " ";
    }
    std::cout << std::endl;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
- 矩阵乘法
// Naive square matrix multiply: C = A * B, all matrices width x width,
// row-major. Expects a 2D launch where (blockIdx*blockDim + threadIdx)
// covers at least width x width threads; out-of-range threads do nothing.
__global__ void matmul(float *A, float *B, float *C, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float value = 0.0f;  // 0.0f, not 0.0: keep the accumulator in float
    if (row < width && col < width) {
        // Dot product of row `row` of A with column `col` of B.
        for (int i = 0; i < width; ++i) {
            value += A[row * width + i] * B[i * width + col];
        }
        C[row * width + col] = value;
    }
}
// 主函数初始化和调用方式类似于向量加法
- 向量点乘
// Dot product of a and b accumulated into *result.
// Preconditions: *result must be zero-initialized by the host before launch;
// blockDim.x must be a power of two and <= 256 (size of the shared buffer),
// otherwise the tree reduction reads out of bounds or skips elements.
__global__ void dot_product(float *a, float *b, float *result, int n) {
    __shared__ float temp[256];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past n contribute 0, the identity element for addition.
    float dot = 0.0f;
    if (i < n) {
        dot = a[i] * b[i];
    }
    temp[tid] = dot;
    __syncthreads();
    // Tree reduction in shared memory; halves the active range each step.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            temp[tid] += temp[tid + s];
        }
        __syncthreads();  // outside the divergent if: all threads reach it
    }
    // One atomic per block instead of one per thread keeps contention low.
    if (tid == 0) {
        atomicAdd(result, temp[0]);
    }
}
// 主函数初始化和调用方式类似于向量加法
- 向量乘法
// Element-wise product: c[idx] = a[idx] * b[idx] for every idx in [0, n).
// 1D launch; threads beyond the valid range exit immediately.
__global__ void vector_multiply(float *a, float *b, float *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // tail guard for partial final block
    c[idx] = a[idx] * b[idx];
}
// 主函数初始化和调用方式类似于向量加法
- 向量除法
// Element-wise quotient: c[idx] = a[idx] / b[idx] for every idx in [0, n).
// Caller contract: b must contain no zeros in [0, n) — no check is done here.
__global__ void vector_divide(float *a, float *b, float *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // tail guard for partial final block
    c[idx] = a[idx] / b[idx];
}
// 主函数初始化和调用方式类似于向量加法
- 向量求和
// Sum of all elements of a, accumulated into *result.
// Preconditions: *result must be zero-initialized by the host before launch;
// blockDim.x must be a power of two and <= 256 (shared-buffer capacity),
// otherwise the tree reduction is incorrect.
__global__ void vector_sum(float *a, float *result, int n) {
    __shared__ float temp[256];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Out-of-range threads contribute 0, the additive identity.
    float sum = 0.0f;
    if (i < n) {
        sum = a[i];
    }
    temp[tid] = sum;
    __syncthreads();
    // Tree reduction in shared memory.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            temp[tid] += temp[tid + s];
        }
        __syncthreads();  // barrier kept outside the divergent branch
    }
    // Single atomic per block publishes the partial sum.
    if (tid == 0) {
        atomicAdd(result, temp[0]);
    }
}
// 主函数初始化和调用方式类似于向量加法
- 向量求最大值
// Maximum element of a, accumulated into *result.
// Preconditions: *result must be initialized to -FLT_MAX by the host;
// blockDim.x must be a power of two and <= 256.
// NOTE: CUDA's atomicMax has no float overload — the original call did not
// compile. The block result is published with an atomicCAS loop on the
// float's bit pattern instead.
__global__ void vector_max(float *a, float *result, int n) {
    __shared__ float temp[256];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Out-of-range threads contribute the identity element for max.
    float max_val = -FLT_MAX;
    if (i < n) {
        max_val = a[i];
    }
    temp[tid] = max_val;
    __syncthreads();
    // Tree reduction in shared memory using the float intrinsic fmaxf.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            temp[tid] = fmaxf(temp[tid], temp[tid + s]);
        }
        __syncthreads();
    }
    // Emulated float atomic max: retry CAS until our value is installed
    // or the stored value is already at least as large.
    if (tid == 0) {
        float block_max = temp[0];
        int *addr = (int *)result;
        int old = *addr;
        int assumed;
        do {
            assumed = old;
            if (__int_as_float(assumed) >= block_max) break;  // nothing to do
            old = atomicCAS(addr, assumed, __float_as_int(block_max));
        } while (assumed != old);
    }
}
// 主函数初始化和调用方式类似于向量加法
- 向量求最小值
// Minimum element of a, accumulated into *result.
// Preconditions: *result must be initialized to FLT_MAX by the host;
// blockDim.x must be a power of two and <= 256.
// NOTE: CUDA's atomicMin has no float overload — the original call did not
// compile. The block result is published with an atomicCAS loop on the
// float's bit pattern instead.
__global__ void vector_min(float *a, float *result, int n) {
    __shared__ float temp[256];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Out-of-range threads contribute the identity element for min.
    float min_val = FLT_MAX;
    if (i < n) {
        min_val = a[i];
    }
    temp[tid] = min_val;
    __syncthreads();
    // Tree reduction in shared memory using the float intrinsic fminf.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            temp[tid] = fminf(temp[tid], temp[tid + s]);
        }
        __syncthreads();
    }
    // Emulated float atomic min: retry CAS until our value is installed
    // or the stored value is already at least as small.
    if (tid == 0) {
        float block_min = temp[0];
        int *addr = (int *)result;
        int old = *addr;
        int assumed;
        do {
            assumed = old;
            if (__int_as_float(assumed) <= block_min) break;  // nothing to do
            old = atomicCAS(addr, assumed, __float_as_int(block_min));
        } while (assumed != old);
    }
}
// 主函数初始化和调用方式类似于向量加法
- 向量排序(冒泡排序)
请注意,冒泡排序在CUDA中效率极低,因为它不是并行算法。以下代码仅为演示目的,不推荐用于实际应用。
// In-place ascending sort via odd-even transposition (the parallel-safe
// form of bubble sort). The original mapping let thread i and thread i+1
// swap overlapping pairs (arr[i],arr[i+1]) and (arr[i+1],arr[i+2]) in the
// same step — a data race. Alternating even/odd phases assign each element
// to exactly one thread per phase, eliminating the race.
// Launch contract: a SINGLE block (\_\_syncthreads is block-scoped) with at
// least n/2 threads; n phases guarantee a fully sorted array.
__global__ void bubble_sort(float *arr, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    for (int phase = 0; phase < n; ++phase) {
        // Even phases compare pairs (0,1),(2,3),...; odd phases (1,2),(3,4),...
        int idx = 2 * i + (phase & 1);
        if (idx + 1 < n && arr[idx] > arr[idx + 1]) {
            float t = arr[idx];
            arr[idx] = arr[idx + 1];
            arr[idx + 1] = t;
        }
        __syncthreads();  // uniform: every thread executes every phase
    }
}
// 主函数初始化和调用方式类似于向量加法
- 向量求平均值
// Accumulates the sum of a into *result; the HOST must divide by n
// afterwards to obtain the average (the kernel only sums).
// Preconditions: *result must be zero-initialized by the host before launch;
// blockDim.x must be a power of two and <= 256 (shared-buffer capacity).
__global__ void vector_average(float *a, float *result, int n) {
    __shared__ float temp[256];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Out-of-range threads contribute 0, the additive identity.
    float sum = 0.0f;
    if (i < n) {
        sum = a[i];
    }
    temp[tid] = sum;
    __syncthreads();
    // Tree reduction in shared memory.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            temp[tid] += temp[tid + s];
        }
        __syncthreads();  // barrier kept outside the divergent branch
    }
    // Single atomic per block publishes the partial sum.
    if (tid == 0) {
        atomicAdd(result, temp[0]);
    }
}
// 主函数初始化和调用方式类似于向量加法,但需要除以n来得到平均值
这些代码示例展示了如何将Numba CUDA代码转换为C++ CUDA代码。请注意,在C++ CUDA中,我们通常需要更详细地管理内存分配和传输,以及并行算法的设计。此外,上述代码示例中的某些操作(如排序)在CUDA上可能不是最高效的实现方式。