OpenCL流程分析与示例

news/2024/9/20 10:56:32

OpenCL流程分析与示例

 

 

 OpenCL示例

Vortex存储库的tests/opencl目录中有OpenCL测试程序。OpenCL程序分为主机代码和设备代码,分别对应main.[cc|cpp]和kernel.cl。

OpenCL通过在设备端并行执行内核来加快速度。来看看tests/opencl/sgemm中的代码作为一个具体的例子。注意,代码经过了轻微修改,使差异更加清晰。

下面显示了主机上运行的main.cc中的matmul函数。

/*
 * Host-side reference multiply of two N x N matrices: C = A * B.
 *
 * Element (row, col) of each matrix lives at index row + col * N.
 * A and B are only read; every one of the N * N entries of C is overwritten.
 */
void matmul(const float* A,
            const float* B,
            float*       C,
            int          N) {
  for (int row = 0; row < N; ++row) {
    for (int col = 0; col < N; ++col) {
      const int out = row + col * N;
      float sum = 0.0f;
      int k = 0;
      while (k < N) {
        /* Dot product of row `row` of A with column `col` of B. */
        sum += A[row + k * N] * B[k + col * N];
        ++k;
      }
      C[out] = sum;
    }
  }
}

下面是与上述函数相对应、在设备上运行的kernel.cl。

// N x N matrix multiply, one output element per work-item.
// The work-item's global ids in dimensions 0 and 1 select the row and column;
// element (row, col) of each matrix is stored at index row + col * N.
// No bounds check is made on the ids against N.
__kernel void sgemm(__global const float* A,
                    __global const float* B,
                    __global float*       C,
                    int                   N) {
  const int row = get_global_id(0);
  const int col = get_global_id(1);
  const int out = row + col * N;
  float sum = 0.0f;
  int k = 0;
  while (k < N) {
    sum += A[row + k * N] * B[k + col * N];
    ++k;
  }
  C[out] = sum;
}

 

sudo apt-get install ocl-icd-opencl-dev

c. 按流程编码

//

// Created by yang on 24-2-2.

//

#include <CL/cl.h>

#include <stdio.h>

#include <stdlib.h>

#define ARRAY_SIZE 1024

// OpenCL C source of the vectorAdd kernel, kept as a string so it can be
// handed to clCreateProgramWithSource at run time.
// Each work-item reads its global id in dimension 0 and stores
// a[index] + b[index] into result[index]; the kernel itself performs no
// bounds check on index.

const char* kernelSource =

        "__kernel void vectorAdd(__global const float* a, __global const float* b, __global float* result) {\n"

        "    int index = get_global_id(0);\n"

        "    result[index] = a[index] + b[index];\n"

        "}\n";

int main() {

    // Initialize input vectors

    float a[ARRAY_SIZE];

    float b[ARRAY_SIZE];

    float result[ARRAY_SIZE];

    for (int i = 0; i < ARRAY_SIZE; ++i) {

        a[i] = i;

        b[i] = i * 2;

    }

    // Load OpenCL platform

    cl_platform_id platform;

    clGetPlatformIDs(1, &platform, NULL);

    // Load OpenCL device

    cl_device_id device;

    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    // Create OpenCL context

    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

    // Create command queue

    cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);

    // Create OpenCL program

    cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, NULL);

    // Build OpenCL program

    clBuildProgram(program, 1, &device, NULL, NULL, NULL);

    // Create OpenCL kernel

    cl_kernel kernel = clCreateKernel(program, "vectorAdd", NULL);

    // Create OpenCL buffers

    cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ARRAY_SIZE, a, NULL);

    cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ARRAY_SIZE, b, NULL);

    cl_mem bufferResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * ARRAY_SIZE, NULL, NULL);

    // Set OpenCL kernel arguments

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);

    clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);

    clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferResult);

    // Execute OpenCL kernel

    size_t globalSize = ARRAY_SIZE;

    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, NULL);

    clFinish(queue);

    // Read the result from OpenCL buffer

    clEnqueueReadBuffer(queue, bufferResult, CL_TRUE, 0, sizeof(float) * ARRAY_SIZE, result, 0, NULL, NULL);

    // Display the result

    for (int i = 0; i < ARRAY_SIZE; ++i) {

        printf("%f + %f = %f\n", a[i], b[i], result[i]);

    }

    // Clean up

    clReleaseMemObject(bufferA);

    clReleaseMemObject(bufferB);

    clReleaseMemObject(bufferResult);

    clReleaseKernel(kernel);

    clReleaseProgram(program);

    clReleaseCommandQueue(queue);

    clReleaseContext(context);

    return 0;

}

d. 编译
gcc -o hello_opencl hello_cl.c -lOpenCL

e. 执行
./hello_opencl

Demo示例

在vortex 下编写和运行OpenCL内核代码和程序(vecadd demo)

main.cc代码如下:

#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <chrono>

#include <CL/opencl.h>

#define KERNEL_NAME "vecadd"

// Evaluate an OpenCL call that returns a cl_int status. If the status is not
// CL_SUCCESS, print the failing expression and its code, run cleanup(), and
// terminate with exit(-1).
// Every continuation line must end in '\' with no blank line in between;
// a blank line ends the #define and leaves the rest as stray statements.
#define CL_CHECK(_expr)                                                \
   do {                                                                \
     cl_int _err = (_expr);                                            \
     if (_err == CL_SUCCESS)                                           \
       break;                                                          \
     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
     cleanup();                                                        \
     exit(-1);                                                         \
   } while (0)

// Evaluate an OpenCL call that returns an object and reports its status
// through an out-parameter, and yield the returned object.
// The macro declares a local `cl_int _err`; call sites pass `&_err` as the
// error argument, so that name must not be changed. `_err` starts as
// CL_INVALID_VALUE, so an expression that never writes it is treated as a
// failure. On failure the expression and code are printed, cleanup() runs,
// and the process exits with -1.
// Uses a GNU statement expression `({ ... })` and C++ `decltype`.
// Every continuation line must end in '\' with no blank line in between;
// a blank line ends the #define and leaves the rest as stray statements.
#define CL_CHECK2(_expr)                                               \
   ({                                                                  \
     cl_int _err = CL_INVALID_VALUE;                                   \
     decltype(_expr) _ret = _expr;                                     \
     if (_err != CL_SUCCESS) {                                         \
       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
       cleanup();                                                      \
       exit(-1);                                                       \
     }                                                                 \
     _ret;                                                             \
   })

// Read the whole file `filename` into a newly malloc'd buffer.
//
// On success returns 0, stores the buffer in *data and its length in bytes
// in *size; the caller owns the buffer and must free() it.
// On any failure (NULL argument, file cannot be opened, size cannot be
// determined, out of memory, short read) returns -1 and leaves *data and
// *size untouched.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (NULL == filename || NULL == data || NULL == size)
    return -1;

  // Binary mode: the file may be a precompiled kernel image, and text-mode
  // newline translation would make the byte count disagree with ftell().
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.\n");
    return -1;
  }

  uint8_t* buffer = NULL;
  size_t nread = 0;
  long fsize = -1;
  int ok = 0;

  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);

  if (fsize >= 0 && 0 == fseek(fp, 0, SEEK_SET)) {
    // Allocate at least one byte so an empty file still yields a valid pointer.
    buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
    if (NULL != buffer) {
      nread = fread(buffer, 1, (size_t)fsize, fp);
      ok = (nread == (size_t)fsize);
    }
  }

  fclose(fp);

  if (!ok) {
    fprintf(stderr, "Failed to load kernel.\n");
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = nread;
  return 0;
}

// Compare two floats by the distance between their bit patterns.
//
// Returns true when a == b (this also covers +0.0 vs -0.0), or when the
// absolute difference of the two bit patterns, read as signed integers, is
// at most `ulp`. Values of opposite sign other than the two zeros therefore
// never compare as almost equal.
//
// The difference is computed in 64 bits: the previous int subtraction
// overflowed for opposite-sign inputs, so e.g. (1.0f, -1.0f) could be
// reported as equal. memcpy replaces the union read for type punning.
static bool almost_equal(float a, float b, int ulp = 4) {
  static_assert(sizeof(int) == sizeof(float), "int and float must have the same size");

  if (a == b)
    return true;

  int ia, ib;
  memcpy(&ia, &a, sizeof(ia));
  memcpy(&ib, &b, sizeof(ib));

  long long diff = (long long)ia - (long long)ib;
  if (diff < 0)
    diff = -diff;
  return diff <= ulp;
}

// OpenCL handles shared between main() and cleanup(). Each starts as NULL;
// cleanup() releases only the handles that are non-NULL.
cl_device_id device_id = NULL;

cl_context context = NULL;

cl_command_queue commandQueue = NULL;

cl_program program = NULL;

cl_kernel kernel = NULL;

// Device buffers: a_memobj and b_memobj hold the two inputs, c_memobj the result.
cl_mem a_memobj = NULL;

cl_mem b_memobj = NULL;

cl_mem c_memobj = NULL; 

// Host-side arrays: h_a and h_b are uploaded to the device buffers, h_c
// receives the result read back from c_memobj.
float *h_a = NULL;

float *h_b = NULL;

float *h_c = NULL;

// Contents of the kernel file loaded by read_kernel_file(); freed in cleanup().
uint8_t *kernel_bin = NULL;

static void cleanup() {

  if (commandQueue) clReleaseCommandQueue(commandQueue);

  if (kernel) clReleaseKernel(kernel);

  if (program) clReleaseProgram(program);

  if (a_memobj) clReleaseMemObject(a_memobj);

  if (b_memobj) clReleaseMemObject(b_memobj);

  if (c_memobj) clReleaseMemObject(c_memobj); 

  if (context) clReleaseContext(context);

  if (device_id) clReleaseDevice(device_id);

 

  if (kernel_bin) free(kernel_bin);

  if (h_a) free(h_a);

  if (h_b) free(h_b);

  if (h_c) free(h_c);

}

// Number of float elements in each vector. Defaults to 64; parse_args()
// overwrites it when the -n option is given.
int size = 64;

// Print the one-line command-line synopsis to standard output.
static void show_usage() {
  fputs("Usage: [-n size] [-h: help]\n", stdout);
}

static void parse_args(int argc, char **argv) {

  int c;

  while ((c = getopt(argc, argv, "n:h?")) != -1) {

    switch (c) {

    case 'n':

      size = atoi(optarg);

      break;

    case 'h':

    case '?': {

      show_usage();

      exit(0);

    } break;

    default:

      show_usage();

      exit(-1);

    }

  }

  printf("Workload size=%d\n", size);

}

int main (int argc, char **argv) {

  // parse command arguments

  parse_args(argc, argv);

 

  cl_platform_id platform_id;

  size_t kernel_size;

 

  // Getting platform and device information

  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));

  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));

  printf("Create context\n");

  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));

  printf("Allocate device buffers\n");

  size_t nbytes = size * sizeof(float);

  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));

  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));

  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));

  printf("Create program from kernel source\n");

#ifdef HOSTGPU

  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))

    return -1;

  program = CL_CHECK2(clCreateProgramWithSource(

    context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); 

#else

  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))

    return -1;

  program = CL_CHECK2(clCreateProgramWithBinary(

    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));

#endif

  // Build program

  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

 

  // Create kernel

  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

  // Set kernel arguments

  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj)); 

  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj)); 

  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));

  // Allocate memories for input arrays and output arrays.   

  h_a = (float*)malloc(nbytes);

  h_b = (float*)malloc(nbytes);

  h_c = (float*)malloc(nbytes);   

  

  // Generate input values

  for (int i = 0; i < size; ++i) {

    h_a[i] = sinf(i)*sinf(i);

    h_b[i] = cosf(i)*cosf(i);

  }

  // Creating command queue

  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); 

   printf("Upload source buffers\n");

  CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a, 0, NULL, NULL));

  CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b, 0, NULL, NULL));

  printf("Execute the kernel\n");

  size_t global_work_size[1] = {size};

  auto time_start = std::chrono::high_resolution_clock::now();

  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL));

  CL_CHECK(clFinish(commandQueue));

  auto time_end = std::chrono::high_resolution_clock::now();

  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();

  printf("Elapsed time: %lg ms\n", elapsed);

  printf("Download destination buffer\n");

  CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c, 0, NULL, NULL));

  printf("Verify result\n");

  int errors = 0;

  for (int i = 0; i < size; ++i) {

    float ref = h_a[i] + h_b[i];

    if (!almost_equal(h_c[i], ref)) {

      if (errors < 100)

        printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]);

      ++errors;

    }

  }

  if (0 == errors) {

    printf("PASSED!\n");

  } else {

    printf("FAILED! - %d errors\n", errors);   

  }

  // Clean up    

  cleanup(); 

  return errors;

}

OpenCL内核代码(kernel.cl)如下:

// Element-wise vector addition. Each work-item handles the element whose
// index equals its global id in dimension 0: C[idx] = A[idx] + B[idx].
// No bounds check is made on the index.
__kernel void vecadd (__global const float *A,
                       __global const float *B,
                       __global float *C)
{
  const int idx = get_global_id(0);
  const float lhs = A[idx];
  const float rhs = B[idx];
  C[idx] = lhs + rhs;
}

 

参考文献链接

https://www.luffca.com/2023/03/riscv-gpgpu-vortex-part2/

https://zhuanlan.zhihu.com/p/681397034

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.ryyt.cn/news/47530.html

如若内容造成侵权/违法违规/事实不符,请联系我们进行投诉反馈,一经查实,立即删除!

相关文章

node-js,npm安装-详细教程

一、npm是什么?npm 是 NodeJS 下的包管理器,vue-cli脚手架模板就是基于 node 下的 npm 来完成安装的。 相关介绍~webpack: 它的主要用途是通过CommonJS的语法把所有浏览器端需要发布的静态资源做相应的准备,比如资源的合并和打包。vue-cli:官方提供的一个脚手架,用于快速生…

大众点评商家爬取

对显示搜索结果的网址发送请求import requestsurl = https://www.dianping.com/search/keyword/150/0_%E6%84%8F%E9%9D%A2headers = {Cookie:baiduappugc_ab=ugcdetail%3AA%3A1; _lxsdk_cuid=18fee40b7a3c8-005f5aa16f3f6f-26001c51-144000-18fee40b7a37e; _lxsdk=18fee40b7a3c…

全网最适合入门的面向对象编程教程:01 面向对象编程的基本概念

本文主要介绍了面向对象编程的基本概念:类和对象、三大特性-继承封装多态、UML类图和对象图的基本概念以及教程所需要的开发环境。全网最适合入门的面向对象编程教程:01 面向对象编程的基本概念 以下,我们将简单介绍面向对象编程的基本概念和图形化描述方法,在后面的章节中…

Kubernetes 审计(Auditing)

Kubernetes 审计(Auditing),Kubernetes 审计简介,审计策略简介,引入审计,启用审计,记录审计阶段为:ResponseStarted,审计级别为Metadata,apiVersion为group: "" 的日志,只记录audit命名空间里的日志,只记录audit命名空间的pods操作日志,只记录audit命名…

Excel动态日历1

日历是我们生活中不可或缺的一部分,它能帮助我们规划时间、安排计划,以及记录生活中的点滴。而动态日历图则更是一种流行的视觉呈现方式,它可以直观地展示出每个日期的数据变化和趋势,让我们更好地理解时间和事件的关系。在这篇文章中,我们将向你介绍如何使用Excel制作一个…

ubuntu通过安装包安装Mongodb

下载官方的压缩包 下载地址 选择对应的版本,这里用 x64架构的ubuntu20.04作为演示 在服务器创建mongodb的自定义安装目录 例如安装在/opt/mongodb#创建安装目录mkdir -p /opt/mongodb#创建mongoDB数据目录mkdir -p /opt/mongodbDATA#创建mongodb日志目录mkdir -p /opt/mongodb…

GUI学习

GUI学习 前言:本来不打算学习Gui的,不过最近需要用到还是学习一下吧 1.1 awt 与swing java的图形化界面的对象存在于awt与swing包中,awt需要调用本地系统方法实现功能,在不同的 平台下显示不同,swing是在awt的基础上实现的一套图形化界面,提供了更多组件,由于全部都适 用…

AI制图生产力:Midjourney v5和Stable Diffusion Reimagine

随着生成型AI技术的能力提升,越来越多的注意力放在了通过AI模型提升研发效率上。业内比较火的AI模型有很多,比如画图神器Midjourney、用途多样的Stable Diffusion,以及OpenAI此前刚刚迭代的DALL-E 2。对于研发团队而言,尽管Midjourney功能强大且不需要本地安装,但它对于硬…