当前位置：首页 > news >正文

深圳有实力的网站建设服务商百度站长提交网址

news 2025/7/1 6:11:23

深圳有实力的网站建设服务商,百度站长提交网址,湖北省工程建设协会网站,美食网站功能建设随着OpenCL的普及，现在有越来越多的移动设备以及平板、超级本等都支持OpenCL异构计算。而这些设备与桌面计算机、服务器相比而言性能不是占主要因素的，反而能耗更受人关注。因此，这些移动设备上的GPU与CPU基本都是在同一芯片上（So…

随着OpenCL的普及，现在有越来越多的移动设备以及平板、超级本等都支持OpenCL异构计算。而这些设备与桌面计算机、服务器相比而言性能不是占主要因素的，反而能耗更受人关注。因此，这些移动设备上的GPU与CPU基本都是在同一芯片上（SoC），或者GPU就已经成为了处理器的一部分，像Intel Ivy Bridge架构开始的处理器（Intel HD Graphics 4000开始支持OpenCL），AMD APU等。

因此，在这些设备上做OpenCL的异构并行计算的话，我们不需要像桌面端那些独立GPU那样，要把主存数据通过PCIe搬运到GPU端，然后等GPU计算结束后再搬回到主存。我们只需要将给GPU端分配的显存映射到主机端即可。这样，在主机端我们也能直接通过指针来操作这块存储数据。

下面编写了一个比较简单的例子来描述如何使用OpenCL的存储器映射特性。这个例子在MacBook Air，macOS 10.9.2下完成，并通过Xcode 5.1，Apple LLVM 5.1的编译与运行。硬件环境为：Intel Core i7 4650U, Intel Graphics 5000, 8GB DDR3L, 128GB SSD。

这是主机端代码（C源文件）：

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endifint main(void)
{cl_int ret;cl_platform_id platform_id = NULL;cl_device_id device_id = NULL;cl_context context = NULL;cl_command_queue command_queue = NULL;cl_mem memObj = NULL;char *kernelSource = NULL;cl_program program = NULL;cl_kernel kernel = NULL;int *pHostBuffer = NULL;clGetPlatformIDs(1, &platform_id, NULL);if(platform_id == NULL){puts("Get OpenCL platform failed!");goto FINISH;}clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);if(device_id == NULL){puts("No GPU available as a compute device!");goto FINISH;}context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);if(context == NULL){puts("Context not established!");goto FINISH;}command_queue = clCreateCommandQueue(context, device_id, 0, &ret);if(command_queue == NULL){puts("Command queue cannot be created!");goto FINISH;}// 指定内核源文件路径const char *pFileName = "/Users/zennychen/Downloads/test.cl";FILE *fp = fopen(pFileName, "r");if (fp == NULL){puts("The specified kernel source file cannot be opened!");goto FINISH;}fseek(fp, 0, SEEK_END);const long kernelLength = ftell(fp);fseek(fp, 0, SEEK_SET);kernelSource = malloc(kernelLength);fread(kernelSource, 1, kernelLength, fp);fclose(fp);program = clCreateProgramWithSource(context, 1, (const char**)&kernelSource, (const size_t*)&kernelLength, &ret);ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);if (ret != CL_SUCCESS){size_t len;char buffer[8 * 1024];printf("Error: Failed to build program executable!\n");clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);printf("%s\n", buffer);goto FINISH;}kernel = clCreateKernel(program, "test", &ret);if(kernel == NULL){puts("Kernel failed to create!");goto FINISH;}const size_t contentLength = sizeof(*pHostBuffer) * 1024 * 1024;// 以下为在主机端分配输入缓存pHostBuffer = malloc(contentLength);// 然后对此工作缓存进行初始化for(int i = 0; i < 1024 * 1024; i++)pHostBuffer[i] = i + 1;// 这里预分配的缓存大小为4MB，第一个参数是读写的memObj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, contentLength, pHostBuffer, &ret);if(memObj == NULL){puts("Memory object1 failed to create!");goto FINISH;}ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memObj);if(ret != CL_SUCCESS){puts("Set arguments error!");goto FINISH;}// 做存储器映射int *pDeviceBuffer = clEnqueueMapBuffer(command_queue, memObj, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, contentLength, 0, NULL, NULL, &ret);if(pDeviceBuffer == NULL){puts("Memory map failed!");goto FINISH;}if(pDeviceBuffer != pHostBuffer){// 若从GPU端映射得到的存储器地址与原先主机端的不同，则将数据从主机端传递到GPU端ret = clEnqueueWriteBuffer(command_queue, memObj, CL_TRUE, 0, contentLength, pHostBuffer, 0, NULL, NULL);if(ret != CL_SUCCESS){puts("Data transfer failed");goto FINISH;}/** 如果主机端与设备端地址不同，我们不妨测试一下设备端存储器的Cache情况 */// 先测试主机端的时间int sum = 0;// 先过一遍存储器for(int j = 0; j < 1024; j++)sum += pHostBuffer[j];time_t t1 = time(NULL);for(int i = 0; i < 1000000; i++){for(int j = 0; j < 1024; j++)sum += pHostBuffer[j];}time_t t2 = time(NULL);printf("The host delta time is: %f. The value is: %d\n", difftime(t2, t1), sum);// 测试设备端sum = 0;// 先过一遍存储器for(int j = 0; j < 1024; j++)sum += pDeviceBuffer[j];t1 = time(NULL);for(int i = 0; i < 1000000; i++){for(int j = 0; j < 1024; j++)sum += pDeviceBuffer[j];}t2 = time(NULL);printf("The device delta time is: %f. The value is: %d\n", difftime(t2, t1), sum);}else{// 若主机端与设备端存储器地址相同，我们仅仅做CPU端测试int sum = 0;// 先过一遍存储器for(int j = 0; j < 1024; j++)sum += pHostBuffer[j];time_t t1 = time(NULL);for(int i = 0; i < 1000000; i++){for(int j = 0; j < 1024; j++)sum += pHostBuffer[j];}time_t t2 = time(NULL);printf("The host delta time is: %f. The value is: %d\n", difftime(t2, t1), sum);}// 这里指定将总共有1024 * 1024个work-itemret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, (const size_t[]){1024 * 1024}, NULL, 0, NULL, NULL);// 做次同步，这里偷懒，不用wait event机制了～clFinish(command_queue);// 做校验for(int i = 0; i < 1024 * 1024; i++){if(pDeviceBuffer[i] != (i + 1) * 2){puts("Result error!");break;}}puts("Compute finished!");FINISH:/* Finalization */if(pHostBuffer != NULL)free(pHostBuffer);if(kernelSource != NULL)free(kernelSource);if(memObj != NULL)clReleaseMemObject(memObj);if(kernel != NULL)clReleaseKernel(kernel);if(program != NULL)clReleaseProgram(program);if(command_queue != NULL)clReleaseCommandQueue(command_queue);if(context != NULL)clReleaseContext(context);return 0;
}

以下是OpenCL内核源代码：

kernel void test(__global int *pInOut)
{int index = get_global_id(0);pInOut[index] += pInOut[index];
}

另外，主机端代码部分中，OpenCL源文件路径是写死的。各位朋友可以根据自己环境来重新指定路径。

当然，我们还可以修改主机端 clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, contentLength, pHostBuffer, &ret); 这段创建存储器对象的属性。比如，将 CL_MEM_USE_HOST_PTR 去掉。然后可以再试试效果。

倘若 clCreateBuffer 的 flags 参数用的是 CL_MEM_ALLOC_HOST_PTR，那么其 host_ptr 参数必须为空。在调用 clEnqueueMapBuffer 之后，可以根据其返回的缓存地址，对存储区域做数据初始化。

CL_MEM_ALLOC_HOST_PTR 表示应用程序暗示OpenCL实现从主机端可访问的存储空间给设备端分配存储缓存。这个与 CL_MEM_USE_HOST_PTR 还是有所区别的。CL_MEM_USE_HOST_PTR 是完全从应用端当前的内存池分配存储空间；而 CL_MEM_ALLOC_HOST_PTR 对于CPU与GPU共享主存的环境下，可以在CPU端留下一个访问GPU端VRAM的入口点。我们通过以下程序来测试当前环境的OpenCL实现（以下代码在调用调用了 clEnqueueMapBuffer 函数之后做了缓存数据初始化的时间比较）：

    long deltaTimes[10];for(int i = 0; i < 10; i++){struct timeval tBegin, tEnd;gettimeofday(&tBegin, NULL);for(int i = 0; i < 1024 * 1024; i++)pDeviceBuffer[i] = i + 1;gettimeofday(&tEnd, NULL);deltaTimes[i] = 1000000 * (tEnd.tv_sec - tBegin.tv_sec ) + tEnd.tv_usec - tBegin.tv_usec;}long useTime = deltaTimes[0];for(int i = 1; i < 10; i++){if(useTime > deltaTimes[i])useTime = deltaTimes[i];}printf("Device memory time spent: %ldus\n", useTime);int *pHostBuffer = malloc(contentLength);for(int i = 0; i < 10; i++){struct timeval tBegin, tEnd;gettimeofday(&tBegin, NULL);for(int i = 0; i < 1024 * 1024; i++)pHostBuffer[i] = i + 1;gettimeofday(&tEnd, NULL);deltaTimes[i] = 1000000 * (tEnd.tv_sec - tBegin.tv_sec ) + tEnd.tv_usec - tBegin.tv_usec;}useTime = deltaTimes[0];for(int i = 1; i < 10; i++){if(useTime > deltaTimes[i])useTime = deltaTimes[i];}printf("Host memory time spent: %ldus\n", useTime);

其中，对 gettimeofday 的调用需要包含头文件 <sys/time.h>。这个函数所返回的时间可以精确到 μs（微秒）。

在Intel Core i7 4650U, Intel Graphics 5000环境下，花费时间差不多，都是2.6ms（毫秒）。因此，在内核真正执行的时候为了清空这部分存储空间的Cache，驱动还是要做点工作的。当然，驱动也可为这块内存区域分配 Write-Combined 类型的存储器，这样主机端对这部分数据的访问不会被Cache，尽管速度会慢很多，但是通过 non-temporal Stream 方式读写还是会很不错。况且大部分OpenCL应用对同一块内存数据的读写都只有一次，这么做也不会造成Cache污染。

查看全文

http://www.wooajung.com/news/28293.html