// Uncomment to print the filter window of the work-item at (X, Y).
// #define TEST
#define X 95
#define Y 22

// Uncomment to carry 64-bit doubles (four 16-bit channels per pixel)
// instead of 32-bit floats (four 8-bit channels per pixel).
//#define USE_DBL
#ifdef USE_DBL
#define TYPE_FLT double
#define TYPE_INT long
#define MASK 0xFFFF
#define SHIFT 16
// NOTE(review): name is misspelled and the macro is unused in this file;
// kept as-is in case other sources reference it.
#define EPSILSON 4.94065645841247E-324
#define BITS_TO_FLT(v) as_double(v)
#define FLT_TO_BITS(v) as_long(v)
#else
#define TYPE_FLT float
#define TYPE_INT int
#define MASK 0x00FF
#define SHIFT 8
// NOTE(review): name is misspelled and the macro is unused in this file;
// kept as-is in case other sources reference it.
#define EPSILSON 1.401298E-45
#define BITS_TO_FLT(v) as_float(v)
#define FLT_TO_BITS(v) as_int(v)
#endif

// Integer-coordinate reads of unsigned-integer images are only defined with
// the nearest filter; a linear filter makes the returned values undefined.
constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                             CLK_ADDRESS_CLAMP_TO_EDGE |
                             CLK_FILTER_NEAREST;

// Kernel parameters passed by value from the host.
struct input
{
    int state;   // selects which pass the kernel performs (1..6, see Sri)
    int sri;     // filter level; 0 copies the input through unchanged
    int width;   // image width in pixels
    int height;  // image height in pixels
};

// Reads the pixel at (x, z) and reassembles its four channels
// (x = least significant, w = most significant) into one TYPE_FLT value.
TYPE_FLT read_data(image2d_t input_frame, int x, int z)
{
    uint4 pixel = read_imageui(input_frame, sampler, (int2)(x, z));

    TYPE_INT bits = (TYPE_INT)((TYPE_INT)pixel.x & MASK)
                  | (TYPE_INT)(((TYPE_INT)pixel.y & MASK) << SHIFT)
                  | (TYPE_INT)(((TYPE_INT)pixel.z & MASK) << (SHIFT * 2))
                  | (TYPE_INT)(((TYPE_INT)pixel.w & MASK) << (SHIFT * 3));

    // Reinterpret the bit pattern; as_type avoids pointer-cast aliasing.
    return BITS_TO_FLT(bits);
}

// Sums sum[0 .. get_local_size(0) - 1] into sum[0] by repeated halving.
// Contains a barrier, so every work-item of the group must call it.
// Only work-item 0 may read sum[0] afterwards (no trailing barrier).
void reduce_local_sum(local TYPE_FLT* sum)
{
    uint local_id = get_local_id(0);
    uint group_size = get_local_size(0);

    // When the active range has an odd length, its last element has no
    // partner in the halving step; work-item 0 folds it in separately.
    uint is_odd = group_size % 2;

    for (uint stride = group_size / 2; stride > 0; stride /= 2) {
        // Make the previous round's writes visible before reading them.
        barrier(CLK_LOCAL_MEM_FENCE);

        if (local_id < stride) {
            sum[local_id] += sum[local_id + stride];
        }

        if (local_id == 0) {
            if (is_odd) {
                sum[0] += sum[2 * stride];
            }
            is_odd = stride % 2;
        }
    }
}

// Multi-pass filter kernel; params.state selects the pass:
//   sri == 0     : copy the input pixel to the output (any state).
//   state 1 or 4 : sum the group's input values into
//                  scratch_pad[get_group_id(1)]; writes no pixel.
//   state 2 or 5 : sum scratch_pad[global x] over the group and store
//                  total / (width * height) in scratch_pad[0]; writes no pixel.
//   state 3      : output (input - scratch_pad[0])^2.
//   state 6      : output mean + alpha * sigma2 / (sigma2 + rho2) * (input - mean)
//                  over a p x q neighbourhood, with rho2 = scratch_pad[0];
//                  pixels too close to the border are copied; negative
//                  results are clamped to 0.
//   other states : output 0.
//
// sum must provide at least get_local_size(0) elements of local memory.
// NOTE(review): pass 1/4 reduces along dimension 0 but indexes the result by
// the group id of dimension 1, and pass 2/5 writes scratch_pad[0] from every
// group - this presumably expects one work-group per image row in pass 1/4
// and a single work-group in pass 2/5; confirm against the host code.
kernel void Sri(read_only image2d_t input_frame,
                read_write image2d_t output_frame,
                local TYPE_FLT* sum,
                global TYPE_FLT* scratch_pad,
                struct input params)
{
    int2 gid = (int2)(get_global_id(0), get_global_id(1));
    TYPE_FLT input = read_data(input_frame, gid.x, gid.y);
    TYPE_FLT output_data = 0;

    if (params.sri == 0) {
        output_data = input;
    } else if (params.state == 1 || params.state == 4) {
        sum[get_local_id(0)] = input;
        reduce_local_sum(sum);

        if (get_local_id(0) == 0) {
            scratch_pad[get_group_id(1)] = sum[0];
        }
        return;
    } else if (params.state == 2 || params.state == 5) {
        sum[get_local_id(0)] = scratch_pad[gid.x];
        reduce_local_sum(sum);

        if (get_local_id(0) == 0) {
            scratch_pad[0] = sum[0] / (params.width * params.height);
        }
        return;
    } else if (params.state == 3) {
        TYPE_FLT diff = input - scratch_pad[0];
        output_data = diff * diff;
    } else if (params.state == 6) {
        // Window extent per filter level 1..5. Levels outside that range
        // are clamped so the table is never indexed out of bounds.
        const int window_extent[5] = {3, 3, 5, 7, 7};
        int level = clamp(params.sri, 1, 5);

        int p = window_extent[level - 1];
        int q = params.width > 1 ? p : 1;  // single-column images use a p x 1 window
        float alpha = 1.0f - 0.25f * (level - 1);
        TYPE_FLT rho2 = scratch_pad[0];

        int lag1 = (p - 1) / 2;  // vertical half-extent
        int lag2 = (q - 1) / 2;  // horizontal half-extent
        int window_size = p * q;

        if (gid.x >= lag2 && gid.x < params.width - lag2 &&
            gid.y >= lag1 && gid.y < params.height - lag1) {
            TYPE_FLT window[49];  // largest window is 7 x 7

            for (int i = -lag1; i <= lag1; i++) {
                for (int j = -lag2; j <= lag2; j++) {
                    window[(i + lag1) * q + (j + lag2)] =
                        read_data(input_frame, gid.x + j, gid.y + i);
                }
            }

#ifdef TEST
            if (gid.x == X && gid.y == Y) {
                for (int i = 0; i < window_size; i++) {
                    printf("[%d] > %f; ", i, window[i]);
                }
            }
#endif

            TYPE_FLT mean = 0;
            for (int i = 0; i < window_size; i++) {
                mean += window[i];
            }
            mean /= window_size;

            TYPE_FLT sigma2 = 0;
            for (int i = 0; i < window_size; i++) {
                TYPE_FLT diff = window[i] - mean;
                sigma2 += diff * diff;
            }
            sigma2 /= window_size;

            // A zero denominator (both variances zero) would give 0/0 = NaN;
            // use a gain of 0 there, which yields the local mean.
            TYPE_FLT denom = sigma2 + rho2;
            TYPE_FLT gain = denom > 0 ? sigma2 / denom : 0;

            output_data = mean + alpha * gain * (input - mean);
        } else {
            // Window would extend past the image: leave the pixel unfiltered.
            output_data = input;
        }

        if (output_data < 0) {
            output_data = 0;
        }
    }

    // Split the value's bit pattern back into four channels (x = lowest).
    TYPE_INT out = FLT_TO_BITS(output_data);
    uint4 pixel;
    pixel.x = (TYPE_INT)(out & MASK);
    pixel.y = (TYPE_INT)((out >> SHIFT) & MASK);
    pixel.z = (TYPE_INT)((out >> (SHIFT * 2)) & MASK);
    pixel.w = (TYPE_INT)((out >> (SHIFT * 3)) & MASK);
    write_imageui(output_frame, gid, pixel);
}