#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

#define BLKX 16
#define BLKY 14
#define BLKZ 2

#define HORIZONTAL_PATCHES 2
#define VERTICAL_PATCHES 2
#define BATCH_SIZE (HORIZONTAL_PATCHES * VERTICAL_PATCHES)

#define GPU_DST_COLS 8
#define GPU_DST_ROWS 8
#define INPUT_W 4
#define INPUT_H 4

// forward declarations
static inline void compute_patch_origins(
    int* patch_origins,
    const int img_width = 2064,
    const int img_height = 1544,
    const int horizontal_patches = 4,
    const int vertical_patches = 4,
    const int inference_patch_width = 640,
    const int inference_patch_height = 640);

void preprocess_kernel_img_to_batch(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    int batch_size, int* patch_origins, cudaStream_t stream);

int main(int argc, char** argv)
{
    int patch_origins[BATCH_SIZE * 2];
    compute_patch_origins(patch_origins, GPU_DST_COLS, GPU_DST_ROWS,
                          HORIZONTAL_PATCHES, VERTICAL_PATCHES, INPUT_W, INPUT_H);

    // interleaved BGR source image and planar RGB float destination (one slot per patch)
    constexpr size_t src_size = GPU_DST_COLS * GPU_DST_ROWS * 3;
    constexpr size_t dst_size = BATCH_SIZE * INPUT_W * INPUT_H * 3;

    uint8_t src[src_size] = {};
    float dst[dst_size] = {};

    // fill the source image with a ramp pattern so the output is easy to inspect
    for (size_t i = 0; i < src_size; i++) {
        src[i] = static_cast<uint8_t>(i);
    }

    uint8_t* src_device;
    float* dst_device;
    int* patch_origins_device;
    cudaMalloc(&src_device, src_size * sizeof(uint8_t));
    cudaMalloc(&dst_device, dst_size * sizeof(float));
    cudaMalloc(&patch_origins_device, BATCH_SIZE * 2 * sizeof(int));

    cudaMemcpy(src_device, src, src_size * sizeof(uint8_t), cudaMemcpyHostToDevice);
    cudaMemcpy(patch_origins_device, patch_origins, BATCH_SIZE * 2 * sizeof(int),
               cudaMemcpyHostToDevice);

    // launch on the default stream with the device-side buffers
    preprocess_kernel_img_to_batch(src_device, GPU_DST_COLS, GPU_DST_ROWS,
                                   dst_device, INPUT_W, INPUT_H,
                                   BATCH_SIZE, patch_origins_device, 0);

    cudaMemcpy(dst, dst_device, dst_size * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(src_device);
    cudaFree(dst_device);
    cudaFree(patch_origins_device);
}

__global__ void batching_kernel(
    uint8_t* src, int src_line_size, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    int* patch_origins, int batch_size)
{
    // x covers batch * channel, y covers destination columns, z covers destination rows
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int idy = threadIdx.y + blockDim.y * blockIdx.y;
    int idz = threadIdx.z + blockDim.z * blockIdx.z;
    if (idx >= 3 * batch_size || idy >= dst_width || idz >= dst_height) return;

    int batch_id = idx / 3;
    int channel = idx % 3;
    // bgr to rgb
    int new_channel = 2 - channel;

    // source pixel that maps to this destination pixel, shifted by the patch origin
    int src_x = patch_origins[2 * batch_id] + idy;
    int src_y = patch_origins[2 * batch_id + 1] + idz;

    int image_channel_size = dst_width * dst_height;  // elements per channel plane
    int image_size = image_channel_size * 3;          // elements per batch slot

    // destination is planar (CHW) float, one image per batch slot
    float* dst_address = dst + batch_id * image_size
                             + new_channel * image_channel_size
                             + dst_width * idz + idy;
    // source is interleaved (HWC) uint8
    uint8_t* src_address = src + src_line_size * src_y + 3 * src_x + channel;

    *dst_address = *src_address / 255.0f;
}

void preprocess_kernel_img_to_batch(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    int batch_size, int* patch_origins, cudaStream_t stream)
{
    dim3 block(BLKX, BLKY, BLKZ);
    // round up so partially filled blocks still cover the whole output
    dim3 grid((3 * batch_size + BLKX - 1) / BLKX,
              (dst_width + BLKY - 1) / BLKY,
              (dst_height + BLKZ - 1) / BLKZ);
    batching_kernel<<<grid, block, 0, stream>>>(
        src, src_width * 3, src_width, src_height,
        dst, dst_width, dst_height,
        patch_origins, batch_size);
}

static inline void compute_patch_origins(
    int* patch_origins,
    const int img_width,
    const int img_height,
    const int horizontal_patches,
    const int vertical_patches,
    const int inference_patch_width,
    const int inference_patch_height)
{
    const int horizontal_step = img_width / horizontal_patches;  // assumes img_width is divisible by horizontal_patches
    const int vertical_step = img_height / vertical_patches;     // assumes img_height is divisible by vertical_patches

    int idx = 0;
    for (int r = 0; r < vertical_patches; r++) {
        for (int c = 0; c < horizontal_patches; c++) {
            int x, y;
            // ensure that the rightmost patches align with the image border
            if (c == horizontal_patches - 1) {
                x = img_width - inference_patch_width;
            } else {
                x = horizontal_step * c;
            }
            // ensure that the bottommost patches align with the image border
            if (r == vertical_patches - 1) {
                y = img_height - inference_patch_height;
            } else {
                y = vertical_step * r;
            }
            patch_origins[idx++] = x;
            patch_origins[idx++] = y;
        }
    }
}