cuda-debug/main.cu

#include <cuda_runtime.h>
#include <cstdint>
#include <iostream>

#define BLKX 16
#define BLKY 14
#define BLKZ 2

#define HORIZONTAL_PATCHES 2
#define VERTICAL_PATCHES 2
#define BATCH_SIZE HORIZONTAL_PATCHES * VERTICAL_PATCHES

#define GPU_DST_COLS 8
#define GPU_DST_ROWS 8
#define INPUT_W 4
#define INPUT_H 4


// forward declaration
static inline void compute_patch_origins(
        int* patch_origins, const int img_width = 2064, const int img_height = 1544, 
        const int horizontal_patches = 4, const int vertical_patches = 4,
        const int inference_patch_width = 640, const int inference_patch_height = 640);
void preprocess_kernel_img_to_batch(uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height, int batch_size, int* patch_origins,
    cudaStream_t stream);

int main(int argc, char **argv) {
    int patch_origins[BATCH_SIZE * 2];
    compute_patch_origins(patch_origins, GPU_DST_COLS, GPU_DST_ROWS, HORIZONTAL_PATCHES, VERTICAL_PATCHES, INPUT_W, INPUT_H);

    size_t src_size = GPU_DST_COLS * GPU_DST_ROWS * 3;
    size_t dst_size = INPUT_W * INPUT_H * 3;
    uint8_t src[src_size] = {};
    float dst[dst_size] = {};

    for(uint8_t i = 0; i < src_size; i++){
        src[i] = i;
    }

    for(uint8_t i = 0; i < src_size; i++){
        src[i] = i;
    }

    uint8_t* src_device;
    float* dst_device;  
    cudaMalloc(&src_device, src_size * sizeof(uint8_t));
    cudaMalloc(&dst_device, dst_size * sizeof(float));

    cudaMemcpy(src_device, &src, src_size * sizeof(uint8_t), cudaMemcpyHostToDevice);

    preprocess_kernel_img_to_batch(src, GPU_DST_COLS, GPU_DST_ROWS, dst, INPUT_W, INPUT_H, HORIZONTAL_PATCHES * VERTICAL_PATCHES, patch_origins, 0);

    cudaMemcpy(&dst, dst_device, dst_size * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(src_device);
    cudaFree(dst_device);
}

__global__ void batching_kernel( 
    uint8_t* src, int src_line_size, int src_width,
    int src_height, float* dst, int dst_width, 
    int dst_height, int* patch_origins, int batch_size) {
    
    // int test = 1;
    // int test2 = test + 1;

    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    int idy = threadIdx.y + blockDim.y*blockIdx.y;
    int idz = threadIdx.z + blockDim.z*blockIdx.z;

    if (idx >= batch_size || idz >= dst_width || idy >= dst_height) return;

    int batch_id = idx / 3;
    int channel = idx % 3;

    // bgr to rgb
    int new_channel = 2 - channel;

    int src_x = patch_origins[2*batch_id] + idx;
    int src_y = patch_origins[2*batch_id + 1] + idy;

    int image_channel_size_in_bytes = dst_width * dst_height;
    int image_size_in_bytes = image_channel_size_in_bytes* 3;
    float* dst_address = dst + batch_id * image_size_in_bytes + new_channel * image_channel_size_in_bytes + dst_width * idy + idx;
    uint8_t* src_address = src + 3 * src_width * src_y + 3 * src_x + channel;

    *dst_address = *src_address / 255.0f;
}

void preprocess_kernel_img_to_batch(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height, int batch_size, int* patch_origins,
    cudaStream_t stream) {

    dim3 block(BLKX, BLKY, BLKZ);
    dim3 grid(3 * batch_size / BLKX, dst_width / BLKY, dst_height / BLKZ);

    batching_kernel<<<grid, block, 0>>>(
        src, src_width*3, src_width,
        src_height, dst, dst_width,
        dst_height, patch_origins, batch_size);

}

static inline void compute_patch_origins(
        int* patch_origins, const int img_width, const int img_height, 
        const int horizontal_patches, const int vertical_patches,
        const int inference_patch_width, const int inference_patch_height) {
    const int horizontal_step = img_width / horizontal_patches; // assumes img_width is divisible by horizontal patches
    const int vertical_step = img_height / vertical_patches; // assumes img_height is divisible by vertical_patches

    int idx = 0;
    for(int r = 0; r < vertical_patches; r++){
        for(int c = 0; c < horizontal_patches; c++){            
            int x, y;
            // ensure that the rightmost patches align with the image border
            if(c == horizontal_patches - 1){
                x = img_width - inference_patch_width;
            }
            else{
                x = horizontal_step * c;
            }
            // ensure that the bottommost patches align with the image border
            if(r == vertical_patches - 1){
                y = img_height - inference_patch_height;
            }
            else{
                y = vertical_step * r;
            }
            
            patch_origins[idx] = x;
            patch_origins[++idx] = y;
            idx++;
        }
    }
}
init 2022-07-29 10:51:26 +02:00			`#include <cuda_runtime.h>`
			`#include <cstdint>`
			`#include <iostream>`

			`#define BLKX 16`
			`#define BLKY 14`
			`#define BLKZ 2`

			`#define HORIZONTAL_PATCHES 2`
			`#define VERTICAL_PATCHES 2`
			`#define BATCH_SIZE HORIZONTAL_PATCHES * VERTICAL_PATCHES`

			`#define GPU_DST_COLS 8`
			`#define GPU_DST_ROWS 8`
			`#define INPUT_W 4`
			`#define INPUT_H 4`



			`// forward declaration`
			`static inline void compute_patch_origins(`
			`int* patch_origins, const int img_width = 2064, const int img_height = 1544,`
			`const int horizontal_patches = 4, const int vertical_patches = 4,`
			`const int inference_patch_width = 640, const int inference_patch_height = 640);`
			`void preprocess_kernel_img_to_batch(uint8_t* src, int src_width, int src_height,`
			`float* dst, int dst_width, int dst_height, int batch_size, int* patch_origins,`
			`cudaStream_t stream);`

			`int main(int argc, char **argv) {`
			`int patch_origins[BATCH_SIZE * 2];`
			`compute_patch_origins(patch_origins, GPU_DST_COLS, GPU_DST_ROWS, HORIZONTAL_PATCHES, VERTICAL_PATCHES, INPUT_W, INPUT_H);`

			`size_t src_size = GPU_DST_COLS * GPU_DST_ROWS * 3;`
			`size_t dst_size = INPUT_W * INPUT_H * 3;`
			`uint8_t src[src_size] = {};`
			`float dst[dst_size] = {};`

			`for(uint8_t i = 0; i < src_size; i++){`
			`src[i] = i;`
			`}`

			`for(uint8_t i = 0; i < src_size; i++){`
			`src[i] = i;`
			`}`

			`uint8_t* src_device;`
			`float* dst_device;`
			`cudaMalloc(&src_device, src_size * sizeof(uint8_t));`
			`cudaMalloc(&dst_device, dst_size * sizeof(float));`

			`cudaMemcpy(src_device, &src, src_size * sizeof(uint8_t), cudaMemcpyHostToDevice);`

			`preprocess_kernel_img_to_batch(src, GPU_DST_COLS, GPU_DST_ROWS, dst, INPUT_W, INPUT_H, HORIZONTAL_PATCHES * VERTICAL_PATCHES, patch_origins, 0);`

			`cudaMemcpy(&dst, dst_device, dst_size * sizeof(float), cudaMemcpyDeviceToHost);`

			`cudaFree(src_device);`
			`cudaFree(dst_device);`
			`}`

			`__global__ void batching_kernel(`
			`uint8_t* src, int src_line_size, int src_width,`
			`int src_height, float* dst, int dst_width,`
			`int dst_height, int* patch_origins, int batch_size) {`

			`// int test = 1;`
			`// int test2 = test + 1;`

			`int idx = threadIdx.x + blockDim.x*blockIdx.x;`
			`int idy = threadIdx.y + blockDim.y*blockIdx.y;`
			`int idz = threadIdx.z + blockDim.z*blockIdx.z;`

			`if (idx >= batch_size \|\| idz >= dst_width \|\| idy >= dst_height) return;`

			`int batch_id = idx / 3;`
			`int channel = idx % 3;`

			`// bgr to rgb`
			`int new_channel = 2 - channel;`

			`int src_x = patch_origins[2*batch_id] + idx;`
			`int src_y = patch_origins[2*batch_id + 1] + idy;`

			`int image_channel_size_in_bytes = dst_width * dst_height;`
			`int image_size_in_bytes = image_channel_size_in_bytes* 3;`
			`float* dst_address = dst + batch_id * image_size_in_bytes + new_channel * image_channel_size_in_bytes + dst_width * idy + idx;`
			`uint8_t* src_address = src + 3 * src_width * src_y + 3 * src_x + channel;`

			`dst_address = src_address / 255.0f;`
			`}`

			`void preprocess_kernel_img_to_batch(`
			`uint8_t* src, int src_width, int src_height,`
			`float* dst, int dst_width, int dst_height, int batch_size, int* patch_origins,`
			`cudaStream_t stream) {`

			`dim3 block(BLKX, BLKY, BLKZ);`
			`dim3 grid(3 * batch_size / BLKX, dst_width / BLKY, dst_height / BLKZ);`

			`batching_kernel<<<grid, block, 0>>>(`
			`src, src_width*3, src_width,`
			`src_height, dst, dst_width,`
			`dst_height, patch_origins, batch_size);`

			`}`

			`static inline void compute_patch_origins(`
			`int* patch_origins, const int img_width, const int img_height,`
			`const int horizontal_patches, const int vertical_patches,`
			`const int inference_patch_width, const int inference_patch_height) {`
			`const int horizontal_step = img_width / horizontal_patches; // assumes img_width is divisible by horizontal patches`
			`const int vertical_step = img_height / vertical_patches; // assumes img_height is divisible by vertical_patches`

			`int idx = 0;`
			`for(int r = 0; r < vertical_patches; r++){`
			`for(int c = 0; c < horizontal_patches; c++){`
			`int x, y;`
			`// ensure that the rightmost patches align with the image border`
			`if(c == horizontal_patches - 1){`
			`x = img_width - inference_patch_width;`
			`}`
			`else{`
			`x = horizontal_step * c;`
			`}`
			`// ensure that the bottommost patches align with the image border`
			`if(r == vertical_patches - 1){`
			`y = img_height - inference_patch_height;`
			`}`
			`else{`
			`y = vertical_step * r;`
			`}`

			`patch_origins[idx] = x;`
			`patch_origins[++idx] = y;`
			`idx++;`
			`}`
			`}`
			`}`