init
This commit is contained in:
commit
da55294019
138
main.cu
Normal file
138
main.cu
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
#include <cuda_runtime.h>

#include <cstdint>
#include <cstdlib>
#include <iostream>
|
// Kernel launch block dimensions (x = batch*channel, y = rows, z = columns).
#define BLKX 16
#define BLKY 14
#define BLKZ 2

// Patch grid layout of the batched inference input.
#define HORIZONTAL_PATCHES 2
#define VERTICAL_PATCHES 2
// Parenthesized so the macro expands safely inside larger expressions:
// without the parentheses, `N / BATCH_SIZE` expands to `N / H * V`,
// which is `(N / H) * V` -- silently wrong.
#define BATCH_SIZE (HORIZONTAL_PATCHES * VERTICAL_PATCHES)

// Dimensions of the full source image used by the test driver.
// NOTE(review): the GPU_DST_* names read like destination sizes but are
// passed as the *source* width/height in main -- confirm intent.
#define GPU_DST_COLS 8
#define GPU_DST_ROWS 8
// Width/height of each extracted inference patch.
#define INPUT_W 4
#define INPUT_H 4
|
// --- forward declarations ---

// Fills `patch_origins` with the (x, y) pixel origin of each patch, two ints
// per patch in row-major patch order, for a horizontal_patches x
// vertical_patches tiling of an img_width x img_height image. Defaults match
// a 2064x1544 frame split into 4x4 patches of 640x640.
static inline void compute_patch_origins(
    int* patch_origins, const int img_width = 2064, const int img_height = 1544,
    const int horizontal_patches = 4, const int vertical_patches = 4,
    const int inference_patch_width = 640, const int inference_patch_height = 640);

// Launches the batching kernel on `stream`: copies `batch_size` patches out of
// the interleaved BGR uint8 image `src` into the planar RGB float batch `dst`.
// `patch_origins` holds 2 * batch_size ints; it is dereferenced inside the
// kernel, so it presumably must be device-accessible memory -- TODO confirm.
void preprocess_kernel_img_to_batch(uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height, int batch_size, int* patch_origins,
    cudaStream_t stream);
|
||||||
|
|
||||||
|
// Test driver: builds a ramp-pattern source image, extracts BATCH_SIZE
// patches into a planar float batch on the GPU, and copies the result back.
int main(int argc, char **argv) {
    // Abort with a readable message on any CUDA API failure.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            std::cerr << what << ": " << cudaGetErrorString(err) << std::endl;
            std::exit(1);
        }
    };

    // Host-side (x, y) origin pairs, one pair per patch.
    int patch_origins[BATCH_SIZE * 2];
    compute_patch_origins(patch_origins, GPU_DST_COLS, GPU_DST_ROWS,
                          HORIZONTAL_PATCHES, VERTICAL_PATCHES, INPUT_W, INPUT_H);

    // Source: one interleaved 3-channel image. Destination: a planar batch of
    // BATCH_SIZE patches -- the original omitted the batch factor, so the
    // kernel wrote BATCH_SIZE times past the end of `dst`.
    constexpr size_t src_size = (size_t)GPU_DST_COLS * GPU_DST_ROWS * 3;
    constexpr size_t dst_size = (size_t)BATCH_SIZE * INPUT_W * INPUT_H * 3;
    uint8_t src[src_size] = {};
    float dst[dst_size] = {};

    // Fill the source with a recognizable ramp pattern. (A size_t counter:
    // the original uint8_t counter would loop forever if src_size > 255.
    // The original also ran this identical loop twice.)
    for (size_t i = 0; i < src_size; i++) {
        src[i] = (uint8_t)i;
    }

    uint8_t* src_device = nullptr;
    float* dst_device = nullptr;
    int* patch_origins_device = nullptr;
    check(cudaMalloc(&src_device, src_size * sizeof(uint8_t)), "cudaMalloc src");
    check(cudaMalloc(&dst_device, dst_size * sizeof(float)), "cudaMalloc dst");
    // The kernel reads patch_origins on the device, so it needs a device copy
    // (the original passed the host array straight into the launch).
    check(cudaMalloc(&patch_origins_device, sizeof(patch_origins)),
          "cudaMalloc patch_origins");

    check(cudaMemcpy(src_device, src, src_size * sizeof(uint8_t),
                     cudaMemcpyHostToDevice), "H2D src");
    check(cudaMemcpy(patch_origins_device, patch_origins, sizeof(patch_origins),
                     cudaMemcpyHostToDevice), "H2D patch_origins");

    // Pass the device buffers -- the original passed the host `src`/`dst`
    // and host `patch_origins`, which the kernel cannot dereference.
    preprocess_kernel_img_to_batch(src_device, GPU_DST_COLS, GPU_DST_ROWS,
                                   dst_device, INPUT_W, INPUT_H,
                                   HORIZONTAL_PATCHES * VERTICAL_PATCHES,
                                   patch_origins_device, 0);
    check(cudaGetLastError(), "kernel launch");

    // Blocking copy on the default stream; also synchronizes with the kernel.
    check(cudaMemcpy(dst, dst_device, dst_size * sizeof(float),
                     cudaMemcpyDeviceToHost), "D2H dst");

    cudaFree(src_device);
    cudaFree(dst_device);
    cudaFree(patch_origins_device);
    return 0;
}
|
||||||
|
|
||||||
|
// Copies `batch_size` patches out of an interleaved BGR uint8 image into a
// planar (NCHW) RGB float batch, normalizing each byte to [0, 1].
//
// Thread mapping (set up by preprocess_kernel_img_to_batch):
//   x -> batch_id * 3 + channel   (3 * batch_size work items)
//   y -> destination row          (dst_height work items)
//   z -> destination column       (dst_width work items)
//
// `patch_origins` holds 2 * batch_size device-readable ints: the (x, y)
// source-pixel origin of each patch. `src_line_size` is currently unused;
// the row stride is recomputed from src_width.
__global__ void batching_kernel(
    uint8_t* src, int src_line_size, int src_width,
    int src_height, float* dst, int dst_width,
    int dst_height, int* patch_origins, int batch_size) {

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int idy = threadIdx.y + blockDim.y * blockIdx.y;
    int idz = threadIdx.z + blockDim.z * blockIdx.z;

    // idx spans batch * channel, so the bound is 3 * batch_size. The original
    // compared against batch_size and silently dropped most channel planes.
    if (idx >= 3 * batch_size || idz >= dst_width || idy >= dst_height) return;

    int batch_id = idx / 3;
    int channel = idx % 3;

    // bgr to rgb
    int new_channel = 2 - channel;

    // idz is the column within the patch. The original added idx (the
    // batch/channel index) here, reading and writing the wrong pixels.
    int src_x = patch_origins[2 * batch_id] + idz;
    int src_y = patch_origins[2 * batch_id + 1] + idy;

    // Sizes in float elements (the original names said "bytes" but these
    // index a float*).
    int channel_elems = dst_width * dst_height;
    int image_elems = channel_elems * 3;
    // Planar destination: batch plane, then channel plane, then row-major pixel.
    float* dst_address = dst + batch_id * image_elems
                             + new_channel * channel_elems
                             + dst_width * idy + idz;
    // Interleaved BGR source.
    uint8_t* src_address = src + 3 * src_width * src_y + 3 * src_x + channel;

    *dst_address = *src_address / 255.0f;
}
|
||||||
|
|
||||||
|
// Host wrapper: launches batching_kernel on `stream` with a grid covering
// x = 3 * batch_size (batch*channel), y = dst_height rows, z = dst_width
// columns. The kernel bounds-checks the ragged remainder, so exact
// divisibility by the block dimensions is not required.
void preprocess_kernel_img_to_batch(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height, int batch_size, int* patch_origins,
    cudaStream_t stream) {

    dim3 block(BLKX, BLKY, BLKZ);
    // Ceil-division grid sizing. The original used floor division, which
    // produces a zero-sized grid dimension whenever the extent is smaller
    // than the block dimension (e.g. 3 * batch_size = 12 < BLKX = 16), so the
    // launch failed outright. It also swapped the y/z extents relative to the
    // kernel's guards (idy is checked against dst_height, idz against
    // dst_width).
    dim3 grid((3 * batch_size + BLKX - 1) / BLKX,
              (dst_height + BLKY - 1) / BLKY,
              (dst_width + BLKZ - 1) / BLKZ);

    // The original omitted `stream` from the launch and always ran on the
    // default stream.
    batching_kernel<<<grid, block, 0, stream>>>(
        src, src_width * 3, src_width,
        src_height, dst, dst_width,
        dst_height, patch_origins, batch_size);
}
|
||||||
|
|
||||||
|
// Computes the top-left (x, y) pixel origin of every inference patch in a
// horizontal_patches x vertical_patches tiling of an img_width x img_height
// image, writing two ints per patch into `patch_origins` in row-major patch
// order. Interior patches are spaced by a uniform (floored) step; the last
// column and last row are snapped flush against the right/bottom image
// borders so every patch stays fully inside the image. Assumes each image
// dimension divides evenly by its patch count.
static inline void compute_patch_origins(
    int* patch_origins, const int img_width, const int img_height,
    const int horizontal_patches, const int vertical_patches,
    const int inference_patch_width, const int inference_patch_height) {
    const int step_x = img_width / horizontal_patches;
    const int step_y = img_height / vertical_patches;

    for (int row = 0; row < vertical_patches; row++) {
        // Bottommost row of patches hugs the bottom border.
        const int y = (row == vertical_patches - 1)
                          ? img_height - inference_patch_height
                          : step_y * row;
        for (int col = 0; col < horizontal_patches; col++) {
            // Rightmost column of patches hugs the right border.
            const int x = (col == horizontal_patches - 1)
                              ? img_width - inference_patch_width
                              : step_x * col;
            const int out = 2 * (row * horizontal_patches + col);
            patch_origins[out] = x;
            patch_origins[out + 1] = y;
        }
    }
}
|
Loading…
x
Reference in New Issue
Block a user