diff --git a/main.cu b/main.cu index 656a15c..ca82fff 100644 --- a/main.cu +++ b/main.cu @@ -43,14 +43,17 @@ int main(int argc, char **argv) { src[i] = i; } + int* patch_origins_device; uint8_t* src_device; float* dst_device; + cudaMalloc(&patch_origins_device, BATCH_SIZE * 2 * sizeof(int)); cudaMalloc(&src_device, src_size * sizeof(uint8_t)); cudaMalloc(&dst_device, dst_size * sizeof(float)); + cudaMemcpy(patch_origins_device, &patch_origins, BATCH_SIZE * 2 * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(src_device, &src, src_size * sizeof(uint8_t), cudaMemcpyHostToDevice); - preprocess_kernel_img_to_batch(src, GPU_DST_COLS, GPU_DST_ROWS, dst, INPUT_W, INPUT_H, HORIZONTAL_PATCHES * VERTICAL_PATCHES, patch_origins, 0); + preprocess_kernel_img_to_batch(src_device, GPU_DST_COLS, GPU_DST_ROWS, dst_device, INPUT_W, INPUT_H, HORIZONTAL_PATCHES * VERTICAL_PATCHES, patch_origins_device, 0); cudaMemcpy(&dst, dst_device, dst_size * sizeof(float), cudaMemcpyDeviceToHost); @@ -94,8 +97,10 @@ void preprocess_kernel_img_to_batch( float* dst, int dst_width, int dst_height, int batch_size, int* patch_origins, cudaStream_t stream) { - dim3 block(BLKX, BLKY, BLKZ); - dim3 grid(3 * batch_size / BLKX, dst_width / BLKY, dst_height / BLKZ); + // dim3 block(BLKX, BLKY, BLKZ); + // dim3 grid(3 * batch_size / BLKX, dst_width / BLKY, dst_height / BLKZ); + dim3 block(1, 1, 1); + dim3 grid(4, 4, 3); batching_kernel<<>>( src, src_width*3, src_width,