HoviTron Video Pipeline
KernelCuda.cu
/* ----------------------
* Copyright 2023 Université Libre de Bruxelles (ULB), Universidad Politécnica de Madrid (UPM), CREAL, Deutsches Zentrum für Luft- und Raumfahrt (DLR)

* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at <http://www.apache.org/licenses/LICENSE-2.0>

* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
---------------------- */

#include "KernelCuda.cuh"

__global__ void copy_color(cudaSurfaceObject_t RGB_vulkan, uchar* RGB_cuda, size_t baseWidth, size_t baseHeight)
{
    // Grid-stride loop: each thread processes every (blockDim.x * gridDim.x)-th pixel.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        // The source buffer is tightly packed, 4 bytes per pixel (BGRA order).
        uchar b = RGB_cuda[4 * i + 0];
        uchar g = RGB_cuda[4 * i + 1];
        uchar r = RGB_cuda[4 * i + 2];

        uchar4 color;
        color.x = b;
        color.y = g;
        color.z = r;
        color.w = 255; // force opaque alpha

        // surf2Dwrite expects the x coordinate as a byte offset, hence the sizeof(uchar4) factor.
        surf2Dwrite(color, RGB_vulkan, x * sizeof(uchar4), y);
    }
}

__global__ void copy_depth(cudaSurfaceObject_t D_vulkan, float* D_cuda, size_t baseWidth, size_t baseHeight)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        float depth = D_cuda[i];

        surf2Dwrite(depth, D_vulkan, x * sizeof(float), y);
    }
}

void copyColor(cudaSurfaceObject_t RGB_vulkan, uchar* RGB_cuda, size_t baseWidth, size_t baseHeight, cudaStream_t& stream) {
    // Copy the color buffer into the Vulkan-shared surface.
    copy_color<<<blockspergrid, threadsperblock, 0, stream>>>(RGB_vulkan, RGB_cuda, baseWidth, baseHeight);
    gpuErrchk(cudaGetLastError());
}

void copyDepth(cudaSurfaceObject_t D_vulkan, float* D_cuda, size_t baseWidth, size_t baseHeight, cudaStream_t& stream) {
    // Copy the depth buffer into the Vulkan-shared surface.
    copy_depth<<<blockspergrid, threadsperblock, 0, stream>>>(D_vulkan, D_cuda, baseWidth, baseHeight);
    gpuErrchk(cudaGetLastError());
}

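/* Usage sketch (hypothetical, not part of the pipeline): driving the two wrappers
 * from host code. It assumes a cudaSurfaceObject_t created elsewhere (in this
 * pipeline, from a Vulkan-shared image) and a device buffer of packed BGRA pixels;
 * the names `surface`, `d_bgra`, `width` and `height` are illustrative only. */
#if 0
void exampleCopyColor(cudaSurfaceObject_t surface, uchar* d_bgra, size_t width, size_t height)
{
    cudaStream_t stream;
    gpuErrchk(cudaStreamCreate(&stream));

    copyColor(surface, d_bgra, width, height, stream);

    // The launch is asynchronous; wait before the surface is consumed elsewhere.
    gpuErrchk(cudaStreamSynchronize(stream));
    gpuErrchk(cudaStreamDestroy(stream));
}
#endif
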
template<typename T> __global__ void remove_pitch(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels)
{
    // Repack a pitched image (nbInRow pixels per padded row) into a tightly packed buffer.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = data_in[(x + y * nbInRow) * channels + j];
        }
    }
}

__global__ void uShort_2_uChar(unsigned short* data_in, unsigned char* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        unsigned char tmp;
        // USHRT_MAX / UCHAR_MAX == 257: maps the full 16-bit range onto 8 bits.
        unsigned short scale = (USHRT_MAX / UCHAR_MAX);
        for (size_t j = 0; j < channels; j++) {
            tmp = (unsigned char)(data_in[(x + y * nbInRow) * channels + j] / scale);
            data_out[channels * i + j] = (tmp < UCHAR_MAX) ? tmp : UCHAR_MAX;
        }
    }
}

__global__ void uShort_2_Float(unsigned short* data_in, float* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;
        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = (float)data_in[(x + y * nbInRow) * channels + j];
        }
    }
}

template<typename T> void removePitch(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, cudaStream_t& stream) {
    remove_pitch<T><<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels);
    gpuErrchk(cudaGetLastError());
}

void uShort2uChar(unsigned short* data_in, unsigned char* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, cudaStream_t& stream) {
    uShort_2_uChar<<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels);
    gpuErrchk(cudaGetLastError());
}

void uShort2Float(unsigned short* data_in, float* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, cudaStream_t& stream) {
    uShort_2_Float<<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels);
    gpuErrchk(cudaGetLastError());
}

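/* Usage sketch (hypothetical): removing the row padding introduced by
 * cudaMallocPitch with removePitch<float>. For a single-channel image,
 * nbInRow is simply the pitch expressed in elements. All names here are
 * illustrative, not taken from the pipeline. */
#if 0
void exampleRemovePitch(size_t width, size_t height, cudaStream_t& stream)
{
    float* d_pitched = nullptr;
    float* d_packed = nullptr;
    size_t pitchBytes = 0;

    // cudaMallocPitch may pad each row; pitchBytes is the padded row size in bytes.
    gpuErrchk(cudaMallocPitch((void**)&d_pitched, &pitchBytes, width * sizeof(float), height));
    gpuErrchk(cudaMalloc((void**)&d_packed, width * height * sizeof(float)));

    // channels == 1; nbInRow is the padded row length in elements.
    removePitch<float>(d_pitched, d_packed, width, height, pitchBytes / sizeof(float), 1, stream);

    gpuErrchk(cudaStreamSynchronize(stream));
    gpuErrchk(cudaFree(d_pitched));
    gpuErrchk(cudaFree(d_packed));
}
#endif
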
template<typename T> __global__ void scale_data_array(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float scale)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = data_in[(x + y * nbInRow) * channels + j] * scale;
        }
    }
}

template<typename T> void scaleDataArray(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float scale, cudaStream_t& stream) {
    scale_data_array<T><<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels, scale);
    gpuErrchk(cudaGetLastError());
}

template<typename T> __global__ void add_offset_data_array(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float offset)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = data_in[(x + y * nbInRow) * channels + j] + offset;
        }
    }
}

template<typename T> void addOffsetDataArray(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float offset, cudaStream_t& stream) {
    add_offset_data_array<T><<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels, offset);
    gpuErrchk(cudaGetLastError());
}

template<typename T> __global__ void scale_data_array_uchannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale) {
    // Per-channel variant: only channel nbOfChannel of the interleaved input is read.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        data_out[i + nbOfChannel] = data_in[(x + y * nbInRow) * channels + nbOfChannel] * scale;
    }
}

template<typename T> void scaleDataArrayUChannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale, cudaStream_t& stream) {
    scale_data_array_uchannel<T><<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels, nbOfChannel, scale);
    gpuErrchk(cudaGetLastError());
}

template<typename T> __global__ void scale_add_data_array_uchannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale, float offset) {
    // Same as scale_data_array_uchannel, but the offset is applied before scaling.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        data_out[i + nbOfChannel] = (data_in[(x + y * nbInRow) * channels + nbOfChannel] + offset) * scale;
    }
}

template<typename T> void scaleAddDataArrayUChannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale, float offset, cudaStream_t& stream) {
    scale_add_data_array_uchannel<T><<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels, nbOfChannel, scale, offset);
    gpuErrchk(cudaGetLastError());
}

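/* Usage sketch (hypothetical): rescaling one channel of an interleaved 4-channel
 * float image, e.g. mapping raw values v to (v + offset) * scale. Buffer names
 * and parameter values are illustrative only. */
#if 0
void exampleScaleChannel(float* d_in, float* d_out, size_t width, size_t height, cudaStream_t& stream)
{
    const size_t channels = 4;         // interleaved 4-channel layout
    const size_t channel = 0;          // channel to rescale
    const float scale = 1.0f / 255.0f;
    const float offset = -0.5f;

    // Input assumed tightly packed, so nbInRow == width.
    scaleAddDataArrayUChannel<float>(d_in, d_out, width, height, width, channels, channel, scale, offset, stream);
}
#endif
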
template<typename T> __global__ void temporal_consistency_adjustement(T* prev_depth, T* curr_depth, size_t baseWidth, size_t baseHeight, unsigned int sizePatch, float treshold, float adjustementFactor) {

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        // Skip pixels whose patch would reach outside the image. The casts keep the
        // checks in signed arithmetic (x - sizePatch would otherwise wrap around,
        // because sizePatch is unsigned, and the test would never trigger).
        if (x + (int)sizePatch >= (int)baseWidth || x - (int)sizePatch < 0 || y + (int)sizePatch >= (int)baseHeight || y - (int)sizePatch < 0) {
            continue;
        }

        // Mean absolute depth difference over the (2 * sizePatch)-wide square window.
        float avg = 0.0f;
        for (int x_win = x - (int)sizePatch; x_win < x + (int)sizePatch; x_win++) {
            for (int y_win = y - (int)sizePatch; y_win < y + (int)sizePatch; y_win++) {
                avg += fabsf(curr_depth[x_win + y_win * baseWidth] - prev_depth[x_win + y_win * baseWidth]);
            }
        }
        avg /= (float)(4 * sizePatch * sizePatch); // the window holds (2 * sizePatch)^2 samples

        // Where the depth barely changed, blend toward the previous frame to reduce flicker.
        if (avg < treshold) {
            curr_depth[x + y * baseWidth] = (1.0f - adjustementFactor) * curr_depth[x + y * baseWidth] + adjustementFactor * prev_depth[x + y * baseWidth];
        }
    }
}

template<typename T> void temporalConsistencyAdjustement(T* prev_depth, T* curr_depth, size_t baseWidth, size_t baseHeight, unsigned int sizePatch, float treshold, float adjustementFactor, cudaStream_t& stream) {
    temporal_consistency_adjustement<T><<<blockspergrid, threadsperblock, 0, stream>>>(prev_depth, curr_depth, baseWidth, baseHeight, sizePatch, treshold, adjustementFactor);
    gpuErrchk(cudaGetLastError());
    // Keep a copy of the adjusted depth as the reference for the next frame.
    gpuErrchk(cudaMemcpyAsync(prev_depth, curr_depth, sizeof(T) * baseHeight * baseWidth, cudaMemcpyDeviceToDevice, stream));
}

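/* Usage sketch (hypothetical): calling the temporal filter once per frame.
 * prev_depth persists across frames and is refreshed by the wrapper itself via
 * the device-to-device copy above. Names and parameter values are illustrative. */
#if 0
void exampleTemporalFilter(float* d_prev, float* d_curr, size_t width, size_t height, cudaStream_t& stream)
{
    const unsigned int patch = 2;   // half-size of the comparison window
    const float threshold = 0.05f;  // maximum mean difference treated as "static"
    const float factor = 0.5f;      // blend weight toward the previous frame

    temporalConsistencyAdjustement<float>(d_prev, d_curr, width, height, patch, threshold, factor, stream);
}
#endif
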
// !!! WARNING !!! This function must never be called. It exists only to force the
// compiler to instantiate the templated host wrappers for the types the pipeline
// uses (the template definitions live in this .cu file, so they cannot be
// instantiated implicitly from other translation units). !!! WARNING !!!
[[noreturn]] void unreachable_kernel() {
    throw HVT_ERROR_INVALID_HANDLE;

    // Dummy lvalue used to materialize the (never dereferenced) stream references below.
    int null = NULL;

    removePitch<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));

    scaleAddDataArrayUChannel<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));

    scaleDataArrayUChannel<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));

    addOffsetDataArray<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));

    scaleDataArray<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));

    temporalConsistencyAdjustement<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
}