1/* ----------------------
* Copyright 2023 Université Libre de Bruxelles (ULB), Universidad Politécnica de Madrid (UPM), CREAL, Deutsches Zentrum für Luft- und Raumfahrt (DLR)
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
8* Unless required by applicable law or agreed to in writing, software
9* distributed under the License is distributed on an "AS IS" BASIS,
10* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
12* limitations under the License.
13---------------------- */
16#include "KernelCuda.cuh"
// Copy a tightly packed 4-byte-per-pixel B,G,R(,A) uchar buffer into the
// CUDA surface shared with Vulkan, converting to RGBA on the way.
// Grid-stride loop: correct for any launch configuration.
__global__ void copy_color(cudaSurfaceObject_t RGB_vulkan, uchar* RGB_cuda, size_t baseWidth, size_t baseHeight)
{
    // size_t index avoids 32-bit overflow of i and of 4 * i for large images.
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += (size_t)blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        // Source buffer is interleaved B,G,R(,A).
        uchar b = RGB_cuda[4 * i + 0];
        uchar g = RGB_cuda[4 * i + 1];
        uchar r = RGB_cuda[4 * i + 2];

        // Repack as RGBA with opaque alpha for the surface write.
        // NOTE(review): source alpha (RGB_cuda[4 * i + 3]) is discarded — confirm intent.
        uchar4 color = make_uchar4(r, g, b, 255);
        surf2Dwrite(color, RGB_vulkan, x * sizeof(uchar4), y);
    }
}
// Copy a tightly packed float depth buffer into the CUDA surface shared
// with Vulkan. Grid-stride loop: correct for any launch configuration.
__global__ void copy_depth(cudaSurfaceObject_t D_vulkan, float* D_cuda, size_t baseWidth, size_t baseHeight)
{
    // size_t index avoids 32-bit overflow for very large images.
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += (size_t)blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        float depth = D_cuda[i];
        surf2Dwrite(depth, D_vulkan, x * sizeof(float), y);
    }
}
// Host-side launcher for copy_color on the given stream.
// blockspergrid / threadsperblock are presumably defined in KernelCuda.cuh — TODO confirm.
void copyColor(cudaSurfaceObject_t RGB_vulkan, uchar* RGB_cuda, size_t baseWidth, size_t baseHeight, cudaStream_t& stream) {
    copy_color << <blockspergrid, threadsperblock, 0, stream >> > (RGB_vulkan, RGB_cuda, baseWidth, baseHeight);
    // Catch launch-configuration errors here; execution errors surface at the next sync.
    gpuErrchk(cudaGetLastError());
}
// Host-side launcher for copy_depth on the given stream.
// blockspergrid / threadsperblock are presumably defined in KernelCuda.cuh — TODO confirm.
void copyDepth(cudaSurfaceObject_t D_vulkan, float* D_cuda, size_t baseWidth, size_t baseHeight, cudaStream_t& stream) {
    copy_depth << <blockspergrid, threadsperblock, 0, stream >> > (D_vulkan, D_cuda, baseWidth, baseHeight);
    // Catch launch-configuration errors here; execution errors surface at the next sync.
    gpuErrchk(cudaGetLastError());
}
// Repack a pitched 2D buffer (nbInRow elements of `channels` interleaved
// components per stored row) into a tightly packed baseWidth x baseHeight
// buffer. Grid-stride loop over pixels.
template<typename T> __global__ void remove_pitch(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels)
{
    // size_t index avoids 32-bit overflow for very large images.
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += (size_t)blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = data_in[(x + y * nbInRow) * channels + j];
        }
    }
}
// Convert a pitched unsigned-short buffer to a packed unsigned-char buffer,
// dividing by USHRT_MAX / UCHAR_MAX (== 257) to map [0, 65535] onto [0, 255].
__global__ void uShort_2_uChar(unsigned short* data_in, unsigned char* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels)
{
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += (size_t)blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        // NOTE(review): tmp declaration reconstructed — original line lost in extraction.
        unsigned int tmp;
        // Integer ratio: 65535 / 255 == 257.
        unsigned short scale = (USHRT_MAX / UCHAR_MAX);
        for (size_t j = 0; j < channels; j++) {
            tmp = (unsigned char)(data_in[(x + y * nbInRow) * channels + j] / scale);
            // Defensive clamp; the division already bounds tmp to [0, 255].
            data_out[channels * i + j] = (tmp < UCHAR_MAX) ? tmp : UCHAR_MAX;
        }
    }
}
// Convert a pitched unsigned-short buffer to a packed float buffer
// (plain value cast, no rescaling).
__global__ void uShort_2_Float(unsigned short* data_in, float* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels)
{
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += (size_t)blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;
        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = (float)data_in[(x + y * nbInRow) * channels + j];
        }
    }
}
// Host-side launcher for remove_pitch<T> on the given stream.
template<typename T> void removePitch(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, cudaStream_t& stream) {
    remove_pitch<T> << <blockspergrid, threadsperblock, 0, stream >> > (data_in, data_out, baseWidth, baseHeight, nbInRow, channels);
    // Catch launch-configuration errors here; execution errors surface at the next sync.
    gpuErrchk(cudaGetLastError());
}
// Host-side launcher for uShort_2_uChar on the given stream.
void uShort2uChar(unsigned short* data_in, unsigned char* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, cudaStream_t& stream) {
    uShort_2_uChar << <blockspergrid, threadsperblock, 0, stream >> > (data_in, data_out, baseWidth, baseHeight, nbInRow, channels);
    gpuErrchk(cudaGetLastError());
}
// Host-side launcher for uShort_2_Float on the given stream.
void uShort2Float(unsigned short* data_in, float* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, cudaStream_t& stream) {
    uShort_2_Float << <blockspergrid, threadsperblock, 0, stream >> > (data_in, data_out, baseWidth, baseHeight, nbInRow, channels);
    gpuErrchk(cudaGetLastError());
}
// Repack a pitched buffer while multiplying every component by `scale`.
// For integer T the scaled value is truncated on the store back to T.
template<typename T> __global__ void scale_data_array(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float scale)
{
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += (size_t)blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = data_in[(x + y * nbInRow) * channels + j] * scale;
        }
    }
}
// Host-side launcher for scale_data_array<T> on the given stream.
template<typename T> void scaleDataArray(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float scale, cudaStream_t& stream) {
    scale_data_array<T> << <blockspergrid, threadsperblock, 0, stream >> > (data_in, data_out, baseWidth, baseHeight, nbInRow, channels, scale);
    gpuErrchk(cudaGetLastError());
}
// Repack a pitched buffer while adding `offset` to every component.
// For integer T the offset value is truncated on the store back to T.
template<typename T> __global__ void add_offset_data_array(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float offset)
{
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += (size_t)blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = data_in[(x + y * nbInRow) * channels + j] + offset;
        }
    }
}
// Host-side launcher for add_offset_data_array<T> on the given stream.
template<typename T> void addOffsetDataArray(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float offset, cudaStream_t& stream) {
    add_offset_data_array<T> << <blockspergrid, threadsperblock, 0, stream >> > (data_in, data_out, baseWidth, baseHeight, nbInRow, channels, offset);
    gpuErrchk(cudaGetLastError());
}
// Extract channel `nbOfChannel` from a pitched interleaved buffer, scaling it
// by `scale` on the way out.
template<typename T> __global__ void scale_data_array_uchannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale) {
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += (size_t)blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        // NOTE(review): output index is `i + nbOfChannel`, not
        // `channels * i + nbOfChannel`; for nbOfChannel > 0 this merely shifts
        // a single-channel plane by nbOfChannel elements. Confirm against
        // callers before changing — kept as-is to preserve behavior.
        data_out[i + nbOfChannel] = data_in[(x + y * nbInRow) * channels + nbOfChannel] * scale;
    }
}
// Host-side launcher for scale_data_array_uchannel<T> on the given stream.
template<typename T> void scaleDataArrayUChannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale, cudaStream_t& stream) {
    scale_data_array_uchannel<T> << <blockspergrid, threadsperblock, 0, stream >> > (data_in, data_out, baseWidth, baseHeight, nbInRow, channels, nbOfChannel, scale);
    gpuErrchk(cudaGetLastError());
}
// Extract channel `nbOfChannel` from a pitched interleaved buffer, applying
// (value + offset) * scale on the way out.
template<typename T> __global__ void scale_add_data_array_uchannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale, float offset) {
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += (size_t)blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        // NOTE(review): same output indexing question as scale_data_array_uchannel
        // (`i + nbOfChannel` vs `channels * i + nbOfChannel`) — kept as-is to
        // preserve behavior; confirm against callers.
        data_out[i + nbOfChannel] = (data_in[(x + y * nbInRow) * channels + nbOfChannel] + offset) * scale;
    }
}
// Host-side launcher for scale_add_data_array_uchannel<T> on the given stream.
template<typename T> void scaleAddDataArrayUChannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale, float offset, cudaStream_t& stream) {
    scale_add_data_array_uchannel<T> << <blockspergrid, threadsperblock, 0, stream >> > (data_in, data_out, baseWidth, baseHeight, nbInRow, channels, nbOfChannel, scale, offset);
    gpuErrchk(cudaGetLastError());
}
// Temporal flicker suppression: where the absolute depth change inside a
// (2*sizePatch) x (2*sizePatch) window around a pixel is small (below
// `treshold` after the divide below), blend the current depth toward the
// previous frame's depth by `adjustementFactor`.
template<typename T> __global__ void temporal_consistency_adjustement(T* prev_depth, T* curr_depth, size_t baseWidth, size_t baseHeight, unsigned int sizePatch, float treshold, float adjustementFactor) {
    // Signed copy of sizePatch: in the original border test, `x - sizePatch`
    // promoted to unsigned, so `x - sizePatch < 0` was always false and
    // border pixels read out of bounds. Bug fix.
    int patch = (int)sizePatch;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += (size_t)blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        // Skip pixels whose window would leave the image.
        if (x + patch >= (int)baseWidth || x - patch < 0 || y + patch >= (int)baseHeight || y - patch < 0) {
            continue;
        }

        float avg = 0.0f;
        for (int x_win = x - patch; x_win < x + patch; x_win++) {
            for (int y_win = y - patch; y_win < y + patch; y_win++) {
                avg += fabsf(curr_depth[x_win + y_win * baseWidth] - prev_depth[x_win + y_win * baseWidth]);
            }
        }
        // NOTE(review): the window holds (2*sizePatch)^2 samples but the sum is
        // divided by sizePatch^2, so `treshold` is effectively compared against
        // 4x the true mean — kept as-is; confirm intent before changing.
        avg /= (float)(sizePatch * sizePatch);
        if (avg < treshold) {
            curr_depth[x + y * baseWidth] = (1.0f - adjustementFactor) * curr_depth[x + y * baseWidth] + adjustementFactor * prev_depth[x + y * baseWidth];
        }
    }
}
// Launch the temporal filter, then asynchronously save the adjusted current
// depth as the "previous" frame for the next call. The copy is enqueued on
// the same stream, so it is ordered after the kernel.
template<typename T> void temporalConsistencyAdjustement(T* prev_depth, T* curr_depth, size_t baseWidth, size_t baseHeight, unsigned int sizePatch, float treshold, float adjustementFactor, cudaStream_t& stream) {
    temporal_consistency_adjustement<T> << <blockspergrid, threadsperblock, 0, stream >> > (prev_depth, curr_depth, baseWidth, baseHeight, sizePatch, treshold, adjustementFactor);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaMemcpyAsync(prev_depth, curr_depth, sizeof(T) * baseHeight * baseWidth, cudaMemcpyDeviceToDevice, stream));
}
// !!! WARNING !!! THIS FUNCTION SHOULD NEVER BE USED - ONLY THERE TO FORCE THE COMPILER TO CREATE FUNCTION IN DIFFERENT TYPES !!! WARNING !!!
// Instantiation anchor: the calls below force the compiler to emit the
// template wrappers for every supported element type so other translation
// units can link against them. The throw makes every call unreachable.
// NOTE(review): `null` is presumably a project macro (likely nullptr) from
// KernelCuda.cuh — TODO confirm; consider explicit template instantiation
// (`template void removePitch<float>(...);`) instead of dead calls.
[[noreturn]] void unreachable_kernel() {
    throw HVT_ERROR_INVALID_HANDLE;

    removePitch<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));

    scaleAddDataArrayUChannel<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));

    scaleDataArrayUChannel<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));

    addOffsetDataArray<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));

    scaleDataArray<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));

    temporalConsistencyAdjustement<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
}