HoviTron Video Pipeline
KernelCuda.cu
/* ----------------------
* Copyright 2023 Université Libre de Bruxelles (ULB), Universidad Politécnica de Madrid (UPM), CREAL, Deutsches Zentrum für Luft- und Raumfahrt (DLR)

* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at <http://www.apache.org/licenses/LICENSE-2.0>

* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
---------------------- */

#include "KernelCuda.cuh"

__global__ void copy_color(cudaSurfaceObject_t RGB_vulkan, uchar* RGB_cuda, size_t baseWidth, size_t baseHeight)
{
    // Grid-stride loop: each thread processes every (blockDim.x * gridDim.x)-th pixel.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        // The source buffer is tightly packed, 4 bytes per pixel (BGRA order).
        uchar b = RGB_cuda[4 * i + 0];
        uchar g = RGB_cuda[4 * i + 1];
        uchar r = RGB_cuda[4 * i + 2];

        uchar4 color;
        color.x = b;
        color.y = g;
        color.z = r;
        color.w = 255; // force opaque alpha

        // surf2Dwrite expects the x coordinate as a byte offset, hence the sizeof(uchar4) factor.
        surf2Dwrite(color, RGB_vulkan, x * sizeof(uchar4), y);
    }
}

__global__ void copy_depth(cudaSurfaceObject_t D_vulkan, float* D_cuda, size_t baseWidth, size_t baseHeight)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        float depth = D_cuda[i];

        surf2Dwrite(depth, D_vulkan, x * sizeof(float), y);
    }
}

void copyColor(cudaSurfaceObject_t RGB_vulkan, uchar* RGB_cuda, size_t baseWidth, size_t baseHeight, cudaStream_t& stream) {
    // Copy the color buffer into the Vulkan-shared surface.
    copy_color<<<blockspergrid, threadsperblock, 0, stream>>>(RGB_vulkan, RGB_cuda, baseWidth, baseHeight);
    gpuErrchk(cudaGetLastError());
}

void copyDepth(cudaSurfaceObject_t D_vulkan, float* D_cuda, size_t baseWidth, size_t baseHeight, cudaStream_t& stream) {
    // Copy the depth buffer into the Vulkan-shared surface.
    copy_depth<<<blockspergrid, threadsperblock, 0, stream>>>(D_vulkan, D_cuda, baseWidth, baseHeight);
    gpuErrchk(cudaGetLastError());
}

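/* Usage sketch (hypothetical, not part of the pipeline): driving the two wrappers
 * from host code. It assumes a cudaSurfaceObject_t created elsewhere (in this
 * pipeline, from a Vulkan-shared image) and a device buffer of packed BGRA pixels;
 * the names `surface`, `d_bgra`, `width` and `height` are illustrative only. */
#if 0
void exampleCopyColor(cudaSurfaceObject_t surface, uchar* d_bgra, size_t width, size_t height)
{
    cudaStream_t stream;
    gpuErrchk(cudaStreamCreate(&stream));

    copyColor(surface, d_bgra, width, height, stream);

    // The launch is asynchronous; wait before the surface is consumed elsewhere.
    gpuErrchk(cudaStreamSynchronize(stream));
    gpuErrchk(cudaStreamDestroy(stream));
}
#endif
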
template<typename T> __global__ void remove_pitch(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels)
{
    // Repack a pitched image (nbInRow pixels per padded row) into a tightly packed buffer.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = data_in[(x + y * nbInRow) * channels + j];
        }
    }
}

__global__ void uShort_2_uChar(unsigned short* data_in, unsigned char* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        unsigned char tmp;
        // USHRT_MAX / UCHAR_MAX == 257: maps the full 16-bit range onto 8 bits.
        unsigned short scale = (USHRT_MAX / UCHAR_MAX);
        for (size_t j = 0; j < channels; j++) {
            tmp = (unsigned char)(data_in[(x + y * nbInRow) * channels + j] / scale);
            data_out[channels * i + j] = (tmp < UCHAR_MAX) ? tmp : UCHAR_MAX;
        }
    }
}

__global__ void uShort_2_Float(unsigned short* data_in, float* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;
        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = (float)data_in[(x + y * nbInRow) * channels + j];
        }
    }
}

template<typename T> void removePitch(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, cudaStream_t& stream) {
    remove_pitch<T><<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels);
    gpuErrchk(cudaGetLastError());
}

void uShort2uChar(unsigned short* data_in, unsigned char* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, cudaStream_t& stream) {
    uShort_2_uChar<<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels);
    gpuErrchk(cudaGetLastError());
}

void uShort2Float(unsigned short* data_in, float* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, cudaStream_t& stream) {
    uShort_2_Float<<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels);
    gpuErrchk(cudaGetLastError());
}

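/* Usage sketch (hypothetical): removing the row padding introduced by
 * cudaMallocPitch with removePitch<float>. For a single-channel image,
 * nbInRow is simply the pitch expressed in elements. All names here are
 * illustrative, not taken from the pipeline. */
#if 0
void exampleRemovePitch(size_t width, size_t height, cudaStream_t& stream)
{
    float* d_pitched = nullptr;
    float* d_packed = nullptr;
    size_t pitchBytes = 0;

    // cudaMallocPitch may pad each row; pitchBytes is the padded row size in bytes.
    gpuErrchk(cudaMallocPitch((void**)&d_pitched, &pitchBytes, width * sizeof(float), height));
    gpuErrchk(cudaMalloc((void**)&d_packed, width * height * sizeof(float)));

    // channels == 1; nbInRow is the padded row length in elements.
    removePitch<float>(d_pitched, d_packed, width, height, pitchBytes / sizeof(float), 1, stream);

    gpuErrchk(cudaStreamSynchronize(stream));
    gpuErrchk(cudaFree(d_pitched));
    gpuErrchk(cudaFree(d_packed));
}
#endif
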
template<typename T> __global__ void scale_data_array(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float scale)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = data_in[(x + y * nbInRow) * channels + j] * scale;
        }
    }
}

template<typename T> void scaleDataArray(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float scale, cudaStream_t& stream) {
    scale_data_array<T><<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels, scale);
    gpuErrchk(cudaGetLastError());
}

template<typename T> __global__ void add_offset_data_array(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float offset)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        for (size_t j = 0; j < channels; j++) {
            data_out[channels * i + j] = data_in[(x + y * nbInRow) * channels + j] + offset;
        }
    }
}

template<typename T> void addOffsetDataArray(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, float offset, cudaStream_t& stream) {
    add_offset_data_array<T><<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels, offset);
    gpuErrchk(cudaGetLastError());
}

template<typename T> __global__ void scale_data_array_uchannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale) {
    // Per-channel variant: only channel nbOfChannel of the interleaved input is read.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        data_out[i + nbOfChannel] = data_in[(x + y * nbInRow) * channels + nbOfChannel] * scale;
    }
}

template<typename T> void scaleDataArrayUChannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale, cudaStream_t& stream) {
    scale_data_array_uchannel<T><<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels, nbOfChannel, scale);
    gpuErrchk(cudaGetLastError());
}

template<typename T> __global__ void scale_add_data_array_uchannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale, float offset) {
    // Same as scale_data_array_uchannel, but the offset is applied before scaling.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        data_out[i + nbOfChannel] = (data_in[(x + y * nbInRow) * channels + nbOfChannel] + offset) * scale;
    }
}

template<typename T> void scaleAddDataArrayUChannel(T* data_in, T* data_out, size_t baseWidth, size_t baseHeight, size_t nbInRow, size_t channels, size_t nbOfChannel, float scale, float offset, cudaStream_t& stream) {
    scale_add_data_array_uchannel<T><<<blockspergrid, threadsperblock, 0, stream>>>(data_in, data_out, baseWidth, baseHeight, nbInRow, channels, nbOfChannel, scale, offset);
    gpuErrchk(cudaGetLastError());
}

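/* Usage sketch (hypothetical): rescaling one channel of an interleaved 4-channel
 * float image, e.g. mapping raw values v to (v + offset) * scale. Buffer names
 * and parameter values are illustrative only. */
#if 0
void exampleScaleChannel(float* d_in, float* d_out, size_t width, size_t height, cudaStream_t& stream)
{
    const size_t channels = 4;         // interleaved 4-channel layout
    const size_t channel = 0;          // channel to rescale
    const float scale = 1.0f / 255.0f;
    const float offset = -0.5f;

    // Input assumed tightly packed, so nbInRow == width.
    scaleAddDataArrayUChannel<float>(d_in, d_out, width, height, width, channels, channel, scale, offset, stream);
}
#endif
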
template<typename T> __global__ void temporal_consistency_adjustement(T* prev_depth, T* curr_depth, size_t baseWidth, size_t baseHeight, unsigned int sizePatch, float treshold, float adjustementFactor) {

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < baseWidth * baseHeight; i += blockDim.x * gridDim.x)
    {
        int x = i % baseWidth;
        int y = i / baseWidth;

        // Skip pixels whose patch would reach outside the image. The casts keep the
        // checks in signed arithmetic (x - sizePatch would otherwise wrap around,
        // because sizePatch is unsigned, and the test would never trigger).
        if (x + (int)sizePatch >= (int)baseWidth || x - (int)sizePatch < 0 || y + (int)sizePatch >= (int)baseHeight || y - (int)sizePatch < 0) {
            continue;
        }

        // Mean absolute depth difference over the (2 * sizePatch)-wide square window.
        float avg = 0.0f;
        for (int x_win = x - (int)sizePatch; x_win < x + (int)sizePatch; x_win++) {
            for (int y_win = y - (int)sizePatch; y_win < y + (int)sizePatch; y_win++) {
                avg += fabsf(curr_depth[x_win + y_win * baseWidth] - prev_depth[x_win + y_win * baseWidth]);
            }
        }
        avg /= (float)(4 * sizePatch * sizePatch); // the window holds (2 * sizePatch)^2 samples

        // Where the depth barely changed, blend toward the previous frame to reduce flicker.
        if (avg < treshold) {
            curr_depth[x + y * baseWidth] = (1.0f - adjustementFactor) * curr_depth[x + y * baseWidth] + adjustementFactor * prev_depth[x + y * baseWidth];
        }
    }
}

template<typename T> void temporalConsistencyAdjustement(T* prev_depth, T* curr_depth, size_t baseWidth, size_t baseHeight, unsigned int sizePatch, float treshold, float adjustementFactor, cudaStream_t& stream) {
    temporal_consistency_adjustement<T><<<blockspergrid, threadsperblock, 0, stream>>>(prev_depth, curr_depth, baseWidth, baseHeight, sizePatch, treshold, adjustementFactor);
    gpuErrchk(cudaGetLastError());
    // Keep a copy of the adjusted depth as the reference for the next frame.
    gpuErrchk(cudaMemcpyAsync(prev_depth, curr_depth, sizeof(T) * baseHeight * baseWidth, cudaMemcpyDeviceToDevice, stream));
}

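/* Usage sketch (hypothetical): calling the temporal filter once per frame.
 * prev_depth persists across frames and is refreshed by the wrapper itself via
 * the device-to-device copy above. Names and parameter values are illustrative. */
#if 0
void exampleTemporalFilter(float* d_prev, float* d_curr, size_t width, size_t height, cudaStream_t& stream)
{
    const unsigned int patch = 2;   // half-size of the comparison window
    const float threshold = 0.05f;  // maximum mean difference treated as "static"
    const float factor = 0.5f;      // blend weight toward the previous frame

    temporalConsistencyAdjustement<float>(d_prev, d_curr, width, height, patch, threshold, factor, stream);
}
#endif
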
// !!! WARNING !!! This function must never be called. It exists only to force the
// compiler to instantiate the templated host wrappers for the types the pipeline
// uses (the template definitions live in this .cu file, so they cannot be
// instantiated implicitly from other translation units). !!! WARNING !!!
[[noreturn]] void unreachable_kernel() {
    throw HVT_ERROR_INVALID_HANDLE;

    // Dummy lvalue used to materialize the (never dereferenced) stream references below.
    int null = NULL;

    removePitch<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));
    removePitch<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, (cudaStream_t&)(null));

    scaleAddDataArrayUChannel<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    scaleAddDataArrayUChannel<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));

    scaleDataArrayUChannel<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArrayUChannel<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));

    addOffsetDataArray<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    addOffsetDataArray<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));

    scaleDataArray<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));
    scaleDataArray<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0, 0.0f, (cudaStream_t&)(null));

    temporalConsistencyAdjustement<float>((float*) nullptr, (float*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<int>((int*) nullptr, (int*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<uchar>((uchar*) nullptr, (uchar*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<USHORT>((USHORT*) nullptr, (USHORT*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
    temporalConsistencyAdjustement<double>((double*) nullptr, (double*) nullptr, 0, 0, 0, 0.0f, 0.0f, (cudaStream_t&)(null));
}