Avoid loop in case intrinsic coefficients are zero

IntelRealSense · Jan 7, 2025 · e338df2 · e338df2
1 parent d26ec5c
commit e338df2
Show file tree

Hide file tree

Showing 14 changed files with 26 additions and 68 deletions.
diff --git a/CMake/lrs_options.cmake b/CMake/lrs_options.cmake
@@ -42,7 +42,7 @@ if (NOT APPLE)
 else()
     option(CHECK_FOR_UPDATES "Checks for versions updates" OFF) 
 endif()
-option(BUILD_WITH_CPU_EXTENSIONS "Enable compiler optimizations using CPU extensions (such as AVX)" OFF)
+option(BUILD_WITH_CPU_EXTENSIONS "Enable compiler optimizations using CPU extensions (such as AVX)" ON)
 set(UNIT_TESTS_ARGS "" CACHE STRING "Command-line arguments to pass to unit-tests-config.py, e.g. '-t <tag> -r <regex>'")
 #Performance improvement with Ubuntu 18/20
 if(UNIX AND (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED))

diff --git a/examples/measure/rs-measure.cpp b/examples/measure/rs-measure.cpp
@@ -15,7 +15,6 @@
 #include <thread>
 #include <atomic>
 #include <mutex>
-#include <rsutils/easylogging/easyloggingpp.h>
 
 using pixel = std::pair<int, int>;
 
@@ -101,7 +100,6 @@ void render_simple_distance(const rs2::depth_frame& depth,
 
 int main(int argc, char * argv[]) try
 {
-    rs2::log_to_console(RS2_LOG_SEVERITY_ERROR);
     auto settings = rs2::cli( "rs-measure example" )
         .process( argc, argv );
 
@@ -293,24 +291,9 @@ float dist_3d(const rs2::depth_frame& frame, pixel u, pixel v)
 
     // Deproject from pixel to point in 3D
     rs2_intrinsics intr = frame.get_profile().as<rs2::video_stream_profile>().get_intrinsics(); // Calibration data
-    // Get the starting time point
-    auto start = std::chrono::high_resolution_clock::now();
-
     rs2_deproject_pixel_to_point(upoint, &intr, upixel, udist);
-
-    // Get the ending time point
-    auto end = std::chrono::high_resolution_clock::now();
-
-    // Calculate the elapsed time in milliseconds
-    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-
-    // Output the elapsed time
-    LOG_ERROR(duration.count());
-
     rs2_deproject_pixel_to_point(vpoint, &intr, vpixel, vdist);
 
-
-
     // Calculate euclidean distance between the two points
     return sqrt(pow(upoint[0] - vpoint[0], 2.f) +
                 pow(upoint[1] - vpoint[1], 2.f) +

diff --git a/src/image-avx.cpp b/src/image-avx.cpp
@@ -6,7 +6,7 @@
 #include "image-avx.h"
 
 #ifndef ANDROID
-    #if defined(__SSE4__) && defined(__AVX2__)
+    #if defined(__SSSE3__) && defined(__AVX2__)
     #include <tmmintrin.h> // For SSE3 intrinsic used in unpack_yuy2_sse
     #include <immintrin.h>
 

diff --git a/src/image-avx.h b/src/image-avx.h
@@ -10,7 +10,7 @@
 namespace librealsense
 {
 #ifndef ANDROID
-    #if defined(__SSE4__) && defined(__AVX2__)
+    #if defined(__SSSE3__) && defined(__AVX2__)
     void unpack_yuy2_avx_y8(uint8_t * const d[], const uint8_t * s, int n);
     void unpack_yuy2_avx_y16(uint8_t * const d[], const uint8_t * s, int n);
     void unpack_yuy2_avx_rgb8(uint8_t * const d[], const uint8_t * s, int n);

diff --git a/src/proc/align.cpp b/src/proc/align.cpp
@@ -13,7 +13,7 @@
 
 #if defined(RS2_USE_CUDA)
 #include "proc/cuda/cuda-align.h"
-#elif defined(__SSE4__)
+#elif defined(__SSSE3__)
 #include "proc/sse/sse-align.h"
 #endif
 #include "proc/neon/neon-align.h"
@@ -26,7 +26,7 @@ namespace librealsense
     {
         #if defined(RS2_USE_CUDA)
             return std::make_shared<librealsense::align_cuda>(align_to);
-        #elif defined(__SSE4__)
+        #elif defined(__SSSE3__)
             return std::make_shared<librealsense::align_sse>(align_to);
         #elif defined(__ARM_NEON) && ! defined(ANDROID)
             return std::make_shared<librealsense::align_neon>(align_to);
@@ -39,7 +39,6 @@ namespace librealsense
     void align_images(const rs2_intrinsics& depth_intrin, const rs2_extrinsics& depth_to_other,
         const rs2_intrinsics& other_intrin, GET_DEPTH get_depth, TRANSFER_PIXEL transfer_pixel)
     {
-        auto start = std::chrono::high_resolution_clock::now();
         // Iterate over the pixels of the depth image
 #pragma omp parallel for schedule(dynamic)
         for (int depth_y = 0; depth_y < depth_intrin.height; ++depth_y)
@@ -52,11 +51,7 @@ namespace librealsense
                 {
                     // Map the top-left corner of the depth pixel onto the other image
                     float depth_pixel[2] = { depth_x - 0.5f, depth_y - 0.5f }, depth_point[3], other_point[3], other_pixel[2];
-
-
                     rs2_deproject_pixel_to_point(depth_point, &depth_intrin, depth_pixel, depth);
-
-
                     rs2_transform_point_to_point(other_point, &depth_to_other, depth_point);
                     rs2_project_point_to_pixel(other_pixel, &other_intrin, other_point);
                     const int other_x0 = static_cast<int>(other_pixel[0] + 0.5f);
@@ -84,10 +79,6 @@ namespace librealsense
                 }
             }
         }
-        auto end = std::chrono::high_resolution_clock::now();
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
-        // Output the duration
-        std::cout << duration.count() << "\n";
     }
 
     align::align(rs2_stream to_stream) : align(to_stream, "Align")

diff --git a/src/proc/color-formats-converter.cpp b/src/proc/color-formats-converter.cpp
@@ -14,7 +14,7 @@
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
 #endif
-#ifdef __SSE4__
+#ifdef __SSSE3__
 #include <tmmintrin.h> // For SSSE3 intrinsics
 #endif
 #include "neon/image-neon.h"
@@ -60,7 +60,7 @@ namespace librealsense
         rscuda::unpack_yuy2_cuda<FORMAT>(d, s, n);
         return;
 #endif
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
         static bool do_avx = has_avx();
 #ifdef __AVX2__
 
@@ -477,7 +477,7 @@ namespace librealsense
         }
     }
 
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
     // This method receives 1 line of y and one line of uv.
     // source_chunks_y  // yyyyyyyyyyyyyyyy
     // source_chunks_uv // uvuvuvuvuvuvuvuv
@@ -631,7 +631,7 @@ namespace librealsense
         auto n = width * height;
         assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
 
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
         static bool do_avx = has_avx();
 
         auto src = reinterpret_cast<const __m128i*>(s);
@@ -753,7 +753,7 @@ namespace librealsense
             m420_parse_one_line<FORMAT>(start_of_second_line, start_of_uv, &dst, width);
         }
         return;
-#endif // __SSE4__
+#endif // __SSSE3__
     }
 
     void unpack_yuy2(rs2_format dst_format, rs2_stream dst_stream, uint8_t * const d[], const uint8_t * s, int w, int h, int actual_size)
@@ -822,7 +822,7 @@ namespace librealsense
     {
         auto n = width * height;
         assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
-#ifdef __SSE4__
+#ifdef __SSSE3__
         auto src = reinterpret_cast<const __m128i *>(s);
         auto dst = reinterpret_cast<__m128i *>(d[0]);
         for (; n; n -= 16)

diff --git a/src/proc/pointcloud.cpp b/src/proc/pointcloud.cpp
@@ -19,7 +19,7 @@
 #ifdef RS2_USE_CUDA
 #include "proc/cuda/cuda-pointcloud.h"
 #endif
-#ifdef __SSE4__
+#ifdef __SSSE3__
 #include "proc/sse/sse-pointcloud.h"
 #endif
 #include "proc/neon/neon-pointcloud.h"
@@ -397,7 +397,7 @@ namespace librealsense
     {
         #ifdef RS2_USE_CUDA
             return std::make_shared<librealsense::pointcloud_cuda>();
-        #elif defined(__SSE4__)
+        #elif defined(__SSSE3__)
             return std::make_shared<librealsense::pointcloud_sse>();
         #elif defined(__ARM_NEON)  && ! defined ANDROID
             return std::make_shared<librealsense::pointcloud_neon>();

diff --git a/src/proc/sse/sse-align.cpp b/src/proc/sse/sse-align.cpp
@@ -1,6 +1,6 @@
 // License: Apache 2.0. See LICENSE file in root directory.
 // Copyright(c) 2019 Intel Corporation. All Rights Reserved.
-#ifdef __SSE4__
+#ifdef __SSSE3__
 
 #include "sse-align.h"
 #include <tmmintrin.h> // For SSE3 intrinsic used in unpack_yuy2_sse

diff --git a/src/proc/sse/sse-align.h b/src/proc/sse/sse-align.h
@@ -1,7 +1,7 @@
 // License: Apache 2.0. See LICENSE file in root directory.
 // Copyright(c) 2024 Intel Corporation. All Rights Reserved.
 #pragma once
-#ifdef __SSE4__
+#ifdef __SSSE3__
 
 #include "proc/align.h"
 #include <src/float3.h>
@@ -87,4 +87,4 @@ namespace librealsense
         std::shared_ptr<image_transform> _stream_transform;
     };
 }
-#endif // __SSE4__
+#endif // __SSSE3__
diff --git a/src/proc/sse/sse-pointcloud.cpp b/src/proc/sse/sse-pointcloud.cpp
@@ -11,7 +11,7 @@
 
 #include <iostream>
 
-#ifdef __SSE4__
+#ifdef __SSSE3__
 
 #include <tmmintrin.h> // For SSSE3 intrinsics
 
@@ -56,7 +56,7 @@ namespace librealsense
             const rs2_intrinsics &depth_intrinsics, 
             const rs2::depth_frame& depth_frame)
     {
-#ifdef __SSE4__
+#ifdef __SSSE3__
 
         auto depth_image = (const uint16_t*)depth_frame.get_data();
 
@@ -145,7 +145,7 @@ namespace librealsense
     {
         auto tex_ptr = texture_map;
 
-#ifdef __SSE4__
+#ifdef __SSSE3__
         auto point = reinterpret_cast<const float*>(points);
         auto res = reinterpret_cast<float*>(tex_ptr);
         auto res1 = reinterpret_cast<float*>(pixels_ptr);

diff --git a/src/proc/y411-converter.cpp b/src/proc/y411-converter.cpp
@@ -6,7 +6,7 @@
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
 #endif
-#ifdef __SSE4__
+#ifdef __SSSE3__
 #include <tmmintrin.h> // For SSSE3 intrinsics
 #endif
 
@@ -44,7 +44,7 @@ namespace librealsense
     // See https://www.fourcc.org/pixel-format/yuv-y411/ 
     //
 
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
     void unpack_y411_sse( uint8_t * const dest, const uint8_t * const s, int w, int h, int actual_size)
     {
         auto n = w * h;
@@ -297,7 +297,7 @@ namespace librealsense
     // The size of the frame must be bigger than 4 pixels and product of 32
     void unpack_y411( uint8_t * const dest[], const uint8_t * const s, int w, int h, int actual_size )
     {
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
         unpack_y411_sse(dest[0], s, w, h, actual_size);
 #else
         unpack_y411_native(dest[0], s, w, h, actual_size);

diff --git a/src/proc/y411-converter.h b/src/proc/y411-converter.h
@@ -24,7 +24,7 @@ namespace librealsense
 
     void unpack_y411( uint8_t * const dest[], const uint8_t * const s, int w, int h, int actual_size);
 
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
     void unpack_y411_sse( uint8_t * const dest, const uint8_t * const s, int w, int h, int actual_size);
 #endif
 

diff --git a/src/rs.cpp b/src/rs.cpp
@@ -64,11 +64,6 @@
 #include <src/core/time-service.h>
 #include <rsutils/string/from.h>
 
-#include <iostream>
-#include <chrono>
-#include <thread> // For std::this_thread::sleep_for
-#include <random>  // For random number generation
-
 ////////////////////////
 // API implementation //
 ////////////////////////
@@ -4113,7 +4108,6 @@ NOEXCEPT_RETURN(, pixel)
 /* Helper inner function (not part of the API) */
 inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
 {
-    //return false;
     return (abs(intrin->coeffs[0]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits<double>::epsilon() &&
         abs(intrin->coeffs[2]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits<double>::epsilon() &&
        abs(intrin->coeffs[4]) < std::numeric_limits<double>::epsilon());
@@ -4150,10 +4144,9 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
         }
         if (intrin->model == RS2_DISTORTION_BROWN_CONRADY)
         {
-            int i = 0;
             // need to loop until convergence
             // 10 iterations determined empirically
-            for (; i < 10; i++)
+            for (int i = 0; i < 10; i++)
             {
                 float r2 = x * x + y * y;
                 float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2);
@@ -4205,15 +4198,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
     point[0] = depth * x;
     point[1] = depth * y;
     point[2] = depth;
-
-    // Get the ending time point
-    //auto end = std::chrono::high_resolution_clock::now();
-
-    // Calculate the elapsed time in milliseconds
-    //auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
-
-    // Output the elapsed time
-    //std::cout << duration.count()  << std::endl;
 }
 NOEXCEPT_RETURN(, point)
 

diff --git a/wrappers/opencv/depth-filter/downsample.cpp b/wrappers/opencv/depth-filter/downsample.cpp
@@ -5,7 +5,7 @@
 
 #include <assert.h>
 
-#ifdef __SSE4__
+#ifdef __SSSE3__
 #include <emmintrin.h>
 #include <smmintrin.h>
 #endif
@@ -24,7 +24,7 @@ void downsample_min_4x4(const cv::Mat& source, cv::Mat* pDest)
 
     const size_t sizeYresized = source.rows / DOWNSAMPLE_FACTOR;
 
-#ifdef __SSE4__
+#ifdef __SSSE3__
     __m128i ones = _mm_set1_epi16(1);
 
     // Note on multi-threading here, 2018-08-17