Panda3D
convert_srgb_sse2.cxx
1 // Filename: convert_srgb_sse2.cxx
2 // Created by: rdb (13Nov14)
3 //
4 ////////////////////////////////////////////////////////////////////
5 //
6 // PANDA 3D SOFTWARE
7 // Copyright (c) Carnegie Mellon University. All rights reserved.
8 //
9 // All use of this software is subject to the terms of the revised BSD
10 // license. You should have received a copy of this license along
11 // with this source code in a file named "LICENSE."
12 //
13 ////////////////////////////////////////////////////////////////////
14 
15 // This file should always be compiled with SSE2 support. These
16 // functions will only be called when SSE2 support is detected at
17 // run-time.
18 
19 #include "convert_srgb.h"
20 #include "luse.h"
21 
22 #if defined(__SSE2__) || (_M_IX86_FP >= 2) || defined(_M_X64) || defined(_M_AMD64)
23 
24 #include <xmmintrin.h>
25 #include <emmintrin.h>
26 
27 static INLINE __m128i _encode_sRGB_sse2_mul255(__m128 val) {
28  // This an SSE2-based approximation of the sRGB encode function.
29  // It has a maximum error of around 0.001, which is by far small
30  // enough for a uchar. It is also at least 10x as fast as the
31  // original; up to 40x when taking advantage of vectorization.
32  // Note that the fourth float is only multiplied with 255.
33 
34  // Part of the code in this function is derived from:
35  // http://stackoverflow.com/a/6486630/2135754
36 
37  // Clamp to 0-1 range.
38  val = _mm_max_ps(val, _mm_set1_ps(0.0f));
39  val = _mm_min_ps(val, _mm_set1_ps(1.0f));
40 
41  // Pre-multiply with constant factor to adjust for exp bias.
42  __m128 xf = _mm_mul_ps(val, _mm_set1_ps(6.3307e18f));
43 
44  // Approximate logarithm by... casting!
45  xf = _mm_cvtepi32_ps(_mm_castps_si128(xf));
46 
47  // Multiply 'logarithm' by power.
48  xf = _mm_mul_ps(xf, _mm_set1_ps(2.0f / 3.0f));
49 
50  // Reverse operation of above: cast the other way.
51  xf = _mm_castsi128_ps(_mm_cvtps_epi32(xf));
52 
53  // Make an overestimate and an underestimate.
54  __m128 xover = _mm_mul_ps(val, xf);
55  __m128 xunder = _mm_mul_ps(_mm_mul_ps(val, val),
56  _mm_rsqrt_ps(xf));
57 
58  // Average the two factors, with a slight bias.
59  __m128 xavg = _mm_mul_ps(_mm_add_ps(xover, xunder),
60  _mm_set1_ps(0.5286098f));
61 
62  // Take square root twice. Note that this is faster than
63  // the more expensive _mm_sqrt_ps instruction.
64  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
65  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
66 
67  // Bring it into the correct range. These factors are determined
68  // not on the basis of accuracy, but are chosen such that the
69  // decoder lookup table produces an equivalent result for any value.
70  xavg = _mm_mul_ps(xavg, _mm_set1_ps(269.122f));
71  xavg = _mm_sub_ps(xavg, _mm_set1_ps(13.55f));
72 
73  // Compute the linear section. This is also the path that
74  // the alpha channel takes, so we set the alpha multiplier
75  // to 255 (since alpha is not sRGB-converted).
76  __m128 lval = _mm_mul_ps(val,
77  _mm_set_ps(255.0f, 3294.6f, 3294.6f, 3294.6f));
78 
79  lval = _mm_add_ps(lval, _mm_set1_ps(0.5f));
80 
81  // Decide which version to return. Rig the alpha
82  // comparator to always fail so that the linear path
83  // is always chosen for alpha.
84  __m128 mask = _mm_cmpge_ps(val,
85  _mm_set_ps(2.0f, 0.0031308f, 0.0031308f, 0.0031308f));
86 
87  // This is a non-branching way to return one or the other value.
88  return _mm_cvttps_epi32(_mm_or_ps(
89  _mm_and_ps(mask, xavg),
90  _mm_andnot_ps(mask, lval)));
91 }
92 
93 unsigned char
94 encode_sRGB_uchar_sse2(float val) {
95  // Running only a single component through this function is still
96  // way faster than the equivalent non-SSE2 version.
97  return (unsigned char)
98  _mm_extract_epi16(_encode_sRGB_sse2_mul255(_mm_set1_ps(val)), 0);
99 }
100 
101 void
102 encode_sRGB_uchar_sse2(const LColorf &color, xel &into) {
103 #ifdef LINMATH_ALIGN
104  __m128 vec = _mm_load_ps(color.get_data());
105 #else
106  __m128 vec = _mm_loadu_ps(color.get_data());
107 #endif
108 
109  __m128i vals = _encode_sRGB_sse2_mul255(vec);
110  into.r = _mm_extract_epi16(vals, 0);
111  into.g = _mm_extract_epi16(vals, 2);
112  into.b = _mm_extract_epi16(vals, 4);
113 }
114 
115 void
116 encode_sRGB_uchar_sse2(const LColorf &color, xel &into, xelval &into_alpha) {
117 #ifdef LINMATH_ALIGN
118  __m128 vec = _mm_load_ps(color.get_data());
119 #else
120  __m128 vec = _mm_loadu_ps(color.get_data());
121 #endif
122 
123  __m128i vals = _encode_sRGB_sse2_mul255(vec);
124  into.r = _mm_extract_epi16(vals, 0);
125  into.g = _mm_extract_epi16(vals, 2);
126  into.b = _mm_extract_epi16(vals, 4);
127  into_alpha = _mm_extract_epi16(vals, 6);
128 }
129 
130 #else
// Somehow we're still compiling this without SSE2 support.  We'll
// still have to define these functions, but emit a diagnostic that the
// build system isn't configured properly.
//
// The #warning directive is not recognized by older versions of MSVC,
// where it would turn this intended warning into a hard error, so that
// compiler gets the equivalent #pragma message instead.
#ifdef _MSC_VER
#pragma message("convert_srgb_sse2.cxx is being compiled without SSE2 support!")
#else
#warning convert_srgb_sse2.cxx is being compiled without SSE2 support!
#endif
135 
136 unsigned char
137 encode_sRGB_uchar_sse2(float val) {
138  return encode_sRGB_uchar(val);
139 }
140 
141 void
142 encode_sRGB_uchar_sse2(const LColorf &color, xel &into) {
143  encode_sRGB_uchar(color, into);
144 }
145 
146 void
147 encode_sRGB_uchar_sse2(const LColorf &color, xel &into, xelval &into_alpha) {
148  encode_sRGB_uchar(color, into, into_alpha);
149 }
150 
151 #endif
const float * get_data() const
Returns the address of the first of the four data elements in the vector.
Definition: lvecBase4.h:746
This is the base class for all four-component vectors and points.
Definition: lvecBase4.h:111