Panda3D
convert_srgb_sse2.cxx
Go to the documentation of this file.
1 /**
2  * PANDA 3D SOFTWARE
3  * Copyright (c) Carnegie Mellon University. All rights reserved.
4  *
5  * All use of this software is subject to the terms of the revised BSD
6  * license. You should have received a copy of this license along
7  * with this source code in a file named "LICENSE."
8  *
9  * @file convert_srgb_sse2.cxx
10  * @author rdb
11  * @date 2014-11-13
12  */
13 
14 // This file should always be compiled with SSE2 support. These functions
15 // will only be called when SSE2 support is detected at run-time.
16 
17 #include "convert_srgb.h"
18 #include "luse.h"
19 
20 #if defined(__SSE2__) || (_M_IX86_FP >= 2) || defined(_M_X64) || defined(_M_AMD64)
21 
22 #include <xmmintrin.h>
23 #include <emmintrin.h>
24 
25 static INLINE __m128i _encode_sRGB_sse2_mul255(__m128 val) {
26  // This an SSE2-based approximation of the sRGB encode function. It has a
27  // maximum error of around 0.001, which is by far small enough for a uchar.
28  // It is also at least 10x as fast as the original; up to 40x when taking
29  // advantage of vectorization. Note that the fourth float is only
30  // multiplied with 255.
31 
32  // Part of the code in this function is derived from:
33  // http:stackoverflow.coma64866302135754
34 
35  // Clamp to 0-1 range.
36  val = _mm_max_ps(val, _mm_set1_ps(0.0f));
37  val = _mm_min_ps(val, _mm_set1_ps(1.0f));
38 
39  // Pre-multiply with constant factor to adjust for exp bias.
40  __m128 xf = _mm_mul_ps(val, _mm_set1_ps(6.3307e18f));
41 
42  // Approximate logarithm by... casting!
43  xf = _mm_cvtepi32_ps(_mm_castps_si128(xf));
44 
45  // Multiply 'logarithm' by power.
46  xf = _mm_mul_ps(xf, _mm_set1_ps(2.0f / 3.0f));
47 
48  // Reverse operation of above: cast the other way.
49  xf = _mm_castsi128_ps(_mm_cvtps_epi32(xf));
50 
51  // Make an overestimate and an underestimate.
52  __m128 xover = _mm_mul_ps(val, xf);
53  __m128 xunder = _mm_mul_ps(_mm_mul_ps(val, val),
54  _mm_rsqrt_ps(xf));
55 
56  // Average the two factors, with a slight bias.
57  __m128 xavg = _mm_mul_ps(_mm_add_ps(xover, xunder),
58  _mm_set1_ps(0.5286098f));
59 
60  // Take square root twice. Note that this is faster than the more expensive
61  // _mm_sqrt_ps instruction.
62  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
63  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
64 
65  // Bring it into the correct range. These factors are determined not on the
66  // basis of accuracy, but are chosen such that the decoder lookup table
67  // produces an equivalent result for any value.
68  xavg = _mm_mul_ps(xavg, _mm_set1_ps(269.122f));
69  xavg = _mm_sub_ps(xavg, _mm_set1_ps(13.55f));
70 
71  // Compute the linear section. This is also the path that the alpha channel
72  // takes, so we set the alpha multiplier to 255 (since alpha is not sRGB-
73  // converted).
74  __m128 lval = _mm_mul_ps(val,
75  _mm_set_ps(255.0f, 3294.6f, 3294.6f, 3294.6f));
76 
77  lval = _mm_add_ps(lval, _mm_set1_ps(0.5f));
78 
79  // Decide which version to return. Rig the alpha comparator to always fail
80  // so that the linear path is always chosen for alpha.
81  __m128 mask = _mm_cmpge_ps(val,
82  _mm_set_ps(2.0f, 0.0031308f, 0.0031308f, 0.0031308f));
83 
84  // This is a non-branching way to return one or the other value.
85  return _mm_cvttps_epi32(_mm_or_ps(
86  _mm_and_ps(mask, xavg),
87  _mm_andnot_ps(mask, lval)));
88 }
89 
90 unsigned char
91 encode_sRGB_uchar_sse2(float val) {
92  // Running only a single component through this function is still way faster
93  // than the equivalent non-SSE2 version.
94  return (unsigned char)
95  _mm_extract_epi16(_encode_sRGB_sse2_mul255(_mm_set1_ps(val)), 0);
96 }
97 
98 void
99 encode_sRGB_uchar_sse2(const LColorf &color, xel &into) {
100 #ifdef LINMATH_ALIGN
101  __m128 vec = _mm_load_ps(color.get_data());
102 #else
103  __m128 vec = _mm_loadu_ps(color.get_data());
104 #endif
105 
106  __m128i vals = _encode_sRGB_sse2_mul255(vec);
107  into.r = _mm_extract_epi16(vals, 0);
108  into.g = _mm_extract_epi16(vals, 2);
109  into.b = _mm_extract_epi16(vals, 4);
110 }
111 
112 void
113 encode_sRGB_uchar_sse2(const LColorf &color, xel &into, xelval &into_alpha) {
114 #ifdef LINMATH_ALIGN
115  __m128 vec = _mm_load_ps(color.get_data());
116 #else
117  __m128 vec = _mm_loadu_ps(color.get_data());
118 #endif
119 
120  __m128i vals = _encode_sRGB_sse2_mul255(vec);
121  into.r = _mm_extract_epi16(vals, 0);
122  into.g = _mm_extract_epi16(vals, 2);
123  into.b = _mm_extract_epi16(vals, 4);
124  into_alpha = _mm_extract_epi16(vals, 6);
125 }
126 
127 #elif defined(__i386__) || defined(_M_IX86)
128 // Somehow we're still compiling this without SSE2 support, even though the
129 // target architecture could (in theory) support SSE2. We still have to
130 // define these functions, but emit a warning that the build system isn't
131 // configured properly.
132 #warning convert_srgb_sse2.cxx is being compiled without SSE2 support!
133 
134 unsigned char
135 encode_sRGB_uchar_sse2(float val) {
136  return encode_sRGB_uchar(val);
137 }
138 
139 void
140 encode_sRGB_uchar_sse2(const LColorf &color, xel &into) {
141  encode_sRGB_uchar(color, into);
142 }
143 
144 void
145 encode_sRGB_uchar_sse2(const LColorf &color, xel &into, xelval &into_alpha) {
146  encode_sRGB_uchar(color, into, into_alpha);
147 }
148 
149 #endif
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
EXPCL_PANDA_PNMIMAGE unsigned char encode_sRGB_uchar(unsigned char val)
Encodes the linearized unsigned char value to an sRGB-encoded unsigned char value.
Definition: convert_srgb.I:80