Panda3D
Loading...
Searching...
No Matches
convert_srgb_sse2.cxx
Go to the documentation of this file.
1/**
2 * PANDA 3D SOFTWARE
3 * Copyright (c) Carnegie Mellon University. All rights reserved.
4 *
5 * All use of this software is subject to the terms of the revised BSD
6 * license. You should have received a copy of this license along
7 * with this source code in a file named "LICENSE."
8 *
9 * @file convert_srgb_sse2.cxx
10 * @author rdb
11 * @date 2014-11-13
12 */
13
14// This file should always be compiled with SSE2 support. These functions
15// will only be called when SSE2 support is detected at run-time.
16
17#include "convert_srgb.h"
18#include "luse.h"
19
20#if defined(__SSE2__) || (_M_IX86_FP >= 2) || defined(_M_X64) || defined(_M_AMD64)
21
22#include <xmmintrin.h>
23#include <emmintrin.h>
24
25static INLINE __m128i _encode_sRGB_sse2_mul255(__m128 val) {
26 // This an SSE2-based approximation of the sRGB encode function. It has a
27 // maximum error of around 0.001, which is by far small enough for a uchar.
28 // It is also at least 10x as fast as the original; up to 40x when taking
29 // advantage of vectorization. Note that the fourth float is only
30 // multiplied with 255.
31
32 // Part of the code in this function is derived from:
33 // http:stackoverflow.coma64866302135754
34
35 // Clamp to 0-1 range.
36 val = _mm_max_ps(val, _mm_set1_ps(0.0f));
37 val = _mm_min_ps(val, _mm_set1_ps(1.0f));
38
39 // Pre-multiply with constant factor to adjust for exp bias.
40 __m128 xf = _mm_mul_ps(val, _mm_set1_ps(6.3307e18f));
41
42 // Approximate logarithm by... casting!
43 xf = _mm_cvtepi32_ps(_mm_castps_si128(xf));
44
45 // Multiply 'logarithm' by power.
46 xf = _mm_mul_ps(xf, _mm_set1_ps(2.0f / 3.0f));
47
48 // Reverse operation of above: cast the other way.
49 xf = _mm_castsi128_ps(_mm_cvtps_epi32(xf));
50
51 // Make an overestimate and an underestimate.
52 __m128 xover = _mm_mul_ps(val, xf);
53 __m128 xunder = _mm_mul_ps(_mm_mul_ps(val, val),
54 _mm_rsqrt_ps(xf));
55
56 // Average the two factors, with a slight bias.
57 __m128 xavg = _mm_mul_ps(_mm_add_ps(xover, xunder),
58 _mm_set1_ps(0.5286098f));
59
60 // Take square root twice. Note that this is faster than the more expensive
61 // _mm_sqrt_ps instruction.
62 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
63 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
64
65 // Bring it into the correct range. These factors are determined not on the
66 // basis of accuracy, but are chosen such that the decoder lookup table
67 // produces an equivalent result for any value.
68 xavg = _mm_mul_ps(xavg, _mm_set1_ps(269.122f));
69 xavg = _mm_sub_ps(xavg, _mm_set1_ps(13.55f));
70
71 // Compute the linear section. This is also the path that the alpha channel
72 // takes, so we set the alpha multiplier to 255 (since alpha is not sRGB-
73 // converted).
74 __m128 lval = _mm_mul_ps(val,
75 _mm_set_ps(255.0f, 3294.6f, 3294.6f, 3294.6f));
76
77 lval = _mm_add_ps(lval, _mm_set1_ps(0.5f));
78
79 // Decide which version to return. Rig the alpha comparator to always fail
80 // so that the linear path is always chosen for alpha.
81 __m128 mask = _mm_cmpge_ps(val,
82 _mm_set_ps(2.0f, 0.0031308f, 0.0031308f, 0.0031308f));
83
84 // This is a non-branching way to return one or the other value.
85 return _mm_cvttps_epi32(_mm_or_ps(
86 _mm_and_ps(mask, xavg),
87 _mm_andnot_ps(mask, lval)));
88}
89
90unsigned char
91encode_sRGB_uchar_sse2(float val) {
92 // Running only a single component through this function is still way faster
93 // than the equivalent non-SSE2 version.
94 return (unsigned char)
95 _mm_extract_epi16(_encode_sRGB_sse2_mul255(_mm_set1_ps(val)), 0);
96}
97
98void
99encode_sRGB_uchar_sse2(const LColorf &color, xel &into) {
100#ifdef LINMATH_ALIGN
101 __m128 vec = _mm_load_ps(color.get_data());
102#else
103 __m128 vec = _mm_loadu_ps(color.get_data());
104#endif
105
106 __m128i vals = _encode_sRGB_sse2_mul255(vec);
107 into.r = _mm_extract_epi16(vals, 0);
108 into.g = _mm_extract_epi16(vals, 2);
109 into.b = _mm_extract_epi16(vals, 4);
110}
111
112void
113encode_sRGB_uchar_sse2(const LColorf &color, xel &into, xelval &into_alpha) {
114#ifdef LINMATH_ALIGN
115 __m128 vec = _mm_load_ps(color.get_data());
116#else
117 __m128 vec = _mm_loadu_ps(color.get_data());
118#endif
119
120 __m128i vals = _encode_sRGB_sse2_mul255(vec);
121 into.r = _mm_extract_epi16(vals, 0);
122 into.g = _mm_extract_epi16(vals, 2);
123 into.b = _mm_extract_epi16(vals, 4);
124 into_alpha = _mm_extract_epi16(vals, 6);
125}
126
127#elif defined(__i386__) || defined(_M_IX86)
128// Somehow we're still compiling this without SSE2 support, even though the
129// target architecture could (in theory) support SSE2. We still have to
130// define these functions, but emit a warning that the build system isn't
131// configured properly.
132#warning convert_srgb_sse2.cxx is being compiled without SSE2 support!
133
134unsigned char
135encode_sRGB_uchar_sse2(float val) {
136 return encode_sRGB_uchar(val);
137}
138
139void
140encode_sRGB_uchar_sse2(const LColorf &color, xel &into) {
141 encode_sRGB_uchar(color, into);
142}
143
144void
145encode_sRGB_uchar_sse2(const LColorf &color, xel &into, xelval &into_alpha) {
146 encode_sRGB_uchar(color, into, into_alpha);
147}
148
149#endif
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
EXPCL_PANDA_PNMIMAGE unsigned char encode_sRGB_uchar(unsigned char val)
Encodes the linearized unsigned char value to an sRGB-encoded unsigned char value.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.