/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
8#ifndef SkRasterPipeline_opts_DEFINED
9#define SkRasterPipeline_opts_DEFINED
10
11#include "include/core/SkData.h"
12#include "include/core/SkTypes.h"
13#include "include/private/base/SkMalloc.h"
14#include "modules/skcms/skcms.h"
15#include "src/base/SkUtils.h" // unaligned_{load,store}
16#include "src/core/SkRasterPipeline.h"
17#include "src/core/SkRasterPipelineContextUtils.h"
18#include "src/sksl/tracing/SkSLTraceHook.h"
19
20#include <cstdint>
21#include <type_traits>
22
23// Every function in this file should be marked static and inline using SI.
24#if defined(__clang__)
25 #define SI __attribute__((always_inline)) static inline
26#else
27 #define SI static inline
28#endif
29
30#if defined(__clang__)
31 #define SK_UNROLL _Pragma("unroll")
32#else
33 #define SK_UNROLL
34#endif
35
36template <typename Dst, typename Src>
37SI Dst widen_cast(const Src& src) {
38 static_assert(sizeof(Dst) > sizeof(Src));
39 static_assert(std::is_trivially_copyable<Dst>::value);
40 static_assert(std::is_trivially_copyable<Src>::value);
41 Dst dst;
42 memcpy(&dst, &src, sizeof(Src));
43 return dst;
44}
45
46struct Ctx {
47 SkRasterPipelineStage* fStage;
48
49 template <typename T>
50 operator T*() {
51 return (T*)fStage->ctx;
52 }
53};
54
55using NoCtx = const void*;
56
57#if !defined(__clang__)
58 #define JUMPER_IS_SCALAR
59#elif defined(SK_ARM_HAS_NEON)
60 #define JUMPER_IS_NEON
61#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
62 #define JUMPER_IS_HSW
63#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
64 #define JUMPER_IS_AVX
65#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
66 #define JUMPER_IS_SSE41
67#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
68 #define JUMPER_IS_SSE2
69#else
70 #define JUMPER_IS_SCALAR
71#endif
72
73// Older Clangs seem to crash when generating non-optimized NEON code for ARMv7.
74#if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32)
75 // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative.
76 #if defined(__apple_build_version__) && __clang_major__ < 9
77 #define JUMPER_IS_SCALAR
78 #elif __clang_major__ < 5
79 #define JUMPER_IS_SCALAR
80 #endif
81
82 #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR)
83 #undef JUMPER_IS_NEON
84 #endif
85#endif
86
87#if defined(JUMPER_IS_SCALAR)
88 #include <math.h>
89#elif defined(JUMPER_IS_NEON)
90 #include <arm_neon.h>
91#else
92 #include <immintrin.h>
93#endif
94
// Notes:
// * rcp_fast and rcp_precise both produce a reciprocal, but rcp_fast is an estimate with at least
//   12 bits of precision while rcp_precise should be accurate for float size. For ARM rcp_precise
//   requires 2 Newton-Raphson refinement steps because its estimate has 8 bit precision, and for
//   Intel this requires one additional step because its estimate has 12 bit precision.
100
101namespace SK_OPTS_NS {
102#if defined(JUMPER_IS_SCALAR)
103 // This path should lead to portable scalar code.
104 using F = float ;
105 using I32 = int32_t;
106 using U64 = uint64_t;
107 using U32 = uint32_t;
108 using U16 = uint16_t;
109 using U8 = uint8_t ;
110
111 SI F min(F a, F b) { return fminf(a,b); }
112 SI I32 min(I32 a, I32 b) { return a < b ? a : b; }
113 SI U32 min(U32 a, U32 b) { return a < b ? a : b; }
114 SI F max(F a, F b) { return fmaxf(a,b); }
115 SI I32 max(I32 a, I32 b) { return a > b ? a : b; }
116 SI U32 max(U32 a, U32 b) { return a > b ? a : b; }
117
118 SI F mad(F f, F m, F a) { return f*m+a; }
119 SI F abs_ (F v) { return fabsf(v); }
120 SI I32 abs_ (I32 v) { return v < 0 ? -v : v; }
121 SI F floor_(F v) { return floorf(v); }
122 SI F ceil_(F v) { return ceilf(v); }
123 SI F rcp_fast(F v) { return 1.0f / v; }
124 SI F rsqrt (F v) { return 1.0f / sqrtf(v); }
125 SI F sqrt_ (F v) { return sqrtf(v); }
126 SI F rcp_precise (F v) { return 1.0f / v; }
127
128 SI U32 round(F v) { return (uint32_t)(v + 0.5f); }
129 SI U32 round(F v, F scale) { return (uint32_t)(v*scale + 0.5f); }
130 SI U16 pack(U32 v) { return (U16)v; }
131 SI U8 pack(U16 v) { return (U8)v; }
132
133 SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
134 SI bool any(I32 c) { return c != 0; }
135 SI bool all(I32 c) { return c != 0; }
136
137 template <typename T>
138 SI T gather(const T* p, U32 ix) { return p[ix]; }
139
140 template <typename T>
141 SI void scatter_masked(T src, T* dst, U32 ix, I32 mask) {
142 dst[ix] = mask ? src : dst[ix];
143 }
144
145 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
146 *r = ptr[0];
147 *g = ptr[1];
148 }
149 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
150 ptr[0] = r;
151 ptr[1] = g;
152 }
153 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
154 *r = ptr[0];
155 *g = ptr[1];
156 *b = ptr[2];
157 }
158 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
159 *r = ptr[0];
160 *g = ptr[1];
161 *b = ptr[2];
162 *a = ptr[3];
163 }
164 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
165 ptr[0] = r;
166 ptr[1] = g;
167 ptr[2] = b;
168 ptr[3] = a;
169 }
170
171 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
172 *r = ptr[0];
173 *g = ptr[1];
174 }
175 SI void store2(float* ptr, size_t tail, F r, F g) {
176 ptr[0] = r;
177 ptr[1] = g;
178 }
179 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
180 *r = ptr[0];
181 *g = ptr[1];
182 *b = ptr[2];
183 *a = ptr[3];
184 }
185 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
186 ptr[0] = r;
187 ptr[1] = g;
188 ptr[2] = b;
189 ptr[3] = a;
190 }
191
192#elif defined(JUMPER_IS_NEON)
193 // Since we know we're using Clang, we can use its vector extensions.
194 template <typename T> using V = T __attribute__((ext_vector_type(4)));
195 using F = V<float >;
196 using I32 = V< int32_t>;
197 using U64 = V<uint64_t>;
198 using U32 = V<uint32_t>;
199 using U16 = V<uint16_t>;
200 using U8 = V<uint8_t >;
201
202 // We polyfill a few routines that Clang doesn't build into ext_vector_types.
203 SI F min(F a, F b) { return vminq_f32(a,b); }
204 SI I32 min(I32 a, I32 b) { return vminq_s32(a,b); }
205 SI U32 min(U32 a, U32 b) { return vminq_u32(a,b); }
206 SI F max(F a, F b) { return vmaxq_f32(a,b); }
207 SI I32 max(I32 a, I32 b) { return vmaxq_s32(a,b); }
208 SI U32 max(U32 a, U32 b) { return vmaxq_u32(a,b); }
209
210 SI F abs_ (F v) { return vabsq_f32(v); }
211 SI I32 abs_ (I32 v) { return vabsq_s32(v); }
212 SI F rcp_fast(F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
213 SI F rcp_precise (F v) { auto e = rcp_fast(v); return vrecpsq_f32 (v,e ) * e; }
214 SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
215
216 SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
217 SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
218
219 SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
220
221 #if defined(SK_CPU_ARM64)
222 SI bool any(I32 c) { return vmaxvq_u32((U32)c) != 0; }
223 SI bool all(I32 c) { return vminvq_u32((U32)c) != 0; }
224
225 SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
226 SI F floor_(F v) { return vrndmq_f32(v); }
227 SI F ceil_(F v) { return vrndpq_f32(v); }
228 SI F sqrt_(F v) { return vsqrtq_f32(v); }
229 SI U32 round(F v) { return vcvtnq_u32_f32(v); }
230 SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
231 #else
232 SI bool any(I32 c) { return c[0] | c[1] | c[2] | c[3]; }
233 SI bool all(I32 c) { return c[0] & c[1] & c[2] & c[3]; }
234
235 SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
236 SI F floor_(F v) {
237 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
238 return roundtrip - if_then_else(roundtrip > v, 1, 0);
239 }
240
241 SI F ceil_(F v) {
242 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
243 return roundtrip + if_then_else(roundtrip < v, 1, 0);
244 }
245
246 SI F sqrt_(F v) {
247 auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v).
248 e *= vrsqrtsq_f32(v,e*e);
249 e *= vrsqrtsq_f32(v,e*e);
250 return v*e; // sqrt(v) == v*rsqrt(v).
251 }
252
253 SI U32 round(F v) {
254 return vcvtq_u32_f32(v + 0.5f);
255 }
256
257 SI U32 round(F v, F scale) {
258 return vcvtq_u32_f32(mad(v,scale,0.5f));
259 }
260 #endif
261
262 template <typename T>
263 SI V<T> gather(const T* p, U32 ix) {
264 return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
265 }
266 template <typename V, typename S>
267 SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
268 V before = gather(dst, ix);
269 V after = if_then_else(mask, src, before);
270 dst[ix[0]] = after[0];
271 dst[ix[1]] = after[1];
272 dst[ix[2]] = after[2];
273 dst[ix[3]] = after[3];
274 }
275 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
276 uint16x4x2_t rg;
277 if (__builtin_expect(tail,0)) {
278 if ( true ) { rg = vld2_lane_u16(ptr + 0, rg, 0); }
279 if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); }
280 if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); }
281 } else {
282 rg = vld2_u16(ptr);
283 }
284 *r = rg.val[0];
285 *g = rg.val[1];
286 }
287 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
288 if (__builtin_expect(tail,0)) {
289 if ( true ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); }
290 if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); }
291 if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); }
292 } else {
293 vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
294 }
295 }
296 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
297 uint16x4x3_t rgb;
298 if (__builtin_expect(tail,0)) {
299 if ( true ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); }
300 if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
301 if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
302 } else {
303 rgb = vld3_u16(ptr);
304 }
305 *r = rgb.val[0];
306 *g = rgb.val[1];
307 *b = rgb.val[2];
308 }
309 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
310 uint16x4x4_t rgba;
311 if (__builtin_expect(tail,0)) {
312 if ( true ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); }
313 if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
314 if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
315 } else {
316 rgba = vld4_u16(ptr);
317 }
318 *r = rgba.val[0];
319 *g = rgba.val[1];
320 *b = rgba.val[2];
321 *a = rgba.val[3];
322 }
323
324 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
325 if (__builtin_expect(tail,0)) {
326 if ( true ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); }
327 if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); }
328 if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); }
329 } else {
330 vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
331 }
332 }
333 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
334 float32x4x2_t rg;
335 if (__builtin_expect(tail,0)) {
336 if ( true ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); }
337 if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); }
338 if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); }
339 } else {
340 rg = vld2q_f32(ptr);
341 }
342 *r = rg.val[0];
343 *g = rg.val[1];
344 }
345 SI void store2(float* ptr, size_t tail, F r, F g) {
346 if (__builtin_expect(tail,0)) {
347 if ( true ) { vst2q_lane_f32(ptr + 0, (float32x4x2_t{{r,g}}), 0); }
348 if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); }
349 if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); }
350 } else {
351 vst2q_f32(ptr, (float32x4x2_t{{r,g}}));
352 }
353 }
354 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
355 float32x4x4_t rgba;
356 if (__builtin_expect(tail,0)) {
357 if ( true ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); }
358 if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
359 if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
360 } else {
361 rgba = vld4q_f32(ptr);
362 }
363 *r = rgba.val[0];
364 *g = rgba.val[1];
365 *b = rgba.val[2];
366 *a = rgba.val[3];
367 }
368 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
369 if (__builtin_expect(tail,0)) {
370 if ( true ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); }
371 if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); }
372 if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); }
373 } else {
374 vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
375 }
376 }
377
378#elif defined(JUMPER_IS_HSW)
379 // These are __m256 and __m256i, but friendlier and strongly-typed.
380 template <typename T> using V = T __attribute__((ext_vector_type(8)));
381 using F = V<float >;
382 using I32 = V< int32_t>;
383 using U64 = V<uint64_t>;
384 using U32 = V<uint32_t>;
385 using U16 = V<uint16_t>;
386 using U8 = V<uint8_t >;
387
388 SI F mad(F f, F m, F a) { return _mm256_fmadd_ps(f, m, a); }
389
390 SI F min(F a, F b) { return _mm256_min_ps(a,b); }
391 SI I32 min(I32 a, I32 b) { return _mm256_min_epi32(a,b); }
392 SI U32 min(U32 a, U32 b) { return _mm256_min_epu32(a,b); }
393 SI F max(F a, F b) { return _mm256_max_ps(a,b); }
394 SI I32 max(I32 a, I32 b) { return _mm256_max_epi32(a,b); }
395 SI U32 max(U32 a, U32 b) { return _mm256_max_epu32(a,b); }
396
397 SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); }
398 SI I32 abs_ (I32 v) { return _mm256_abs_epi32(v); }
399 SI F floor_(F v) { return _mm256_floor_ps(v); }
400 SI F ceil_(F v) { return _mm256_ceil_ps(v); }
401 SI F rcp_fast(F v) { return _mm256_rcp_ps (v); }
402 SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); }
403 SI F sqrt_ (F v) { return _mm256_sqrt_ps (v); }
404 SI F rcp_precise (F v) {
405 F e = rcp_fast(v);
406 return _mm256_fnmadd_ps(v, e, _mm256_set1_ps(2.0f)) * e;
407 }
408
409 SI U32 round(F v) { return _mm256_cvtps_epi32(v); }
410 SI U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
411 SI U16 pack(U32 v) {
412 return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
413 _mm256_extractf128_si256(v, 1));
414 }
415 SI U8 pack(U16 v) {
416 auto r = _mm_packus_epi16(v,v);
417 return sk_unaligned_load<U8>(&r);
418 }
419
420 SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
421 // NOTE: This version of 'all' only works with mask values (true == all bits set)
422 SI bool any(I32 c) { return !_mm256_testz_si256(c, _mm256_set1_epi32(-1)); }
423 SI bool all(I32 c) { return _mm256_testc_si256(c, _mm256_set1_epi32(-1)); }
424
425 template <typename T>
426 SI V<T> gather(const T* p, U32 ix) {
427 return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
428 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
429 }
430 SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, 4); }
431 SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
432 SI U64 gather(const uint64_t* p, U32 ix) {
433 __m256i parts[] = {
434 _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8),
435 _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8),
436 };
437 return sk_bit_cast<U64>(parts);
438 }
439 template <typename V, typename S>
440 SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
441 V before = gather(dst, ix);
442 V after = if_then_else(mask, src, before);
443 dst[ix[0]] = after[0];
444 dst[ix[1]] = after[1];
445 dst[ix[2]] = after[2];
446 dst[ix[3]] = after[3];
447 dst[ix[4]] = after[4];
448 dst[ix[5]] = after[5];
449 dst[ix[6]] = after[6];
450 dst[ix[7]] = after[7];
451 }
452
453 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
454 U16 _0123, _4567;
455 if (__builtin_expect(tail,0)) {
456 _0123 = _4567 = _mm_setzero_si128();
457 auto* d = &_0123;
458 if (tail > 3) {
459 *d = _mm_loadu_si128(((__m128i*)ptr) + 0);
460 tail -= 4;
461 ptr += 8;
462 d = &_4567;
463 }
464 bool high = false;
465 if (tail > 1) {
466 *d = _mm_loadu_si64(ptr);
467 tail -= 2;
468 ptr += 4;
469 high = true;
470 }
471 if (tail > 0) {
472 (*d)[high ? 4 : 0] = *(ptr + 0);
473 (*d)[high ? 5 : 1] = *(ptr + 1);
474 }
475 } else {
476 _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0);
477 _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1);
478 }
479 *r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16),
480 _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16));
481 *g = _mm_packs_epi32(_mm_srai_epi32(_0123, 16),
482 _mm_srai_epi32(_4567, 16));
483 }
484 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
485 auto _0123 = _mm_unpacklo_epi16(r, g),
486 _4567 = _mm_unpackhi_epi16(r, g);
487 if (__builtin_expect(tail,0)) {
488 const auto* s = &_0123;
489 if (tail > 3) {
490 _mm_storeu_si128((__m128i*)ptr, *s);
491 s = &_4567;
492 tail -= 4;
493 ptr += 8;
494 }
495 bool high = false;
496 if (tail > 1) {
497 _mm_storel_epi64((__m128i*)ptr, *s);
498 ptr += 4;
499 tail -= 2;
500 high = true;
501 }
502 if (tail > 0) {
503 if (high) {
504 *(int32_t*)ptr = _mm_extract_epi32(*s, 2);
505 } else {
506 *(int32_t*)ptr = _mm_cvtsi128_si32(*s);
507 }
508 }
509 } else {
510 _mm_storeu_si128((__m128i*)ptr + 0, _0123);
511 _mm_storeu_si128((__m128i*)ptr + 1, _4567);
512 }
513 }
514
515 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
516 __m128i _0,_1,_2,_3,_4,_5,_6,_7;
517 if (__builtin_expect(tail,0)) {
518 auto load_rgb = [](const uint16_t* src) {
519 auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
520 return _mm_insert_epi16(v, src[2], 2);
521 };
522 _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128();
523 if ( true ) { _0 = load_rgb(ptr + 0); }
524 if (tail > 1) { _1 = load_rgb(ptr + 3); }
525 if (tail > 2) { _2 = load_rgb(ptr + 6); }
526 if (tail > 3) { _3 = load_rgb(ptr + 9); }
527 if (tail > 4) { _4 = load_rgb(ptr + 12); }
528 if (tail > 5) { _5 = load_rgb(ptr + 15); }
529 if (tail > 6) { _6 = load_rgb(ptr + 18); }
530 } else {
531 // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over.
532 auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ;
533 auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ;
534 auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ;
535 auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4);
536 _0 = _01; _1 = _mm_srli_si128(_01, 6);
537 _2 = _23; _3 = _mm_srli_si128(_23, 6);
538 _4 = _45; _5 = _mm_srli_si128(_45, 6);
539 _6 = _67; _7 = _mm_srli_si128(_67, 6);
540 }
541
542 auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx
543 _13 = _mm_unpacklo_epi16(_1, _3),
544 _46 = _mm_unpacklo_epi16(_4, _6),
545 _57 = _mm_unpacklo_epi16(_5, _7);
546
547 auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
548 bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx
549 rg4567 = _mm_unpacklo_epi16(_46, _57),
550 bx4567 = _mm_unpackhi_epi16(_46, _57);
551
552 *r = _mm_unpacklo_epi64(rg0123, rg4567);
553 *g = _mm_unpackhi_epi64(rg0123, rg4567);
554 *b = _mm_unpacklo_epi64(bx0123, bx4567);
555 }
556 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
557 __m128i _01, _23, _45, _67;
558 if (__builtin_expect(tail,0)) {
559 auto src = (const double*)ptr;
560 _01 = _23 = _45 = _67 = _mm_setzero_si128();
561 if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
562 if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
563 if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
564 if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
565 if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
566 if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
567 if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
568 } else {
569 _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
570 _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
571 _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
572 _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
573 }
574
575 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
576 _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
577 _46 = _mm_unpacklo_epi16(_45, _67),
578 _57 = _mm_unpackhi_epi16(_45, _67);
579
580 auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
581 ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3
582 rg4567 = _mm_unpacklo_epi16(_46, _57),
583 ba4567 = _mm_unpackhi_epi16(_46, _57);
584
585 *r = _mm_unpacklo_epi64(rg0123, rg4567);
586 *g = _mm_unpackhi_epi64(rg0123, rg4567);
587 *b = _mm_unpacklo_epi64(ba0123, ba4567);
588 *a = _mm_unpackhi_epi64(ba0123, ba4567);
589 }
590 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
591 auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3
592 rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7
593 ba0123 = _mm_unpacklo_epi16(b, a),
594 ba4567 = _mm_unpackhi_epi16(b, a);
595
596 auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
597 _23 = _mm_unpackhi_epi32(rg0123, ba0123),
598 _45 = _mm_unpacklo_epi32(rg4567, ba4567),
599 _67 = _mm_unpackhi_epi32(rg4567, ba4567);
600
601 if (__builtin_expect(tail,0)) {
602 auto dst = (double*)ptr;
603 if (tail > 0) { _mm_storel_pd(dst+0, _01); }
604 if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
605 if (tail > 2) { _mm_storel_pd(dst+2, _23); }
606 if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
607 if (tail > 4) { _mm_storel_pd(dst+4, _45); }
608 if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
609 if (tail > 6) { _mm_storel_pd(dst+6, _67); }
610 } else {
611 _mm_storeu_si128((__m128i*)ptr + 0, _01);
612 _mm_storeu_si128((__m128i*)ptr + 1, _23);
613 _mm_storeu_si128((__m128i*)ptr + 2, _45);
614 _mm_storeu_si128((__m128i*)ptr + 3, _67);
615 }
616 }
617
618 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
619 F _0123, _4567;
620 if (__builtin_expect(tail, 0)) {
621 _0123 = _4567 = _mm256_setzero_ps();
622 F* d = &_0123;
623 if (tail > 3) {
624 *d = _mm256_loadu_ps(ptr);
625 ptr += 8;
626 tail -= 4;
627 d = &_4567;
628 }
629 bool high = false;
630 if (tail > 1) {
631 *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr));
632 ptr += 4;
633 tail -= 2;
634 high = true;
635 }
636 if (tail > 0) {
637 *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1)
638 : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0);
639 }
640 } else {
641 _0123 = _mm256_loadu_ps(ptr + 0);
642 _4567 = _mm256_loadu_ps(ptr + 8);
643 }
644
645 F _0145 = _mm256_permute2f128_pd(_0123, _4567, 0x20),
646 _2367 = _mm256_permute2f128_pd(_0123, _4567, 0x31);
647
648 *r = _mm256_shuffle_ps(_0145, _2367, 0x88);
649 *g = _mm256_shuffle_ps(_0145, _2367, 0xDD);
650 }
651 SI void store2(float* ptr, size_t tail, F r, F g) {
652 F _0145 = _mm256_unpacklo_ps(r, g),
653 _2367 = _mm256_unpackhi_ps(r, g);
654 F _0123 = _mm256_permute2f128_pd(_0145, _2367, 0x20),
655 _4567 = _mm256_permute2f128_pd(_0145, _2367, 0x31);
656
657 if (__builtin_expect(tail, 0)) {
658 const __m256* s = &_0123;
659 if (tail > 3) {
660 _mm256_storeu_ps(ptr, *s);
661 s = &_4567;
662 tail -= 4;
663 ptr += 8;
664 }
665 bool high = false;
666 if (tail > 1) {
667 _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0));
668 ptr += 4;
669 tail -= 2;
670 high = true;
671 }
672 if (tail > 0) {
673 *(ptr + 0) = (*s)[ high ? 4 : 0];
674 *(ptr + 1) = (*s)[ high ? 5 : 1];
675 }
676 } else {
677 _mm256_storeu_ps(ptr + 0, _0123);
678 _mm256_storeu_ps(ptr + 8, _4567);
679 }
680 }
681
682 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
683 F _04, _15, _26, _37;
684 _04 = _15 = _26 = _37 = 0;
685 switch (tail) {
686 case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1); [[fallthrough]];
687 case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1); [[fallthrough]];
688 case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1); [[fallthrough]];
689 case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1); [[fallthrough]];
690 case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0); [[fallthrough]];
691 case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0); [[fallthrough]];
692 case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0); [[fallthrough]];
693 case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
694 }
695
696 F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5
697 ba0145 = _mm256_unpackhi_ps(_04,_15),
698 rg2367 = _mm256_unpacklo_ps(_26,_37),
699 ba2367 = _mm256_unpackhi_ps(_26,_37);
700
701 *r = _mm256_unpacklo_pd(rg0145, rg2367);
702 *g = _mm256_unpackhi_pd(rg0145, rg2367);
703 *b = _mm256_unpacklo_pd(ba0145, ba2367);
704 *a = _mm256_unpackhi_pd(ba0145, ba2367);
705 }
706 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
707 F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
708 rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
709 ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5
710 ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ...
711
712 F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4
713 _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ...
714 _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ...
715 _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ...
716
717 if (__builtin_expect(tail, 0)) {
718 if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
719 if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
720 if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
721 if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
722 if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
723 if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
724 if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
725 } else {
726 F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
727 _23 = _mm256_permute2f128_ps(_26, _37, 32),
728 _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
729 _67 = _mm256_permute2f128_ps(_26, _37, 49);
730 _mm256_storeu_ps(ptr+ 0, _01);
731 _mm256_storeu_ps(ptr+ 8, _23);
732 _mm256_storeu_ps(ptr+16, _45);
733 _mm256_storeu_ps(ptr+24, _67);
734 }
735 }
736
737#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
template <typename T> using V = T __attribute__((ext_vector_type(4)));  // 128-bit: __m128/__m128i, strongly typed.
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;
745
746 SI F if_then_else(I32 c, F t, F e) {
747 return _mm_or_ps(a: _mm_and_ps(a: c, b: t), b: _mm_andnot_ps(a: c, b: e));
748 }
749
750 SI F min(F a, F b) { return _mm_min_ps(a: a,b: b); }
751 SI F max(F a, F b) { return _mm_max_ps(a: a,b: b); }
752#if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
753 SI I32 min(I32 a, I32 b) { return _mm_min_epi32(a,b); }
754 SI U32 min(U32 a, U32 b) { return _mm_min_epu32(a,b); }
755 SI I32 max(I32 a, I32 b) { return _mm_max_epi32(a,b); }
756 SI U32 max(U32 a, U32 b) { return _mm_max_epu32(a,b); }
757#else
758 SI I32 min(I32 a, I32 b) {
759 return sk_bit_cast<I32>(src: if_then_else(c: a < b, t: sk_bit_cast<F>(src: a), e: sk_bit_cast<F>(src: b)));
760 }
761 SI U32 min(U32 a, U32 b) {
762 return sk_bit_cast<U32>(src: if_then_else(c: a < b, t: sk_bit_cast<F>(src: a), e: sk_bit_cast<F>(src: b)));
763 }
764 SI I32 max(I32 a, I32 b) {
765 return sk_bit_cast<I32>(src: if_then_else(c: a > b, t: sk_bit_cast<F>(src: a), e: sk_bit_cast<F>(src: b)));
766 }
767 SI U32 max(U32 a, U32 b) {
768 return sk_bit_cast<U32>(src: if_then_else(c: a > b, t: sk_bit_cast<F>(src: a), e: sk_bit_cast<F>(src: b)));
769 }
770#endif
771
772 SI F mad(F f, F m, F a) { return f*m+a; }
773 SI F abs_(F v) { return _mm_and_ps(a: v, b: 0-v); }
774#if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
775 SI I32 abs_(I32 v) { return _mm_abs_epi32(v); }
776#else
777 SI I32 abs_(I32 v) { return max(a: v, b: -v); }
778#endif
779 SI F rcp_fast(F v) { return _mm_rcp_ps (a: v); }
780 SI F rcp_precise (F v) { F e = rcp_fast(v); return e * (2.0f - v * e); }
781 SI F rsqrt (F v) { return _mm_rsqrt_ps(a: v); }
782 SI F sqrt_(F v) { return _mm_sqrt_ps (a: v); }
783
784 SI U32 round(F v) { return _mm_cvtps_epi32(a: v); }
785 SI U32 round(F v, F scale) { return _mm_cvtps_epi32(a: v*scale); }
786
787 SI U16 pack(U32 v) {
788 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
789 auto p = _mm_packus_epi32(v,v);
790 #else
791 // Sign extend so that _mm_packs_epi32() does the pack we want.
792 auto p = _mm_srai_epi32(a: _mm_slli_epi32(a: v, count: 16), count: 16);
793 p = _mm_packs_epi32(a: p,b: p);
794 #endif
795 return sk_unaligned_load<U16>(ptr: &p); // We have two copies. Return (the lower) one.
796 }
797 SI U8 pack(U16 v) {
798 auto r = widen_cast<__m128i>(src: v);
799 r = _mm_packus_epi16(a: r,b: r);
800 return sk_unaligned_load<U8>(ptr: &r);
801 }
802
803 // NOTE: This only checks the top bit of each lane, and is incorrect with non-mask values.
804 SI bool any(I32 c) { return _mm_movemask_ps(a: c) != 0b0000; }
805 SI bool all(I32 c) { return _mm_movemask_ps(a: c) == 0b1111; }
806
807 SI F floor_(F v) {
808 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
809 return _mm_floor_ps(v);
810 #else
811 F roundtrip = _mm_cvtepi32_ps(a: _mm_cvttps_epi32(a: v));
812 return roundtrip - if_then_else(c: roundtrip > v, t: 1, e: 0);
813 #endif
814 }
815
816 SI F ceil_(F v) {
817 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
818 return _mm_ceil_ps(v);
819 #else
820 F roundtrip = _mm_cvtepi32_ps(a: _mm_cvttps_epi32(a: v));
821 return roundtrip + if_then_else(c: roundtrip < v, t: 1, e: 0);
822 #endif
823 }
824
825 template <typename T>
826 SI V<T> gather(const T* p, U32 ix) {
827 return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
828 }
829 template <typename V, typename S>
830 SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
831 V before = gather(dst, ix);
832 V after = if_then_else(mask, src, before);
833 dst[ix[0]] = after[0];
834 dst[ix[1]] = after[1];
835 dst[ix[2]] = after[2];
836 dst[ix[3]] = after[3];
837 }
838 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
839 __m128i _01;
840 if (__builtin_expect(tail,0)) {
841 _01 = _mm_setzero_si128();
842 if (tail > 1) {
843 _01 = _mm_loadl_pd(a: _01, dp: (const double*)ptr); // r0 g0 r1 g1 00 00 00 00
844 if (tail > 2) {
845 _01 = _mm_insert_epi16(_01, *(ptr+4), 4); // r0 g0 r1 g1 r2 00 00 00
846 _01 = _mm_insert_epi16(_01, *(ptr+5), 5); // r0 g0 r1 g1 r2 g2 00 00
847 }
848 } else {
849 _01 = _mm_cvtsi32_si128(a: *(const uint32_t*)ptr); // r0 g0 00 00 00 00 00 00
850 }
851 } else {
852 _01 = _mm_loadu_si128(p: ((__m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3
853 }
854 auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8); // r0 r1 g0 g1 r2 g2 r3 g3
855 auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3
856
857 auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3
858 auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3
859 *r = sk_unaligned_load<U16>(ptr: &R);
860 *g = sk_unaligned_load<U16>(ptr: &G);
861 }
862 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
863 U32 rg = _mm_unpacklo_epi16(a: widen_cast<__m128i>(src: r), b: widen_cast<__m128i>(src: g));
864 if (__builtin_expect(tail, 0)) {
865 if (tail > 1) {
866 _mm_storel_epi64(p: (__m128i*)ptr, a: rg);
867 if (tail > 2) {
868 int32_t rgpair = rg[2];
869 memcpy(dest: ptr + 4, src: &rgpair, n: sizeof(rgpair));
870 }
871 } else {
872 int32_t rgpair = rg[0];
873 memcpy(dest: ptr, src: &rgpair, n: sizeof(rgpair));
874 }
875 } else {
876 _mm_storeu_si128(p: (__m128i*)ptr + 0, b: rg);
877 }
878 }
879
880 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
881 __m128i _0, _1, _2, _3;
882 if (__builtin_expect(tail,0)) {
883 _1 = _2 = _3 = _mm_setzero_si128();
884 auto load_rgb = [](const uint16_t* src) {
885 auto v = _mm_cvtsi32_si128(a: *(const uint32_t*)src);
886 return _mm_insert_epi16(v, src[2], 2);
887 };
888 if ( true ) { _0 = load_rgb(ptr + 0); }
889 if (tail > 1) { _1 = load_rgb(ptr + 3); }
890 if (tail > 2) { _2 = load_rgb(ptr + 6); }
891 } else {
892 // Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
893 auto _01 = _mm_loadu_si128(p: (const __m128i*)(ptr + 0)) ,
894 _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);
895
896 // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
897 _0 = _01;
898 _1 = _mm_srli_si128(_01, 6);
899 _2 = _23;
900 _3 = _mm_srli_si128(_23, 6);
901 }
902
903 // De-interlace to R,G,B.
904 auto _02 = _mm_unpacklo_epi16(a: _0, b: _2), // r0 r2 g0 g2 b0 b2 xx xx
905 _13 = _mm_unpacklo_epi16(a: _1, b: _3); // r1 r3 g1 g3 b1 b3 xx xx
906
907 auto R = _mm_unpacklo_epi16(a: _02, b: _13), // r0 r1 r2 r3 g0 g1 g2 g3
908 G = _mm_srli_si128(R, 8),
909 B = _mm_unpackhi_epi16(a: _02, b: _13); // b0 b1 b2 b3 xx xx xx xx
910
911 *r = sk_unaligned_load<U16>(ptr: &R);
912 *g = sk_unaligned_load<U16>(ptr: &G);
913 *b = sk_unaligned_load<U16>(ptr: &B);
914 }
915
916 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
917 __m128i _01, _23;
918 if (__builtin_expect(tail,0)) {
919 _01 = _23 = _mm_setzero_si128();
920 auto src = (const double*)ptr;
921 if ( true ) { _01 = _mm_loadl_pd(a: _01, dp: src + 0); } // r0 g0 b0 a0 00 00 00 00
922 if (tail > 1) { _01 = _mm_loadh_pd(a: _01, dp: src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
923 if (tail > 2) { _23 = _mm_loadl_pd(a: _23, dp: src + 2); } // r2 g2 b2 a2 00 00 00 00
924 } else {
925 _01 = _mm_loadu_si128(p: ((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1
926 _23 = _mm_loadu_si128(p: ((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
927 }
928
929 auto _02 = _mm_unpacklo_epi16(a: _01, b: _23), // r0 r2 g0 g2 b0 b2 a0 a2
930 _13 = _mm_unpackhi_epi16(a: _01, b: _23); // r1 r3 g1 g3 b1 b3 a1 a3
931
932 auto rg = _mm_unpacklo_epi16(a: _02, b: _13), // r0 r1 r2 r3 g0 g1 g2 g3
933 ba = _mm_unpackhi_epi16(a: _02, b: _13); // b0 b1 b2 b3 a0 a1 a2 a3
934
935 *r = sk_unaligned_load<U16>(ptr: (uint16_t*)&rg + 0);
936 *g = sk_unaligned_load<U16>(ptr: (uint16_t*)&rg + 4);
937 *b = sk_unaligned_load<U16>(ptr: (uint16_t*)&ba + 0);
938 *a = sk_unaligned_load<U16>(ptr: (uint16_t*)&ba + 4);
939 }
940
941 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
942 auto rg = _mm_unpacklo_epi16(a: widen_cast<__m128i>(src: r), b: widen_cast<__m128i>(src: g)),
943 ba = _mm_unpacklo_epi16(a: widen_cast<__m128i>(src: b), b: widen_cast<__m128i>(src: a));
944
945 if (__builtin_expect(tail, 0)) {
946 auto dst = (double*)ptr;
947 if ( true ) { _mm_storel_pd(dp: dst + 0, a: _mm_unpacklo_epi32(a: rg, b: ba)); }
948 if (tail > 1) { _mm_storeh_pd(dp: dst + 1, a: _mm_unpacklo_epi32(a: rg, b: ba)); }
949 if (tail > 2) { _mm_storel_pd(dp: dst + 2, a: _mm_unpackhi_epi32(a: rg, b: ba)); }
950 } else {
951 _mm_storeu_si128(p: (__m128i*)ptr + 0, b: _mm_unpacklo_epi32(a: rg, b: ba));
952 _mm_storeu_si128(p: (__m128i*)ptr + 1, b: _mm_unpackhi_epi32(a: rg, b: ba));
953 }
954 }
955
956 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
957 F _01, _23;
958 if (__builtin_expect(tail, 0)) {
959 _01 = _23 = _mm_setzero_si128();
960 if ( true ) { _01 = _mm_loadl_pi(a: _01, p: (__m64 const*)(ptr + 0)); }
961 if (tail > 1) { _01 = _mm_loadh_pi(a: _01, p: (__m64 const*)(ptr + 2)); }
962 if (tail > 2) { _23 = _mm_loadl_pi(a: _23, p: (__m64 const*)(ptr + 4)); }
963 } else {
964 _01 = _mm_loadu_ps(p: ptr + 0);
965 _23 = _mm_loadu_ps(p: ptr + 4);
966 }
967 *r = _mm_shuffle_ps(_01, _23, 0x88);
968 *g = _mm_shuffle_ps(_01, _23, 0xDD);
969 }
970 SI void store2(float* ptr, size_t tail, F r, F g) {
971 F _01 = _mm_unpacklo_ps(a: r, b: g),
972 _23 = _mm_unpackhi_ps(a: r, b: g);
973 if (__builtin_expect(tail, 0)) {
974 if ( true ) { _mm_storel_pi(p: (__m64*)(ptr + 0), a: _01); }
975 if (tail > 1) { _mm_storeh_pi(p: (__m64*)(ptr + 2), a: _01); }
976 if (tail > 2) { _mm_storel_pi(p: (__m64*)(ptr + 4), a: _23); }
977 } else {
978 _mm_storeu_ps(p: ptr + 0, a: _01);
979 _mm_storeu_ps(p: ptr + 4, a: _23);
980 }
981 }
982
983 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
984 F _0, _1, _2, _3;
985 if (__builtin_expect(tail, 0)) {
986 _1 = _2 = _3 = _mm_setzero_si128();
987 if ( true ) { _0 = _mm_loadu_ps(p: ptr + 0); }
988 if (tail > 1) { _1 = _mm_loadu_ps(p: ptr + 4); }
989 if (tail > 2) { _2 = _mm_loadu_ps(p: ptr + 8); }
990 } else {
991 _0 = _mm_loadu_ps(p: ptr + 0);
992 _1 = _mm_loadu_ps(p: ptr + 4);
993 _2 = _mm_loadu_ps(p: ptr + 8);
994 _3 = _mm_loadu_ps(p: ptr +12);
995 }
996 _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
997 *r = _0;
998 *g = _1;
999 *b = _2;
1000 *a = _3;
1001 }
1002
1003 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
1004 _MM_TRANSPOSE4_PS(r,g,b,a);
1005 if (__builtin_expect(tail, 0)) {
1006 if ( true ) { _mm_storeu_ps(p: ptr + 0, a: r); }
1007 if (tail > 1) { _mm_storeu_ps(p: ptr + 4, a: g); }
1008 if (tail > 2) { _mm_storeu_ps(p: ptr + 8, a: b); }
1009 } else {
1010 _mm_storeu_ps(p: ptr + 0, a: r);
1011 _mm_storeu_ps(p: ptr + 4, a: g);
1012 _mm_storeu_ps(p: ptr + 8, a: b);
1013 _mm_storeu_ps(p: ptr +12, a: a);
1014 }
1015 }
1016#endif
1017
1018// We need to be a careful with casts.
1019// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
1020// These named casts and bit_cast() are always what they seem to be.
#if defined(JUMPER_IS_SCALAR)
    // Scalar build: each "vector" holds one lane, so plain C casts do value conversion.
    SI F cast  (U32 v)   { return   (F)v; }
    SI F cast64(U64 v)   { return   (F)v; }
    SI U32 trunc_(F v)   { return (U32)v; }
    SI U32 expand(U16 v) { return (U32)v; }
    SI U32 expand(U8  v) { return (U32)v; }
#else
    // SIMD build: __builtin_convertvector converts lane-wise by value (not a bit reinterpret).
    SI F cast  (U32 v)   { return      __builtin_convertvector((I32)v,   F); }
    SI F cast64(U64 v)   { return      __builtin_convertvector(     v,   F); }
    SI U32 trunc_(F v)   { return (U32)__builtin_convertvector(     v, I32); }
    SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
    SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
#endif
1034
// Lane-wise select between t and e under mask c, for any 32-bit-lane vector type V.
template <typename V>
SI V if_then_else(I32 c, V t, V e) {
    // Reuse the F-typed if_then_else by bit-casting the operands through float and back.
    return sk_bit_cast<V>(if_then_else(c, sk_bit_cast<F>(t), sk_bit_cast<F>(e)));
}
1039
1040SI U16 bswap(U16 x) {
1041#if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
1042 // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
1043 // when generating code for SSE2 and SSE4.1. We'll do it manually...
1044 auto v = widen_cast<__m128i>(src: x);
1045 v = _mm_slli_epi16(a: v,count: 8) | _mm_srli_epi16(a: v,count: 8);
1046 return sk_unaligned_load<U16>(ptr: &v);
1047#else
1048 return (x<<8) | (x>>8);
1049#endif
1050}
1051
1052SI F fract(F v) { return v - floor_(v); }
1053
1054// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html
1055SI F approx_log2(F x) {
1056 // e - 127 is a fair approximation of log2(x) in its own right...
1057 F e = cast(v: sk_bit_cast<U32>(src: x)) * (1.0f / (1<<23));
1058
1059 // ... but using the mantissa to refine its error is _much_ better.
1060 F m = sk_bit_cast<F>(src: (sk_bit_cast<U32>(src: x) & 0x007fffff) | 0x3f000000);
1061 return e
1062 - 124.225514990f
1063 - 1.498030302f * m
1064 - 1.725879990f / (0.3520887068f + m);
1065}
1066
1067SI F approx_log(F x) {
1068 const float ln2 = 0.69314718f;
1069 return ln2 * approx_log2(x);
1070}
1071
1072SI F approx_pow2(F x) {
1073 constexpr float kInfinityBits = 0x7f800000;
1074
1075 F f = fract(v: x);
1076 F approx = x + 121.274057500f;
1077 approx -= f * 1.490129070f;
1078 approx += 27.728023300f / (4.84252568f - f);
1079 approx *= 1.0f * (1<<23);
1080 approx = min(a: max(a: approx, b: F(0)), b: kInfinityBits); // guard against underflow/overflow
1081
1082 return sk_bit_cast<F>(src: round(v: approx));
1083}
1084
1085SI F approx_exp(F x) {
1086 const float log2_e = 1.4426950408889634074f;
1087 return approx_pow2(x: log2_e * x);
1088}
1089
1090SI F approx_powf(F x, F y) {
1091 return if_then_else(c: (x == 0)|(x == 1), t: x
1092 , e: approx_pow2(x: approx_log2(x) * y));
1093}
1094
1095SI F from_half(U16 h) {
1096#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
1097 && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
1098 return vcvt_f32_f16(h);
1099
1100#elif defined(JUMPER_IS_HSW)
1101 return _mm256_cvtph_ps(h);
1102
1103#else
1104 // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
1105 U32 sem = expand(v: h),
1106 s = sem & 0x8000,
1107 em = sem ^ s;
1108
1109 // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
1110 auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here.
1111 return if_then_else(c: denorm, t: F(0)
1112 , e: sk_bit_cast<F>( src: (s<<16) + (em<<13) + ((127-15)<<23) ));
1113#endif
1114}
1115
1116SI U16 to_half(F f) {
1117#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
1118 && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
1119 return vcvt_f16_f32(f);
1120
1121#elif defined(JUMPER_IS_HSW)
1122 return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
1123
1124#else
1125 // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
1126 U32 sem = sk_bit_cast<U32>(src: f),
1127 s = sem & 0x80000000,
1128 em = sem ^ s;
1129
1130 // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
1131 auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here.
1132 return pack(v: if_then_else(c: denorm, t: U32(0)
1133 , e: (s>>16) + (em>>13) - ((127-15)<<10)));
1134#endif
1135}
1136
// Our fundamental vector depth is our pixel stride.
// (i.e. the number of pixels each stage invocation processes.)
static constexpr size_t N = sizeof(F) / sizeof(float);
1139
1140// We're finally going to get to what a Stage function looks like!
1141// tail == 0 ~~> work on a full N pixels
1142// tail != 0 ~~> work on only the first tail pixels
1143// tail is always < N.
1144
// Any custom ABI to use for all (non-externally-facing) stage functions?
// Also decide here whether to use narrow (compromise) or wide (ideal) stages.
#if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON)
    // This lets us pass vectors more efficiently on 32-bit ARM.
    // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
    #define ABI __attribute__((pcs("aapcs-vfp")))
    #define JUMPER_NARROW_STAGES 1
#elif defined(_MSC_VER)
    // Even if not vectorized, this lets us pass {r,g,b,a} as registers,
    // instead of {b,a} on the stack.  Narrow stages work best for __vectorcall.
    #define ABI __vectorcall
    #define JUMPER_NARROW_STAGES 1
#elif defined(__x86_64__) || defined(SK_CPU_ARM64)
    // These platforms are ideal for wider stages, and their default ABI is ideal.
    #define ABI
    #define JUMPER_NARROW_STAGES 0
#else
    // 32-bit or unknown... shunt them down the narrow path.
    // Odds are these have few registers and are better off there.
    #define ABI
    #define JUMPER_NARROW_STAGES 1
#endif
1167
#if JUMPER_NARROW_STAGES
    // Narrow stages: only {r,g,b,a} travel in registers; the destination registers
    // and bookkeeping (dx, dy, tail, base) ride along in a Params struct by pointer.
    struct Params {
        size_t dx, dy, tail;
        std::byte* base;
        F dr,dg,db,da;
    };
    using Stage = void(ABI*)(Params*, SkRasterPipelineStage* program, F r, F g, F b, F a);
#else
    // Wide stages: all eight color registers plus bookkeeping pass as arguments.
    using Stage = void(ABI*)(size_t tail, SkRasterPipelineStage* program, size_t dx, size_t dy,
                             std::byte* base, F,F,F,F, F,F,F,F);
#endif
1179
// Run `program` over every pixel of the rectangle [dx,xlimit) x [dy,ylimit):
// full N-pixel strides first, then one tail call for any remainder (< N pixels).
static void start_pipeline(size_t dx, size_t dy,
                           size_t xlimit, size_t ylimit,
                           SkRasterPipelineStage* program) {
    auto start = (Stage)program->fn;
    const size_t x0 = dx;
    std::byte* const base = nullptr;
    for (; dy < ylimit; dy++) {
    #if JUMPER_NARROW_STAGES
        Params params = { x0,dy,0,base, 0,0,0,0 };
        while (params.dx + N <= xlimit) {
            start(&params,program, 0,0,0,0);
            params.dx += N;
        }
        if (size_t tail = xlimit - params.dx) {
            params.tail = tail;
            start(&params,program, 0,0,0,0);
        }
    #else
        dx = x0;  // Reset to the left edge for each new row.
        while (dx + N <= xlimit) {
            start(0,program,dx,dy,base, 0,0,0,0, 0,0,0,0);
            dx += N;
        }
        if (size_t tail = xlimit - dx) {
            start(tail,program,dx,dy,base, 0,0,0,0, 0,0,0,0);
        }
    #endif
    }
}
1209
// Where supported, [[clang::musttail]] guarantees tail calls so long SkSL-generated
// programs cannot overflow the stack; otherwise it expands to nothing.
#if SK_HAS_MUSTTAIL
    #define JUMPER_MUSTTAIL [[clang::musttail]]
#else
    #define JUMPER_MUSTTAIL
#endif
1215
#if JUMPER_NARROW_STAGES
    // Declares a stage: forward-declares the name##_k worker, defines the ABI wrapper `name`
    // that unpacks Params, runs the worker, advances `program`, and calls the next stage,
    // then opens the worker's definition (the STAGE body follows the macro invocation).
    #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL)                     \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, size_t tail, std::byte*& base,    \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);         \
        static void ABI name(Params* params, SkRasterPipelineStage* program,               \
                             F r, F g, F b, F a) {                                         \
            OFFSET name##_k(Ctx{program}, params->dx,params->dy,params->tail,params->base, \
                            r,g,b,a, params->dr, params->dg, params->db, params->da);      \
            INC;                                                                           \
            auto fn = (Stage)program->fn;                                                  \
            MUSTTAIL return fn(params, program, r,g,b,a);                                  \
        }                                                                                  \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, size_t tail, std::byte*& base,    \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else
    // Wide variant: identical structure, but all registers pass as function arguments.
    #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL)                          \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, size_t tail, std::byte*& base,         \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);              \
        static void ABI name(size_t tail, SkRasterPipelineStage* program, size_t dx, size_t dy, \
                             std::byte* base, F r, F g, F b, F a, F dr, F dg, F db, F da) {     \
            OFFSET name##_k(Ctx{program}, dx,dy,tail,base, r,g,b,a, dr,dg,db,da);               \
            INC;                                                                                \
            auto fn = (Stage)program->fn;                                                       \
            MUSTTAIL return fn(tail, program, dx,dy,base, r,g,b,a, dr,dg,db,da);                \
        }                                                                                       \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, size_t tail, std::byte*& base,         \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
1244
// A typical stage returns void, always increments the program counter by 1, and lets the optimizer
// decide whether or not tail-calling is appropriate.
#define STAGE(name, arg) \
    DECLARE_STAGE(name, arg, void, ++program, /*no offset*/, /*no musttail*/)

// A tail stage returns void, always increments the program counter by 1, and uses tail-calling.
// Tail-calling is necessary in SkSL-generated programs, which can be thousands of ops long, and
// could overflow the stack (particularly in debug).
#define STAGE_TAIL(name, arg) \
    DECLARE_STAGE(name, arg, void, ++program, /*no offset*/, JUMPER_MUSTTAIL)

// A branch stage returns an integer, which is added directly to the program counter, and tailcalls.
// (The returned offset implements jumps in SkSL control flow.)
#define STAGE_BRANCH(name, arg) \
    DECLARE_STAGE(name, arg, int, /*no increment*/, program +=, JUMPER_MUSTTAIL)
1259
1260// just_return() is a simple no-op stage that only exists to end the chain,
1261// returning back up to start_pipeline(), and from there to the caller.
#if JUMPER_NARROW_STAGES
    // Deliberately empty: ends the tail-call chain so control returns to start_pipeline().
    static void ABI just_return(Params*, SkRasterPipelineStage*, F,F,F,F) {}
#else
    static void ABI just_return(size_t, SkRasterPipelineStage*, size_t,size_t, std::byte*,
                                F,F,F,F, F,F,F,F) {}
#endif
1268
1269// Note that in release builds, most stages consume no stack (thanks to tail call optimization).
1270// However: certain builds (especially with non-clang compilers) may fail to optimize tail
1271// calls, resulting in actual stack frames being generated.
1272//
1273// stack_checkpoint() and stack_rewind() are special stages that can be used to manage stack growth.
1274// If a pipeline contains a stack_checkpoint, followed by any number of stack_rewind (at any point),
1275// the C++ stack will be reset to the state it was at when the stack_checkpoint was initially hit.
1276//
1277// All instances of stack_rewind (as well as the one instance of stack_checkpoint near the start of
1278// a pipeline) share a single context (of type SkRasterPipeline_RewindCtx). That context holds the
1279// full state of the mutable registers that are normally passed to the next stage in the program.
1280//
1281// stack_rewind is the only stage other than just_return that actually returns (rather than jumping
1282// to the next stage in the program). Before it does so, it stashes all of the registers in the
1283// context. This includes the updated `program` pointer. Unlike stages that tail call exactly once,
1284// stack_checkpoint calls the next stage in the program repeatedly, as long as the `program` in the
1285// context is overwritten (i.e., as long as a stack_rewind was the reason the pipeline returned,
1286// rather than a just_return).
1287//
1288// Normally, just_return is the only stage that returns, and no other stage does anything after a
1289// subsequent (called) stage returns, so the stack just unwinds all the way to start_pipeline.
1290// With stack_checkpoint on the stack, any stack_rewind stages will return all the way up to the
1291// stack_checkpoint. That grabs the values that would have been passed to the next stage (from the
1292// context), and continues the linear execution of stages, but has reclaimed all of the stack frames
1293// pushed before the stack_rewind before doing so.
1294#if JUMPER_NARROW_STAGES
1295 static void ABI stack_checkpoint(Params* params, SkRasterPipelineStage* program,
1296 F r, F g, F b, F a) {
1297 SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1298 while (program) {
1299 auto next = (Stage)(++program)->fn;
1300
1301 ctx->stage = nullptr;
1302 next(params, program, r, g, b, a);
1303 program = ctx->stage;
1304
1305 if (program) {
1306 r = sk_unaligned_load<F>(ctx->r );
1307 g = sk_unaligned_load<F>(ctx->g );
1308 b = sk_unaligned_load<F>(ctx->b );
1309 a = sk_unaligned_load<F>(ctx->a );
1310 params->dr = sk_unaligned_load<F>(ctx->dr);
1311 params->dg = sk_unaligned_load<F>(ctx->dg);
1312 params->db = sk_unaligned_load<F>(ctx->db);
1313 params->da = sk_unaligned_load<F>(ctx->da);
1314 params->base = ctx->base;
1315 }
1316 }
1317 }
1318 static void ABI stack_rewind(Params* params, SkRasterPipelineStage* program,
1319 F r, F g, F b, F a) {
1320 SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1321 sk_unaligned_store(ctx->r , r );
1322 sk_unaligned_store(ctx->g , g );
1323 sk_unaligned_store(ctx->b , b );
1324 sk_unaligned_store(ctx->a , a );
1325 sk_unaligned_store(ctx->dr, params->dr);
1326 sk_unaligned_store(ctx->dg, params->dg);
1327 sk_unaligned_store(ctx->db, params->db);
1328 sk_unaligned_store(ctx->da, params->da);
1329 ctx->base = params->base;
1330 ctx->stage = program;
1331 }
1332#else
1333 static void ABI stack_checkpoint(size_t tail, SkRasterPipelineStage* program,
1334 size_t dx, size_t dy, std::byte* base,
1335 F r, F g, F b, F a, F dr, F dg, F db, F da) {
1336 SkRasterPipeline_RewindCtx* ctx = Ctx{.fStage: program};
1337 while (program) {
1338 auto next = (Stage)(++program)->fn;
1339
1340 ctx->stage = nullptr;
1341 next(tail, program, dx, dy, base, r, g, b, a, dr, dg, db, da);
1342 program = ctx->stage;
1343
1344 if (program) {
1345 r = sk_unaligned_load<F>(ptr: ctx->r );
1346 g = sk_unaligned_load<F>(ptr: ctx->g );
1347 b = sk_unaligned_load<F>(ptr: ctx->b );
1348 a = sk_unaligned_load<F>(ptr: ctx->a );
1349 dr = sk_unaligned_load<F>(ptr: ctx->dr);
1350 dg = sk_unaligned_load<F>(ptr: ctx->dg);
1351 db = sk_unaligned_load<F>(ptr: ctx->db);
1352 da = sk_unaligned_load<F>(ptr: ctx->da);
1353 base = ctx->base;
1354 }
1355 }
1356 }
1357 static void ABI stack_rewind(size_t tail, SkRasterPipelineStage* program,
1358 size_t dx, size_t dy, std::byte* base,
1359 F r, F g, F b, F a, F dr, F dg, F db, F da) {
1360 SkRasterPipeline_RewindCtx* ctx = Ctx{.fStage: program};
1361 sk_unaligned_store(ptr: ctx->r , val: r );
1362 sk_unaligned_store(ptr: ctx->g , val: g );
1363 sk_unaligned_store(ptr: ctx->b , val: b );
1364 sk_unaligned_store(ptr: ctx->a , val: a );
1365 sk_unaligned_store(ptr: ctx->dr, val: dr);
1366 sk_unaligned_store(ptr: ctx->dg, val: dg);
1367 sk_unaligned_store(ptr: ctx->db, val: db);
1368 sk_unaligned_store(ptr: ctx->da, val: da);
1369 ctx->base = base;
1370 ctx->stage = program;
1371 }
1372#endif
1373
1374
1375// We could start defining normal Stages now. But first, some helper functions.
1376
1377// These load() and store() methods are tail-aware,
1378// but focus mainly on keeping the at-stride tail==0 case fast.
1379
// Tail-aware load of up to N values of T into a vector V; lanes at or past `tail` are zeroed.
// The at-stride tail==0 case is a single unaligned full-width load.
template <typename V, typename T>
SI V load(const T* src, size_t tail) {
#if !defined(JUMPER_IS_SCALAR)
    __builtin_assume(tail < N);
    if (__builtin_expect(tail, 0)) {
        V v{};  // Any inactive lanes are zeroed.
        // Fall through from the odd lane into a single memcpy for the even remainder.
        switch (tail) {
            case 7: v[6] = src[6]; [[fallthrough]];
            case 6: v[5] = src[5]; [[fallthrough]];
            case 5: v[4] = src[4]; [[fallthrough]];
            case 4: memcpy(&v, src, 4*sizeof(T)); break;
            case 3: v[2] = src[2]; [[fallthrough]];
            case 2: memcpy(&v, src, 2*sizeof(T)); break;
            case 1: memcpy(&v, src, 1*sizeof(T)); break;
        }
        return v;
    }
#endif
    return sk_unaligned_load<V>(src);
}
1400
// Tail-aware store of up to N values of T from vector v; lanes at or past `tail` are not written.
// The at-stride tail==0 case is a single unaligned full-width store.
template <typename V, typename T>
SI void store(T* dst, V v, size_t tail) {
#if !defined(JUMPER_IS_SCALAR)
    __builtin_assume(tail < N);
    if (__builtin_expect(tail, 0)) {
        // Fall through from the odd lane into a single memcpy for the even remainder.
        switch (tail) {
            case 7: dst[6] = v[6]; [[fallthrough]];
            case 6: dst[5] = v[5]; [[fallthrough]];
            case 5: dst[4] = v[4]; [[fallthrough]];
            case 4: memcpy(dst, &v, 4*sizeof(T)); break;
            case 3: dst[2] = v[2]; [[fallthrough]];
            case 2: memcpy(dst, &v, 2*sizeof(T)); break;
            case 1: memcpy(dst, &v, 1*sizeof(T)); break;
        }
        return;
    }
#endif
    sk_unaligned_store(dst, v);
}
1420
1421SI F from_byte(U8 b) {
1422 return cast(v: expand(v: b)) * (1/255.0f);
1423}
1424SI F from_short(U16 s) {
1425 return cast(v: expand(v: s)) * (1/65535.0f);
1426}
1427SI void from_565(U16 _565, F* r, F* g, F* b) {
1428 U32 wide = expand(v: _565);
1429 *r = cast(v: wide & (31<<11)) * (1.0f / (31<<11));
1430 *g = cast(v: wide & (63<< 5)) * (1.0f / (63<< 5));
1431 *b = cast(v: wide & (31<< 0)) * (1.0f / (31<< 0));
1432}
1433SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
1434 U32 wide = expand(v: _4444);
1435 *r = cast(v: wide & (15<<12)) * (1.0f / (15<<12));
1436 *g = cast(v: wide & (15<< 8)) * (1.0f / (15<< 8));
1437 *b = cast(v: wide & (15<< 4)) * (1.0f / (15<< 4));
1438 *a = cast(v: wide & (15<< 0)) * (1.0f / (15<< 0));
1439}
1440SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
1441 *r = cast(v: (_8888 ) & 0xff) * (1/255.0f);
1442 *g = cast(v: (_8888 >> 8) & 0xff) * (1/255.0f);
1443 *b = cast(v: (_8888 >> 16) & 0xff) * (1/255.0f);
1444 *a = cast(v: (_8888 >> 24) ) * (1/255.0f);
1445}
1446SI void from_88(U16 _88, F* r, F* g) {
1447 U32 wide = expand(v: _88);
1448 *r = cast(v: (wide ) & 0xff) * (1/255.0f);
1449 *g = cast(v: (wide >> 8) & 0xff) * (1/255.0f);
1450}
1451SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) {
1452 *r = cast(v: (rgba ) & 0x3ff) * (1/1023.0f);
1453 *g = cast(v: (rgba >> 10) & 0x3ff) * (1/1023.0f);
1454 *b = cast(v: (rgba >> 20) & 0x3ff) * (1/1023.0f);
1455 *a = cast(v: (rgba >> 30) ) * (1/ 3.0f);
1456}
1457SI void from_1010102_xr(U32 rgba, F* r, F* g, F* b, F* a) {
1458 static constexpr float min = -0.752941f;
1459 static constexpr float max = 1.25098f;
1460 static constexpr float range = max - min;
1461 *r = cast(v: (rgba ) & 0x3ff) * (1/1023.0f) * range + min;
1462 *g = cast(v: (rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
1463 *b = cast(v: (rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
1464 *a = cast(v: (rgba >> 30) ) * (1/ 3.0f);
1465}
1466SI void from_1616(U32 _1616, F* r, F* g) {
1467 *r = cast(v: (_1616 ) & 0xffff) * (1/65535.0f);
1468 *g = cast(v: (_1616 >> 16) & 0xffff) * (1/65535.0f);
1469}
1470SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) {
1471 *r = cast64(v: (_16161616 ) & 0xffff) * (1/65535.0f);
1472 *g = cast64(v: (_16161616 >> 16) & 0xffff) * (1/65535.0f);
1473 *b = cast64(v: (_16161616 >> 32) & 0xffff) * (1/65535.0f);
1474 *a = cast64(v: (_16161616 >> 48) & 0xffff) * (1/65535.0f);
1475}
1476
1477// Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory.
1478template <typename T>
1479SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
1480 return (T*)ctx->pixels + dy*ctx->stride + dx;
1481}
1482
1483// clamp v to [0,limit).
1484SI F clamp(F v, F limit) {
1485 F inclusive = sk_bit_cast<F>( src: sk_bit_cast<U32>(src: limit) - 1 ); // Exclusive -> inclusive.
1486 return min(a: max(a: 0.0f, b: v), b: inclusive);
1487}
1488
1489// clamp to (0,limit).
1490SI F clamp_ex(F v, F limit) {
1491 const F inclusiveZ = std::numeric_limits<float>::min(),
1492 inclusiveL = sk_bit_cast<F>( src: sk_bit_cast<U32>(src: limit) - 1 );
1493 return min(a: max(a: inclusiveZ, b: v), b: inclusiveL);
1494}
1495
1496// Polynomial approximation of degree 5 for sin(x * 2 * pi) in the range [-1/4, 1/4]
1497// Adapted from https://github.com/google/swiftshader/blob/master/docs/Sin-Cos-Optimization.pdf
1498SI F sin5q_(F x) {
1499 // A * x + B * x^3 + C * x^5
1500 // Exact at x = 0, 1/12, 1/6, 1/4, and their negatives,
1501 // which correspond to x * 2 * pi = 0, pi/6, pi/3, pi/2
1502 constexpr float A = 6.28230858f;
1503 constexpr float B = -41.1693687f;
1504 constexpr float C = 74.4388885f;
1505 F x2 = x * x;
1506 return x * mad(f: mad(f: x2, m: C, a: B), m: x2, a: A);
1507}
1508
1509SI F sin_(F x) {
1510 constexpr float one_over_pi2 = 1 / (2 * SK_FloatPI);
1511 x = mad(f: x, m: -one_over_pi2, a: 0.25f);
1512 x = 0.25f - abs_(v: x - floor_(v: x + 0.5f));
1513 return sin5q_(x);
1514}
1515
1516SI F cos_(F x) {
1517 constexpr float one_over_pi2 = 1 / (2 * SK_FloatPI);
1518 x *= one_over_pi2;
1519 x = 0.25f - abs_(v: x - floor_(v: x + 0.5f));
1520 return sin5q_(x);
1521}
1522
1523/* "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
1524 https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf
1525
1526 approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9
1527
1528 Some simplifications:
1529 1. tan(x) is periodic, -PI/2 < x < PI/2
1530 2. tan(x) is odd, so tan(-x) = -tan(x)
1531 3. Our polynomial approximation is best near zero, so we use the following identity
1532 tan(x) + tan(y)
1533 tan(x + y) = -----------------
1534 1 - tan(x)*tan(y)
1535 tan(PI/4) = 1
1536
1537 So for x > PI/8, we do the following refactor:
1538 x' = x - PI/4
1539
1540 1 + tan(x')
1541 tan(x) = ------------
1542 1 - tan(x')
1543 */
1544SI F tan_(F x) {
1545 constexpr float Pi = SK_FloatPI;
1546 // periodic between -pi/2 ... pi/2
1547 // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
1548 x = fract(v: (1/Pi)*x + 0.5f) * Pi - (Pi/2);
1549
1550 I32 neg = (x < 0.0f);
1551 x = if_then_else(c: neg, t: -x, e: x);
1552
1553 // minimize total error by shifting if x > pi/8
1554 I32 use_quotient = (x > (Pi/8));
1555 x = if_then_else(c: use_quotient, t: x - (Pi/4), e: x);
1556
1557 // 9th order poly = 4th order(x^2) * x
1558 const float c4 = 62 / 2835.0f;
1559 const float c3 = 17 / 315.0f;
1560 const float c2 = 2 / 15.0f;
1561 const float c1 = 1 / 3.0f;
1562 const float c0 = 1.0f;
1563 F x2 = x * x;
1564 x *= mad(f: x2, m: mad(f: x2, m: mad(f: x2, m: mad(f: x2, m: c4, a: c3), a: c2), a: c1), a: c0);
1565 x = if_then_else(c: use_quotient, t: (1+x)/(1-x), e: x);
1566 x = if_then_else(c: neg, t: -x, e: x);
1567 return x;
1568}
1569
1570/* Use 4th order polynomial approximation from https://arachnoid.com/polysolve/
1571 with 129 values of x,atan(x) for x:[0...1]
1572 This only works for 0 <= x <= 1
1573 */
1574SI F approx_atan_unit(F x) {
1575 // y = 0.14130025741326729 x⁴
1576 // - 0.34312835980675116 x³
1577 // - 0.016172900528248768 x²
1578 // + 1.00376969762003850 x
1579 // - 0.00014758242182738969
1580 const float c4 = 0.14130025741326729f;
1581 const float c3 = -0.34312835980675116f;
1582 const float c2 = -0.016172900528248768f;
1583 const float c1 = 1.0037696976200385f;
1584 const float c0 = -0.00014758242182738969f;
1585 return mad(f: x, m: mad(f: x, m: mad(f: x, m: mad(f: x, m: c4, a: c3), a: c2), a: c1), a: c0);
1586}
1587
1588// Use identity atan(x) = pi/2 - atan(1/x) for x > 1
1589SI F atan_(F x) {
1590 I32 neg = (x < 0.0f);
1591 x = if_then_else(c: neg, t: -x, e: x);
1592 I32 flip = (x > 1.0f);
1593 x = if_then_else(c: flip, t: 1/x, e: x);
1594 x = approx_atan_unit(x);
1595 x = if_then_else(c: flip, t: SK_FloatPI/2 - x, e: x);
1596 x = if_then_else(c: neg, t: -x, e: x);
1597 return x;
1598}
1599
1600// Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun:
1601// https://books.google.com/books/content?id=ZboM5tOFWtsC&pg=PA81&img=1&zoom=3&hl=en&bul=1&sig=ACfU3U2M75tG_iGVOS92eQspr14LTq02Nw&ci=0%2C15%2C999%2C1279&edge=0
1602// http://screen/8YGJxUGFQ49bVX6
1603SI F asin_(F x) {
1604 I32 neg = (x < 0.0f);
1605 x = if_then_else(c: neg, t: -x, e: x);
1606 const float c3 = -0.0187293f;
1607 const float c2 = 0.0742610f;
1608 const float c1 = -0.2121144f;
1609 const float c0 = 1.5707288f;
1610 F poly = mad(f: x, m: mad(f: x, m: mad(f: x, m: c3, a: c2), a: c1), a: c0);
1611 x = SK_FloatPI/2 - sqrt_(v: 1 - x) * poly;
1612 x = if_then_else(c: neg, t: -x, e: x);
1613 return x;
1614}
1615
1616SI F acos_(F x) {
1617 return SK_FloatPI/2 - asin_(x);
1618}
1619
1620/* Use identity atan(x) = pi/2 - atan(1/x) for x > 1
1621 By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
1622 which avoids a 2nd divide instruction if we had instead called atan().
1623 */
1624SI F atan2_(F y0, F x0) {
1625 I32 flip = (abs_(v: y0) > abs_(v: x0));
1626 F y = if_then_else(c: flip, t: x0, e: y0);
1627 F x = if_then_else(c: flip, t: y0, e: x0);
1628 F arg = y/x;
1629
1630 I32 neg = (arg < 0.0f);
1631 arg = if_then_else(c: neg, t: -arg, e: arg);
1632
1633 F r = approx_atan_unit(x: arg);
1634 r = if_then_else(c: flip, t: SK_FloatPI/2 - r, e: r);
1635 r = if_then_else(c: neg, t: -r, e: r);
1636
1637 // handle quadrant distinctions
1638 r = if_then_else(c: (y0 >= 0) & (x0 < 0), t: r + SK_FloatPI, e: r);
1639 r = if_then_else(c: (y0 < 0) & (x0 <= 0), t: r - SK_FloatPI, e: r);
1640 // Note: we don't try to handle 0,0 or infinities
1641 return r;
1642}
1643
1644// Used by gather_ stages to calculate the base pointer and a vector of indices to load.
1645template <typename T>
1646SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
1647 // We use exclusive clamp so that our min value is > 0 because ULP subtraction using U32 would
1648 // produce a NaN if applied to +0.f.
1649 x = clamp_ex(v: x, limit: ctx->width );
1650 y = clamp_ex(v: y, limit: ctx->height);
1651 x = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: x) - (uint32_t)ctx->roundDownAtInteger);
1652 y = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: y) - (uint32_t)ctx->roundDownAtInteger);
1653 *ptr = (const T*)ctx->pixels;
1654 return trunc_(v: y)*ctx->stride + trunc_(v: x);
1655}
1656
1657// We often have a nominally [0,1] float value we need to scale and convert to an integer,
1658// whether for a table lookup or to pack back down into bytes for storage.
1659//
1660// In practice, especially when dealing with interesting color spaces, that notionally
1661// [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp.
1662//
1663// You can adjust the expected input to [0,bias] by tweaking that parameter.
1664SI U32 to_unorm(F v, F scale, F bias = 1.0f) {
1665 // Any time we use round() we probably want to use to_unorm().
1666 return round(v: min(a: max(a: 0.0f, b: v), b: bias), scale);
1667}
1668
// Canonicalize a comparison result into a per-lane all-bits-set / all-clear mask.
SI I32 cond_to_mask(I32 cond) {
#if defined(JUMPER_IS_SCALAR)
    // In scalar mode, conditions are bools (0 or 1), but we want to store and operate on masks
    // (eg, using bitwise operations to select values).
    return if_then_else(cond, I32(~0), I32(0));
#else
    // In SIMD mode, our various instruction sets already represent conditions as masks.
    return cond;
#endif
}
1679
#if defined(JUMPER_IS_SCALAR)
// In scalar mode, `data` only contains a single lane, so `lane` must be 0.
template <typename T>
SI T select_lane(T data, int lane) {
    SkASSERT(lane == 0);
    return data;
}
#else
// In SIMD mode, `data` contains a vector of lanes; return the requested one.
template <typename T>
SI T select_lane(V<T> data, int lane) {
    return data[lane];
}
#endif
1694
1695// Now finally, normal Stages!
1696
// Initialize r,g with pixel-center device coordinates for this run: r = dx + lane + 0.5,
// g = dy + 0.5.  b is set to 1 so (r,g,b) can act as homogeneous (x,y,1) coordinates.
STAGE(seed_shader, NoCtx) {
    static constexpr float iota[] = {
        0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
        8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
    };
    // It's important for speed to explicitly cast(dx) and cast(dy),
    // which has the effect of splatting them to vectors before converting to floats.
    // On Intel this breaks a data dependency on previous loop iterations' registers.
    r = cast(v: dx) + sk_unaligned_load<F>(ptr: iota);
    g = cast(v: dy) + 0.5f;
    b = 1.0f; // This is w=1 for matrix multiplies by the device coords.
    a = 0;
}
1710
// Apply 8x8 ordered dithering scaled by *rate to r,g,b, then clamp each channel to [0,a].
STAGE(dither, const float* rate) {
    // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors.
    uint32_t iota[] = {0,1,2,3,4,5,6,7};
    U32 X = dx + sk_unaligned_load<U32>(ptr: iota),
        Y = dy;

    // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
    // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ].

    // We only need X and X^Y from here on, so it's easier to just think of that as "Y".
    Y ^= X;

    // We'll mix the bottom 3 bits of each of X and Y to make 6 bits,
    // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda.
    U32 M = (Y & 1) << 5 | (X & 1) << 4
          | (Y & 2) << 2 | (X & 2) << 1
          | (Y & 4) >> 1 | (X & 4) >> 2;

    // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon.
    // We want to make sure our dither is less than 0.5 in either direction to keep exact values
    // like 0 and 1 unchanged after rounding.
    F dither = cast(v: M) * (2/128.0f) - (63/128.0f);

    r += *rate*dither;
    g += *rate*dither;
    b += *rate*dither;

    // Keep the dithered color premul-valid: clamp each channel to [0, a].
    r = max(a: 0.0f, b: min(a: r, b: a));
    g = max(a: 0.0f, b: min(a: g, b: a));
    b = max(a: 0.0f, b: min(a: b, b: a));
}
1742
1743// load 4 floats from memory, and splat them into r,g,b,a
// load 4 floats from memory, and splat them into r,g,b,a
STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
    r = c->r;
    g = c->g;
    b = c->b;
    a = c->a;
}
// Same as uniform_color in this (float) pipeline; the "unbounded" distinction matters for
// the lowp pipeline variant elsewhere — presumably no clamping there. TODO confirm.
STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
    r = c->r;
    g = c->g;
    b = c->b;
    a = c->a;
}
// load 4 floats from memory, and splat them into dr,dg,db,da
STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
    dr = c->r;
    dg = c->g;
    db = c->b;
    da = c->a;
}
1763
1764// splats opaque-black into r,g,b,a
// splats opaque-black into r,g,b,a
STAGE(black_color, NoCtx) {
    r = g = b = 0.0f;
    a = 1.0f;
}

// splats opaque-white into r,g,b,a
STAGE(white_color, NoCtx) {
    r = g = b = a = 1.0f;
}
1773
1774// load registers r,g,b,a from context (mirrors store_src)
// The next stages spill/restore pipeline registers to scratch memory laid out as
// N floats per channel (N = lanes per SIMD register), channels stored back to back.

// load registers r,g,b,a from context (mirrors store_src)
STAGE(load_src, const float* ptr) {
    r = sk_unaligned_load<F>(ptr: ptr + 0*N);
    g = sk_unaligned_load<F>(ptr: ptr + 1*N);
    b = sk_unaligned_load<F>(ptr: ptr + 2*N);
    a = sk_unaligned_load<F>(ptr: ptr + 3*N);
}

// store registers r,g,b,a into context (mirrors load_src)
STAGE(store_src, float* ptr) {
    sk_unaligned_store(ptr: ptr + 0*N, val: r);
    sk_unaligned_store(ptr: ptr + 1*N, val: g);
    sk_unaligned_store(ptr: ptr + 2*N, val: b);
    sk_unaligned_store(ptr: ptr + 3*N, val: a);
}
// store registers r,g into context
STAGE(store_src_rg, float* ptr) {
    sk_unaligned_store(ptr: ptr + 0*N, val: r);
    sk_unaligned_store(ptr: ptr + 1*N, val: g);
}
// load registers r,g from context
STAGE(load_src_rg, float* ptr) {
    r = sk_unaligned_load<F>(ptr: ptr + 0*N);
    g = sk_unaligned_load<F>(ptr: ptr + 1*N);
}
// store register a into context
STAGE(store_src_a, float* ptr) {
    sk_unaligned_store(ptr, val: a);
}

// load registers dr,dg,db,da from context (mirrors store_dst)
STAGE(load_dst, const float* ptr) {
    dr = sk_unaligned_load<F>(ptr: ptr + 0*N);
    dg = sk_unaligned_load<F>(ptr: ptr + 1*N);
    db = sk_unaligned_load<F>(ptr: ptr + 2*N);
    da = sk_unaligned_load<F>(ptr: ptr + 3*N);
}

// store registers dr,dg,db,da into context (mirrors load_dst)
STAGE(store_dst, float* ptr) {
    sk_unaligned_store(ptr: ptr + 0*N, val: dr);
    sk_unaligned_store(ptr: ptr + 1*N, val: dg);
    sk_unaligned_store(ptr: ptr + 2*N, val: db);
    sk_unaligned_store(ptr: ptr + 3*N, val: da);
}
1819
1820// Most blend modes apply the same logic to each channel.
// Most blend modes apply the same logic to each channel.
// The macro forward-declares name##_channel, emits a STAGE that applies it to all four
// channels (including alpha), then opens the definition of name##_channel itself.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name, NoCtx) {                       \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = name##_channel(a,da,a,da);         \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)

SI F inv(F x) { return 1.0f - x; }  // 1 - x, i.e. the "inverse alpha" weight.
SI F two(F x) { return x + x; }     // 2*x without a multiply.

// Porter-Duff and separable blend channel equations (premultiplied inputs).
BLEND_MODE(clear)    { return 0; }
BLEND_MODE(srcatop)  { return s*da + d*inv(x: sa); }
BLEND_MODE(dstatop)  { return d*sa + s*inv(x: da); }
BLEND_MODE(srcin)    { return s * da; }
BLEND_MODE(dstin)    { return d * sa; }
BLEND_MODE(srcout)   { return s * inv(x: da); }
BLEND_MODE(dstout)   { return d * inv(x: sa); }
BLEND_MODE(srcover)  { return mad(f: d, m: inv(x: sa), a: s); }
BLEND_MODE(dstover)  { return mad(f: s, m: inv(x: da), a: d); }

BLEND_MODE(modulate) { return s*d; }
BLEND_MODE(multiply) { return s*inv(x: da) + d*inv(x: sa) + s*d; }
BLEND_MODE(plus_)    { return min(a: s + d, b: 1.0f); }  // We can clamp to either 1 or sa.
BLEND_MODE(screen)   { return s + d - s*d; }
BLEND_MODE(xor_)     { return s*inv(x: da) + d*inv(x: sa); }
#undef BLEND_MODE
1851
1852// Most other blend modes apply the same logic to colors, and srcover to alpha.
// Most other blend modes apply the same logic to colors, and srcover to alpha.
// (a = da*(1-a) + a is the srcover equation for the alpha channel.)
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name, NoCtx) {                       \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = mad(da, inv(a), a);                \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)

BLEND_MODE(darken)     { return s + d -     max(a: s*da, b: d*sa) ; }
BLEND_MODE(lighten)    { return s + d -     min(a: s*da, b: d*sa) ; }
BLEND_MODE(difference) { return s + d - two(x: min(a: s*da, b: d*sa)); }
BLEND_MODE(exclusion)  { return s + d - two(x: s*d); }
1867
// Color burn: special-case d == da and s == 0 to avoid the division (rcp_fast) blowing up.
BLEND_MODE(colorburn) {
    return if_then_else(c: d == da, t: d + s*inv(x: da),
           e: if_then_else(c: s == 0, /* s + */ t: d*inv(x: sa),
                        e: sa*(da - min(a: da, b: (da-d)*sa*rcp_fast(v: s))) + s*inv(x: da) + d*inv(x: sa)));
}
// Color dodge: mirror of colorburn, guarding d == 0 and s == sa.
BLEND_MODE(colordodge) {
    return if_then_else(c: d == 0, /* d + */ t: s*inv(x: da),
           e: if_then_else(c: s == sa, t: s + d*inv(x: sa),
                        e: sa*min(a: da, b: (d*sa)*rcp_fast(v: sa - s)) + s*inv(x: da) + d*inv(x: sa)));
}
// Hard light: multiply or screen depending on the *source* channel (2s <= sa).
BLEND_MODE(hardlight) {
    return s*inv(x: da) + d*inv(x: sa)
         + if_then_else(c: two(x: s) <= sa, t: two(x: s*d), e: sa*da - two(x: (da-d)*(sa-s)));
}
// Overlay: hardlight with the roles of source and destination swapped in the condition.
BLEND_MODE(overlay) {
    return s*inv(x: da) + d*inv(x: sa)
         + if_then_else(c: two(x: d) <= da, t: two(x: s*d), e: sa*da - two(x: (da-d)*(sa-s)));
}
1886
// Soft light, following the three-case piecewise definition in the compositing spec.
BLEND_MODE(softlight) {
    F m  = if_then_else(c: da > 0, t: d / da, e: 0),  // unpremultiplied dst, guarded against da == 0
      s2 = two(x: s),
      m4 = two(x: two(x: m));

    // The logic forks three ways:
    //    1. dark src?
    //    2. light src, dark dst?
    //    3. light src, light dst?
    F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)),     // Used in case 1.
      darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m,  // Used in case 2.
      liteDst = sqrt_(v: m) - m,
      liteSrc = d*sa + da*(s2 - sa) * if_then_else(c: two(x: two(x: d)) <= da, t: darkDst, e: liteDst); // 2 or 3?
    return s*inv(x: da) + d*inv(x: sa) + if_then_else(c: s2 <= sa, t: darkSrc, e: liteSrc);      // 1 or (2 or 3)?
}
1903
1904// We're basing our implemenation of non-separable blend modes on
1905// https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1906// and
1907// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1908// They're equivalent, but ES' math has been better simplified.
1909//
1910// Anything extra we add beyond that is to make the math work with premul inputs.
1911
// Saturation = max(channel) - min(channel).
SI F sat(F r, F g, F b) { return max(a: r, b: max(a: g,b)) - min(a: r, b: min(a: g,b)); }
// Luminance with Rec.601-style weights 0.30/0.59/0.11.
SI F lum(F r, F g, F b) { return mad(f: r, m: 0.30f, a: mad(f: g, m: 0.59f, a: b*0.11f)); }

// Rescale (r,g,b) in place so their saturation becomes s while preserving hue ordering.
SI void set_sat(F* r, F* g, F* b, F s) {
    F mn  = min(a: *r, b: min(a: *g,b: *b)),
      mx  = max(a: *r, b: max(a: *g,b: *b)),
      sat = mx - mn;

    // Map min channel to 0, max channel to s, and scale the middle proportionally.
    auto scale = [=](F c) {
        // sat == 0 means all channels equal (gray): everything maps to 0.
        return if_then_else(c: sat == 0, t: 0, e: (c - mn) * s / sat);
    };
    *r = scale(*r);
    *g = scale(*g);
    *b = scale(*b);
}
// Shift (r,g,b) uniformly so their luminance becomes l.
SI void set_lum(F* r, F* g, F* b, F l) {
    F diff = l - lum(r: *r, g: *g, b: *b);
    *r += diff;
    *g += diff;
    *b += diff;
}
// Clamp out-of-range channels back toward the luminance point, keeping result in [0,a].
SI void clip_color(F* r, F* g, F* b, F a) {
    F mn = min(a: *r, b: min(a: *g, b: *b)),
      mx = max(a: *r, b: max(a: *g, b: *b)),
      l  = lum(r: *r, g: *g, b: *b);

    auto clip = [=](F c) {
        c = if_then_else(c: mn < 0 && l != mn, t: l + (c - l) * (    l) / (l - mn), e: c);
        c = if_then_else(c: mx > a && l != mx, t: l + (c - l) * (a - l) / (mx - l), e: c);
        c = max(a: c, b: 0.0f);  // Sometimes without this we may dip just a little negative.
        return c;
    };
    *r = clip(*r);
    *g = clip(*g);
    *b = clip(*b);
}
1949
// The four non-separable blend modes.  Each works on alpha-scaled channels, then composites
// with r*inv(da) + dr*inv(a) + blended, and uses srcover for alpha (a + da - a*da).

// Hue: take source hue, destination saturation and luminance.
STAGE(hue, NoCtx) {
    F R = r*a,
      G = g*a,
      B = b*a;

    set_sat(r: &R, g: &G, b: &B, s: sat(r: dr,g: dg,b: db)*a);
    set_lum(r: &R, g: &G, b: &B, l: lum(r: dr,g: dg,b: db)*a);
    clip_color(r: &R,g: &G,b: &B, a: a*da);

    r = r*inv(x: da) + dr*inv(x: a) + R;
    g = g*inv(x: da) + dg*inv(x: a) + G;
    b = b*inv(x: da) + db*inv(x: a) + B;
    a = a + da - a*da;
}
// Saturation: take source saturation, destination hue and luminance.
STAGE(saturation, NoCtx) {
    F R = dr*a,
      G = dg*a,
      B = db*a;

    set_sat(r: &R, g: &G, b: &B, s: sat( r, g, b)*da);
    set_lum(r: &R, g: &G, b: &B, l: lum(r: dr,g: dg,b: db)* a);  // (This is not redundant.)
    clip_color(r: &R,g: &G,b: &B, a: a*da);

    r = r*inv(x: da) + dr*inv(x: a) + R;
    g = g*inv(x: da) + dg*inv(x: a) + G;
    b = b*inv(x: da) + db*inv(x: a) + B;
    a = a + da - a*da;
}
// Color: take source hue and saturation, destination luminance.
STAGE(color, NoCtx) {
    F R = r*da,
      G = g*da,
      B = b*da;

    set_lum(r: &R, g: &G, b: &B, l: lum(r: dr,g: dg,b: db)*a);
    clip_color(r: &R,g: &G,b: &B, a: a*da);

    r = r*inv(x: da) + dr*inv(x: a) + R;
    g = g*inv(x: da) + dg*inv(x: a) + G;
    b = b*inv(x: da) + db*inv(x: a) + B;
    a = a + da - a*da;
}
// Luminosity: take source luminance, destination hue and saturation.
STAGE(luminosity, NoCtx) {
    F R = dr*a,
      G = dg*a,
      B = db*a;

    set_lum(r: &R, g: &G, b: &B, l: lum(r,g,b)*da);
    clip_color(r: &R,g: &G,b: &B, a: a*da);

    r = r*inv(x: da) + dr*inv(x: a) + R;
    g = g*inv(x: da) + dg*inv(x: a) + G;
    b = b*inv(x: da) + db*inv(x: a) + B;
    a = a + da - a*da;
}
2004
// Fused load + srcover + store for RGBA_8888 memory: keeps the destination in [0,255]
// the whole time, saving the usual /255 and *255 conversions.
STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    U32 dst = load<U32>(src: ptr, tail);
    dr = cast(v: (dst      ) & 0xff);
    dg = cast(v: (dst >>  8) & 0xff);
    db = cast(v: (dst >> 16) & 0xff);
    da = cast(v: (dst >> 24)       );
    // {dr,dg,db,da} are in [0,255]
    // { r, g, b, a} are in [0,  1] (but may be out of gamut)

    r = mad(f: dr, m: inv(x: a), a: r*255.0f);
    g = mad(f: dg, m: inv(x: a), a: g*255.0f);
    b = mad(f: db, m: inv(x: a), a: b*255.0f);
    a = mad(f: da, m: inv(x: a), a: a*255.0f);
    // { r, g, b, a} are now in [0,255]  (but may be out of gamut)

    // to_unorm() clamps back to gamut.  Scaling by 1 since we're already 255-biased.
    dst = to_unorm(v: r, scale: 1, bias: 255)
        | to_unorm(v: g, scale: 1, bias: 255) <<  8
        | to_unorm(v: b, scale: 1, bias: 255) << 16
        | to_unorm(v: a, scale: 1, bias: 255) << 24;
    store(dst: ptr, v: dst, tail);
}
2029
// Clamp to [0,1]; max-then-min order also flushes NaN to 0.
SI F clamp_01_(F v) { return min(a: max(a: 0.0f, b: v), b: 1.0f); }

// Clamp all four channels to [0,1].
STAGE(clamp_01, NoCtx) {
    r = clamp_01_(v: r);
    g = clamp_01_(v: g);
    b = clamp_01_(v: b);
    a = clamp_01_(v: a);
}

// Clamp to the premul gamut: alpha to [0,1], then each color channel to [0, a].
STAGE(clamp_gamut, NoCtx) {
    a = min(a: max(a, b: 0.0f), b: 1.0f);
    r = min(a: max(a: r, b: 0.0f), b: a);
    g = min(a: max(a: g, b: 0.0f), b: a);
    b = min(a: max(a: b, b: 0.0f), b: a);
}
2045
// Overwrite r,g,b from three floats in the context; alpha is left untouched.
STAGE(set_rgb, const float* rgb) {
    r = rgb[0];
    g = rgb[1];
    b = rgb[2];
}

// Identical body here; the bounded/unbounded distinction presumably matters only in the
// lowp pipeline variant — TODO confirm.
STAGE(unbounded_set_rgb, const float* rgb) {
    r = rgb[0];
    g = rgb[1];
    b = rgb[2];
}
2057
2058STAGE(swap_rb, NoCtx) {
2059 auto tmp = r;
2060 r = b;
2061 b = tmp;
2062}
2063STAGE(swap_rb_dst, NoCtx) {
2064 auto tmp = dr;
2065 dr = db;
2066 db = tmp;
2067}
2068
// Copy the source registers into the destination registers.
STAGE(move_src_dst, NoCtx) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
// Copy the destination registers into the source registers.
STAGE(move_dst_src, NoCtx) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}
// Exchange source and destination registers.
STAGE(swap_src_dst, NoCtx) {
    std::swap(x&: r, y&: dr);
    std::swap(x&: g, y&: dg);
    std::swap(x&: b, y&: db);
    std::swap(x&: a, y&: da);
}
2087
// Convert unpremultiplied color to premultiplied: multiply color channels by alpha.
STAGE(premul, NoCtx) {
    r = r * a;
    g = g * a;
    b = b * a;
}
STAGE(premul_dst, NoCtx) {
    dr = dr * da;
    dg = dg * da;
    db = db * da;
}
// Convert premultiplied to unpremultiplied.  The 1/a < inf test catches a == 0 (and
// denormal a whose reciprocal overflows), mapping those lanes to a scale of 0 instead
// of producing inf/NaN.
STAGE(unpremul, NoCtx) {
    float inf = sk_bit_cast<float>(src: 0x7f800000);
    auto scale = if_then_else(c: 1.0f/a < inf, t: 1.0f/a, e: 0);
    r *= scale;
    g *= scale;
    b *= scale;
}
// Same, but for polar color spaces: the first channel holds hue, which is not scaled by alpha.
STAGE(unpremul_polar, NoCtx) {
    float inf = sk_bit_cast<float>(src: 0x7f800000);
    auto scale = if_then_else(c: 1.0f/a < inf, t: 1.0f/a, e: 0);
    g *= scale;
    b *= scale;
}

STAGE(force_opaque    , NoCtx) {  a = 1; }  // Force source alpha to fully opaque.
STAGE(force_opaque_dst, NoCtx) { da = 1; }  // Force destination alpha to fully opaque.
2114
// Convert r,g,b to HSL, storing h in r, s in g, l in b.  h is scaled to [0,1) (1/6 per sextant).
STAGE(rgb_to_hsl, NoCtx) {
    F mx = max(a: r, b: max(a: g,b)),
      mn = min(a: r, b: min(a: g,b)),
      d = mx - mn,
      d_rcp = 1.0f / d;

    // Hue depends on which channel is the max; the mx == mn guard avoids 0/0.
    F h = (1/6.0f) *
          if_then_else(c: mx == mn, t: 0,
          e: if_then_else(c: mx == r, t: (g-b)*d_rcp + if_then_else(c: g < b, t: 6.0f, e: 0),
          e: if_then_else(c: mx == g, t: (b-r)*d_rcp + 2.0f,
                                  e: (r-g)*d_rcp + 4.0f)));

    F l = (mx + mn) * 0.5f;
    F s = if_then_else(c: mx == mn, t: 0,
                       e: d / if_then_else(c: l > 0.5f, t: 2.0f-mx-mn, e: mx+mn));

    r = h;
    g = s;
    b = l;
}
// Inverse of rgb_to_hsl: interpret r,g,b as h,s,l and convert back to RGB.
STAGE(hsl_to_rgb, NoCtx) {
    // See GrRGBToHSLFilterEffect.fp

    F h = r,
      s = g,
      l = b,
      c = (1.0f - abs_(v: 2.0f * l - 1)) * s;

    // Triangular-wave helper mapping a hue offset to one RGB channel.
    auto hue_to_rgb = [&](F hue) {
        F q = clamp_01_(v: abs_(v: fract(v: hue) * 6.0f - 3.0f) - 1.0f);
        return (q - 0.5f) * c + l;
    };

    r = hue_to_rgb(h + 0.0f/3.0f);
    g = hue_to_rgb(h + 2.0f/3.0f);
    b = hue_to_rgb(h + 1.0f/3.0f);
}
2152
2153// Color conversion functions used in gradient interpolation, based on
2154// https://www.w3.org/TR/css-color-4/#color-conversion-code
// Color conversion functions used in gradient interpolation, based on
// https://www.w3.org/TR/css-color-4/#color-conversion-code
// Input: L in r, a* in g, b* in b.  Output: XYZ adapted to the D50 white point.
STAGE(css_lab_to_xyz, NoCtx) {
    constexpr float k = 24389 / 27.0f;   // CIE kappa
    constexpr float e = 216 / 24389.0f;  // CIE epsilon

    F f[3];
    f[1] = (r + 16) * (1 / 116.0f);
    f[0] = (g * (1 / 500.0f)) + f[1];
    f[2] = f[1] - (b * (1 / 200.0f));

    F f_cubed[3] = { f[0]*f[0]*f[0], f[1]*f[1]*f[1], f[2]*f[2]*f[2] };

    // Each axis uses the cubic branch above the epsilon threshold, linear below.
    // (The Y axis tests L (in r) directly, per the spec.)
    F xyz[3] = {
        if_then_else(c: f_cubed[0] > e, t: f_cubed[0], e: (116 * f[0] - 16) * (1 / k)),
        if_then_else(c: r > k * e,      t: f_cubed[1], e: r * (1 / k)),
        if_then_else(c: f_cubed[2] > e, t: f_cubed[2], e: (116 * f[2] - 16) * (1 / k))
    };

    constexpr float D50[3] = { 0.3457f / 0.3585f, 1.0f, (1.0f - 0.3457f - 0.3585f) / 0.3585f };
    r = xyz[0]*D50[0];
    g = xyz[1]*D50[1];
    b = xyz[2]*D50[2];
}
2177
// OKLab -> linear sRGB: LMS' = M1 * Lab, cube, then M2 * LMS (constants from the OKLab spec).
STAGE(css_oklab_to_linear_srgb, NoCtx) {
    F l_ = r + 0.3963377774f * g + 0.2158037573f * b,
      m_ = r - 0.1055613458f * g - 0.0638541728f * b,
      s_ = r - 0.0894841775f * g - 1.2914855480f * b;

    F l = l_*l_*l_,
      m = m_*m_*m_,
      s = s_*s_*s_;

    r = +4.0767416621f * l - 3.3077115913f * m + 0.2309699292f * s;
    g = -1.2684380046f * l + 2.6097574011f * m - 0.3413193965f * s;
    b = -0.0041960863f * l - 0.7034186147f * m + 1.7076147010f * s;
}

// Skia stores all polar colors with hue in the first component, so this "LCH -> Lab" transform
// actually takes "HCL". This is also used to do the same polar transform for OkHCL to OkLAB.
// See similar comments & logic in SkGradientBaseShader.cpp.
STAGE(css_hcl_to_lab, NoCtx) {
    F H = r,
      C = g,
      L = b;

    F hueRadians = H * (SK_FloatPI / 180);

    r = L;
    g = C * cos_(x: hueRadians);
    b = C * sin_(x: hueRadians);
}
2206
// Floating-point modulus with the sign of y (always non-negative here since y > 0).
SI F mod_(F x, float y) {
    return x - y * floor_(v: x * (1 / y));
}

// Small aggregate so css_hsl_to_srgb_ can return all three channels at once.
struct RGB { F r, g, b; };

// CSS HSL -> sRGB per https://www.w3.org/TR/css-color-4/ (h in degrees, s and l in percent).
SI RGB css_hsl_to_srgb_(F h, F s, F l) {
    h = mod_(x: h, y: 360);

    s *= 0.01f;
    l *= 0.01f;

    F k[3] = {
        mod_(x: 0 + h * (1 / 30.0f), y: 12),
        mod_(x: 8 + h * (1 / 30.0f), y: 12),
        mod_(x: 4 + h * (1 / 30.0f), y: 12)
    };
    F a = s * min(a: l, b: 1 - l);
    return {
        .r: l - a * max(a: -1.0f, b: min(a: min(a: k[0] - 3.0f, b: 9.0f - k[0]), b: 1.0f)),
        .g: l - a * max(a: -1.0f, b: min(a: min(a: k[1] - 3.0f, b: 9.0f - k[1]), b: 1.0f)),
        .b: l - a * max(a: -1.0f, b: min(a: min(a: k[2] - 3.0f, b: 9.0f - k[2]), b: 1.0f))
    };
}
2231
// Stage wrapper: r,g,b hold h,s,l on entry and sRGB on exit.
STAGE(css_hsl_to_srgb, NoCtx) {
    RGB rgb = css_hsl_to_srgb_(h: r, s: g, l: b);
    r = rgb.r;
    g = rgb.g;
    b = rgb.b;
}

// CSS HWB -> sRGB: start from the fully-saturated hue, then mix in whiteness (g) and
// blackness (b).  When whiteness + blackness >= 1 the result is the achromatic gray w/(w+b).
STAGE(css_hwb_to_srgb, NoCtx) {
    g *= 0.01f;
    b *= 0.01f;

    F gray = g / (g + b);

    RGB rgb = css_hsl_to_srgb_(h: r, s: 100.0f, l: 50.0f);
    rgb.r = rgb.r * (1 - g - b) + g;
    rgb.g = rgb.g * (1 - g - b) + g;
    rgb.b = rgb.b * (1 - g - b) + g;

    auto isGray = (g + b) >= 1;

    r = if_then_else(c: isGray, t: gray, e: rgb.r);
    g = if_then_else(c: isGray, t: gray, e: rgb.g);
    b = if_then_else(c: isGray, t: gray, e: rgb.b);
}
2256
2257// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
// Conservative choice: take the min coverage when blending darkens alpha (a < da),
// the max when it lightens.
SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) {
    return if_then_else(c: a < da, t: min(a: cr, b: min(a: cg,b: cb))
                              , e: max(a: cr, b: max(a: cg,b: cb)));
}
2262
// Scale all channels by a single uniform coverage value.
STAGE(scale_1_float, const float* c) {
    r = r * *c;
    g = g * *c;
    b = b * *c;
    a = a * *c;
}
// Scale all channels by per-pixel 8-bit coverage loaded from memory.
STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);

    auto scales = load<U8>(src: ptr, tail);
    auto c = from_byte(b: scales);

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}
// Scale with per-channel coverage from 565 (LCD/subpixel masks); alpha coverage is derived.
STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);

    F cr,cg,cb;
    from_565(565: load<U16>(src: ptr, tail), r: &cr, g: &cg, b: &cb);

    F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = r * cr;
    g = g * cg;
    b = b * cb;
    a = a * ca;
}
2293
// Linear interpolation: from + (to - from)*t, written as a single fused multiply-add.
SI F lerp(F from, F to, F t) {
    return mad(f: to-from, m: t, a: from);
}

// Lerp src toward dst with a single uniform coverage value.
STAGE(lerp_1_float, const float* c) {
    r = lerp(from: dr, to: r, t: *c);
    g = lerp(from: dg, to: g, t: *c);
    b = lerp(from: db, to: b, t: *c);
    a = lerp(from: da, to: a, t: *c);
}
// Scale by per-lane coverage already in pipeline-native float layout.
STAGE(scale_native, const float scales[]) {
    auto c = sk_unaligned_load<F>(ptr: scales);
    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}
// Lerp with per-lane coverage already in pipeline-native float layout.
STAGE(lerp_native, const float scales[]) {
    auto c = sk_unaligned_load<F>(ptr: scales);
    r = lerp(from: dr, to: r, t: c);
    g = lerp(from: dg, to: g, t: c);
    b = lerp(from: db, to: b, t: c);
    a = lerp(from: da, to: a, t: c);
}
// Lerp with per-pixel 8-bit coverage loaded from memory.
STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);

    auto scales = load<U8>(src: ptr, tail);
    auto c = from_byte(b: scales);

    r = lerp(from: dr, to: r, t: c);
    g = lerp(from: dg, to: g, t: c);
    b = lerp(from: db, to: b, t: c);
    a = lerp(from: da, to: a, t: c);
}
// Lerp with per-channel 565 coverage (LCD text); alpha coverage is derived.
STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);

    F cr,cg,cb;
    from_565(565: load<U16>(src: ptr, tail), r: &cr, g: &cg, b: &cb);

    F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = lerp(from: dr, to: r, t: cr);
    g = lerp(from: dg, to: g, t: cg);
    b = lerp(from: db, to: b, t: cb);
    a = lerp(from: da, to: a, t: ca);
}
2342
// Emboss: per-pixel multiply-add on the color channels, with mul and add factors each
// loaded from their own 8-bit plane.  Alpha is untouched.
STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
    auto mptr = ptr_at_xy<const uint8_t>(ctx: &ctx->mul, dx,dy),
         aptr = ptr_at_xy<const uint8_t>(ctx: &ctx->add, dx,dy);

    F mul = from_byte(b: load<U8>(src: mptr, tail)),
      add = from_byte(b: load<U8>(src: aptr, tail));

    r = mad(f: r, m: mul, a: add);
    g = mad(f: g, m: mul, a: add);
    b = mad(f: b, m: mul, a: add);
}

// Remap each channel through its own 256-entry byte lookup table.
STAGE(byte_tables, const SkRasterPipeline_TablesCtx* tables) {
    r = from_byte(b: gather(p: tables->r, ix: to_unorm(v: r, scale: 255)));
    g = from_byte(b: gather(p: tables->g, ix: to_unorm(v: g, scale: 255)));
    b = from_byte(b: gather(p: tables->b, ix: to_unorm(v: b, scale: 255)));
    a = from_byte(b: gather(p: tables->a, ix: to_unorm(v: a, scale: 255)));
}
2361
// Remove the sign bit from x, returning |x| and stashing the sign bits in *sign.
// Used so transfer functions can be applied to |x| and mirrored for negatives.
SI F strip_sign(F x, U32* sign) {
    U32 bits = sk_bit_cast<U32>(src: x);
    *sign = bits & 0x80000000;
    return sk_bit_cast<F>(src: bits ^ *sign);
}

// Re-attach sign bits previously removed by strip_sign().
SI F apply_sign(F x, U32 sign) {
    return sk_bit_cast<F>(src: sign | sk_bit_cast<U32>(src: x));
}
2371
// Apply a 7-parameter skcms transfer function to r,g,b:
//   y = c*x + f                 for x <= d
//   y = (a*x + b)^g + e         otherwise
// Negative inputs are handled by mirroring (strip/apply_sign).
STAGE(parametric, const skcms_TransferFunction* ctx) {
    auto fn = [&](F v) {
        U32 sign;
        v = strip_sign(x: v, sign: &sign);

        F r = if_then_else(c: v <= ctx->d, t: mad(f: ctx->c, m: v, a: ctx->f)
                                       , e: approx_powf(x: mad(f: ctx->a, m: v, a: ctx->b), y: ctx->g) + ctx->e);
        return apply_sign(x: r, sign);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}

// Apply a pure power curve y = x^G to r,g,b, mirrored for negative inputs.
STAGE(gamma_, const float* G) {
    auto fn = [&](F v) {
        U32 sign;
        v = strip_sign(x: v, sign: &sign);
        return apply_sign(x: approx_powf(x: v, y: *G), sign);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
2396
// PQ-style transfer function (as parameterized by skcms):
//   y = (max(a + b*x^c, 0) / (d + e*x^c))^f, mirrored for negative inputs.
STAGE(PQish, const skcms_TransferFunction* ctx) {
    auto fn = [&](F v) {
        U32 sign;
        v = strip_sign(x: v, sign: &sign);

        F r = approx_powf(x: max(a: mad(f: ctx->b, m: approx_powf(x: v, y: ctx->c), a: ctx->a), b: 0.0f)
                           / (mad(f: ctx->e, m: approx_powf(x: v, y: ctx->c), a: ctx->d)),
                        y: ctx->f);

        return apply_sign(x: r, sign);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}

// HLG-style transfer function (skcms parameterization): power segment below the knee,
// exponential segment above, scaled by K = f + 1.
STAGE(HLGish, const skcms_TransferFunction* ctx) {
    auto fn = [&](F v) {
        U32 sign;
        v = strip_sign(x: v, sign: &sign);

        const float R = ctx->a, G = ctx->b,
                    a = ctx->c, b = ctx->d, c = ctx->e,
                    K = ctx->f + 1.0f;

        F r = if_then_else(c: v*R <= 1, t: approx_powf(x: v*R, y: G)
                                    , e: approx_exp(x: (v-c)*a) + b);

        return K * apply_sign(x: r, sign);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}

// Inverse of HLGish: divide out K first, then invert each segment (power below, log above).
STAGE(HLGinvish, const skcms_TransferFunction* ctx) {
    auto fn = [&](F v) {
        U32 sign;
        v = strip_sign(x: v, sign: &sign);

        const float R = ctx->a, G = ctx->b,
                    a = ctx->c, b = ctx->d, c = ctx->e,
                    K = ctx->f + 1.0f;

        v /= K;
        F r = if_then_else(c: v <= 1, t: R * approx_powf(x: v, y: G)
                                  , e: a * approx_log(x: v - b) + c);

        return apply_sign(x: r, sign);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
2451
// A8: alpha-only 8-bit format.  Color channels are zero (premul black).
STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);

    r = g = b = 0.0f;
    a = from_byte(b: load<U8>(src: ptr, tail));
}
STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);

    dr = dg = db = 0.0f;
    da = from_byte(b: load<U8>(src: ptr, tail));
}
// Gather variant: (r,g) are sample coordinates on entry.
STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
    r = g = b = 0.0f;
    a = from_byte(b: gather(p: ptr, ix));
}
// Store only the alpha channel as a byte (double pack narrows 32 -> 16 -> 8 bits).
STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);

    U8 packed = pack(v: pack(v: to_unorm(v: a, scale: 255)));
    store(dst: ptr, v: packed, tail);
}
// Store only the red channel as a byte (R8 format).
STAGE(store_r8, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);

    U8 packed = pack(v: pack(v: to_unorm(v: r, scale: 255)));
    store(dst: ptr, v: packed, tail);
}
2482
// RGB 565: 5 bits red, 6 green, 5 blue, no alpha (loads force a = 1).
STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);

    from_565(565: load<U16>(src: ptr, tail), r: &r,g: &g,b: &b);
    a = 1.0f;
}
STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);

    from_565(565: load<U16>(src: ptr, tail), r: &dr,g: &dg,b: &db);
    da = 1.0f;
}
STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
    from_565(565: gather(p: ptr, ix), r: &r,g: &g,b: &b);
    a = 1.0f;
}
STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);

    U16 px = pack( v: to_unorm(v: r, scale: 31) << 11
                 | to_unorm(v: g, scale: 63) <<  5
                 | to_unorm(v: b, scale: 31)       );
    store(dst: ptr, v: px, tail);
}

// ARGB 4444: 4 bits per channel, packed r,g,b,a from the top nibble down.
STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
    from_4444(4444: load<U16>(src: ptr, tail), r: &r,g: &g,b: &b,a: &a);
}
STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
    from_4444(4444: load<U16>(src: ptr, tail), r: &dr,g: &dg,b: &db,a: &da);
}
STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
    from_4444(4444: gather(p: ptr, ix), r: &r,g: &g,b: &b,a: &a);
}
STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
    U16 px = pack( v: to_unorm(v: r, scale: 15) << 12
                 | to_unorm(v: g, scale: 15) <<  8
                 | to_unorm(v: b, scale: 15) <<  4
                 | to_unorm(v: a, scale: 15)       );
    store(dst: ptr, v: px, tail);
}
2531
// RGBA 8888: 8 bits per channel, r in the low byte.
STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
    from_8888(8888: load<U32>(src: ptr, tail), r: &r,g: &g,b: &b,a: &a);
}
STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
    from_8888(8888: load<U32>(src: ptr, tail), r: &dr,g: &dg,b: &db,a: &da);
}
STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
    from_8888(8888: gather(p: ptr, ix), r: &r,g: &g,b: &b,a: &a);
}
STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    U32 px = to_unorm(v: r, scale: 255)
           | to_unorm(v: g, scale: 255) <<  8
           | to_unorm(v: b, scale: 255) << 16
           | to_unorm(v: a, scale: 255) << 24;
    store(dst: ptr, v: px, tail);
}

// RG88: two 8-bit channels; b forced to 0 and a to 1 on load.
STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
    from_88(88: load<U16>(src: ptr, tail), r: &r, g: &g);
    b = 0;
    a = 1;
}
STAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
    from_88(88: load<U16>(src: ptr, tail), r: &dr, g: &dg);
    db = 0;
    da = 1;
}
STAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
    from_88(88: gather(p: ptr, ix), r: &r, g: &g);
    b = 0;
    a = 1;
}
STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy);
    U16 px = pack( v: to_unorm(v: r, scale: 255) | to_unorm(v: g, scale: 255) <<  8 );
    store(dst: ptr, v: px, tail);
}
2579
// A16: alpha-only 16-bit unorm.
STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
    r = g = b = 0;
    a = from_short(s: load<U16>(src: ptr, tail));
}
STAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
    dr = dg = db = 0.0f;
    da = from_short(s: load<U16>(src: ptr, tail));
}
STAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
    r = g = b = 0.0f;
    a = from_short(s: gather(p: ptr, ix));
}
STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);

    U16 px = pack(v: to_unorm(v: a, scale: 65535));
    store(dst: ptr, v: px, tail);
}

// RG1616: two 16-bit unorm channels, r in the low half; b forced to 0, a to 1 on load.
STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
    b = 0; a = 1;
    from_1616(1616: load<U32>(src: ptr, tail), r: &r,g: &g);
}
STAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
    from_1616(1616: load<U32>(src: ptr, tail), r: &dr, g: &dg);
    db = 0;
    da = 1;
}
STAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
    from_1616(1616: gather(p: ptr, ix), r: &r, g: &g);
    b = 0;
    a = 1;
}
STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    U32 px = to_unorm(v: r, scale: 65535)
           | to_unorm(v: g, scale: 65535) <<  16;
    store(dst: ptr, v: px, tail);
}
2628
// RGBA 16161616: four 16-bit unorm channels per 64-bit pixel.
STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
    from_16161616(16161616: load<U64>(src: ptr, tail), r: &r,g: &g, b: &b, a: &a);
}
STAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
    from_16161616(16161616: load<U64>(src: ptr, tail), r: &dr, g: &dg, b: &db, a: &da);
}
STAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) {
    const uint64_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
    from_16161616(16161616: gather(p: ptr, ix), r: &r, g: &g, b: &b, a: &a);
}
// Store as four interleaved uint16 planes; note the pointer is addressed in uint16_t
// units, hence the 4*dx, 4*dy scaling.
STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint16_t>(ctx, dx: 4*dx,dy: 4*dy);

    U16 R = pack(v: to_unorm(v: r, scale: 65535)),
        G = pack(v: to_unorm(v: g, scale: 65535)),
        B = pack(v: to_unorm(v: b, scale: 65535)),
        A = pack(v: to_unorm(v: a, scale: 65535));

    store4(ptr,tail, r: R,g: G,b: B,a: A);
}
2652
2653
// RGBA 1010102: 10 bits per color channel, 2 bits alpha.
STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
    from_1010102(rgba: load<U32>(src: ptr, tail), r: &r,g: &g,b: &b,a: &a);
}
STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
    from_1010102(rgba: load<U32>(src: ptr, tail), r: &dr,g: &dg,b: &db,a: &da);
}
// "_xr" variants use Apple's extended-range 1010102 encoding (see store_1010102_xr below
// for the min/range constants).
STAGE(load_1010102_xr, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
    from_1010102_xr(rgba: load<U32>(src: ptr, tail), r: &r,g: &g,b: &b,a: &a);
}
STAGE(load_1010102_xr_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
    from_1010102_xr(rgba: load<U32>(src: ptr, tail), r: &dr,g: &dg,b: &db,a: &da);
}
STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
    from_1010102(rgba: gather(p: ptr, ix), r: &r,g: &g,b: &b,a: &a);
}
STAGE(gather_1010102_xr, const SkRasterPipeline_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
    from_1010102_xr(rgba: gather(p: ptr, ix), r: &r,g: &g,b: &b,a: &a);
}
STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    U32 px = to_unorm(v: r, scale: 1023)
           | to_unorm(v: g, scale: 1023) << 10
           | to_unorm(v: b, scale: 1023) << 20
           | to_unorm(v: a, scale:    3) << 30;
    store(dst: ptr, v: px, tail);
}
// Extended-range store: color channels are mapped from [min, max] onto the 10-bit range
// before packing; alpha keeps the plain 2-bit encoding.
STAGE(store_1010102_xr, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
    static constexpr float min = -0.752941f;
    static constexpr float max = 1.25098f;
    static constexpr float range = max - min;
    U32 px = to_unorm(v: (r - min) / range, scale: 1023)
           | to_unorm(v: (g - min) / range, scale: 1023) << 10
           | to_unorm(v: (b - min) / range, scale: 1023) << 20
           | to_unorm(v: a, scale:    3) << 30;
    store(dst: ptr, v: px, tail);
}
2700
// Load four half-float channels (one uint64 per pixel), widening to F.
STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);

    U16 R,G,B,A;
    load4(ptr: (const uint16_t*)ptr,tail, r: &R,g: &G,b: &B,a: &A);
    r = from_half(h: R);
    g = from_half(h: G);
    b = from_half(h: B);
    a = from_half(h: A);
}
// Same as load_f16, but into the dst registers.
STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);

    U16 R,G,B,A;
    load4(ptr: (const uint16_t*)ptr,tail, r: &R,g: &G,b: &B,a: &A);
    dr = from_half(h: R);
    dg = from_half(h: G);
    db = from_half(h: B);
    da = from_half(h: A);
}
// Gather f16 pixels at the (r,g) coordinates; the gathered lanes are
// de-interleaved from the local copy, so load4 is called with tail==0.
STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) {
    const uint64_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
    auto px = gather(p: ptr, ix);

    U16 R,G,B,A;
    load4(ptr: (const uint16_t*)&px,tail: 0, r: &R,g: &G,b: &B,a: &A);
    r = from_half(h: R);
    g = from_half(h: G);
    b = from_half(h: B);
    a = from_half(h: A);
}
// Narrow r,g,b,a to half floats and store interleaved.
STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy);
    store4(ptr: (uint16_t*)ptr,tail, r: to_half(f: r)
                             , g: to_half(f: g)
                             , b: to_half(f: b)
                             , a: to_half(f: a));
}

// Store four 16-bit unorm channels in big-endian byte order.
STAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint16_t>(ctx, dx: 4*dx,dy);

    U16 R = bswap(x: pack(v: to_unorm(v: r, scale: 65535))),
        G = bswap(x: pack(v: to_unorm(v: g, scale: 65535))),
        B = bswap(x: pack(v: to_unorm(v: b, scale: 65535))),
        A = bswap(x: pack(v: to_unorm(v: a, scale: 65535)));

    store4(ptr,tail, r: R,g: G,b: B,a: A);
}
2751
// Load alpha-only half-float pixels; rgb default to 0.
STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);

    U16 A = load<U16>(src: (const uint16_t*)ptr, tail);
    r = 0;
    g = 0;
    b = 0;
    a = from_half(h: A);
}
// Same as load_af16, but into the dst registers.
STAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);

    U16 A = load<U16>(src: (const uint16_t*)ptr, tail);
    dr = dg = db = 0.0f;
    da = from_half(h: A);
}
// Gather alpha-only f16 pixels at the (r,g) coordinates.
STAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
    r = g = b = 0.0f;
    a = from_half(h: gather(p: ptr, ix));
}
// Store only the alpha channel, as half float.
STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
    store(dst: ptr, v: to_half(f: a), tail);
}

// Load two half-float channels (R,G); blue/alpha get defaults (0/1).
STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);

    U16 R,G;
    load2(ptr: (const uint16_t*)ptr, tail, r: &R, g: &G);
    r = from_half(h: R);
    g = from_half(h: G);
    b = 0;
    a = 1;
}
// Same as load_rgf16, but into the dst registers.
STAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);

    U16 R,G;
    load2(ptr: (const uint16_t*)ptr, tail, r: &R, g: &G);
    dr = from_half(h: R);
    dg = from_half(h: G);
    db = 0;
    da = 1;
}
// Gather RG f16 pixels at the (r,g) coordinates (tail==0: see gather_f16).
STAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
    auto px = gather(p: ptr, ix);

    U16 R,G;
    load2(ptr: (const uint16_t*)&px, tail: 0, r: &R, g: &G);
    r = from_half(h: R);
    g = from_half(h: G);
    b = 0;
    a = 1;
}
// Store r,g as two interleaved half floats.
STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy);
    store2(ptr: (uint16_t*)ptr, tail, r: to_half(f: r)
                             , g: to_half(f: g));
}
2816
// Load four interleaved float channels (hence the 4* strides).
STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const float>(ctx, dx: 4*dx,dy: 4*dy);
    load4(ptr,tail, r: &r,g: &g,b: &b,a: &a);
}
// Same as load_f32, but into the dst registers.
STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const float>(ctx, dx: 4*dx,dy: 4*dy);
    load4(ptr,tail, r: &dr,g: &dg,b: &db,a: &da);
}
// Gather f32 pixels: four separate gathers, one per channel, at stride 4.
STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) {
    const float* ptr;
    U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
    r = gather(p: ptr, ix: 4*ix + 0);
    g = gather(p: ptr, ix: 4*ix + 1);
    b = gather(p: ptr, ix: 4*ix + 2);
    a = gather(p: ptr, ix: 4*ix + 3);
}
// Store r,g,b,a as four interleaved floats.
STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<float>(ctx, dx: 4*dx,dy: 4*dy);
    store4(ptr,tail, r,g,b,a);
}

// Load two interleaved float channels; blue/alpha get defaults (0/1).
STAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<const float>(ctx, dx: 2*dx,dy: 2*dy);
    load2(ptr, tail, r: &r, g: &g);
    b = 0;
    a = 1;
}
// Store r,g as two interleaved floats.
STAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<float>(ctx, dx: 2*dx,dy: 2*dy);
    store2(ptr, tail, r, g);
}
2848
2849SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) {
2850 return v - floor_(v: v*ctx->invScale)*ctx->scale;
2851}
// Mirror-tile `v` into [0, limit): forward on even periods, backward on odd,
// with a one-ULP bias on backward spans so exact integer coordinates snap
// consistently across the infinite logical image.
SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) {
    auto limit = ctx->scale;
    auto invLimit = ctx->invScale;

    // This is "repeat" over the range 0..2*limit
    auto u = v - floor_(v: v*invLimit*0.5f)*2*limit;
    // s will be 0 when moving forward (e.g. [0, limit)) and 1 when moving backward (e.g.
    // [limit, 2*limit)).
    auto s = floor_(v: u*invLimit);
    // This is the mirror result.
    auto m = u - 2*s*(u - limit);
    // Apply a bias to m if moving backwards so that we snap consistently at exact integer coords in
    // the logical infinite image. This is tested by mirror_tile GM. Note that all values
    // that have a non-zero bias applied are > 0.
    auto biasInUlps = trunc_(v: s);
    // Nudging the float's bit pattern by +/-1 moves it by one ULP.
    return sk_bit_cast<F>(src: sk_bit_cast<U32>(src: m) + ctx->mirrorBiasDir*biasInUlps);
}
// Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
// The gather stages will hard clamp the output of these stages to [0,limit)...
// we just need to do the basic repeat or mirroring.
STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(v: r, ctx); }
STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(v: g, ctx); }
STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(v: r, ctx); }
STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(v: g, ctx); }

// Unit-interval variants: clamp/repeat/mirror x into [0,1].
STAGE( clamp_x_1, NoCtx) { r = clamp_01_(v: r); }
STAGE(repeat_x_1, NoCtx) { r = clamp_01_(v: r - floor_(v: r)); }
STAGE(mirror_x_1, NoCtx) { r = clamp_01_(v: abs_( v: (r-1.0f) - two(x: floor_(v: (r-1.0f)*0.5f)) - 1.0f )); }

// Clamp both coordinates to the rectangle given by the context.
STAGE(clamp_x_and_y, const SkRasterPipeline_CoordClampCtx* ctx) {
    r = min(a: ctx->max_x, b: max(a: ctx->min_x, b: r));
    g = min(a: ctx->max_y, b: max(a: ctx->min_y, b: g));
}
2885
// Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain:
//      mask == 0x00000000 if the coordinate(s) are out of bounds
//      mask == 0xFFFFFFFF if the coordinate(s) are in bounds
// After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0
// if either of the coordinates were out of bounds.

STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
    auto w = ctx->limit_x;
    auto e = ctx->inclusiveEdge_x;
    // In-bounds if strictly inside (0,w), or exactly on the inclusive edge.
    auto cond = ((0 < r) & (r < w)) | (r == e);
    sk_unaligned_store(ptr: ctx->mask, val: cond_to_mask(cond));
}
STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
    auto h = ctx->limit_y;
    auto e = ctx->inclusiveEdge_y;
    auto cond = ((0 < g) & (g < h)) | (g == e);
    sk_unaligned_store(ptr: ctx->mask, val: cond_to_mask(cond));
}
STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
    auto w = ctx->limit_x;
    auto h = ctx->limit_y;
    auto ex = ctx->inclusiveEdge_x;
    auto ey = ctx->inclusiveEdge_y;
    // Both axes must be in bounds.
    auto cond = (((0 < r) & (r < w)) | (r == ex))
              & (((0 < g) & (g < h)) | (g == ey));
    sk_unaligned_store(ptr: ctx->mask, val: cond_to_mask(cond));
}
// AND the color registers with the previously-stored decal mask.
STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
    auto mask = sk_unaligned_load<U32>(ptr: ctx->mask);
    r = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: r) & mask);
    g = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: g) & mask);
    b = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: b) & mask);
    a = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: a) & mask);
}
2920
// Broadcast alpha into the color channels as a gray value, then force opaque.
STAGE(alpha_to_gray, NoCtx) {
    r = g = b = a;
    a = 1;
}
STAGE(alpha_to_gray_dst, NoCtx) {
    dr = dg = db = da;
    da = 1;
}
// Move alpha into red only, then force opaque.
STAGE(alpha_to_red, NoCtx) {
    r = a;
    a = 1;
}
STAGE(alpha_to_red_dst, NoCtx) {
    dr = da;
    da = 1;
}

// BT.709 luma coefficients: a = 0.2126 R + 0.7152 G + 0.0722 B.
STAGE(bt709_luminance_or_luma_to_alpha, NoCtx) {
    a = r*0.2126f + g*0.7152f + b*0.0722f;
    r = g = b = 0;
}
// Same weighted sum, broadcast back into r,g,b (alpha untouched).
STAGE(bt709_luminance_or_luma_to_rgb, NoCtx) {
    r = g = b = r*0.2126f + g*0.7152f + b*0.0722f;
}
2945
// Translate the (r,g) coordinate by (m[0], m[1]).
STAGE(matrix_translate, const float* m) {
    r += m[0];
    g += m[1];
}
// Scale then translate: (r,g) -> (r*m[0]+m[2], g*m[1]+m[3]).
STAGE(matrix_scale_translate, const float* m) {
    r = mad(f: r,m: m[0], a: m[2]);
    g = mad(f: g,m: m[1], a: m[3]);
}
// 2x3 affine transform of (r,g); column-major m.
STAGE(matrix_2x3, const float* m) {
    auto R = mad(f: r,m: m[0], a: mad(f: g,m: m[1], a: m[2])),
         G = mad(f: r,m: m[3], a: mad(f: g,m: m[4], a: m[5]));
    r = R;
    g = G;
}
// 3x3 transform of (r,g,b); column-major m.
STAGE(matrix_3x3, const float* m) {
    auto R = mad(f: r,m: m[0], a: mad(f: g,m: m[3], a: b*m[6])),
         G = mad(f: r,m: m[1], a: mad(f: g,m: m[4], a: b*m[7])),
         B = mad(f: r,m: m[2], a: mad(f: g,m: m[5], a: b*m[8]));
    r = R;
    g = G;
    b = B;
}
// 3x4 transform (3x3 plus a translation column); column-major m.
STAGE(matrix_3x4, const float* m) {
    auto R = mad(f: r,m: m[0], a: mad(f: g,m: m[3], a: mad(f: b,m: m[6], a: m[ 9]))),
         G = mad(f: r,m: m[1], a: mad(f: g,m: m[4], a: mad(f: b,m: m[7], a: m[10]))),
         B = mad(f: r,m: m[2], a: mad(f: g,m: m[5], a: mad(f: b,m: m[8], a: m[11])));
    r = R;
    g = G;
    b = B;
}
// 4x5 color matrix applied to (r,g,b,a); row-major with a constant column.
STAGE(matrix_4x5, const float* m) {
    auto R = mad(f: r,m: m[ 0], a: mad(f: g,m: m[ 1], a: mad(f: b,m: m[ 2], a: mad(f: a,m: m[ 3], a: m[ 4])))),
         G = mad(f: r,m: m[ 5], a: mad(f: g,m: m[ 6], a: mad(f: b,m: m[ 7], a: mad(f: a,m: m[ 8], a: m[ 9])))),
         B = mad(f: r,m: m[10], a: mad(f: g,m: m[11], a: mad(f: b,m: m[12], a: mad(f: a,m: m[13], a: m[14])))),
         A = mad(f: r,m: m[15], a: mad(f: g,m: m[16], a: mad(f: b,m: m[17], a: mad(f: a,m: m[18], a: m[19]))));
    r = R;
    g = G;
    b = B;
    a = A;
}
// Expand the 2D coordinate (r,g) into four outputs via a 4x3 matrix.
STAGE(matrix_4x3, const float* m) {
    auto X = r,
         Y = g;

    r = mad(f: X, m: m[0], a: mad(f: Y, m: m[4], a: m[ 8]));
    g = mad(f: X, m: m[1], a: mad(f: Y, m: m[5], a: m[ 9]));
    b = mad(f: X, m: m[2], a: mad(f: Y, m: m[6], a: m[10]));
    a = mad(f: X, m: m[3], a: mad(f: Y, m: m[7], a: m[11]));
}
// Perspective transform of (r,g), dividing through by the homogeneous Z.
STAGE(matrix_perspective, const float* m) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto R = mad(f: r,m: m[0], a: mad(f: g,m: m[1], a: m[2])),
         G = mad(f: r,m: m[3], a: mad(f: g,m: m[4], a: m[5])),
         Z = mad(f: r,m: m[6], a: mad(f: g,m: m[7], a: m[8]));
    r = R * rcp_precise(v: Z);
    g = G * rcp_precise(v: Z);
}
3003
// Look up each lane's gradient interval `idx` and evaluate the interval's
// linear color function: channel = t * f[idx] + b[idx].
SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
                        F* r, F* g, F* b, F* a) {
    F fr, br, fg, bg, fb, bb, fa, ba;
#if defined(JUMPER_IS_HSW)
    // With <= 8 stops, each factor table fits one AVX2 register, so a
    // single in-register permute replaces eight memory gathers.
    if (c->stopCount <=8) {
        fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx);
        br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx);
        fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx);
        bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx);
        fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx);
        bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx);
        fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx);
        ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx);
    } else
#endif
    {
        // Generic path: gather the per-lane factor/bias for each channel.
        fr = gather(p: c->fs[0], ix: idx);
        br = gather(p: c->bs[0], ix: idx);
        fg = gather(p: c->fs[1], ix: idx);
        bg = gather(p: c->bs[1], ix: idx);
        fb = gather(p: c->fs[2], ix: idx);
        bb = gather(p: c->bs[2], ix: idx);
        fa = gather(p: c->fs[3], ix: idx);
        ba = gather(p: c->bs[3], ix: idx);
    }

    *r = mad(f: t, m: fr, a: br);
    *g = mad(f: t, m: fg, a: bg);
    *b = mad(f: t, m: fb, a: bb);
    *a = mad(f: t, m: fa, a: ba);
}
3035
// Gradient with evenly spaced stops: the interval index is just
// trunc(t * (stopCount-1)), no search required.
STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
    auto t = r;
    auto idx = trunc_(v: t * (c->stopCount-1));
    gradient_lookup(c, idx, t, r: &r, g: &g, b: &b, a: &a);
}

// General gradient: count how many stop positions t has passed to find
// its interval (a linear scan over the stop array).
STAGE(gradient, const SkRasterPipeline_GradientCtx* c) {
    auto t = r;
    U32 idx = 0;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    for (size_t i = 1; i < c->stopCount; i++) {
        idx += if_then_else(c: t >= c->ts[i], t: U32(1), e: U32(0));
    }

    gradient_lookup(c, idx, t, r: &r, g: &g, b: &b, a: &a);
}

// Two-stop gradient: a single linear blend per channel, no lookup needed.
STAGE(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
    auto t = r;
    r = mad(f: t, m: c->f[0], a: c->b[0]);
    g = mad(f: t, m: c->f[1], a: c->b[1]);
    b = mad(f: t, m: c->f[2], a: c->b[2]);
    a = mad(f: t, m: c->f[3], a: c->b[3]);
}
3061
// Convert the (r,g) vector into an angle in [0,1) turns, stored in r.
// Computes atan in one octant via polynomial, then unfolds by symmetry.
STAGE(xy_to_unit_angle, NoCtx) {
    F X = r,
      Y = g;
    F xabs = abs_(v: X),
      yabs = abs_(v: Y);

    // Slope is confined to [0,1] by dividing the smaller magnitude by the larger.
    F slope = min(a: xabs, b: yabs)/max(a: xabs, b: yabs);
    F s = slope * slope;

    // Use a 7th degree polynomial to approximate atan.
    // This was generated using sollya.gforge.inria.fr.
    // A float optimized polynomial was generated using the following command.
    // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
    F phi = slope
             * (0.15912117063999176025390625f     + s
             * (-5.185396969318389892578125e-2f   + s
             * (2.476101927459239959716796875e-2f + s
             * (-7.0547382347285747528076171875e-3f))));

    // Unfold the octant result into the full circle, quadrant by quadrant.
    phi = if_then_else(c: xabs < yabs, t: 1.0f/4.0f - phi, e: phi);
    phi = if_then_else(c: X < 0.0f    , t: 1.0f/2.0f - phi, e: phi);
    phi = if_then_else(c: Y < 0.0f    , t: 1.0f - phi     , e: phi);
    phi = if_then_else(c: phi != phi , t: 0              , e: phi);  // Check for NaN.
    r = phi;
}

// Replace r with the Euclidean length of the (r,g) vector.
STAGE(xy_to_radius, NoCtx) {
    F X2 = r * r,
      Y2 = g * g;
    r = sqrt_(v: X2 + Y2);
}
3093
// Please see https://skia.org/dev/design/conical for how our 2pt conical shader works.

// Flip the x coordinate (used to canonicalize conical geometry).
STAGE(negate_x, NoCtx) { r = -r; }

// "Strip" case: circles of equal radius; t = x + sqrt(r0^2 - y^2).
STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) {
    F x = r, y = g, &t = r;
    t = x + sqrt_(v: ctx->fP0 - y*y); // ctx->fP0 = r0 * r0
}

// Focal point lies on the end circle; t = (x^2 + y^2) / x.
STAGE(xy_to_2pt_conical_focal_on_circle, NoCtx) {
    F x = r, y = g, &t = r;
    t = x + y*y / x; // (x^2 + y^2) / x
}

// Well-behaved case (focal inside): t = |p|/r1 - x/r1, always defined.
STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) {
    F x = r, y = g, &t = r;
    t = sqrt_(v: x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
}

// Focal outside, larger root: t = sqrt(x^2 - y^2)/r1 - x/r1.
STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) {
    F x = r, y = g, &t = r;
    t = sqrt_(v: x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
}

// Focal outside, smaller root: negated square root.
STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) {
    F x = r, y = g, &t = r;
    t = -sqrt_(v: x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
}

// Shift t by the focal parameter f.
STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) {
    F& t = r;
    t = t + ctx->fP1; // ctx->fP1 = f
}

// Undo a start/end swap by reflecting t about 1/2: t -> 1 - t.
STAGE(alter_2pt_conical_unswap, NoCtx) {
    F& t = r;
    t = 1 - t;
}

// Record a mask of lanes where t came out NaN, and zero those lanes' t.
STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) {
    F& t = r;
    auto is_degenerate = (t != t); // NaN
    t = if_then_else(c: is_degenerate, t: F(0), e: t);
    sk_unaligned_store(ptr: &c->fMask, val: cond_to_mask(cond: !is_degenerate));
}

// As above, but also treat t <= 0 as degenerate.
STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) {
    F& t = r;
    auto is_degenerate = (t <= 0) | (t != t);
    t = if_then_else(c: is_degenerate, t: F(0), e: t);
    sk_unaligned_store(ptr: &c->fMask, val: cond_to_mask(cond: !is_degenerate));
}
3146
// AND all four color registers with a caller-provided lane mask
// (e.g. the mask saved by the 2pt-conical degenerate checks above).
STAGE(apply_vector_mask, const uint32_t* ctx) {
    const U32 mask = sk_unaligned_load<U32>(ptr: ctx);
    r = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: r) & mask);
    g = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: g) & mask);
    b = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: b) & mask);
    a = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: a) & mask);
}
3154
// Stash the sample coordinates and their fractional offsets into the sampler
// context, for the bilinear/bicubic offset stages to read back later.
SI void save_xy(F* r, F* g, SkRasterPipeline_SamplerCtx* c) {
    // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
    // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
    // surrounding (x,y) at (0.5,0.5) off-center.
    F fx = fract(v: *r + 0.5f),
      fy = fract(v: *g + 0.5f);

    // Samplers will need to load x and fx, or y and fy.
    sk_unaligned_store(ptr: c->x,  val: *r);
    sk_unaligned_store(ptr: c->y,  val: *g);
    sk_unaligned_store(ptr: c->fx, val: fx);
    sk_unaligned_store(ptr: c->fy, val: fy);
}

// Accumulate one filtered tap: dst += (scalex*scaley) * src.
STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) {
    // Bilinear and bicubic filters are both separable, so we produce independent contributions
    // from x and y, multiplying them together here to get each pixel's total scale factor.
    auto scale = sk_unaligned_load<F>(ptr: c->scalex)
               * sk_unaligned_load<F>(ptr: c->scaley);
    dr = mad(f: scale, m: r, a: dr);
    dg = mad(f: scale, m: g, a: dg);
    db = mad(f: scale, m: b, a: db);
    da = mad(f: scale, m: a, a: da);
}
3179
3180// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
3181// are combined in direct proportion to their area overlapping that logical query pixel.
3182// At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
3183// The y-axis is symmetric.
3184
3185template <int kScale>
3186SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
3187 *x = sk_unaligned_load<F>(ptr: ctx->x) + (kScale * 0.5f);
3188 F fx = sk_unaligned_load<F>(ptr: ctx->fx);
3189
3190 F scalex;
3191 if (kScale == -1) { scalex = 1.0f - fx; }
3192 if (kScale == +1) { scalex = fx; }
3193 sk_unaligned_store(ptr: ctx->scalex, val: scalex);
3194}
3195template <int kScale>
3196SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
3197 *y = sk_unaligned_load<F>(ptr: ctx->y) + (kScale * 0.5f);
3198 F fy = sk_unaligned_load<F>(ptr: ctx->fy);
3199
3200 F scaley;
3201 if (kScale == -1) { scaley = 1.0f - fy; }
3202 if (kScale == +1) { scaley = fy; }
3203 sk_unaligned_store(ptr: ctx->scaley, val: scaley);
3204}
3205
// Record the sample point and zero the accumulators before the four taps run.
STAGE(bilinear_setup, SkRasterPipeline_SamplerCtx* ctx) {
    save_xy(r: &r, g: &g, c: ctx);
    // Init for accumulate
    dr = dg = db = da = 0;
}

// One stage per tap offset: move the coordinate and set the axis weight.
STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, x: &r); }
STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, x: &r); }
STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, y: &g); }
STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, y: &g); }
3216
3217
// In bicubic interpolation, the 16 pixels and +/- 0.5 and +/- 1.5 offsets from the sample
// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
//
// This helper computes the total weight along one axis (our bicubic filter is separable), given one
// column of the sampling matrix, and a fractional pixel offset. See SkCubicResampler for details.

// Evaluate the cubic A + B*t + C*t^2 + D*t^3 via nested mad (Horner form).
SI F bicubic_wts(F t, float A, float B, float C, float D) {
    return mad(f: t, m: mad(f: t, m: mad(f: t, m: D, a: C), a: B), a: A);
}
3227
3228template <int kScale>
3229SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
3230 *x = sk_unaligned_load<F>(ptr: ctx->x) + (kScale * 0.5f);
3231
3232 F scalex;
3233 if (kScale == -3) { scalex = sk_unaligned_load<F>(ptr: ctx->wx[0]); }
3234 if (kScale == -1) { scalex = sk_unaligned_load<F>(ptr: ctx->wx[1]); }
3235 if (kScale == +1) { scalex = sk_unaligned_load<F>(ptr: ctx->wx[2]); }
3236 if (kScale == +3) { scalex = sk_unaligned_load<F>(ptr: ctx->wx[3]); }
3237 sk_unaligned_store(ptr: ctx->scalex, val: scalex);
3238}
3239template <int kScale>
3240SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
3241 *y = sk_unaligned_load<F>(ptr: ctx->y) + (kScale * 0.5f);
3242
3243 F scaley;
3244 if (kScale == -3) { scaley = sk_unaligned_load<F>(ptr: ctx->wy[0]); }
3245 if (kScale == -1) { scaley = sk_unaligned_load<F>(ptr: ctx->wy[1]); }
3246 if (kScale == +1) { scaley = sk_unaligned_load<F>(ptr: ctx->wy[2]); }
3247 if (kScale == +3) { scaley = sk_unaligned_load<F>(ptr: ctx->wy[3]); }
3248 sk_unaligned_store(ptr: ctx->scaley, val: scaley);
3249}
3250
// Record the sample point, precompute the 4 x-axis and 4 y-axis cubic weights
// from the 4x4 coefficient matrix `weights`, and zero the accumulators.
STAGE(bicubic_setup, SkRasterPipeline_SamplerCtx* ctx) {
    save_xy(r: &r, g: &g, c: ctx);

    const float* w = ctx->weights;

    // Each wx[i] is the cubic in fx with coefficients taken from column i.
    F fx = sk_unaligned_load<F>(ptr: ctx->fx);
    sk_unaligned_store(ptr: ctx->wx[0], val: bicubic_wts(t: fx, A: w[0], B: w[4], C: w[ 8], D: w[12]));
    sk_unaligned_store(ptr: ctx->wx[1], val: bicubic_wts(t: fx, A: w[1], B: w[5], C: w[ 9], D: w[13]));
    sk_unaligned_store(ptr: ctx->wx[2], val: bicubic_wts(t: fx, A: w[2], B: w[6], C: w[10], D: w[14]));
    sk_unaligned_store(ptr: ctx->wx[3], val: bicubic_wts(t: fx, A: w[3], B: w[7], C: w[11], D: w[15]));

    // Same columns, evaluated at fy for the y axis.
    F fy = sk_unaligned_load<F>(ptr: ctx->fy);
    sk_unaligned_store(ptr: ctx->wy[0], val: bicubic_wts(t: fy, A: w[0], B: w[4], C: w[ 8], D: w[12]));
    sk_unaligned_store(ptr: ctx->wy[1], val: bicubic_wts(t: fy, A: w[1], B: w[5], C: w[ 9], D: w[13]));
    sk_unaligned_store(ptr: ctx->wy[2], val: bicubic_wts(t: fy, A: w[2], B: w[6], C: w[10], D: w[14]));
    sk_unaligned_store(ptr: ctx->wy[3], val: bicubic_wts(t: fy, A: w[3], B: w[7], C: w[11], D: w[15]));

    // Init for accumulate
    dr = dg = db = da = 0;
}

// One stage per tap offset: move the coordinate and select the axis weight.
STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, x: &r); }
STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, x: &r); }
STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, x: &r); }
STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, x: &r); }

STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, y: &g); }
STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, y: &g); }
STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, y: &g); }
STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, y: &g); }
3281
// Trilinear (mipmap-linear) sampling runs the pipeline twice, once per level,
// then lerps.  init saves the original coordinates before the first pass.
STAGE(mipmap_linear_init, SkRasterPipeline_MipmapCtx* ctx) {
    sk_unaligned_store(ptr: ctx->x, val: r);
    sk_unaligned_store(ptr: ctx->y, val: g);
}

// Between passes: stash the first level's color, then rescale the saved
// coordinates for the second (lower) level.
STAGE(mipmap_linear_update, SkRasterPipeline_MipmapCtx* ctx) {
    sk_unaligned_store(ptr: ctx->r, val: r);
    sk_unaligned_store(ptr: ctx->g, val: g);
    sk_unaligned_store(ptr: ctx->b, val: b);
    sk_unaligned_store(ptr: ctx->a, val: a);

    r = sk_unaligned_load<F>(ptr: ctx->x) * ctx->scaleX;
    g = sk_unaligned_load<F>(ptr: ctx->y) * ctx->scaleY;
}

// After the second pass: blend the two levels' colors by lowerWeight.
STAGE(mipmap_linear_finish, SkRasterPipeline_MipmapCtx* ctx) {
    r = lerp(from: sk_unaligned_load<F>(ptr: ctx->r), to: r, t: ctx->lowerWeight);
    g = lerp(from: sk_unaligned_load<F>(ptr: ctx->g), to: g, t: ctx->lowerWeight);
    b = lerp(from: sk_unaligned_load<F>(ptr: ctx->b), to: b, t: ctx->lowerWeight);
    a = lerp(from: sk_unaligned_load<F>(ptr: ctx->a), to: a, t: ctx->lowerWeight);
}

// Hand the current color off to an arbitrary callback, then reload whatever
// the callback wrote to read_from.  (tail==0 means a full batch of N lanes.)
STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
    store4(ptr: c->rgba,tail: 0, r,g,b,a);
    c->fn(c, tail ? tail : N);
    load4(ptr: c->read_from,tail: 0, r: &r,g: &g,b: &b,a: &a);
}
3309
// Record the SkSL slot-storage base pointer used by later STAGE_TAIL ops.
STAGE_TAIL(set_base_pointer, std::byte* p) {
    base = p;
}
3313
3314// All control flow stages used by SkSL maintain some state in the common registers:
3315// r: condition mask
3316// g: loop mask
3317// b: return mask
3318// a: execution mask (intersection of all three masks)
3319// After updating r/g/b, you must invoke update_execution_mask().
3320#define execution_mask() sk_bit_cast<I32>(a)
3321#define update_execution_mask() a = sk_bit_cast<F>(sk_bit_cast<I32>(r) & \
3322 sk_bit_cast<I32>(g) & \
3323 sk_bit_cast<I32>(b))
3324
// Initialize all four masks (condition/loop/return/execution) to "active" for
// every lane that's actually covered by this run (all lanes unless tail != 0).
STAGE_TAIL(init_lane_masks, NoCtx) {
    uint32_t iota[] = {0,1,2,3,4,5,6,7};
    I32 mask = tail ? cond_to_mask(cond: sk_unaligned_load<U32>(ptr: iota) < tail) : I32(~0);
    r = g = b = a = sk_bit_cast<F>(src: mask);
}

// Write the device-space sk_FragCoord (x, y, 0, 1) to dst[0..3].
STAGE_TAIL(store_device_xy01, F* dst) {
    // This is very similar to `seed_shader + store_src`, but b/a are backwards.
    // (sk_FragCoord actually puts w=1 in the w slot.)
    static constexpr float iota[] = {
        0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
        8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
    };
    dst[0] = cast(v: dx) + sk_unaligned_load<F>(ptr: iota);  // pixel-center x per lane
    dst[1] = cast(v: dy) + 0.5f;                           // pixel-center y
    dst[2] = 0.0f;
    dst[3] = 1.0f;
}
3343
3344STAGE_TAIL(exchange_src, F* rgba) {
3345 // Swaps r,g,b,a registers with the values at `rgba`.
3346 F temp[4] = {r, g, b, a};
3347 r = rgba[0];
3348 rgba[0] = temp[0];
3349 g = rgba[1];
3350 rgba[1] = temp[1];
3351 b = rgba[2];
3352 rgba[2] = temp[2];
3353 a = rgba[3];
3354 rgba[3] = temp[3];
3355}
3356
// Restore the condition mask (register r) from memory.
STAGE_TAIL(load_condition_mask, F* ctx) {
    r = sk_unaligned_load<F>(ptr: ctx);
    update_execution_mask();
}

// Save the condition mask (register r) to memory.
STAGE_TAIL(store_condition_mask, F* ctx) {
    sk_unaligned_store(ptr: ctx, val: r);
}

STAGE_TAIL(merge_condition_mask, I32* ptr) {
    // Set the condition-mask to the intersection of two adjacent masks at the pointer.
    r = sk_bit_cast<F>(src: ptr[0] & ptr[1]);
    update_execution_mask();
}

STAGE_TAIL(merge_inv_condition_mask, I32* ptr) {
    // Set the condition-mask to the intersection of the first mask and the inverse of the second.
    r = sk_bit_cast<F>(src: ptr[0] & ~ptr[1]);
    update_execution_mask();
}

// Restore the loop mask (register g) from memory.
STAGE_TAIL(load_loop_mask, F* ctx) {
    g = sk_unaligned_load<F>(ptr: ctx);
    update_execution_mask();
}

// Save the loop mask (register g) to memory.
STAGE_TAIL(store_loop_mask, F* ctx) {
    sk_unaligned_store(ptr: ctx, val: g);
}

STAGE_TAIL(mask_off_loop_mask, NoCtx) {
    // We encountered a break statement. If a lane was active, it should be masked off now, and stay
    // masked-off until the termination of the loop.
    g = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: g) & ~execution_mask());
    update_execution_mask();
}

STAGE_TAIL(reenable_loop_mask, I32* ptr) {
    // Set the loop-mask to the union of the current loop-mask with the mask at the pointer.
    g = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: g) | ptr[0]);
    update_execution_mask();
}

STAGE_TAIL(merge_loop_mask, I32* ptr) {
    // Set the loop-mask to the intersection of the current loop-mask with the mask at the pointer.
    // (Note: this behavior subtly differs from merge_condition_mask!)
    g = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: g) & ptr[0]);
    update_execution_mask();
}

STAGE_TAIL(continue_op, I32* continueMask) {
    // Set any currently-executing lanes in the continue-mask to true.
    *continueMask |= execution_mask();

    // Disable any currently-executing lanes from the loop mask. (Just like `mask_off_loop_mask`.)
    g = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: g) & ~execution_mask());
    update_execution_mask();
}
3415
// Implements one `case N:` of an SkSL switch.  The slot at ctx.offset holds
// each lane's switch value; the following slot holds the default-case mask.
STAGE_TAIL(case_op, SkRasterPipeline_CaseOpCtx* packed) {
    auto ctx = SkRPCtxUtils::Unpack(ctx: packed);

    // Check each lane to see if the case value matches the expectation.
    I32* actualValue = (I32*)(base + ctx.offset);
    I32 caseMatches = cond_to_mask(cond: *actualValue == ctx.expectedValue);

    // In lanes where we found a match, enable the loop mask...
    g = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: g) | caseMatches);
    update_execution_mask();

    // ... and clear the default-case mask.
    I32* defaultMask = actualValue + 1;
    *defaultMask &= ~caseMatches;
}
3431
// Restore the return mask (register b) from memory.
STAGE_TAIL(load_return_mask, F* ctx) {
    b = sk_unaligned_load<F>(ptr: ctx);
    update_execution_mask();
}

// Save the return mask (register b) to memory.
STAGE_TAIL(store_return_mask, F* ctx) {
    sk_unaligned_store(ptr: ctx, val: b);
}

STAGE_TAIL(mask_off_return_mask, NoCtx) {
    // We encountered a return statement. If a lane was active, it should be masked off now, and
    // stay masked-off until the end of the function.
    b = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: b) & ~execution_mask());
    update_execution_mask();
}

// Branch taken (returns ctx->offset) only when every covered lane is active.
// Tail lanes beyond the covered count are treated as active so they can't
// block the branch.
STAGE_BRANCH(branch_if_all_lanes_active, SkRasterPipeline_BranchCtx* ctx) {
    if (tail) {
        uint32_t iota[] = {0,1,2,3,4,5,6,7};
        I32 tailLanes = cond_to_mask(cond: tail <= sk_unaligned_load<U32>(ptr: iota));
        return all(execution_mask() | tailLanes) ? ctx->offset : 1;
    } else {
        return all(execution_mask()) ? ctx->offset : 1;
    }
}

// Branch taken when at least one lane is active.
STAGE_BRANCH(branch_if_any_lanes_active, SkRasterPipeline_BranchCtx* ctx) {
    return any(execution_mask()) ? ctx->offset : 1;
}

// Branch taken when no lane is active.
STAGE_BRANCH(branch_if_no_lanes_active, SkRasterPipeline_BranchCtx* ctx) {
    return any(execution_mask()) ? 1 : ctx->offset;
}

// Unconditional branch.
STAGE_BRANCH(jump, SkRasterPipeline_BranchCtx* ctx) {
    return ctx->offset;
}

STAGE_BRANCH(branch_if_no_active_lanes_eq, SkRasterPipeline_BranchIfEqualCtx* ctx) {
    // Compare each lane against the expected value...
    I32 match = cond_to_mask(cond: *(I32*)ctx->ptr == ctx->value);
    // ... but mask off lanes that aren't executing.
    match &= execution_mask();
    // If any lanes matched, don't take the branch.
    return any(c: match) ? 1 : ctx->offset;
}
3478
STAGE_TAIL(trace_line, SkRasterPipeline_TraceLineCtx* ctx) {
    // Report a line-number event to the debug trace hook, but only when at least one lane is
    // both executing and trace-enabled.
    I32* traceMask = (I32*)ctx->traceMask;
    if (any(execution_mask() & *traceMask)) {
        ctx->traceHook->line(lineNum: ctx->lineNumber);
    }
}

STAGE_TAIL(trace_enter, SkRasterPipeline_TraceFuncCtx* ctx) {
    // Report entry into function `funcIdx` to the trace hook (same gating as trace_line).
    I32* traceMask = (I32*)ctx->traceMask;
    if (any(execution_mask() & *traceMask)) {
        ctx->traceHook->enter(fnIdx: ctx->funcIdx);
    }
}

STAGE_TAIL(trace_exit, SkRasterPipeline_TraceFuncCtx* ctx) {
    // Report exit from function `funcIdx` to the trace hook (same gating as trace_line).
    I32* traceMask = (I32*)ctx->traceMask;
    if (any(execution_mask() & *traceMask)) {
        ctx->traceHook->exit(fnIdx: ctx->funcIdx);
    }
}
3499
STAGE_TAIL(trace_scope, SkRasterPipeline_TraceScopeCtx* ctx) {
    // Report a scope push/pop (ctx->delta of +1/-1, presumably -- confirm against the Builder)
    // to the trace hook.
    // Note that trace_scope intentionally does not incorporate the execution mask. Otherwise, the
    // scopes would become unbalanced if the execution mask changed in the middle of a block. The
    // caller is responsible for providing a combined trace- and execution-mask.
    I32* traceMask = (I32*)ctx->traceMask;
    if (any(c: *traceMask)) {
        ctx->traceHook->scope(delta: ctx->delta);
    }
}
3509
STAGE_TAIL(trace_var, SkRasterPipeline_TraceVarCtx* ctx) {
    // Report a variable-modification event to the trace hook. Only the first lane that is both
    // executing and trace-enabled is reported; the trace records one representative lane.
    I32* traceMask = (I32*)ctx->traceMask;
    I32 mask = execution_mask() & *traceMask;
    if (any(c: mask)) {
        for (size_t lane = 0; lane < N; ++lane) {
            if (select_lane(data: mask, lane)) {
                I32* data = (I32*)ctx->data;
                int slotIdx = ctx->slotIdx, numSlots = ctx->numSlots;
                if (ctx->indirectOffset) {
                    // If this was an indirect store, apply the indirect-offset to the data pointer.
                    uint32_t indirectOffset = select_lane(data: *(U32*)ctx->indirectOffset, lane);
                    // Clamp to indirectLimit so an out-of-range offset can't read past the slots.
                    indirectOffset = std::min<uint32_t>(a: indirectOffset, b: ctx->indirectLimit);
                    data += indirectOffset;
                    slotIdx += indirectOffset;
                }
                // Emit one var() callback per slot, reading this lane's value from each slot.
                while (numSlots--) {
                    ctx->traceHook->var(slot: slotIdx, val: select_lane(data: *data, lane));
                    ++slotIdx;
                    ++data;
                }
                // Only the first qualifying lane is traced.
                break;
            }
        }
    }
}
3535
3536STAGE_TAIL(copy_uniform, SkRasterPipeline_UniformCtx* ctx) {
3537 const float* src = ctx->src;
3538 F* dst = (F*)ctx->dst;
3539 dst[0] = src[0];
3540}
3541STAGE_TAIL(copy_2_uniforms, SkRasterPipeline_UniformCtx* ctx) {
3542 const float* src = ctx->src;
3543 F* dst = (F*)ctx->dst;
3544 dst[0] = src[0];
3545 dst[1] = src[1];
3546}
3547STAGE_TAIL(copy_3_uniforms, SkRasterPipeline_UniformCtx* ctx) {
3548 const float* src = ctx->src;
3549 F* dst = (F*)ctx->dst;
3550 dst[0] = src[0];
3551 dst[1] = src[1];
3552 dst[2] = src[2];
3553}
3554STAGE_TAIL(copy_4_uniforms, SkRasterPipeline_UniformCtx* ctx) {
3555 const float* src = ctx->src;
3556 F* dst = (F*)ctx->dst;
3557 dst[0] = src[0];
3558 dst[1] = src[1];
3559 dst[2] = src[2];
3560 dst[3] = src[3];
3561}
3562
3563STAGE_TAIL(copy_constant, SkRasterPipeline_ConstantCtx* packed) {
3564 auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3565 F* dst = (F*)(base + ctx.dst);
3566 F value = ctx.value;
3567 dst[0] = value;
3568}
3569STAGE_TAIL(splat_2_constants, SkRasterPipeline_ConstantCtx* packed) {
3570 auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3571 F* dst = (F*)(base + ctx.dst);
3572 F value = ctx.value;
3573 dst[0] = dst[1] = value;
3574}
3575STAGE_TAIL(splat_3_constants, SkRasterPipeline_ConstantCtx* packed) {
3576 auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3577 F* dst = (F*)(base + ctx.dst);
3578 F value = ctx.value;
3579 dst[0] = dst[1] = dst[2] = value;
3580}
3581STAGE_TAIL(splat_4_constants, SkRasterPipeline_ConstantCtx* packed) {
3582 auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3583 F* dst = (F*)(base + ctx.dst);
3584 F value = ctx.value;
3585 dst[0] = dst[1] = dst[2] = dst[3] = value;
3586}
3587
// Copies NumSlots adjacent value slots from ctx.src to ctx.dst (both are byte offsets off of
// `base`), ignoring the execution mask entirely.
template <int NumSlots>
SI void copy_n_slots_unmasked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
    F* dst = (F*)(base + ctx.dst);
    F* src = (F*)(base + ctx.src);
    // We don't even bother masking off the tail; we're filling slots, not the destination surface.
    memcpy(dest: dst, src: src, n: sizeof(F) * NumSlots);
}

STAGE_TAIL(copy_slot_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_slots_unmasked_fn<1>(packed, base);
}
STAGE_TAIL(copy_2_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_slots_unmasked_fn<2>(packed, base);
}
STAGE_TAIL(copy_3_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_slots_unmasked_fn<3>(packed, base);
}
STAGE_TAIL(copy_4_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_slots_unmasked_fn<4>(packed, base);
}
3609
// Reads NumSlots scalar (one-float-per-slot) values and broadcasts each one across all N lanes
// of the corresponding destination slot. Immutable data is stored unvectorized, so this is the
// scalar-to-vector analogue of copy_n_slots_unmasked_fn.
template <int NumSlots>
SI void copy_n_immutable_unmasked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(ctx: packed);

    // Load the scalar values.
    float* src = (float*)(base + ctx.src);
    float values[NumSlots];
    SK_UNROLL for (int index = 0; index < NumSlots; ++index) {
        values[index] = src[index];
    }
    // Broadcast the scalars into the destination.
    F* dst = (F*)(base + ctx.dst);
    SK_UNROLL for (int index = 0; index < NumSlots; ++index) {
        dst[index] = values[index];
    }
}

STAGE_TAIL(copy_immutable_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_immutable_unmasked_fn<1>(packed, base);
}
STAGE_TAIL(copy_2_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_immutable_unmasked_fn<2>(packed, base);
}
STAGE_TAIL(copy_3_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_immutable_unmasked_fn<3>(packed, base);
}
STAGE_TAIL(copy_4_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_immutable_unmasked_fn<4>(packed, base);
}
3639
// Copies NumSlots adjacent value slots from ctx.src to ctx.dst, but only in lanes where `mask`
// is set; masked-off lanes keep their existing destination values.
template <int NumSlots>
SI void copy_n_slots_masked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base, I32 mask) {
    auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
    F* dst = (F*)(base + ctx.dst);
    F* src = (F*)(base + ctx.src);
    SK_UNROLL for (int count = 0; count < NumSlots; ++count) {
        *dst = if_then_else(c: mask, t: *src, e: *dst);
        dst += 1;
        src += 1;
    }
}

STAGE_TAIL(copy_slot_masked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_slots_masked_fn<1>(packed, base, execution_mask());
}
STAGE_TAIL(copy_2_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_slots_masked_fn<2>(packed, base, execution_mask());
}
STAGE_TAIL(copy_3_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_slots_masked_fn<3>(packed, base, execution_mask());
}
STAGE_TAIL(copy_4_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
    copy_n_slots_masked_fn<4>(packed, base, execution_mask());
}
3664
// Rearranges value slots in place: first gathers LoopCount slots from `ptr + offsets[i]` into
// scratch registers, then writes the first `numSlots` of them back contiguously starting at
// `ptr`. Reading everything before writing anything makes overlapping shuffles safe.
template <int LoopCount, typename OffsetType>
SI void shuffle_fn(std::byte* ptr, OffsetType* offsets, int numSlots) {
    F scratch[16];
    SK_UNROLL for (int count = 0; count < LoopCount; ++count) {
        scratch[count] = *(F*)(ptr + offsets[count]);
    }
    // Surprisingly, this switch generates significantly better code than a memcpy (on x86-64) when
    // the number of slots is unknown at compile time, and generates roughly identical code when the
    // number of slots is hardcoded. Using a switch allows `scratch` to live in ymm0-ymm15 instead
    // of being written out to the stack and then read back in. Also, the intrinsic memcpy assumes
    // that `numSlots` could be arbitrarily large, and so it emits more code than we need.
    F* dst = (F*)ptr;
    switch (numSlots) {
        case 16: dst[15] = scratch[15]; [[fallthrough]];
        case 15: dst[14] = scratch[14]; [[fallthrough]];
        case 14: dst[13] = scratch[13]; [[fallthrough]];
        case 13: dst[12] = scratch[12]; [[fallthrough]];
        case 12: dst[11] = scratch[11]; [[fallthrough]];
        case 11: dst[10] = scratch[10]; [[fallthrough]];
        case 10: dst[ 9] = scratch[ 9]; [[fallthrough]];
        case  9: dst[ 8] = scratch[ 8]; [[fallthrough]];
        case  8: dst[ 7] = scratch[ 7]; [[fallthrough]];
        case  7: dst[ 6] = scratch[ 6]; [[fallthrough]];
        case  6: dst[ 5] = scratch[ 5]; [[fallthrough]];
        case  5: dst[ 4] = scratch[ 4]; [[fallthrough]];
        case  4: dst[ 3] = scratch[ 3]; [[fallthrough]];
        case  3: dst[ 2] = scratch[ 2]; [[fallthrough]];
        case  2: dst[ 1] = scratch[ 1]; [[fallthrough]];
        case  1: dst[ 0] = scratch[ 0];
    }
}
3696
// Applies an N-component swizzle (e.g. `.yzwx`) in place at base + ctx.dst, using the byte
// offsets packed into the context.
template <int N>
SI void small_swizzle_fn(SkRasterPipeline_SwizzleCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
    shuffle_fn<N>(base + ctx.dst, ctx.offsets, N);
}

STAGE_TAIL(swizzle_1, SkRasterPipeline_SwizzleCtx* packed) {
    small_swizzle_fn<1>(packed, base);
}
STAGE_TAIL(swizzle_2, SkRasterPipeline_SwizzleCtx* packed) {
    small_swizzle_fn<2>(packed, base);
}
STAGE_TAIL(swizzle_3, SkRasterPipeline_SwizzleCtx* packed) {
    small_swizzle_fn<3>(packed, base);
}
STAGE_TAIL(swizzle_4, SkRasterPipeline_SwizzleCtx* packed) {
    small_swizzle_fn<4>(packed, base);
}
STAGE_TAIL(shuffle, SkRasterPipeline_ShuffleCtx* ctx) {
    // General shuffle of up to 16 slots; ctx->count gives the actual slot count.
    shuffle_fn<16>(ptr: (std::byte*)ctx->ptr, offsets: ctx->offsets, numSlots: ctx->count);
}
3718
// Copies NumSlots slots from `src` into swizzled destination positions (dst + offsets[i] bytes),
// honoring the execution mask; masked-off lanes keep their existing destination values.
template <int NumSlots>
SI void swizzle_copy_masked_fn(F* dst, const F* src, uint16_t* offsets, I32 mask) {
    std::byte* dstB = (std::byte*)dst;
    SK_UNROLL for (int count = 0; count < NumSlots; ++count) {
        F* dstS = (F*)(dstB + *offsets);
        *dstS = if_then_else(c: mask, t: *src, e: *dstS);
        offsets += 1;
        src += 1;
    }
}

STAGE_TAIL(swizzle_copy_slot_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
    swizzle_copy_masked_fn<1>(dst: (F*)ctx->dst, src: (F*)ctx->src, offsets: ctx->offsets, execution_mask());
}
STAGE_TAIL(swizzle_copy_2_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
    swizzle_copy_masked_fn<2>(dst: (F*)ctx->dst, src: (F*)ctx->src, offsets: ctx->offsets, execution_mask());
}
STAGE_TAIL(swizzle_copy_3_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
    swizzle_copy_masked_fn<3>(dst: (F*)ctx->dst, src: (F*)ctx->src, offsets: ctx->offsets, execution_mask());
}
STAGE_TAIL(swizzle_copy_4_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
    swizzle_copy_masked_fn<4>(dst: (F*)ctx->dst, src: (F*)ctx->src, offsets: ctx->offsets, execution_mask());
}
3742
STAGE_TAIL(copy_from_indirect_unmasked, SkRasterPipeline_CopyIndirectCtx* ctx) {
    // Reads ctx->slots slots from a dynamically-indexed source (each lane can use a different
    // index) and writes them into ctx->dst, ignoring the execution mask.
    // Clamp the indirect offsets to stay within the limit.
    U32 offsets = *(U32*)ctx->indirectOffset;
    offsets = min(a: offsets, b: ctx->indirectLimit);

    // Scale up the offsets to account for the N lanes per value.
    offsets *= N;

    // Adjust the offsets forward so that they fetch from the correct lane.
    // NOTE(review): the sibling stages below load this same iota table as U32; loading it as I32
    // here is bitwise-equivalent but mixes signedness with the U32 `offsets` -- confirm intent.
    static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    offsets += sk_unaligned_load<I32>(ptr: iota);

    // Use gather to perform indirect lookups; write the results into `dst`.
    const float* src = ctx->src;
    F* dst = (F*)ctx->dst;
    F* end = dst + ctx->slots;
    do {
        *dst = gather(p: src, ix: offsets);
        dst += 1;
        src += N;  // advance one full slot (N floats) in the vectorized source
    } while (dst != end);
}
3765
STAGE_TAIL(copy_from_indirect_uniform_unmasked, SkRasterPipeline_CopyIndirectCtx* ctx) {
    // Like copy_from_indirect_unmasked, but the source is uniform data (one float per slot), so
    // the offsets are not scaled by N or adjusted per-lane.
    // Clamp the indirect offsets to stay within the limit.
    U32 offsets = *(U32*)ctx->indirectOffset;
    offsets = min(a: offsets, b: ctx->indirectLimit);

    // Use gather to perform indirect lookups; write the results into `dst`.
    const float* src = ctx->src;
    F* dst = (F*)ctx->dst;
    F* end = dst + ctx->slots;
    do {
        *dst = gather(p: src, ix: offsets);
        dst += 1;
        src += 1;  // uniform source: one float per slot
    } while (dst != end);
}
3781
STAGE_TAIL(copy_to_indirect_masked, SkRasterPipeline_CopyIndirectCtx* ctx) {
    // Writes ctx->slots slots into a dynamically-indexed destination (each lane can use a
    // different index), honoring the execution mask via scatter_masked.
    // Clamp the indirect offsets to stay within the limit.
    U32 offsets = *(U32*)ctx->indirectOffset;
    offsets = min(a: offsets, b: ctx->indirectLimit);

    // Scale up the offsets to account for the N lanes per value.
    offsets *= N;

    // Adjust the offsets forward so that they store into the correct lane.
    static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    offsets += sk_unaligned_load<I32>(ptr: iota);

    // Perform indirect, masked writes into `dst`.
    const F* src = (F*)ctx->src;
    const F* end = src + ctx->slots;
    float* dst = ctx->dst;
    I32 mask = execution_mask();
    do {
        scatter_masked(src: *src, dst, ix: offsets, mask);
        dst += N;
        src += 1;
    } while (src != end);
}
3805
STAGE_TAIL(swizzle_copy_to_indirect_masked, SkRasterPipeline_SwizzleCopyIndirectCtx* ctx) {
    // Combines copy_to_indirect_masked with a swizzle: each source slot lands at a
    // swizzle-selected byte offset within the indirectly-indexed destination.
    // Clamp the indirect offsets to stay within the limit.
    U32 offsets = *(U32*)ctx->indirectOffset;
    offsets = min(a: offsets, b: ctx->indirectLimit);

    // Scale up the offsets to account for the N lanes per value.
    offsets *= N;

    // Adjust the offsets forward so that they store into the correct lane.
    static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    offsets += sk_unaligned_load<I32>(ptr: iota);

    // Perform indirect, masked, swizzled writes into `dst`.
    const F* src = (F*)ctx->src;
    const F* end = src + ctx->slots;
    std::byte* dstB = (std::byte*)ctx->dst;
    const uint16_t* swizzle = ctx->offsets;
    I32 mask = execution_mask();
    do {
        float* dst = (float*)(dstB + *swizzle);
        scatter_masked(src: *src, dst, ix: offsets, mask);
        swizzle += 1;
        src += 1;
    } while (src != end);
}
3831
// Unary operations take a single input, and overwrite it with their output.
// Unlike binary or ternary operations, we provide variations of 1-4 slots, but don't provide
// an arbitrary-width "n-slot" variation; the Builder can chain together longer sequences manually.
//
// Applies ApplyFn to each slot in the half-open range [dst, end); callers must pass end > dst.
template <typename T, void (*ApplyFn)(T*)>
SI void apply_adjacent_unary(T* dst, T* end) {
    do {
        ApplyFn(dst);
        dst += 1;
    } while (dst != end);
}
3842
#if defined(JUMPER_IS_SCALAR)
// Scalar build: plain C-style conversions, bit-punned back into the slot's storage type.
template <typename T>
SI void cast_to_float_from_fn(T* dst) {
    *dst = sk_bit_cast<T>((F)*dst);
}
SI void cast_to_int_from_fn(F* dst) {
    *dst = sk_bit_cast<F>((I32)*dst);
}
SI void cast_to_uint_from_fn(F* dst) {
    *dst = sk_bit_cast<F>((U32)*dst);
}
#else
// Vector build: __builtin_convertvector performs the lane-wise numeric conversion.
template <typename T>
SI void cast_to_float_from_fn(T* dst) {
    *dst = sk_bit_cast<T>(__builtin_convertvector(*dst, F));
}
SI void cast_to_int_from_fn(F* dst) {
    *dst = sk_bit_cast<F>(src: __builtin_convertvector(*dst, I32));
}
SI void cast_to_uint_from_fn(F* dst) {
    *dst = sk_bit_cast<F>(src: __builtin_convertvector(*dst, U32));
}
#endif
3866
// Integer absolute value, in place.
SI void abs_fn(I32* dst) {
    *dst = abs_(v: *dst);
}

// Round toward negative infinity, in place.
SI void floor_fn(F* dst) {
    *dst = floor_(v: *dst);
}

// Round toward positive infinity, in place.
SI void ceil_fn(F* dst) {
    *dst = ceil_(v: *dst);
}

// Reciprocal square root (approximate), in place.
SI void invsqrt_fn(F* dst) {
    *dst = rsqrt(v: *dst);
}
3882
// Each DECLARE_UNARY_* macro emits dedicated 1-4 slot stage variants of a unary op (the stage
// name encodes the slot count, e.g. `floor_2_floats` applies floor_fn to two adjacent slots).
#define DECLARE_UNARY_FLOAT(name)                                                              \
    STAGE_TAIL(name##_float, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 4); }

#define DECLARE_UNARY_INT(name)                                                                  \
    STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 4); }

#define DECLARE_UNARY_UINT(name)                                                                  \
    STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 4); }

// Instantiate the stages. cast_to_float_from comes in int and uint flavors; the rest operate on
// a single slot type.
DECLARE_UNARY_INT(cast_to_float_from) DECLARE_UNARY_UINT(cast_to_float_from)
DECLARE_UNARY_FLOAT(cast_to_int_from)
DECLARE_UNARY_FLOAT(cast_to_uint_from)
DECLARE_UNARY_FLOAT(floor)
DECLARE_UNARY_FLOAT(ceil)
DECLARE_UNARY_FLOAT(invsqrt)
DECLARE_UNARY_INT(abs)

#undef DECLARE_UNARY_FLOAT
#undef DECLARE_UNARY_INT
#undef DECLARE_UNARY_UINT
3912
// For complex unary ops, we only provide a 1-slot version to reduce code bloat.
// (The approx_/trailing-underscore helpers are fast approximations, not IEEE-exact results.)
STAGE_TAIL(sin_float, F* dst)  { *dst = sin_(x: *dst); }
STAGE_TAIL(cos_float, F* dst)  { *dst = cos_(x: *dst); }
STAGE_TAIL(tan_float, F* dst)  { *dst = tan_(x: *dst); }
STAGE_TAIL(asin_float, F* dst) { *dst = asin_(x: *dst); }
STAGE_TAIL(acos_float, F* dst) { *dst = acos_(x: *dst); }
STAGE_TAIL(atan_float, F* dst) { *dst = atan_(x: *dst); }
STAGE_TAIL(sqrt_float, F* dst) { *dst = sqrt_(v: *dst); }
STAGE_TAIL(exp_float, F* dst)  { *dst = approx_exp(x: *dst); }
STAGE_TAIL(exp2_float, F* dst) { *dst = approx_pow2(x: *dst); }
STAGE_TAIL(log_float, F* dst)  { *dst = approx_log(x: *dst); }
STAGE_TAIL(log2_float, F* dst) { *dst = approx_log2(x: *dst); }
3925
3926STAGE_TAIL(inverse_mat2, F* dst) {
3927 F a00 = dst[0], a01 = dst[1],
3928 a10 = dst[2], a11 = dst[3];
3929 F det = mad(f: a00, m: a11, a: -a01 * a10),
3930 invdet = rcp_precise(v: det);
3931 dst[0] = invdet * a11;
3932 dst[1] = -invdet * a01;
3933 dst[2] = -invdet * a10;
3934 dst[3] = invdet * a00;
3935}
3936
STAGE_TAIL(inverse_mat3, F* dst) {
    // Invert a 3x3 matrix held in nine consecutive value slots, in place, via the adjugate
    // divided by the determinant. b01/b11/b21 are the cofactors that are reused in both the
    // determinant and the result.
    F a00 = dst[0], a01 = dst[1], a02 = dst[2],
      a10 = dst[3], a11 = dst[4], a12 = dst[5],
      a20 = dst[6], a21 = dst[7], a22 = dst[8];
    F b01 = mad(f: a22, m: a11, a: -a12 * a21),
      b11 = mad(f: a12, m: a20, a: -a22 * a10),
      b21 = mad(f: a21, m: a10, a: -a11 * a20);
    F det = mad(f: a00, m: b01, a: mad(f: a01, m: b11, a: a02 * b21)),
      invdet = rcp_precise(v: det);
    dst[0] = invdet * b01;
    dst[1] = invdet * mad(f: a02, m: a21, a: -a22 * a01);
    dst[2] = invdet * mad(f: a12, m: a01, a: -a02 * a11);
    dst[3] = invdet * b11;
    dst[4] = invdet * mad(f: a22, m: a00, a: -a02 * a20);
    dst[5] = invdet * mad(f: a02, m: a10, a: -a12 * a00);
    dst[6] = invdet * b21;
    dst[7] = invdet * mad(f: a01, m: a20, a: -a21 * a00);
    dst[8] = invdet * mad(f: a11, m: a00, a: -a01 * a10);
}
3956
STAGE_TAIL(inverse_mat4, F* dst) {
    // Invert a 4x4 matrix held in sixteen consecutive value slots, in place. Uses the standard
    // 2x2-subdeterminant expansion: b00-b11 are the twelve 2x2 minors, det is built from them,
    // and each output element is a 3x3 cofactor assembled from pre-scaled minors.
    F a00 = dst[0],  a01 = dst[1],  a02 = dst[2],  a03 = dst[3],
      a10 = dst[4],  a11 = dst[5],  a12 = dst[6],  a13 = dst[7],
      a20 = dst[8],  a21 = dst[9],  a22 = dst[10], a23 = dst[11],
      a30 = dst[12], a31 = dst[13], a32 = dst[14], a33 = dst[15];
    F b00 = mad(f: a00, m: a11, a: -a01 * a10),
      b01 = mad(f: a00, m: a12, a: -a02 * a10),
      b02 = mad(f: a00, m: a13, a: -a03 * a10),
      b03 = mad(f: a01, m: a12, a: -a02 * a11),
      b04 = mad(f: a01, m: a13, a: -a03 * a11),
      b05 = mad(f: a02, m: a13, a: -a03 * a12),
      b06 = mad(f: a20, m: a31, a: -a21 * a30),
      b07 = mad(f: a20, m: a32, a: -a22 * a30),
      b08 = mad(f: a20, m: a33, a: -a23 * a30),
      b09 = mad(f: a21, m: a32, a: -a22 * a31),
      b10 = mad(f: a21, m: a33, a: -a23 * a31),
      b11 = mad(f: a22, m: a33, a: -a23 * a32),
      det = mad(f: b00, m: b11, a: b05 * b06) + mad(f: b02, m: b09, a: b03 * b08) - mad(f: b01, m: b10, a: b04 * b07),
      invdet = rcp_precise(v: det);
    // Pre-scale all twelve minors by 1/det so each output below is a plain cofactor sum.
    b00 *= invdet;
    b01 *= invdet;
    b02 *= invdet;
    b03 *= invdet;
    b04 *= invdet;
    b05 *= invdet;
    b06 *= invdet;
    b07 *= invdet;
    b08 *= invdet;
    b09 *= invdet;
    b10 *= invdet;
    b11 *= invdet;
    dst[0]  = mad(f: a11, m: b11, a: a13*b09) - a12*b10;
    dst[1]  = a02*b10 - mad(f: a01, m: b11, a: a03*b09);
    dst[2]  = mad(f: a31, m: b05, a: a33*b03) - a32*b04;
    dst[3]  = a22*b04 - mad(f: a21, m: b05, a: a23*b03);
    dst[4]  = a12*b08 - mad(f: a10, m: b11, a: a13*b07);
    dst[5]  = mad(f: a00, m: b11, a: a03*b07) - a02*b08;
    dst[6]  = a32*b02 - mad(f: a30, m: b05, a: a33*b01);
    dst[7]  = mad(f: a20, m: b05, a: a23*b01) - a22*b02;
    dst[8]  = mad(f: a10, m: b10, a: a13*b06) - a11*b08;
    dst[9]  = a01*b08 - mad(f: a00, m: b10, a: a03*b06);
    dst[10] = mad(f: a30, m: b04, a: a33*b00) - a31*b02;
    dst[11] = a21*b02 - mad(f: a20, m: b04, a: a23*b00);
    dst[12] = a11*b07 - mad(f: a10, m: b09, a: a12*b06);
    dst[13] = mad(f: a00, m: b09, a: a02*b06) - a01*b07;
    dst[14] = a31*b01 - mad(f: a30, m: b03, a: a32*b00);
    dst[15] = mad(f: a20, m: b03, a: a22*b00) - a21*b01;
}
4005
// Binary operations take two adjacent inputs, and write their output in the first position.
// The source operands are laid out immediately after the destination operands, so `src - dst`
// gives the slot count; iteration stops when `dst` advances up to the start of `src`.
template <typename T, void (*ApplyFn)(T*, T*)>
SI void apply_adjacent_binary(T* dst, T* src) {
    T* end = src;
    do {
        ApplyFn(dst, src);
        dst += 1;
        src += 1;
    } while (dst != end);
}

// Unpacks a packed BinaryOpCtx (dst/src stored as byte offsets off of `base`) and dispatches to
// apply_adjacent_binary.
template <typename T, void (*ApplyFn)(T*, T*)>
SI void apply_adjacent_binary_packed(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
    std::byte* dst = base + ctx.dst;
    std::byte* src = base + ctx.src;
    apply_adjacent_binary<T, ApplyFn>((T*)dst, (T*)src);
}
4024
// Applies a binary op across `N` destination slots where the right-hand operand is a single
// immediate constant baked into the context (broadcast across all lanes). V is the vector slot
// type; S is the matching scalar type used to bit-pun the stored constant.
template <int N, typename V, typename S, void (*ApplyFn)(V*, V*)>
SI void apply_binary_immediate(SkRasterPipeline_ConstantCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
    V* dst = (V*)(base + ctx.dst);         // get a pointer to the destination
    S scalar = sk_bit_cast<S>(ctx.value);  // bit-pun the constant value as desired
    V src = scalar;                        // broadcast the constant value into a vector
    SK_UNROLL for (int index = 0; index < N; ++index) {
        ApplyFn(dst, &src);                // perform the operation
        dst += 1;
    }
}
4036
// Element-wise arithmetic kernels, applied in place to *dst. These are instantiated for float,
// int, and uint slot types by the DECLARE_BINARY_* macros below.
template <typename T>
SI void add_fn(T* dst, T* src) {
    *dst += *src;
}

template <typename T>
SI void sub_fn(T* dst, T* src) {
    *dst -= *src;
}

template <typename T>
SI void mul_fn(T* dst, T* src) {
    *dst *= *src;
}

template <typename T>
SI void div_fn(T* dst, T* src) {
    T divisor = *src;
    if constexpr (!std::is_same_v<T, F>) {
        // We will crash if we integer-divide against zero. Convert 0 to ~0 to avoid this.
        divisor |= cond_to_mask(divisor == 0);
    }
    *dst /= divisor;
}
4061
// Bitwise kernels operate on I32 slots; uint variants reuse these (two's-complement equivalent).
SI void bitwise_and_fn(I32* dst, I32* src) {
    *dst &= *src;
}

SI void bitwise_or_fn(I32* dst, I32* src) {
    *dst |= *src;
}

SI void bitwise_xor_fn(I32* dst, I32* src) {
    *dst ^= *src;
}

// Element-wise max/min, in place; T's signedness selects the right comparison.
template <typename T>
SI void max_fn(T* dst, T* src) {
    *dst = max(*dst, *src);
}

template <typename T>
SI void min_fn(T* dst, T* src) {
    *dst = min(*dst, *src);
}
4083
// Comparison kernels: compare *dst against *src element-wise and overwrite *dst with an I32
// boolean mask (all-1s / all-0s per lane). The memcpy writes the mask over T's storage, which is
// safe because every slot type is 32 bits wide (enforced by the static_assert).
template <typename T>
SI void cmplt_fn(T* dst, T* src) {
    static_assert(sizeof(T) == sizeof(I32));
    I32 result = cond_to_mask(*dst < *src);
    memcpy(dst, &result, sizeof(I32));
}

template <typename T>
SI void cmple_fn(T* dst, T* src) {
    static_assert(sizeof(T) == sizeof(I32));
    I32 result = cond_to_mask(*dst <= *src);
    memcpy(dst, &result, sizeof(I32));
}

template <typename T>
SI void cmpeq_fn(T* dst, T* src) {
    static_assert(sizeof(T) == sizeof(I32));
    I32 result = cond_to_mask(*dst == *src);
    memcpy(dst, &result, sizeof(I32));
}

template <typename T>
SI void cmpne_fn(T* dst, T* src) {
    static_assert(sizeof(T) == sizeof(I32));
    I32 result = cond_to_mask(*dst != *src);
    memcpy(dst, &result, sizeof(I32));
}
4111
// Two-argument arctangent: *dst holds y, *src holds x.
SI void atan2_fn(F* dst, F* src) {
    *dst = atan2_(y0: *dst, x0: *src);
}

// Approximate power function: *dst = (*dst) ** (*src).
SI void pow_fn(F* dst, F* src) {
    *dst = approx_powf(x: *dst, y: *src);
}

// GLSL-style mod(): x - y * floor(x / y), so the result takes the sign of the divisor.
SI void mod_fn(F* dst, F* src) {
    *dst = *dst - *src * floor_(v: *dst / *src);
}
4123
// DECLARE_N_WAY_* emit a single variable-width stage driven by a packed BinaryOpCtx;
// DECLARE_BINARY_* additionally emit dedicated 1-4 slot variants (src follows dst in memory).
#define DECLARE_N_WAY_BINARY_FLOAT(name)                                \
    STAGE_TAIL(name##_n_floats, SkRasterPipeline_BinaryOpCtx* packed) { \
        apply_adjacent_binary_packed<F, &name##_fn>(packed, base);      \
    }

#define DECLARE_BINARY_FLOAT(name)                                                              \
    STAGE_TAIL(name##_float, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 4); } \
    DECLARE_N_WAY_BINARY_FLOAT(name)

#define DECLARE_N_WAY_BINARY_INT(name)                                \
    STAGE_TAIL(name##_n_ints, SkRasterPipeline_BinaryOpCtx* packed) { \
        apply_adjacent_binary_packed<I32, &name##_fn>(packed, base);  \
    }

#define DECLARE_BINARY_INT(name)                                                                  \
    STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 4); } \
    DECLARE_N_WAY_BINARY_INT(name)

#define DECLARE_N_WAY_BINARY_UINT(name)                                \
    STAGE_TAIL(name##_n_uints, SkRasterPipeline_BinaryOpCtx* packed) { \
        apply_adjacent_binary_packed<U32, &name##_fn>(packed, base);   \
    }

#define DECLARE_BINARY_UINT(name)                                                                  \
    STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 4); } \
    DECLARE_N_WAY_BINARY_UINT(name)
4159
// Many ops reuse the int stages when performing uint arithmetic, since they're equivalent on a
// two's-complement machine. (Even multiplication is equivalent in the lower 32 bits.)
// Uint-specific stages exist only where signedness matters: division, min/max, and the ordered
// comparisons (cmplt/cmple).
DECLARE_BINARY_FLOAT(add)    DECLARE_BINARY_INT(add)
DECLARE_BINARY_FLOAT(sub)    DECLARE_BINARY_INT(sub)
DECLARE_BINARY_FLOAT(mul)    DECLARE_BINARY_INT(mul)
DECLARE_BINARY_FLOAT(div)    DECLARE_BINARY_INT(div)    DECLARE_BINARY_UINT(div)
                             DECLARE_BINARY_INT(bitwise_and)
                             DECLARE_BINARY_INT(bitwise_or)
                             DECLARE_BINARY_INT(bitwise_xor)
DECLARE_BINARY_FLOAT(mod)
DECLARE_BINARY_FLOAT(min)    DECLARE_BINARY_INT(min)    DECLARE_BINARY_UINT(min)
DECLARE_BINARY_FLOAT(max)    DECLARE_BINARY_INT(max)    DECLARE_BINARY_UINT(max)
DECLARE_BINARY_FLOAT(cmplt)  DECLARE_BINARY_INT(cmplt)  DECLARE_BINARY_UINT(cmplt)
DECLARE_BINARY_FLOAT(cmple)  DECLARE_BINARY_INT(cmple)  DECLARE_BINARY_UINT(cmple)
DECLARE_BINARY_FLOAT(cmpeq)  DECLARE_BINARY_INT(cmpeq)
DECLARE_BINARY_FLOAT(cmpne)  DECLARE_BINARY_INT(cmpne)

// Sufficiently complex ops only provide an N-way version, to avoid code bloat from the dedicated
// 1-4 slot versions.
DECLARE_N_WAY_BINARY_FLOAT(atan2)
DECLARE_N_WAY_BINARY_FLOAT(pow)
4181
// Some ops have an optimized version when the right-side is an immediate value.
// DECLARE_MULTI_IMM_BINARY_INT additionally provides 2-4 slot variants for the same immediate.
#define DECLARE_IMM_BINARY_FLOAT(name)                                   \
    STAGE_TAIL(name##_imm_float, SkRasterPipeline_ConstantCtx* packed) { \
        apply_binary_immediate<1, F, float, &name##_fn>(packed, base);   \
    }
#define DECLARE_IMM_BINARY_INT(name)                                       \
    STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) {     \
        apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
    }
#define DECLARE_MULTI_IMM_BINARY_INT(name)                                 \
    STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) {     \
        apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
    }                                                                      \
    STAGE_TAIL(name##_imm_2_ints, SkRasterPipeline_ConstantCtx* packed) {  \
        apply_binary_immediate<2, I32, int32_t, &name##_fn>(packed, base); \
    }                                                                      \
    STAGE_TAIL(name##_imm_3_ints, SkRasterPipeline_ConstantCtx* packed) {  \
        apply_binary_immediate<3, I32, int32_t, &name##_fn>(packed, base); \
    }                                                                      \
    STAGE_TAIL(name##_imm_4_ints, SkRasterPipeline_ConstantCtx* packed) {  \
        apply_binary_immediate<4, I32, int32_t, &name##_fn>(packed, base); \
    }
#define DECLARE_IMM_BINARY_UINT(name)                                       \
    STAGE_TAIL(name##_imm_uint, SkRasterPipeline_ConstantCtx* packed) {     \
        apply_binary_immediate<1, U32, uint32_t, &name##_fn>(packed, base); \
    }

DECLARE_IMM_BINARY_FLOAT(add)   DECLARE_IMM_BINARY_INT(add)
DECLARE_IMM_BINARY_FLOAT(mul)   DECLARE_IMM_BINARY_INT(mul)
                                DECLARE_MULTI_IMM_BINARY_INT(bitwise_and)
                                DECLARE_IMM_BINARY_FLOAT(max)
                                DECLARE_IMM_BINARY_FLOAT(min)
                                DECLARE_IMM_BINARY_INT(bitwise_xor)
DECLARE_IMM_BINARY_FLOAT(cmplt) DECLARE_IMM_BINARY_INT(cmplt) DECLARE_IMM_BINARY_UINT(cmplt)
DECLARE_IMM_BINARY_FLOAT(cmple) DECLARE_IMM_BINARY_INT(cmple) DECLARE_IMM_BINARY_UINT(cmple)
DECLARE_IMM_BINARY_FLOAT(cmpeq) DECLARE_IMM_BINARY_INT(cmpeq)
DECLARE_IMM_BINARY_FLOAT(cmpne) DECLARE_IMM_BINARY_INT(cmpne)

#undef DECLARE_MULTI_IMM_BINARY_INT
#undef DECLARE_IMM_BINARY_FLOAT
#undef DECLARE_IMM_BINARY_INT
#undef DECLARE_IMM_BINARY_UINT
#undef DECLARE_BINARY_FLOAT
#undef DECLARE_BINARY_INT
#undef DECLARE_BINARY_UINT
#undef DECLARE_N_WAY_BINARY_FLOAT
#undef DECLARE_N_WAY_BINARY_INT
#undef DECLARE_N_WAY_BINARY_UINT
4230
4231// Dots can be represented with multiply and add ops, but they are so foundational that it's worth
4232// having dedicated ops.
4233STAGE_TAIL(dot_2_floats, F* dst) {
4234 dst[0] = mad(f: dst[0], m: dst[2],
4235 a: dst[1] * dst[3]);
4236}
4237
4238STAGE_TAIL(dot_3_floats, F* dst) {
4239 dst[0] = mad(f: dst[0], m: dst[3],
4240 a: mad(f: dst[1], m: dst[4],
4241 a: dst[2] * dst[5]));
4242}
4243
4244STAGE_TAIL(dot_4_floats, F* dst) {
4245 dst[0] = mad(f: dst[0], m: dst[4],
4246 a: mad(f: dst[1], m: dst[5],
4247 a: mad(f: dst[2], m: dst[6],
4248 a: dst[3] * dst[7])));
4249}
4250
4251// MxM, VxM and MxV multiplication all use matrix_multiply. Vectors are treated like a matrix with a
4252// single column or row.
// Multiplies a (leftRows x N) left matrix by an (N x rightColumns) right matrix, writing the
// (leftRows x rightColumns) product over the destination slots. Matrices are stored column-major,
// one F (vector of lanes) per element, adjacent in the slot area.
template <int N>
SI void matrix_multiply(SkRasterPipeline_MatrixMultiplyCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(ctx: packed);

    int outColumns = ctx.rightColumns,
        outRows = ctx.leftRows;

    SkASSERT(outColumns >= 1);
    SkASSERT(outRows >= 1);
    SkASSERT(outColumns <= 4);
    SkASSERT(outRows <= 4);

    // The shared inner dimension must agree on both operands, and is baked in as the template arg.
    SkASSERT(ctx.leftColumns == ctx.rightRows);
    SkASSERT(N == ctx.leftColumns); // N should match the result width

#if !defined(JUMPER_IS_SCALAR)
    // This prevents Clang from generating early-out checks for zero-sized matrices.
    __builtin_assume(outColumns >= 1);
    __builtin_assume(outRows >= 1);
    __builtin_assume(outColumns <= 4);
    __builtin_assume(outRows <= 4);
#endif

    // Get pointers to the adjacent left- and right-matrices.
    // Slot layout: [result (outColumns*outRows)] [left (N*leftRows)] [right (rightColumns*N)].
    F* resultMtx = (F*)(base + ctx.dst);
    F* leftMtx = &resultMtx[ctx.rightColumns * ctx.leftRows];
    F* rightMtx = &leftMtx[N * ctx.leftRows];

    // Emit each matrix element.
    for (int c = 0; c < outColumns; ++c) {
        for (int r = 0; r < outRows; ++r) {
            // Dot a vector from leftMtx[*][r] with rightMtx[c][*].
            F* leftRow = &leftMtx [r];
            F* rightColumn = &rightMtx[c * N];

            F element = *leftRow * *rightColumn;
            for (int idx = 1; idx < N; ++idx) {
                // Column-major: step across the left row by `outRows`, down the right column by 1.
                leftRow += outRows;
                rightColumn += 1;
                element = mad(f: *leftRow, m: *rightColumn, a: element);
            }

            // Results are emitted in column-major order, matching the operand layout.
            *resultMtx++ = element;
        }
    }
}
4299
// matrix_multiply with a shared inner dimension of 2.
STAGE_TAIL(matrix_multiply_2, SkRasterPipeline_MatrixMultiplyCtx* packed) {
    matrix_multiply<2>(packed, base);
}

// matrix_multiply with a shared inner dimension of 3.
STAGE_TAIL(matrix_multiply_3, SkRasterPipeline_MatrixMultiplyCtx* packed) {
    matrix_multiply<3>(packed, base);
}

// matrix_multiply with a shared inner dimension of 4.
STAGE_TAIL(matrix_multiply_4, SkRasterPipeline_MatrixMultiplyCtx* packed) {
    matrix_multiply<4>(packed, base);
}
4311
4312// Refract always operates on 4-wide incident and normal vectors; for narrower inputs, the code
4313// generator fills in the input columns with zero, and discards the extra output columns.
STAGE_TAIL(refract_4_floats, F* dst) {
    // Algorithm adapted from https://registry.khronos.org/OpenGL-Refpages/gl4/html/refract.xhtml
    // Slots: dst[0..3] = incident vector, dst[4..7] = normal vector, dst[8] = eta.
    F *incident = dst + 0;
    F *normal = dst + 4;
    F eta = dst[8];

    // dot(N, I), expanded over all four components.
    F dotNI = mad(f: normal[0], m: incident[0],
              a: mad(f: normal[1], m: incident[1],
              a: mad(f: normal[2], m: incident[2],
              a: normal[3] * incident[3])));

    // k < 0 indicates total internal reflection; those lanes produce the zero vector below.
    // sqrt_ of a negative k is harmless since that lane's result is discarded by if_then_else.
    F k = 1.0 - eta * eta * (1.0 - dotNI * dotNI);
    F sqrt_k = sqrt_(v: k);

    for (int idx = 0; idx < 4; ++idx) {
        dst[idx] = if_then_else(c: k >= 0,
                                t: eta * incident[idx] - (eta * dotNI + sqrt_k) * normal[idx],
                                e: 0.0);
    }
}
4334
4335// Ternary operations work like binary ops (see immediately above) but take two source inputs.
4336template <typename T, void (*ApplyFn)(T*, T*, T*)>
4337SI void apply_adjacent_ternary(T* dst, T* src0, T* src1) {
4338 int count = src0 - dst;
4339#if !defined(JUMPER_IS_SCALAR)
4340 __builtin_assume(count >= 1);
4341#endif
4342
4343 for (int index = 0; index < count; ++index) {
4344 ApplyFn(dst, src0, src1);
4345 dst += 1;
4346 src0 += 1;
4347 src1 += 1;
4348 }
4349}
4350
4351template <typename T, void (*ApplyFn)(T*, T*, T*)>
4352SI void apply_adjacent_ternary_packed(SkRasterPipeline_TernaryOpCtx* packed, std::byte* base) {
4353 auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
4354 std::byte* dst = base + ctx.dst;
4355 std::byte* src0 = dst + ctx.delta;
4356 std::byte* src1 = src0 + ctx.delta;
4357 apply_adjacent_ternary<T, ApplyFn>((T*)dst, (T*)src0, (T*)src1);
4358}
4359
// mix(x, y, a) for floats: linear interpolation from x to y by a; the result replaces a.
SI void mix_fn(F* a, F* x, F* y) {
    // We reorder the arguments here to match lerp's GLSL-style order (interpolation point last).
    *a = lerp(from: *x, to: *y, t: *a);
}

// mix for ints: a is a per-lane boolean mask selecting between x and y.
SI void mix_fn(I32* a, I32* x, I32* y) {
    // We reorder the arguments here to match if_then_else's expected order (y before x).
    *a = if_then_else(c: *a, t: *y, e: *x);
}

// GLSL smoothstep: 0 at/below edge0, 1 at/above edge1, Hermite curve (3t^2 - 2t^3) in between.
// The result replaces edge0.
SI void smoothstep_fn(F* edge0, F* edge1, F* x) {
    F t = clamp_01_(v: (*x - *edge0) / (*edge1 - *edge0));
    *edge0 = t * t * (3.0 - 2.0 * t);
}
4374
// Declares a stage applying `name`_fn across a runtime-sized span of floats (packed context).
#define DECLARE_N_WAY_TERNARY_FLOAT(name) \
    STAGE_TAIL(name##_n_floats, SkRasterPipeline_TernaryOpCtx* packed) { \
        apply_adjacent_ternary_packed<F, &name##_fn>(packed, base); \
    }

// Declares fixed-width (1-4 slot) plus N-way float stages for `name`_fn.
#define DECLARE_TERNARY_FLOAT(name) \
    STAGE_TAIL(name##_float, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+1, p+2); } \
    STAGE_TAIL(name##_2_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+2, p+4); } \
    STAGE_TAIL(name##_3_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+3, p+6); } \
    STAGE_TAIL(name##_4_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+4, p+8); } \
    DECLARE_N_WAY_TERNARY_FLOAT(name)

// Declares fixed-width (1-4 slot) plus N-way integer stages for `name`_fn.
#define DECLARE_TERNARY_INT(name) \
    STAGE_TAIL(name##_int, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+1, p+2); } \
    STAGE_TAIL(name##_2_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+2, p+4); } \
    STAGE_TAIL(name##_3_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+3, p+6); } \
    STAGE_TAIL(name##_4_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+4, p+8); } \
    STAGE_TAIL(name##_n_ints, SkRasterPipeline_TernaryOpCtx* packed) { \
        apply_adjacent_ternary_packed<I32, &name##_fn>(packed, base); \
    }

// smoothstep only needs the packed N-way form; mix gets both float and int variants.
DECLARE_N_WAY_TERNARY_FLOAT(smoothstep)
DECLARE_TERNARY_FLOAT(mix)
DECLARE_TERNARY_INT(mix)

#undef DECLARE_N_WAY_TERNARY_FLOAT
#undef DECLARE_TERNARY_FLOAT
#undef DECLARE_TERNARY_INT
4403
4404STAGE(gauss_a_to_rgba, NoCtx) {
4405 // x = 1 - x;
4406 // exp(-x * x * 4) - 0.018f;
4407 // ... now approximate with quartic
4408 //
4409 const float c4 = -2.26661229133605957031f;
4410 const float c3 = 2.89795351028442382812f;
4411 const float c2 = 0.21345567703247070312f;
4412 const float c1 = 0.15489584207534790039f;
4413 const float c0 = 0.00030726194381713867f;
4414 a = mad(f: a, m: mad(f: a, m: mad(f: a, m: mad(f: a, m: c4, a: c3), a: c2), a: c1), a: c0);
4415 r = a;
4416 g = a;
4417 b = a;
4418}
4419
4420// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
    // (cx,cy) are the center of our sample.
    F cx = r,
      cy = g;

    // All sample points are at the same fractional offset (fx,fy).
    // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
    F fx = fract(v: cx + 0.5f),
      fy = fract(v: cy + 0.5f);

    // We'll accumulate the color of all four samples into {r,g,b,a} directly.
    r = g = b = a = 0;

    // Visit the 2x2 neighborhood of texels around the sample center.
    for (float py = -0.5f; py <= +0.5f; py += 1.0f)
    for (float px = -0.5f; px <= +0.5f; px += 1.0f) {
        // (x,y) are the coordinates of this sample point.
        F x = cx + px,
          y = cy + py;

        // ix_and_ptr() will clamp to the image's bounds for us.
        const uint32_t* ptr;
        U32 ix = ix_and_ptr(ptr: &ptr, ctx, x,y);

        // Load this texel and expand it to unpremul float channels.
        F sr,sg,sb,sa;
        from_8888(8888: gather(p: ptr, ix), r: &sr,g: &sg,b: &sb,a: &sa);

        // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
        // are combined in direct proportion to their area overlapping that logical query pixel.
        // At positive offsets, the x-axis contribution to that rectangle is fx,
        // or (1-fx) at negative x. Same deal for y.
        F sx = (px > 0) ? fx : 1.0f - fx,
          sy = (py > 0) ? fy : 1.0f - fy,
          area = sx * sy;

        r += sr * area;
        g += sg * area;
        b += sb * area;
        a += sa * area;
    }
}
4461
4462// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
    // (cx,cy) are the center of our sample.
    F cx = r,
      cy = g;

    // All sample points share the same fractional offset (fx,fy).
    // Bicubic filtering samples a 4x4 grid of texels surrounding (cx,cy).
    F fx = fract(v: cx + 0.5f),
      fy = fract(v: cy + 0.5f);

    // We'll accumulate the weighted color of all sixteen samples into {r,g,b,a} directly.
    r = g = b = a = 0;

    // ctx->weights is a 4x4 matrix of cubic-kernel coefficients; each axis gets four
    // per-tap weights evaluated at its fractional offset.
    const float* w = ctx->weights;
    const F scaley[4] = {bicubic_wts(t: fy, A: w[0], B: w[4], C: w[ 8], D: w[12]),
                         bicubic_wts(t: fy, A: w[1], B: w[5], C: w[ 9], D: w[13]),
                         bicubic_wts(t: fy, A: w[2], B: w[6], C: w[10], D: w[14]),
                         bicubic_wts(t: fy, A: w[3], B: w[7], C: w[11], D: w[15])};
    const F scalex[4] = {bicubic_wts(t: fx, A: w[0], B: w[4], C: w[ 8], D: w[12]),
                         bicubic_wts(t: fx, A: w[1], B: w[5], C: w[ 9], D: w[13]),
                         bicubic_wts(t: fx, A: w[2], B: w[6], C: w[10], D: w[14]),
                         bicubic_wts(t: fx, A: w[3], B: w[7], C: w[11], D: w[15])};

    // Walk the 4x4 tap grid from (cx-1.5, cy-1.5); the 2D weight is separable (x * y).
    F sample_y = cy - 1.5f;
    for (int yy = 0; yy <= 3; ++yy) {
        F sample_x = cx - 1.5f;
        for (int xx = 0; xx <= 3; ++xx) {
            F scale = scalex[xx] * scaley[yy];

            // ix_and_ptr() will clamp to the image's bounds for us.
            const uint32_t* ptr;
            U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: sample_x, y: sample_y);

            F sr,sg,sb,sa;
            from_8888(8888: gather(p: ptr, ix), r: &sr,g: &sg,b: &sb,a: &sa);

            r = mad(f: scale, m: sr, a: r);
            g = mad(f: scale, m: sg, a: g);
            b = mad(f: scale, m: sb, a: b);
            a = mad(f: scale, m: sa, a);

            sample_x += 1;
        }
        sample_y += 1;
    }
}
4509
4510// ~~~~~~ skgpu::Swizzle stage ~~~~~~ //
4511
4512STAGE(swizzle, void* ctx) {
4513 auto ir = r, ig = g, ib = b, ia = a;
4514 F* o[] = {&r, &g, &b, &a};
4515 char swiz[4];
4516 memcpy(dest: swiz, src: &ctx, n: sizeof(swiz));
4517
4518 for (int i = 0; i < 4; ++i) {
4519 switch (swiz[i]) {
4520 case 'r': *o[i] = ir; break;
4521 case 'g': *o[i] = ig; break;
4522 case 'b': *o[i] = ib; break;
4523 case 'a': *o[i] = ia; break;
4524 case '0': *o[i] = F(0); break;
4525 case '1': *o[i] = F(1); break;
4526 default: break;
4527 }
4528 }
4529}
4530
namespace lowp {
#if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE)
    // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually),
    // we don't generate lowp stages. All these nullptrs will tell SkJumper.cpp to always use the
    // highp float pipeline.
    #define M(st) static void (*st)(void) = nullptr;
    SK_RASTER_PIPELINE_OPS_LOWP(M)
    #undef M
    static void (*just_return)(void) = nullptr;

    // With no lowp stages, there's nothing to run: start_pipeline is a no-op stub.
    static void start_pipeline(size_t,size_t,size_t,size_t, SkRasterPipelineStage*) {}
4542
4543#else // We are compiling vector code with Clang... let's make some lowp stages!
4544
#if defined(JUMPER_IS_HSW)
    // AVX2: process 16 pixels per batch (lowp math works on 16-bit lanes).
    using U8 = uint8_t __attribute__((ext_vector_type(16)));
    using U16 = uint16_t __attribute__((ext_vector_type(16)));
    using I16 = int16_t __attribute__((ext_vector_type(16)));
    using I32 = int32_t __attribute__((ext_vector_type(16)));
    using U32 = uint32_t __attribute__((ext_vector_type(16)));
    using I64 = int64_t __attribute__((ext_vector_type(16)));
    using U64 = uint64_t __attribute__((ext_vector_type(16)));
    using F = float __attribute__((ext_vector_type(16)));
#else
    // All other vector targets: 8 pixels per batch.
    using U8 = uint8_t __attribute__((ext_vector_type(8)));
    using U16 = uint16_t __attribute__((ext_vector_type(8)));
    using I16 = int16_t __attribute__((ext_vector_type(8)));
    using I32 = int32_t __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using I64 = int64_t __attribute__((ext_vector_type(8)));
    using U64 = uint64_t __attribute__((ext_vector_type(8)));
    using F = float __attribute__((ext_vector_type(8)));
#endif

// N is the number of pixels processed per pipeline pass (the U16 lane count).
static constexpr size_t N = sizeof(U16) / sizeof(uint16_t);
4566
4567// Once again, some platforms benefit from a restricted Stage calling convention,
4568// but others can pass tons and tons of registers and we're happy to exploit that.
4569// It's exactly the same decision and implementation strategy as the F stages above.
#if JUMPER_NARROW_STAGES
    // Narrow ABIs can't pass everything in registers, so the loop state and the
    // destination channels travel in a struct by pointer.
    struct Params {
        size_t dx, dy, tail;
        U16 dr,dg,db,da;
    };
    using Stage = void (ABI*)(Params*, SkRasterPipelineStage* program, U16 r, U16 g, U16 b, U16 a);
#else
    // Roomy ABIs pass all pipeline state directly in registers.
    using Stage = void (ABI*)(size_t tail, SkRasterPipelineStage* program,
                              size_t dx, size_t dy,
                              U16 r, U16 g, U16 b, U16 a,
                              U16 dr, U16 dg, U16 db, U16 da);
#endif
4582
// Drives the lowp pipeline over the rectangle [x0,xlimit) x [y0,ylimit): full N-pixel
// batches first, then one partial batch with `tail` set to the remaining pixel count.
static void start_pipeline(const size_t x0, const size_t y0,
                           const size_t xlimit, const size_t ylimit,
                           SkRasterPipelineStage* program) {
    auto start = (Stage)program->fn;
    for (size_t dy = y0; dy < ylimit; dy++) {
    #if JUMPER_NARROW_STAGES
        // tail == 0 means "full batch of N pixels".
        Params params = { x0,dy,0, 0,0,0,0 };
        for (; params.dx + N <= xlimit; params.dx += N) {
            start(&params, program, 0,0,0,0);
        }
        if (size_t tail = xlimit - params.dx) {
            params.tail = tail;
            start(&params, program, 0,0,0,0);
        }
    #else
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            start( 0, program, dx,dy, 0,0,0,0, 0,0,0,0);
        }
        if (size_t tail = xlimit - dx) {
            start(tail, program, dx,dy, 0,0,0,0, 0,0,0,0);
        }
    #endif
    }
}
4608
#if JUMPER_NARROW_STAGES
    // The terminal stage of every pipeline: simply stops the chain of tail-calls.
    static void ABI just_return(Params*, SkRasterPipelineStage*, U16,U16,U16,U16) {}
#else
    // The terminal stage of every pipeline: simply stops the chain of tail-calls.
    static void ABI just_return(size_t, SkRasterPipelineStage*,size_t,size_t,
                                U16,U16,U16,U16, U16,U16,U16,U16) {}
#endif
4615
4616// All stages use the same function call ABI to chain into each other, but there are three types:
4617// GG: geometry in, geometry out -- think, a matrix
4618// GP: geometry in, pixels out. -- think, a memory gather
4619// PP: pixels in, pixels out. -- think, a blend mode
4620//
4621// (Some stages ignore their inputs or produce no logical output. That's perfectly fine.)
4622//
4623// These three STAGE_ macros let you define each type of stage,
4624// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
4625
#if JUMPER_NARROW_STAGES
    // GG: joins the 16-bit registers into float (x,y), runs the body, splits the results back,
    // then tail-calls the next stage.
    #define STAGE_GG(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F& x, F& y); \
        static void ABI name(Params* params, SkRasterPipelineStage* program, \
                             U16 r, U16 g, U16 b, U16 a) { \
            auto x = join<F>(r,g), \
                 y = join<F>(b,a); \
            name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \
            split(x, &r,&g); \
            split(y, &b,&a); \
            auto fn = (Stage)(++program)->fn; \
            fn(params, program, r,g,b,a); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F& x, F& y)

    // GP: float (x,y) geometry in, 16-bit pixel channels out.
    #define STAGE_GP(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F x, F y, \
                         U16& r, U16& g, U16& b, U16& a, \
                         U16& dr, U16& dg, U16& db, U16& da); \
        static void ABI name(Params* params, SkRasterPipelineStage* program, \
                             U16 r, U16 g, U16 b, U16 a) { \
            auto x = join<F>(r,g), \
                 y = join<F>(b,a); \
            name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \
                     params->dr,params->dg,params->db,params->da); \
            auto fn = (Stage)(++program)->fn; \
            fn(params, program, r,g,b,a); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F x, F y, \
                         U16& r, U16& g, U16& b, U16& a, \
                         U16& dr, U16& dg, U16& db, U16& da)

    // PP: 16-bit pixel channels in and out; no geometry.
    #define STAGE_PP(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, \
                         U16& r, U16& g, U16& b, U16& a, \
                         U16& dr, U16& dg, U16& db, U16& da); \
        static void ABI name(Params* params, SkRasterPipelineStage* program, \
                             U16 r, U16 g, U16 b, U16 a) { \
            name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \
                     params->dr,params->dg,params->db,params->da); \
            auto fn = (Stage)(++program)->fn; \
            fn(params, program, r,g,b,a); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, \
                         U16& r, U16& g, U16& b, U16& a, \
                         U16& dr, U16& dg, U16& db, U16& da)
#else
    // Same three stage shapes, but with all state passed in registers (wide ABI).
    #define STAGE_GG(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F& x, F& y); \
        static void ABI name(size_t tail, SkRasterPipelineStage* program, \
                             size_t dx, size_t dy, \
                             U16 r, U16 g, U16 b, U16 a, \
                             U16 dr, U16 dg, U16 db, U16 da) { \
            auto x = join<F>(r,g), \
                 y = join<F>(b,a); \
            name##_k(Ctx{program}, dx,dy,tail, x,y); \
            split(x, &r,&g); \
            split(y, &b,&a); \
            auto fn = (Stage)(++program)->fn; \
            fn(tail, program, dx,dy, r,g,b,a, dr,dg,db,da); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F& x, F& y)

    #define STAGE_GP(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F x, F y, \
                         U16& r, U16& g, U16& b, U16& a, \
                         U16& dr, U16& dg, U16& db, U16& da); \
        static void ABI name(size_t tail, SkRasterPipelineStage* program, \
                             size_t dx, size_t dy, \
                             U16 r, U16 g, U16 b, U16 a, \
                             U16 dr, U16 dg, U16 db, U16 da) { \
            auto x = join<F>(r,g), \
                 y = join<F>(b,a); \
            name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
            auto fn = (Stage)(++program)->fn; \
            fn(tail, program, dx,dy, r,g,b,a, dr,dg,db,da); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F x, F y, \
                         U16& r, U16& g, U16& b, U16& a, \
                         U16& dr, U16& dg, U16& db, U16& da)

    #define STAGE_PP(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, \
                         U16& r, U16& g, U16& b, U16& a, \
                         U16& dr, U16& dg, U16& db, U16& da); \
        static void ABI name(size_t tail, SkRasterPipelineStage* program, \
                             size_t dx, size_t dy, \
                             U16 r, U16 g, U16 b, U16 a, \
                             U16 dr, U16 dg, U16 db, U16 da) { \
            name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
            auto fn = (Stage)(++program)->fn; \
            fn(tail, program, dx,dy, r,g,b,a, dr,dg,db,da); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, \
                         U16& r, U16& g, U16& b, U16& a, \
                         U16& dr, U16& dg, U16& db, U16& da)
#endif
4723
4724// ~~~~~~ Commonly used helper functions ~~~~~~ //
4725
4726/**
 * Helpers to do properly rounded division (by 255). The ideal answer we want to compute is slow,
4728 * thanks to a division by a non-power of two:
4729 * [1] (v + 127) / 255
4730 *
4731 * There is a two-step process that computes the correct answer for all inputs:
4732 * [2] (v + 128 + ((v + 128) >> 8)) >> 8
4733 *
4734 * There is also a single iteration approximation, but it's wrong (+-1) ~25% of the time:
4735 * [3] (v + 255) >> 8;
4736 *
4737 * We offer two different implementations here, depending on the requirements of the calling stage.
4738 */
4739
4740/**
4741 * div255 favors speed over accuracy. It uses formula [2] on NEON (where we can compute it as fast
4742 * as [3]), and uses [3] elsewhere.
4743 */
// Fast, possibly off-by-one division of a 16-bit product by 255 (see formulas [2]/[3] above).
SI U16 div255(U16 v) {
#if defined(JUMPER_IS_NEON)
    // With NEON we can compute [2] just as fast as [3], so let's be correct.
    // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up:
    return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);
#else
    // Otherwise, use [3], which is never wrong by more than 1:
    return (v+255)/256;
#endif
}
4754
4755/**
4756 * div255_accurate guarantees the right answer on all platforms, at the expense of performance.
4757 */
// Exactly-rounded division by 255 on every platform (formula [2] above).
SI U16 div255_accurate(U16 v) {
#if defined(JUMPER_IS_NEON)
    // Our NEON implementation of div255 is already correct for all inputs:
    return div255(v);
#else
    // This is [2] (the same formulation as NEON), but written without the benefit of intrinsics:
    v += 128;
    return (v+(v/256))/256;
#endif
}
4768
// Lane-wise alpha complement: 255 - v.
SI U16 inv(U16 v) { return 255-v; }

// Lane-wise select; each lane of c must be all-ones (true) or all-zeros (false).
SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }

SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }

// Scales a float in [0,1] to [0,255] fixed point, rounding to nearest.
SI U16 from_float(float f) { return f * 255.0f + 0.5f; }

// Fixed-point lerp: returns `from` when t == 0 and `to` when t == 255.
SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
4780
// Lane-wise numeric conversion between two vectors of equal lane count.
template <typename D, typename S>
SI D cast(S src) {
    return __builtin_convertvector(src, D);
}

// Splits one wide vector into its low and high halves (bitwise, via memcpy).
template <typename D, typename S>
SI void split(S v, D* lo, D* hi) {
    static_assert(2*sizeof(D) == sizeof(S), "");
    memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
    memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
}
// Joins two half-width vectors into one wide vector (inverse of split).
template <typename D, typename S>
SI D join(S lo, S hi) {
    static_assert(sizeof(D) == 2*sizeof(S), "");
    D v;
    memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
    memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
    return v;
}
4800
// Lane-wise float select, implemented with bit masks on the float bit patterns.
SI F if_then_else(I32 c, F t, F e) {
    return sk_bit_cast<F>( (sk_bit_cast<I32>(t) & c) | (sk_bit_cast<I32>(e) & ~c) );
}
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
SI F min(F x, F y) { return if_then_else(x < y, x, y); }

SI I32 if_then_else(I32 c, I32 t, I32 e) {
    return (t & c) | (e & ~c);
}
SI I32 max(I32 x, I32 y) { return if_then_else(x < y, y, x); }
SI I32 min(I32 x, I32 y) { return if_then_else(x < y, x, y); }

// Multiply-add; the compiler is free to fuse this into an FMA where available.
SI F mad(F f, F m, F a) { return f*m+a; }
// Truncate toward zero via a signed conversion. NOTE(review): assumes x fits in int32 range.
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
4815
4816// Use approximate instructions and one Newton-Raphson step to calculate 1/x.
// Computes 1/x by splitting the wide vector into native-width halves and using the
// platform's refined reciprocal; falls back to a true divide in scalar-ish builds.
SI F rcp_precise(F x) {
#if defined(JUMPER_IS_HSW)
    __m256 lo,hi;
    split(x, &lo,&hi);
    return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
    __m128 lo,hi;
    split(x, &lo,&hi);
    return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
#elif defined(JUMPER_IS_NEON)
    float32x4_t lo,hi;
    split(x, &lo,&hi);
    return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
#else
    return 1.0f / x;
#endif
}
// Lane-wise square root, dispatched to the platform's vector sqrt in half-width pieces.
SI F sqrt_(F x) {
#if defined(JUMPER_IS_HSW)
    __m256 lo,hi;
    split(x, &lo,&hi);
    return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
    __m128 lo,hi;
    split(x, &lo,&hi);
    return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi));
#elif defined(SK_CPU_ARM64)
    float32x4_t lo,hi;
    split(x, &lo,&hi);
    return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi));
#elif defined(JUMPER_IS_NEON)
    // 32-bit NEON has no sqrt instruction; refine a reciprocal-sqrt estimate instead.
    auto sqrt = [](float32x4_t v) {
        auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
        est *= vrsqrtsq_f32(v,est*est);
        est *= vrsqrtsq_f32(v,est*est);
        return v*est; // sqrt(v) == v*rsqrt(v).
    };
    float32x4_t lo,hi;
    split(x, &lo,&hi);
    return join<F>(sqrt(lo), sqrt(hi));
#else
    // Scalar fallback (8-wide F on these targets).
    return F{
        sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
        sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
    };
#endif
}
4864
// Lane-wise floor(). Platforms without a round-down instruction reconstruct it from a
// truncating int round-trip plus a correction for values that rounded up.
SI F floor_(F x) {
#if defined(SK_CPU_ARM64)
    float32x4_t lo,hi;
    split(x, &lo,&hi);
    return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
#elif defined(JUMPER_IS_HSW)
    __m256 lo,hi;
    split(x, &lo,&hi);
    return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi));
#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
    __m128 lo,hi;
    split(x, &lo,&hi);
    return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi));
#else
    // int conversion truncates toward zero, so subtract 1 where that overshot x.
    F roundtrip = cast<F>(cast<I32>(x));
    return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
#endif
}
4883
4884// scaled_mult interprets a and b as number on [-1, 1) which are numbers in Q15 format. Functionally
4885// this multiply is:
4886// (2 * a * b + (1 << 15)) >> 16
4887// The result is a number on [-1, 1).
4888// Note: on neon this is a saturating multiply while the others are not.
// Q15 fixed-point multiply with rounding; see the comment block above for the exact formula.
SI I16 scaled_mult(I16 a, I16 b) {
#if defined(JUMPER_IS_HSW)
    return _mm256_mulhrs_epi16(a, b);
#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
    return _mm_mulhrs_epi16(a, b);
#elif defined(SK_CPU_ARM64)
    return vqrdmulhq_s16(a, b);
#elif defined(JUMPER_IS_NEON)
    return vqrdmulhq_s16(a, b);
#else
    // Widen to 32 bits, round at bit 14, then shift back down into Q15.
    const I32 roundingTerm = 1 << 14;
    return cast<I16>((cast<I32>(a) * cast<I32>(b) + roundingTerm) >> 15);
#endif
}
4903
4904// This sum is to support lerp where the result will always be a positive number. In general,
4905// a sum like this would require an additional bit, but because we know the range of the result
4906// we know that the extra bit will always be zero.
// Adds a signed and an unsigned 16-bit vector whose true sum is known to fit in [0, 65535],
// so the mixed-signedness add needs no widening. Debug builds verify that precondition.
SI U16 constrained_add(I16 a, U16 b) {
    #if defined(SK_DEBUG)
    for (size_t i = 0; i < N; i++) {
        // Ensure that a + b is on the interval [0, UINT16_MAX]
        int ia = a[i],
            ib = b[i];
        // Use 65535 here because fuchsia's compiler evaluates UINT16_MAX - ib, which is
        // 65536U - ib, as an uint32_t instead of an int32_t. This was forcing ia to be
        // interpreted as an uint32_t.
        SkASSERT(-ib <= ia && ia <= 65535 - ib);
    }
    #endif
    return b + a;
}
4921
// Fractional part of x; result is in [0,1).
SI F fract(F x) { return x - floor_(x); }
// Absolute value by clearing the IEEE sign bit.
SI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & 0x7fffffff ); }
4924
4925// ~~~~~~ Basic / misc. stages ~~~~~~ //
4926
// Seeds (x,y) with the pixel-center coordinates of the current batch: dx + lane + 0.5.
STAGE_GG(seed_shader, NoCtx) {
    static constexpr float iota[] = {
        0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
        8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
    };
    x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota);
    y = cast<F>(I32(dy)) + 0.5f;
}
4935
// Applies a pure translation: m = {tx, ty}.
STAGE_GG(matrix_translate, const float* m) {
    x += m[0];
    y += m[1];
}
// Applies scale then translate: m = {sx, sy, tx, ty}.
STAGE_GG(matrix_scale_translate, const float* m) {
    x = mad(x,m[0], m[2]);
    y = mad(y,m[1], m[3]);
}
// Applies a full 2x3 affine matrix (column-major): m = {sx, ky, kx, sy, tx, ty}.
STAGE_GG(matrix_2x3, const float* m) {
    auto X = mad(x,m[0], mad(y,m[1], m[2])),
         Y = mad(x,m[3], mad(y,m[4], m[5]));
    x = X;
    y = Y;
}
// Applies a 3x3 perspective matrix, dividing through by the projected w (Z here).
STAGE_GG(matrix_perspective, const float* m) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto X = mad(x,m[0], mad(y,m[1], m[2])),
         Y = mad(x,m[3], mad(y,m[4], m[5])),
         Z = mad(x,m[6], mad(y,m[7], m[8]));
    x = X * rcp_precise(Z);
    y = Y * rcp_precise(Z);
}
4958
// Loads a constant color (already converted to 0-255 fixed point) into the src channels.
STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
    r = c->rgba[0];
    g = c->rgba[1];
    b = c->rgba[2];
    a = c->rgba[3];
}
// Same as uniform_color, but targets the dst channels.
STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
    dr = c->rgba[0];
    dg = c->rgba[1];
    db = c->rgba[2];
    da = c->rgba[3];
}
STAGE_PP(black_color, NoCtx) { r = g = b = 0; a = 255; }
STAGE_PP(white_color, NoCtx) { r = g = b = 255; a = 255; }

// Sets RGB from three floats in [0,1]; alpha is left untouched.
STAGE_PP(set_rgb, const float rgb[3]) {
    r = from_float(rgb[0]);
    g = from_float(rgb[1]);
    b = from_float(rgb[2]);
}
4979
// No need to clamp against 0 here (values are unsigned)
STAGE_PP(clamp_01, NoCtx) {
    r = min(r, 255);
    g = min(g, 255);
    b = min(b, 255);
    a = min(a, 255);
}

// Clamps each color channel to alpha, keeping the color a valid premultiplied value.
STAGE_PP(clamp_gamut, NoCtx) {
    a = min(a, 255);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}
4994
// Premultiplies the src color by its alpha, with exact rounding.
STAGE_PP(premul, NoCtx) {
    r = div255_accurate(r * a);
    g = div255_accurate(g * a);
    b = div255_accurate(b * a);
}
// Premultiplies the dst color by its alpha, with exact rounding.
STAGE_PP(premul_dst, NoCtx) {
    dr = div255_accurate(dr * da);
    dg = div255_accurate(dg * da);
    db = div255_accurate(db * da);
}

STAGE_PP(force_opaque , NoCtx) { a = 255; }
STAGE_PP(force_opaque_dst, NoCtx) { da = 255; }
5008
// Swaps the red and blue channels of src (RGBA <-> BGRA).
STAGE_PP(swap_rb, NoCtx) {
    auto tmp = r;
    r = b;
    b = tmp;
}
// Swaps the red and blue channels of dst.
STAGE_PP(swap_rb_dst, NoCtx) {
    auto tmp = dr;
    dr = db;
    db = tmp;
}

// Copies src into dst.
STAGE_PP(move_src_dst, NoCtx) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}

// Copies dst into src.
STAGE_PP(move_dst_src, NoCtx) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

// Exchanges src and dst.
STAGE_PP(swap_src_dst, NoCtx) {
    std::swap(r, dr);
    std::swap(g, dg);
    std::swap(b, db);
    std::swap(a, da);
}
5040
5041// ~~~~~~ Blend modes ~~~~~~ //
5042
5043// The same logic applied to all 4 channels.
// Defines a blend mode whose formula applies uniformly to all four channels
// (alpha included): name##_channel(src, dst, srcAlpha, dstAlpha).
#define BLEND_MODE(name) \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, NoCtx) { \
        r = name##_channel(r,dr,a,da); \
        g = name##_channel(g,dg,a,da); \
        b = name##_channel(b,db,a,da); \
        a = name##_channel(a,da,a,da); \
    } \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

// The two branches differ only in which modes use div255 vs. div255_accurate.
#if defined(SK_USE_INACCURATE_DIV255_IN_BLEND)
    BLEND_MODE(clear) { return 0; }
    BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
    BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
    BLEND_MODE(srcin) { return div255( s*da ); }
    BLEND_MODE(dstin) { return div255( d*sa ); }
    BLEND_MODE(srcout) { return div255( s*inv(da) ); }
    BLEND_MODE(dstout) { return div255( d*inv(sa) ); }
    BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); }
    BLEND_MODE(dstover) { return d + div255( s*inv(da) ); }
    BLEND_MODE(modulate) { return div255( s*d ); }
    BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
    BLEND_MODE(plus_) { return min(s+d, 255); }
    BLEND_MODE(screen) { return s + d - div255( s*d ); }
    BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
#else
    BLEND_MODE(clear) { return 0; }
    BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
    BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
    BLEND_MODE(srcin) { return div255_accurate( s*da ); }
    BLEND_MODE(dstin) { return div255_accurate( d*sa ); }
    BLEND_MODE(srcout) { return div255_accurate( s*inv(da) ); }
    BLEND_MODE(dstout) { return div255_accurate( d*inv(sa) ); }
    BLEND_MODE(srcover) { return s + div255_accurate( d*inv(sa) ); }
    BLEND_MODE(dstover) { return d + div255_accurate( s*inv(da) ); }
    BLEND_MODE(modulate) { return div255_accurate( s*d ); }
    BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
    BLEND_MODE(plus_) { return min(s+d, 255); }
    BLEND_MODE(screen) { return s + d - div255_accurate( s*d ); }
    BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
#endif
#undef BLEND_MODE
5086
5087// The same logic applied to color, and srcover for alpha.
// Defines a blend mode whose formula applies only to color channels;
// alpha always uses srcover (a + da*(1-a)).
#define BLEND_MODE(name) \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, NoCtx) { \
        r = name##_channel(r,dr,a,da); \
        g = name##_channel(g,dg,a,da); \
        b = name##_channel(b,db,a,da); \
        a = a + div255( da*inv(a) ); \
    } \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); }
    BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); }
    BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
    BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); }

    // hardlight/overlay are the same formula with the roles of s and d swapped in the condition.
    BLEND_MODE(hardlight) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
    BLEND_MODE(overlay) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
#undef BLEND_MODE
5112
5113// ~~~~~~ Helpers for interacting with memory ~~~~~~ //
5114
5115template <typename T>
5116SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
5117 return (T*)ctx->pixels + dy*ctx->stride + dx;
5118}
5119
// Clamps float coords into the image bounds and converts them to flat pixel indices,
// also returning the image's base pointer.
template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
    // Exclusive -> inclusive.
    const F w = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->width ) - 1),
            h = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->height) - 1);

    // Smallest positive normal float: clamping to z (not 0) keeps -0 and negatives out.
    const F z = std::numeric_limits<float>::min();

    x = min(max(z, x), w);
    y = min(max(z, y), h);

    // Optionally nudge exact-integer coords down one ULP so they truncate to the lower pixel.
    x = sk_bit_cast<F>(sk_bit_cast<U32>(x) - (uint32_t)ctx->roundDownAtInteger);
    y = sk_bit_cast<F>(sk_bit_cast<U32>(y) - (uint32_t)ctx->roundDownAtInteger);

    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}
5137
// Integer-coordinate variant: clamps (x,y) into bounds and returns flat pixel indices.
template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, I32 x, I32 y) {
    // This flag doesn't make sense when the coords are integers.
    SkASSERT(ctx->roundDownAtInteger == 0);
    // Exclusive -> inclusive.
    const I32 w = ctx->width - 1,
              h = ctx->height - 1;

    U32 ax = cast<U32>(min(max(0, x), w)),
        ay = cast<U32>(min(max(0, y), h));

    *ptr = (const T*)ctx->pixels;
    return ay * ctx->stride + ax;
}
5152
// Loads up to N elements from ptr into a vector; `tail` is the element count for a
// partial batch (0 means a full N). Power-of-two prefixes use a single memcpy,
// remaining elements fall through one lane at a time.
template <typename V, typename T>
SI V load(const T* ptr, size_t tail) {
    V v = 0;
    switch (tail & (N-1)) {
        case 0: memcpy(&v, ptr, sizeof(v)); break;
        #if defined(JUMPER_IS_HSW)
        case 15: v[14] = ptr[14]; [[fallthrough]];
        case 14: v[13] = ptr[13]; [[fallthrough]];
        case 13: v[12] = ptr[12]; [[fallthrough]];
        case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
        case 11: v[10] = ptr[10]; [[fallthrough]];
        case 10: v[ 9] = ptr[ 9]; [[fallthrough]];
        case 9: v[ 8] = ptr[ 8]; [[fallthrough]];
        case 8: memcpy(&v, ptr, 8*sizeof(T)); break;
        #endif
        case 7: v[ 6] = ptr[ 6]; [[fallthrough]];
        case 6: v[ 5] = ptr[ 5]; [[fallthrough]];
        case 5: v[ 4] = ptr[ 4]; [[fallthrough]];
        case 4: memcpy(&v, ptr, 4*sizeof(T)); break;
        case 3: v[ 2] = ptr[ 2]; [[fallthrough]];
        case 2: memcpy(&v, ptr, 2*sizeof(T)); break;
        case 1: v[ 0] = ptr[ 0];
    }
    return v;
}
// Mirror of load(): write all N lanes when tail == 0, otherwise write only
// the first `tail` lanes, never touching memory past them.
template <typename V, typename T>
SI void store(T* ptr, size_t tail, V v) {
    switch (tail & (N-1)) {
        case  0: memcpy(ptr, &v, sizeof(v)); break;
    #if defined(JUMPER_IS_HSW)
        case 15: ptr[14] = v[14]; [[fallthrough]];
        case 14: ptr[13] = v[13]; [[fallthrough]];
        case 13: ptr[12] = v[12]; [[fallthrough]];
        case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
        case 11: ptr[10] = v[10]; [[fallthrough]];
        case 10: ptr[ 9] = v[ 9]; [[fallthrough]];
        case  9: ptr[ 8] = v[ 8]; [[fallthrough]];
        case  8: memcpy(ptr, &v, 8*sizeof(T)); break;
    #endif
        case  7: ptr[ 6] = v[ 6]; [[fallthrough]];
        case  6: ptr[ 5] = v[ 5]; [[fallthrough]];
        case  5: ptr[ 4] = v[ 4]; [[fallthrough]];
        case  4: memcpy(ptr, &v, 4*sizeof(T)); break;
        case  3: ptr[ 2] = v[ 2]; [[fallthrough]];
        case  2: memcpy(ptr, &v, 2*sizeof(T)); break;
        case  1: ptr[ 0] = v[ 0];
    }
}
5201
#if defined(JUMPER_IS_HSW)
    // Generic gather for the 16-lane HSW lowp configuration: one scalar load
    // per lane.
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
                  ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
                  ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
    }

    // AVX2 hardware gather for 4-byte elements (scale argument = 4).
    template<>
    F gather(const float* ptr, U32 ix) {
        __m256i lo, hi;
        split(ix, &lo, &hi);

        return join<F>(_mm256_i32gather_ps(ptr, lo, 4),
                       _mm256_i32gather_ps(ptr, hi, 4));
    }

    template<>
    U32 gather(const uint32_t* ptr, U32 ix) {
        __m256i lo, hi;
        split(ix, &lo, &hi);

        return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4),
                         _mm256_i32gather_epi32(ptr, hi, 4));
    }
#else
    // 8-lane generic gather: one scalar load per lane.
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
    }
#endif
5235
5236
5237// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
5238
// Unpack packed 8888 pixels into four 8-bit-valued U16 channel vectors.
// Layout per pixel: a|b|g|r from high byte to low byte.
SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if defined(JUMPER_IS_HSW)
    // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
    __m256i _01,_23;
    split(rgba, &_01, &_23);
    __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
            _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
    rgba = join<U32>(_02, _13);

    auto cast_U16 = [](U32 v) -> U16 {
        __m256i _02,_13;
        split(v, &_02,&_13);
        return _mm256_packus_epi32(_02,_13);
    };
#else
    auto cast_U16 = [](U32 v) -> U16 {
        return cast<U16>(v);
    };
#endif
    // Low 16 bits hold r (low byte) and g (high byte); high 16 bits hold b,a.
    *r = cast_U16(rgba & 65535) & 255;
    *g = cast_U16(rgba & 65535) >>  8;
    *b = cast_U16(rgba >>  16) & 255;
    *a = cast_U16(rgba >>  16) >>  8;
}
5263
// Load up to `tail` (or N when tail == 0) 8888 pixels and split into channels.
// The NEON path deinterleaves r/g/b/a in hardware with vld4; partial loads
// fill one pixel per lane from the highest index down via fallthrough.
// Lanes beyond `tail` are left unspecified on the partial path — downstream
// stages only consume the first `tail` lanes.
SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(JUMPER_IS_NEON)
    uint8x8x4_t rgba;
    switch (tail & (N-1)) {
        case 0: rgba = vld4_u8     ((const uint8_t*)(ptr+0)         ); break;
        case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6); [[fallthrough]];
        case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5); [[fallthrough]];
        case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4); [[fallthrough]];
        case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3); [[fallthrough]];
        case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2); [[fallthrough]];
        case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1); [[fallthrough]];
        case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
    }
    *r = cast<U16>(rgba.val[0]);
    *g = cast<U16>(rgba.val[1]);
    *b = cast<U16>(rgba.val[2]);
    *a = cast<U16>(rgba.val[3]);
#else
    from_8888(load<U32>(ptr, tail), r,g,b,a);
#endif
}
// Clamp channels to 8 bits and store them interleaved as 8888 pixels,
// honoring `tail` the same way load_8888_() does.  The NEON path interleaves
// with vst4 in hardware.
SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
    // Earlier stages can leave values above 255; clamp before narrowing.
    r = min(r, 255);
    g = min(g, 255);
    b = min(b, 255);
    a = min(a, 255);

#if 1 && defined(JUMPER_IS_NEON)
    uint8x8x4_t rgba = {{
        cast<U8>(r),
        cast<U8>(g),
        cast<U8>(b),
        cast<U8>(a),
    }};
    switch (tail & (N-1)) {
        case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break;
        case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6); [[fallthrough]];
        case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5); [[fallthrough]];
        case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4); [[fallthrough]];
        case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3); [[fallthrough]];
        case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2); [[fallthrough]];
        case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1); [[fallthrough]];
        case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
    }
#else
    // Pack r|g into the low 16 bits and b|a into the high 16 bits per pixel.
    store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
                   | cast<U32>(b | (a<<8)) << 16);
#endif
}
5313
// Load N 8888 pixels at (dx,dy) into the src registers r,g,b,a.
STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
    load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
// Same, but into the dst registers dr,dg,db,da.
STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
// Store the src registers back out as 8888 pixels.
STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
    store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}
// Gather 8888 pixels at per-lane coordinates (x,y).
STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
}
5328
5329// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
5330
5331SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
5332 // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
5333 U16 R = (rgb >> 11) & 31,
5334 G = (rgb >> 5) & 63,
5335 B = (rgb >> 0) & 31;
5336
5337 // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
5338 *r = (R << 3) | (R >> 2);
5339 *g = (G << 2) | (G >> 4);
5340 *b = (B << 3) | (B >> 2);
5341}
// Load up to `tail` 565 pixels and unpack into 8-bit r,g,b channels.
SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
    from_565(load<U16>(ptr, tail), r,g,b);
}
5345SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
5346 r = min(r, 255);
5347 g = min(g, 255);
5348 b = min(b, 255);
5349
5350 // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f.
5351 // (Don't feel like you need to find some fundamental truth in these...
5352 // they were brute-force searched.)
5353 U16 R = (r * 9 + 36) / 74, // 9/74 ≈ 31/255, plus 36/74, about half.
5354 G = (g * 21 + 42) / 85, // 21/85 = 63/255 exactly.
5355 B = (b * 9 + 36) / 74;
5356 // Pack them back into 15|rrrrr gggggg bbbbb|0.
5357 store(ptr, tail, R << 11
5358 | G << 5
5359 | B << 0);
5360}
5361
// 565 has no alpha channel, so loads synthesize fully-opaque alpha (255).
STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
    a = 255;
}
STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
    da = 255;
}
STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
    store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_565(gather<U16>(ptr, ix), &r, &g, &b);
    a = 255;
}
5379
5380SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
5381 // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
5382 U16 R = (rgba >> 12) & 15,
5383 G = (rgba >> 8) & 15,
5384 B = (rgba >> 4) & 15,
5385 A = (rgba >> 0) & 15;
5386
5387 // Scale [0,15] to [0,255].
5388 *r = (R << 4) | R;
5389 *g = (G << 4) | G;
5390 *b = (B << 4) | B;
5391 *a = (A << 4) | A;
5392}
// Load up to `tail` 4444 pixels and unpack into 8-bit r,g,b,a channels.
SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
    from_4444(load<U16>(ptr, tail), r,g,b,a);
}
5396SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
5397 r = min(r, 255);
5398 g = min(g, 255);
5399 b = min(b, 255);
5400 a = min(a, 255);
5401
5402 // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f).
5403 U16 R = (r + 8) / 17,
5404 G = (g + 8) / 17,
5405 B = (b + 8) / 17,
5406 A = (a + 8) / 17;
5407 // Pack them back into 15|rrrr gggg bbbb aaaa|0.
5408 store(ptr, tail, R << 12
5409 | G << 8
5410 | B << 4
5411 | A << 0);
5412}
5413
// 4444 stages: straight load/store/gather through the helpers above.
STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
    load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
    store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
}
STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
}
5428
// Unpack RG88 pixels: r in the low byte, g in the high byte.
SI void from_88(U16 rg, U16* r, U16* g) {
    *r = (rg & 0xFF);
    *g = (rg >> 8);
}
5433
// Load up to `tail` RG88 pixels.  The NEON path deinterleaves with vld2;
// partial loads fill one pixel per lane via the fallthrough chain (lanes
// past `tail` are left unspecified and unused downstream).
SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
#if 1 && defined(JUMPER_IS_NEON)
    uint8x8x2_t rg;
    switch (tail & (N-1)) {
        case 0: rg = vld2_u8     ((const uint8_t*)(ptr+0)       ); break;
        case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6); [[fallthrough]];
        case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5); [[fallthrough]];
        case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4); [[fallthrough]];
        case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3); [[fallthrough]];
        case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2); [[fallthrough]];
        case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1); [[fallthrough]];
        case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0);
    }
    *r = cast<U16>(rg.val[0]);
    *g = cast<U16>(rg.val[1]);
#else
    from_88(load<U16>(ptr, tail), r,g);
#endif
}
5453
// Clamp r,g to 8 bits and store interleaved as RG88, honoring `tail`.
// The NEON path interleaves with vst2 in hardware.
SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) {
    r = min(r, 255);
    g = min(g, 255);

#if 1 && defined(JUMPER_IS_NEON)
    uint8x8x2_t rg = {{
        cast<U8>(r),
        cast<U8>(g),
    }};
    switch (tail & (N-1)) {
        case 0: vst2_u8     ((uint8_t*)(ptr+0), rg   ); break;
        case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6); [[fallthrough]];
        case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5); [[fallthrough]];
        case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4); [[fallthrough]];
        case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3); [[fallthrough]];
        case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2); [[fallthrough]];
        case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1); [[fallthrough]];
        case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0);
    }
#else
    store(ptr, tail, cast<U16>(r | (g<<8)) <<  0);
#endif
}
5477
// RG88 has no blue or alpha, so loads synthesize b = 0 and opaque alpha.
STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
    load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g);
    b = 0;
    a = 255;
}
STAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg);
    db = 0;
    da = 255;
}
STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
    store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g);
}
STAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x, y);
    from_88(gather<U16>(ptr, ix), &r, &g);
    b = 0;
    a = 255;
}
5498
5499// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //
5500
// Load up to `tail` single bytes, widening to U16 lanes.
SI U16 load_8(const uint8_t* ptr, size_t tail) {
    return cast<U16>(load<U8>(ptr, tail));
}
// Clamp to 8 bits and store U16 lanes as single bytes.
SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
    v = min(v, 255);
    store(ptr, tail, cast<U8>(v));
}
5508
// a8 is alpha-only; color channels load as 0 (premul transparent black).
STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
    r = g = b = 0;
    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
    dr = dg = db = 0;
    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = 0;
    a = cast<U16>(gather<U8>(ptr, ix));
}
// r8 stores only the red channel as single bytes.
STAGE_PP(store_r8, const SkRasterPipeline_MemoryCtx* ctx) {
    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, r);
}
5529
// Broadcast alpha into the color channels (an opaque gray), then set a = 255.
STAGE_PP(alpha_to_gray, NoCtx) {
    r = g = b = a;
    a = 255;
}
STAGE_PP(alpha_to_gray_dst, NoCtx) {
    dr = dg = db = da;
    da = 255;
}
// Move alpha into red only, then set a = 255.
STAGE_PP(alpha_to_red, NoCtx) {
    r = a;
    a = 255;
}
STAGE_PP(alpha_to_red_dst, NoCtx) {
    dr = da;
    da = 255;
}
5546
// BT.709 luma = 0.2126 r + 0.7152 g + 0.0722 b, approximated with integer
// weights 54/183/19 over a denominator of 256.
STAGE_PP(bt709_luminance_or_luma_to_alpha, NoCtx) {
    a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
    r = g = b = 0;
}
// Same weighting, but written back into all three color channels.
STAGE_PP(bt709_luminance_or_luma_to_rgb, NoCtx) {
    r = g = b =(r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
}
5554
// ~~~~~~ Saving and restoring the src / dst registers ~~~~~~ //
5556
// These stages spill/restore the pipeline registers to a scratch buffer laid
// out as four consecutive planes of N uint16_t each: r, g, b, a.
STAGE_PP(load_src, const uint16_t* ptr) {
    r = sk_unaligned_load<U16>(ptr + 0*N);
    g = sk_unaligned_load<U16>(ptr + 1*N);
    b = sk_unaligned_load<U16>(ptr + 2*N);
    a = sk_unaligned_load<U16>(ptr + 3*N);
}
STAGE_PP(store_src, uint16_t* ptr) {
    sk_unaligned_store(ptr + 0*N, r);
    sk_unaligned_store(ptr + 1*N, g);
    sk_unaligned_store(ptr + 2*N, b);
    sk_unaligned_store(ptr + 3*N, a);
}
// Alpha-only spill (single plane of N uint16_t).
STAGE_PP(store_src_a, uint16_t* ptr) {
    sk_unaligned_store(ptr, a);
}
STAGE_PP(load_dst, const uint16_t* ptr) {
    dr = sk_unaligned_load<U16>(ptr + 0*N);
    dg = sk_unaligned_load<U16>(ptr + 1*N);
    db = sk_unaligned_load<U16>(ptr + 2*N);
    da = sk_unaligned_load<U16>(ptr + 3*N);
}
STAGE_PP(store_dst, uint16_t* ptr) {
    sk_unaligned_store(ptr + 0*N, dr);
    sk_unaligned_store(ptr + 1*N, dg);
    sk_unaligned_store(ptr + 2*N, db);
    sk_unaligned_store(ptr + 3*N, da);
}
5584
5585// ~~~~~~ Coverage scales / lerps ~~~~~~ //
5586
// "scale" stages multiply all channels by a coverage value c in [0,255],
// renormalizing the 16-bit product with div255().  "lerp" stages blend
// src toward dst by the same coverage.
STAGE_PP(scale_1_float, const float* f) {
    U16 c = from_float(*f);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_1_float, const float* f) {
    U16 c = from_float(*f);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
// "native" variants read per-lane coverage directly from a U16 plane.
STAGE_PP(scale_native, const uint16_t scales[]) {
    auto c = sk_unaligned_load<U16>(scales);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}

STAGE_PP(lerp_native, const uint16_t scales[]) {
    auto c = sk_unaligned_load<U16>(scales);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

// u8 variants read per-pixel coverage from an a8 buffer at (dx,dy).
STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
5631
5632// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
5633SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
5634 return if_then_else(a < da, min(cr, min(cg,cb))
5635 , max(cr, max(cg,cb)));
5636}
// 565 coverage: per-channel rgb coverage comes from a 565 buffer, and the
// alpha coverage is synthesized by alpha_coverage_from_rgb_coverage().
STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = div255( r * cr );
    g = div255( g * cg );
    b = div255( b * cb );
    a = div255( a * ca );
}
STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = lerp(da, a, ca);
}
5657
// Emboss: per-pixel multiply-then-add lighting from two a8 planes, with each
// color channel capped at alpha so the result stays a valid premul color.
STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
    U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail),
        add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail);

    r = min(div255(r*mul) + add, a);
    g = min(div255(g*mul) + add, a);
    b = min(div255(b*mul) + add, a);
}
5666
5667
5668// ~~~~~~ Gradient stages ~~~~~~ //
5669
5670// Clamp x to [0,1], both sides inclusive (think, gradients).
5671// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
SI F clamp_01_(F v) { return min(max(0, v), 1); }

STAGE_GG(clamp_x_1 , NoCtx) { x = clamp_01_(x); }
// Repeat tiling: keep only the fractional part of x.
STAGE_GG(repeat_x_1, NoCtx) { x = clamp_01_(x - floor_(x)); }
// Mirror tiling: reflect x back and forth across [0,1] (period 2).
STAGE_GG(mirror_x_1, NoCtx) {
    auto two = [](F x){ return x+x; };
    x = clamp_01_(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
}
5680
// Narrow a 32-bit comparison mask down to 16-bit lanes.
SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }

// Decal tiling: record which lanes have coordinates inside [0, limit) so a
// later check_decal_mask stage can zero out the out-of-bounds lanes.
STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
    auto w = ctx->limit_x;
    sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
}
STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
    auto h = ctx->limit_y;
    sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
}
STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
    auto w = ctx->limit_x;
    auto h = ctx->limit_y;
    sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
}
// Clamp both coordinates to a caller-provided box.
STAGE_GG(clamp_x_and_y, SkRasterPipeline_CoordClampCtx* ctx) {
    x = min(ctx->max_x, max(ctx->min_x, x));
    y = min(ctx->max_y, max(ctx->min_y, y));
}
// Apply the mask recorded by the decal stages: out-of-bounds lanes go to 0.
STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
    auto mask = sk_unaligned_load<U16>(ctx->mask);
    r = r & mask;
    g = g & mask;
    b = b & mask;
    a = a & mask;
}
5707
5708SI void round_F_to_U16(F R, F G, F B, F A, U16* r, U16* g, U16* b, U16* a) {
5709 auto round_color = [](F x) { return cast<U16>(x * 255.0f + 0.5f); };
5710
5711 *r = round_color(min(max(0, R), 1));
5712 *g = round_color(min(max(0, G), 1));
5713 *b = round_color(min(max(0, B), 1));
5714 *a = round_color(A); // we assume alpha is already in [0,1].
5715}
5716
// Given per-lane stop indices `idx` and interpolation parameter `t`, evaluate
// each gradient channel as f*t + b (f = scale, b = bias per stop) and round
// the result into 8-bit U16 channels.
SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
                        U16* r, U16* g, U16* b, U16* a) {

    F fr, fg, fb, fa, br, bg, bb, ba;
#if defined(JUMPER_IS_HSW)
    // Fast path: with at most 8 stops, each channel's f/b tables fit in one
    // AVX register, so a cross-lane permute replaces the memory gather.
    if (c->stopCount <=8) {
        __m256i lo, hi;
        split(idx, &lo, &hi);

        fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
        br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
        fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
        bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
        fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
        bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
        fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
        ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
    } else
#endif
    {
        // General path: gather each channel's scale and bias per lane.
        fr = gather<F>(c->fs[0], idx);
        fg = gather<F>(c->fs[1], idx);
        fb = gather<F>(c->fs[2], idx);
        fa = gather<F>(c->fs[3], idx);
        br = gather<F>(c->bs[0], idx);
        bg = gather<F>(c->bs[1], idx);
        bb = gather<F>(c->bs[2], idx);
        ba = gather<F>(c->bs[3], idx);
    }
    round_F_to_U16(mad(t, fr, br),
                   mad(t, fg, bg),
                   mad(t, fb, bb),
                   mad(t, fa, ba),
                   r,g,b,a);
}
5760
// General gradient: find each lane's stop index by counting how many stop
// positions ts[i] lie at or below t, then interpolate via gradient_lookup().
STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) {
    auto t = x;
    U32 idx = 0;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    for (size_t i = 1; i < c->stopCount; i++) {
        idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
    }

    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

// Evenly spaced stops let the index be computed directly from t.
STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
    auto t = x;
    auto idx = trunc_(t * (c->stopCount-1));
    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

// Two-stop gradient: a single f*t + b per channel, no index lookup at all.
STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
    auto t = x;
    round_F_to_U16(mad(t, c->f[0], c->b[0]),
                   mad(t, c->f[1], c->b[1]),
                   mad(t, c->f[2], c->b[2]),
                   mad(t, c->f[3], c->b[3]),
                   &r,&g,&b,&a);
}
5787
// Bilinearly sample an 8888 texture at (x,y) with clamp tiling, entirely in
// 16-bit fixed point: the fractional position is carried in Q15 (tx, ty) and
// the four texel fetches are blended with lerpX then lerpY below.
STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
    // Quantize sample point and transform into lerp coordinates converting them to 16.16 fixed
    // point number.
    I32 qx = cast<I32>(floor_(65536.0f * x + 0.5f)) - 32768,
        qy = cast<I32>(floor_(65536.0f * y + 0.5f)) - 32768;

    // Calculate screen coordinates sx & sy by flooring qx and qy.
    I32 sx = qx >> 16,
        sy = qy >> 16;

    // We are going to perform a change of parameters for qx on [0, 1) to tx on [-1, 1).
    // This will put tx in Q15 format for use with q_mult.
    // Calculate tx and ty on the interval of [-1, 1). Give {qx} and {qy} are on the interval
    // [0, 1), where {v} is fract(v), we can transform to tx in the following manner ty follows
    // the same math:
    //    tx = 2 * {qx} - 1, so
    //    {qx} = (tx + 1) / 2.
    // Calculate {qx} - 1 and {qy} - 1 where the {} operation is handled by the cast, and the - 1
    // is handled by the ^ 0x8000, dividing by 2 is deferred and handled in lerpX and lerpY in
    // order to use the full 16-bit resolution.
    I16 tx = cast<I16>(qx ^ 0x8000),
        ty = cast<I16>(qy ^ 0x8000);

    // Substituting the {qx} by the equation for tx from above into the lerp equation where v is
    // the lerped value:
    //         v = {qx}*(R - L) + L,
    //         v = 1/2*(tx + 1)*(R - L) + L
    //     2 * v = (tx + 1)*(R - L) + 2*L
    //           = tx*R - tx*L + R - L + 2*L
    //           = tx*(R - L) + (R + L).
    // Since R and L are on [0, 255] we need them on the interval [0, 1/2] to get them into form
    // for Q15_mult. If L and R where in 16.16 format, this would be done by dividing by 2^9. In
    // code, we can multiply by 2^7 to get the value directly.
    //            2 * v = tx*(R - L) + (R + L)
    //     2^-9 * 2 * v = tx*(R - L)*2^-9 + (R + L)*2^-9
    //         2^-8 * v = 2^-9 * (tx*(R - L) + (R + L))
    //                v = 1/2 * (tx*(R - L) + (R + L))
    auto lerpX = [&](U16 left, U16 right) -> U16 {
        I16 width  = (I16)(right - left) << 7;
        U16 middle = (right + left) << 7;
        // The constrained_add is the most subtle part of lerp. The first term is on the interval
        // [-1, 1), and the second term is on the interval is on the interval [0, 1) because
        // both terms are too high by a factor of 2 which will be handled below. (Both R and L are
        // on [0, 1/2), but the sum R + L is on the interval [0, 1).) Generally, the sum below
        // should overflow, but because we know that sum produces an output on the
        // interval [0, 1) we know that the extra bit that would be needed will always be 0. So
        // we need to be careful to treat this sum as an unsigned positive number in the divide
        // by 2 below. Add +1 for rounding.
        U16 v2  = constrained_add(scaled_mult(tx, width), middle) + 1;
        // Divide by 2 to calculate v and at the same time bring the intermediate value onto the
        // interval [0, 1/2] to set up for the lerpY.
        return v2 >> 1;
    };

    // Fetch the four surrounding texels (clamped at the edges by ix_and_ptr).
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, sx, sy);
    U16 leftR, leftG, leftB, leftA;
    from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);

    ix = ix_and_ptr(&ptr, ctx, sx+1, sy);
    U16 rightR, rightG, rightB, rightA;
    from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);

    U16 topR = lerpX(leftR, rightR),
        topG = lerpX(leftG, rightG),
        topB = lerpX(leftB, rightB),
        topA = lerpX(leftA, rightA);

    ix = ix_and_ptr(&ptr, ctx, sx, sy+1);
    from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);

    ix = ix_and_ptr(&ptr, ctx, sx+1, sy+1);
    from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);

    U16 bottomR = lerpX(leftR, rightR),
        bottomG = lerpX(leftG, rightG),
        bottomB = lerpX(leftB, rightB),
        bottomA = lerpX(leftA, rightA);

    // lerpY plays the same mathematical tricks as lerpX, but the final divide is by 256 resulting
    // in a value on [0, 255].
    auto lerpY = [&](U16 top, U16 bottom) -> U16 {
        I16 width  = (I16)bottom - top;
        U16 middle = bottom + top;
        // Add + 0x80 for rounding.
        U16 blend  = constrained_add(scaled_mult(ty, width), middle) + 0x80;

        return blend >> 8;
    };

    r = lerpY(topR, bottomR);
    g = lerpY(topG, bottomG);
    b = lerpY(topB, bottomB);
    a = lerpY(topA, bottomA);
}
5883
// Map (x,y) to its angle around the origin, scaled to [0,1) (i.e. atan2/2π),
// writing the result into x.
STAGE_GG(xy_to_unit_angle, NoCtx) {
    F xabs = abs_(x),
      yabs = abs_(y);

    F slope = min(xabs, yabs)/max(xabs, yabs);
    F s = slope * slope;

    // Use a 7th degree polynomial to approximate atan.
    // This was generated using sollya.gforge.inria.fr.
    // A float optimized polynomial was generated using the following command.
    // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
    F phi = slope
             * (0.15912117063999176025390625f + s
             * (-5.185396969318389892578125e-2f + s
             * (2.476101927459239959716796875e-2f + s
             * (-7.0547382347285747528076171875e-3f))));

    // Fold the first-octant approximation out to the full circle by quadrant.
    phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
    phi = if_then_else(x < 0.0f   , 1.0f/2.0f - phi, phi);
    phi = if_then_else(y < 0.0f   , 1.0f - phi     , phi);
    phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN.
    x = phi;
}
// Map (x,y) to its distance from the origin, writing the result into x.
STAGE_GG(xy_to_radius, NoCtx) {
    x = sqrt_(x*x + y*y);
}
5910
5911// ~~~~~~ Compound stages ~~~~~~ //
5912
// Fused load + srcover + store for 8888 buffers:
// result = src + dst*(255 - src_alpha)/255, per channel.
STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888_(ptr, tail, &dr,&dg,&db,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888_(ptr, tail, r,g,b,a);
}
5923
5924// ~~~~~~ skgpu::Swizzle stage ~~~~~~ //
5925
5926STAGE_PP(swizzle, void* ctx) {
5927 auto ir = r, ig = g, ib = b, ia = a;
5928 U16* o[] = {&r, &g, &b, &a};
5929 char swiz[4];
5930 memcpy(swiz, &ctx, sizeof(swiz));
5931
5932 for (int i = 0; i < 4; ++i) {
5933 switch (swiz[i]) {
5934 case 'r': *o[i] = ir; break;
5935 case 'g': *o[i] = ig; break;
5936 case 'b': *o[i] = ib; break;
5937 case 'a': *o[i] = ia; break;
5938 case '0': *o[i] = U16(0); break;
5939 case '1': *o[i] = U16(255); break;
5940 default: break;
5941 }
5942 }
5943}
5944
5945#endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages
5946} // namespace lowp
5947
/* This gives us SK_OPTS::lowp::N if lowp::N has been set, or SK_OPTS::N if it hasn't.
   (When the lowp stages are compiled out above, the outer N is still visible here.) */
namespace lowp { static constexpr size_t lowp_N = N; }

/** Allow outside code to access the Raster Pipeline pixel strides (lanes per pass). */
constexpr size_t raster_pipeline_lowp_stride() { return lowp::lowp_N; }
constexpr size_t raster_pipeline_highp_stride() { return N; }
5954
5955} // namespace SK_OPTS_NS
5956
5957#undef SI
5958
5959#endif//SkRasterPipeline_opts_DEFINED
5960

source code of flutter_engine/third_party/skia/src/opts/SkRasterPipeline_opts.h