SkRasterPipeline_opts.h source code [flutter_engine/third_party/skia/src/opts/SkRasterPipeline_opts.h]

1	/*
2	* Copyright 2018 Google Inc.
3	*
4	* Use of this source code is governed by a BSD-style license that can be
5	* found in the LICENSE file.
6	*/
7
8	#ifndef SkRasterPipeline_opts_DEFINED
9	#define SkRasterPipeline_opts_DEFINED
10
11	#include "include/core/SkData.h"
12	#include "include/core/SkTypes.h"
13	#include "include/private/base/SkMalloc.h"
14	#include "modules/skcms/skcms.h"
15	#include "src/base/SkUtils.h" // unaligned_{load,store}
16	#include "src/core/SkRasterPipeline.h"
17	#include "src/core/SkRasterPipelineContextUtils.h"
18	#include "src/sksl/tracing/SkSLTraceHook.h"
19
20	#include <cstdint>
21	#include <type_traits>
22
// Every function in this file should be marked static and inline using SI.
#if defined(__clang__)
    #define SI __attribute__((always_inline)) static inline
#else
    #define SI static inline
#endif

// SK_UNROLL expands to Clang's "unroll" pragma, and to nothing on other compilers.
#if defined(__clang__)
    #define SK_UNROLL _Pragma("unroll")
#else
    #define SK_UNROLL
#endif

// Copies the bytes of `src` into the leading sizeof(Src) bytes of a strictly larger Dst.
// Both types must be trivially copyable. The trailing sizeof(Dst) - sizeof(Src) bytes of the
// result are never written, so callers must not rely on their value.
template <typename Dst, typename Src>
SI Dst widen_cast(const Src& src) {
    static_assert(sizeof(Dst) > sizeof(Src));
    static_assert(std::is_trivially_copyable<Dst>::value);
    static_assert(std::is_trivially_copyable<Src>::value);
    Dst dst;
    memcpy(&dst, &src, sizeof(Src));
    return dst;
}
45
46	struct Ctx {
47	SkRasterPipelineStage* fStage;
48
49	template <typename T>
50	operator T*() {
51	return (T*)fStage->ctx;
52	}
53	};
54
55	using NoCtx = const void*;
56
// Select exactly one backend macro. Non-Clang compilers always get the scalar path. With Clang
// we prefer NEON when SK_ARM_HAS_NEON is defined, otherwise the highest x86 level that
// SK_CPU_SSE_LEVEL reports (AVX2 -> HSW, AVX, SSE4.1, SSE2), falling back to scalar.
#if !defined(__clang__)
    #define JUMPER_IS_SCALAR
#elif defined(SK_ARM_HAS_NEON)
    #define JUMPER_IS_NEON
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    #define JUMPER_IS_HSW
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
    #define JUMPER_IS_AVX
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    #define JUMPER_IS_SSE41
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #define JUMPER_IS_SSE2
#else
    #define JUMPER_IS_SCALAR
#endif
72
// Older Clangs seem to crash when generating non-optimized NEON code for ARMv7.
#if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32)
    // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative.
    #if defined(__apple_build_version__) && __clang_major__ < 9
        #define JUMPER_IS_SCALAR
    #elif __clang_major__ < 5
        #define JUMPER_IS_SCALAR
    #endif

    // If we just forced the scalar path, drop the NEON selection so only one backend is active.
    #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR)
        #undef  JUMPER_IS_NEON
    #endif
#endif
86
// Pull in the one header the selected backend needs: libm for scalar code, otherwise the
// NEON or x86 intrinsics header.
#if defined(JUMPER_IS_SCALAR)
    #include <math.h>
#elif defined(JUMPER_IS_NEON)
    #include <arm_neon.h>
#else
    #include <immintrin.h>
#endif
94
// Notes:
// * rcp_fast and rcp_precise both produce a reciprocal, but rcp_fast is an estimate with at least
//   12 bits of precision while rcp_precise should be accurate for float size. For ARM rcp_precise
//   requires 2 Newton-Raphson refinement steps because its estimate has 8 bit precision, and for
//   Intel this requires one additional step because its estimate has 12 bit precision.
100
101	namespace SK_OPTS_NS {
102	#if defined(JUMPER_IS_SCALAR)
// This path should lead to portable scalar code: every "vector" type is a single lane.
using F   = float   ;
using I32 =  int32_t;
using U64 = uint64_t;
using U32 = uint32_t;
using U16 = uint16_t;
using U8  = uint8_t ;
110
111	SI F min(F a, F b) { return fminf(a,b); }
112	SI I32 min(I32 a, I32 b) { return a < b ? a : b; }
113	SI U32 min(U32 a, U32 b) { return a < b ? a : b; }
114	SI F max(F a, F b) { return fmaxf(a,b); }
115	SI I32 max(I32 a, I32 b) { return a > b ? a : b; }
116	SI U32 max(U32 a, U32 b) { return a > b ? a : b; }
117
118	SI F mad(F f, F m, F a) { return f*m+a; }
119	SI F abs_ (F v) { return fabsf(v); }
120	SI I32 abs_ (I32 v) { return v < `0` ? -v : v; }
121	SI F floor_(F v) { return floorf(v); }
122	SI F ceil_(F v) { return ceilf(v); }
123	SI F rcp_fast(F v) { return `1.0f` / v; }
124	SI F rsqrt (F v) { return `1.0f` / sqrtf(v); }
125	SI F sqrt_ (F v) { return sqrtf(v); }
126	SI F rcp_precise (F v) { return `1.0f` / v; }
127
128	SI U32 round(F v) { return (uint32_t)(v + `0.5f`); }
129	SI U32 round(F v, F scale) { return (uint32_t)(v*scale + `0.5f`); }
130	SI U16 pack(U32 v) { return (U16)v; }
131	SI U8 pack(U16 v) { return (U8)v; }
132
133	SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
134	SI bool any(I32 c) { return c != `0`; }
135	SI bool all(I32 c) { return c != `0`; }
136
137	template <typename T>
138	SI T gather(const T* p, U32 ix) { return p[ix]; }
139
140	template <typename T>
141	SI void scatter_masked(T src, T* dst, U32 ix, I32 mask) {
142	dst[ix] = mask ? src : dst[ix];
143	}
144
145	SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
146	*r = ptr[`0`];
147	*g = ptr[`1`];
148	}
149	SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
150	ptr[`0`] = r;
151	ptr[`1`] = g;
152	}
153	SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
154	*r = ptr[`0`];
155	*g = ptr[`1`];
156	*b = ptr[`2`];
157	}
158	SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
159	*r = ptr[`0`];
160	*g = ptr[`1`];
161	*b = ptr[`2`];
162	*a = ptr[`3`];
163	}
164	SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
165	ptr[`0`] = r;
166	ptr[`1`] = g;
167	ptr[`2`] = b;
168	ptr[`3`] = a;
169	}
170
171	SI void load2(const float* ptr, size_t tail, F* r, F* g) {
172	*r = ptr[`0`];
173	*g = ptr[`1`];
174	}
175	SI void store2(float* ptr, size_t tail, F r, F g) {
176	ptr[`0`] = r;
177	ptr[`1`] = g;
178	}
179	SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
180	*r = ptr[`0`];
181	*g = ptr[`1`];
182	*b = ptr[`2`];
183	*a = ptr[`3`];
184	}
185	SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
186	ptr[`0`] = r;
187	ptr[`1`] = g;
188	ptr[`2`] = b;
189	ptr[`3`] = a;
190	}
191
192	#elif defined(JUMPER_IS_NEON)
// Since we know we're using Clang, we can use its vector extensions.
// Every vector type on the NEON path has 4 lanes.
template <typename T> using V = T __attribute__((ext_vector_type(4)));
using F   = V<float   >;
using I32 = V< int32_t>;
using U64 = V<uint64_t>;
using U32 = V<uint32_t>;
using U16 = V<uint16_t>;
using U8  = V<uint8_t >;
201
202	// We polyfill a few routines that Clang doesn't build into ext_vector_types.
203	SI F min(F a, F b) { return vminq_f32(a,b); }
204	SI I32 min(I32 a, I32 b) { return vminq_s32(a,b); }
205	SI U32 min(U32 a, U32 b) { return vminq_u32(a,b); }
206	SI F max(F a, F b) { return vmaxq_f32(a,b); }
207	SI I32 max(I32 a, I32 b) { return vmaxq_s32(a,b); }
208	SI U32 max(U32 a, U32 b) { return vmaxq_u32(a,b); }
209
210	SI F abs_ (F v) { return vabsq_f32(v); }
211	SI I32 abs_ (I32 v) { return vabsq_s32(v); }
212	SI F rcp_fast(F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
213	SI F rcp_precise (F v) { auto e = rcp_fast(v); return vrecpsq_f32 (v,e ) * e; }
214	SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,ee) e; }
215
216	SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
217	SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
218
219	SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
220
221	#if defined(SK_CPU_ARM64)
222	SI bool any(I32 c) { return vmaxvq_u32((U32)c) != `0`; }
223	SI bool all(I32 c) { return vminvq_u32((U32)c) != `0`; }
224
225	SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
226	SI F floor_(F v) { return vrndmq_f32(v); }
227	SI F ceil_(F v) { return vrndpq_f32(v); }
228	SI F sqrt_(F v) { return vsqrtq_f32(v); }
229	SI U32 round(F v) { return vcvtnq_u32_f32(v); }
230	SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
231	#else
232	SI bool any(I32 c) { return c[`0`] \| c[`1`] \| c[`2`] \| c[`3`]; }
233	SI bool all(I32 c) { return c[`0`] & c[`1`] & c[`2`] & c[`3`]; }
234
235	SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
236	SI F floor_(F v) {
237	F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
238	return roundtrip - if_then_else(roundtrip > v, `1`, `0`);
239	}
240
241	SI F ceil_(F v) {
242	F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
243	return roundtrip + if_then_else(roundtrip < v, `1`, `0`);
244	}
245
246	SI F sqrt_(F v) {
247	auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v).
248	e = vrsqrtsq_f32(v,ee);
249	e = vrsqrtsq_f32(v,ee);
250	return ve; // sqrt(v) == vrsqrt(v).
251	}
252
253	SI U32 round(F v) {
254	return vcvtq_u32_f32(v + `0.5f`);
255	}
256
257	SI U32 round(F v, F scale) {
258	return vcvtq_u32_f32(mad(v,scale,`0.5f`));
259	}
260	#endif
261
262	template <typename T>
263	SI V<T> gather(const T* p, U32 ix) {
264	return {p[ix[`0`]], p[ix[`1`]], p[ix[`2`]], p[ix[`3`]]};
265	}
266	template <typename V, typename S>
267	SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
268	V before = gather(dst, ix);
269	V after = if_then_else(mask, src, before);
270	dst[ix[`0`]] = after[`0`];
271	dst[ix[`1`]] = after[`1`];
272	dst[ix[`2`]] = after[`2`];
273	dst[ix[`3`]] = after[`3`];
274	}
275	SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
276	uint16x4x2_t rg;
277	if (__builtin_expect(tail,`0`)) {
278	if ( true ) { rg = vld2_lane_u16(ptr + `0`, rg, `0`); }
279	if (tail > `1`) { rg = vld2_lane_u16(ptr + `2`, rg, `1`); }
280	if (tail > `2`) { rg = vld2_lane_u16(ptr + `4`, rg, `2`); }
281	} else {
282	rg = vld2_u16(ptr);
283	}
284	*r = rg.val[`0`];
285	*g = rg.val[`1`];
286	}
287	SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
288	if (__builtin_expect(tail,`0`)) {
289	if ( true ) { vst2_lane_u16(ptr + `0`, (uint16x4x2_t{{r,g}}), `0`); }
290	if (tail > `1`) { vst2_lane_u16(ptr + `2`, (uint16x4x2_t{{r,g}}), `1`); }
291	if (tail > `2`) { vst2_lane_u16(ptr + `4`, (uint16x4x2_t{{r,g}}), `2`); }
292	} else {
293	vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
294	}
295	}
296	SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
297	uint16x4x3_t rgb;
298	if (__builtin_expect(tail,`0`)) {
299	if ( true ) { rgb = vld3_lane_u16(ptr + `0`, rgb, `0`); }
300	if (tail > `1`) { rgb = vld3_lane_u16(ptr + `3`, rgb, `1`); }
301	if (tail > `2`) { rgb = vld3_lane_u16(ptr + `6`, rgb, `2`); }
302	} else {
303	rgb = vld3_u16(ptr);
304	}
305	*r = rgb.val[`0`];
306	*g = rgb.val[`1`];
307	*b = rgb.val[`2`];
308	}
309	SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
310	uint16x4x4_t rgba;
311	if (__builtin_expect(tail,`0`)) {
312	if ( true ) { rgba = vld4_lane_u16(ptr + `0`, rgba, `0`); }
313	if (tail > `1`) { rgba = vld4_lane_u16(ptr + `4`, rgba, `1`); }
314	if (tail > `2`) { rgba = vld4_lane_u16(ptr + `8`, rgba, `2`); }
315	} else {
316	rgba = vld4_u16(ptr);
317	}
318	*r = rgba.val[`0`];
319	*g = rgba.val[`1`];
320	*b = rgba.val[`2`];
321	*a = rgba.val[`3`];
322	}
323
324	SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
325	if (__builtin_expect(tail,`0`)) {
326	if ( true ) { vst4_lane_u16(ptr + `0`, (uint16x4x4_t{{r,g,b,a}}), `0`); }
327	if (tail > `1`) { vst4_lane_u16(ptr + `4`, (uint16x4x4_t{{r,g,b,a}}), `1`); }
328	if (tail > `2`) { vst4_lane_u16(ptr + `8`, (uint16x4x4_t{{r,g,b,a}}), `2`); }
329	} else {
330	vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
331	}
332	}
333	SI void load2(const float* ptr, size_t tail, F* r, F* g) {
334	float32x4x2_t rg;
335	if (__builtin_expect(tail,`0`)) {
336	if ( true ) { rg = vld2q_lane_f32(ptr + `0`, rg, `0`); }
337	if (tail > `1`) { rg = vld2q_lane_f32(ptr + `2`, rg, `1`); }
338	if (tail > `2`) { rg = vld2q_lane_f32(ptr + `4`, rg, `2`); }
339	} else {
340	rg = vld2q_f32(ptr);
341	}
342	*r = rg.val[`0`];
343	*g = rg.val[`1`];
344	}
345	SI void store2(float* ptr, size_t tail, F r, F g) {
346	if (__builtin_expect(tail,`0`)) {
347	if ( true ) { vst2q_lane_f32(ptr + `0`, (float32x4x2_t{{r,g}}), `0`); }
348	if (tail > `1`) { vst2q_lane_f32(ptr + `2`, (float32x4x2_t{{r,g}}), `1`); }
349	if (tail > `2`) { vst2q_lane_f32(ptr + `4`, (float32x4x2_t{{r,g}}), `2`); }
350	} else {
351	vst2q_f32(ptr, (float32x4x2_t{{r,g}}));
352	}
353	}
354	SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
355	float32x4x4_t rgba;
356	if (__builtin_expect(tail,`0`)) {
357	if ( true ) { rgba = vld4q_lane_f32(ptr + `0`, rgba, `0`); }
358	if (tail > `1`) { rgba = vld4q_lane_f32(ptr + `4`, rgba, `1`); }
359	if (tail > `2`) { rgba = vld4q_lane_f32(ptr + `8`, rgba, `2`); }
360	} else {
361	rgba = vld4q_f32(ptr);
362	}
363	*r = rgba.val[`0`];
364	*g = rgba.val[`1`];
365	*b = rgba.val[`2`];
366	*a = rgba.val[`3`];
367	}
368	SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
369	if (__builtin_expect(tail,`0`)) {
370	if ( true ) { vst4q_lane_f32(ptr + `0`, (float32x4x4_t{{r,g,b,a}}), `0`); }
371	if (tail > `1`) { vst4q_lane_f32(ptr + `4`, (float32x4x4_t{{r,g,b,a}}), `1`); }
372	if (tail > `2`) { vst4q_lane_f32(ptr + `8`, (float32x4x4_t{{r,g,b,a}}), `2`); }
373	} else {
374	vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
375	}
376	}
377
378	#elif defined(JUMPER_IS_HSW)
// These are __m256 and __m256i, but friendlier and strongly-typed.
// Every vector type on the HSW (AVX2) path has 8 lanes.
template <typename T> using V = T __attribute__((ext_vector_type(8)));
using F   = V<float   >;
using I32 = V< int32_t>;
using U64 = V<uint64_t>;
using U32 = V<uint32_t>;
using U16 = V<uint16_t>;
using U8  = V<uint8_t >;
387
388	SI F mad(F f, F m, F a) { return _mm256_fmadd_ps(f, m, a); }
389
390	SI F min(F a, F b) { return _mm256_min_ps(a,b); }
391	SI I32 min(I32 a, I32 b) { return _mm256_min_epi32(a,b); }
392	SI U32 min(U32 a, U32 b) { return _mm256_min_epu32(a,b); }
393	SI F max(F a, F b) { return _mm256_max_ps(a,b); }
394	SI I32 max(I32 a, I32 b) { return _mm256_max_epi32(a,b); }
395	SI U32 max(U32 a, U32 b) { return _mm256_max_epu32(a,b); }
396
397	SI F abs_ (F v) { return _mm256_and_ps(v, `0`-v); }
398	SI I32 abs_ (I32 v) { return _mm256_abs_epi32(v); }
399	SI F floor_(F v) { return _mm256_floor_ps(v); }
400	SI F ceil_(F v) { return _mm256_ceil_ps(v); }
401	SI F rcp_fast(F v) { return _mm256_rcp_ps (v); }
402	SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); }
403	SI F sqrt_ (F v) { return _mm256_sqrt_ps (v); }
404	SI F rcp_precise (F v) {
405	F e = rcp_fast(v);
406	return _mm256_fnmadd_ps(v, e, _mm256_set1_ps(`2.0f`)) * e;
407	}
408
409	SI U32 round(F v) { return _mm256_cvtps_epi32(v); }
410	SI U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
411	SI U16 pack(U32 v) {
412	return _mm_packus_epi32(_mm256_extractf128_si256(v, `0`),
413	_mm256_extractf128_si256(v, `1`));
414	}
415	SI U8 pack(U16 v) {
416	auto r = _mm_packus_epi16(v,v);
417	return sk_unaligned_load<U8>(&r);
418	}
419
420	SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
421	// NOTE: This version of 'all' only works with mask values (true == all bits set)
422	SI bool any(I32 c) { return !_mm256_testz_si256(c, _mm256_set1_epi32(-`1`)); }
423	SI bool all(I32 c) { return _mm256_testc_si256(c, _mm256_set1_epi32(-`1`)); }
424
425	template <typename T>
426	SI V<T> gather(const T* p, U32 ix) {
427	return { p[ix[`0`]], p[ix[`1`]], p[ix[`2`]], p[ix[`3`]],
428	p[ix[`4`]], p[ix[`5`]], p[ix[`6`]], p[ix[`7`]], };
429	}
430	SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, `4`); }
431	SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, `4`); }
432	SI U64 gather(const uint64_t* p, U32 ix) {
433	__m256i parts[] = {
434	_mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,`0`), `8`),
435	_mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,`1`), `8`),
436	};
437	return sk_bit_cast<U64>(parts);
438	}
439	template <typename V, typename S>
440	SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
441	V before = gather(dst, ix);
442	V after = if_then_else(mask, src, before);
443	dst[ix[`0`]] = after[`0`];
444	dst[ix[`1`]] = after[`1`];
445	dst[ix[`2`]] = after[`2`];
446	dst[ix[`3`]] = after[`3`];
447	dst[ix[`4`]] = after[`4`];
448	dst[ix[`5`]] = after[`5`];
449	dst[ix[`6`]] = after[`6`];
450	dst[ix[`7`]] = after[`7`];
451	}
452
453	SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
454	U16 _0123, _4567;
455	if (__builtin_expect(tail,`0`)) {
456	_0123 = _4567 = _mm_setzero_si128();
457	auto* d = &_0123;
458	if (tail > `3`) {
459	d = _mm_loadu_si128(((__m128i)ptr) + `0`);
460	tail -= `4`;
461	ptr += `8`;
462	d = &_4567;
463	}
464	bool high = false;
465	if (tail > `1`) {
466	*d = _mm_loadu_si64(ptr);
467	tail -= `2`;
468	ptr += `4`;
469	high = true;
470	}
471	if (tail > `0`) {
472	(d)[high ? `4` : `0`] = (ptr + `0`);
473	(d)[high ? `5` : `1`] = (ptr + `1`);
474	}
475	} else {
476	_0123 = _mm_loadu_si128(((__m128i*)ptr) + `0`);
477	_4567 = _mm_loadu_si128(((__m128i*)ptr) + `1`);
478	}
479	*r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, `16`), `16`),
480	_mm_srai_epi32(_mm_slli_epi32(_4567, `16`), `16`));
481	*g = _mm_packs_epi32(_mm_srai_epi32(_0123, `16`),
482	_mm_srai_epi32(_4567, `16`));
483	}
484	SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
485	auto _0123 = _mm_unpacklo_epi16(r, g),
486	_4567 = _mm_unpackhi_epi16(r, g);
487	if (__builtin_expect(tail,`0`)) {
488	const auto* s = &_0123;
489	if (tail > `3`) {
490	_mm_storeu_si128((__m128i)ptr, s);
491	s = &_4567;
492	tail -= `4`;
493	ptr += `8`;
494	}
495	bool high = false;
496	if (tail > `1`) {
497	_mm_storel_epi64((__m128i)ptr, s);
498	ptr += `4`;
499	tail -= `2`;
500	high = true;
501	}
502	if (tail > `0`) {
503	if (high) {
504	(int32_t)ptr = _mm_extract_epi32(*s, `2`);
505	} else {
506	(int32_t)ptr = _mm_cvtsi128_si32(*s);
507	}
508	}
509	} else {
510	_mm_storeu_si128((__m128i*)ptr + `0`, _0123);
511	_mm_storeu_si128((__m128i*)ptr + `1`, _4567);
512	}
513	}
514
515	SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
516	__m128i _0,_1,_2,_3,_4,_5,_6,_7;
517	if (__builtin_expect(tail,`0`)) {
518	auto load_rgb = [](const uint16_t* src) {
519	auto v = _mm_cvtsi32_si128((const* uint32_t*)src);
520	return _mm_insert_epi16(v, src[`2`], `2`);
521	};
522	_1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128();
523	if ( true ) { _0 = load_rgb(ptr + `0`); }
524	if (tail > `1`) { _1 = load_rgb(ptr + `3`); }
525	if (tail > `2`) { _2 = load_rgb(ptr + `6`); }
526	if (tail > `3`) { _3 = load_rgb(ptr + `9`); }
527	if (tail > `4`) { _4 = load_rgb(ptr + `12`); }
528	if (tail > `5`) { _5 = load_rgb(ptr + `15`); }
529	if (tail > `6`) { _6 = load_rgb(ptr + `18`); }
530	} else {
531	// Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over.
532	auto _01 = _mm_loadu_si128((const __m128i*)(ptr + `0`)) ;
533	auto _23 = _mm_loadu_si128((const __m128i*)(ptr + `6`)) ;
534	auto _45 = _mm_loadu_si128((const __m128i*)(ptr + `12`)) ;
535	auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + `16`)), `4`);
536	_0 = _01; _1 = _mm_srli_si128(_01, `6`);
537	_2 = _23; _3 = _mm_srli_si128(_23, `6`);
538	_4 = _45; _5 = _mm_srli_si128(_45, `6`);
539	_6 = _67; _7 = _mm_srli_si128(_67, `6`);
540	}
541
542	auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx
543	_13 = _mm_unpacklo_epi16(_1, _3),
544	_46 = _mm_unpacklo_epi16(_4, _6),
545	_57 = _mm_unpacklo_epi16(_5, _7);
546
547	auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
548	bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx
549	rg4567 = _mm_unpacklo_epi16(_46, _57),
550	bx4567 = _mm_unpackhi_epi16(_46, _57);
551
552	*r = _mm_unpacklo_epi64(rg0123, rg4567);
553	*g = _mm_unpackhi_epi64(rg0123, rg4567);
554	*b = _mm_unpacklo_epi64(bx0123, bx4567);
555	}
556	SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
557	__m128i _01, _23, _45, _67;
558	if (__builtin_expect(tail,`0`)) {
559	auto src = (const double*)ptr;
560	_01 = _23 = _45 = _67 = _mm_setzero_si128();
561	if (tail > `0`) { _01 = _mm_loadl_pd(_01, src+`0`); }
562	if (tail > `1`) { _01 = _mm_loadh_pd(_01, src+`1`); }
563	if (tail > `2`) { _23 = _mm_loadl_pd(_23, src+`2`); }
564	if (tail > `3`) { _23 = _mm_loadh_pd(_23, src+`3`); }
565	if (tail > `4`) { _45 = _mm_loadl_pd(_45, src+`4`); }
566	if (tail > `5`) { _45 = _mm_loadh_pd(_45, src+`5`); }
567	if (tail > `6`) { _67 = _mm_loadl_pd(_67, src+`6`); }
568	} else {
569	_01 = _mm_loadu_si128(((__m128i*)ptr) + `0`);
570	_23 = _mm_loadu_si128(((__m128i*)ptr) + `1`);
571	_45 = _mm_loadu_si128(((__m128i*)ptr) + `2`);
572	_67 = _mm_loadu_si128(((__m128i*)ptr) + `3`);
573	}
574
575	auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
576	_13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
577	_46 = _mm_unpacklo_epi16(_45, _67),
578	_57 = _mm_unpackhi_epi16(_45, _67);
579
580	auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
581	ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3
582	rg4567 = _mm_unpacklo_epi16(_46, _57),
583	ba4567 = _mm_unpackhi_epi16(_46, _57);
584
585	*r = _mm_unpacklo_epi64(rg0123, rg4567);
586	*g = _mm_unpackhi_epi64(rg0123, rg4567);
587	*b = _mm_unpacklo_epi64(ba0123, ba4567);
588	*a = _mm_unpackhi_epi64(ba0123, ba4567);
589	}
590	SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
591	auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3
592	rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7
593	ba0123 = _mm_unpacklo_epi16(b, a),
594	ba4567 = _mm_unpackhi_epi16(b, a);
595
596	auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
597	_23 = _mm_unpackhi_epi32(rg0123, ba0123),
598	_45 = _mm_unpacklo_epi32(rg4567, ba4567),
599	_67 = _mm_unpackhi_epi32(rg4567, ba4567);
600
601	if (__builtin_expect(tail,`0`)) {
602	auto dst = (double*)ptr;
603	if (tail > `0`) { _mm_storel_pd(dst+`0`, _01); }
604	if (tail > `1`) { _mm_storeh_pd(dst+`1`, _01); }
605	if (tail > `2`) { _mm_storel_pd(dst+`2`, _23); }
606	if (tail > `3`) { _mm_storeh_pd(dst+`3`, _23); }
607	if (tail > `4`) { _mm_storel_pd(dst+`4`, _45); }
608	if (tail > `5`) { _mm_storeh_pd(dst+`5`, _45); }
609	if (tail > `6`) { _mm_storel_pd(dst+`6`, _67); }
610	} else {
611	_mm_storeu_si128((__m128i*)ptr + `0`, _01);
612	_mm_storeu_si128((__m128i*)ptr + `1`, _23);
613	_mm_storeu_si128((__m128i*)ptr + `2`, _45);
614	_mm_storeu_si128((__m128i*)ptr + `3`, _67);
615	}
616	}
617
618	SI void load2(const float* ptr, size_t tail, F* r, F* g) {
619	F _0123, _4567;
620	if (__builtin_expect(tail, `0`)) {
621	_0123 = _4567 = _mm256_setzero_ps();
622	F* d = &_0123;
623	if (tail > `3`) {
624	*d = _mm256_loadu_ps(ptr);
625	ptr += `8`;
626	tail -= `4`;
627	d = &_4567;
628	}
629	bool high = false;
630	if (tail > `1`) {
631	*d = _mm256_castps128_ps256(_mm_loadu_ps(ptr));
632	ptr += `4`;
633	tail -= `2`;
634	high = true;
635	}
636	if (tail > `0`) {
637	d = high ? _mm256_insertf128_ps(d, _mm_loadu_si64(ptr), `1`)
638	: _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), `0`);
639	}
640	} else {
641	_0123 = _mm256_loadu_ps(ptr + `0`);
642	_4567 = _mm256_loadu_ps(ptr + `8`);
643	}
644
645	F _0145 = _mm256_permute2f128_pd(_0123, _4567, `0x20`),
646	_2367 = _mm256_permute2f128_pd(_0123, _4567, `0x31`);
647
648	*r = _mm256_shuffle_ps(_0145, _2367, `0x88`);
649	*g = _mm256_shuffle_ps(_0145, _2367, `0xDD`);
650	}
651	SI void store2(float* ptr, size_t tail, F r, F g) {
652	F _0145 = _mm256_unpacklo_ps(r, g),
653	_2367 = _mm256_unpackhi_ps(r, g);
654	F _0123 = _mm256_permute2f128_pd(_0145, _2367, `0x20`),
655	_4567 = _mm256_permute2f128_pd(_0145, _2367, `0x31`);
656
657	if (__builtin_expect(tail, `0`)) {
658	const __m256* s = &_0123;
659	if (tail > `3`) {
660	_mm256_storeu_ps(ptr, *s);
661	s = &_4567;
662	tail -= `4`;
663	ptr += `8`;
664	}
665	bool high = false;
666	if (tail > `1`) {
667	_mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, `0`));
668	ptr += `4`;
669	tail -= `2`;
670	high = true;
671	}
672	if (tail > `0`) {
673	(ptr + `0`) = (s)[ high ? `4` : `0`];
674	(ptr + `1`) = (s)[ high ? `5` : `1`];
675	}
676	} else {
677	_mm256_storeu_ps(ptr + `0`, _0123);
678	_mm256_storeu_ps(ptr + `8`, _4567);
679	}
680	}
681
682	SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
683	F _04, _15, _26, _37;
684	_04 = _15 = _26 = _37 = `0`;
685	switch (tail) {
686	case `0`: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+`28`), `1`); [[fallthrough]];
687	case `7`: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+`24`), `1`); [[fallthrough]];
688	case `6`: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+`20`), `1`); [[fallthrough]];
689	case `5`: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+`16`), `1`); [[fallthrough]];
690	case `4`: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+`12`), `0`); [[fallthrough]];
691	case `3`: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ `8`), `0`); [[fallthrough]];
692	case `2`: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ `4`), `0`); [[fallthrough]];
693	case `1`: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ `0`), `0`);
694	}
695
696	F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 \| r4 r5 g4 g5
697	ba0145 = _mm256_unpackhi_ps(_04,_15),
698	rg2367 = _mm256_unpacklo_ps(_26,_37),
699	ba2367 = _mm256_unpackhi_ps(_26,_37);
700
701	*r = _mm256_unpacklo_pd(rg0145, rg2367);
702	*g = _mm256_unpackhi_pd(rg0145, rg2367);
703	*b = _mm256_unpacklo_pd(ba0145, ba2367);
704	*a = _mm256_unpackhi_pd(ba0145, ba2367);
705	}
706	SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
707	F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 \| r4 g4 r5 g5
708	rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... \| r6 ...
709	ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 \| b4 a4 b5 a5
710	ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... \| b6 ...
711
712	F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 \| r4 g4 b4 a4
713	_15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... \| r5 ...
714	_26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... \| r6 ...
715	_37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... \| r7 ...
716
717	if (__builtin_expect(tail, `0`)) {
718	if (tail > `0`) { _mm_storeu_ps(ptr+ `0`, _mm256_extractf128_ps(_04, `0`)); }
719	if (tail > `1`) { _mm_storeu_ps(ptr+ `4`, _mm256_extractf128_ps(_15, `0`)); }
720	if (tail > `2`) { _mm_storeu_ps(ptr+ `8`, _mm256_extractf128_ps(_26, `0`)); }
721	if (tail > `3`) { _mm_storeu_ps(ptr+`12`, _mm256_extractf128_ps(_37, `0`)); }
722	if (tail > `4`) { _mm_storeu_ps(ptr+`16`, _mm256_extractf128_ps(_04, `1`)); }
723	if (tail > `5`) { _mm_storeu_ps(ptr+`20`, _mm256_extractf128_ps(_15, `1`)); }
724	if (tail > `6`) { _mm_storeu_ps(ptr+`24`, _mm256_extractf128_ps(_26, `1`)); }
725	} else {
726	F _01 = _mm256_permute2f128_ps(_04, _15, `32`), // 32 == 0010 0000 == lo, lo
727	_23 = _mm256_permute2f128_ps(_26, _37, `32`),
728	_45 = _mm256_permute2f128_ps(_04, _15, `49`), // 49 == 0011 0001 == hi, hi
729	_67 = _mm256_permute2f128_ps(_26, _37, `49`);
730	_mm256_storeu_ps(ptr+ `0`, _01);
731	_mm256_storeu_ps(ptr+ `8`, _23);
732	_mm256_storeu_ps(ptr+`16`, _45);
733	_mm256_storeu_ps(ptr+`24`, _67);
734	}
735	}
736
737	#elif defined(JUMPER_IS_SSE2) \|\| defined(JUMPER_IS_SSE41) \|\| defined(JUMPER_IS_AVX)
738	template <typename T> using V = T __attribute__((ext_vector_type(`4`)));
739	using F = V<float >;
740	using I32 = V< int32_t>;
741	using U64 = V<uint64_t>;
742	using U32 = V<uint32_t>;
743	using U16 = V<uint16_t>;
744	using U8 = V<uint8_t >;
745
746	SI F if_then_else(I32 c, F t, F e) {
747	return _mm_or_ps(a: _mm_and_ps(a: c, b: t), b: _mm_andnot_ps(a: c, b: e));
748	}
749
750	SI F min(F a, F b) { return _mm_min_ps(a: a,b: b); }
751	SI F max(F a, F b) { return _mm_max_ps(a: a,b: b); }
752	#if defined(JUMPER_IS_SSE41) \|\| defined(JUMPER_IS_AVX)
753	SI I32 min(I32 a, I32 b) { return _mm_min_epi32(a,b); }
754	SI U32 min(U32 a, U32 b) { return _mm_min_epu32(a,b); }
755	SI I32 max(I32 a, I32 b) { return _mm_max_epi32(a,b); }
756	SI U32 max(U32 a, U32 b) { return _mm_max_epu32(a,b); }
757	#else
758	SI I32 min(I32 a, I32 b) {
759	return sk_bit_cast<I32>(src: if_then_else(c: a < b, t: sk_bit_cast<F>(src: a), e: sk_bit_cast<F>(src: b)));
760	}
761	SI U32 min(U32 a, U32 b) {
762	return sk_bit_cast<U32>(src: if_then_else(c: a < b, t: sk_bit_cast<F>(src: a), e: sk_bit_cast<F>(src: b)));
763	}
764	SI I32 max(I32 a, I32 b) {
765	return sk_bit_cast<I32>(src: if_then_else(c: a > b, t: sk_bit_cast<F>(src: a), e: sk_bit_cast<F>(src: b)));
766	}
767	SI U32 max(U32 a, U32 b) {
768	return sk_bit_cast<U32>(src: if_then_else(c: a > b, t: sk_bit_cast<F>(src: a), e: sk_bit_cast<F>(src: b)));
769	}
770	#endif
771
772	SI F mad(F f, F m, F a) { return f*m+a; }
773	SI F abs_(F v) { return _mm_and_ps(a: v, b: `0`-v); }
774	#if defined(JUMPER_IS_SSE41) \|\| defined(JUMPER_IS_AVX)
775	SI I32 abs_(I32 v) { return _mm_abs_epi32(v); }
776	#else
777	SI I32 abs_(I32 v) { return max(a: v, b: -v); }
778	#endif
779	SI F rcp_fast(F v) { return _mm_rcp_ps (a: v); }
780	SI F rcp_precise (F v) { F e = rcp_fast(v); return e * (`2.0f` - v * e); }
781	SI F rsqrt (F v) { return _mm_rsqrt_ps(a: v); }
782	SI F sqrt_(F v) { return _mm_sqrt_ps (a: v); }
783
784	SI U32 round(F v) { return _mm_cvtps_epi32(a: v); }
785	SI U32 round(F v, F scale) { return _mm_cvtps_epi32(a: v*scale); }
786
787	SI U16 pack(U32 v) {
788	#if defined(JUMPER_IS_SSE41) \|\| defined(JUMPER_IS_AVX)
789	auto p = _mm_packus_epi32(v,v);
790	#else
791	// Sign extend so that _mm_packs_epi32() does the pack we want.
792	auto p = _mm_srai_epi32(a: _mm_slli_epi32(a: v, count: `16`), count: `16`);
793	p = _mm_packs_epi32(a: p,b: p);
794	#endif
795	return sk_unaligned_load<U16>(ptr: &p); // We have two copies. Return (the lower) one.
796	}
797	SI U8 pack(U16 v) {
798	auto r = widen_cast<__m128i>(src: v);
799	r = _mm_packus_epi16(a: r,b: r);
800	return sk_unaligned_load<U8>(ptr: &r);
801	}
802
803	// NOTE: This only checks the top bit of each lane, and is incorrect with non-mask values.
804	SI bool any(I32 c) { return _mm_movemask_ps(a: c) != `0b0000`; }
805	SI bool all(I32 c) { return _mm_movemask_ps(a: c) == `0b1111`; }
806
807	SI F floor_(F v) {
808	#if defined(JUMPER_IS_SSE41) \|\| defined(JUMPER_IS_AVX)
809	return _mm_floor_ps(v);
810	#else
811	F roundtrip = _mm_cvtepi32_ps(a: _mm_cvttps_epi32(a: v));
812	return roundtrip - if_then_else(c: roundtrip > v, t: `1`, e: `0`);
813	#endif
814	}
815
816	SI F ceil_(F v) {
817	#if defined(JUMPER_IS_SSE41) \|\| defined(JUMPER_IS_AVX)
818	return _mm_ceil_ps(v);
819	#else
820	F roundtrip = _mm_cvtepi32_ps(a: _mm_cvttps_epi32(a: v));
821	return roundtrip + if_then_else(c: roundtrip < v, t: `1`, e: `0`);
822	#endif
823	}
824
825	template <typename T>
826	SI V<T> gather(const T* p, U32 ix) {
827	return {p[ix[`0`]], p[ix[`1`]], p[ix[`2`]], p[ix[`3`]]};
828	}
829	template <typename V, typename S>
830	SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
831	V before = gather(dst, ix);
832	V after = if_then_else(mask, src, before);
833	dst[ix[`0`]] = after[`0`];
834	dst[ix[`1`]] = after[`1`];
835	dst[ix[`2`]] = after[`2`];
836	dst[ix[`3`]] = after[`3`];
837	}
838	SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
839	__m128i _01;
840	if (__builtin_expect(tail,`0`)) {
841	_01 = _mm_setzero_si128();
842	if (tail > `1`) {
843	_01 = _mm_loadl_pd(a: _01, dp: (const double)ptr); // r0 g0 r1 g1 00 00 00 00*
844	if (tail > `2`) {
845	_01 = _mm_insert_epi16(_01, (ptr+`4`), `4`); // r0 g0 r1 g1 r2 00 00 00*
846	_01 = _mm_insert_epi16(_01, (ptr+`5`), `5`); // r0 g0 r1 g1 r2 g2 00 00*
847	}
848	} else {
849	_01 = _mm_cvtsi32_si128(a: (const* uint32_t)ptr); // r0 g0 00 00 00 00 00 00*
850	}
851	} else {
852	_01 = _mm_loadu_si128(p: ((__m128i)ptr) + `0`); // r0 g0 r1 g1 r2 g2 r3 g3*
853	}
854	auto rg01_23 = _mm_shufflelo_epi16(_01, `0xD8`); // r0 r1 g0 g1 r2 g2 r3 g3
855	auto rg = _mm_shufflehi_epi16(rg01_23, `0xD8`); // r0 r1 g0 g1 r2 r3 g2 g3
856
857	auto R = _mm_shuffle_epi32(rg, `0x88`); // r0 r1 r2 r3 r0 r1 r2 r3
858	auto G = _mm_shuffle_epi32(rg, `0xDD`); // g0 g1 g2 g3 g0 g1 g2 g3
859	*r = sk_unaligned_load<U16>(ptr: &R);
860	*g = sk_unaligned_load<U16>(ptr: &G);
861	}
862	SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
863	U32 rg = _mm_unpacklo_epi16(a: widen_cast<__m128i>(src: r), b: widen_cast<__m128i>(src: g));
864	if (__builtin_expect(tail, `0`)) {
865	if (tail > `1`) {
866	_mm_storel_epi64(p: (__m128i*)ptr, a: rg);
867	if (tail > `2`) {
868	int32_t rgpair = rg[`2`];
869	memcpy(dest: ptr + `4`, src: &rgpair, n: sizeof(rgpair));
870	}
871	} else {
872	int32_t rgpair = rg[`0`];
873	memcpy(dest: ptr, src: &rgpair, n: sizeof(rgpair));
874	}
875	} else {
876	_mm_storeu_si128(p: (__m128i*)ptr + `0`, b: rg);
877	}
878	}
879
880	SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
881	__m128i _0, _1, _2, _3;
882	if (__builtin_expect(tail,`0`)) {
883	_1 = _2 = _3 = _mm_setzero_si128();
884	auto load_rgb = [](const uint16_t* src) {
885	auto v = _mm_cvtsi32_si128(a: (const* uint32_t*)src);
886	return _mm_insert_epi16(v, src[`2`], `2`);
887	};
888	if ( true ) { _0 = load_rgb(ptr + `0`); }
889	if (tail > `1`) { _1 = load_rgb(ptr + `3`); }
890	if (tail > `2`) { _2 = load_rgb(ptr + `6`); }
891	} else {
892	// Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
893	auto _01 = _mm_loadu_si128(p: (const __m128i*)(ptr + `0`)) ,
894	_23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + `4`)), `4`);
895
896	// Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
897	_0 = _01;
898	_1 = _mm_srli_si128(_01, `6`);
899	_2 = _23;
900	_3 = _mm_srli_si128(_23, `6`);
901	}
902
903	// De-interlace to R,G,B.
904	auto _02 = _mm_unpacklo_epi16(a: _0, b: _2), // r0 r2 g0 g2 b0 b2 xx xx
905	_13 = _mm_unpacklo_epi16(a: _1, b: _3); // r1 r3 g1 g3 b1 b3 xx xx
906
907	auto R = _mm_unpacklo_epi16(a: _02, b: _13), // r0 r1 r2 r3 g0 g1 g2 g3
908	G = _mm_srli_si128(R, `8`),
909	B = _mm_unpackhi_epi16(a: _02, b: _13); // b0 b1 b2 b3 xx xx xx xx
910
911	*r = sk_unaligned_load<U16>(ptr: &R);
912	*g = sk_unaligned_load<U16>(ptr: &G);
913	*b = sk_unaligned_load<U16>(ptr: &B);
914	}
915
916	SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
917	__m128i _01, _23;
918	if (__builtin_expect(tail,`0`)) {
919	_01 = _23 = _mm_setzero_si128();
920	auto src = (const double*)ptr;
921	if ( true ) { _01 = _mm_loadl_pd(a: _01, dp: src + `0`); } // r0 g0 b0 a0 00 00 00 00
922	if (tail > `1`) { _01 = _mm_loadh_pd(a: _01, dp: src + `1`); } // r0 g0 b0 a0 r1 g1 b1 a1
923	if (tail > `2`) { _23 = _mm_loadl_pd(a: _23, dp: src + `2`); } // r2 g2 b2 a2 00 00 00 00
924	} else {
925	_01 = _mm_loadu_si128(p: ((__m128i)ptr) + `0`); // r0 g0 b0 a0 r1 g1 b1 a1*
926	_23 = _mm_loadu_si128(p: ((__m128i)ptr) + `1`); // r2 g2 b2 a2 r3 g3 b3 a3*
927	}
928
929	auto _02 = _mm_unpacklo_epi16(a: _01, b: _23), // r0 r2 g0 g2 b0 b2 a0 a2
930	_13 = _mm_unpackhi_epi16(a: _01, b: _23); // r1 r3 g1 g3 b1 b3 a1 a3
931
932	auto rg = _mm_unpacklo_epi16(a: _02, b: _13), // r0 r1 r2 r3 g0 g1 g2 g3
933	ba = _mm_unpackhi_epi16(a: _02, b: _13); // b0 b1 b2 b3 a0 a1 a2 a3
934
935	r = sk_unaligned_load<U16>(ptr: (uint16_t)&rg + `0`);
936	g = sk_unaligned_load<U16>(ptr: (uint16_t)&rg + `4`);
937	b = sk_unaligned_load<U16>(ptr: (uint16_t)&ba + `0`);
938	a = sk_unaligned_load<U16>(ptr: (uint16_t)&ba + `4`);
939	}
940
941	SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
942	auto rg = _mm_unpacklo_epi16(a: widen_cast<__m128i>(src: r), b: widen_cast<__m128i>(src: g)),
943	ba = _mm_unpacklo_epi16(a: widen_cast<__m128i>(src: b), b: widen_cast<__m128i>(src: a));
944
945	if (__builtin_expect(tail, `0`)) {
946	auto dst = (double*)ptr;
947	if ( true ) { _mm_storel_pd(dp: dst + `0`, a: _mm_unpacklo_epi32(a: rg, b: ba)); }
948	if (tail > `1`) { _mm_storeh_pd(dp: dst + `1`, a: _mm_unpacklo_epi32(a: rg, b: ba)); }
949	if (tail > `2`) { _mm_storel_pd(dp: dst + `2`, a: _mm_unpackhi_epi32(a: rg, b: ba)); }
950	} else {
951	_mm_storeu_si128(p: (__m128i*)ptr + `0`, b: _mm_unpacklo_epi32(a: rg, b: ba));
952	_mm_storeu_si128(p: (__m128i*)ptr + `1`, b: _mm_unpackhi_epi32(a: rg, b: ba));
953	}
954	}
955
956	SI void load2(const float* ptr, size_t tail, F* r, F* g) {
957	F _01, _23;
958	if (__builtin_expect(tail, `0`)) {
959	_01 = _23 = _mm_setzero_si128();
960	if ( true ) { _01 = _mm_loadl_pi(a: _01, p: (__m64 const*)(ptr + `0`)); }
961	if (tail > `1`) { _01 = _mm_loadh_pi(a: _01, p: (__m64 const*)(ptr + `2`)); }
962	if (tail > `2`) { _23 = _mm_loadl_pi(a: _23, p: (__m64 const*)(ptr + `4`)); }
963	} else {
964	_01 = _mm_loadu_ps(p: ptr + `0`);
965	_23 = _mm_loadu_ps(p: ptr + `4`);
966	}
967	*r = _mm_shuffle_ps(_01, _23, `0x88`);
968	*g = _mm_shuffle_ps(_01, _23, `0xDD`);
969	}
970	SI void store2(float* ptr, size_t tail, F r, F g) {
971	F _01 = _mm_unpacklo_ps(a: r, b: g),
972	_23 = _mm_unpackhi_ps(a: r, b: g);
973	if (__builtin_expect(tail, `0`)) {
974	if ( true ) { _mm_storel_pi(p: (__m64*)(ptr + `0`), a: _01); }
975	if (tail > `1`) { _mm_storeh_pi(p: (__m64*)(ptr + `2`), a: _01); }
976	if (tail > `2`) { _mm_storel_pi(p: (__m64*)(ptr + `4`), a: _23); }
977	} else {
978	_mm_storeu_ps(p: ptr + `0`, a: _01);
979	_mm_storeu_ps(p: ptr + `4`, a: _23);
980	}
981	}
982
983	SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
984	F _0, _1, _2, _3;
985	if (__builtin_expect(tail, `0`)) {
986	_1 = _2 = _3 = _mm_setzero_si128();
987	if ( true ) { _0 = _mm_loadu_ps(p: ptr + `0`); }
988	if (tail > `1`) { _1 = _mm_loadu_ps(p: ptr + `4`); }
989	if (tail > `2`) { _2 = _mm_loadu_ps(p: ptr + `8`); }
990	} else {
991	_0 = _mm_loadu_ps(p: ptr + `0`);
992	_1 = _mm_loadu_ps(p: ptr + `4`);
993	_2 = _mm_loadu_ps(p: ptr + `8`);
994	_3 = _mm_loadu_ps(p: ptr +`12`);
995	}
996	_MM_TRANSPOSE4_PS(_0,_1,_2,_3);
997	*r = _0;
998	*g = _1;
999	*b = _2;
1000	*a = _3;
1001	}
1002
1003	SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
1004	_MM_TRANSPOSE4_PS(r,g,b,a);
1005	if (__builtin_expect(tail, `0`)) {
1006	if ( true ) { _mm_storeu_ps(p: ptr + `0`, a: r); }
1007	if (tail > `1`) { _mm_storeu_ps(p: ptr + `4`, a: g); }
1008	if (tail > `2`) { _mm_storeu_ps(p: ptr + `8`, a: b); }
1009	} else {
1010	_mm_storeu_ps(p: ptr + `0`, a: r);
1011	_mm_storeu_ps(p: ptr + `4`, a: g);
1012	_mm_storeu_ps(p: ptr + `8`, a: b);
1013	_mm_storeu_ps(p: ptr +`12`, a: a);
1014	}
1015	}
1016	#endif
1017
1018	// We need to be a careful with casts.
1019	// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
1020	// These named casts and bit_cast() are always what they seem to be.
1021	#if defined(JUMPER_IS_SCALAR)
1022	SI F cast (U32 v) { return (F)v; }
1023	SI F cast64(U64 v) { return (F)v; }
1024	SI U32 trunc_(F v) { return (U32)v; }
1025	SI U32 expand(U16 v) { return (U32)v; }
1026	SI U32 expand(U8 v) { return (U32)v; }
1027	#else
1028	SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
1029	SI F cast64(U64 v) { return __builtin_convertvector( v, F); }
1030	SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); }
1031	SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
1032	SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
1033	#endif
1034
1035	template <typename V>
1036	SI V if_then_else(I32 c, V t, V e) {
1037	return sk_bit_cast<V>(if_then_else(c, sk_bit_cast<F>(t), sk_bit_cast<F>(e)));
1038	}
1039
1040	SI U16 bswap(U16 x) {
1041	#if defined(JUMPER_IS_SSE2) \|\| defined(JUMPER_IS_SSE41)
1042	// Somewhat inexplicably Clang decides to do (x<<8) \| (x>>8) in 32-bit lanes
1043	// when generating code for SSE2 and SSE4.1. We'll do it manually...
1044	auto v = widen_cast<__m128i>(src: x);
1045	v = _mm_slli_epi16(a: v,count: `8`) \| _mm_srli_epi16(a: v,count: `8`);
1046	return sk_unaligned_load<U16>(ptr: &v);
1047	#else
1048	return (x<<`8`) \| (x>>`8`);
1049	#endif
1050	}
1051
1052	SI F fract(F v) { return v - floor_(v); }
1053
1054	// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html
1055	SI F approx_log2(F x) {
1056	// e - 127 is a fair approximation of log2(x) in its own right...
1057	F e = cast(v: sk_bit_cast<U32>(src: x)) * (`1.0f` / (`1`<<`23`));
1058
1059	// ... but using the mantissa to refine its error is _much_ better.
1060	F m = sk_bit_cast<F>(src: (sk_bit_cast<U32>(src: x) & `0x007fffff`) \| `0x3f000000`);
1061	return e
1062	- `124.225514990f`
1063	- `1.498030302f` * m
1064	- `1.725879990f` / (`0.3520887068f` + m);
1065	}
1066
1067	SI F approx_log(F x) {
1068	const float ln2 = `0.69314718f`;
1069	return ln2 * approx_log2(x);
1070	}
1071
1072	SI F approx_pow2(F x) {
1073	constexpr float kInfinityBits = `0x7f800000`;
1074
1075	F f = fract(v: x);
1076	F approx = x + `121.274057500f`;
1077	approx -= f * `1.490129070f`;
1078	approx += `27.728023300f` / (`4.84252568f` - f);
1079	approx = `1.0f` (`1`<<`23`);
1080	approx = min(a: max(a: approx, b: F(`0`)), b: kInfinityBits); // guard against underflow/overflow
1081
1082	return sk_bit_cast<F>(src: round(v: approx));
1083	}
1084
1085	SI F approx_exp(F x) {
1086	const float log2_e = `1.4426950408889634074f`;
1087	return approx_pow2(x: log2_e * x);
1088	}
1089
1090	SI F approx_powf(F x, F y) {
1091	return if_then_else(c: (x == `0`)\|(x == `1`), t: x
1092	, e: approx_pow2(x: approx_log2(x) * y));
1093	}
1094
1095	SI F from_half(U16 h) {
1096	#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
1097	&& !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
1098	return vcvt_f32_f16(h);
1099
1100	#elif defined(JUMPER_IS_HSW)
1101	return _mm256_cvtph_ps(h);
1102
1103	#else
1104	// Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
1105	U32 sem = expand(v: h),
1106	s = sem & `0x8000`,
1107	em = sem ^ s;
1108
1109	// Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
1110	auto denorm = (I32)em < `0x0400`; // I32 comparison is often quicker, and always safe here.
1111	return if_then_else(c: denorm, t: F(`0`)
1112	, e: sk_bit_cast<F>( src: (s<<`16`) + (em<<`13`) + ((`127`-`15`)<<`23`) ));
1113	#endif
1114	}
1115
1116	SI U16 to_half(F f) {
1117	#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
1118	&& !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
1119	return vcvt_f16_f32(f);
1120
1121	#elif defined(JUMPER_IS_HSW)
1122	return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
1123
1124	#else
1125	// Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
1126	U32 sem = sk_bit_cast<U32>(src: f),
1127	s = sem & `0x80000000`,
1128	em = sem ^ s;
1129
1130	// Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
1131	auto denorm = (I32)em < `0x38800000`; // I32 comparison is often quicker, and always safe here.
1132	return pack(v: if_then_else(c: denorm, t: U32(`0`)
1133	, e: (s>>`16`) + (em>>`13`) - ((`127`-`15`)<<`10`)));
1134	#endif
1135	}
1136
// Our fundamental vector depth is our pixel stride.
// N is the number of float lanes in F, i.e. how many pixels one stage call processes.
static constexpr size_t N = sizeof(F) / sizeof(float);
1139
1140	// We're finally going to get to what a Stage function looks like!
1141	// tail == 0 ~~> work on a full N pixels
1142	// tail != 0 ~~> work on only the first tail pixels
1143	// tail is always < N.
1144
1145	// Any custom ABI to use for all (non-externally-facing) stage functions?
1146	// Also decide here whether to use narrow (compromise) or wide (ideal) stages.
// ABI is the calling-convention annotation applied to every stage function, and
// JUMPER_NARROW_STAGES selects between the narrow (1) and wide (0) stage signatures.
#if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON)
    // This lets us pass vectors more efficiently on 32-bit ARM.
    // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
    #define ABI __attribute__((pcs("aapcs-vfp")))
    #define JUMPER_NARROW_STAGES 1
#elif defined(_MSC_VER)
    // Even if not vectorized, this lets us pass {r,g,b,a} as registers,
    // instead of {b,a} on the stack.  Narrow stages work best for __vectorcall.
    #define ABI __vectorcall
    #define JUMPER_NARROW_STAGES 1
#elif defined(__x86_64__) || defined(SK_CPU_ARM64)
    // These platforms are ideal for wider stages, and their default ABI is ideal.
    #define ABI
    #define JUMPER_NARROW_STAGES 0
#else
    // 32-bit or unknown... shunt them down the narrow path.
    // Odds are these have few registers and are better off there.
    #define ABI
    #define JUMPER_NARROW_STAGES 1
#endif
1167
1168	#if JUMPER_NARROW_STAGES
1169	struct Params {
1170	size_t dx, dy, tail;
1171	std::byte* base;
1172	F dr,dg,db,da;
1173	};
1174	using Stage = void(ABI)(Params, SkRasterPipelineStage* program, F r, F g, F b, F a);
1175	#else
1176	using Stage = void(ABI)(size_t tail, SkRasterPipelineStage program, size_t dx, size_t dy,
1177	std::byte* base, F,F,F,F, F,F,F,F);
1178	#endif
1179
1180	static void start_pipeline(size_t dx, size_t dy,
1181	size_t xlimit, size_t ylimit,
1182	SkRasterPipelineStage* program) {
1183	auto start = (Stage)program->fn;
1184	const size_t x0 = dx;
1185	std::byte* const base = nullptr;
1186	for (; dy < ylimit; dy++) {
1187	#if JUMPER_NARROW_STAGES
1188	Params params = { x0,dy,`0`,base, `0`,`0`,`0`,`0` };
1189	while (params.dx + N <= xlimit) {
1190	start(&params,program, `0`,`0`,`0`,`0`);
1191	params.dx += N;
1192	}
1193	if (size_t tail = xlimit - params.dx) {
1194	params.tail = tail;
1195	start(&params,program, `0`,`0`,`0`,`0`);
1196	}
1197	#else
1198	dx = x0;
1199	while (dx + N <= xlimit) {
1200	start(`0`,program,dx,dy,base, `0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`);
1201	dx += N;
1202	}
1203	if (size_t tail = xlimit - dx) {
1204	start(tail,program,dx,dy,base, `0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`);
1205	}
1206	#endif
1207	}
1208	}
1209
// JUMPER_MUSTTAIL expands to Clang's musttail attribute when SK_HAS_MUSTTAIL is nonzero,
// and to nothing otherwise.
#if SK_HAS_MUSTTAIL
    #define JUMPER_MUSTTAIL [[clang::musttail]]
#else
    #define JUMPER_MUSTTAIL
#endif
1215
// DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL) emits three things:
//   1. a forward declaration of the kernel name##_k, which takes the context argument ARG plus
//      references to every register;
//   2. the stage entry point `name`, matching the Stage signature.  It runs the kernel
//      (prefixed by OFFSET, so the kernel's return value can be consumed), executes INC,
//      then calls the stage now at program->fn, prefixed by MUSTTAIL;
//   3. the kernel's definition header, so the stage body follows the macro invocation.
#if JUMPER_NARROW_STAGES
    // Narrow form: registers other than r,g,b,a are read from and written to *params.
    #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL)                     \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, size_t tail, std::byte*& base,    \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);         \
        static void ABI name(Params* params, SkRasterPipelineStage* program,               \
                             F r, F g, F b, F a) {                                         \
            OFFSET name##_k(Ctx{program}, params->dx,params->dy,params->tail,params->base, \
                            r,g,b,a, params->dr, params->dg, params->db, params->da);      \
            INC;                                                                           \
            auto fn = (Stage)program->fn;                                                  \
            MUSTTAIL return fn(params, program, r,g,b,a);                                  \
        }                                                                                  \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, size_t tail, std::byte*& base,    \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else
    // Wide form: every register is a by-value argument that is forwarded to the next stage.
    #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL)                           \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, size_t tail, std::byte*& base,          \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);               \
        static void ABI name(size_t tail, SkRasterPipelineStage* program, size_t dx, size_t dy,  \
                             std::byte* base, F r, F g, F b, F a, F dr, F dg, F db, F da) {      \
            OFFSET name##_k(Ctx{program}, dx,dy,tail,base, r,g,b,a, dr,dg,db,da);                \
            INC;                                                                                 \
            auto fn = (Stage)program->fn;                                                        \
            MUSTTAIL return fn(tail, program, dx,dy,base, r,g,b,a, dr,dg,db,da);                 \
        }                                                                                        \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, size_t tail, std::byte*& base,          \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
1244
// A typical stage returns void, always increments the program counter by 1, and lets the optimizer
// decide whether or not tail-calling is appropriate.
#define STAGE(name, arg) \
    DECLARE_STAGE(name, arg, void, ++program, /*no offset*/, /*no musttail*/)

// A tail stage returns void, always increments the program counter by 1, and uses tail-calling.
// Tail-calling is necessary in SkSL-generated programs, which can be thousands of ops long, and
// could overflow the stack (particularly in debug).
#define STAGE_TAIL(name, arg) \
    DECLARE_STAGE(name, arg, void, ++program, /*no offset*/, JUMPER_MUSTTAIL)

// A branch stage returns an integer, which is added directly to the program counter, and tailcalls.
#define STAGE_BRANCH(name, arg) \
    DECLARE_STAGE(name, arg, int, /*no increment*/, program +=, JUMPER_MUSTTAIL)
1259
1260	// just_return() is a simple no-op stage that only exists to end the chain,
1261	// returning back up to start_pipeline(), and from there to the caller.
1262	#if JUMPER_NARROW_STAGES
1263	static void ABI just_return(Params, SkRasterPipelineStage, F,F,F,F) {}
1264	#else
1265	static void ABI just_return(size_t, SkRasterPipelineStage, size_t,size_t, std::byte,
1266	F,F,F,F, F,F,F,F) {}
1267	#endif
1268
1269	// Note that in release builds, most stages consume no stack (thanks to tail call optimization).
1270	// However: certain builds (especially with non-clang compilers) may fail to optimize tail
1271	// calls, resulting in actual stack frames being generated.
1272	//
1273	// stack_checkpoint() and stack_rewind() are special stages that can be used to manage stack growth.
1274	// If a pipeline contains a stack_checkpoint, followed by any number of stack_rewind (at any point),
1275	// the C++ stack will be reset to the state it was at when the stack_checkpoint was initially hit.
1276	//
1277	// All instances of stack_rewind (as well as the one instance of stack_checkpoint near the start of
1278	// a pipeline) share a single context (of type SkRasterPipeline_RewindCtx). That context holds the
1279	// full state of the mutable registers that are normally passed to the next stage in the program.
1280	//
1281	// stack_rewind is the only stage other than just_return that actually returns (rather than jumping
1282	// to the next stage in the program). Before it does so, it stashes all of the registers in the
1283	// context. This includes the updated `program` pointer. Unlike stages that tail call exactly once,
1284	// stack_checkpoint calls the next stage in the program repeatedly, as long as the `program` in the
1285	// context is overwritten (i.e., as long as a stack_rewind was the reason the pipeline returned,
1286	// rather than a just_return).
1287	//
1288	// Normally, just_return is the only stage that returns, and no other stage does anything after a
1289	// subsequent (called) stage returns, so the stack just unwinds all the way to start_pipeline.
1290	// With stack_checkpoint on the stack, any stack_rewind stages will return all the way up to the
1291	// stack_checkpoint. That grabs the values that would have been passed to the next stage (from the
1292	// context), and continues the linear execution of stages, but has reclaimed all of the stack frames
1293	// pushed before the stack_rewind before doing so.
1294	#if JUMPER_NARROW_STAGES
1295	static void ABI stack_checkpoint(Params* params, SkRasterPipelineStage* program,
1296	F r, F g, F b, F a) {
1297	SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1298	while (program) {
1299	auto next = (Stage)(++program)->fn;
1300
1301	ctx->stage = nullptr;
1302	next(params, program, r, g, b, a);
1303	program = ctx->stage;
1304
1305	if (program) {
1306	r = sk_unaligned_load<F>(ctx->r );
1307	g = sk_unaligned_load<F>(ctx->g );
1308	b = sk_unaligned_load<F>(ctx->b );
1309	a = sk_unaligned_load<F>(ctx->a );
1310	params->dr = sk_unaligned_load<F>(ctx->dr);
1311	params->dg = sk_unaligned_load<F>(ctx->dg);
1312	params->db = sk_unaligned_load<F>(ctx->db);
1313	params->da = sk_unaligned_load<F>(ctx->da);
1314	params->base = ctx->base;
1315	}
1316	}
1317	}
1318	static void ABI stack_rewind(Params* params, SkRasterPipelineStage* program,
1319	F r, F g, F b, F a) {
1320	SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1321	sk_unaligned_store(ctx->r , r );
1322	sk_unaligned_store(ctx->g , g );
1323	sk_unaligned_store(ctx->b , b );
1324	sk_unaligned_store(ctx->a , a );
1325	sk_unaligned_store(ctx->dr, params->dr);
1326	sk_unaligned_store(ctx->dg, params->dg);
1327	sk_unaligned_store(ctx->db, params->db);
1328	sk_unaligned_store(ctx->da, params->da);
1329	ctx->base = params->base;
1330	ctx->stage = program;
1331	}
1332	#else
1333	static void ABI stack_checkpoint(size_t tail, SkRasterPipelineStage* program,
1334	size_t dx, size_t dy, std::byte* base,
1335	F r, F g, F b, F a, F dr, F dg, F db, F da) {
1336	SkRasterPipeline_RewindCtx* ctx = Ctx{.fStage: program};
1337	while (program) {
1338	auto next = (Stage)(++program)->fn;
1339
1340	ctx->stage = nullptr;
1341	next(tail, program, dx, dy, base, r, g, b, a, dr, dg, db, da);
1342	program = ctx->stage;
1343
1344	if (program) {
1345	r = sk_unaligned_load<F>(ptr: ctx->r );
1346	g = sk_unaligned_load<F>(ptr: ctx->g );
1347	b = sk_unaligned_load<F>(ptr: ctx->b );
1348	a = sk_unaligned_load<F>(ptr: ctx->a );
1349	dr = sk_unaligned_load<F>(ptr: ctx->dr);
1350	dg = sk_unaligned_load<F>(ptr: ctx->dg);
1351	db = sk_unaligned_load<F>(ptr: ctx->db);
1352	da = sk_unaligned_load<F>(ptr: ctx->da);
1353	base = ctx->base;
1354	}
1355	}
1356	}
1357	static void ABI stack_rewind(size_t tail, SkRasterPipelineStage* program,
1358	size_t dx, size_t dy, std::byte* base,
1359	F r, F g, F b, F a, F dr, F dg, F db, F da) {
1360	SkRasterPipeline_RewindCtx* ctx = Ctx{.fStage: program};
1361	sk_unaligned_store(ptr: ctx->r , val: r );
1362	sk_unaligned_store(ptr: ctx->g , val: g );
1363	sk_unaligned_store(ptr: ctx->b , val: b );
1364	sk_unaligned_store(ptr: ctx->a , val: a );
1365	sk_unaligned_store(ptr: ctx->dr, val: dr);
1366	sk_unaligned_store(ptr: ctx->dg, val: dg);
1367	sk_unaligned_store(ptr: ctx->db, val: db);
1368	sk_unaligned_store(ptr: ctx->da, val: da);
1369	ctx->base = base;
1370	ctx->stage = program;
1371	}
1372	#endif
1373
1374
1375	// We could start defining normal Stages now. But first, some helper functions.
1376
1377	// These load() and store() methods are tail-aware,
1378	// but focus mainly on keeping the at-stride tail==0 case fast.
1379
1380	template <typename V, typename T>
1381	SI V load(const T* src, size_t tail) {
1382	#if !defined(JUMPER_IS_SCALAR)
1383	__builtin_assume(tail < N);
1384	if (__builtin_expect(tail, `0`)) {
1385	V v{}; // Any inactive lanes are zeroed.
1386	switch (tail) {
1387	case `7`: v[`6`] = src[`6`]; [[fallthrough]];
1388	case `6`: v[`5`] = src[`5`]; [[fallthrough]];
1389	case `5`: v[`4`] = src[`4`]; [[fallthrough]];
1390	case `4`: memcpy(&v, src, `4`*sizeof(T)); break;
1391	case `3`: v[`2`] = src[`2`]; [[fallthrough]];
1392	case `2`: memcpy(&v, src, `2`*sizeof(T)); break;
1393	case `1`: memcpy(&v, src, `1`*sizeof(T)); break;
1394	}
1395	return v;
1396	}
1397	#endif
1398	return sk_unaligned_load<V>(src);
1399	}
1400
1401	template <typename V, typename T>
1402	SI void store(T* dst, V v, size_t tail) {
1403	#if !defined(JUMPER_IS_SCALAR)
1404	__builtin_assume(tail < N);
1405	if (__builtin_expect(tail, `0`)) {
1406	switch (tail) {
1407	case `7`: dst[`6`] = v[`6`]; [[fallthrough]];
1408	case `6`: dst[`5`] = v[`5`]; [[fallthrough]];
1409	case `5`: dst[`4`] = v[`4`]; [[fallthrough]];
1410	case `4`: memcpy(dst, &v, `4`*sizeof(T)); break;
1411	case `3`: dst[`2`] = v[`2`]; [[fallthrough]];
1412	case `2`: memcpy(dst, &v, `2`*sizeof(T)); break;
1413	case `1`: memcpy(dst, &v, `1`*sizeof(T)); break;
1414	}
1415	return;
1416	}
1417	#endif
1418	sk_unaligned_store(dst, v);
1419	}
1420
1421	SI F from_byte(U8 b) {
1422	return cast(v: expand(v: b)) * (`1`/`255.0f`);
1423	}
1424	SI F from_short(U16 s) {
1425	return cast(v: expand(v: s)) * (`1`/`65535.0f`);
1426	}
1427	SI void from_565(U16 _565, F* r, F* g, F* b) {
1428	U32 wide = expand(v: _565);
1429	r = cast(v: wide & (`31`<<`11`)) (`1.0f` / (`31`<<`11`));
1430	g = cast(v: wide & (`63`<< `5`)) (`1.0f` / (`63`<< `5`));
1431	b = cast(v: wide & (`31`<< `0`)) (`1.0f` / (`31`<< `0`));
1432	}
1433	SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
1434	U32 wide = expand(v: _4444);
1435	r = cast(v: wide & (`15`<<`12`)) (`1.0f` / (`15`<<`12`));
1436	g = cast(v: wide & (`15`<< `8`)) (`1.0f` / (`15`<< `8`));
1437	b = cast(v: wide & (`15`<< `4`)) (`1.0f` / (`15`<< `4`));
1438	a = cast(v: wide & (`15`<< `0`)) (`1.0f` / (`15`<< `0`));
1439	}
1440	SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
1441	r = cast(v: (_8888 ) & `0xff`) (`1`/`255.0f`);
1442	g = cast(v: (_8888 >> `8`) & `0xff`) (`1`/`255.0f`);
1443	b = cast(v: (_8888 >> `16`) & `0xff`) (`1`/`255.0f`);
1444	a = cast(v: (_8888 >> `24`) ) (`1`/`255.0f`);
1445	}
1446	SI void from_88(U16 _88, F* r, F* g) {
1447	U32 wide = expand(v: _88);
1448	r = cast(v: (wide ) & `0xff`) (`1`/`255.0f`);
1449	g = cast(v: (wide >> `8`) & `0xff`) (`1`/`255.0f`);
1450	}
1451	SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) {
1452	r = cast(v: (rgba ) & `0x3ff`) (`1`/`1023.0f`);
1453	g = cast(v: (rgba >> `10`) & `0x3ff`) (`1`/`1023.0f`);
1454	b = cast(v: (rgba >> `20`) & `0x3ff`) (`1`/`1023.0f`);
1455	a = cast(v: (rgba >> `30`) ) (`1`/ `3.0f`);
1456	}
1457	SI void from_1010102_xr(U32 rgba, F* r, F* g, F* b, F* a) {
1458	static constexpr float min = -`0.752941f`;
1459	static constexpr float max = `1.25098f`;
1460	static constexpr float range = max - min;
1461	r = cast(v: (rgba ) & `0x3ff`) (`1`/`1023.0f`) * range + min;
1462	g = cast(v: (rgba >> `10`) & `0x3ff`) (`1`/`1023.0f`) * range + min;
1463	b = cast(v: (rgba >> `20`) & `0x3ff`) (`1`/`1023.0f`) * range + min;
1464	a = cast(v: (rgba >> `30`) ) (`1`/ `3.0f`);
1465	}
1466	SI void from_1616(U32 _1616, F* r, F* g) {
1467	r = cast(v: (_1616 ) & `0xffff`) (`1`/`65535.0f`);
1468	g = cast(v: (_1616 >> `16`) & `0xffff`) (`1`/`65535.0f`);
1469	}
1470	SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) {
1471	r = cast64(v: (_16161616 ) & `0xffff`) (`1`/`65535.0f`);
1472	g = cast64(v: (_16161616 >> `16`) & `0xffff`) (`1`/`65535.0f`);
1473	b = cast64(v: (_16161616 >> `32`) & `0xffff`) (`1`/`65535.0f`);
1474	a = cast64(v: (_16161616 >> `48`) & `0xffff`) (`1`/`65535.0f`);
1475	}
1476
1477	// Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory.
1478	template <typename T>
1479	SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
1480	return (T)ctx->pixels + dyctx->stride + dx;
1481	}
1482
1483	// clamp v to [0,limit).
1484	SI F clamp(F v, F limit) {
1485	F inclusive = sk_bit_cast<F>( src: sk_bit_cast<U32>(src: limit) - `1` ); // Exclusive -> inclusive.
1486	return min(a: max(a: `0.0f`, b: v), b: inclusive);
1487	}
1488
1489	// clamp to (0,limit).
1490	SI F clamp_ex(F v, F limit) {
1491	const F inclusiveZ = std::numeric_limits<float>::min(),
1492	inclusiveL = sk_bit_cast<F>( src: sk_bit_cast<U32>(src: limit) - `1` );
1493	return min(a: max(a: inclusiveZ, b: v), b: inclusiveL);
1494	}
1495
1496	// Polynomial approximation of degree 5 for sin(x 2 * pi) in the range [-1/4, 1/4]*
1497	// Adapted from https://github.com/google/swiftshader/blob/master/docs/Sin-Cos-Optimization.pdf
1498	SI F sin5q_(F x) {
1499	// A x + B * x^3 + C * x^5*
1500	// Exact at x = 0, 1/12, 1/6, 1/4, and their negatives,
1501	// which correspond to x 2 * pi = 0, pi/6, pi/3, pi/2*
1502	constexpr float A = `6.28230858f`;
1503	constexpr float B = -`41.1693687f`;
1504	constexpr float C = `74.4388885f`;
1505	F x2 = x * x;
1506	return x * mad(f: mad(f: x2, m: C, a: B), m: x2, a: A);
1507	}
1508
1509	SI F sin_(F x) {
1510	constexpr float one_over_pi2 = `1` / (`2` * SK_FloatPI);
1511	x = mad(f: x, m: -one_over_pi2, a: `0.25f`);
1512	x = `0.25f` - abs_(v: x - floor_(v: x + `0.5f`));
1513	return sin5q_(x);
1514	}
1515
1516	SI F cos_(F x) {
1517	constexpr float one_over_pi2 = `1` / (`2` * SK_FloatPI);
1518	x *= one_over_pi2;
1519	x = `0.25f` - abs_(v: x - floor_(v: x + `0.5f`));
1520	return sin5q_(x);
1521	}
1522
1523	/ "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"*
1524	https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf
1525
1526	approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9
1527
1528	Some simplifications:
1529	1. tan(x) is periodic, -PI/2 < x < PI/2
1530	2. tan(x) is odd, so tan(-x) = -tan(x)
1531	3. Our polynomial approximation is best near zero, so we use the following identity
1532	tan(x) + tan(y)
1533	tan(x + y) = -----------------
1534	1 - tan(x)tan(y)*
1535	tan(PI/4) = 1
1536
1537	So for x > PI/8, we do the following refactor:
1538	x' = x - PI/4
1539
1540	1 + tan(x')
1541	tan(x) = ------------
1542	1 - tan(x')
1543	*/
1544	SI F tan_(F x) {
1545	constexpr float Pi = SK_FloatPI;
1546	// periodic between -pi/2 ... pi/2
1547	// shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
1548	x = fract(v: (`1`/Pi)x + `0.5f`) Pi - (Pi/`2`);
1549
1550	I32 neg = (x < `0.0f`);
1551	x = if_then_else(c: neg, t: -x, e: x);
1552
1553	// minimize total error by shifting if x > pi/8
1554	I32 use_quotient = (x > (Pi/`8`));
1555	x = if_then_else(c: use_quotient, t: x - (Pi/`4`), e: x);
1556
1557	// 9th order poly = 4th order(x^2) x*
1558	const float c4 = `62` / `2835.0f`;
1559	const float c3 = `17` / `315.0f`;
1560	const float c2 = `2` / `15.0f`;
1561	const float c1 = `1` / `3.0f`;
1562	const float c0 = `1.0f`;
1563	F x2 = x * x;
1564	x *= mad(f: x2, m: mad(f: x2, m: mad(f: x2, m: mad(f: x2, m: c4, a: c3), a: c2), a: c1), a: c0);
1565	x = if_then_else(c: use_quotient, t: (`1`+x)/(`1`-x), e: x);
1566	x = if_then_else(c: neg, t: -x, e: x);
1567	return x;
1568	}
1569
1570	/ Use 4th order polynomial approximation from https://arachnoid.com/polysolve/*
1571	with 129 values of x,atan(x) for x:[0...1]
1572	This only works for 0 <= x <= 1
1573	*/
1574	SI F approx_atan_unit(F x) {
1575	// y = 0.14130025741326729 x⁴
1576	// - 0.34312835980675116 x³
1577	// - 0.016172900528248768 x²
1578	// + 1.00376969762003850 x
1579	// - 0.00014758242182738969
1580	const float c4 = `0.14130025741326729f`;
1581	const float c3 = -`0.34312835980675116f`;
1582	const float c2 = -`0.016172900528248768f`;
1583	const float c1 = `1.0037696976200385f`;
1584	const float c0 = -`0.00014758242182738969f`;
1585	return mad(f: x, m: mad(f: x, m: mad(f: x, m: mad(f: x, m: c4, a: c3), a: c2), a: c1), a: c0);
1586	}
1587
1588	// Use identity atan(x) = pi/2 - atan(1/x) for x > 1
1589	SI F atan_(F x) {
1590	I32 neg = (x < `0.0f`);
1591	x = if_then_else(c: neg, t: -x, e: x);
1592	I32 flip = (x > `1.0f`);
1593	x = if_then_else(c: flip, t: `1`/x, e: x);
1594	x = approx_atan_unit(x);
1595	x = if_then_else(c: flip, t: SK_FloatPI/`2` - x, e: x);
1596	x = if_then_else(c: neg, t: -x, e: x);
1597	return x;
1598	}
1599
1600	// Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun:
1601	// https://books.google.com/books/content?id=ZboM5tOFWtsC&pg=PA81&img=1&zoom=3&hl=en&bul=1&sig=ACfU3U2M75tG_iGVOS92eQspr14LTq02Nw&ci=0%2C15%2C999%2C1279&edge=0
1602	// http://screen/8YGJxUGFQ49bVX6
1603	SI F asin_(F x) {
1604	I32 neg = (x < `0.0f`);
1605	x = if_then_else(c: neg, t: -x, e: x);
1606	const float c3 = -`0.0187293f`;
1607	const float c2 = `0.0742610f`;
1608	const float c1 = -`0.2121144f`;
1609	const float c0 = `1.5707288f`;
1610	F poly = mad(f: x, m: mad(f: x, m: mad(f: x, m: c3, a: c2), a: c1), a: c0);
1611	x = SK_FloatPI/`2` - sqrt_(v: `1` - x) * poly;
1612	x = if_then_else(c: neg, t: -x, e: x);
1613	return x;
1614	}
1615
1616	SI F acos_(F x) {
1617	return SK_FloatPI/`2` - asin_(x);
1618	}
1619
1620	/ Use identity atan(x) = pi/2 - atan(1/x) for x > 1*
1621	By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
1622	which avoids a 2nd divide instruction if we had instead called atan().
1623	*/
1624	SI F atan2_(F y0, F x0) {
1625	I32 flip = (abs_(v: y0) > abs_(v: x0));
1626	F y = if_then_else(c: flip, t: x0, e: y0);
1627	F x = if_then_else(c: flip, t: y0, e: x0);
1628	F arg = y/x;
1629
1630	I32 neg = (arg < `0.0f`);
1631	arg = if_then_else(c: neg, t: -arg, e: arg);
1632
1633	F r = approx_atan_unit(x: arg);
1634	r = if_then_else(c: flip, t: SK_FloatPI/`2` - r, e: r);
1635	r = if_then_else(c: neg, t: -r, e: r);
1636
1637	// handle quadrant distinctions
1638	r = if_then_else(c: (y0 >= `0`) & (x0 < `0`), t: r + SK_FloatPI, e: r);
1639	r = if_then_else(c: (y0 < `0`) & (x0 <= `0`), t: r - SK_FloatPI, e: r);
1640	// Note: we don't try to handle 0,0 or infinities
1641	return r;
1642	}
1643
1644	// Used by gather_ stages to calculate the base pointer and a vector of indices to load.
1645	template <typename T>
1646	SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
1647	// We use exclusive clamp so that our min value is > 0 because ULP subtraction using U32 would
1648	// produce a NaN if applied to +0.f.
1649	x = clamp_ex(v: x, limit: ctx->width );
1650	y = clamp_ex(v: y, limit: ctx->height);
1651	x = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: x) - (uint32_t)ctx->roundDownAtInteger);
1652	y = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: y) - (uint32_t)ctx->roundDownAtInteger);
1653	ptr = (const* T*)ctx->pixels;
1654	return trunc_(v: y)*ctx->stride + trunc_(v: x);
1655	}
1656
1657	// We often have a nominally [0,1] float value we need to scale and convert to an integer,
1658	// whether for a table lookup or to pack back down into bytes for storage.
1659	//
1660	// In practice, especially when dealing with interesting color spaces, that notionally
1661	// [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp.
1662	//
1663	// You can adjust the expected input to [0,bias] by tweaking that parameter.
1664	SI U32 to_unorm(F v, F scale, F bias = `1.0f`) {
1665	// Any time we use round() we probably want to use to_unorm().
1666	return round(v: min(a: max(a: `0.0f`, b: v), b: bias), scale);
1667	}
1668
1669	SI I32 cond_to_mask(I32 cond) {
1670	#if defined(JUMPER_IS_SCALAR)
1671	// In scalar mode, conditions are bools (0 or 1), but we want to store and operate on masks
1672	// (eg, using bitwise operations to select values).
1673	return if_then_else(cond, I32(~`0`), I32(`0`));
1674	#else
1675	// In SIMD mode, our various instruction sets already represent conditions as masks.
1676	return cond;
1677	#endif
1678	}
1679
1680	#if defined(JUMPER_IS_SCALAR)
1681	// In scalar mode, `data` only contains a single lane.
1682	template <typename T>
1683	SI T select_lane(T data, int lane) {
1684	SkASSERT(lane == `0`);
1685	return data;
1686	}
1687	#else
1688	// In SIMD mode, `data` contains a vector of lanes.
1689	template <typename T>
1690	SI T select_lane(V<T> data, int lane) {
1691	return data[lane];
1692	}
1693	#endif
1694
1695	// Now finally, normal Stages!
1696
1697	STAGE(seed_shader, NoCtx) {
1698	static constexpr float iota[] = {
1699	`0.5f`, `1.5f`, `2.5f`, `3.5f`, `4.5f`, `5.5f`, `6.5f`, `7.5f`,
1700	`8.5f`, `9.5f`,`10.5f`,`11.5f`,`12.5f`,`13.5f`,`14.5f`,`15.5f`,
1701	};
1702	// It's important for speed to explicitly cast(dx) and cast(dy),
1703	// which has the effect of splatting them to vectors before converting to floats.
1704	// On Intel this breaks a data dependency on previous loop iterations' registers.
1705	r = cast(v: dx) + sk_unaligned_load<F>(ptr: iota);
1706	g = cast(v: dy) + `0.5f`;
1707	b = `1.0f`; // This is w=1 for matrix multiplies by the device coords.
1708	a = `0`;
1709	}
1710
1711	STAGE(dither, const float* rate) {
1712	// Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors.
1713	uint32_t iota[] = {`0`,`1`,`2`,`3`,`4`,`5`,`6`,`7`};
1714	U32 X = dx + sk_unaligned_load<U32>(ptr: iota),
1715	Y = dy;
1716
1717	// We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
1718	// In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ].
1719
1720	// We only need X and X^Y from here on, so it's easier to just think of that as "Y".
1721	Y ^= X;
1722
1723	// We'll mix the bottom 3 bits of each of X and Y to make 6 bits,
1724	// for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda.
1725	U32 M = (Y & `1`) << `5` \| (X & `1`) << `4`
1726	\| (Y & `2`) << `2` \| (X & `2`) << `1`
1727	\| (Y & `4`) >> `1` \| (X & `4`) >> `2`;
1728
1729	// Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon.
1730	// We want to make sure our dither is less than 0.5 in either direction to keep exact values
1731	// like 0 and 1 unchanged after rounding.
1732	F dither = cast(v: M) * (`2`/`128.0f`) - (`63`/`128.0f`);
1733
1734	r += ratedither;
1735	g += ratedither;
1736	b += ratedither;
1737
1738	r = max(a: `0.0f`, b: min(a: r, b: a));
1739	g = max(a: `0.0f`, b: min(a: g, b: a));
1740	b = max(a: `0.0f`, b: min(a: b, b: a));
1741	}
1742
1743	// load 4 floats from memory, and splat them into r,g,b,a
1744	STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
1745	r = c->r;
1746	g = c->g;
1747	b = c->b;
1748	a = c->a;
1749	}
1750	STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
1751	r = c->r;
1752	g = c->g;
1753	b = c->b;
1754	a = c->a;
1755	}
1756	// load 4 floats from memory, and splat them into dr,dg,db,da
1757	STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
1758	dr = c->r;
1759	dg = c->g;
1760	db = c->b;
1761	da = c->a;
1762	}
1763
1764	// splats opaque-black into r,g,b,a
1765	STAGE(black_color, NoCtx) {
1766	r = g = b = `0.0f`;
1767	a = `1.0f`;
1768	}
1769
1770	STAGE(white_color, NoCtx) {
1771	r = g = b = a = `1.0f`;
1772	}
1773
1774	// load registers r,g,b,a from context (mirrors store_src)
1775	STAGE(load_src, const float* ptr) {
1776	r = sk_unaligned_load<F>(ptr: ptr + `0`*N);
1777	g = sk_unaligned_load<F>(ptr: ptr + `1`*N);
1778	b = sk_unaligned_load<F>(ptr: ptr + `2`*N);
1779	a = sk_unaligned_load<F>(ptr: ptr + `3`*N);
1780	}
1781
1782	// store registers r,g,b,a into context (mirrors load_src)
1783	STAGE(store_src, float* ptr) {
1784	sk_unaligned_store(ptr: ptr + `0`*N, val: r);
1785	sk_unaligned_store(ptr: ptr + `1`*N, val: g);
1786	sk_unaligned_store(ptr: ptr + `2`*N, val: b);
1787	sk_unaligned_store(ptr: ptr + `3`*N, val: a);
1788	}
1789	// store registers r,g into context
1790	STAGE(store_src_rg, float* ptr) {
1791	sk_unaligned_store(ptr: ptr + `0`*N, val: r);
1792	sk_unaligned_store(ptr: ptr + `1`*N, val: g);
1793	}
1794	// load registers r,g from context
1795	STAGE(load_src_rg, float* ptr) {
1796	r = sk_unaligned_load<F>(ptr: ptr + `0`*N);
1797	g = sk_unaligned_load<F>(ptr: ptr + `1`*N);
1798	}
1799	// store register a into context
1800	STAGE(store_src_a, float* ptr) {
1801	sk_unaligned_store(ptr, val: a);
1802	}
1803
1804	// load registers dr,dg,db,da from context (mirrors store_dst)
1805	STAGE(load_dst, const float* ptr) {
1806	dr = sk_unaligned_load<F>(ptr: ptr + `0`*N);
1807	dg = sk_unaligned_load<F>(ptr: ptr + `1`*N);
1808	db = sk_unaligned_load<F>(ptr: ptr + `2`*N);
1809	da = sk_unaligned_load<F>(ptr: ptr + `3`*N);
1810	}
1811
1812	// store registers dr,dg,db,da into context (mirrors load_dst)
1813	STAGE(store_dst, float* ptr) {
1814	sk_unaligned_store(ptr: ptr + `0`*N, val: dr);
1815	sk_unaligned_store(ptr: ptr + `1`*N, val: dg);
1816	sk_unaligned_store(ptr: ptr + `2`*N, val: db);
1817	sk_unaligned_store(ptr: ptr + `3`*N, val: da);
1818	}
1819
// Most blend modes apply the same logic to each channel.
// BLEND_MODE(name) forward-declares name##_channel(), defines the `name` stage that applies it
// to r, g, b and a, and then opens the definition of name##_channel(); the braces that follow
// the macro invocation supply that function's body.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name, NoCtx) {                       \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = name##_channel(a,da,a,da);         \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)
1830
1831	SI F inv(F x) { return `1.0f` - x; }
1832	SI F two(F x) { return x + x; }
1833
1834
1835	BLEND_MODE(clear) { return `0`; }
1836	BLEND_MODE(srcatop) { return sda + dinv(x: sa); }
1837	BLEND_MODE(dstatop) { return dsa + sinv(x: da); }
1838	BLEND_MODE(srcin) { return s * da; }
1839	BLEND_MODE(dstin) { return d * sa; }
1840	BLEND_MODE(srcout) { return s * inv(x: da); }
1841	BLEND_MODE(dstout) { return d * inv(x: sa); }
1842	BLEND_MODE(srcover) { return mad(f: d, m: inv(x: sa), a: s); }
1843	BLEND_MODE(dstover) { return mad(f: s, m: inv(x: da), a: d); }
1844
1845	BLEND_MODE(modulate) { return s*d; }
1846	BLEND_MODE(multiply) { return sinv(x: da) + dinv(x: sa) + s*d; }
1847	BLEND_MODE(plus_) { return min(a: s + d, b: `1.0f`); } // We can clamp to either 1 or sa.
1848	BLEND_MODE(screen) { return s + d - s*d; }
1849	BLEND_MODE(xor_) { return sinv(x: da) + dinv(x: sa); }
1850	#undef BLEND_MODE
1851
// Most other blend modes apply the same logic to colors, and srcover to alpha.
// As above, the braces following BLEND_MODE(name) become the body of name##_channel().
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name, NoCtx) {                       \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = mad(da, inv(a), a);                \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)
1862
1863	BLEND_MODE(darken) { return s + d - max(a: sda, b: dsa) ; }
1864	BLEND_MODE(lighten) { return s + d - min(a: sda, b: dsa) ; }
1865	BLEND_MODE(difference) { return s + d - two(x: min(a: sda, b: dsa)); }
1866	BLEND_MODE(exclusion) { return s + d - two(x: s*d); }
1867
1868	BLEND_MODE(colorburn) {
1869	return if_then_else(c: d == da, t: d + s*inv(x: da),
1870	e: if_then_else(c: s == `0`, / s + / t: d*inv(x: sa),
1871	e: sa(da - min(a: da, b: (da-d)sarcp_fast(v: s))) + sinv(x: da) + d*inv(x: sa)));
1872	}
1873	BLEND_MODE(colordodge) {
1874	return if_then_else(c: d == `0`, / d + / t: s*inv(x: da),
1875	e: if_then_else(c: s == sa, t: s + d*inv(x: sa),
1876	e: samin(a: da, b: (dsa)rcp_fast(v: sa - s)) + sinv(x: da) + d*inv(x: sa)));
1877	}
1878	BLEND_MODE(hardlight) {
1879	return sinv(x: da) + dinv(x: sa)
1880	+ if_then_else(c: two(x: s) <= sa, t: two(x: sd), e: sada - two(x: (da-d)*(sa-s)));
1881	}
1882	BLEND_MODE(overlay) {
1883	return sinv(x: da) + dinv(x: sa)
1884	+ if_then_else(c: two(x: d) <= da, t: two(x: sd), e: sada - two(x: (da-d)*(sa-s)));
1885	}
1886
1887	BLEND_MODE(softlight) {
1888	F m = if_then_else(c: da > `0`, t: d / da, e: `0`),
1889	s2 = two(x: s),
1890	m4 = two(x: two(x: m));
1891
1892	// The logic forks three ways:
1893	// 1. dark src?
1894	// 2. light src, dark dst?
1895	// 3. light src, light dst?
1896	F darkSrc = d(sa + (s2 - sa)(`1.0f` - m)), // Used in case 1.
1897	darkDst = (m4m4 + m4)(m - `1.0f`) + `7.0f`m, // Used in case 2.*
1898	liteDst = sqrt_(v: m) - m,
1899	liteSrc = dsa + da(s2 - sa) * if_then_else(c: two(x: two(x: d)) <= da, t: darkDst, e: liteDst); // 2 or 3?
1900	return sinv(x: da) + dinv(x: sa) + if_then_else(c: s2 <= sa, t: darkSrc, e: liteSrc); // 1 or (2 or 3)?
1901	}
1902	#undef BLEND_MODE
1903
// We're basing our implementation of non-separable blend modes on
1905	// https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1906	// and
1907	// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1908	// They're equivalent, but ES' math has been better simplified.
1909	//
1910	// Anything extra we add beyond that is to make the math work with premul inputs.
1911
1912	SI F sat(F r, F g, F b) { return max(a: r, b: max(a: g,b)) - min(a: r, b: min(a: g,b)); }
1913	SI F lum(F r, F g, F b) { return mad(f: r, m: `0.30f`, a: mad(f: g, m: `0.59f`, a: b*`0.11f`)); }
1914
1915	SI void set_sat(F* r, F* g, F* b, F s) {
1916	F mn = min(a: r, b: min(a: g,b: *b)),
1917	mx = max(a: r, b: max(a: g,b: *b)),
1918	sat = mx - mn;
1919
1920	// Map min channel to 0, max channel to s, and scale the middle proportionally.
1921	auto scale = [=](F c) {
1922	return if_then_else(c: sat == `0`, t: `0`, e: (c - mn) * s / sat);
1923	};
1924	r = scale(r);
1925	g = scale(g);
1926	b = scale(b);
1927	}
1928	SI void set_lum(F* r, F* g, F* b, F l) {
1929	F diff = l - lum(r: r, g: g, b: *b);
1930	*r += diff;
1931	*g += diff;
1932	*b += diff;
1933	}
1934	SI void clip_color(F* r, F* g, F* b, F a) {
1935	F mn = min(a: r, b: min(a: g, b: *b)),
1936	mx = max(a: r, b: max(a: g, b: *b)),
1937	l = lum(r: r, g: g, b: *b);
1938
1939	auto clip = [=](F c) {
1940	c = if_then_else(c: mn < `0` && l != mn, t: l + (c - l) * ( l) / (l - mn), e: c);
1941	c = if_then_else(c: mx > a && l != mx, t: l + (c - l) * (a - l) / (mx - l), e: c);
1942	c = max(a: c, b: `0.0f`); // Sometimes without this we may dip just a little negative.
1943	return c;
1944	};
1945	r = clip(r);
1946	g = clip(g);
1947	b = clip(b);
1948	}
1949
1950	STAGE(hue, NoCtx) {
1951	F R = r*a,
1952	G = g*a,
1953	B = b*a;
1954
1955	set_sat(r: &R, g: &G, b: &B, s: sat(r: dr,g: dg,b: db)*a);
1956	set_lum(r: &R, g: &G, b: &B, l: lum(r: dr,g: dg,b: db)*a);
1957	clip_color(r: &R,g: &G,b: &B, a: a*da);
1958
1959	r = rinv(x: da) + drinv(x: a) + R;
1960	g = ginv(x: da) + dginv(x: a) + G;
1961	b = binv(x: da) + dbinv(x: a) + B;
1962	a = a + da - a*da;
1963	}
1964	STAGE(saturation, NoCtx) {
1965	F R = dr*a,
1966	G = dg*a,
1967	B = db*a;
1968
1969	set_sat(r: &R, g: &G, b: &B, s: sat( r, g, b)*da);
1970	set_lum(r: &R, g: &G, b: &B, l: lum(r: dr,g: dg,b: db)* a); // (This is not redundant.)
1971	clip_color(r: &R,g: &G,b: &B, a: a*da);
1972
1973	r = rinv(x: da) + drinv(x: a) + R;
1974	g = ginv(x: da) + dginv(x: a) + G;
1975	b = binv(x: da) + dbinv(x: a) + B;
1976	a = a + da - a*da;
1977	}
1978	STAGE(color, NoCtx) {
1979	F R = r*da,
1980	G = g*da,
1981	B = b*da;
1982
1983	set_lum(r: &R, g: &G, b: &B, l: lum(r: dr,g: dg,b: db)*a);
1984	clip_color(r: &R,g: &G,b: &B, a: a*da);
1985
1986	r = rinv(x: da) + drinv(x: a) + R;
1987	g = ginv(x: da) + dginv(x: a) + G;
1988	b = binv(x: da) + dbinv(x: a) + B;
1989	a = a + da - a*da;
1990	}
1991	STAGE(luminosity, NoCtx) {
1992	F R = dr*a,
1993	G = dg*a,
1994	B = db*a;
1995
1996	set_lum(r: &R, g: &G, b: &B, l: lum(r,g,b)*da);
1997	clip_color(r: &R,g: &G,b: &B, a: a*da);
1998
1999	r = rinv(x: da) + drinv(x: a) + R;
2000	g = ginv(x: da) + dginv(x: a) + G;
2001	b = binv(x: da) + dbinv(x: a) + B;
2002	a = a + da - a*da;
2003	}
2004
2005	STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2006	auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2007
2008	U32 dst = load<U32>(src: ptr, tail);
2009	dr = cast(v: (dst ) & `0xff`);
2010	dg = cast(v: (dst >> `8`) & `0xff`);
2011	db = cast(v: (dst >> `16`) & `0xff`);
2012	da = cast(v: (dst >> `24`) );
2013	// {dr,dg,db,da} are in [0,255]
2014	// { r, g, b, a} are in [0, 1] (but may be out of gamut)
2015
2016	r = mad(f: dr, m: inv(x: a), a: r*`255.0f`);
2017	g = mad(f: dg, m: inv(x: a), a: g*`255.0f`);
2018	b = mad(f: db, m: inv(x: a), a: b*`255.0f`);
2019	a = mad(f: da, m: inv(x: a), a: a*`255.0f`);
2020	// { r, g, b, a} are now in [0,255] (but may be out of gamut)
2021
2022	// to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased.
2023	dst = to_unorm(v: r, scale: `1`, bias: `255`)
2024	\| to_unorm(v: g, scale: `1`, bias: `255`) << `8`
2025	\| to_unorm(v: b, scale: `1`, bias: `255`) << `16`
2026	\| to_unorm(v: a, scale: `1`, bias: `255`) << `24`;
2027	store(dst: ptr, v: dst, tail);
2028	}
2029
2030	SI F clamp_01_(F v) { return min(a: max(a: `0.0f`, b: v), b: `1.0f`); }
2031
2032	STAGE(clamp_01, NoCtx) {
2033	r = clamp_01_(v: r);
2034	g = clamp_01_(v: g);
2035	b = clamp_01_(v: b);
2036	a = clamp_01_(v: a);
2037	}
2038
2039	STAGE(clamp_gamut, NoCtx) {
2040	a = min(a: max(a, b: `0.0f`), b: `1.0f`);
2041	r = min(a: max(a: r, b: `0.0f`), b: a);
2042	g = min(a: max(a: g, b: `0.0f`), b: a);
2043	b = min(a: max(a: b, b: `0.0f`), b: a);
2044	}
2045
2046	STAGE(set_rgb, const float* rgb) {
2047	r = rgb[`0`];
2048	g = rgb[`1`];
2049	b = rgb[`2`];
2050	}
2051
2052	STAGE(unbounded_set_rgb, const float* rgb) {
2053	r = rgb[`0`];
2054	g = rgb[`1`];
2055	b = rgb[`2`];
2056	}
2057
2058	STAGE(swap_rb, NoCtx) {
2059	auto tmp = r;
2060	r = b;
2061	b = tmp;
2062	}
2063	STAGE(swap_rb_dst, NoCtx) {
2064	auto tmp = dr;
2065	dr = db;
2066	db = tmp;
2067	}
2068
2069	STAGE(move_src_dst, NoCtx) {
2070	dr = r;
2071	dg = g;
2072	db = b;
2073	da = a;
2074	}
2075	STAGE(move_dst_src, NoCtx) {
2076	r = dr;
2077	g = dg;
2078	b = db;
2079	a = da;
2080	}
2081	STAGE(swap_src_dst, NoCtx) {
2082	std::swap(x&: r, y&: dr);
2083	std::swap(x&: g, y&: dg);
2084	std::swap(x&: b, y&: db);
2085	std::swap(x&: a, y&: da);
2086	}
2087
2088	STAGE(premul, NoCtx) {
2089	r = r * a;
2090	g = g * a;
2091	b = b * a;
2092	}
2093	STAGE(premul_dst, NoCtx) {
2094	dr = dr * da;
2095	dg = dg * da;
2096	db = db * da;
2097	}
2098	STAGE(unpremul, NoCtx) {
2099	float inf = sk_bit_cast<float>(src: `0x7f800000`);
2100	auto scale = if_then_else(c: `1.0f`/a < inf, t: `1.0f`/a, e: `0`);
2101	r *= scale;
2102	g *= scale;
2103	b *= scale;
2104	}
2105	STAGE(unpremul_polar, NoCtx) {
2106	float inf = sk_bit_cast<float>(src: `0x7f800000`);
2107	auto scale = if_then_else(c: `1.0f`/a < inf, t: `1.0f`/a, e: `0`);
2108	g *= scale;
2109	b *= scale;
2110	}
2111
2112	STAGE(force_opaque , NoCtx) { a = `1`; }
2113	STAGE(force_opaque_dst, NoCtx) { da = `1`; }
2114
2115	STAGE(rgb_to_hsl, NoCtx) {
2116	F mx = max(a: r, b: max(a: g,b)),
2117	mn = min(a: r, b: min(a: g,b)),
2118	d = mx - mn,
2119	d_rcp = `1.0f` / d;
2120
2121	F h = (`1`/`6.0f`) *
2122	if_then_else(c: mx == mn, t: `0`,
2123	e: if_then_else(c: mx == r, t: (g-b)*d_rcp + if_then_else(c: g < b, t: `6.0f`, e: `0`),
2124	e: if_then_else(c: mx == g, t: (b-r)*d_rcp + `2.0f`,
2125	e: (r-g)*d_rcp + `4.0f`)));
2126
2127	F l = (mx + mn) * `0.5f`;
2128	F s = if_then_else(c: mx == mn, t: `0`,
2129	e: d / if_then_else(c: l > `0.5f`, t: `2.0f`-mx-mn, e: mx+mn));
2130
2131	r = h;
2132	g = s;
2133	b = l;
2134	}
2135	STAGE(hsl_to_rgb, NoCtx) {
2136	// See GrRGBToHSLFilterEffect.fp
2137
2138	F h = r,
2139	s = g,
2140	l = b,
2141	c = (`1.0f` - abs_(v: `2.0f` * l - `1`)) * s;
2142
2143	auto hue_to_rgb = [&](F hue) {
2144	F q = clamp_01_(v: abs_(v: fract(v: hue) * `6.0f` - `3.0f`) - `1.0f`);
2145	return (q - `0.5f`) * c + l;
2146	};
2147
2148	r = hue_to_rgb(h + `0.0f`/`3.0f`);
2149	g = hue_to_rgb(h + `2.0f`/`3.0f`);
2150	b = hue_to_rgb(h + `1.0f`/`3.0f`);
2151	}
2152
2153	// Color conversion functions used in gradient interpolation, based on
2154	// https://www.w3.org/TR/css-color-4/#color-conversion-code
2155	STAGE(css_lab_to_xyz, NoCtx) {
2156	constexpr float k = `24389` / `27.0f`;
2157	constexpr float e = `216` / `24389.0f`;
2158
2159	F f[`3`];
2160	f[`1`] = (r + `16`) * (`1` / `116.0f`);
2161	f[`0`] = (g * (`1` / `500.0f`)) + f[`1`];
2162	f[`2`] = f[`1`] - (b * (`1` / `200.0f`));
2163
2164	F f_cubed[`3`] = { f[`0`]f[`0`]f[`0`], f[`1`]f[`1`]f[`1`], f[`2`]f[`2`]f[`2`] };
2165
2166	F xyz[`3`] = {
2167	if_then_else(c: f_cubed[`0`] > e, t: f_cubed[`0`], e: (`116` * f[`0`] - `16`) * (`1` / k)),
2168	if_then_else(c: r > k * e, t: f_cubed[`1`], e: r * (`1` / k)),
2169	if_then_else(c: f_cubed[`2`] > e, t: f_cubed[`2`], e: (`116` * f[`2`] - `16`) * (`1` / k))
2170	};
2171
2172	constexpr float D50[`3`] = { `0.3457f` / `0.3585f`, `1.0f`, (`1.0f` - `0.3457f` - `0.3585f`) / `0.3585f` };
2173	r = xyz[`0`]*D50[`0`];
2174	g = xyz[`1`]*D50[`1`];
2175	b = xyz[`2`]*D50[`2`];
2176	}
2177
2178	STAGE(css_oklab_to_linear_srgb, NoCtx) {
2179	F l_ = r + `0.3963377774f` * g + `0.2158037573f` * b,
2180	m_ = r - `0.1055613458f` * g - `0.0638541728f` * b,
2181	s_ = r - `0.0894841775f` * g - `1.2914855480f` * b;
2182
2183	F l = l_l_l_,
2184	m = m_m_m_,
2185	s = s_s_s_;
2186
2187	r = +`4.0767416621f` * l - `3.3077115913f` * m + `0.2309699292f` * s;
2188	g = -`1.2684380046f` * l + `2.6097574011f` * m - `0.3413193965f` * s;
2189	b = -`0.0041960863f` * l - `0.7034186147f` * m + `1.7076147010f` * s;
2190	}
2191
2192	// Skia stores all polar colors with hue in the first component, so this "LCH -> Lab" transform
2193	// actually takes "HCL". This is also used to do the same polar transform for OkHCL to OkLAB.
2194	// See similar comments & logic in SkGradientBaseShader.cpp.
2195	STAGE(css_hcl_to_lab, NoCtx) {
2196	F H = r,
2197	C = g,
2198	L = b;
2199
2200	F hueRadians = H * (SK_FloatPI / `180`);
2201
2202	r = L;
2203	g = C * cos_(x: hueRadians);
2204	b = C * sin_(x: hueRadians);
2205	}
2206
2207	SI F mod_(F x, float y) {
2208	return x - y * floor_(v: x * (`1` / y));
2209	}
2210
2211	struct RGB { F r, g, b; };
2212
2213	SI RGB css_hsl_to_srgb_(F h, F s, F l) {
2214	h = mod_(x: h, y: `360`);
2215
2216	s *= `0.01f`;
2217	l *= `0.01f`;
2218
2219	F k[`3`] = {
2220	mod_(x: `0` + h * (`1` / `30.0f`), y: `12`),
2221	mod_(x: `8` + h * (`1` / `30.0f`), y: `12`),
2222	mod_(x: `4` + h * (`1` / `30.0f`), y: `12`)
2223	};
2224	F a = s * min(a: l, b: `1` - l);
2225	return {
2226	.r: l - a * max(a: -`1.0f`, b: min(a: min(a: k[`0`] - `3.0f`, b: `9.0f` - k[`0`]), b: `1.0f`)),
2227	.g: l - a * max(a: -`1.0f`, b: min(a: min(a: k[`1`] - `3.0f`, b: `9.0f` - k[`1`]), b: `1.0f`)),
2228	.b: l - a * max(a: -`1.0f`, b: min(a: min(a: k[`2`] - `3.0f`, b: `9.0f` - k[`2`]), b: `1.0f`))
2229	};
2230	}
2231
2232	STAGE(css_hsl_to_srgb, NoCtx) {
2233	RGB rgb = css_hsl_to_srgb_(h: r, s: g, l: b);
2234	r = rgb.r;
2235	g = rgb.g;
2236	b = rgb.b;
2237	}
2238
2239	STAGE(css_hwb_to_srgb, NoCtx) {
2240	g *= `0.01f`;
2241	b *= `0.01f`;
2242
2243	F gray = g / (g + b);
2244
2245	RGB rgb = css_hsl_to_srgb_(h: r, s: `100.0f`, l: `50.0f`);
2246	rgb.r = rgb.r * (`1` - g - b) + g;
2247	rgb.g = rgb.g * (`1` - g - b) + g;
2248	rgb.b = rgb.b * (`1` - g - b) + g;
2249
2250	auto isGray = (g + b) >= `1`;
2251
2252	r = if_then_else(c: isGray, t: gray, e: rgb.r);
2253	g = if_then_else(c: isGray, t: gray, e: rgb.g);
2254	b = if_then_else(c: isGray, t: gray, e: rgb.b);
2255	}
2256
2257	// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
2258	SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) {
2259	return if_then_else(c: a < da, t: min(a: cr, b: min(a: cg,b: cb))
2260	, e: max(a: cr, b: max(a: cg,b: cb)));
2261	}
2262
2263	STAGE(scale_1_float, const float* c) {
2264	r = r * *c;
2265	g = g * *c;
2266	b = b * *c;
2267	a = a * *c;
2268	}
2269	STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
2270	auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2271
2272	auto scales = load<U8>(src: ptr, tail);
2273	auto c = from_byte(b: scales);
2274
2275	r = r * c;
2276	g = g * c;
2277	b = b * c;
2278	a = a * c;
2279	}
2280	STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
2281	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2282
2283	F cr,cg,cb;
2284	from_565(565: load<U16>(src: ptr, tail), r: &cr, g: &cg, b: &cb);
2285
2286	F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
2287
2288	r = r * cr;
2289	g = g * cg;
2290	b = b * cb;
2291	a = a * ca;
2292	}
2293
2294	SI F lerp(F from, F to, F t) {
2295	return mad(f: to-from, m: t, a: from);
2296	}
2297
2298	STAGE(lerp_1_float, const float* c) {
2299	r = lerp(from: dr, to: r, t: *c);
2300	g = lerp(from: dg, to: g, t: *c);
2301	b = lerp(from: db, to: b, t: *c);
2302	a = lerp(from: da, to: a, t: *c);
2303	}
2304	STAGE(scale_native, const float scales[]) {
2305	auto c = sk_unaligned_load<F>(ptr: scales);
2306	r = r * c;
2307	g = g * c;
2308	b = b * c;
2309	a = a * c;
2310	}
2311	STAGE(lerp_native, const float scales[]) {
2312	auto c = sk_unaligned_load<F>(ptr: scales);
2313	r = lerp(from: dr, to: r, t: c);
2314	g = lerp(from: dg, to: g, t: c);
2315	b = lerp(from: db, to: b, t: c);
2316	a = lerp(from: da, to: a, t: c);
2317	}
2318	STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
2319	auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2320
2321	auto scales = load<U8>(src: ptr, tail);
2322	auto c = from_byte(b: scales);
2323
2324	r = lerp(from: dr, to: r, t: c);
2325	g = lerp(from: dg, to: g, t: c);
2326	b = lerp(from: db, to: b, t: c);
2327	a = lerp(from: da, to: a, t: c);
2328	}
2329	STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
2330	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2331
2332	F cr,cg,cb;
2333	from_565(565: load<U16>(src: ptr, tail), r: &cr, g: &cg, b: &cb);
2334
2335	F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
2336
2337	r = lerp(from: dr, to: r, t: cr);
2338	g = lerp(from: dg, to: g, t: cg);
2339	b = lerp(from: db, to: b, t: cb);
2340	a = lerp(from: da, to: a, t: ca);
2341	}
2342
2343	STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
2344	auto mptr = ptr_at_xy<const uint8_t>(ctx: &ctx->mul, dx,dy),
2345	aptr = ptr_at_xy<const uint8_t>(ctx: &ctx->add, dx,dy);
2346
2347	F mul = from_byte(b: load<U8>(src: mptr, tail)),
2348	add = from_byte(b: load<U8>(src: aptr, tail));
2349
2350	r = mad(f: r, m: mul, a: add);
2351	g = mad(f: g, m: mul, a: add);
2352	b = mad(f: b, m: mul, a: add);
2353	}
2354
2355	STAGE(byte_tables, const SkRasterPipeline_TablesCtx* tables) {
2356	r = from_byte(b: gather(p: tables->r, ix: to_unorm(v: r, scale: `255`)));
2357	g = from_byte(b: gather(p: tables->g, ix: to_unorm(v: g, scale: `255`)));
2358	b = from_byte(b: gather(p: tables->b, ix: to_unorm(v: b, scale: `255`)));
2359	a = from_byte(b: gather(p: tables->a, ix: to_unorm(v: a, scale: `255`)));
2360	}
2361
2362	SI F strip_sign(F x, U32* sign) {
2363	U32 bits = sk_bit_cast<U32>(src: x);
2364	*sign = bits & `0x80000000`;
2365	return sk_bit_cast<F>(src: bits ^ *sign);
2366	}
2367
2368	SI F apply_sign(F x, U32 sign) {
2369	return sk_bit_cast<F>(src: sign \| sk_bit_cast<U32>(src: x));
2370	}
2371
2372	STAGE(parametric, const skcms_TransferFunction* ctx) {
2373	auto fn = [&](F v) {
2374	U32 sign;
2375	v = strip_sign(x: v, sign: &sign);
2376
2377	F r = if_then_else(c: v <= ctx->d, t: mad(f: ctx->c, m: v, a: ctx->f)
2378	, e: approx_powf(x: mad(f: ctx->a, m: v, a: ctx->b), y: ctx->g) + ctx->e);
2379	return apply_sign(x: r, sign);
2380	};
2381	r = fn(r);
2382	g = fn(g);
2383	b = fn(b);
2384	}
2385
2386	STAGE(gamma_, const float* G) {
2387	auto fn = [&](F v) {
2388	U32 sign;
2389	v = strip_sign(x: v, sign: &sign);
2390	return apply_sign(x: approx_powf(x: v, y: *G), sign);
2391	};
2392	r = fn(r);
2393	g = fn(g);
2394	b = fn(b);
2395	}
2396
2397	STAGE(PQish, const skcms_TransferFunction* ctx) {
2398	auto fn = [&](F v) {
2399	U32 sign;
2400	v = strip_sign(x: v, sign: &sign);
2401
2402	F r = approx_powf(x: max(a: mad(f: ctx->b, m: approx_powf(x: v, y: ctx->c), a: ctx->a), b: `0.0f`)
2403	/ (mad(f: ctx->e, m: approx_powf(x: v, y: ctx->c), a: ctx->d)),
2404	y: ctx->f);
2405
2406	return apply_sign(x: r, sign);
2407	};
2408	r = fn(r);
2409	g = fn(g);
2410	b = fn(b);
2411	}
2412
2413	STAGE(HLGish, const skcms_TransferFunction* ctx) {
2414	auto fn = [&](F v) {
2415	U32 sign;
2416	v = strip_sign(x: v, sign: &sign);
2417
2418	const float R = ctx->a, G = ctx->b,
2419	a = ctx->c, b = ctx->d, c = ctx->e,
2420	K = ctx->f + `1.0f`;
2421
2422	F r = if_then_else(c: vR <= `1`, t: approx_powf(x: vR, y: G)
2423	, e: approx_exp(x: (v-c)*a) + b);
2424
2425	return K * apply_sign(x: r, sign);
2426	};
2427	r = fn(r);
2428	g = fn(g);
2429	b = fn(b);
2430	}
2431
2432	STAGE(HLGinvish, const skcms_TransferFunction* ctx) {
2433	auto fn = [&](F v) {
2434	U32 sign;
2435	v = strip_sign(x: v, sign: &sign);
2436
2437	const float R = ctx->a, G = ctx->b,
2438	a = ctx->c, b = ctx->d, c = ctx->e,
2439	K = ctx->f + `1.0f`;
2440
2441	v /= K;
2442	F r = if_then_else(c: v <= `1`, t: R * approx_powf(x: v, y: G)
2443	, e: a * approx_log(x: v - b) + c);
2444
2445	return apply_sign(x: r, sign);
2446	};
2447	r = fn(r);
2448	g = fn(g);
2449	b = fn(b);
2450	}
2451
2452	STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
2453	auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2454
2455	r = g = b = `0.0f`;
2456	a = from_byte(b: load<U8>(src: ptr, tail));
2457	}
2458	STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2459	auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2460
2461	dr = dg = db = `0.0f`;
2462	da = from_byte(b: load<U8>(src: ptr, tail));
2463	}
2464	STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
2465	const uint8_t* ptr;
2466	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
2467	r = g = b = `0.0f`;
2468	a = from_byte(b: gather(p: ptr, ix));
2469	}
2470	STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
2471	auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
2472
2473	U8 packed = pack(v: pack(v: to_unorm(v: a, scale: `255`)));
2474	store(dst: ptr, v: packed, tail);
2475	}
2476	STAGE(store_r8, const SkRasterPipeline_MemoryCtx* ctx) {
2477	auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
2478
2479	U8 packed = pack(v: pack(v: to_unorm(v: r, scale: `255`)));
2480	store(dst: ptr, v: packed, tail);
2481	}
2482
2483	STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
2484	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2485
2486	from_565(565: load<U16>(src: ptr, tail), r: &r,g: &g,b: &b);
2487	a = `1.0f`;
2488	}
2489	STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2490	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2491
2492	from_565(565: load<U16>(src: ptr, tail), r: &dr,g: &dg,b: &db);
2493	da = `1.0f`;
2494	}
2495	STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
2496	const uint16_t* ptr;
2497	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
2498	from_565(565: gather(p: ptr, ix), r: &r,g: &g,b: &b);
2499	a = `1.0f`;
2500	}
2501	STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
2502	auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2503
2504	U16 px = pack( v: to_unorm(v: r, scale: `31`) << `11`
2505	\| to_unorm(v: g, scale: `63`) << `5`
2506	\| to_unorm(v: b, scale: `31`) );
2507	store(dst: ptr, v: px, tail);
2508	}
2509
2510	STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
2511	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2512	from_4444(4444: load<U16>(src: ptr, tail), r: &r,g: &g,b: &b,a: &a);
2513	}
2514	STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2515	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2516	from_4444(4444: load<U16>(src: ptr, tail), r: &dr,g: &dg,b: &db,a: &da);
2517	}
2518	STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
2519	const uint16_t* ptr;
2520	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
2521	from_4444(4444: gather(p: ptr, ix), r: &r,g: &g,b: &b,a: &a);
2522	}
2523	STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
2524	auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2525	U16 px = pack( v: to_unorm(v: r, scale: `15`) << `12`
2526	\| to_unorm(v: g, scale: `15`) << `8`
2527	\| to_unorm(v: b, scale: `15`) << `4`
2528	\| to_unorm(v: a, scale: `15`) );
2529	store(dst: ptr, v: px, tail);
2530	}
2531
2532	STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2533	auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2534	from_8888(8888: load<U32>(src: ptr, tail), r: &r,g: &g,b: &b,a: &a);
2535	}
2536	STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2537	auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2538	from_8888(8888: load<U32>(src: ptr, tail), r: &dr,g: &dg,b: &db,a: &da);
2539	}
2540	STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
2541	const uint32_t* ptr;
2542	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
2543	from_8888(8888: gather(p: ptr, ix), r: &r,g: &g,b: &b,a: &a);
2544	}
2545	STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2546	auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2547
2548	U32 px = to_unorm(v: r, scale: `255`)
2549	\| to_unorm(v: g, scale: `255`) << `8`
2550	\| to_unorm(v: b, scale: `255`) << `16`
2551	\| to_unorm(v: a, scale: `255`) << `24`;
2552	store(dst: ptr, v: px, tail);
2553	}
2554
2555	STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
2556	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2557	from_88(88: load<U16>(src: ptr, tail), r: &r, g: &g);
2558	b = `0`;
2559	a = `1`;
2560	}
2561	STAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2562	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2563	from_88(88: load<U16>(src: ptr, tail), r: &dr, g: &dg);
2564	db = `0`;
2565	da = `1`;
2566	}
2567	STAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) {
2568	const uint16_t* ptr;
2569	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
2570	from_88(88: gather(p: ptr, ix), r: &r, g: &g);
2571	b = `0`;
2572	a = `1`;
2573	}
2574	STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
2575	auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy);
2576	U16 px = pack( v: to_unorm(v: r, scale: `255`) \| to_unorm(v: g, scale: `255`) << `8` );
2577	store(dst: ptr, v: px, tail);
2578	}
2579
2580	STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2581	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2582	r = g = b = `0`;
2583	a = from_short(s: load<U16>(src: ptr, tail));
2584	}
2585	STAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2586	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2587	dr = dg = db = `0.0f`;
2588	da = from_short(s: load<U16>(src: ptr, tail));
2589	}
2590	STAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) {
2591	const uint16_t* ptr;
2592	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
2593	r = g = b = `0.0f`;
2594	a = from_short(s: gather(p: ptr, ix));
2595	}
2596	STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2597	auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2598
2599	U16 px = pack(v: to_unorm(v: a, scale: `65535`));
2600	store(dst: ptr, v: px, tail);
2601	}
2602
2603	STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2604	auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2605	b = `0`; a = `1`;
2606	from_1616(1616: load<U32>(src: ptr, tail), r: &r,g: &g);
2607	}
2608	STAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2609	auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2610	from_1616(1616: load<U32>(src: ptr, tail), r: &dr, g: &dg);
2611	db = `0`;
2612	da = `1`;
2613	}
2614	STAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) {
2615	const uint32_t* ptr;
2616	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
2617	from_1616(1616: gather(p: ptr, ix), r: &r, g: &g);
2618	b = `0`;
2619	a = `1`;
2620	}
2621	STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2622	auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2623
2624	U32 px = to_unorm(v: r, scale: `65535`)
2625	\| to_unorm(v: g, scale: `65535`) << `16`;
2626	store(dst: ptr, v: px, tail);
2627	}
2628
2629	STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
2630	auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2631	from_16161616(16161616: load<U64>(src: ptr, tail), r: &r,g: &g, b: &b, a: &a);
2632	}
2633	STAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2634	auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2635	from_16161616(16161616: load<U64>(src: ptr, tail), r: &dr, g: &dg, b: &db, a: &da);
2636	}
2637	STAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) {
2638	const uint64_t* ptr;
2639	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
2640	from_16161616(16161616: gather(p: ptr, ix), r: &r, g: &g, b: &b, a: &a);
2641	}
2642	STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
2643	auto ptr = ptr_at_xy<uint16_t>(ctx, dx: `4`dx,dy: `4`dy);
2644
2645	U16 R = pack(v: to_unorm(v: r, scale: `65535`)),
2646	G = pack(v: to_unorm(v: g, scale: `65535`)),
2647	B = pack(v: to_unorm(v: b, scale: `65535`)),
2648	A = pack(v: to_unorm(v: a, scale: `65535`));
2649
2650	store4(ptr,tail, r: R,g: G,b: B,a: A);
2651	}
2652
2653
2654	STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
2655	auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2656	from_1010102(rgba: load<U32>(src: ptr, tail), r: &r,g: &g,b: &b,a: &a);
2657	}
2658	STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2659	auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2660	from_1010102(rgba: load<U32>(src: ptr, tail), r: &dr,g: &dg,b: &db,a: &da);
2661	}
2662	STAGE(load_1010102_xr, const SkRasterPipeline_MemoryCtx* ctx) {
2663	auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2664	from_1010102_xr(rgba: load<U32>(src: ptr, tail), r: &r,g: &g,b: &b,a: &a);
2665	}
2666	STAGE(load_1010102_xr_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2667	auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2668	from_1010102_xr(rgba: load<U32>(src: ptr, tail), r: &dr,g: &dg,b: &db,a: &da);
2669	}
2670	STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) {
2671	const uint32_t* ptr;
2672	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
2673	from_1010102(rgba: gather(p: ptr, ix), r: &r,g: &g,b: &b,a: &a);
2674	}
2675	STAGE(gather_1010102_xr, const SkRasterPipeline_GatherCtx* ctx) {
2676	const uint32_t* ptr;
2677	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
2678	from_1010102_xr(rgba: gather(p: ptr, ix), r: &r,g: &g,b: &b,a: &a);
2679	}
2680	STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
2681	auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2682
2683	U32 px = to_unorm(v: r, scale: `1023`)
2684	\| to_unorm(v: g, scale: `1023`) << `10`
2685	\| to_unorm(v: b, scale: `1023`) << `20`
2686	\| to_unorm(v: a, scale: `3`) << `30`;
2687	store(dst: ptr, v: px, tail);
2688	}
2689	STAGE(store_1010102_xr, const SkRasterPipeline_MemoryCtx* ctx) {
2690	auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2691	static constexpr float min = -`0.752941f`;
2692	static constexpr float max = `1.25098f`;
2693	static constexpr float range = max - min;
2694	U32 px = to_unorm(v: (r - min) / range, scale: `1023`)
2695	\| to_unorm(v: (g - min) / range, scale: `1023`) << `10`
2696	\| to_unorm(v: (b - min) / range, scale: `1023`) << `20`
2697	\| to_unorm(v: a, scale: `3`) << `30`;
2698	store(dst: ptr, v: px, tail);
2699	}
2700
2701	STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) {
2702	auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2703
2704	U16 R,G,B,A;
2705	load4(ptr: (const uint16_t*)ptr,tail, r: &R,g: &G,b: &B,a: &A);
2706	r = from_half(h: R);
2707	g = from_half(h: G);
2708	b = from_half(h: B);
2709	a = from_half(h: A);
2710	}
2711	STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2712	auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2713
2714	U16 R,G,B,A;
2715	load4(ptr: (const uint16_t*)ptr,tail, r: &R,g: &G,b: &B,a: &A);
2716	dr = from_half(h: R);
2717	dg = from_half(h: G);
2718	db = from_half(h: B);
2719	da = from_half(h: A);
2720	}
2721	STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) {
2722	const uint64_t* ptr;
2723	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
2724	auto px = gather(p: ptr, ix);
2725
2726	U16 R,G,B,A;
2727	load4(ptr: (const uint16_t*)&px,tail: `0`, r: &R,g: &G,b: &B,a: &A);
2728	r = from_half(h: R);
2729	g = from_half(h: G);
2730	b = from_half(h: B);
2731	a = from_half(h: A);
2732	}
2733	STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) {
2734	auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy);
2735	store4(ptr: (uint16_t*)ptr,tail, r: to_half(f: r)
2736	, g: to_half(f: g)
2737	, b: to_half(f: b)
2738	, a: to_half(f: a));
2739	}
2740
2741	STAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) {
2742	auto ptr = ptr_at_xy<uint16_t>(ctx, dx: `4`*dx,dy);
2743
2744	U16 R = bswap(x: pack(v: to_unorm(v: r, scale: `65535`))),
2745	G = bswap(x: pack(v: to_unorm(v: g, scale: `65535`))),
2746	B = bswap(x: pack(v: to_unorm(v: b, scale: `65535`))),
2747	A = bswap(x: pack(v: to_unorm(v: a, scale: `65535`)));
2748
2749	store4(ptr,tail, r: R,g: G,b: B,a: A);
2750	}
2751
2752	STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) {
2753	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2754
2755	U16 A = load<U16>(src: (const uint16_t*)ptr, tail);
2756	r = `0`;
2757	g = `0`;
2758	b = `0`;
2759	a = from_half(h: A);
2760	}
2761	STAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2762	auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2763
2764	U16 A = load<U16>(src: (const uint16_t*)ptr, tail);
2765	dr = dg = db = `0.0f`;
2766	da = from_half(h: A);
2767	}
2768	STAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) {
2769	const uint16_t* ptr;
2770	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
2771	r = g = b = `0.0f`;
2772	a = from_half(h: gather(p: ptr, ix));
2773	}
2774	STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) {
2775	auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2776	store(dst: ptr, v: to_half(f: a), tail);
2777	}
2778
2779	STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
2780	auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2781
2782	U16 R,G;
2783	load2(ptr: (const uint16_t*)ptr, tail, r: &R, g: &G);
2784	r = from_half(h: R);
2785	g = from_half(h: G);
2786	b = `0`;
2787	a = `1`;
2788	}
2789	STAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2790	auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2791
2792	U16 R,G;
2793	load2(ptr: (const uint16_t*)ptr, tail, r: &R, g: &G);
2794	dr = from_half(h: R);
2795	dg = from_half(h: G);
2796	db = `0`;
2797	da = `1`;
2798	}
2799	STAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) {
2800	const uint32_t* ptr;
2801	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r, y: g);
2802	auto px = gather(p: ptr, ix);
2803
2804	U16 R,G;
2805	load2(ptr: (const uint16_t*)&px, tail: `0`, r: &R, g: &G);
2806	r = from_half(h: R);
2807	g = from_half(h: G);
2808	b = `0`;
2809	a = `1`;
2810	}
2811	STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
2812	auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy);
2813	store2(ptr: (uint16_t*)ptr, tail, r: to_half(f: r)
2814	, g: to_half(f: g));
2815	}
2816
2817	STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) {
2818	auto ptr = ptr_at_xy<const float>(ctx, dx: `4`dx,dy: `4`dy);
2819	load4(ptr,tail, r: &r,g: &g,b: &b,a: &a);
2820	}
2821	STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2822	auto ptr = ptr_at_xy<const float>(ctx, dx: `4`dx,dy: `4`dy);
2823	load4(ptr,tail, r: &dr,g: &dg,b: &db,a: &da);
2824	}
2825	STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) {
2826	const float* ptr;
2827	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: r,y: g);
2828	r = gather(p: ptr, ix: `4`*ix + `0`);
2829	g = gather(p: ptr, ix: `4`*ix + `1`);
2830	b = gather(p: ptr, ix: `4`*ix + `2`);
2831	a = gather(p: ptr, ix: `4`*ix + `3`);
2832	}
2833	STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) {
2834	auto ptr = ptr_at_xy<float>(ctx, dx: `4`dx,dy: `4`dy);
2835	store4(ptr,tail, r,g,b,a);
2836	}
2837
2838	STAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) {
2839	auto ptr = ptr_at_xy<const float>(ctx, dx: `2`dx,dy: `2`dy);
2840	load2(ptr, tail, r: &r, g: &g);
2841	b = `0`;
2842	a = `1`;
2843	}
2844	STAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) {
2845	auto ptr = ptr_at_xy<float>(ctx, dx: `2`dx,dy: `2`dy);
2846	store2(ptr, tail, r, g);
2847	}
2848
2849	SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) {
2850	return v - floor_(v: vctx->invScale)ctx->scale;
2851	}
2852	SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) {
2853	auto limit = ctx->scale;
2854	auto invLimit = ctx->invScale;
2855
2856	// This is "repeat" over the range 0..2limit*
2857	auto u = v - floor_(v: vinvLimit`0.5f`)`2`limit;
2858	// s will be 0 when moving forward (e.g. [0, limit)) and 1 when moving backward (e.g.
2859	// [limit, 2limit)).*
2860	auto s = floor_(v: u*invLimit);
2861	// This is the mirror result.
2862	auto m = u - `2`s(u - limit);
2863	// Apply a bias to m if moving backwards so that we snap consistently at exact integer coords in
2864	// the logical infinite image. This is tested by mirror_tile GM. Note that all values
2865	// that have a non-zero bias applied are > 0.
2866	auto biasInUlps = trunc_(v: s);
2867	return sk_bit_cast<F>(src: sk_bit_cast<U32>(src: m) + ctx->mirrorBiasDir*biasInUlps);
2868	}
2869	// Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
2870	// The gather stages will hard clamp the output of these stages to [0,limit)...
2871	// we just need to do the basic repeat or mirroring.
2872	STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(v: r, ctx); }
2873	STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(v: g, ctx); }
2874	STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(v: r, ctx); }
2875	STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(v: g, ctx); }
2876
2877	STAGE( clamp_x_1, NoCtx) { r = clamp_01_(v: r); }
2878	STAGE(repeat_x_1, NoCtx) { r = clamp_01_(v: r - floor_(v: r)); }
2879	STAGE(mirror_x_1, NoCtx) { r = clamp_01_(v: abs_( v: (r-`1.0f`) - two(x: floor_(v: (r-`1.0f`)*`0.5f`)) - `1.0f` )); }
2880
2881	STAGE(clamp_x_and_y, const SkRasterPipeline_CoordClampCtx* ctx) {
2882	r = min(a: ctx->max_x, b: max(a: ctx->min_x, b: r));
2883	g = min(a: ctx->max_y, b: max(a: ctx->min_y, b: g));
2884	}
2885
2886	// Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain:
2887	// mask == 0x00000000 if the coordinate(s) are out of bounds
2888	// mask == 0xFFFFFFFF if the coordinate(s) are in bounds
2889	// After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0
2890	// if either of the coordinates were out of bounds.
2891
2892	STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
2893	auto w = ctx->limit_x;
2894	auto e = ctx->inclusiveEdge_x;
2895	auto cond = ((`0` < r) & (r < w)) \| (r == e);
2896	sk_unaligned_store(ptr: ctx->mask, val: cond_to_mask(cond));
2897	}
2898	STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
2899	auto h = ctx->limit_y;
2900	auto e = ctx->inclusiveEdge_y;
2901	auto cond = ((`0` < g) & (g < h)) \| (g == e);
2902	sk_unaligned_store(ptr: ctx->mask, val: cond_to_mask(cond));
2903	}
2904	STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
2905	auto w = ctx->limit_x;
2906	auto h = ctx->limit_y;
2907	auto ex = ctx->inclusiveEdge_x;
2908	auto ey = ctx->inclusiveEdge_y;
2909	auto cond = (((`0` < r) & (r < w)) \| (r == ex))
2910	& (((`0` < g) & (g < h)) \| (g == ey));
2911	sk_unaligned_store(ptr: ctx->mask, val: cond_to_mask(cond));
2912	}
2913	STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
2914	auto mask = sk_unaligned_load<U32>(ptr: ctx->mask);
2915	r = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: r) & mask);
2916	g = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: g) & mask);
2917	b = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: b) & mask);
2918	a = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: a) & mask);
2919	}
2920
2921	STAGE(alpha_to_gray, NoCtx) {
2922	r = g = b = a;
2923	a = `1`;
2924	}
2925	STAGE(alpha_to_gray_dst, NoCtx) {
2926	dr = dg = db = da;
2927	da = `1`;
2928	}
2929	STAGE(alpha_to_red, NoCtx) {
2930	r = a;
2931	a = `1`;
2932	}
2933	STAGE(alpha_to_red_dst, NoCtx) {
2934	dr = da;
2935	da = `1`;
2936	}
2937
2938	STAGE(bt709_luminance_or_luma_to_alpha, NoCtx) {
2939	a = r`0.2126f` + g`0.7152f` + b*`0.0722f`;
2940	r = g = b = `0`;
2941	}
2942	STAGE(bt709_luminance_or_luma_to_rgb, NoCtx) {
2943	r = g = b = r`0.2126f` + g`0.7152f` + b*`0.0722f`;
2944	}
2945
2946	STAGE(matrix_translate, const float* m) {
2947	r += m[`0`];
2948	g += m[`1`];
2949	}
2950	STAGE(matrix_scale_translate, const float* m) {
2951	r = mad(f: r,m: m[`0`], a: m[`2`]);
2952	g = mad(f: g,m: m[`1`], a: m[`3`]);
2953	}
2954	STAGE(matrix_2x3, const float* m) {
2955	auto R = mad(f: r,m: m[`0`], a: mad(f: g,m: m[`1`], a: m[`2`])),
2956	G = mad(f: r,m: m[`3`], a: mad(f: g,m: m[`4`], a: m[`5`]));
2957	r = R;
2958	g = G;
2959	}
2960	STAGE(matrix_3x3, const float* m) {
2961	auto R = mad(f: r,m: m[`0`], a: mad(f: g,m: m[`3`], a: b*m[`6`])),
2962	G = mad(f: r,m: m[`1`], a: mad(f: g,m: m[`4`], a: b*m[`7`])),
2963	B = mad(f: r,m: m[`2`], a: mad(f: g,m: m[`5`], a: b*m[`8`]));
2964	r = R;
2965	g = G;
2966	b = B;
2967	}
2968	STAGE(matrix_3x4, const float* m) {
2969	auto R = mad(f: r,m: m[`0`], a: mad(f: g,m: m[`3`], a: mad(f: b,m: m[`6`], a: m[ `9`]))),
2970	G = mad(f: r,m: m[`1`], a: mad(f: g,m: m[`4`], a: mad(f: b,m: m[`7`], a: m[`10`]))),
2971	B = mad(f: r,m: m[`2`], a: mad(f: g,m: m[`5`], a: mad(f: b,m: m[`8`], a: m[`11`])));
2972	r = R;
2973	g = G;
2974	b = B;
2975	}
2976	STAGE(matrix_4x5, const float* m) {
2977	auto R = mad(f: r,m: m[ `0`], a: mad(f: g,m: m[ `1`], a: mad(f: b,m: m[ `2`], a: mad(f: a,m: m[ `3`], a: m[ `4`])))),
2978	G = mad(f: r,m: m[ `5`], a: mad(f: g,m: m[ `6`], a: mad(f: b,m: m[ `7`], a: mad(f: a,m: m[ `8`], a: m[ `9`])))),
2979	B = mad(f: r,m: m[`10`], a: mad(f: g,m: m[`11`], a: mad(f: b,m: m[`12`], a: mad(f: a,m: m[`13`], a: m[`14`])))),
2980	A = mad(f: r,m: m[`15`], a: mad(f: g,m: m[`16`], a: mad(f: b,m: m[`17`], a: mad(f: a,m: m[`18`], a: m[`19`]))));
2981	r = R;
2982	g = G;
2983	b = B;
2984	a = A;
2985	}
2986	STAGE(matrix_4x3, const float* m) {
2987	auto X = r,
2988	Y = g;
2989
2990	r = mad(f: X, m: m[`0`], a: mad(f: Y, m: m[`4`], a: m[ `8`]));
2991	g = mad(f: X, m: m[`1`], a: mad(f: Y, m: m[`5`], a: m[ `9`]));
2992	b = mad(f: X, m: m[`2`], a: mad(f: Y, m: m[`6`], a: m[`10`]));
2993	a = mad(f: X, m: m[`3`], a: mad(f: Y, m: m[`7`], a: m[`11`]));
2994	}
2995	STAGE(matrix_perspective, const float* m) {
2996	// N.B. Unlike the other matrix_ stages, this matrix is row-major.
2997	auto R = mad(f: r,m: m[`0`], a: mad(f: g,m: m[`1`], a: m[`2`])),
2998	G = mad(f: r,m: m[`3`], a: mad(f: g,m: m[`4`], a: m[`5`])),
2999	Z = mad(f: r,m: m[`6`], a: mad(f: g,m: m[`7`], a: m[`8`]));
3000	r = R * rcp_precise(v: Z);
3001	g = G * rcp_precise(v: Z);
3002	}
3003
3004	SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
3005	F* r, F* g, F* b, F* a) {
3006	F fr, br, fg, bg, fb, bb, fa, ba;
3007	#if defined(JUMPER_IS_HSW)
3008	if (c->stopCount <=`8`) {
3009	fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`0`]), idx);
3010	br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`0`]), idx);
3011	fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`1`]), idx);
3012	bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`1`]), idx);
3013	fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`2`]), idx);
3014	bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`2`]), idx);
3015	fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`3`]), idx);
3016	ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`3`]), idx);
3017	} else
3018	#endif
3019	{
3020	fr = gather(p: c->fs[`0`], ix: idx);
3021	br = gather(p: c->bs[`0`], ix: idx);
3022	fg = gather(p: c->fs[`1`], ix: idx);
3023	bg = gather(p: c->bs[`1`], ix: idx);
3024	fb = gather(p: c->fs[`2`], ix: idx);
3025	bb = gather(p: c->bs[`2`], ix: idx);
3026	fa = gather(p: c->fs[`3`], ix: idx);
3027	ba = gather(p: c->bs[`3`], ix: idx);
3028	}
3029
3030	*r = mad(f: t, m: fr, a: br);
3031	*g = mad(f: t, m: fg, a: bg);
3032	*b = mad(f: t, m: fb, a: bb);
3033	*a = mad(f: t, m: fa, a: ba);
3034	}
3035
3036	STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
3037	auto t = r;
3038	auto idx = trunc_(v: t * (c->stopCount-`1`));
3039	gradient_lookup(c, idx, t, r: &r, g: &g, b: &b, a: &a);
3040	}
3041
3042	STAGE(gradient, const SkRasterPipeline_GradientCtx* c) {
3043	auto t = r;
3044	U32 idx = `0`;
3045
3046	// N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
3047	for (size_t i = `1`; i < c->stopCount; i++) {
3048	idx += if_then_else(c: t >= c->ts[i], t: U32(`1`), e: U32(`0`));
3049	}
3050
3051	gradient_lookup(c, idx, t, r: &r, g: &g, b: &b, a: &a);
3052	}
3053
3054	STAGE(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
3055	auto t = r;
3056	r = mad(f: t, m: c->f[`0`], a: c->b[`0`]);
3057	g = mad(f: t, m: c->f[`1`], a: c->b[`1`]);
3058	b = mad(f: t, m: c->f[`2`], a: c->b[`2`]);
3059	a = mad(f: t, m: c->f[`3`], a: c->b[`3`]);
3060	}
3061
3062	STAGE(xy_to_unit_angle, NoCtx) {
3063	F X = r,
3064	Y = g;
3065	F xabs = abs_(v: X),
3066	yabs = abs_(v: Y);
3067
3068	F slope = min(a: xabs, b: yabs)/max(a: xabs, b: yabs);
3069	F s = slope * slope;
3070
3071	// Use a 7th degree polynomial to approximate atan.
3072	// This was generated using sollya.gforge.inria.fr.
3073	// A float optimized polynomial was generated using the following command.
3074	// P1 = fpminimax((1/(2Pi))atan(x),[\|1,3,5,7\|],[\|24...\|],[2^(-40),1],relative);
3075	F phi = slope
3076	* (`0.15912117063999176025390625f` + s
3077	* (-`5.185396969318389892578125e-2f` + s
3078	* (`2.476101927459239959716796875e-2f` + s
3079	* (-`7.0547382347285747528076171875e-3f`))));
3080
3081	phi = if_then_else(c: xabs < yabs, t: `1.0f`/`4.0f` - phi, e: phi);
3082	phi = if_then_else(c: X < `0.0f` , t: `1.0f`/`2.0f` - phi, e: phi);
3083	phi = if_then_else(c: Y < `0.0f` , t: `1.0f` - phi , e: phi);
3084	phi = if_then_else(c: phi != phi , t: `0` , e: phi); // Check for NaN.
3085	r = phi;
3086	}
3087
3088	STAGE(xy_to_radius, NoCtx) {
3089	F X2 = r * r,
3090	Y2 = g * g;
3091	r = sqrt_(v: X2 + Y2);
3092	}
3093
3094	// Please see https://skia.org/dev/design/conical for how our 2pt conical shader works.
3095
3096	STAGE(negate_x, NoCtx) { r = -r; }
3097
3098	STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) {
3099	F x = r, y = g, &t = r;
3100	t = x + sqrt_(v: ctx->fP0 - yy); // ctx->fP0 = r0 * r0*
3101	}
3102
3103	STAGE(xy_to_2pt_conical_focal_on_circle, NoCtx) {
3104	F x = r, y = g, &t = r;
3105	t = x + yy / x; // (x^2 + y^2) / x*
3106	}
3107
3108	STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) {
3109	F x = r, y = g, &t = r;
3110	t = sqrt_(v: xx + yy) - x * ctx->fP0; // ctx->fP0 = 1/r1
3111	}
3112
3113	STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) {
3114	F x = r, y = g, &t = r;
3115	t = sqrt_(v: xx - yy) - x * ctx->fP0; // ctx->fP0 = 1/r1
3116	}
3117
3118	STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) {
3119	F x = r, y = g, &t = r;
3120	t = -sqrt_(v: xx - yy) - x * ctx->fP0; // ctx->fP0 = 1/r1
3121	}
3122
3123	STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) {
3124	F& t = r;
3125	t = t + ctx->fP1; // ctx->fP1 = f
3126	}
3127
3128	STAGE(alter_2pt_conical_unswap, NoCtx) {
3129	F& t = r;
3130	t = `1` - t;
3131	}
3132
3133	STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) {
3134	F& t = r;
3135	auto is_degenerate = (t != t); // NaN
3136	t = if_then_else(c: is_degenerate, t: F(`0`), e: t);
3137	sk_unaligned_store(ptr: &c->fMask, val: cond_to_mask(cond: !is_degenerate));
3138	}
3139
3140	STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) {
3141	F& t = r;
3142	auto is_degenerate = (t <= `0`) \| (t != t);
3143	t = if_then_else(c: is_degenerate, t: F(`0`), e: t);
3144	sk_unaligned_store(ptr: &c->fMask, val: cond_to_mask(cond: !is_degenerate));
3145	}
3146
3147	STAGE(apply_vector_mask, const uint32_t* ctx) {
3148	const U32 mask = sk_unaligned_load<U32>(ptr: ctx);
3149	r = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: r) & mask);
3150	g = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: g) & mask);
3151	b = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: b) & mask);
3152	a = sk_bit_cast<F>(src: sk_bit_cast<U32>(src: a) & mask);
3153	}
3154
3155	SI void save_xy(F* r, F* g, SkRasterPipeline_SamplerCtx* c) {
3156	// Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
3157	// They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
3158	// surrounding (x,y) at (0.5,0.5) off-center.
3159	F fx = fract(v: *r + `0.5f`),
3160	fy = fract(v: *g + `0.5f`);
3161
3162	// Samplers will need to load x and fx, or y and fy.
3163	sk_unaligned_store(ptr: c->x, val: *r);
3164	sk_unaligned_store(ptr: c->y, val: *g);
3165	sk_unaligned_store(ptr: c->fx, val: fx);
3166	sk_unaligned_store(ptr: c->fy, val: fy);
3167	}
3168
3169	STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) {
3170	// Bilinear and bicubic filters are both separable, so we produce independent contributions
3171	// from x and y, multiplying them together here to get each pixel's total scale factor.
3172	auto scale = sk_unaligned_load<F>(ptr: c->scalex)
3173	* sk_unaligned_load<F>(ptr: c->scaley);
3174	dr = mad(f: scale, m: r, a: dr);
3175	dg = mad(f: scale, m: g, a: dg);
3176	db = mad(f: scale, m: b, a: db);
3177	da = mad(f: scale, m: a, a: da);
3178	}
3179
3180	// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
3181	// are combined in direct proportion to their area overlapping that logical query pixel.
3182	// At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
3183	// The y-axis is symmetric.
3184
3185	template <int kScale>
3186	SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
3187	x = sk_unaligned_load<F>(ptr: ctx->x) + (kScale `0.5f`);
3188	F fx = sk_unaligned_load<F>(ptr: ctx->fx);
3189
3190	F scalex;
3191	if (kScale == -`1`) { scalex = `1.0f` - fx; }
3192	if (kScale == +`1`) { scalex = fx; }
3193	sk_unaligned_store(ptr: ctx->scalex, val: scalex);
3194	}
3195	template <int kScale>
3196	SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
3197	y = sk_unaligned_load<F>(ptr: ctx->y) + (kScale `0.5f`);
3198	F fy = sk_unaligned_load<F>(ptr: ctx->fy);
3199
3200	F scaley;
3201	if (kScale == -`1`) { scaley = `1.0f` - fy; }
3202	if (kScale == +`1`) { scaley = fy; }
3203	sk_unaligned_store(ptr: ctx->scaley, val: scaley);
3204	}
3205
3206	STAGE(bilinear_setup, SkRasterPipeline_SamplerCtx* ctx) {
3207	save_xy(r: &r, g: &g, c: ctx);
3208	// Init for accumulate
3209	dr = dg = db = da = `0`;
3210	}
3211
3212	STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-`1`>(ctx, x: &r); }
3213	STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+`1`>(ctx, x: &r); }
3214	STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-`1`>(ctx, y: &g); }
3215	STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+`1`>(ctx, y: &g); }
3216
3217
// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
3220	//
3221	// This helper computes the total weight along one axis (our bicubic filter is separable), given one
3222	// column of the sampling matrix, and a fractional pixel offset. See SkCubicResampler for details.
3223
3224	SI F bicubic_wts(F t, float A, float B, float C, float D) {
3225	return mad(f: t, m: mad(f: t, m: mad(f: t, m: D, a: C), a: B), a: A);
3226	}
3227
3228	template <int kScale>
3229	SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
3230	x = sk_unaligned_load<F>(ptr: ctx->x) + (kScale `0.5f`);
3231
3232	F scalex;
3233	if (kScale == -`3`) { scalex = sk_unaligned_load<F>(ptr: ctx->wx[`0`]); }
3234	if (kScale == -`1`) { scalex = sk_unaligned_load<F>(ptr: ctx->wx[`1`]); }
3235	if (kScale == +`1`) { scalex = sk_unaligned_load<F>(ptr: ctx->wx[`2`]); }
3236	if (kScale == +`3`) { scalex = sk_unaligned_load<F>(ptr: ctx->wx[`3`]); }
3237	sk_unaligned_store(ptr: ctx->scalex, val: scalex);
3238	}
3239	template <int kScale>
3240	SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
3241	y = sk_unaligned_load<F>(ptr: ctx->y) + (kScale `0.5f`);
3242
3243	F scaley;
3244	if (kScale == -`3`) { scaley = sk_unaligned_load<F>(ptr: ctx->wy[`0`]); }
3245	if (kScale == -`1`) { scaley = sk_unaligned_load<F>(ptr: ctx->wy[`1`]); }
3246	if (kScale == +`1`) { scaley = sk_unaligned_load<F>(ptr: ctx->wy[`2`]); }
3247	if (kScale == +`3`) { scaley = sk_unaligned_load<F>(ptr: ctx->wy[`3`]); }
3248	sk_unaligned_store(ptr: ctx->scaley, val: scaley);
3249	}
3250
3251	STAGE(bicubic_setup, SkRasterPipeline_SamplerCtx* ctx) {
3252	save_xy(r: &r, g: &g, c: ctx);
3253
3254	const float* w = ctx->weights;
3255
3256	F fx = sk_unaligned_load<F>(ptr: ctx->fx);
3257	sk_unaligned_store(ptr: ctx->wx[`0`], val: bicubic_wts(t: fx, A: w[`0`], B: w[`4`], C: w[ `8`], D: w[`12`]));
3258	sk_unaligned_store(ptr: ctx->wx[`1`], val: bicubic_wts(t: fx, A: w[`1`], B: w[`5`], C: w[ `9`], D: w[`13`]));
3259	sk_unaligned_store(ptr: ctx->wx[`2`], val: bicubic_wts(t: fx, A: w[`2`], B: w[`6`], C: w[`10`], D: w[`14`]));
3260	sk_unaligned_store(ptr: ctx->wx[`3`], val: bicubic_wts(t: fx, A: w[`3`], B: w[`7`], C: w[`11`], D: w[`15`]));
3261
3262	F fy = sk_unaligned_load<F>(ptr: ctx->fy);
3263	sk_unaligned_store(ptr: ctx->wy[`0`], val: bicubic_wts(t: fy, A: w[`0`], B: w[`4`], C: w[ `8`], D: w[`12`]));
3264	sk_unaligned_store(ptr: ctx->wy[`1`], val: bicubic_wts(t: fy, A: w[`1`], B: w[`5`], C: w[ `9`], D: w[`13`]));
3265	sk_unaligned_store(ptr: ctx->wy[`2`], val: bicubic_wts(t: fy, A: w[`2`], B: w[`6`], C: w[`10`], D: w[`14`]));
3266	sk_unaligned_store(ptr: ctx->wy[`3`], val: bicubic_wts(t: fy, A: w[`3`], B: w[`7`], C: w[`11`], D: w[`15`]));
3267
3268	// Init for accumulate
3269	dr = dg = db = da = `0`;
3270	}
3271
3272	STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-`3`>(ctx, x: &r); }
3273	STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-`1`>(ctx, x: &r); }
3274	STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+`1`>(ctx, x: &r); }
3275	STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+`3`>(ctx, x: &r); }
3276
3277	STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-`3`>(ctx, y: &g); }
3278	STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-`1`>(ctx, y: &g); }
3279	STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+`1`>(ctx, y: &g); }
3280	STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+`3`>(ctx, y: &g); }
3281
3282	STAGE(mipmap_linear_init, SkRasterPipeline_MipmapCtx* ctx) {
3283	sk_unaligned_store(ptr: ctx->x, val: r);
3284	sk_unaligned_store(ptr: ctx->y, val: g);
3285	}
3286
3287	STAGE(mipmap_linear_update, SkRasterPipeline_MipmapCtx* ctx) {
3288	sk_unaligned_store(ptr: ctx->r, val: r);
3289	sk_unaligned_store(ptr: ctx->g, val: g);
3290	sk_unaligned_store(ptr: ctx->b, val: b);
3291	sk_unaligned_store(ptr: ctx->a, val: a);
3292
3293	r = sk_unaligned_load<F>(ptr: ctx->x) * ctx->scaleX;
3294	g = sk_unaligned_load<F>(ptr: ctx->y) * ctx->scaleY;
3295	}
3296
3297	STAGE(mipmap_linear_finish, SkRasterPipeline_MipmapCtx* ctx) {
3298	r = lerp(from: sk_unaligned_load<F>(ptr: ctx->r), to: r, t: ctx->lowerWeight);
3299	g = lerp(from: sk_unaligned_load<F>(ptr: ctx->g), to: g, t: ctx->lowerWeight);
3300	b = lerp(from: sk_unaligned_load<F>(ptr: ctx->b), to: b, t: ctx->lowerWeight);
3301	a = lerp(from: sk_unaligned_load<F>(ptr: ctx->a), to: a, t: ctx->lowerWeight);
3302	}
3303
3304	STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
3305	store4(ptr: c->rgba,tail: `0`, r,g,b,a);
3306	c->fn(c, tail ? tail : N);
3307	load4(ptr: c->read_from,tail: `0`, r: &r,g: &g,b: &b,a: &a);
3308	}
3309
3310	STAGE_TAIL(set_base_pointer, std::byte* p) {
3311	base = p;
3312	}
3313
// Control-flow stages used by SkSL keep their state in the common registers:
//   r: condition mask
//   g: loop mask
//   b: return mask
//   a: execution mask (intersection of all three masks)
// Any stage that changes r, g or b must call update_execution_mask() afterwards.
#define execution_mask()        sk_bit_cast<I32>(a)
#define update_execution_mask() a = sk_bit_cast<F>(sk_bit_cast<I32>(r) & \
                                                   sk_bit_cast<I32>(g) & \
                                                   sk_bit_cast<I32>(b))
3324
3325	STAGE_TAIL(init_lane_masks, NoCtx) {
3326	uint32_t iota[] = {`0`,`1`,`2`,`3`,`4`,`5`,`6`,`7`};
3327	I32 mask = tail ? cond_to_mask(cond: sk_unaligned_load<U32>(ptr: iota) < tail) : I32(~`0`);
3328	r = g = b = a = sk_bit_cast<F>(src: mask);
3329	}
3330
3331	STAGE_TAIL(store_device_xy01, F* dst) {
3332	// This is very similar to `seed_shader + store_src`, but b/a are backwards.
3333	// (sk_FragCoord actually puts w=1 in the w slot.)
3334	static constexpr float iota[] = {
3335	`0.5f`, `1.5f`, `2.5f`, `3.5f`, `4.5f`, `5.5f`, `6.5f`, `7.5f`,
3336	`8.5f`, `9.5f`,`10.5f`,`11.5f`,`12.5f`,`13.5f`,`14.5f`,`15.5f`,
3337	};
3338	dst[`0`] = cast(v: dx) + sk_unaligned_load<F>(ptr: iota);
3339	dst[`1`] = cast(v: dy) + `0.5f`;
3340	dst[`2`] = `0.0f`;
3341	dst[`3`] = `1.0f`;
3342	}
3343
3344	STAGE_TAIL(exchange_src, F* rgba) {
3345	// Swaps r,g,b,a registers with the values at `rgba`.
3346	F temp[`4`] = {r, g, b, a};
3347	r = rgba[`0`];
3348	rgba[`0`] = temp[`0`];
3349	g = rgba[`1`];
3350	rgba[`1`] = temp[`1`];
3351	b = rgba[`2`];
3352	rgba[`2`] = temp[`2`];
3353	a = rgba[`3`];
3354	rgba[`3`] = temp[`3`];
3355	}
3356
3357	STAGE_TAIL(load_condition_mask, F* ctx) {
3358	r = sk_unaligned_load<F>(ptr: ctx);
3359	update_execution_mask();
3360	}
3361
3362	STAGE_TAIL(store_condition_mask, F* ctx) {
3363	sk_unaligned_store(ptr: ctx, val: r);
3364	}
3365
3366	STAGE_TAIL(merge_condition_mask, I32* ptr) {
3367	// Set the condition-mask to the intersection of two adjacent masks at the pointer.
3368	r = sk_bit_cast<F>(src: ptr[`0`] & ptr[`1`]);
3369	update_execution_mask();
3370	}
3371
3372	STAGE_TAIL(merge_inv_condition_mask, I32* ptr) {
3373	// Set the condition-mask to the intersection of the first mask and the inverse of the second.
3374	r = sk_bit_cast<F>(src: ptr[`0`] & ~ptr[`1`]);
3375	update_execution_mask();
3376	}
3377
3378	STAGE_TAIL(load_loop_mask, F* ctx) {
3379	g = sk_unaligned_load<F>(ptr: ctx);
3380	update_execution_mask();
3381	}
3382
3383	STAGE_TAIL(store_loop_mask, F* ctx) {
3384	sk_unaligned_store(ptr: ctx, val: g);
3385	}
3386
3387	STAGE_TAIL(mask_off_loop_mask, NoCtx) {
3388	// We encountered a break statement. If a lane was active, it should be masked off now, and stay
3389	// masked-off until the termination of the loop.
3390	g = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: g) & ~execution_mask());
3391	update_execution_mask();
3392	}
3393
3394	STAGE_TAIL(reenable_loop_mask, I32* ptr) {
3395	// Set the loop-mask to the union of the current loop-mask with the mask at the pointer.
3396	g = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: g) \| ptr[`0`]);
3397	update_execution_mask();
3398	}
3399
3400	STAGE_TAIL(merge_loop_mask, I32* ptr) {
3401	// Set the loop-mask to the intersection of the current loop-mask with the mask at the pointer.
3402	// (Note: this behavior subtly differs from merge_condition_mask!)
3403	g = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: g) & ptr[`0`]);
3404	update_execution_mask();
3405	}
3406
3407	STAGE_TAIL(continue_op, I32* continueMask) {
3408	// Set any currently-executing lanes in the continue-mask to true.
3409	*continueMask \|= execution_mask();
3410
3411	// Disable any currently-executing lanes from the loop mask. (Just like `mask_off_loop_mask`.)
3412	g = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: g) & ~execution_mask());
3413	update_execution_mask();
3414	}
3415
3416	STAGE_TAIL(case_op, SkRasterPipeline_CaseOpCtx* packed) {
3417	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3418
3419	// Check each lane to see if the case value matches the expectation.
3420	I32* actualValue = (I32*)(base + ctx.offset);
3421	I32 caseMatches = cond_to_mask(cond: *actualValue == ctx.expectedValue);
3422
3423	// In lanes where we found a match, enable the loop mask...
3424	g = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: g) \| caseMatches);
3425	update_execution_mask();
3426
3427	// ... and clear the default-case mask.
3428	I32* defaultMask = actualValue + `1`;
3429	*defaultMask &= ~caseMatches;
3430	}
3431
3432	STAGE_TAIL(load_return_mask, F* ctx) {
3433	b = sk_unaligned_load<F>(ptr: ctx);
3434	update_execution_mask();
3435	}
3436
3437	STAGE_TAIL(store_return_mask, F* ctx) {
3438	sk_unaligned_store(ptr: ctx, val: b);
3439	}
3440
3441	STAGE_TAIL(mask_off_return_mask, NoCtx) {
3442	// We encountered a return statement. If a lane was active, it should be masked off now, and
3443	// stay masked-off until the end of the function.
3444	b = sk_bit_cast<F>(src: sk_bit_cast<I32>(src: b) & ~execution_mask());
3445	update_execution_mask();
3446	}
3447
3448	STAGE_BRANCH(branch_if_all_lanes_active, SkRasterPipeline_BranchCtx* ctx) {
3449	if (tail) {
3450	uint32_t iota[] = {`0`,`1`,`2`,`3`,`4`,`5`,`6`,`7`};
3451	I32 tailLanes = cond_to_mask(cond: tail <= sk_unaligned_load<U32>(ptr: iota));
3452	return all(execution_mask() \| tailLanes) ? ctx->offset : `1`;
3453	} else {
3454	return all(execution_mask()) ? ctx->offset : `1`;
3455	}
3456	}
3457
3458	STAGE_BRANCH(branch_if_any_lanes_active, SkRasterPipeline_BranchCtx* ctx) {
3459	return any(execution_mask()) ? ctx->offset : `1`;
3460	}
3461
3462	STAGE_BRANCH(branch_if_no_lanes_active, SkRasterPipeline_BranchCtx* ctx) {
3463	return any(execution_mask()) ? `1` : ctx->offset;
3464	}
3465
3466	STAGE_BRANCH(jump, SkRasterPipeline_BranchCtx* ctx) {
3467	return ctx->offset;
3468	}
3469
3470	STAGE_BRANCH(branch_if_no_active_lanes_eq, SkRasterPipeline_BranchIfEqualCtx* ctx) {
3471	// Compare each lane against the expected value...
3472	I32 match = cond_to_mask(cond: (I32)ctx->ptr == ctx->value);
3473	// ... but mask off lanes that aren't executing.
3474	match &= execution_mask();
3475	// If any lanes matched, don't take the branch.
3476	return any(c: match) ? `1` : ctx->offset;
3477	}
3478
3479	STAGE_TAIL(trace_line, SkRasterPipeline_TraceLineCtx* ctx) {
3480	I32* traceMask = (I32*)ctx->traceMask;
3481	if (any(execution_mask() & *traceMask)) {
3482	ctx->traceHook->line(lineNum: ctx->lineNumber);
3483	}
3484	}
3485
3486	STAGE_TAIL(trace_enter, SkRasterPipeline_TraceFuncCtx* ctx) {
3487	I32* traceMask = (I32*)ctx->traceMask;
3488	if (any(execution_mask() & *traceMask)) {
3489	ctx->traceHook->enter(fnIdx: ctx->funcIdx);
3490	}
3491	}
3492
3493	STAGE_TAIL(trace_exit, SkRasterPipeline_TraceFuncCtx* ctx) {
3494	I32* traceMask = (I32*)ctx->traceMask;
3495	if (any(execution_mask() & *traceMask)) {
3496	ctx->traceHook->exit(fnIdx: ctx->funcIdx);
3497	}
3498	}
3499
3500	STAGE_TAIL(trace_scope, SkRasterPipeline_TraceScopeCtx* ctx) {
3501	// Note that trace_scope intentionally does not incorporate the execution mask. Otherwise, the
3502	// scopes would become unbalanced if the execution mask changed in the middle of a block. The
3503	// caller is responsible for providing a combined trace- and execution-mask.
3504	I32* traceMask = (I32*)ctx->traceMask;
3505	if (any(c: *traceMask)) {
3506	ctx->traceHook->scope(delta: ctx->delta);
3507	}
3508	}
3509
3510	STAGE_TAIL(trace_var, SkRasterPipeline_TraceVarCtx* ctx) {
3511	I32* traceMask = (I32*)ctx->traceMask;
3512	I32 mask = execution_mask() & *traceMask;
3513	if (any(c: mask)) {
3514	for (size_t lane = `0`; lane < N; ++lane) {
3515	if (select_lane(data: mask, lane)) {
3516	I32* data = (I32*)ctx->data;
3517	int slotIdx = ctx->slotIdx, numSlots = ctx->numSlots;
3518	if (ctx->indirectOffset) {
3519	// If this was an indirect store, apply the indirect-offset to the data pointer.
3520	uint32_t indirectOffset = select_lane(data: (U32)ctx->indirectOffset, lane);
3521	indirectOffset = std::min<uint32_t>(a: indirectOffset, b: ctx->indirectLimit);
3522	data += indirectOffset;
3523	slotIdx += indirectOffset;
3524	}
3525	while (numSlots--) {
3526	ctx->traceHook->var(slot: slotIdx, val: select_lane(data: *data, lane));
3527	++slotIdx;
3528	++data;
3529	}
3530	break;
3531	}
3532	}
3533	}
3534	}
3535
3536	STAGE_TAIL(copy_uniform, SkRasterPipeline_UniformCtx* ctx) {
3537	const float* src = ctx->src;
3538	F* dst = (F*)ctx->dst;
3539	dst[`0`] = src[`0`];
3540	}
3541	STAGE_TAIL(copy_2_uniforms, SkRasterPipeline_UniformCtx* ctx) {
3542	const float* src = ctx->src;
3543	F* dst = (F*)ctx->dst;
3544	dst[`0`] = src[`0`];
3545	dst[`1`] = src[`1`];
3546	}
3547	STAGE_TAIL(copy_3_uniforms, SkRasterPipeline_UniformCtx* ctx) {
3548	const float* src = ctx->src;
3549	F* dst = (F*)ctx->dst;
3550	dst[`0`] = src[`0`];
3551	dst[`1`] = src[`1`];
3552	dst[`2`] = src[`2`];
3553	}
3554	STAGE_TAIL(copy_4_uniforms, SkRasterPipeline_UniformCtx* ctx) {
3555	const float* src = ctx->src;
3556	F* dst = (F*)ctx->dst;
3557	dst[`0`] = src[`0`];
3558	dst[`1`] = src[`1`];
3559	dst[`2`] = src[`2`];
3560	dst[`3`] = src[`3`];
3561	}
3562
3563	STAGE_TAIL(copy_constant, SkRasterPipeline_ConstantCtx* packed) {
3564	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3565	F* dst = (F*)(base + ctx.dst);
3566	F value = ctx.value;
3567	dst[`0`] = value;
3568	}
3569	STAGE_TAIL(splat_2_constants, SkRasterPipeline_ConstantCtx* packed) {
3570	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3571	F* dst = (F*)(base + ctx.dst);
3572	F value = ctx.value;
3573	dst[`0`] = dst[`1`] = value;
3574	}
3575	STAGE_TAIL(splat_3_constants, SkRasterPipeline_ConstantCtx* packed) {
3576	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3577	F* dst = (F*)(base + ctx.dst);
3578	F value = ctx.value;
3579	dst[`0`] = dst[`1`] = dst[`2`] = value;
3580	}
3581	STAGE_TAIL(splat_4_constants, SkRasterPipeline_ConstantCtx* packed) {
3582	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3583	F* dst = (F*)(base + ctx.dst);
3584	F value = ctx.value;
3585	dst[`0`] = dst[`1`] = dst[`2`] = dst[`3`] = value;
3586	}
3587
3588	template <int NumSlots>
3589	SI void copy_n_slots_unmasked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
3590	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3591	F* dst = (F*)(base + ctx.dst);
3592	F* src = (F*)(base + ctx.src);
3593	// We don't even bother masking off the tail; we're filling slots, not the destination surface.
3594	memcpy(dest: dst, src: src, n: sizeof(F) * NumSlots);
3595	}
3596
3597	STAGE_TAIL(copy_slot_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
3598	copy_n_slots_unmasked_fn<`1`>(packed, base);
3599	}
3600	STAGE_TAIL(copy_2_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
3601	copy_n_slots_unmasked_fn<`2`>(packed, base);
3602	}
3603	STAGE_TAIL(copy_3_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
3604	copy_n_slots_unmasked_fn<`3`>(packed, base);
3605	}
3606	STAGE_TAIL(copy_4_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
3607	copy_n_slots_unmasked_fn<`4`>(packed, base);
3608	}
3609
3610	template <int NumSlots>
3611	SI void copy_n_immutable_unmasked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
3612	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3613
3614	// Load the scalar values.
3615	float* src = (float*)(base + ctx.src);
3616	float values[NumSlots];
3617	SK_UNROLL for (int index = `0`; index < NumSlots; ++index) {
3618	values[index] = src[index];
3619	}
3620	// Broadcast the scalars into the destination.
3621	F* dst = (F*)(base + ctx.dst);
3622	SK_UNROLL for (int index = `0`; index < NumSlots; ++index) {
3623	dst[index] = values[index];
3624	}
3625	}
3626
3627	STAGE_TAIL(copy_immutable_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
3628	copy_n_immutable_unmasked_fn<`1`>(packed, base);
3629	}
3630	STAGE_TAIL(copy_2_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
3631	copy_n_immutable_unmasked_fn<`2`>(packed, base);
3632	}
3633	STAGE_TAIL(copy_3_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
3634	copy_n_immutable_unmasked_fn<`3`>(packed, base);
3635	}
3636	STAGE_TAIL(copy_4_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
3637	copy_n_immutable_unmasked_fn<`4`>(packed, base);
3638	}
3639
3640	template <int NumSlots>
3641	SI void copy_n_slots_masked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base, I32 mask) {
3642	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3643	F* dst = (F*)(base + ctx.dst);
3644	F* src = (F*)(base + ctx.src);
3645	SK_UNROLL for (int count = `0`; count < NumSlots; ++count) {
3646	dst = if_then_else(c: mask, t: src, e: *dst);
3647	dst += `1`;
3648	src += `1`;
3649	}
3650	}
3651
3652	STAGE_TAIL(copy_slot_masked, SkRasterPipeline_BinaryOpCtx* packed) {
3653	copy_n_slots_masked_fn<`1`>(packed, base, execution_mask());
3654	}
3655	STAGE_TAIL(copy_2_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
3656	copy_n_slots_masked_fn<`2`>(packed, base, execution_mask());
3657	}
3658	STAGE_TAIL(copy_3_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
3659	copy_n_slots_masked_fn<`3`>(packed, base, execution_mask());
3660	}
3661	STAGE_TAIL(copy_4_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
3662	copy_n_slots_masked_fn<`4`>(packed, base, execution_mask());
3663	}
3664
3665	template <int LoopCount, typename OffsetType>
3666	SI void shuffle_fn(std::byte* ptr, OffsetType* offsets, int numSlots) {
3667	F scratch[`16`];
3668	SK_UNROLL for (int count = `0`; count < LoopCount; ++count) {
3669	scratch[count] = (F)(ptr + offsets[count]);
3670	}
3671	// Surprisingly, this switch generates significantly better code than a memcpy (on x86-64) when
3672	// the number of slots is unknown at compile time, and generates roughly identical code when the
3673	// number of slots is hardcoded. Using a switch allows `scratch` to live in ymm0-ymm15 instead
3674	// of being written out to the stack and then read back in. Also, the intrinsic memcpy assumes
3675	// that `numSlots` could be arbitrarily large, and so it emits more code than we need.
3676	F* dst = (F*)ptr;
3677	switch (numSlots) {
3678	case `16`: dst[`15`] = scratch[`15`]; [[fallthrough]];
3679	case `15`: dst[`14`] = scratch[`14`]; [[fallthrough]];
3680	case `14`: dst[`13`] = scratch[`13`]; [[fallthrough]];
3681	case `13`: dst[`12`] = scratch[`12`]; [[fallthrough]];
3682	case `12`: dst[`11`] = scratch[`11`]; [[fallthrough]];
3683	case `11`: dst[`10`] = scratch[`10`]; [[fallthrough]];
3684	case `10`: dst[ `9`] = scratch[ `9`]; [[fallthrough]];
3685	case `9`: dst[ `8`] = scratch[ `8`]; [[fallthrough]];
3686	case `8`: dst[ `7`] = scratch[ `7`]; [[fallthrough]];
3687	case `7`: dst[ `6`] = scratch[ `6`]; [[fallthrough]];
3688	case `6`: dst[ `5`] = scratch[ `5`]; [[fallthrough]];
3689	case `5`: dst[ `4`] = scratch[ `4`]; [[fallthrough]];
3690	case `4`: dst[ `3`] = scratch[ `3`]; [[fallthrough]];
3691	case `3`: dst[ `2`] = scratch[ `2`]; [[fallthrough]];
3692	case `2`: dst[ `1`] = scratch[ `1`]; [[fallthrough]];
3693	case `1`: dst[ `0`] = scratch[ `0`];
3694	}
3695	}
3696
3697	template <int N>
3698	SI void small_swizzle_fn(SkRasterPipeline_SwizzleCtx* packed, std::byte* base) {
3699	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
3700	shuffle_fn<N>(base + ctx.dst, ctx.offsets, N);
3701	}
3702
3703	STAGE_TAIL(swizzle_1, SkRasterPipeline_SwizzleCtx* packed) {
3704	small_swizzle_fn<`1`>(packed, base);
3705	}
3706	STAGE_TAIL(swizzle_2, SkRasterPipeline_SwizzleCtx* packed) {
3707	small_swizzle_fn<`2`>(packed, base);
3708	}
3709	STAGE_TAIL(swizzle_3, SkRasterPipeline_SwizzleCtx* packed) {
3710	small_swizzle_fn<`3`>(packed, base);
3711	}
3712	STAGE_TAIL(swizzle_4, SkRasterPipeline_SwizzleCtx* packed) {
3713	small_swizzle_fn<`4`>(packed, base);
3714	}
3715	STAGE_TAIL(shuffle, SkRasterPipeline_ShuffleCtx* ctx) {
3716	shuffle_fn<`16`>(ptr: (std::byte*)ctx->ptr, offsets: ctx->offsets, numSlots: ctx->count);
3717	}
3718
3719	template <int NumSlots>
3720	SI void swizzle_copy_masked_fn(F* dst, const F* src, uint16_t* offsets, I32 mask) {
3721	std::byte* dstB = (std::byte*)dst;
3722	SK_UNROLL for (int count = `0`; count < NumSlots; ++count) {
3723	F* dstS = (F)(dstB + offsets);
3724	dstS = if_then_else(c: mask, t: src, e: *dstS);
3725	offsets += `1`;
3726	src += `1`;
3727	}
3728	}
3729
3730	STAGE_TAIL(swizzle_copy_slot_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
3731	swizzle_copy_masked_fn<`1`>(dst: (F)ctx->dst, src: (F)ctx->src, offsets: ctx->offsets, execution_mask());
3732	}
3733	STAGE_TAIL(swizzle_copy_2_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
3734	swizzle_copy_masked_fn<`2`>(dst: (F)ctx->dst, src: (F)ctx->src, offsets: ctx->offsets, execution_mask());
3735	}
3736	STAGE_TAIL(swizzle_copy_3_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
3737	swizzle_copy_masked_fn<`3`>(dst: (F)ctx->dst, src: (F)ctx->src, offsets: ctx->offsets, execution_mask());
3738	}
3739	STAGE_TAIL(swizzle_copy_4_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
3740	swizzle_copy_masked_fn<`4`>(dst: (F)ctx->dst, src: (F)ctx->src, offsets: ctx->offsets, execution_mask());
3741	}
3742
3743	STAGE_TAIL(copy_from_indirect_unmasked, SkRasterPipeline_CopyIndirectCtx* ctx) {
3744	// Clamp the indirect offsets to stay within the limit.
3745	U32 offsets = (U32)ctx->indirectOffset;
3746	offsets = min(a: offsets, b: ctx->indirectLimit);
3747
3748	// Scale up the offsets to account for the N lanes per value.
3749	offsets *= N;
3750
3751	// Adjust the offsets forward so that they fetch from the correct lane.
3752	static constexpr uint32_t iota[] = {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`, `15`};
3753	offsets += sk_unaligned_load<I32>(ptr: iota);
3754
3755	// Use gather to perform indirect lookups; write the results into `dst`.
3756	const float* src = ctx->src;
3757	F* dst = (F*)ctx->dst;
3758	F* end = dst + ctx->slots;
3759	do {
3760	*dst = gather(p: src, ix: offsets);
3761	dst += `1`;
3762	src += N;
3763	} while (dst != end);
3764	}
3765
3766	STAGE_TAIL(copy_from_indirect_uniform_unmasked, SkRasterPipeline_CopyIndirectCtx* ctx) {
3767	// Clamp the indirect offsets to stay within the limit.
3768	U32 offsets = (U32)ctx->indirectOffset;
3769	offsets = min(a: offsets, b: ctx->indirectLimit);
3770
3771	// Use gather to perform indirect lookups; write the results into `dst`.
3772	const float* src = ctx->src;
3773	F* dst = (F*)ctx->dst;
3774	F* end = dst + ctx->slots;
3775	do {
3776	*dst = gather(p: src, ix: offsets);
3777	dst += `1`;
3778	src += `1`;
3779	} while (dst != end);
3780	}
3781
3782	STAGE_TAIL(copy_to_indirect_masked, SkRasterPipeline_CopyIndirectCtx* ctx) {
3783	// Clamp the indirect offsets to stay within the limit.
3784	U32 offsets = (U32)ctx->indirectOffset;
3785	offsets = min(a: offsets, b: ctx->indirectLimit);
3786
3787	// Scale up the offsets to account for the N lanes per value.
3788	offsets *= N;
3789
3790	// Adjust the offsets forward so that they store into the correct lane.
3791	static constexpr uint32_t iota[] = {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`, `15`};
3792	offsets += sk_unaligned_load<I32>(ptr: iota);
3793
3794	// Perform indirect, masked writes into `dst`.
3795	const F* src = (F*)ctx->src;
3796	const F* end = src + ctx->slots;
3797	float* dst = ctx->dst;
3798	I32 mask = execution_mask();
3799	do {
3800	scatter_masked(src: *src, dst, ix: offsets, mask);
3801	dst += N;
3802	src += `1`;
3803	} while (src != end);
3804	}
3805
3806	STAGE_TAIL(swizzle_copy_to_indirect_masked, SkRasterPipeline_SwizzleCopyIndirectCtx* ctx) {
3807	// Clamp the indirect offsets to stay within the limit.
3808	U32 offsets = (U32)ctx->indirectOffset;
3809	offsets = min(a: offsets, b: ctx->indirectLimit);
3810
3811	// Scale up the offsets to account for the N lanes per value.
3812	offsets *= N;
3813
3814	// Adjust the offsets forward so that they store into the correct lane.
3815	static constexpr uint32_t iota[] = {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`, `15`};
3816	offsets += sk_unaligned_load<I32>(ptr: iota);
3817
3818	// Perform indirect, masked, swizzled writes into `dst`.
3819	const F* src = (F*)ctx->src;
3820	const F* end = src + ctx->slots;
3821	std::byte* dstB = (std::byte*)ctx->dst;
3822	const uint16_t* swizzle = ctx->offsets;
3823	I32 mask = execution_mask();
3824	do {
3825	float* dst = (float)(dstB + swizzle);
3826	scatter_masked(src: *src, dst, ix: offsets, mask);
3827	swizzle += `1`;
3828	src += `1`;
3829	} while (src != end);
3830	}
3831
3832	// Unary operations take a single input, and overwrite it with their output.
3833	// Unlike binary or ternary operations, we provide variations of 1-4 slots, but don't provide
3834	// an arbitrary-width "n-slot" variation; the Builder can chain together longer sequences manually.
3835	template <typename T, void (ApplyFn)(T)>
3836	SI void apply_adjacent_unary(T* dst, T* end) {
3837	do {
3838	ApplyFn(dst);
3839	dst += `1`;
3840	} while (dst != end);
3841	}
3842
3843	#if defined(JUMPER_IS_SCALAR)
3844	template <typename T>
3845	SI void cast_to_float_from_fn(T* dst) {
3846	dst = sk_bit_cast<T>((F)dst);
3847	}
3848	SI void cast_to_int_from_fn(F* dst) {
3849	dst = sk_bit_cast<F>((I32)dst);
3850	}
3851	SI void cast_to_uint_from_fn(F* dst) {
3852	dst = sk_bit_cast<F>((U32)dst);
3853	}
3854	#else
3855	template <typename T>
3856	SI void cast_to_float_from_fn(T* dst) {
3857	dst = sk_bit_cast<T>(__builtin_convertvector(dst, F));
3858	}
3859	SI void cast_to_int_from_fn(F* dst) {
3860	dst = sk_bit_cast<F>(src: __builtin_convertvector(dst, I32));
3861	}
3862	SI void cast_to_uint_from_fn(F* dst) {
3863	dst = sk_bit_cast<F>(src: __builtin_convertvector(dst, U32));
3864	}
3865	#endif
3866
3867	SI void abs_fn(I32* dst) {
3868	dst = abs_(v: dst);
3869	}
3870
3871	SI void floor_fn(F* dst) {
3872	dst = floor_(v: dst);
3873	}
3874
3875	SI void ceil_fn(F* dst) {
3876	dst = ceil_(v: dst);
3877	}
3878
3879	SI void invsqrt_fn(F* dst) {
3880	dst = rsqrt(v: dst);
3881	}
3882
3883	#define DECLARE_UNARY_FLOAT(name) \
3884	STAGE_TAIL(name##_float, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 1); } \
3885	STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 2); } \
3886	STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 3); } \
3887	STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 4); }
3888
3889	#define DECLARE_UNARY_INT(name) \
3890	STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 1); } \
3891	STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 2); } \
3892	STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 3); } \
3893	STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 4); }
3894
3895	#define DECLARE_UNARY_UINT(name) \
3896	STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 1); } \
3897	STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 2); } \
3898	STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 3); } \
3899	STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 4); }
3900
3901	DECLARE_UNARY_INT(cast_to_float_from) DECLARE_UNARY_UINT(cast_to_float_from)
3902	DECLARE_UNARY_FLOAT(cast_to_int_from)
3903	DECLARE_UNARY_FLOAT(cast_to_uint_from)
3904	DECLARE_UNARY_FLOAT(floor)
3905	DECLARE_UNARY_FLOAT(ceil)
3906	DECLARE_UNARY_FLOAT(invsqrt)
3907	DECLARE_UNARY_INT(abs)
3908
3909	#undef DECLARE_UNARY_FLOAT
3910	#undef DECLARE_UNARY_INT
3911	#undef DECLARE_UNARY_UINT
3912
3913	// For complex unary ops, we only provide a 1-slot version to reduce code bloat.
3914	STAGE_TAIL(sin_float, F* dst) { dst = sin_(x: dst); }
3915	STAGE_TAIL(cos_float, F* dst) { dst = cos_(x: dst); }
3916	STAGE_TAIL(tan_float, F* dst) { dst = tan_(x: dst); }
3917	STAGE_TAIL(asin_float, F* dst) { dst = asin_(x: dst); }
3918	STAGE_TAIL(acos_float, F* dst) { dst = acos_(x: dst); }
3919	STAGE_TAIL(atan_float, F* dst) { dst = atan_(x: dst); }
3920	STAGE_TAIL(sqrt_float, F* dst) { dst = sqrt_(v: dst); }
3921	STAGE_TAIL(exp_float, F* dst) { dst = approx_exp(x: dst); }
3922	STAGE_TAIL(exp2_float, F* dst) { dst = approx_pow2(x: dst); }
3923	STAGE_TAIL(log_float, F* dst) { dst = approx_log(x: dst); }
3924	STAGE_TAIL(log2_float, F* dst) { dst = approx_log2(x: dst); }
3925
3926	STAGE_TAIL(inverse_mat2, F* dst) {
3927	F a00 = dst[`0`], a01 = dst[`1`],
3928	a10 = dst[`2`], a11 = dst[`3`];
3929	F det = mad(f: a00, m: a11, a: -a01 * a10),
3930	invdet = rcp_precise(v: det);
3931	dst[`0`] = invdet * a11;
3932	dst[`1`] = -invdet * a01;
3933	dst[`2`] = -invdet * a10;
3934	dst[`3`] = invdet * a00;
3935	}
3936
3937	STAGE_TAIL(inverse_mat3, F* dst) {
3938	F a00 = dst[`0`], a01 = dst[`1`], a02 = dst[`2`],
3939	a10 = dst[`3`], a11 = dst[`4`], a12 = dst[`5`],
3940	a20 = dst[`6`], a21 = dst[`7`], a22 = dst[`8`];
3941	F b01 = mad(f: a22, m: a11, a: -a12 * a21),
3942	b11 = mad(f: a12, m: a20, a: -a22 * a10),
3943	b21 = mad(f: a21, m: a10, a: -a11 * a20);
3944	F det = mad(f: a00, m: b01, a: mad(f: a01, m: b11, a: a02 * b21)),
3945	invdet = rcp_precise(v: det);
3946	dst[`0`] = invdet * b01;
3947	dst[`1`] = invdet * mad(f: a02, m: a21, a: -a22 * a01);
3948	dst[`2`] = invdet * mad(f: a12, m: a01, a: -a02 * a11);
3949	dst[`3`] = invdet * b11;
3950	dst[`4`] = invdet * mad(f: a22, m: a00, a: -a02 * a20);
3951	dst[`5`] = invdet * mad(f: a02, m: a10, a: -a12 * a00);
3952	dst[`6`] = invdet * b21;
3953	dst[`7`] = invdet * mad(f: a01, m: a20, a: -a21 * a00);
3954	dst[`8`] = invdet * mad(f: a11, m: a00, a: -a01 * a10);
3955	}
3956
3957	STAGE_TAIL(inverse_mat4, F* dst) {
3958	F a00 = dst[`0`], a01 = dst[`1`], a02 = dst[`2`], a03 = dst[`3`],
3959	a10 = dst[`4`], a11 = dst[`5`], a12 = dst[`6`], a13 = dst[`7`],
3960	a20 = dst[`8`], a21 = dst[`9`], a22 = dst[`10`], a23 = dst[`11`],
3961	a30 = dst[`12`], a31 = dst[`13`], a32 = dst[`14`], a33 = dst[`15`];
3962	F b00 = mad(f: a00, m: a11, a: -a01 * a10),
3963	b01 = mad(f: a00, m: a12, a: -a02 * a10),
3964	b02 = mad(f: a00, m: a13, a: -a03 * a10),
3965	b03 = mad(f: a01, m: a12, a: -a02 * a11),
3966	b04 = mad(f: a01, m: a13, a: -a03 * a11),
3967	b05 = mad(f: a02, m: a13, a: -a03 * a12),
3968	b06 = mad(f: a20, m: a31, a: -a21 * a30),
3969	b07 = mad(f: a20, m: a32, a: -a22 * a30),
3970	b08 = mad(f: a20, m: a33, a: -a23 * a30),
3971	b09 = mad(f: a21, m: a32, a: -a22 * a31),
3972	b10 = mad(f: a21, m: a33, a: -a23 * a31),
3973	b11 = mad(f: a22, m: a33, a: -a23 * a32),
3974	det = mad(f: b00, m: b11, a: b05 * b06) + mad(f: b02, m: b09, a: b03 * b08) - mad(f: b01, m: b10, a: b04 * b07),
3975	invdet = rcp_precise(v: det);
3976	b00 *= invdet;
3977	b01 *= invdet;
3978	b02 *= invdet;
3979	b03 *= invdet;
3980	b04 *= invdet;
3981	b05 *= invdet;
3982	b06 *= invdet;
3983	b07 *= invdet;
3984	b08 *= invdet;
3985	b09 *= invdet;
3986	b10 *= invdet;
3987	b11 *= invdet;
3988	dst[`0`] = mad(f: a11, m: b11, a: a13b09) - a12b10;
3989	dst[`1`] = a02b10 - mad(f: a01, m: b11, a: a03b09);
3990	dst[`2`] = mad(f: a31, m: b05, a: a33b03) - a32b04;
3991	dst[`3`] = a22b04 - mad(f: a21, m: b05, a: a23b03);
3992	dst[`4`] = a12b08 - mad(f: a10, m: b11, a: a13b07);
3993	dst[`5`] = mad(f: a00, m: b11, a: a03b07) - a02b08;
3994	dst[`6`] = a32b02 - mad(f: a30, m: b05, a: a33b01);
3995	dst[`7`] = mad(f: a20, m: b05, a: a23b01) - a22b02;
3996	dst[`8`] = mad(f: a10, m: b10, a: a13b06) - a11b08;
3997	dst[`9`] = a01b08 - mad(f: a00, m: b10, a: a03b06);
3998	dst[`10`] = mad(f: a30, m: b04, a: a33b00) - a31b02;
3999	dst[`11`] = a21b02 - mad(f: a20, m: b04, a: a23b00);
4000	dst[`12`] = a11b07 - mad(f: a10, m: b09, a: a12b06);
4001	dst[`13`] = mad(f: a00, m: b09, a: a02b06) - a01b07;
4002	dst[`14`] = a31b01 - mad(f: a30, m: b03, a: a32b00);
4003	dst[`15`] = mad(f: a20, m: b03, a: a22b00) - a21b01;
4004	}
4005
4006	// Binary operations take two adjacent inputs, and write their output in the first position.
4007	template <typename T, void (ApplyFn)(T, T*)>
4008	SI void apply_adjacent_binary(T* dst, T* src) {
4009	T* end = src;
4010	do {
4011	ApplyFn(dst, src);
4012	dst += `1`;
4013	src += `1`;
4014	} while (dst != end);
4015	}
4016
4017	template <typename T, void (ApplyFn)(T, T*)>
4018	SI void apply_adjacent_binary_packed(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
4019	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
4020	std::byte* dst = base + ctx.dst;
4021	std::byte* src = base + ctx.src;
4022	apply_adjacent_binary<T, ApplyFn>((T)dst, (T)src);
4023	}
4024
4025	template <int N, typename V, typename S, void (ApplyFn)(V, V*)>
4026	SI void apply_binary_immediate(SkRasterPipeline_ConstantCtx* packed, std::byte* base) {
4027	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
4028	V* dst = (V)(base + ctx.dst); // get a pointer to the destination*
4029	S scalar = sk_bit_cast<S>(ctx.value); // bit-pun the constant value as desired
4030	V src = scalar; // broadcast the constant value into a vector
4031	SK_UNROLL for (int index = `0`; index < N; ++index) {
4032	ApplyFn(dst, &src); // perform the operation
4033	dst += `1`;
4034	}
4035	}
4036
4037	template <typename T>
4038	SI void add_fn(T* dst, T* src) {
4039	dst += src;
4040	}
4041
4042	template <typename T>
4043	SI void sub_fn(T* dst, T* src) {
4044	dst -= src;
4045	}
4046
4047	template <typename T>
4048	SI void mul_fn(T* dst, T* src) {
4049	dst = *src;
4050	}
4051
4052	template <typename T>
4053	SI void div_fn(T* dst, T* src) {
4054	T divisor = *src;
4055	if constexpr (!std::is_same_v<T, F>) {
4056	// We will crash if we integer-divide against zero. Convert 0 to ~0 to avoid this.
4057	divisor \|= cond_to_mask(divisor == `0`);
4058	}
4059	*dst /= divisor;
4060	}
4061
4062	SI void bitwise_and_fn(I32* dst, I32* src) {
4063	dst &= src;
4064	}
4065
4066	SI void bitwise_or_fn(I32* dst, I32* src) {
4067	dst \|= src;
4068	}
4069
4070	SI void bitwise_xor_fn(I32* dst, I32* src) {
4071	dst ^= src;
4072	}
4073
4074	template <typename T>
4075	SI void max_fn(T* dst, T* src) {
4076	dst = max(dst, *src);
4077	}
4078
4079	template <typename T>
4080	SI void min_fn(T* dst, T* src) {
4081	dst = min(dst, *src);
4082	}
4083
4084	template <typename T>
4085	SI void cmplt_fn(T* dst, T* src) {
4086	static_assert(sizeof(T) == sizeof(I32));
4087	I32 result = cond_to_mask(dst < src);
4088	memcpy(dst, &result, sizeof(I32));
4089	}
4090
4091	template <typename T>
4092	SI void cmple_fn(T* dst, T* src) {
4093	static_assert(sizeof(T) == sizeof(I32));
4094	I32 result = cond_to_mask(dst <= src);
4095	memcpy(dst, &result, sizeof(I32));
4096	}
4097
4098	template <typename T>
4099	SI void cmpeq_fn(T* dst, T* src) {
4100	static_assert(sizeof(T) == sizeof(I32));
4101	I32 result = cond_to_mask(dst == src);
4102	memcpy(dst, &result, sizeof(I32));
4103	}
4104
4105	template <typename T>
4106	SI void cmpne_fn(T* dst, T* src) {
4107	static_assert(sizeof(T) == sizeof(I32));
4108	I32 result = cond_to_mask(dst != src);
4109	memcpy(dst, &result, sizeof(I32));
4110	}
4111
4112	SI void atan2_fn(F* dst, F* src) {
4113	dst = atan2_(y0: dst, x0: *src);
4114	}
4115
4116	SI void pow_fn(F* dst, F* src) {
4117	dst = approx_powf(x: dst, y: *src);
4118	}
4119
4120	SI void mod_fn(F* dst, F* src) {
4121	dst = dst - src floor_(v: dst / src);
4122	}
4123
4124	#define DECLARE_N_WAY_BINARY_FLOAT(name) \
4125	STAGE_TAIL(name##_n_floats, SkRasterPipeline_BinaryOpCtx* packed) { \
4126	apply_adjacent_binary_packed<F, &name##_fn>(packed, base); \
4127	}
4128
4129	#define DECLARE_BINARY_FLOAT(name) \
4130	STAGE_TAIL(name##_float, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 1); } \
4131	STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 2); } \
4132	STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 3); } \
4133	STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 4); } \
4134	DECLARE_N_WAY_BINARY_FLOAT(name)
4135
4136	#define DECLARE_N_WAY_BINARY_INT(name) \
4137	STAGE_TAIL(name##_n_ints, SkRasterPipeline_BinaryOpCtx* packed) { \
4138	apply_adjacent_binary_packed<I32, &name##_fn>(packed, base); \
4139	}
4140
4141	#define DECLARE_BINARY_INT(name) \
4142	STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 1); } \
4143	STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 2); } \
4144	STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 3); } \
4145	STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 4); } \
4146	DECLARE_N_WAY_BINARY_INT(name)
4147
4148	#define DECLARE_N_WAY_BINARY_UINT(name) \
4149	STAGE_TAIL(name##_n_uints, SkRasterPipeline_BinaryOpCtx* packed) { \
4150	apply_adjacent_binary_packed<U32, &name##_fn>(packed, base); \
4151	}
4152
4153	#define DECLARE_BINARY_UINT(name) \
4154	STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 1); } \
4155	STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 2); } \
4156	STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 3); } \
4157	STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 4); } \
4158	DECLARE_N_WAY_BINARY_UINT(name)
4159
4160	// Many ops reuse the int stages when performing uint arithmetic, since they're equivalent on a
4161	// two's-complement machine. (Even multiplication is equivalent in the lower 32 bits.)
4162	DECLARE_BINARY_FLOAT(add) DECLARE_BINARY_INT(add)
4163	DECLARE_BINARY_FLOAT(sub) DECLARE_BINARY_INT(sub)
4164	DECLARE_BINARY_FLOAT(mul) DECLARE_BINARY_INT(mul)
4165	DECLARE_BINARY_FLOAT(div) DECLARE_BINARY_INT(div) DECLARE_BINARY_UINT(div)
4166	DECLARE_BINARY_INT(bitwise_and)
4167	DECLARE_BINARY_INT(bitwise_or)
4168	DECLARE_BINARY_INT(bitwise_xor)
4169	DECLARE_BINARY_FLOAT(mod)
4170	DECLARE_BINARY_FLOAT(min) DECLARE_BINARY_INT(min) DECLARE_BINARY_UINT(min)
4171	DECLARE_BINARY_FLOAT(max) DECLARE_BINARY_INT(max) DECLARE_BINARY_UINT(max)
4172	DECLARE_BINARY_FLOAT(cmplt) DECLARE_BINARY_INT(cmplt) DECLARE_BINARY_UINT(cmplt)
4173	DECLARE_BINARY_FLOAT(cmple) DECLARE_BINARY_INT(cmple) DECLARE_BINARY_UINT(cmple)
4174	DECLARE_BINARY_FLOAT(cmpeq) DECLARE_BINARY_INT(cmpeq)
4175	DECLARE_BINARY_FLOAT(cmpne) DECLARE_BINARY_INT(cmpne)
4176
4177	// Sufficiently complex ops only provide an N-way version, to avoid code bloat from the dedicated
4178	// 1-4 slot versions.
4179	DECLARE_N_WAY_BINARY_FLOAT(atan2)
4180	DECLARE_N_WAY_BINARY_FLOAT(pow)
4181
// Some ops have an optimized version when the right-side is an immediate value.
// Each stage forwards its packed context to apply_binary_immediate; the first template argument
// matches the count in the stage name (1 for the single-value stages).
#define DECLARE_IMM_BINARY_FLOAT(name) \
    STAGE_TAIL(name##_imm_float, SkRasterPipeline_ConstantCtx* packed) { \
        apply_binary_immediate<1, F, float, &name##_fn>(packed, base); \
    }

#define DECLARE_IMM_BINARY_INT(name) \
    STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) { \
        apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
    }

// Same single-int stage as DECLARE_IMM_BINARY_INT, plus dedicated 2-, 3- and 4-int stages.
#define DECLARE_MULTI_IMM_BINARY_INT(name) \
    DECLARE_IMM_BINARY_INT(name) \
    STAGE_TAIL(name##_imm_2_ints, SkRasterPipeline_ConstantCtx* packed) { \
        apply_binary_immediate<2, I32, int32_t, &name##_fn>(packed, base); \
    } \
    STAGE_TAIL(name##_imm_3_ints, SkRasterPipeline_ConstantCtx* packed) { \
        apply_binary_immediate<3, I32, int32_t, &name##_fn>(packed, base); \
    } \
    STAGE_TAIL(name##_imm_4_ints, SkRasterPipeline_ConstantCtx* packed) { \
        apply_binary_immediate<4, I32, int32_t, &name##_fn>(packed, base); \
    }

#define DECLARE_IMM_BINARY_UINT(name) \
    STAGE_TAIL(name##_imm_uint, SkRasterPipeline_ConstantCtx* packed) { \
        apply_binary_immediate<1, U32, uint32_t, &name##_fn>(packed, base); \
    }
4208
4209	DECLARE_IMM_BINARY_FLOAT(add) DECLARE_IMM_BINARY_INT(add)
4210	DECLARE_IMM_BINARY_FLOAT(mul) DECLARE_IMM_BINARY_INT(mul)
4211	DECLARE_MULTI_IMM_BINARY_INT(bitwise_and)
4212	DECLARE_IMM_BINARY_FLOAT(max)
4213	DECLARE_IMM_BINARY_FLOAT(min)
4214	DECLARE_IMM_BINARY_INT(bitwise_xor)
4215	DECLARE_IMM_BINARY_FLOAT(cmplt) DECLARE_IMM_BINARY_INT(cmplt) DECLARE_IMM_BINARY_UINT(cmplt)
4216	DECLARE_IMM_BINARY_FLOAT(cmple) DECLARE_IMM_BINARY_INT(cmple) DECLARE_IMM_BINARY_UINT(cmple)
4217	DECLARE_IMM_BINARY_FLOAT(cmpeq) DECLARE_IMM_BINARY_INT(cmpeq)
4218	DECLARE_IMM_BINARY_FLOAT(cmpne) DECLARE_IMM_BINARY_INT(cmpne)
4219
// The binary-op helper macros are no longer needed past this point.
// Immediate-operand helpers:
#undef DECLARE_IMM_BINARY_FLOAT
#undef DECLARE_IMM_BINARY_INT
#undef DECLARE_MULTI_IMM_BINARY_INT
#undef DECLARE_IMM_BINARY_UINT
// 1-4 slot helpers:
#undef DECLARE_BINARY_FLOAT
#undef DECLARE_BINARY_INT
#undef DECLARE_BINARY_UINT
// N-way helpers:
#undef DECLARE_N_WAY_BINARY_FLOAT
#undef DECLARE_N_WAY_BINARY_INT
#undef DECLARE_N_WAY_BINARY_UINT
4230
4231	// Dots can be represented with multiply and add ops, but they are so foundational that it's worth
4232	// having dedicated ops.
4233	STAGE_TAIL(dot_2_floats, F* dst) {
4234	dst[`0`] = mad(f: dst[`0`], m: dst[`2`],
4235	a: dst[`1`] * dst[`3`]);
4236	}
4237
4238	STAGE_TAIL(dot_3_floats, F* dst) {
4239	dst[`0`] = mad(f: dst[`0`], m: dst[`3`],
4240	a: mad(f: dst[`1`], m: dst[`4`],
4241	a: dst[`2`] * dst[`5`]));
4242	}
4243
4244	STAGE_TAIL(dot_4_floats, F* dst) {
4245	dst[`0`] = mad(f: dst[`0`], m: dst[`4`],
4246	a: mad(f: dst[`1`], m: dst[`5`],
4247	a: mad(f: dst[`2`], m: dst[`6`],
4248	a: dst[`3`] * dst[`7`])));
4249	}
4250
4251	// MxM, VxM and MxV multiplication all use matrix_multiply. Vectors are treated like a matrix with a
4252	// single column or row.
4253	template <int N>
4254	SI void matrix_multiply(SkRasterPipeline_MatrixMultiplyCtx* packed, std::byte* base) {
4255	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
4256
4257	int outColumns = ctx.rightColumns,
4258	outRows = ctx.leftRows;
4259
4260	SkASSERT(outColumns >= `1`);
4261	SkASSERT(outRows >= `1`);
4262	SkASSERT(outColumns <= `4`);
4263	SkASSERT(outRows <= `4`);
4264
4265	SkASSERT(ctx.leftColumns == ctx.rightRows);
4266	SkASSERT(N == ctx.leftColumns); // N should match the result width
4267
4268	#if !defined(JUMPER_IS_SCALAR)
4269	// This prevents Clang from generating early-out checks for zero-sized matrices.
4270	__builtin_assume(outColumns >= `1`);
4271	__builtin_assume(outRows >= `1`);
4272	__builtin_assume(outColumns <= `4`);
4273	__builtin_assume(outRows <= `4`);
4274	#endif
4275
4276	// Get pointers to the adjacent left- and right-matrices.
4277	F* resultMtx = (F*)(base + ctx.dst);
4278	F* leftMtx = &resultMtx[ctx.rightColumns * ctx.leftRows];
4279	F* rightMtx = &leftMtx[N * ctx.leftRows];
4280
4281	// Emit each matrix element.
4282	for (int c = `0`; c < outColumns; ++c) {
4283	for (int r = `0`; r < outRows; ++r) {
4284	// Dot a vector from leftMtx[][r] with rightMtx[c][].
4285	F* leftRow = &leftMtx [r];
4286	F* rightColumn = &rightMtx[c * N];
4287
4288	F element = leftRow *rightColumn;
4289	for (int idx = `1`; idx < N; ++idx) {
4290	leftRow += outRows;
4291	rightColumn += `1`;
4292	element = mad(f: leftRow, m: rightColumn, a: element);
4293	}
4294
4295	*resultMtx++ = element;
4296	}
4297	}
4298	}
4299
4300	STAGE_TAIL(matrix_multiply_2, SkRasterPipeline_MatrixMultiplyCtx* packed) {
4301	matrix_multiply<`2`>(packed, base);
4302	}
4303
4304	STAGE_TAIL(matrix_multiply_3, SkRasterPipeline_MatrixMultiplyCtx* packed) {
4305	matrix_multiply<`3`>(packed, base);
4306	}
4307
4308	STAGE_TAIL(matrix_multiply_4, SkRasterPipeline_MatrixMultiplyCtx* packed) {
4309	matrix_multiply<`4`>(packed, base);
4310	}
4311
4312	// Refract always operates on 4-wide incident and normal vectors; for narrower inputs, the code
4313	// generator fills in the input columns with zero, and discards the extra output columns.
4314	STAGE_TAIL(refract_4_floats, F* dst) {
4315	// Algorithm adapted from https://registry.khronos.org/OpenGL-Refpages/gl4/html/refract.xhtml
4316	F *incident = dst + `0`;
4317	F *normal = dst + `4`;
4318	F eta = dst[`8`];
4319
4320	F dotNI = mad(f: normal[`0`], m: incident[`0`],
4321	a: mad(f: normal[`1`], m: incident[`1`],
4322	a: mad(f: normal[`2`], m: incident[`2`],
4323	a: normal[`3`] * incident[`3`])));
4324
4325	F k = `1.0` - eta * eta * (`1.0` - dotNI * dotNI);
4326	F sqrt_k = sqrt_(v: k);
4327
4328	for (int idx = `0`; idx < `4`; ++idx) {
4329	dst[idx] = if_then_else(c: k >= `0`,
4330	t: eta * incident[idx] - (eta * dotNI + sqrt_k) * normal[idx],
4331	e: `0.0`);
4332	}
4333	}
4334
4335	// Ternary operations work like binary ops (see immediately above) but take two source inputs.
4336	template <typename T, void (ApplyFn)(T, T, T)>
4337	SI void apply_adjacent_ternary(T* dst, T* src0, T* src1) {
4338	int count = src0 - dst;
4339	#if !defined(JUMPER_IS_SCALAR)
4340	__builtin_assume(count >= `1`);
4341	#endif
4342
4343	for (int index = `0`; index < count; ++index) {
4344	ApplyFn(dst, src0, src1);
4345	dst += `1`;
4346	src0 += `1`;
4347	src1 += `1`;
4348	}
4349	}
4350
4351	template <typename T, void (ApplyFn)(T, T, T)>
4352	SI void apply_adjacent_ternary_packed(SkRasterPipeline_TernaryOpCtx* packed, std::byte* base) {
4353	auto ctx = SkRPCtxUtils::Unpack(ctx: packed);
4354	std::byte* dst = base + ctx.dst;
4355	std::byte* src0 = dst + ctx.delta;
4356	std::byte* src1 = src0 + ctx.delta;
4357	apply_adjacent_ternary<T, ApplyFn>((T)dst, (T)src0, (T*)src1);
4358	}
4359
4360	SI void mix_fn(F* a, F* x, F* y) {
4361	// We reorder the arguments here to match lerp's GLSL-style order (interpolation point last).
4362	a = lerp(from: x, to: y, t: a);
4363	}
4364
4365	SI void mix_fn(I32* a, I32* x, I32* y) {
4366	// We reorder the arguments here to match if_then_else's expected order (y before x).
4367	a = if_then_else(c: a, t: y, e: x);
4368	}
4369
4370	SI void smoothstep_fn(F* edge0, F* edge1, F* x) {
4371	F t = clamp_01_(v: (x - edge0) / (edge1 - edge0));
4372	edge0 = t t * (`3.0` - `2.0` * t);
4373	}
4374
4375	#define DECLARE_N_WAY_TERNARY_FLOAT(name) \
4376	STAGE_TAIL(name##_n_floats, SkRasterPipeline_TernaryOpCtx* packed) { \
4377	apply_adjacent_ternary_packed<F, &name##_fn>(packed, base); \
4378	}
4379
4380	#define DECLARE_TERNARY_FLOAT(name) \
4381	STAGE_TAIL(name##_float, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+1, p+2); } \
4382	STAGE_TAIL(name##_2_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+2, p+4); } \
4383	STAGE_TAIL(name##_3_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+3, p+6); } \
4384	STAGE_TAIL(name##_4_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+4, p+8); } \
4385	DECLARE_N_WAY_TERNARY_FLOAT(name)
4386
4387	#define DECLARE_TERNARY_INT(name) \
4388	STAGE_TAIL(name##_int, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+1, p+2); } \
4389	STAGE_TAIL(name##_2_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+2, p+4); } \
4390	STAGE_TAIL(name##_3_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+3, p+6); } \
4391	STAGE_TAIL(name##_4_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+4, p+8); } \
4392	STAGE_TAIL(name##_n_ints, SkRasterPipeline_TernaryOpCtx* packed) { \
4393	apply_adjacent_ternary_packed<I32, &name##_fn>(packed, base); \
4394	}
4395
4396	DECLARE_N_WAY_TERNARY_FLOAT(smoothstep)
4397	DECLARE_TERNARY_FLOAT(mix)
4398	DECLARE_TERNARY_INT(mix)
4399
4400	#undef DECLARE_N_WAY_TERNARY_FLOAT
4401	#undef DECLARE_TERNARY_FLOAT
4402	#undef DECLARE_TERNARY_INT
4403
4404	STAGE(gauss_a_to_rgba, NoCtx) {
4405	// x = 1 - x;
4406	// exp(-x x * 4) - 0.018f;*
4407	// ... now approximate with quartic
4408	//
4409	const float c4 = -`2.26661229133605957031f`;
4410	const float c3 = `2.89795351028442382812f`;
4411	const float c2 = `0.21345567703247070312f`;
4412	const float c1 = `0.15489584207534790039f`;
4413	const float c0 = `0.00030726194381713867f`;
4414	a = mad(f: a, m: mad(f: a, m: mad(f: a, m: mad(f: a, m: c4, a: c3), a: c2), a: c1), a: c0);
4415	r = a;
4416	g = a;
4417	b = a;
4418	}
4419
4420	// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
4421	STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
4422	// (cx,cy) are the center of our sample.
4423	F cx = r,
4424	cy = g;
4425
4426	// All sample points are at the same fractional offset (fx,fy).
4427	// They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
4428	F fx = fract(v: cx + `0.5f`),
4429	fy = fract(v: cy + `0.5f`);
4430
4431	// We'll accumulate the color of all four samples into {r,g,b,a} directly.
4432	r = g = b = a = `0`;
4433
4434	for (float py = -`0.5f`; py <= +`0.5f`; py += `1.0f`)
4435	for (float px = -`0.5f`; px <= +`0.5f`; px += `1.0f`) {
4436	// (x,y) are the coordinates of this sample point.
4437	F x = cx + px,
4438	y = cy + py;
4439
4440	// ix_and_ptr() will clamp to the image's bounds for us.
4441	const uint32_t* ptr;
4442	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x,y);
4443
4444	F sr,sg,sb,sa;
4445	from_8888(8888: gather(p: ptr, ix), r: &sr,g: &sg,b: &sb,a: &sa);
4446
4447	// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
4448	// are combined in direct proportion to their area overlapping that logical query pixel.
4449	// At positive offsets, the x-axis contribution to that rectangle is fx,
4450	// or (1-fx) at negative x. Same deal for y.
4451	F sx = (px > `0`) ? fx : `1.0f` - fx,
4452	sy = (py > `0`) ? fy : `1.0f` - fy,
4453	area = sx * sy;
4454
4455	r += sr * area;
4456	g += sg * area;
4457	b += sb * area;
4458	a += sa * area;
4459	}
4460	}
4461
4462	// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
4463	STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
4464	// (cx,cy) are the center of our sample.
4465	F cx = r,
4466	cy = g;
4467
4468	// All sample points are at the same fractional offset (fx,fy).
4469	// They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
4470	F fx = fract(v: cx + `0.5f`),
4471	fy = fract(v: cy + `0.5f`);
4472
4473	// We'll accumulate the color of all four samples into {r,g,b,a} directly.
4474	r = g = b = a = `0`;
4475
4476	const float* w = ctx->weights;
4477	const F scaley[`4`] = {bicubic_wts(t: fy, A: w[`0`], B: w[`4`], C: w[ `8`], D: w[`12`]),
4478	bicubic_wts(t: fy, A: w[`1`], B: w[`5`], C: w[ `9`], D: w[`13`]),
4479	bicubic_wts(t: fy, A: w[`2`], B: w[`6`], C: w[`10`], D: w[`14`]),
4480	bicubic_wts(t: fy, A: w[`3`], B: w[`7`], C: w[`11`], D: w[`15`])};
4481	const F scalex[`4`] = {bicubic_wts(t: fx, A: w[`0`], B: w[`4`], C: w[ `8`], D: w[`12`]),
4482	bicubic_wts(t: fx, A: w[`1`], B: w[`5`], C: w[ `9`], D: w[`13`]),
4483	bicubic_wts(t: fx, A: w[`2`], B: w[`6`], C: w[`10`], D: w[`14`]),
4484	bicubic_wts(t: fx, A: w[`3`], B: w[`7`], C: w[`11`], D: w[`15`])};
4485
4486	F sample_y = cy - `1.5f`;
4487	for (int yy = `0`; yy <= `3`; ++yy) {
4488	F sample_x = cx - `1.5f`;
4489	for (int xx = `0`; xx <= `3`; ++xx) {
4490	F scale = scalex[xx] * scaley[yy];
4491
4492	// ix_and_ptr() will clamp to the image's bounds for us.
4493	const uint32_t* ptr;
4494	U32 ix = ix_and_ptr(ptr: &ptr, ctx, x: sample_x, y: sample_y);
4495
4496	F sr,sg,sb,sa;
4497	from_8888(8888: gather(p: ptr, ix), r: &sr,g: &sg,b: &sb,a: &sa);
4498
4499	r = mad(f: scale, m: sr, a: r);
4500	g = mad(f: scale, m: sg, a: g);
4501	b = mad(f: scale, m: sb, a: b);
4502	a = mad(f: scale, m: sa, a);
4503
4504	sample_x += `1`;
4505	}
4506	sample_y += `1`;
4507	}
4508	}
4509
4510	// ~~~~~~ skgpu::Swizzle stage ~~~~~~ //
4511
4512	STAGE(swizzle, void* ctx) {
4513	auto ir = r, ig = g, ib = b, ia = a;
4514	F* o[] = {&r, &g, &b, &a};
4515	char swiz[`4`];
4516	memcpy(dest: swiz, src: &ctx, n: sizeof(swiz));
4517
4518	for (int i = `0`; i < `4`; ++i) {
4519	switch (swiz[i]) {
4520	case `'r'`: o[i] = ir; break*;
4521	case `'g'`: o[i] = ig; break*;
4522	case `'b'`: o[i] = ib; break*;
4523	case `'a'`: o[i] = ia; break*;
4524	case `'0'`: o[i] = F(`0`); break*;
4525	case `'1'`: o[i] = F(`1`); break*;
4526	default: break;
4527	}
4528	}
4529	}
4530
4531	namespace lowp {
4532	#if defined(JUMPER_IS_SCALAR) \|\| defined(SK_DISABLE_LOWP_RASTER_PIPELINE)
4533	// If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually),
4534	// we don't generate lowp stages. All these nullptrs will tell SkJumper.cpp to always use the
4535	// highp float pipeline.
4536	#define M(st) static void (*st)(void) = nullptr;
4537	SK_RASTER_PIPELINE_OPS_LOWP(M)
4538	#undef M
4539	static void (just_return)(void) = nullptr*;
4540
4541	static void start_pipeline(size_t,size_t,size_t,size_t, SkRasterPipelineStage*) {}
4542
4543	#else // We are compiling vector code with Clang... let's make some lowp stages!
4544
// Lowp vector types: 16 lanes when JUMPER_IS_HSW is defined, 8 lanes otherwise.
#if defined(JUMPER_IS_HSW)
    template <typename T> using LowpVec = T __attribute__((ext_vector_type(16)));
#else
    template <typename T> using LowpVec = T __attribute__((ext_vector_type(8)));
#endif

using U8  = LowpVec<uint8_t >;
using U16 = LowpVec<uint16_t>;
using I16 = LowpVec<int16_t >;
using I32 = LowpVec<int32_t >;
using U32 = LowpVec<uint32_t>;
using I64 = LowpVec<int64_t >;
using U64 = LowpVec<uint64_t>;
using F   = LowpVec<float   >;

// Number of lanes in each lowp vector.
static constexpr size_t N = sizeof(U16) / sizeof(uint16_t);
4566
4567	// Once again, some platforms benefit from a restricted Stage calling convention,
4568	// but others can pass tons and tons of registers and we're happy to exploit that.
4569	// It's exactly the same decision and implementation strategy as the F stages above.
4570	#if JUMPER_NARROW_STAGES
4571	struct Params {
4572	size_t dx, dy, tail;
4573	U16 dr,dg,db,da;
4574	};
4575	using Stage = void (ABI)(Params, SkRasterPipelineStage* program, U16 r, U16 g, U16 b, U16 a);
4576	#else
4577	using Stage = void (ABI)(size_t tail, SkRasterPipelineStage program,
4578	size_t dx, size_t dy,
4579	U16 r, U16 g, U16 b, U16 a,
4580	U16 dr, U16 dg, U16 db, U16 da);
4581	#endif
4582
4583	static void start_pipeline(const size_t x0, const size_t y0,
4584	const size_t xlimit, const size_t ylimit,
4585	SkRasterPipelineStage* program) {
4586	auto start = (Stage)program->fn;
4587	for (size_t dy = y0; dy < ylimit; dy++) {
4588	#if JUMPER_NARROW_STAGES
4589	Params params = { x0,dy,`0`, `0`,`0`,`0`,`0` };
4590	for (; params.dx + N <= xlimit; params.dx += N) {
4591	start(&params, program, `0`,`0`,`0`,`0`);
4592	}
4593	if (size_t tail = xlimit - params.dx) {
4594	params.tail = tail;
4595	start(&params, program, `0`,`0`,`0`,`0`);
4596	}
4597	#else
4598	size_t dx = x0;
4599	for (; dx + N <= xlimit; dx += N) {
4600	start( `0`, program, dx,dy, `0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`);
4601	}
4602	if (size_t tail = xlimit - dx) {
4603	start(tail, program, dx,dy, `0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`);
4604	}
4605	#endif
4606	}
4607	}
4608
4609	#if JUMPER_NARROW_STAGES
4610	static void ABI just_return(Params, SkRasterPipelineStage, U16,U16,U16,U16) {}
4611	#else
4612	static void ABI just_return(size_t, SkRasterPipelineStage*,size_t,size_t,
4613	U16,U16,U16,U16, U16,U16,U16,U16) {}
4614	#endif
4615
// All stages use the same function call ABI to chain into each other, but there are three types:
//   GG: geometry in, geometry out  -- think, a matrix
//   GP: geometry in, pixels out.   -- think, a memory gather
//   PP: pixels in, pixels out.     -- think, a blend mode
//
// (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
//
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
//
// Each macro forward-declares the kernel name##_k, defines the ABI wrapper `name` that calls the
// kernel and then the next stage in `program`, and ends with the kernel's signature so that the
// kernel body can follow the macro invocation. Geometry is carried in the r,g,b,a registers:
// x is join<F>(r,g) and y is join<F>(b,a).

#if JUMPER_NARROW_STAGES
    #define STAGE_GG(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F& x, F& y); \
        static void ABI name(Params* params, SkRasterPipelineStage* program, \
                             U16 r, U16 g, U16 b, U16 a) { \
            auto x = join<F>(r,g); \
            auto y = join<F>(b,a); \
            name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \
            split(x, &r,&g); \
            split(y, &b,&a); \
            auto next = (Stage)(++program)->fn; \
            next(params, program, r,g,b,a); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F& x, F& y)

    #define STAGE_GP(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F x, F y, \
                         U16&  r, U16&  g, U16&  b, U16&  a, \
                         U16& dr, U16& dg, U16& db, U16& da); \
        static void ABI name(Params* params, SkRasterPipelineStage* program, \
                             U16 r, U16 g, U16 b, U16 a) { \
            auto x = join<F>(r,g); \
            auto y = join<F>(b,a); \
            name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \
                     params->dr,params->dg,params->db,params->da); \
            auto next = (Stage)(++program)->fn; \
            next(params, program, r,g,b,a); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F x, F y, \
                         U16&  r, U16&  g, U16&  b, U16&  a, \
                         U16& dr, U16& dg, U16& db, U16& da)

    #define STAGE_PP(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, \
                         U16&  r, U16&  g, U16&  b, U16&  a, \
                         U16& dr, U16& dg, U16& db, U16& da); \
        static void ABI name(Params* params, SkRasterPipelineStage* program, \
                             U16 r, U16 g, U16 b, U16 a) { \
            name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \
                     params->dr,params->dg,params->db,params->da); \
            auto next = (Stage)(++program)->fn; \
            next(params, program, r,g,b,a); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, \
                         U16&  r, U16&  g, U16&  b, U16&  a, \
                         U16& dr, U16& dg, U16& db, U16& da)
#else
    #define STAGE_GG(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F& x, F& y); \
        static void ABI name(size_t tail, SkRasterPipelineStage* program, \
                             size_t dx, size_t dy, \
                             U16  r, U16  g, U16  b, U16  a, \
                             U16 dr, U16 dg, U16 db, U16 da) { \
            auto x = join<F>(r,g); \
            auto y = join<F>(b,a); \
            name##_k(Ctx{program}, dx,dy,tail, x,y); \
            split(x, &r,&g); \
            split(y, &b,&a); \
            auto next = (Stage)(++program)->fn; \
            next(tail, program, dx,dy, r,g,b,a, dr,dg,db,da); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F& x, F& y)

    #define STAGE_GP(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F x, F y, \
                         U16&  r, U16&  g, U16&  b, U16&  a, \
                         U16& dr, U16& dg, U16& db, U16& da); \
        static void ABI name(size_t tail, SkRasterPipelineStage* program, \
                             size_t dx, size_t dy, \
                             U16  r, U16  g, U16  b, U16  a, \
                             U16 dr, U16 dg, U16 db, U16 da) { \
            auto x = join<F>(r,g); \
            auto y = join<F>(b,a); \
            name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
            auto next = (Stage)(++program)->fn; \
            next(tail, program, dx,dy, r,g,b,a, dr,dg,db,da); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, F x, F y, \
                         U16&  r, U16&  g, U16&  b, U16&  a, \
                         U16& dr, U16& dg, U16& db, U16& da)

    #define STAGE_PP(name, ARG) \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, \
                         U16&  r, U16&  g, U16&  b, U16&  a, \
                         U16& dr, U16& dg, U16& db, U16& da); \
        static void ABI name(size_t tail, SkRasterPipelineStage* program, \
                             size_t dx, size_t dy, \
                             U16  r, U16  g, U16  b, U16  a, \
                             U16 dr, U16 dg, U16 db, U16 da) { \
            name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
            auto next = (Stage)(++program)->fn; \
            next(tail, program, dx,dy, r,g,b,a, dr,dg,db,da); \
        } \
        SI void name##_k(ARG, size_t dx, size_t dy, size_t tail, \
                         U16&  r, U16&  g, U16&  b, U16&  a, \
                         U16& dr, U16& dg, U16& db, U16& da)
#endif
4723
4724	// ~~~~~~ Commonly used helper functions ~~~~~~ //
4725
4726	/**
 * Helpers to do properly rounded division (by 255). The ideal answer we want to compute is slow,
4728	* thanks to a division by a non-power of two:
4729	* [1] (v + 127) / 255
4730	*
4731	* There is a two-step process that computes the correct answer for all inputs:
4732	* [2] (v + 128 + ((v + 128) >> 8)) >> 8
4733	*
4734	* There is also a single iteration approximation, but it's wrong (+-1) ~25% of the time:
4735	* [3] (v + 255) >> 8;
4736	*
4737	* We offer two different implementations here, depending on the requirements of the calling stage.
4738	*/
4739
4740	/**
4741	* div255 favors speed over accuracy. It uses formula [2] on NEON (where we can compute it as fast
4742	* as [3]), and uses [3] elsewhere.
4743	*/
4744	SI U16 div255(U16 v) {
4745	#if defined(JUMPER_IS_NEON)
4746	// With NEON we can compute [2] just as fast as [3], so let's be correct.
4747	// First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up:
4748	return vrshrq_n_u16(vrsraq_n_u16(v, v, `8`), `8`);
4749	#else
4750	// Otherwise, use [3], which is never wrong by more than 1:
4751	return (v+`255`)/`256`;
4752	#endif
4753	}
4754
4755	/**
4756	* div255_accurate guarantees the right answer on all platforms, at the expense of performance.
4757	*/
4758	SI U16 div255_accurate(U16 v) {
4759	#if defined(JUMPER_IS_NEON)
4760	// Our NEON implementation of div255 is already correct for all inputs:
4761	return div255(v);
4762	#else
4763	// This is [2] (the same formulation as NEON), but written without the benefit of intrinsics:
4764	v += `128`;
4765	return (v+(v/`256`))/`256`;
4766	#endif
4767	}
4768
4769	SI U16 inv(U16 v) { return `255`-v; }
4770
4771	SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) \| (e & ~c); }
4772	SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) \| (e & ~c); }
4773
4774	SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
4775	SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
4776
4777	SI U16 from_float(float f) { return f * `255.0f` + `0.5f`; }
4778
4779	SI U16 lerp(U16 from, U16 to, U16 t) { return div255( frominv(t) + tot ); }
4780
4781	template <typename D, typename S>
4782	SI D cast(S src) {
4783	return __builtin_convertvector(src, D);
4784	}
4785
4786	template <typename D, typename S>
4787	SI void split(S v, D* lo, D* hi) {
4788	static_assert(`2`*sizeof(D) == sizeof(S), "");
4789	memcpy(lo, (const char)&v + `0`sizeof(D), sizeof(D));
4790	memcpy(hi, (const char)&v + `1`sizeof(D), sizeof(D));
4791	}
4792	template <typename D, typename S>
4793	SI D join(S lo, S hi) {
4794	static_assert(sizeof(D) == `2`*sizeof(S), "");
4795	D v;
4796	memcpy((char)&v + `0`sizeof(S), &lo, sizeof(S));
4797	memcpy((char)&v + `1`sizeof(S), &hi, sizeof(S));
4798	return v;
4799	}
4800
4801	SI F if_then_else(I32 c, F t, F e) {
4802	return sk_bit_cast<F>( (sk_bit_cast<I32>(t) & c) \| (sk_bit_cast<I32>(e) & ~c) );
4803	}
4804	SI F max(F x, F y) { return if_then_else(x < y, y, x); }
4805	SI F min(F x, F y) { return if_then_else(x < y, x, y); }
4806
4807	SI I32 if_then_else(I32 c, I32 t, I32 e) {
4808	return (t & c) \| (e & ~c);
4809	}
4810	SI I32 max(I32 x, I32 y) { return if_then_else(x < y, y, x); }
4811	SI I32 min(I32 x, I32 y) { return if_then_else(x < y, x, y); }
4812
4813	SI F mad(F f, F m, F a) { return f*m+a; }
4814	SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
4815
4816	// Use approximate instructions and one Newton-Raphson step to calculate 1/x.
4817	SI F rcp_precise(F x) {
4818	#if defined(JUMPER_IS_HSW)
4819	__m256 lo,hi;
4820	split(x, &lo,&hi);
4821	return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
4822	#elif defined(JUMPER_IS_SSE2) \|\| defined(JUMPER_IS_SSE41) \|\| defined(JUMPER_IS_AVX)
4823	__m128 lo,hi;
4824	split(x, &lo,&hi);
4825	return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
4826	#elif defined(JUMPER_IS_NEON)
4827	float32x4_t lo,hi;
4828	split(x, &lo,&hi);
4829	return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
4830	#else
4831	return `1.0f` / x;
4832	#endif
4833	}
4834	SI F sqrt_(F x) {
4835	#if defined(JUMPER_IS_HSW)
4836	__m256 lo,hi;
4837	split(x, &lo,&hi);
4838	return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
4839	#elif defined(JUMPER_IS_SSE2) \|\| defined(JUMPER_IS_SSE41) \|\| defined(JUMPER_IS_AVX)
4840	__m128 lo,hi;
4841	split(x, &lo,&hi);
4842	return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi));
4843	#elif defined(SK_CPU_ARM64)
4844	float32x4_t lo,hi;
4845	split(x, &lo,&hi);
4846	return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi));
4847	#elif defined(JUMPER_IS_NEON)
4848	auto sqrt = [](float32x4_t v) {
4849	auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
4850	est = vrsqrtsq_f32(v,estest);
4851	est = vrsqrtsq_f32(v,estest);
4852	return vest; // sqrt(v) == vrsqrt(v).
4853	};
4854	float32x4_t lo,hi;
4855	split(x, &lo,&hi);
4856	return join<F>(sqrt(lo), sqrt(hi));
4857	#else
4858	return F{
4859	sqrtf(x[`0`]), sqrtf(x[`1`]), sqrtf(x[`2`]), sqrtf(x[`3`]),
4860	sqrtf(x[`4`]), sqrtf(x[`5`]), sqrtf(x[`6`]), sqrtf(x[`7`]),
4861	};
4862	#endif
4863	}
4864
4865	SI F floor_(F x) {
4866	#if defined(SK_CPU_ARM64)
4867	float32x4_t lo,hi;
4868	split(x, &lo,&hi);
4869	return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
4870	#elif defined(JUMPER_IS_HSW)
4871	__m256 lo,hi;
4872	split(x, &lo,&hi);
4873	return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi));
4874	#elif defined(JUMPER_IS_SSE41) \|\| defined(JUMPER_IS_AVX)
4875	__m128 lo,hi;
4876	split(x, &lo,&hi);
4877	return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi));
4878	#else
4879	F roundtrip = cast<F>(cast<I32>(x));
4880	return roundtrip - if_then_else(roundtrip > x, F(`1`), F(`0`));
4881	#endif
4882	}
4883
4884	// scaled_mult interprets a and b as number on [-1, 1) which are numbers in Q15 format. Functionally
4885	// this multiply is:
4886	// (2 a * b + (1 << 15)) >> 16*
4887	// The result is a number on [-1, 1).
4888	// Note: on neon this is a saturating multiply while the others are not.
4889	SI I16 scaled_mult(I16 a, I16 b) {
4890	#if defined(JUMPER_IS_HSW)
4891	return _mm256_mulhrs_epi16(a, b);
4892	#elif defined(JUMPER_IS_SSE41) \|\| defined(JUMPER_IS_AVX)
4893	return _mm_mulhrs_epi16(a, b);
4894	#elif defined(SK_CPU_ARM64)
4895	return vqrdmulhq_s16(a, b);
4896	#elif defined(JUMPER_IS_NEON)
4897	return vqrdmulhq_s16(a, b);
4898	#else
4899	const I32 roundingTerm = `1` << `14`;
4900	return cast<I16>((cast<I32>(a) * cast<I32>(b) + roundingTerm) >> `15`);
4901	#endif
4902	}
4903
4904	// This sum is to support lerp where the result will always be a positive number. In general,
4905	// a sum like this would require an additional bit, but because we know the range of the result
4906	// we know that the extra bit will always be zero.
4907	SI U16 constrained_add(I16 a, U16 b) {
4908	#if defined(SK_DEBUG)
4909	for (size_t i = `0`; i < N; i++) {
4910	// Ensure that a + b is on the interval [0, UINT16_MAX]
4911	int ia = a[i],
4912	ib = b[i];
4913	// Use 65535 here because fuchsia's compiler evaluates UINT16_MAX - ib, which is
4914	// 65536U - ib, as an uint32_t instead of an int32_t. This was forcing ia to be
4915	// interpreted as an uint32_t.
4916	SkASSERT(-ib <= ia && ia <= `65535` - ib);
4917	}
4918	#endif
4919	return b + a;
4920	}
4921
4922	SI F fract(F x) { return x - floor_(x); }
4923	SI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & `0x7fffffff` ); }
4924
4925	// ~~~~~~ Basic / misc. stages ~~~~~~ //
4926
4927	STAGE_GG(seed_shader, NoCtx) {
4928	static constexpr float iota[] = {
4929	`0.5f`, `1.5f`, `2.5f`, `3.5f`, `4.5f`, `5.5f`, `6.5f`, `7.5f`,
4930	`8.5f`, `9.5f`,`10.5f`,`11.5f`,`12.5f`,`13.5f`,`14.5f`,`15.5f`,
4931	};
4932	x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota);
4933	y = cast<F>(I32(dy)) + `0.5f`;
4934	}
4935
4936	STAGE_GG(matrix_translate, const float* m) {
4937	x += m[`0`];
4938	y += m[`1`];
4939	}
4940	STAGE_GG(matrix_scale_translate, const float* m) {
4941	x = mad(x,m[`0`], m[`2`]);
4942	y = mad(y,m[`1`], m[`3`]);
4943	}
4944	STAGE_GG(matrix_2x3, const float* m) {
4945	auto X = mad(x,m[`0`], mad(y,m[`1`], m[`2`])),
4946	Y = mad(x,m[`3`], mad(y,m[`4`], m[`5`]));
4947	x = X;
4948	y = Y;
4949	}
4950	STAGE_GG(matrix_perspective, const float* m) {
4951	// N.B. Unlike the other matrix_ stages, this matrix is row-major.
4952	auto X = mad(x,m[`0`], mad(y,m[`1`], m[`2`])),
4953	Y = mad(x,m[`3`], mad(y,m[`4`], m[`5`])),
4954	Z = mad(x,m[`6`], mad(y,m[`7`], m[`8`]));
4955	x = X * rcp_precise(Z);
4956	y = Y * rcp_precise(Z);
4957	}
4958
4959	STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
4960	r = c->rgba[`0`];
4961	g = c->rgba[`1`];
4962	b = c->rgba[`2`];
4963	a = c->rgba[`3`];
4964	}
4965	STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
4966	dr = c->rgba[`0`];
4967	dg = c->rgba[`1`];
4968	db = c->rgba[`2`];
4969	da = c->rgba[`3`];
4970	}
4971	STAGE_PP(black_color, NoCtx) { r = g = b = `0`; a = `255`; }
4972	STAGE_PP(white_color, NoCtx) { r = g = b = `255`; a = `255`; }
4973
4974	STAGE_PP(set_rgb, const float rgb[`3`]) {
4975	r = from_float(rgb[`0`]);
4976	g = from_float(rgb[`1`]);
4977	b = from_float(rgb[`2`]);
4978	}
4979
4980	// No need to clamp against 0 here (values are unsigned)
4981	STAGE_PP(clamp_01, NoCtx) {
4982	r = min(r, `255`);
4983	g = min(g, `255`);
4984	b = min(b, `255`);
4985	a = min(a, `255`);
4986	}
4987
4988	STAGE_PP(clamp_gamut, NoCtx) {
4989	a = min(a, `255`);
4990	r = min(r, a);
4991	g = min(g, a);
4992	b = min(b, a);
4993	}
4994
4995	STAGE_PP(premul, NoCtx) {
4996	r = div255_accurate(r * a);
4997	g = div255_accurate(g * a);
4998	b = div255_accurate(b * a);
4999	}
5000	STAGE_PP(premul_dst, NoCtx) {
5001	dr = div255_accurate(dr * da);
5002	dg = div255_accurate(dg * da);
5003	db = div255_accurate(db * da);
5004	}
5005
5006	STAGE_PP(force_opaque , NoCtx) { a = `255`; }
5007	STAGE_PP(force_opaque_dst, NoCtx) { da = `255`; }
5008
5009	STAGE_PP(swap_rb, NoCtx) {
5010	auto tmp = r;
5011	r = b;
5012	b = tmp;
5013	}
5014	STAGE_PP(swap_rb_dst, NoCtx) {
5015	auto tmp = dr;
5016	dr = db;
5017	db = tmp;
5018	}
5019
5020	STAGE_PP(move_src_dst, NoCtx) {
5021	dr = r;
5022	dg = g;
5023	db = b;
5024	da = a;
5025	}
5026
5027	STAGE_PP(move_dst_src, NoCtx) {
5028	r = dr;
5029	g = dg;
5030	b = db;
5031	a = da;
5032	}
5033
5034	STAGE_PP(swap_src_dst, NoCtx) {
5035	std::swap(r, dr);
5036	std::swap(g, dg);
5037	std::swap(b, db);
5038	std::swap(a, da);
5039	}
5040
5041	// ~~~~~~ Blend modes ~~~~~~ //
5042
5043	// The same logic applied to all 4 channels.
5044	#define BLEND_MODE(name) \
5045	SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
5046	STAGE_PP(name, NoCtx) { \
5047	r = name##_channel(r,dr,a,da); \
5048	g = name##_channel(g,dg,a,da); \
5049	b = name##_channel(b,db,a,da); \
5050	a = name##_channel(a,da,a,da); \
5051	} \
5052	SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
5053
5054	#if defined(SK_USE_INACCURATE_DIV255_IN_BLEND)
5055	BLEND_MODE(clear) { return `0`; }
5056	BLEND_MODE(srcatop) { return div255( sda + dinv(sa) ); }
5057	BLEND_MODE(dstatop) { return div255( dsa + sinv(da) ); }
5058	BLEND_MODE(srcin) { return div255( s*da ); }
5059	BLEND_MODE(dstin) { return div255( d*sa ); }
5060	BLEND_MODE(srcout) { return div255( s*inv(da) ); }
5061	BLEND_MODE(dstout) { return div255( d*inv(sa) ); }
5062	BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); }
5063	BLEND_MODE(dstover) { return d + div255( s*inv(da) ); }
5064	BLEND_MODE(modulate) { return div255( s*d ); }
5065	BLEND_MODE(multiply) { return div255( sinv(da) + dinv(sa) + s*d ); }
5066	BLEND_MODE(plus_) { return min(s+d, `255`); }
5067	BLEND_MODE(screen) { return s + d - div255( s*d ); }
5068	BLEND_MODE(xor_) { return div255( sinv(da) + dinv(sa) ); }
5069	#else
5070	BLEND_MODE(clear) { return `0`; }
5071	BLEND_MODE(srcatop) { return div255( sda + dinv(sa) ); }
5072	BLEND_MODE(dstatop) { return div255( dsa + sinv(da) ); }
5073	BLEND_MODE(srcin) { return div255_accurate( s*da ); }
5074	BLEND_MODE(dstin) { return div255_accurate( d*sa ); }
5075	BLEND_MODE(srcout) { return div255_accurate( s*inv(da) ); }
5076	BLEND_MODE(dstout) { return div255_accurate( d*inv(sa) ); }
5077	BLEND_MODE(srcover) { return s + div255_accurate( d*inv(sa) ); }
5078	BLEND_MODE(dstover) { return d + div255_accurate( s*inv(da) ); }
5079	BLEND_MODE(modulate) { return div255_accurate( s*d ); }
5080	BLEND_MODE(multiply) { return div255( sinv(da) + dinv(sa) + s*d ); }
5081	BLEND_MODE(plus_) { return min(s+d, `255`); }
5082	BLEND_MODE(screen) { return s + d - div255_accurate( s*d ); }
5083	BLEND_MODE(xor_) { return div255( sinv(da) + dinv(sa) ); }
5084	#endif
5085	#undef BLEND_MODE
5086
5087	// The same logic applied to color, and srcover for alpha.
5088	#define BLEND_MODE(name) \
5089	SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
5090	STAGE_PP(name, NoCtx) { \
5091	r = name##_channel(r,dr,a,da); \
5092	g = name##_channel(g,dg,a,da); \
5093	b = name##_channel(b,db,a,da); \
5094	a = a + div255( da*inv(a) ); \
5095	} \
5096	SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
5097
5098	BLEND_MODE(darken) { return s + d - div255( max(sda, dsa) ); }
5099	BLEND_MODE(lighten) { return s + d - div255( min(sda, dsa) ); }
5100	BLEND_MODE(difference) { return s + d - `2`div255( min(sda, d*sa) ); }
5101	BLEND_MODE(exclusion) { return s + d - `2`div255( sd ); }
5102
5103	BLEND_MODE(hardlight) {
5104	return div255( sinv(da) + dinv(sa) +
5105	if_then_else(`2`s <= sa, `2`sd, sada - `2`(sa-s)(da-d)) );
5106	}
5107	BLEND_MODE(overlay) {
5108	return div255( sinv(da) + dinv(sa) +
5109	if_then_else(`2`d <= da, `2`sd, sada - `2`(sa-s)(da-d)) );
5110	}
5111	#undef BLEND_MODE
5112
5113	// ~~~~~~ Helpers for interacting with memory ~~~~~~ //
5114
5115	template <typename T>
5116	SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
5117	return (T)ctx->pixels + dyctx->stride + dx;
5118	}
5119
5120	template <typename T>
5121	SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
5122	// Exclusive -> inclusive.
5123	const F w = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->width ) - `1`),
5124	h = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->height) - `1`);
5125
5126	const F z = std::numeric_limits<float>::min();
5127
5128	x = min(max(z, x), w);
5129	y = min(max(z, y), h);
5130
5131	x = sk_bit_cast<F>(sk_bit_cast<U32>(x) - (uint32_t)ctx->roundDownAtInteger);
5132	y = sk_bit_cast<F>(sk_bit_cast<U32>(y) - (uint32_t)ctx->roundDownAtInteger);
5133
5134	ptr = (const* T*)ctx->pixels;
5135	return trunc_(y)*ctx->stride + trunc_(x);
5136	}
5137
5138	template <typename T>
5139	SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, I32 x, I32 y) {
5140	// This flag doesn't make sense when the coords are integers.
5141	SkASSERT(ctx->roundDownAtInteger == `0`);
5142	// Exclusive -> inclusive.
5143	const I32 w = ctx->width - `1`,
5144	h = ctx->height - `1`;
5145
5146	U32 ax = cast<U32>(min(max(`0`, x), w)),
5147	ay = cast<U32>(min(max(`0`, y), h));
5148
5149	ptr = (const* T*)ctx->pixels;
5150	return ay * ctx->stride + ax;
5151	}
5152
5153	template <typename V, typename T>
5154	SI V load(const T* ptr, size_t tail) {
5155	V v = `0`;
5156	switch (tail & (N-`1`)) {
5157	case `0`: memcpy(&v, ptr, sizeof(v)); break;
5158	#if defined(JUMPER_IS_HSW)
5159	case `15`: v[`14`] = ptr[`14`]; [[fallthrough]];
5160	case `14`: v[`13`] = ptr[`13`]; [[fallthrough]];
5161	case `13`: v[`12`] = ptr[`12`]; [[fallthrough]];
5162	case `12`: memcpy(&v, ptr, `12`*sizeof(T)); break;
5163	case `11`: v[`10`] = ptr[`10`]; [[fallthrough]];
5164	case `10`: v[ `9`] = ptr[ `9`]; [[fallthrough]];
5165	case `9`: v[ `8`] = ptr[ `8`]; [[fallthrough]];
5166	case `8`: memcpy(&v, ptr, `8`*sizeof(T)); break;
5167	#endif
5168	case `7`: v[ `6`] = ptr[ `6`]; [[fallthrough]];
5169	case `6`: v[ `5`] = ptr[ `5`]; [[fallthrough]];
5170	case `5`: v[ `4`] = ptr[ `4`]; [[fallthrough]];
5171	case `4`: memcpy(&v, ptr, `4`*sizeof(T)); break;
5172	case `3`: v[ `2`] = ptr[ `2`]; [[fallthrough]];
5173	case `2`: memcpy(&v, ptr, `2`*sizeof(T)); break;
5174	case `1`: v[ `0`] = ptr[ `0`];
5175	}
5176	return v;
5177	}
5178	template <typename V, typename T>
5179	SI void store(T* ptr, size_t tail, V v) {
5180	switch (tail & (N-`1`)) {
5181	case `0`: memcpy(ptr, &v, sizeof(v)); break;
5182	#if defined(JUMPER_IS_HSW)
5183	case `15`: ptr[`14`] = v[`14`]; [[fallthrough]];
5184	case `14`: ptr[`13`] = v[`13`]; [[fallthrough]];
5185	case `13`: ptr[`12`] = v[`12`]; [[fallthrough]];
5186	case `12`: memcpy(ptr, &v, `12`*sizeof(T)); break;
5187	case `11`: ptr[`10`] = v[`10`]; [[fallthrough]];
5188	case `10`: ptr[ `9`] = v[ `9`]; [[fallthrough]];
5189	case `9`: ptr[ `8`] = v[ `8`]; [[fallthrough]];
5190	case `8`: memcpy(ptr, &v, `8`*sizeof(T)); break;
5191	#endif
5192	case `7`: ptr[ `6`] = v[ `6`]; [[fallthrough]];
5193	case `6`: ptr[ `5`] = v[ `5`]; [[fallthrough]];
5194	case `5`: ptr[ `4`] = v[ `4`]; [[fallthrough]];
5195	case `4`: memcpy(ptr, &v, `4`*sizeof(T)); break;
5196	case `3`: ptr[ `2`] = v[ `2`]; [[fallthrough]];
5197	case `2`: memcpy(ptr, &v, `2`*sizeof(T)); break;
5198	case `1`: ptr[ `0`] = v[ `0`];
5199	}
5200	}
5201
5202	#if defined(JUMPER_IS_HSW)
5203	template <typename V, typename T>
5204	SI V gather(const T* ptr, U32 ix) {
5205	return V{ ptr[ix[ `0`]], ptr[ix[ `1`]], ptr[ix[ `2`]], ptr[ix[ `3`]],
5206	ptr[ix[ `4`]], ptr[ix[ `5`]], ptr[ix[ `6`]], ptr[ix[ `7`]],
5207	ptr[ix[ `8`]], ptr[ix[ `9`]], ptr[ix[`10`]], ptr[ix[`11`]],
5208	ptr[ix[`12`]], ptr[ix[`13`]], ptr[ix[`14`]], ptr[ix[`15`]], };
5209	}
5210
5211	template<>
5212	F gather(const float* ptr, U32 ix) {
5213	__m256i lo, hi;
5214	split(ix, &lo, &hi);
5215
5216	return join<F>(_mm256_i32gather_ps(ptr, lo, `4`),
5217	_mm256_i32gather_ps(ptr, hi, `4`));
5218	}
5219
5220	template<>
5221	U32 gather(const uint32_t* ptr, U32 ix) {
5222	__m256i lo, hi;
5223	split(ix, &lo, &hi);
5224
5225	return join<U32>(_mm256_i32gather_epi32(ptr, lo, `4`),
5226	_mm256_i32gather_epi32(ptr, hi, `4`));
5227	}
5228	#else
5229	template <typename V, typename T>
5230	SI V gather(const T* ptr, U32 ix) {
5231	return V{ ptr[ix[ `0`]], ptr[ix[ `1`]], ptr[ix[ `2`]], ptr[ix[ `3`]],
5232	ptr[ix[ `4`]], ptr[ix[ `5`]], ptr[ix[ `6`]], ptr[ix[ `7`]], };
5233	}
5234	#endif
5235
5236
5237	// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
5238
5239	SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
5240	#if defined(JUMPER_IS_HSW)
5241	// Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
5242	__m256i _01,_23;
5243	split(rgba, &_01, &_23);
5244	__m256i _02 = _mm256_permute2x128_si256(_01,_23, `0x20`),
5245	_13 = _mm256_permute2x128_si256(_01,_23, `0x31`);
5246	rgba = join<U32>(_02, _13);
5247
5248	auto cast_U16 = [](U32 v) -> U16 {
5249	__m256i _02,_13;
5250	split(v, &_02,&_13);
5251	return _mm256_packus_epi32(_02,_13);
5252	};
5253	#else
5254	auto cast_U16 = [](U32 v) -> U16 {
5255	return cast<U16>(v);
5256	};
5257	#endif
5258	*r = cast_U16(rgba & `65535`) & `255`;
5259	*g = cast_U16(rgba & `65535`) >> `8`;
5260	*b = cast_U16(rgba >> `16`) & `255`;
5261	*a = cast_U16(rgba >> `16`) >> `8`;
5262	}
5263
5264	SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
5265	#if 1 && defined(JUMPER_IS_NEON)
5266	uint8x8x4_t rgba;
5267	switch (tail & (N-`1`)) {
5268	case `0`: rgba = vld4_u8 ((const uint8_t)(ptr+`0`) ); break*;
5269	case `7`: rgba = vld4_lane_u8((const uint8_t*)(ptr+`6`), rgba, `6`); [[fallthrough]];
5270	case `6`: rgba = vld4_lane_u8((const uint8_t*)(ptr+`5`), rgba, `5`); [[fallthrough]];
5271	case `5`: rgba = vld4_lane_u8((const uint8_t*)(ptr+`4`), rgba, `4`); [[fallthrough]];
5272	case `4`: rgba = vld4_lane_u8((const uint8_t*)(ptr+`3`), rgba, `3`); [[fallthrough]];
5273	case `3`: rgba = vld4_lane_u8((const uint8_t*)(ptr+`2`), rgba, `2`); [[fallthrough]];
5274	case `2`: rgba = vld4_lane_u8((const uint8_t*)(ptr+`1`), rgba, `1`); [[fallthrough]];
5275	case `1`: rgba = vld4_lane_u8((const uint8_t*)(ptr+`0`), rgba, `0`);
5276	}
5277	*r = cast<U16>(rgba.val[`0`]);
5278	*g = cast<U16>(rgba.val[`1`]);
5279	*b = cast<U16>(rgba.val[`2`]);
5280	*a = cast<U16>(rgba.val[`3`]);
5281	#else
5282	from_8888(load<U32>(ptr, tail), r,g,b,a);
5283	#endif
5284	}
5285	SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
5286	r = min(r, `255`);
5287	g = min(g, `255`);
5288	b = min(b, `255`);
5289	a = min(a, `255`);
5290
5291	#if 1 && defined(JUMPER_IS_NEON)
5292	uint8x8x4_t rgba = {{
5293	cast<U8>(r),
5294	cast<U8>(g),
5295	cast<U8>(b),
5296	cast<U8>(a),
5297	}};
5298	switch (tail & (N-`1`)) {
5299	case `0`: vst4_u8 ((uint8_t)(ptr+`0`), rgba ); break*;
5300	case `7`: vst4_lane_u8((uint8_t*)(ptr+`6`), rgba, `6`); [[fallthrough]];
5301	case `6`: vst4_lane_u8((uint8_t*)(ptr+`5`), rgba, `5`); [[fallthrough]];
5302	case `5`: vst4_lane_u8((uint8_t*)(ptr+`4`), rgba, `4`); [[fallthrough]];
5303	case `4`: vst4_lane_u8((uint8_t*)(ptr+`3`), rgba, `3`); [[fallthrough]];
5304	case `3`: vst4_lane_u8((uint8_t*)(ptr+`2`), rgba, `2`); [[fallthrough]];
5305	case `2`: vst4_lane_u8((uint8_t*)(ptr+`1`), rgba, `1`); [[fallthrough]];
5306	case `1`: vst4_lane_u8((uint8_t*)(ptr+`0`), rgba, `0`);
5307	}
5308	#else
5309	store(ptr, tail, cast<U32>(r \| (g<<`8`)) << `0`
5310	\| cast<U32>(b \| (a<<`8`)) << `16`);
5311	#endif
5312	}
5313
5314	STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
5315	load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
5316	}
5317	STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
5318	load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
5319	}
5320	STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
5321	store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
5322	}
5323	STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
5324	const uint32_t* ptr;
5325	U32 ix = ix_and_ptr(&ptr, ctx, x,y);
5326	from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
5327	}
5328
5329	// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
5330
5331	SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
5332	// Format for 565 buffers: 15\|rrrrr gggggg bbbbb\|0
5333	U16 R = (rgb >> `11`) & `31`,
5334	G = (rgb >> `5`) & `63`,
5335	B = (rgb >> `0`) & `31`;
5336
5337	// These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
5338	*r = (R << `3`) \| (R >> `2`);
5339	*g = (G << `2`) \| (G >> `4`);
5340	*b = (B << `3`) \| (B >> `2`);
5341	}
5342	SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
5343	from_565(load<U16>(ptr, tail), r,g,b);
5344	}
5345	SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
5346	r = min(r, `255`);
5347	g = min(g, `255`);
5348	b = min(b, `255`);
5349
5350	// Round from [0,255] to [0,31] or [0,63], as if x (31/255.0f) + 0.5f.*
5351	// (Don't feel like you need to find some fundamental truth in these...
5352	// they were brute-force searched.)
5353	U16 R = (r * `9` + `36`) / `74`, // 9/74 ≈ 31/255, plus 36/74, about half.
5354	G = (g * `21` + `42`) / `85`, // 21/85 = 63/255 exactly.
5355	B = (b * `9` + `36`) / `74`;
5356	// Pack them back into 15\|rrrrr gggggg bbbbb\|0.
5357	store(ptr, tail, R << `11`
5358	\| G << `5`
5359	\| B << `0`);
5360	}
5361
5362	STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
5363	load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
5364	a = `255`;
5365	}
5366	STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
5367	load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
5368	da = `255`;
5369	}
5370	STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
5371	store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
5372	}
5373	STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
5374	const uint16_t* ptr;
5375	U32 ix = ix_and_ptr(&ptr, ctx, x,y);
5376	from_565(gather<U16>(ptr, ix), &r, &g, &b);
5377	a = `255`;
5378	}
5379
5380	SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
5381	// Format for 4444 buffers: 15\|rrrr gggg bbbb aaaa\|0.
5382	U16 R = (rgba >> `12`) & `15`,
5383	G = (rgba >> `8`) & `15`,
5384	B = (rgba >> `4`) & `15`,
5385	A = (rgba >> `0`) & `15`;
5386
5387	// Scale [0,15] to [0,255].
5388	*r = (R << `4`) \| R;
5389	*g = (G << `4`) \| G;
5390	*b = (B << `4`) \| B;
5391	*a = (A << `4`) \| A;
5392	}
5393	SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
5394	from_4444(load<U16>(ptr, tail), r,g,b,a);
5395	}
5396	SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
5397	r = min(r, `255`);
5398	g = min(g, `255`);
5399	b = min(b, `255`);
5400	a = min(a, `255`);
5401
5402	// Round from [0,255] to [0,15], producing the same value as (x(15/255.0f) + 0.5f).*
5403	U16 R = (r + `8`) / `17`,
5404	G = (g + `8`) / `17`,
5405	B = (b + `8`) / `17`,
5406	A = (a + `8`) / `17`;
5407	// Pack them back into 15\|rrrr gggg bbbb aaaa\|0.
5408	store(ptr, tail, R << `12`
5409	\| G << `8`
5410	\| B << `4`
5411	\| A << `0`);
5412	}
5413
5414	STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
5415	load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
5416	}
5417	STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
5418	load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
5419	}
5420	STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
5421	store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
5422	}
5423	STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
5424	const uint16_t* ptr;
5425	U32 ix = ix_and_ptr(&ptr, ctx, x,y);
5426	from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
5427	}
5428
5429	SI void from_88(U16 rg, U16* r, U16* g) {
5430	*r = (rg & `0xFF`);
5431	*g = (rg >> `8`);
5432	}
5433
5434	SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
5435	#if 1 && defined(JUMPER_IS_NEON)
5436	uint8x8x2_t rg;
5437	switch (tail & (N-`1`)) {
5438	case `0`: rg = vld2_u8 ((const uint8_t)(ptr+`0`) ); break*;
5439	case `7`: rg = vld2_lane_u8((const uint8_t*)(ptr+`6`), rg, `6`); [[fallthrough]];
5440	case `6`: rg = vld2_lane_u8((const uint8_t*)(ptr+`5`), rg, `5`); [[fallthrough]];
5441	case `5`: rg = vld2_lane_u8((const uint8_t*)(ptr+`4`), rg, `4`); [[fallthrough]];
5442	case `4`: rg = vld2_lane_u8((const uint8_t*)(ptr+`3`), rg, `3`); [[fallthrough]];
5443	case `3`: rg = vld2_lane_u8((const uint8_t*)(ptr+`2`), rg, `2`); [[fallthrough]];
5444	case `2`: rg = vld2_lane_u8((const uint8_t*)(ptr+`1`), rg, `1`); [[fallthrough]];
5445	case `1`: rg = vld2_lane_u8((const uint8_t*)(ptr+`0`), rg, `0`);
5446	}
5447	*r = cast<U16>(rg.val[`0`]);
5448	*g = cast<U16>(rg.val[`1`]);
5449	#else
5450	from_88(load<U16>(ptr, tail), r,g);
5451	#endif
5452	}
5453
5454	SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) {
5455	r = min(r, `255`);
5456	g = min(g, `255`);
5457
5458	#if 1 && defined(JUMPER_IS_NEON)
5459	uint8x8x2_t rg = {{
5460	cast<U8>(r),
5461	cast<U8>(g),
5462	}};
5463	switch (tail & (N-`1`)) {
5464	case `0`: vst2_u8 ((uint8_t)(ptr+`0`), rg ); break*;
5465	case `7`: vst2_lane_u8((uint8_t*)(ptr+`6`), rg, `6`); [[fallthrough]];
5466	case `6`: vst2_lane_u8((uint8_t*)(ptr+`5`), rg, `5`); [[fallthrough]];
5467	case `5`: vst2_lane_u8((uint8_t*)(ptr+`4`), rg, `4`); [[fallthrough]];
5468	case `4`: vst2_lane_u8((uint8_t*)(ptr+`3`), rg, `3`); [[fallthrough]];
5469	case `3`: vst2_lane_u8((uint8_t*)(ptr+`2`), rg, `2`); [[fallthrough]];
5470	case `2`: vst2_lane_u8((uint8_t*)(ptr+`1`), rg, `1`); [[fallthrough]];
5471	case `1`: vst2_lane_u8((uint8_t*)(ptr+`0`), rg, `0`);
5472	}
5473	#else
5474	store(ptr, tail, cast<U16>(r \| (g<<`8`)) << `0`);
5475	#endif
5476	}
5477
5478	STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
5479	load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g);
5480	b = `0`;
5481	a = `255`;
5482	}
5483	STAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) {
5484	load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg);
5485	db = `0`;
5486	da = `255`;
5487	}
5488	STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
5489	store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g);
5490	}
5491	STAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) {
5492	const uint16_t* ptr;
5493	U32 ix = ix_and_ptr(&ptr, ctx, x, y);
5494	from_88(gather<U16>(ptr, ix), &r, &g);
5495	b = `0`;
5496	a = `255`;
5497	}
5498
5499	// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //
5500
5501	SI U16 load_8(const uint8_t* ptr, size_t tail) {
5502	return cast<U16>(load<U8>(ptr, tail));
5503	}
5504	SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
5505	v = min(v, `255`);
5506	store(ptr, tail, cast<U8>(v));
5507	}
5508
5509	STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
5510	r = g = b = `0`;
5511	a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
5512	}
5513	STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
5514	dr = dg = db = `0`;
5515	da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
5516	}
5517	STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
5518	store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
5519	}
5520	STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
5521	const uint8_t* ptr;
5522	U32 ix = ix_and_ptr(&ptr, ctx, x,y);
5523	r = g = b = `0`;
5524	a = cast<U16>(gather<U8>(ptr, ix));
5525	}
5526	STAGE_PP(store_r8, const SkRasterPipeline_MemoryCtx* ctx) {
5527	store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, r);
5528	}
5529
5530	STAGE_PP(alpha_to_gray, NoCtx) {
5531	r = g = b = a;
5532	a = `255`;
5533	}
5534	STAGE_PP(alpha_to_gray_dst, NoCtx) {
5535	dr = dg = db = da;
5536	da = `255`;
5537	}
5538	STAGE_PP(alpha_to_red, NoCtx) {
5539	r = a;
5540	a = `255`;
5541	}
5542	STAGE_PP(alpha_to_red_dst, NoCtx) {
5543	dr = da;
5544	da = `255`;
5545	}
5546
5547	STAGE_PP(bt709_luminance_or_luma_to_alpha, NoCtx) {
5548	a = (r`54` + g`183` + b`19`)/`256`; // 0.2126, 0.7152, 0.0722 with 256 denominator.*
5549	r = g = b = `0`;
5550	}
5551	STAGE_PP(bt709_luminance_or_luma_to_rgb, NoCtx) {
5552	r = g = b =(r`54` + g`183` + b`19`)/`256`; // 0.2126, 0.7152, 0.0722 with 256 denominator.*
5553	}
5554
// ~~~~~~ Bulk loads and stores of the src/dst channels ~~~~~~ //
5556
5557	STAGE_PP(load_src, const uint16_t* ptr) {
5558	r = sk_unaligned_load<U16>(ptr + `0`*N);
5559	g = sk_unaligned_load<U16>(ptr + `1`*N);
5560	b = sk_unaligned_load<U16>(ptr + `2`*N);
5561	a = sk_unaligned_load<U16>(ptr + `3`*N);
5562	}
5563	STAGE_PP(store_src, uint16_t* ptr) {
5564	sk_unaligned_store(ptr + `0`*N, r);
5565	sk_unaligned_store(ptr + `1`*N, g);
5566	sk_unaligned_store(ptr + `2`*N, b);
5567	sk_unaligned_store(ptr + `3`*N, a);
5568	}
5569	STAGE_PP(store_src_a, uint16_t* ptr) {
5570	sk_unaligned_store(ptr, a);
5571	}
5572	STAGE_PP(load_dst, const uint16_t* ptr) {
5573	dr = sk_unaligned_load<U16>(ptr + `0`*N);
5574	dg = sk_unaligned_load<U16>(ptr + `1`*N);
5575	db = sk_unaligned_load<U16>(ptr + `2`*N);
5576	da = sk_unaligned_load<U16>(ptr + `3`*N);
5577	}
5578	STAGE_PP(store_dst, uint16_t* ptr) {
5579	sk_unaligned_store(ptr + `0`*N, dr);
5580	sk_unaligned_store(ptr + `1`*N, dg);
5581	sk_unaligned_store(ptr + `2`*N, db);
5582	sk_unaligned_store(ptr + `3`*N, da);
5583	}
5584
5585	// ~~~~~~ Coverage scales / lerps ~~~~~~ //
5586
5587	STAGE_PP(scale_1_float, const float* f) {
5588	U16 c = from_float(*f);
5589	r = div255( r * c );
5590	g = div255( g * c );
5591	b = div255( b * c );
5592	a = div255( a * c );
5593	}
5594	STAGE_PP(lerp_1_float, const float* f) {
5595	U16 c = from_float(*f);
5596	r = lerp(dr, r, c);
5597	g = lerp(dg, g, c);
5598	b = lerp(db, b, c);
5599	a = lerp(da, a, c);
5600	}
5601	STAGE_PP(scale_native, const uint16_t scales[]) {
5602	auto c = sk_unaligned_load<U16>(scales);
5603	r = div255( r * c );
5604	g = div255( g * c );
5605	b = div255( b * c );
5606	a = div255( a * c );
5607	}
5608
5609	STAGE_PP(lerp_native, const uint16_t scales[]) {
5610	auto c = sk_unaligned_load<U16>(scales);
5611	r = lerp(dr, r, c);
5612	g = lerp(dg, g, c);
5613	b = lerp(db, b, c);
5614	a = lerp(da, a, c);
5615	}
5616
5617	STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
5618	U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
5619	r = div255( r * c );
5620	g = div255( g * c );
5621	b = div255( b * c );
5622	a = div255( a * c );
5623	}
5624	STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
5625	U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
5626	r = lerp(dr, r, c);
5627	g = lerp(dg, g, c);
5628	b = lerp(db, b, c);
5629	a = lerp(da, a, c);
5630	}
5631
5632	// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
5633	SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
5634	return if_then_else(a < da, min(cr, min(cg,cb))
5635	, max(cr, max(cg,cb)));
5636	}
5637	STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
5638	U16 cr,cg,cb;
5639	load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
5640	U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
5641
5642	r = div255( r * cr );
5643	g = div255( g * cg );
5644	b = div255( b * cb );
5645	a = div255( a * ca );
5646	}
5647	STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
5648	U16 cr,cg,cb;
5649	load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
5650	U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
5651
5652	r = lerp(dr, r, cr);
5653	g = lerp(dg, g, cg);
5654	b = lerp(db, b, cb);
5655	a = lerp(da, a, ca);
5656	}
5657
5658	STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
5659	U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail),
5660	add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail);
5661
5662	r = min(div255(r*mul) + add, a);
5663	g = min(div255(g*mul) + add, a);
5664	b = min(div255(b*mul) + add, a);
5665	}
5666
5667
5668	// ~~~~~~ Gradient stages ~~~~~~ //
5669
5670	// Clamp x to [0,1], both sides inclusive (think, gradients).
5671	// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
5672	SI F clamp_01_(F v) { return min(max(`0`, v), `1`); }
5673
5674	STAGE_GG(clamp_x_1 , NoCtx) { x = clamp_01_(x); }
5675	STAGE_GG(repeat_x_1, NoCtx) { x = clamp_01_(x - floor_(x)); }
5676	STAGE_GG(mirror_x_1, NoCtx) {
5677	auto two = [](F x){ return x+x; };
5678	x = clamp_01_(abs_( (x-`1.0f`) - two(floor_((x-`1.0f`)*`0.5f`)) - `1.0f` ));
5679	}
5680
5681	SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }
5682
5683	STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
5684	auto w = ctx->limit_x;
5685	sk_unaligned_store(ctx->mask, cond_to_mask_16((`0` <= x) & (x < w)));
5686	}
5687	STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
5688	auto h = ctx->limit_y;
5689	sk_unaligned_store(ctx->mask, cond_to_mask_16((`0` <= y) & (y < h)));
5690	}
5691	STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
5692	auto w = ctx->limit_x;
5693	auto h = ctx->limit_y;
5694	sk_unaligned_store(ctx->mask, cond_to_mask_16((`0` <= x) & (x < w) & (`0` <= y) & (y < h)));
5695	}
5696	STAGE_GG(clamp_x_and_y, SkRasterPipeline_CoordClampCtx* ctx) {
5697	x = min(ctx->max_x, max(ctx->min_x, x));
5698	y = min(ctx->max_y, max(ctx->min_y, y));
5699	}
5700	STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
5701	auto mask = sk_unaligned_load<U16>(ctx->mask);
5702	r = r & mask;
5703	g = g & mask;
5704	b = b & mask;
5705	a = a & mask;
5706	}
5707
5708	SI void round_F_to_U16(F R, F G, F B, F A, U16* r, U16* g, U16* b, U16* a) {
5709	auto round_color = [](F x) { return cast<U16>(x * `255.0f` + `0.5f`); };
5710
5711	*r = round_color(min(max(`0`, R), `1`));
5712	*g = round_color(min(max(`0`, G), `1`));
5713	*b = round_color(min(max(`0`, B), `1`));
5714	a = round_color(A); // we assume alpha is already in [0,1].*
5715	}
5716
5717	SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
5718	U16* r, U16* g, U16* b, U16* a) {
5719
5720	F fr, fg, fb, fa, br, bg, bb, ba;
5721	#if defined(JUMPER_IS_HSW)
5722	if (c->stopCount <=`8`) {
5723	__m256i lo, hi;
5724	split(idx, &lo, &hi);
5725
5726	fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`0`]), lo),
5727	_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`0`]), hi));
5728	br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`0`]), lo),
5729	_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`0`]), hi));
5730	fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`1`]), lo),
5731	_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`1`]), hi));
5732	bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`1`]), lo),
5733	_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`1`]), hi));
5734	fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`2`]), lo),
5735	_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`2`]), hi));
5736	bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`2`]), lo),
5737	_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`2`]), hi));
5738	fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`3`]), lo),
5739	_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[`3`]), hi));
5740	ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`3`]), lo),
5741	_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[`3`]), hi));
5742	} else
5743	#endif
5744	{
5745	fr = gather<F>(c->fs[`0`], idx);
5746	fg = gather<F>(c->fs[`1`], idx);
5747	fb = gather<F>(c->fs[`2`], idx);
5748	fa = gather<F>(c->fs[`3`], idx);
5749	br = gather<F>(c->bs[`0`], idx);
5750	bg = gather<F>(c->bs[`1`], idx);
5751	bb = gather<F>(c->bs[`2`], idx);
5752	ba = gather<F>(c->bs[`3`], idx);
5753	}
5754	round_F_to_U16(mad(t, fr, br),
5755	mad(t, fg, bg),
5756	mad(t, fb, bb),
5757	mad(t, fa, ba),
5758	r,g,b,a);
5759	}
5760
5761	STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) {
5762	auto t = x;
5763	U32 idx = `0`;
5764
5765	// N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
5766	for (size_t i = `1`; i < c->stopCount; i++) {
5767	idx += if_then_else(t >= c->ts[i], U32(`1`), U32(`0`));
5768	}
5769
5770	gradient_lookup(c, idx, t, &r, &g, &b, &a);
5771	}
5772
5773	STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
5774	auto t = x;
5775	auto idx = trunc_(t * (c->stopCount-`1`));
5776	gradient_lookup(c, idx, t, &r, &g, &b, &a);
5777	}
5778
5779	STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
5780	auto t = x;
5781	round_F_to_U16(mad(t, c->f[`0`], c->b[`0`]),
5782	mad(t, c->f[`1`], c->b[`1`]),
5783	mad(t, c->f[`2`], c->b[`2`]),
5784	mad(t, c->f[`3`], c->b[`3`]),
5785	&r,&g,&b,&a);
5786	}
5787
5788	STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
5789	// Quantize sample point and transform into lerp coordinates converting them to 16.16 fixed
5790	// point number.
5791	I32 qx = cast<I32>(floor_(`65536.0f` * x + `0.5f`)) - `32768`,
5792	qy = cast<I32>(floor_(`65536.0f` * y + `0.5f`)) - `32768`;
5793
5794	// Calculate screen coordinates sx & sy by flooring qx and qy.
5795	I32 sx = qx >> `16`,
5796	sy = qy >> `16`;
5797
5798	// We are going to perform a change of parameters for qx on [0, 1) to tx on [-1, 1).
5799	// This will put tx in Q15 format for use with q_mult.
5800	// Calculate tx and ty on the interval of [-1, 1). Give {qx} and {qy} are on the interval
5801	// [0, 1), where {v} is fract(v), we can transform to tx in the following manner ty follows
5802	// the same math:
5803	// tx = 2 {qx} - 1, so*
5804	// {qx} = (tx + 1) / 2.
5805	// Calculate {qx} - 1 and {qy} - 1 where the {} operation is handled by the cast, and the - 1
5806	// is handled by the ^ 0x8000, dividing by 2 is deferred and handled in lerpX and lerpY in
5807	// order to use the full 16-bit resolution.
5808	I16 tx = cast<I16>(qx ^ `0x8000`),
5809	ty = cast<I16>(qy ^ `0x8000`);
5810
5811	// Substituting the {qx} by the equation for tx from above into the lerp equation where v is
5812	// the lerped value:
5813	// v = {qx}(R - L) + L,*
5814	// v = 1/2(tx + 1)(R - L) + L
5815	// 2 v = (tx + 1)(R - L) + 2L*
5816	// = txR - txL + R - L + 2L*
5817	// = tx(R - L) + (R + L).*
5818	// Since R and L are on [0, 255] we need them on the interval [0, 1/2] to get them into form
5819	// for Q15_mult. If L and R where in 16.16 format, this would be done by dividing by 2^9. In
5820	// code, we can multiply by 2^7 to get the value directly.
5821	// 2 v = tx(R - L) + (R + L)
5822	// 2^-9 2 * v = tx(R - L)2^-9 + (R + L)2^-9
5823	// 2^-8 v = 2^-9 * (tx(R - L) + (R + L))
5824	// v = 1/2 (tx(R - L) + (R + L))
5825	auto lerpX = [&](U16 left, U16 right) -> U16 {
5826	I16 width = (I16)(right - left) << `7`;
5827	U16 middle = (right + left) << `7`;
5828	// The constrained_add is the most subtle part of lerp. The first term is on the interval
5829	// [-1, 1), and the second term is on the interval is on the interval [0, 1) because
5830	// both terms are too high by a factor of 2 which will be handled below. (Both R and L are
5831	// on [0, 1/2), but the sum R + L is on the interval [0, 1).) Generally, the sum below
5832	// should overflow, but because we know that sum produces an output on the
5833	// interval [0, 1) we know that the extra bit that would be needed will always be 0. So
5834	// we need to be careful to treat this sum as an unsigned positive number in the divide
5835	// by 2 below. Add +1 for rounding.
5836	U16 v2 = constrained_add(scaled_mult(tx, width), middle) + `1`;
5837	// Divide by 2 to calculate v and at the same time bring the intermediate value onto the
5838	// interval [0, 1/2] to set up for the lerpY.
5839	return v2 >> `1`;
5840	};
5841
5842	const uint32_t* ptr;
5843	U32 ix = ix_and_ptr(&ptr, ctx, sx, sy);
5844	U16 leftR, leftG, leftB, leftA;
5845	from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
5846
5847	ix = ix_and_ptr(&ptr, ctx, sx+`1`, sy);
5848	U16 rightR, rightG, rightB, rightA;
5849	from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
5850
5851	U16 topR = lerpX(leftR, rightR),
5852	topG = lerpX(leftG, rightG),
5853	topB = lerpX(leftB, rightB),
5854	topA = lerpX(leftA, rightA);
5855
5856	ix = ix_and_ptr(&ptr, ctx, sx, sy+`1`);
5857	from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
5858
5859	ix = ix_and_ptr(&ptr, ctx, sx+`1`, sy+`1`);
5860	from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
5861
5862	U16 bottomR = lerpX(leftR, rightR),
5863	bottomG = lerpX(leftG, rightG),
5864	bottomB = lerpX(leftB, rightB),
5865	bottomA = lerpX(leftA, rightA);
5866
5867	// lerpY plays the same mathematical tricks as lerpX, but the final divide is by 256 resulting
5868	// in a value on [0, 255].
5869	auto lerpY = [&](U16 top, U16 bottom) -> U16 {
5870	I16 width = (I16)bottom - top;
5871	U16 middle = bottom + top;
5872	// Add + 0x80 for rounding.
5873	U16 blend = constrained_add(scaled_mult(ty, width), middle) + `0x80`;
5874
5875	return blend >> `8`;
5876	};
5877
5878	r = lerpY(topR, bottomR);
5879	g = lerpY(topG, bottomG);
5880	b = lerpY(topB, bottomB);
5881	a = lerpY(topA, bottomA);
5882	}
5883
5884	STAGE_GG(xy_to_unit_angle, NoCtx) {
5885	F xabs = abs_(x),
5886	yabs = abs_(y);
5887
5888	F slope = min(xabs, yabs)/max(xabs, yabs);
5889	F s = slope * slope;
5890
5891	// Use a 7th degree polynomial to approximate atan.
5892	// This was generated using sollya.gforge.inria.fr.
5893	// A float optimized polynomial was generated using the following command.
5894	// P1 = fpminimax((1/(2Pi))atan(x),[\|1,3,5,7\|],[\|24...\|],[2^(-40),1],relative);
5895	F phi = slope
5896	* (`0.15912117063999176025390625f` + s
5897	* (-`5.185396969318389892578125e-2f` + s
5898	* (`2.476101927459239959716796875e-2f` + s
5899	* (-`7.0547382347285747528076171875e-3f`))));
5900
5901	phi = if_then_else(xabs < yabs, `1.0f`/`4.0f` - phi, phi);
5902	phi = if_then_else(x < `0.0f` , `1.0f`/`2.0f` - phi, phi);
5903	phi = if_then_else(y < `0.0f` , `1.0f` - phi , phi);
5904	phi = if_then_else(phi != phi , `0` , phi); // Check for NaN.
5905	x = phi;
5906	}
5907	STAGE_GG(xy_to_radius, NoCtx) {
5908	x = sqrt_(xx + yy);
5909	}
5910
5911	// ~~~~~~ Compound stages ~~~~~~ //
5912
5913	STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
5914	auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
5915
5916	load_8888_(ptr, tail, &dr,&dg,&db,&da);
5917	r = r + div255( dr*inv(a) );
5918	g = g + div255( dg*inv(a) );
5919	b = b + div255( db*inv(a) );
5920	a = a + div255( da*inv(a) );
5921	store_8888_(ptr, tail, r,g,b,a);
5922	}
5923
5924	// ~~~~~~ skgpu::Swizzle stage ~~~~~~ //
5925
5926	STAGE_PP(swizzle, void* ctx) {
5927	auto ir = r, ig = g, ib = b, ia = a;
5928	U16* o[] = {&r, &g, &b, &a};
5929	char swiz[`4`];
5930	memcpy(swiz, &ctx, sizeof(swiz));
5931
5932	for (int i = `0`; i < `4`; ++i) {
5933	switch (swiz[i]) {
5934	case `'r'`: o[i] = ir; break*;
5935	case `'g'`: o[i] = ig; break*;
5936	case `'b'`: o[i] = ib; break*;
5937	case `'a'`: o[i] = ia; break*;
5938	case `'0'`: o[i] = U16(`0`); break*;
5939	case `'1'`: o[i] = U16(`255`); break*;
5940	default: break;
5941	}
5942	}
5943	}
5944
5945	#endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages
5946	} // namespace lowp
5947
5948	/ This gives us SK_OPTS::lowp::N if lowp::N has been set, or SK_OPTS::N if it hasn't. /
5949	namespace lowp { static constexpr size_t lowp_N = N; }
5950
5951	/* Allow outside code to access the Raster Pipeline pixel stride. /
5952	constexpr size_t raster_pipeline_lowp_stride() { return lowp::lowp_N; }
5953	constexpr size_t raster_pipeline_highp_stride() { return N; }
5954
5955	} // namespace SK_OPTS_NS
5956
5957	#undef SI
5958
5959	#endif//SkRasterPipeline_opts_DEFINED
5960

source code of flutter_engine/third_party/skia/src/opts/SkRasterPipeline_opts.h