00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00031
00032 #pragma once
00033
00034 #include "api_swrender.h"
00035 #include <emmintrin.h>
00036
/// \brief Static SSE2 helpers for loading, tinting, blending and storing 32-bit pixels.
///
/// Most routines work on pixels that have been widened to one 16-bit lane per
/// 8-bit channel, so one __m128i carries up to two pixels at a time.
class CL_BlitARGB8SSE
{
public:
	// --- Constants -----------------------------------------------------------

	/// \brief Fills every 16-bit lane of xmm with 0x0100.
	static void set_one(__m128i &xmm);

	/// \brief Fills every 16-bit lane of xmm with 0x007f.
	static void set_half(__m128i &xmm);

	/// \brief Builds a widened color with the same four channel values in both pixel slots.
	static void set_color(__m128i &xmm, unsigned short red, unsigned short green, unsigned short blue, unsigned short alpha);

	/// \brief Builds a widened color with separate channel values for the low (1) and high (2) pixel slot.
	static void set_color(__m128i &xmm, unsigned short r1, unsigned short g1, unsigned short b1, unsigned short a1, unsigned short r2, unsigned short g2, unsigned short b2, unsigned short a2);

	// --- Loading / copying ---------------------------------------------------

	/// \brief Copies 64 bits from src to dest without widening.
	static void copy_pixels(unsigned int *dest, const unsigned int *src);

	/// \brief Widens one pixel into the low four 16-bit lanes of xmm.
	static void load_pixel(__m128i &xmm, const unsigned int &pixel);

	/// \brief Widens the 64 bits at pixels into eight 16-bit lanes.
	static void load_pixels(__m128i &xmm, const unsigned int *pixels);

	/// \brief Widens two separately addressed pixels; p1 lands in the low four lanes.
	static void load_pixels(__m128i &xmm, const unsigned int &p1, unsigned int &p2);

	/// \brief Loads the weighted sum of four pixels using the fractions ifracx / ifracy.
	static void load_pixel_linear(__m128i &xmm, const unsigned int &p1, const unsigned int &p2, const unsigned int &p3, const unsigned int &p4, unsigned int ifracx, unsigned int ifracy);

	// --- Tinting and blending ------------------------------------------------

	/// \brief Per-lane src = (src * primcolor) >> 8.
	// NOTE(review): the MSVC variant takes primcolor by reference; presumably this
	// works around by-value __m128i parameter restrictions on that compiler - confirm.
#ifdef _MSC_VER
	static void multiply_color(__m128i &src, __m128i &primcolor);
#else

	static void multiply_color(__m128i &src, __m128i primcolor);
#endif

	/// \brief Blends src over dest weighted by src's alpha lane; src is overwritten with src * alpha.
	static void blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half);

	/// \brief Blends src over dest; only dest is scaled by (one - alpha), src is added unscaled.
	static void blend_premultiplied(__m128i &dest, __m128i &src, __m128i &one, __m128i &half);

	/// \brief Blends color into dest using each lane of src as that lane's own weight.
	static void blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color);

	// --- Storing -------------------------------------------------------------

	/// \brief Packs xmm back to bytes and writes the low 32 bits to pixel (xmm is modified).
	static void store_pixel(unsigned int &pixel, __m128i &xmm);

	/// \brief Packs xmm back to bytes and writes the low 64 bits to pixels (xmm is modified).
	static void store_pixels(unsigned int *pixels, __m128i &xmm);

	// --- Planar conversion ---------------------------------------------------

	/// \brief Splits the 32-bit pixels of src0 / src1 into one register per channel.
	static void pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha, const __m128i &src0, const __m128i &src1);

	/// \brief Reassembles per-channel registers into the pixel registers dest0 / dest1.
	static void channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha);
};
00070
00071 inline void CL_BlitARGB8SSE::copy_pixels(unsigned int *dest, const unsigned int *src)
00072 {
00073 __m128i src0;
00074 src0 = _mm_loadl_epi64((const __m128i *) src);
00075 _mm_storel_epi64((__m128i *) dest, src0);
00076 }
00077
00078 inline void CL_BlitARGB8SSE::load_pixel(__m128i &xmm, const unsigned int &pixel)
00079 {
00080 xmm = _mm_cvtsi32_si128(pixel);
00081 xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
00082 }
00083
00084 inline void CL_BlitARGB8SSE::load_pixels(__m128i &xmm, const unsigned int *pixels)
00085 {
00086 xmm = _mm_loadl_epi64((const __m128i *) pixels);
00087 xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
00088 }
00089
00090 inline void CL_BlitARGB8SSE::load_pixels(__m128i &xmm, const unsigned int &p1, unsigned int &p2)
00091 {
00092 xmm = _mm_set_epi32(0, 0, p2, p1);
00093 xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
00094 }
00095
00096 inline void CL_BlitARGB8SSE::load_pixel_linear(__m128i &xmm, const unsigned int &pixel1, const unsigned int &pixel2, const unsigned int &pixel3, const unsigned int &pixel4, unsigned int ifracx, unsigned int ifracy)
00097 {
00098 __m128i src0, src1, src2, src3;
00099 __m128i frac0, frac1, frac2, frac3;
00100 __m128i fracx, inv_fracx, fracy, inv_fracy;
00101 __m128i half = _mm_set1_epi16(64);
00102 fracx = _mm_set1_epi16(ifracx);
00103 fracy = _mm_set1_epi16(ifracy);
00104 inv_fracx = _mm_set1_epi16(0x80-ifracx);
00105 inv_fracy = _mm_set1_epi16(0x80-ifracy);
00106 frac0 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, inv_fracy), 7);
00107 frac1 = _mm_srli_epi16(_mm_mullo_epi16(fracx, inv_fracy), 7);
00108 frac2 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, fracy), 7);
00109 frac3 = _mm_srli_epi16(_mm_mullo_epi16(fracx, fracy), 7);
00110 src0 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel1), _mm_setzero_si128()), frac0);
00111 src1 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel2), _mm_setzero_si128()), frac1);
00112 src2 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel3), _mm_setzero_si128()), frac2);
00113 src3 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel4), _mm_setzero_si128()), frac3);
00114 xmm = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(src0, src1), src2), src3), half), 7);
00115 }
00116
00117 inline void CL_BlitARGB8SSE::set_one(__m128i &xmm)
00118 {
00119 xmm = _mm_set1_epi16(0x0100);
00120 }
00121
00122 inline void CL_BlitARGB8SSE::set_half(__m128i &xmm)
00123 {
00124 xmm = _mm_set1_epi16(0x007f);
00125 }
00126
00127 inline void CL_BlitARGB8SSE::set_color(__m128i &xmm, unsigned short red, unsigned short green, unsigned short blue, unsigned short alpha)
00128 {
00129 xmm = _mm_set_epi16(alpha, red, green, blue, alpha, red, green, blue);
00130 }
00131
00132 inline void CL_BlitARGB8SSE::set_color(__m128i &xmm, unsigned short r1, unsigned short g1, unsigned short b1, unsigned short a1, unsigned short r2, unsigned short g2, unsigned short b2, unsigned short a2)
00133 {
00134 xmm = _mm_set_epi16(a2, r2, g2, b2, a1, r1, g1, b1);
00135 }
00136
00137 #ifdef _MSC_VER
00138 inline void CL_BlitARGB8SSE::multiply_color(__m128i &src, __m128i &primcolor)
00139 {
00140 src = _mm_mullo_epi16(src, primcolor);
00141 src = _mm_srli_epi16(src, 8);
00142 }
00143 #else
00144
00145 inline void CL_BlitARGB8SSE::multiply_color(__m128i &src, __m128i primcolor)
00146 {
00147 src = _mm_mullo_epi16(src, primcolor);
00148 src = _mm_srli_epi16(src, 8);
00149 }
00150 #endif
00151
// Macro form of CL_BlitARGB8SSE::multiply_color: every 16-bit lane of src
// becomes (src * primcolor) >> 8. src must be a modifiable __m128i lvalue.
#define cl_blitargb8sse_multiply_color(src, primcolor) \
{ \
	src = _mm_srli_epi16(_mm_mullo_epi16(src, primcolor), 8); \
}
00157
00158 inline void CL_BlitARGB8SSE::blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
00159 {
00160 __m128i src_alpha, invsrc_alpha;
00161
00162 src_alpha = src;
00163 src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
00164 src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
00165
00166 invsrc_alpha = _mm_sub_epi16(one, src_alpha);
00167
00168 src = _mm_mullo_epi16(src, src_alpha);
00169 dest = _mm_mullo_epi16(dest, invsrc_alpha);
00170
00171 dest = _mm_add_epi16(dest, src);
00172 dest = _mm_add_epi16(dest, half);
00173 dest = _mm_srli_epi16(dest, 8);
00174 }
00175
// Macro form of CL_BlitARGB8SSE::blend_normal.
//
// Per 16-bit lane: dest = (dest * (one - alpha) + src * alpha + half) >> 8,
// where alpha is lane 3 (low pixel) / lane 7 (high pixel) of src.
// dest and src must be modifiable __m128i lvalues; src is overwritten with
// src * alpha as a side effect.
//
// The temporaries carry a macro-specific prefix so that call sites may pass
// variables named src_alpha or invsrc_alpha without them being shadowed by
// the block-local declarations.
#define cl_blitargb8sse_blend_normal(dest, src, one, half) \
{ \
	__m128i cl_blitargb8sse_bn_alpha_, cl_blitargb8sse_bn_inv_alpha_; \
	\
	cl_blitargb8sse_bn_alpha_ = src; \
	cl_blitargb8sse_bn_alpha_ = _mm_shufflelo_epi16(cl_blitargb8sse_bn_alpha_, 0xff); \
	cl_blitargb8sse_bn_alpha_ = _mm_shufflehi_epi16(cl_blitargb8sse_bn_alpha_, 0xff); \
	\
	cl_blitargb8sse_bn_inv_alpha_ = _mm_sub_epi16(one, cl_blitargb8sse_bn_alpha_); \
	\
	src = _mm_mullo_epi16(src, cl_blitargb8sse_bn_alpha_); \
	dest = _mm_mullo_epi16(dest, cl_blitargb8sse_bn_inv_alpha_); \
	\
	dest = _mm_add_epi16(dest, src); \
	dest = _mm_add_epi16(dest, half); \
	dest = _mm_srli_epi16(dest, 8); \
}
00193
00194 inline void CL_BlitARGB8SSE::blend_premultiplied(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
00195 {
00196 __m128i src_alpha, invsrc_alpha;
00197
00198 src_alpha = src;
00199 src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
00200 src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
00201
00202 invsrc_alpha = _mm_sub_epi16(one, src_alpha);
00203
00204 dest = _mm_mullo_epi16(dest, invsrc_alpha);
00205 dest = _mm_add_epi16(dest, half);
00206 dest = _mm_srli_epi16(dest, 8);
00207 dest = _mm_add_epi16(dest, src);
00208 }
00209
00210 inline void CL_BlitARGB8SSE::blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color)
00211 {
00212 __m128i invsrc;
00213 invsrc = _mm_sub_epi16(one, _mm_add_epi16(_mm_srli_epi16(src, 7), src));
00214
00215 dest = _mm_add_epi16(_mm_mullo_epi16(src, color), _mm_mullo_epi16(dest, invsrc));
00216 dest = _mm_add_epi16(dest, half);
00217 dest = _mm_srli_epi16(dest, 8);
00218 }
00219
00220 inline void CL_BlitARGB8SSE::store_pixel(unsigned int &pixel, __m128i &xmm)
00221 {
00222 xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
00223 pixel = _mm_cvtsi128_si32(xmm);
00224 }
00225
00226 inline void CL_BlitARGB8SSE::store_pixels(unsigned int *pixels, __m128i &xmm)
00227 {
00228 xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
00229 _mm_storel_epi64((__m128i *) pixels, xmm);
00230 }
00231
00232 inline void CL_BlitARGB8SSE::pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha, const __m128i &src0, const __m128i &src1)
00233 {
00234 __m128i alpha_mask = _mm_set1_epi32(0xff000000);
00235 __m128i red_mask = _mm_set1_epi32(0x00ff0000);
00236 __m128i green_mask = _mm_set1_epi32(0x0000ff00);
00237 __m128i blue_mask = _mm_set1_epi32(0x000000ff);
00238
00239 alpha = _mm_srli_si128(_mm_and_si128(alpha_mask, src0), 1);
00240 alpha = _mm_or_si128(alpha, _mm_srli_si128(_mm_and_si128(alpha_mask, src1), 3));
00241
00242 red = _mm_and_si128(red_mask, src0);
00243 red = _mm_or_si128(red, _mm_srli_si128(_mm_and_si128(red_mask, src1), 2));
00244
00245 green = _mm_slli_si128(_mm_and_si128(green_mask, src0), 1);
00246 green = _mm_or_si128(green, _mm_srli_si128(_mm_and_si128(green_mask, src1), 1));
00247
00248 blue = _mm_slli_si128(_mm_and_si128(blue_mask, src0), 2);
00249 blue = _mm_or_si128(blue, _mm_and_si128(blue_mask, src1));
00250 }
00251
00252 inline void CL_BlitARGB8SSE::channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha)
00253 {
00254 __m128i alpha_mask = _mm_set1_epi32(0xff000000);
00255 __m128i red_mask = _mm_set1_epi32(0x00ff0000);
00256 __m128i green_mask = _mm_set1_epi32(0x0000ff00);
00257 __m128i blue_mask = _mm_set1_epi32(0x000000ff);
00258
00259 dest0 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 1));
00260 dest1 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 3));
00261
00262 dest0 = _mm_or_si128(dest0, _mm_and_si128(red_mask, red));
00263 dest1 = _mm_or_si128(dest1, _mm_and_si128(red_mask, _mm_slli_si128(red, 2)));
00264
00265 dest0 = _mm_or_si128(dest0, _mm_and_si128(green_mask, _mm_srli_si128(green, 1)));
00266 dest1 = _mm_or_si128(dest1, _mm_and_si128(green_mask, _mm_slli_si128(green, 1)));
00267
00268 dest0 = _mm_or_si128(dest0, _mm_and_si128(blue_mask, _mm_srli_si128(blue, 2)));
00269 dest1 = _mm_or_si128(dest1, _mm_and_si128(blue_mask, blue));
00270 }
00271
// Fetches four texels with nearest-neighbour sampling.
//
//   out0  - __m128i lvalue receiving the four fetched 32-bit values
//   tx,ty - __m128i holding four 32-bit coordinates each; the integer texel
//           position is the coordinate arithmetically shifted right by 16
//   data  - pointer/array of 32-bit texels, indexed as data[x + y * width]
//   width - number of texels per row (scalar)
//
// data and width are parenthesized in the expansion so that expressions such
// as `base + offset` or `w + 1` keep their meaning, and the temporaries use a
// macro-specific prefix so that call-site variables named x or y are not
// shadowed.
//
// NOTE(review): _mm_set_epi32 takes its highest lane first, so the texel for
// lane 0 of tx/ty is placed in lane 3 of out0 (and so on, reversed). That
// ordering is preserved here - confirm it is what callers expect.
//
// The two variants differ only in the compiler-specific 16-byte alignment
// attribute required by _mm_store_si128.
#ifdef _MSC_VER

#define cl_blitargb8sse_sample_nearest(out0, tx, ty, data, width) \
{ \
	__declspec(align(16)) unsigned int cl_blitargb8sse_sn_x_[4], cl_blitargb8sse_sn_y_[4]; \
	_mm_store_si128((__m128i*) cl_blitargb8sse_sn_x_, _mm_srai_epi32(tx, 16)); \
	_mm_store_si128((__m128i*) cl_blitargb8sse_sn_y_, _mm_srai_epi32(ty, 16)); \
	out0 = _mm_set_epi32( \
		(data)[cl_blitargb8sse_sn_x_[0]+cl_blitargb8sse_sn_y_[0]*(width)], \
		(data)[cl_blitargb8sse_sn_x_[1]+cl_blitargb8sse_sn_y_[1]*(width)], \
		(data)[cl_blitargb8sse_sn_x_[2]+cl_blitargb8sse_sn_y_[2]*(width)], \
		(data)[cl_blitargb8sse_sn_x_[3]+cl_blitargb8sse_sn_y_[3]*(width)]); \
}

#else

#define cl_blitargb8sse_sample_nearest(out0, tx, ty, data, width) \
{ \
	__attribute__ ((aligned(16))) unsigned int cl_blitargb8sse_sn_x_[4], cl_blitargb8sse_sn_y_[4]; \
	_mm_store_si128((__m128i*) cl_blitargb8sse_sn_x_, _mm_srai_epi32(tx, 16)); \
	_mm_store_si128((__m128i*) cl_blitargb8sse_sn_y_, _mm_srai_epi32(ty, 16)); \
	out0 = _mm_set_epi32( \
		(data)[cl_blitargb8sse_sn_x_[0]+cl_blitargb8sse_sn_y_[0]*(width)], \
		(data)[cl_blitargb8sse_sn_x_[1]+cl_blitargb8sse_sn_y_[1]*(width)], \
		(data)[cl_blitargb8sse_sn_x_[2]+cl_blitargb8sse_sn_y_[2]*(width)], \
		(data)[cl_blitargb8sse_sn_x_[3]+cl_blitargb8sse_sn_y_[3]*(width)]); \
}

#endif
00293
00294
00295
00296
// Helper for cl_blitargb8sse_texture_repeat: wraps the four 32-bit lanes of
// coord into the range [0, size) by repeatedly adding size to negative lanes
// and then repeatedly subtracting size from lanes that are >= size.
// Every lane of size must be positive, otherwise a loop cannot terminate.
#define cl_blitargb8sse_wrap_coord_(coord, size) \
{ \
	for (;;) \
	{ \
		__m128i below_zero = _mm_cmplt_epi32(coord, _mm_setzero_si128()); \
		if (_mm_movemask_epi8(below_zero) == 0) \
			break; \
		coord = _mm_add_epi32(coord, _mm_and_si128(below_zero, size)); \
	} \
	for (;;) \
	{ \
		__m128i below_size = _mm_cmplt_epi32(coord, size); \
		if (_mm_movemask_epi8(below_size) == 0xffff) \
			break; \
		coord = _mm_sub_epi32(coord, _mm_andnot_si128(below_size, size)); \
	} \
}

// Wraps texture coordinates for repeat addressing.
//
//   tx, ty        - __m128i lvalues with four signed 32-bit coordinates each;
//                   updated in place
//   width, height - __m128i values the coordinates are wrapped against, compared
//                   lane by lane in the same units as tx / ty
//
// After the macro every lane of tx lies in [0, width) and every lane of ty in
// [0, height). The work is proportional to how many multiples of the size a
// coordinate is out of range.
#define cl_blitargb8sse_texture_repeat(tx, ty, width, height) \
{ \
	cl_blitargb8sse_wrap_coord_(tx, width) \
	cl_blitargb8sse_wrap_coord_(ty, height) \
}
00332