Image Component Library (ICL)
|
00001 /******************************************************************** 00002 ** Image Component Library (ICL) ** 00003 ** ** 00004 ** Copyright (C) 2006-2013 CITEC, University of Bielefeld ** 00005 ** Neuroinformatics Group ** 00006 ** Website: www.iclcv.org and ** 00007 ** http://opensource.cit-ec.de/projects/icl ** 00008 ** ** 00009 ** File : ICLUtils/src/ICLUtils/SSETypes.h ** 00010 ** Module : ICLUtils ** 00011 ** Authors: Sergius Gaulik ** 00012 ** ** 00013 ** ** 00014 ** GNU LESSER GENERAL PUBLIC LICENSE ** 00015 ** This file may be used under the terms of the GNU Lesser General ** 00016 ** Public License version 3.0 as published by the ** 00017 ** ** 00018 ** Free Software Foundation and appearing in the file LICENSE.LGPL ** 00019 ** included in the packaging of this file. Please review the ** 00020 ** following information to ensure the license requirements will ** 00021 ** be met: http://www.gnu.org/licenses/lgpl-3.0.txt ** 00022 ** ** 00023 ** The development of this software was supported by the ** 00024 ** Excellence Cluster EXC 277 Cognitive Interaction Technology. ** 00025 ** The Excellence Cluster EXC 277 is a grant of the Deutsche ** 00026 ** Forschungsgemeinschaft (DFG) in the context of the German ** 00027 ** Excellence Initiative. 
**                                                                  **
********************************************************************/

#pragma once

#ifdef ICL_USE_SSE
#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
#include "emmintrin.h"
#define ICL_HAVE_SSE2
#if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
#include "pmmintrin.h"
#define ICL_HAVE_SSE3
#if defined __SSSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
#include "tmmintrin.h"
#define ICL_HAVE_SSSE3
#endif
#endif
#endif
#endif

#include <ICLUtils/CompatMacros.h>
#include <ICLUtils/BasicTypes.h>

/* This header wraps the 128 bit SSE types and defines some basic
   functions for them. The idea is to create an easier and more intuitive
   way to work with SSE types.
   The type names are combinations of the basic type name and the number
   of these basic types in the SSE type. For example: icl8ux16 is a type
   with 16 icl8u values.
   The following examples will show some different ways to use the SSE types:

   // ++++++++++++ black and white example ++++++++++++ //
   #include <ICLUtils/SSETypes.h>
   #include <ICLUtils/Time.h>

   using namespace std;
   using namespace icl;
   using namespace icl::utils;

   #define T_SIZE 111111111

   // use a threshold to set values of an array to 0 or 255
   void createBinaryValues(icl8u *d, icl8u *dEnd, const icl8u threshold) {
     for (; d < dEnd; ++d) *d = (*d < threshold) ?
0 : 255;
   }

   // use a threshold to set values of an array to 0 or 255
   void createBinaryValuesSSE(icl8u *d, icl8u *dEnd, const icl8u threshold) {
     // if we end up with less than 16 values at the end
     // we have to convert them value by value to prevent memory access violation
     icl8u *dSSEEnd = dEnd - 15;
     // some constants
     const icl8ux16 c128 = icl8ux16((icl8s)128);
     const icl8ux16 cT = icl8ux16((icl8s)threshold - 129);

     // convert 16 values at the same time
     for (; d < dSSEEnd; d += 16) {
       // load the first 16 values
       icl8ux16 v = icl8ux16(d);
       // subtract 128 from every value in v
       v -= c128;
       // if a function for the SSE wrapper types does not exist
       // we can mix the wrapper types with the actual SSE functions
       v = _mm_cmpgt_epi8(v, cT);
       // storeu stores the values from v in d
       // (store works only with 16-byte aligned memory,
       // but storeu does not have this restriction)
       v.storeu(d);
     }

     // convert 1 value at a time
     for (; d < dEnd; ++d) {
       *d = (*d < threshold) ? 0 : 255;
     }
   }

   int main(int n, char **ppc){
     icl::utils::Time t;
     icl8u *c = new icl8u[T_SIZE];

     for (unsigned int i = 0; i < T_SIZE; ++i)
       c[i] = rand() % 256;

     t = icl::utils::Time::now();
     createBinaryValues(c, c + T_SIZE, 123);
     t.showAge("without SSE");

     for (unsigned int i = 0; i < T_SIZE; ++i)
       c[i] = rand() % 256;

     t = icl::utils::Time::now();
     createBinaryValuesSSE(c, c + T_SIZE, 123);
     t.showAge("with SSE");

     // memory allocated with new[] must be released with delete[]
     delete[] c;

     return 0;
   }
   // ------------ black and white example ------------ //

   // ++++++++++++ rgb to gray ++++++++++++ //
   #include <ICLUtils/SSETypes.h>
   #include <ICLUtils/ClippedCast.h>
   #include <ICLUtils/Time.h>

   using namespace std;
   using namespace icl;
   using namespace icl::utils;

   #define T_SIZE 111111111

   void RGBtoGray(const icl8u *r, const icl8u *g, const icl8u *b, icl8u *gr, icl8u *grEnd) {
     for (; gr != grEnd; ++gr, ++r, ++g, ++b) *gr = clipped_cast<icl32f, icl8u>((*r + *g + *b) / 3.0f + 0.5f);
   }
   void SSERGBtoGray(const icl8u *r, const icl8u *g, const icl8u *b, icl8u *gr, icl8u *grEnd) {
     icl8u *grSSEEnd = grEnd - 15;

     for (; gr < grSSEEnd; gr += 16, r += 16, g += 16, b += 16) {
       // convert to icl16s for numbers higher than 255
       icl16sx16 vR(r);
       icl16sx16 vG(g);
       icl16sx16 vB(b);

       vR += vB;
       vR += vG;

       // convert to icl32s and then to icl32f type for floating point operations
       icl32fx16 vRes = icl32sx16(vR);

       vRes *= icl32fx16(1.0f / 3.0f);
       vRes.storeu(gr);
     }

     for (; gr != grEnd; ++gr, ++r, ++g, ++b) *gr = clipped_cast<icl32f, icl8u>((*r + *g + *b) / 3.0f + 0.5f);
   }

   int main(int n, char **ppc){
     icl::utils::Time t;
     icl8u *r = new icl8u[T_SIZE];
     icl8u *g = new icl8u[T_SIZE];
icl8u *b = new icl8u[T_SIZE]; 00169 icl8u *gr = new icl8u[T_SIZE]; 00170 00171 for (unsigned int i = 0; i < T_SIZE; ++i) { 00172 r[i] = rand() % 256; 00173 g[i] = rand() % 256; 00174 b[i] = rand() % 256; 00175 } 00176 00177 t = icl::utils::Time::now(); 00178 RGBtoGray(r, g, b, gr, gr + T_SIZE); 00179 t.showAge("without SSE"); 00180 00181 t = icl::utils::Time::now(); 00182 SSERGBtoGray(r, g, b, gr, gr + T_SIZE); 00183 t.showAge("with SSE"); 00184 00185 delete r; 00186 delete g; 00187 delete b; 00188 delete gr; 00189 00190 return 0; 00191 } 00192 // ------------ rgb to gray ------------ // 00193 00194 */ 00195 00196 namespace icl{ 00197 namespace utils{ 00198 00199 #ifdef ICL_HAVE_SSE2 00200 00201 // ++ basic SSE types ++ // 00202 00203 struct Icl128 { 00204 __m128 v0; 00205 }; 00206 00207 struct Icl128i { 00208 __m128i v0; 00209 00210 inline Icl128i() { 00211 } 00212 00213 inline Icl128i(const Icl128i &v) { 00214 v0 = v.v0; 00215 } 00216 00217 inline Icl128i(const __m128i &v) { 00218 v0 = v; 00219 } 00220 00221 inline Icl128i(const __m128i *v) { 00222 v0 = _mm_loadu_si128(v); 00223 } 00224 00225 inline Icl128i& operator=(const Icl128i &v) { 00226 v0 = v.v0; 00227 return *this; 00228 } 00229 00230 inline operator __m128i () const { 00231 return v0; 00232 } 00233 00234 inline Icl128i& operator&=(const Icl128i &v) { 00235 v0 = _mm_and_si128(v0, v.v0); 00236 return *this; 00237 } 00238 00239 inline Icl128i& operator|=(const Icl128i &v) { 00240 v0 = _mm_or_si128(v0, v.v0); 00241 return *this; 00242 } 00243 00244 inline Icl128i& operator^=(const Icl128i &v) { 00245 v0 = _mm_xor_si128(v0, v.v0); 00246 return *this; 00247 } 00248 00249 inline Icl128i& andnot(const Icl128i &v) { 00250 v0 = _mm_andnot_si128(v.v0, v0); 00251 return *this; 00252 } 00253 00254 inline void store(__m128i *v) const { 00255 _mm_store_si128(v, v0); 00256 } 00257 00258 inline void storeu(__m128i *v) const { 00259 _mm_storeu_si128(v, v0); 00260 } 00261 }; 00262 00263 struct Icl128d { 00264 __m128d v0; 
00265 }; 00266 00267 struct Icl256 { 00268 __m128 v0; __m128 v1; 00269 }; 00270 00271 struct Icl256i { 00272 __m128i v0; __m128i v1; 00273 00274 inline Icl256i() { 00275 } 00276 00277 inline Icl256i(const Icl256i &v) { 00278 v0 = v.v0; 00279 v1 = v.v1; 00280 } 00281 00282 inline Icl256i(const __m128i &vl, const __m128i &vh) { 00283 v0 = vl; 00284 v1 = vh; 00285 } 00286 00287 inline Icl256i(const __m128i *v) { 00288 v0 = *v; 00289 v1 = *(v + 1); 00290 } 00291 00292 inline Icl256i& operator=(const Icl256i &v) { 00293 v0 = v.v0; 00294 v1 = v.v1; 00295 return *this; 00296 } 00297 00298 inline Icl256i& operator&=(const Icl256i &v) { 00299 v0 = _mm_and_si128(v0, v.v0); 00300 v1 = _mm_and_si128(v1, v.v1); 00301 return *this; 00302 } 00303 00304 inline Icl256i& operator|=(const Icl256i &v) { 00305 v0 = _mm_or_si128(v0, v.v0); 00306 v1 = _mm_or_si128(v1, v.v1); 00307 return *this; 00308 } 00309 00310 inline Icl256i& operator^=(const Icl256i &v) { 00311 v0 = _mm_xor_si128(v0, v.v0); 00312 v1 = _mm_xor_si128(v1, v.v1); 00313 return *this; 00314 } 00315 00316 inline Icl256i& andnot(const Icl256i &v) { 00317 v0 = _mm_andnot_si128(v.v0, v0); 00318 v1 = _mm_andnot_si128(v.v1, v1); 00319 return *this; 00320 } 00321 00322 inline void store(__m128i *v) const { 00323 _mm_store_si128(v, v0); 00324 _mm_store_si128(v + 1, v1); 00325 } 00326 00327 inline void storeu(__m128i *v) const { 00328 _mm_storeu_si128(v, v0); 00329 _mm_storeu_si128(v + 1, v1); 00330 } 00331 }; 00332 00333 struct Icl256d { 00334 __m128d v0; __m128d v1; 00335 }; 00336 00337 struct Icl512 { 00338 __m128 v0; __m128 v1; __m128 v2; __m128 v3; 00339 }; 00340 00341 struct Icl512i { 00342 __m128i v0; __m128i v1; __m128i v2; __m128i v3; 00343 00344 inline Icl512i() { 00345 } 00346 00347 inline Icl512i(const Icl512i &v) { 00348 v0 = v.v0; 00349 v1 = v.v1; 00350 v2 = v.v2; 00351 v3 = v.v3; 00352 } 00353 00354 inline Icl512i(const __m128i &vll, const __m128i &vlh, 00355 const __m128i &vhl, const __m128i &vhh) { 00356 v0 = vll; 
00357 v1 = vlh; 00358 v2 = vhl; 00359 v3 = vhh; 00360 } 00361 00362 inline Icl512i(const __m128i *v) { 00363 v0 = *v; 00364 v1 = *(v + 1); 00365 v0 = *(v + 2); 00366 v1 = *(v + 3); 00367 } 00368 00369 inline Icl512i& operator=(const Icl512i &v) { 00370 v0 = v.v0; 00371 v1 = v.v1; 00372 v2 = v.v2; 00373 v3 = v.v3; 00374 return *this; 00375 } 00376 00377 inline Icl512i& operator&=(const Icl512i &v) { 00378 v0 = _mm_and_si128(v0, v.v0); 00379 v1 = _mm_and_si128(v1, v.v1); 00380 v2 = _mm_and_si128(v2, v.v2); 00381 v3 = _mm_and_si128(v3, v.v3); 00382 return *this; 00383 } 00384 00385 inline Icl512i& operator|=(const Icl512i &v) { 00386 v0 = _mm_or_si128(v0, v.v0); 00387 v1 = _mm_or_si128(v1, v.v1); 00388 v2 = _mm_or_si128(v2, v.v2); 00389 v3 = _mm_or_si128(v3, v.v3); 00390 return *this; 00391 } 00392 00393 inline Icl512i& operator^=(const Icl512i &v) { 00394 v0 = _mm_xor_si128(v0, v.v0); 00395 v1 = _mm_xor_si128(v1, v.v1); 00396 v2 = _mm_xor_si128(v2, v.v2); 00397 v3 = _mm_xor_si128(v3, v.v3); 00398 return *this; 00399 } 00400 00401 inline Icl512i& andnot(const Icl512i &v) { 00402 v0 = _mm_andnot_si128(v.v0, v0); 00403 v1 = _mm_andnot_si128(v.v1, v1); 00404 v2 = _mm_andnot_si128(v.v2, v2); 00405 v3 = _mm_andnot_si128(v.v3, v3); 00406 return *this; 00407 } 00408 00409 inline void store(__m128i *v) const { 00410 _mm_store_si128(v, v0); 00411 _mm_store_si128(v + 1, v1); 00412 _mm_store_si128(v + 2, v2); 00413 _mm_store_si128(v + 3, v3); 00414 } 00415 00416 inline void storeu(__m128i *v) const { 00417 _mm_storeu_si128(v, v0); 00418 _mm_storeu_si128(v + 1, v1); 00419 _mm_storeu_si128(v + 2, v2); 00420 _mm_storeu_si128(v + 3, v3); 00421 } 00422 }; 00423 00424 struct Icl512d { 00425 __m128d v0; __m128d v1; __m128d v2; __m128d v3; 00426 }; 00427 00428 struct Icl1024d { 00429 __m128d v0; __m128d v1; __m128d v2; __m128d v3; 00430 __m128d v4; __m128d v5; __m128d v6; __m128d v7; 00431 }; 00432 00433 // -- basic SSE types -- // 00434 00435 00436 // ++ advanced SSE types ++ // 00437 
00439 struct icl128 : Icl128 { 00440 inline icl128() { 00441 } 00442 00443 inline icl128(const icl128 &v) { 00444 v0 = v.v0; 00445 } 00446 00447 inline icl128(const __m128 &v) { 00448 v0 = v; 00449 } 00450 00451 inline icl128(const icl32f *v) { 00452 v0 = _mm_loadu_ps(v); 00453 } 00454 00455 inline icl128(const icl32f v) { 00456 v0 = _mm_set1_ps(v); 00457 } 00458 00459 inline icl128(const Icl128 &v) { 00460 v0 = v.v0; 00461 } 00462 00463 inline icl128(const Icl128i &v) { 00464 v0 = _mm_cvtepi32_ps(v.v0); 00465 } 00466 /* 00467 inline icl128& operator=(const __m128 &v) { 00468 v0 = v; 00469 return *this; 00470 } 00471 00472 inline icl128& operator=(const icl32f *v) { 00473 v0 = _mm_loadu_ps(v); 00474 return *this; 00475 } 00476 00477 inline icl128& operator=(const icl32f v) { 00478 v0 = _mm_set1_ps(v); 00479 return *this; 00480 } 00481 */ 00482 inline icl128& operator=(const icl128 &v) { 00483 v0 = v.v0; 00484 return *this; 00485 } 00486 00487 inline icl128& operator=(const Icl128 &v) { 00488 v0 = v.v0; 00489 return *this; 00490 } 00491 /* 00492 inline icl128& operator=(const Icl128i &v) { 00493 v0 = _mm_cvtepi32_ps(v.v0); 00494 return *this; 00495 } 00496 */ 00497 inline operator __m128 () const { 00498 return v0; 00499 } 00500 00501 inline icl128& operator+=(const Icl128 &v) { 00502 v0 = _mm_add_ps(v0, v.v0); 00503 return *this; 00504 } 00505 00506 inline icl128& operator-=(const Icl128 &v) { 00507 v0 = _mm_sub_ps(v0, v.v0); 00508 return *this; 00509 } 00510 00511 inline icl128& operator*=(const Icl128 &v) { 00512 v0 = _mm_mul_ps(v0, v.v0); 00513 return *this; 00514 } 00515 00516 inline icl128& operator/=(const Icl128 &v) { 00517 v0 = _mm_div_ps(v0, v.v0); 00518 return *this; 00519 } 00520 00521 inline icl128& operator&=(const Icl128 &v) { 00522 v0 = _mm_and_ps(v0, v.v0); 00523 return *this; 00524 } 00525 00526 inline icl128& operator|=(const Icl128 &v) { 00527 v0 = _mm_or_ps(v0, v.v0); 00528 return *this; 00529 } 00530 00531 inline icl128& operator^=(const Icl128 
&v) { 00532 v0 = _mm_xor_ps(v0, v.v0); 00533 return *this; 00534 } 00535 00536 inline icl128& andnot(const Icl128 &v) { 00537 v0 = _mm_andnot_ps(v.v0, v0); 00538 return *this; 00539 } 00540 00541 inline icl128& rcp() { 00542 v0 = _mm_rcp_ps(v0); 00543 return *this; 00544 } 00545 00546 inline void store(icl32f *v) const { 00547 _mm_store_ps(v, v0); 00548 } 00549 00550 inline void storeu(icl32f *v) const { 00551 _mm_storeu_ps(v, v0); 00552 } 00553 }; 00554 00556 struct icl256 : Icl256 { 00557 inline icl256() { 00558 } 00559 00560 inline icl256(const icl256 &v) { 00561 v0 = v.v0; 00562 v1 = v.v1; 00563 } 00564 00565 inline icl256(const __m128 &vl, const __m128 &vh) { 00566 v0 = vl; 00567 v1 = vh; 00568 } 00569 00570 inline icl256(const __m128 *v) { 00571 v0 = *v; 00572 v1 = *(v + 1); 00573 } 00574 00575 inline icl256(const icl32f v) { 00576 v0 = _mm_set1_ps(v); 00577 v1 = _mm_set1_ps(v); 00578 } 00579 00580 inline icl256(const Icl256 &v) { 00581 v0 = v.v0; 00582 v1 = v.v1; 00583 } 00584 00585 inline icl256(const Icl256i &v) { 00586 v0 = _mm_cvtepi32_ps(v.v0); 00587 v1 = _mm_cvtepi32_ps(v.v1); 00588 } 00589 /* 00590 inline icl256& operator=(const __m128 *v) { 00591 v0 = *v; 00592 v1 = *(v + 1); 00593 return *this; 00594 } 00595 00596 inline icl256& operator=(const icl32f v) { 00597 v0 = _mm_set1_ps(v); 00598 v1 = _mm_set1_ps(v); 00599 return *this; 00600 } 00601 */ 00602 inline icl256& operator=(const icl256 &v) { 00603 v0 = v.v0; 00604 v1 = v.v1; 00605 return *this; 00606 } 00607 00608 inline icl256& operator=(const Icl256 &v) { 00609 v0 = v.v0; 00610 v1 = v.v1; 00611 return *this; 00612 } 00613 /* 00614 inline icl256& operator=(const Icl256i &v) { 00615 v0 = _mm_cvtepi32_ps(v.v0); 00616 v1 = _mm_cvtepi32_ps(v.v1); 00617 return *this; 00618 } 00619 */ 00620 inline icl256& operator+=(const Icl256 &v) { 00621 v0 = _mm_add_ps(v0, v.v0); 00622 v1 = _mm_add_ps(v1, v.v1); 00623 return *this; 00624 } 00625 00626 inline icl256& operator-=(const Icl256 &v) { 00627 v0 = 
_mm_sub_ps(v0, v.v0); 00628 v1 = _mm_sub_ps(v1, v.v1); 00629 return *this; 00630 } 00631 00632 inline icl256& operator*=(const Icl256 &v) { 00633 v0 = _mm_mul_ps(v0, v.v0); 00634 v1 = _mm_mul_ps(v1, v.v1); 00635 return *this; 00636 } 00637 00638 inline icl256& operator/=(const Icl256 &v) { 00639 v0 = _mm_div_ps(v0, v.v0); 00640 v1 = _mm_div_ps(v1, v.v1); 00641 return *this; 00642 } 00643 00644 inline icl256& operator&=(const Icl256 &v) { 00645 v0 = _mm_and_ps(v0, v.v0); 00646 v1 = _mm_and_ps(v1, v.v1); 00647 return *this; 00648 } 00649 00650 inline icl256& operator|=(const Icl256 &v) { 00651 v0 = _mm_or_ps(v0, v.v0); 00652 v1 = _mm_or_ps(v1, v.v1); 00653 return *this; 00654 } 00655 00656 inline icl256& operator^=(const Icl256 &v) { 00657 v0 = _mm_xor_ps(v0, v.v0); 00658 v1 = _mm_xor_ps(v1, v.v1); 00659 return *this; 00660 } 00661 00662 inline icl256& andnot(const Icl256 &v) { 00663 v0 = _mm_andnot_ps(v.v0, v0); 00664 v1 = _mm_andnot_ps(v.v1, v1); 00665 return *this; 00666 } 00667 00668 inline icl256& rcp() { 00669 v0 = _mm_rcp_ps(v0); 00670 v1 = _mm_rcp_ps(v1); 00671 return *this; 00672 } 00673 00674 inline void store(icl32f *v) const { 00675 _mm_store_ps(v, v0); 00676 _mm_store_ps(v + 4, v1); 00677 } 00678 00679 inline void storeu(icl32f *v) const { 00680 _mm_storeu_ps(v, v0); 00681 _mm_storeu_ps(v + 4, v1); 00682 } 00683 }; 00684 00686 struct icl512 : Icl512 { 00687 inline icl512() { 00688 } 00689 00690 inline icl512(const icl512 &v) { 00691 v0 = v.v0; 00692 v1 = v.v1; 00693 v2 = v.v2; 00694 v3 = v.v3; 00695 } 00696 00697 inline icl512(const __m128 &vll, const __m128 &vlh, 00698 const __m128 &vhl, const __m128 &vhh) { 00699 v0 = vll; 00700 v1 = vlh; 00701 v2 = vhl; 00702 v3 = vhh; 00703 } 00704 00705 inline icl512(const __m128 *v) { 00706 v0 = *v; 00707 v1 = *(v + 1); 00708 v2 = *(v + 2); 00709 v3 = *(v + 3); 00710 } 00711 00712 inline icl512(const icl8u *v) { 00713 const __m128i vk0 = _mm_setzero_si128(); 00714 __m128i vt0, vt1, vt2, vt3; 00715 00716 vt3 = 
_mm_loadu_si128((__m128i*)v); 00717 00718 vt1 = _mm_unpacklo_epi8(vt3, vk0); 00719 vt3 = _mm_unpackhi_epi8(vt3, vk0); 00720 00721 vt0 = _mm_unpacklo_epi16(vt1, vk0); 00722 vt1 = _mm_unpackhi_epi16(vt1, vk0); 00723 vt2 = _mm_unpacklo_epi16(vt3, vk0); 00724 vt3 = _mm_unpackhi_epi16(vt3, vk0); 00725 00726 v0 = _mm_cvtepi32_ps(vt0); 00727 v1 = _mm_cvtepi32_ps(vt1); 00728 v2 = _mm_cvtepi32_ps(vt2); 00729 v3 = _mm_cvtepi32_ps(vt3); 00730 } 00731 00732 inline icl512(const icl32f *v) { 00733 v0 = _mm_loadu_ps(v); 00734 v1 = _mm_loadu_ps(v + 4); 00735 v2 = _mm_loadu_ps(v + 8); 00736 v3 = _mm_loadu_ps(v + 12); 00737 } 00738 00739 inline icl512(const Icl512 &v) { 00740 v0 = v.v0; 00741 v1 = v.v1; 00742 v2 = v.v2; 00743 v3 = v.v3; 00744 } 00745 00746 inline icl512(const Icl512i &v) { 00747 v0 = _mm_cvtepi32_ps(v.v0); 00748 v1 = _mm_cvtepi32_ps(v.v1); 00749 v2 = _mm_cvtepi32_ps(v.v2); 00750 v3 = _mm_cvtepi32_ps(v.v3); 00751 } 00752 00753 inline icl512(const icl32f v) { 00754 v0 = _mm_set1_ps(v); 00755 v1 = _mm_set1_ps(v); 00756 v2 = _mm_set1_ps(v); 00757 v3 = _mm_set1_ps(v); 00758 } 00759 /* 00760 inline icl512& operator=(const __m128 *v) { 00761 v0 = *v; 00762 v1 = *(v + 1); 00763 v2 = *(v + 2); 00764 v3 = *(v + 3); 00765 return *this; 00766 } 00767 00768 inline icl512& operator=(const icl32f *v) { 00769 v0 = _mm_loadu_ps(v); 00770 v1 = _mm_loadu_ps(v + 4); 00771 v2 = _mm_loadu_ps(v + 8); 00772 v3 = _mm_loadu_ps(v + 12); 00773 return *this; 00774 } 00775 */ 00776 inline icl512& operator=(const icl512 &v) { 00777 v0 = v.v0; 00778 v1 = v.v1; 00779 v2 = v.v2; 00780 v3 = v.v3; 00781 return *this; 00782 } 00783 00784 inline icl512& operator=(const Icl512 &v) { 00785 v0 = v.v0; 00786 v1 = v.v1; 00787 v2 = v.v2; 00788 v3 = v.v3; 00789 return *this; 00790 } 00791 /* 00792 inline icl512& operator=(const Icl512i &v) { 00793 v0 = _mm_cvtepi32_ps(v.v0); 00794 v1 = _mm_cvtepi32_ps(v.v1); 00795 v2 = _mm_cvtepi32_ps(v.v2); 00796 v3 = _mm_cvtepi32_ps(v.v3); 00797 return *this; 00798 } 00799 
00800 inline icl512& operator=(const icl32f v) { 00801 v0 = _mm_set1_ps(v); 00802 v1 = _mm_set1_ps(v); 00803 v2 = _mm_set1_ps(v); 00804 v3 = _mm_set1_ps(v); 00805 return *this; 00806 } 00807 */ 00808 inline icl512& operator+=(const Icl512 &v) { 00809 v0 = _mm_add_ps(v0, v.v0); 00810 v1 = _mm_add_ps(v1, v.v1); 00811 v2 = _mm_add_ps(v2, v.v2); 00812 v3 = _mm_add_ps(v3, v.v3); 00813 return *this; 00814 } 00815 00816 inline icl512& operator-=(const Icl512 &v) { 00817 v0 = _mm_sub_ps(v0, v.v0); 00818 v1 = _mm_sub_ps(v1, v.v1); 00819 v2 = _mm_sub_ps(v2, v.v2); 00820 v3 = _mm_sub_ps(v3, v.v3); 00821 return *this; 00822 } 00823 00824 inline icl512& operator*=(const Icl512 &v) { 00825 v0 = _mm_mul_ps(v0, v.v0); 00826 v1 = _mm_mul_ps(v1, v.v1); 00827 v2 = _mm_mul_ps(v2, v.v2); 00828 v3 = _mm_mul_ps(v3, v.v3); 00829 return *this; 00830 } 00831 00832 inline icl512& operator/=(const Icl512 &v) { 00833 v0 = _mm_div_ps(v0, v.v0); 00834 v1 = _mm_div_ps(v1, v.v1); 00835 v2 = _mm_div_ps(v2, v.v2); 00836 v3 = _mm_div_ps(v3, v.v3); 00837 return *this; 00838 } 00839 00840 inline icl512& operator&=(const Icl512 &v) { 00841 v0 = _mm_and_ps(v0, v.v0); 00842 v1 = _mm_and_ps(v1, v.v1); 00843 v2 = _mm_and_ps(v2, v.v2); 00844 v3 = _mm_and_ps(v3, v.v3); 00845 return *this; 00846 } 00847 00848 inline icl512& operator|=(const Icl512 &v) { 00849 v0 = _mm_or_ps(v0, v.v0); 00850 v1 = _mm_or_ps(v1, v.v1); 00851 v2 = _mm_or_ps(v2, v.v2); 00852 v3 = _mm_or_ps(v3, v.v3); 00853 return *this; 00854 } 00855 00856 inline icl512& operator^=(const Icl512 &v) { 00857 v0 = _mm_xor_ps(v0, v.v0); 00858 v1 = _mm_xor_ps(v1, v.v1); 00859 v2 = _mm_xor_ps(v2, v.v2); 00860 v3 = _mm_xor_ps(v3, v.v3); 00861 return *this; 00862 } 00863 00864 inline icl512& andnot(const Icl512 &v) { 00865 v0 = _mm_andnot_ps(v.v0, v0); 00866 v1 = _mm_andnot_ps(v.v1, v1); 00867 v2 = _mm_andnot_ps(v.v2, v2); 00868 v3 = _mm_andnot_ps(v.v3, v3); 00869 return *this; 00870 } 00871 00872 inline icl512& rcp() { 00873 v0 = _mm_rcp_ps(v0); 00874 v1 
= _mm_rcp_ps(v1); 00875 v2 = _mm_rcp_ps(v2); 00876 v3 = _mm_rcp_ps(v3); 00877 return *this; 00878 } 00879 00880 inline void store(icl8u *v) const { 00881 //__m128 vMin = _mm_set1_ps(-2147483520.f); 00882 //__m128 vMax = _mm_set1_ps(2147483520.f); 00883 //v0 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v0, vMin), vMax)); 00884 //v1 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v1, vMin), vMax)); 00885 //v2 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v2, vMin), vMax)); 00886 //v3 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v3, vMin), vMax)); 00887 __m128i vt0 = _mm_cvtps_epi32(v0); 00888 __m128i vt1 = _mm_cvtps_epi32(v1); 00889 __m128i vt2 = _mm_cvtps_epi32(v2); 00890 __m128i vt3 = _mm_cvtps_epi32(v3); 00891 00892 vt0 = _mm_packus_epi16(_mm_packs_epi32(vt0, vt1), _mm_packs_epi32(vt2, vt3)); 00893 _mm_store_si128((__m128i*)v, vt0); 00894 } 00895 00896 inline void storeu(icl8u *v) const { 00897 //__m128 vMin = _mm_set1_ps(-2147483520.f); 00898 //__m128 vMax = _mm_set1_ps(2147483520.f); 00899 //v0 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v0, vMin), vMax)); 00900 //v1 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v1, vMin), vMax)); 00901 //v2 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v2, vMin), vMax)); 00902 //v3 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v3, vMin), vMax)); 00903 __m128i vt0 = _mm_cvtps_epi32(v0); 00904 __m128i vt1 = _mm_cvtps_epi32(v1); 00905 __m128i vt2 = _mm_cvtps_epi32(v2); 00906 __m128i vt3 = _mm_cvtps_epi32(v3); 00907 00908 vt0 = _mm_packus_epi16(_mm_packs_epi32(vt0, vt1), _mm_packs_epi32(vt2, vt3)); 00909 _mm_storeu_si128((__m128i*)v, vt0); 00910 } 00911 00912 inline void store(icl32f *v) const { 00913 _mm_store_ps(v, v0); 00914 _mm_store_ps(v + 4, v1); 00915 _mm_store_ps(v + 8, v2); 00916 _mm_store_ps(v + 12, v3); 00917 } 00918 00919 inline void storeu(icl32f *v) const { 00920 _mm_storeu_ps(v, v0); 00921 _mm_storeu_ps(v + 4, v1); 00922 _mm_storeu_ps(v + 8, v2); 00923 _mm_storeu_ps(v + 12, v3); 00924 } 00925 }; 00926 00928 struct icl128i8u : Icl128i { 00929 
inline icl128i8u() { 00930 } 00931 00932 inline icl128i8u(const Icl128i &v) { 00933 v0 = v.v0; 00934 } 00935 00936 inline icl128i8u(const icl128i8u &v) { 00937 v0 = v.v0; 00938 } 00939 00940 inline icl128i8u(const __m128i &v) { 00941 v0 = v; 00942 } 00943 00944 inline icl128i8u(const __m128i *v) { 00945 v0 = _mm_loadu_si128(v); 00946 } 00947 00948 inline icl128i8u(const icl8s *v) { 00949 v0 = _mm_loadu_si128((__m128i*)v); 00950 } 00951 00952 inline icl128i8u(const icl8u *v) { 00953 v0 = _mm_loadu_si128((__m128i*)v); 00954 } 00955 00956 inline icl128i8u(const icl8s v) { 00957 v0 = _mm_set1_epi8(v); 00958 } 00959 00960 inline icl128i8u(const Icl256i &v) { 00961 //v0 = _mm_packs_epi16(v.v0, v.v1); // for icl8s 00962 v0 = _mm_packus_epi16(v.v0, v.v1); 00963 } 00964 00965 inline icl128i8u(const Icl512i &v) { 00966 //v0 = _mm_packs_epi16(_mm_packs_epi32(v.v0, v.v1), _mm_packs_epi32(v.v2, v.v3)); // for icl8s 00967 v0 = _mm_packus_epi16(_mm_packs_epi32(v.v0, v.v1), _mm_packs_epi32(v.v2, v.v3)); 00968 } 00969 00970 inline operator Icl128i () const { 00971 return *this; 00972 } 00973 00974 inline icl128i8u& operator=(const icl128i8u &v) { 00975 v0 = v.v0; 00976 return *this; 00977 } 00978 00979 inline icl128i8u& operator=(const Icl128i &v) { 00980 v0 = v.v0; 00981 return *this; 00982 } 00983 00984 inline icl128i8u& operator+=(const icl128i8u &v) { 00985 v0 = _mm_add_epi8(v0, v.v0); 00986 return *this; 00987 } 00988 00989 inline icl128i8u& operator-=(const icl128i8u &v) { 00990 v0 = _mm_sub_epi8(v0, v.v0); 00991 return *this; 00992 } 00993 00994 inline void store(__m128i *v) const { 00995 _mm_store_si128(v, v0); 00996 } 00997 00998 inline void storeu(__m128i *v) const { 00999 _mm_storeu_si128(v, v0); 01000 } 01001 01002 inline void store(icl8s *v) const { 01003 _mm_store_si128((__m128i*)v, v0); 01004 } 01005 01006 inline void storeu(icl8s *v) const { 01007 _mm_storeu_si128((__m128i*)v, v0); 01008 } 01009 01010 inline void store(icl8u *v) const { 01011 
_mm_store_si128((__m128i*)v, v0); 01012 } 01013 01014 inline void storeu(icl8u *v) const { 01015 _mm_storeu_si128((__m128i*)v, v0); 01016 } 01017 }; 01018 01020 struct icl128i16s : Icl128i { 01021 inline icl128i16s() { 01022 } 01023 01024 inline icl128i16s(const icl128i16s &v) { 01025 v0 = v.v0; 01026 } 01027 01028 inline icl128i16s(const Icl128i &v) { 01029 v0 = v.v0; 01030 } 01031 01032 inline icl128i16s(const __m128i &v) { 01033 v0 = v; 01034 } 01035 01036 inline icl128i16s(const __m128i *v) { 01037 v0 = _mm_loadu_si128(v); 01038 } 01039 01040 inline icl128i16s(const icl16s *v) { 01041 v0 = _mm_loadu_si128((__m128i*)v); 01042 } 01043 01044 inline icl128i16s(const icl16u *v) { 01045 v0 = _mm_loadu_si128((__m128i*)v); 01046 } 01047 01048 inline icl128i16s(const icl16s v) { 01049 v0 = _mm_set1_epi16(v); 01050 } 01051 01052 inline icl128i16s(const Icl256i &v) { 01053 v0 = _mm_packs_epi32(v.v0, v.v1); 01054 } 01055 01056 inline operator Icl128i () const { 01057 return *this; 01058 } 01059 01060 inline icl128i16s& operator=(const icl128i16s &v) { 01061 v0 = v.v0; 01062 return *this; 01063 } 01064 01065 inline icl128i16s& operator=(const Icl128i &v) { 01066 v0 = v.v0; 01067 return *this; 01068 } 01069 01070 inline icl128i16s& operator+=(const icl128i16s &v) { 01071 v0 = _mm_add_epi16(v0, v.v0); 01072 return *this; 01073 } 01074 01075 inline icl128i16s& operator-=(const icl128i16s &v) { 01076 v0 = _mm_sub_epi16(v0, v.v0); 01077 return *this; 01078 } 01079 01080 inline void store(__m128i *v) const { 01081 _mm_store_si128(v, v0); 01082 } 01083 01084 inline void storeu(__m128i *v) const { 01085 _mm_storeu_si128(v, v0); 01086 } 01087 01088 inline void store(icl16s *v) const { 01089 _mm_store_si128((__m128i*)v, v0); 01090 } 01091 01092 inline void storeu(icl16s *v) const { 01093 _mm_storeu_si128((__m128i*)v, v0); 01094 } 01095 01096 inline void store(icl16u *v) const { 01097 _mm_store_si128((__m128i*)v, v0); 01098 } 01099 01100 inline void storeu(icl16u *v) const { 01101 
_mm_storeu_si128((__m128i*)v, v0); 01102 } 01103 }; 01104 01106 struct icl128i32s : Icl128i { 01107 inline icl128i32s() { 01108 } 01109 01110 inline icl128i32s(const icl128i32s &v) { 01111 v0 = v.v0; 01112 } 01113 01114 inline icl128i32s(const Icl128i &v) { 01115 v0 = v.v0; 01116 } 01117 01118 inline icl128i32s(const __m128i &v) { 01119 v0 = v; 01120 } 01121 01122 inline icl128i32s(const __m128i *v) { 01123 v0 = _mm_loadu_si128(v); 01124 } 01125 01126 inline icl128i32s(const icl32s *v) { 01127 v0 = _mm_loadu_si128((__m128i*)v); 01128 } 01129 01130 inline icl128i32s(const icl32u *v) { 01131 v0 = _mm_loadu_si128((__m128i*)v); 01132 } 01133 01134 inline icl128i32s(const icl32s i0, const icl32s i1, const icl32s i2, const icl32s i3) { 01135 v0 = _mm_set_epi32(i3, i2, i1, i0); 01136 } 01137 01138 inline icl128i32s(const icl32s v) { 01139 v0 = _mm_set1_epi32(v); 01140 } 01141 01142 inline icl128i32s(const Icl128 &v) { 01143 //__m128 vMin = _mm_set1_ps(-2147483520.f); 01144 //__m128 vMax = _mm_set1_ps(2147483520.f); 01145 //v0 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v0, vMin), vMax)); 01146 v0 = _mm_cvtps_epi32(v.v0); 01147 } 01148 01149 inline operator Icl128i () const { 01150 return *this; 01151 } 01152 01153 inline icl128i32s& operator=(const icl128i32s &v) { 01154 v0 = v.v0; 01155 return *this; 01156 } 01157 01158 inline icl128i32s& operator=(const Icl128i &v) { 01159 v0 = v.v0; 01160 return *this; 01161 } 01162 01163 inline icl128i32s& operator+=(const icl128i32s &v) { 01164 v0 = _mm_add_epi32(v0, v.v0); 01165 return *this; 01166 } 01167 01168 inline icl128i32s& operator-=(const icl128i32s &v) { 01169 v0 = _mm_sub_epi32(v0, v.v0); 01170 return *this; 01171 } 01172 01173 inline void store(__m128i *v) const { 01174 _mm_store_si128(v, v0); 01175 } 01176 01177 inline void storeu(__m128i *v) const { 01178 _mm_storeu_si128(v, v0); 01179 } 01180 01181 inline void store(icl32s *v) const { 01182 _mm_store_si128((__m128i*)v, v0); 01183 } 01184 01185 inline void storeu(icl32s 
*v) const { 01186 _mm_storeu_si128((__m128i*)v, v0); 01187 } 01188 01189 inline void store(icl32u *v) const { 01190 _mm_store_si128((__m128i*)v, v0); 01191 } 01192 01193 inline void storeu(icl32u *v) const { 01194 _mm_storeu_si128((__m128i*)v, v0); 01195 } 01196 }; 01197 01199 struct icl256i16s : Icl256i { 01200 inline icl256i16s() { 01201 } 01202 01203 inline icl256i16s(const icl256i16s &v) { 01204 v0 = v.v0; 01205 v1 = v.v0; 01206 } 01207 01208 inline icl256i16s(const Icl256i &v) { 01209 v0 = v.v0; 01210 v1 = v.v0; 01211 } 01212 01213 inline icl256i16s(const __m128i &vl, const __m128i &vh) { 01214 v0 = vl; 01215 v1 = vh; 01216 } 01217 01218 inline icl256i16s(const __m128i *v) { 01219 v0 = *v; 01220 v1 = *(v + 1); 01221 } 01222 01223 inline icl256i16s(const icl16s *v) { 01224 v0 = _mm_loadu_si128((__m128i*)v); 01225 v1 = _mm_loadu_si128((__m128i*)(v + 8)); 01226 } 01227 01228 inline icl256i16s(const icl16s v) { 01229 v0 = _mm_set1_epi16(v); 01230 v1 = _mm_set1_epi16(v); 01231 } 01232 01233 inline icl256i16s(const icl128i8u &v) { 01234 const __m128i vk0 = _mm_setzero_si128(); 01235 v0 = _mm_unpacklo_epi8(v.v0, vk0); 01236 v1 = _mm_unpackhi_epi8(v.v0, vk0); 01237 } 01238 01239 inline icl256i16s(const Icl512i &v) { 01240 v0 = _mm_packs_epi32(v.v0, v.v1); 01241 v1 = _mm_packs_epi32(v.v2, v.v3); 01242 } 01243 01244 inline operator Icl256i () const { 01245 return *this; 01246 } 01247 01248 inline icl256i16s& operator=(const icl256i16s &v) { 01249 v0 = v.v0; 01250 v1 = v.v1; 01251 return *this; 01252 } 01253 01254 inline icl256i16s& operator=(const Icl256i &v) { 01255 v0 = v.v0; 01256 v1 = v.v1; 01257 return *this; 01258 } 01259 01260 inline icl256i16s& operator+=(const icl256i16s &v) { 01261 v0 = _mm_add_epi16(v0, v.v0); 01262 v1 = _mm_add_epi16(v1, v.v1); 01263 return *this; 01264 } 01265 01266 inline icl256i16s& operator-=(const icl256i16s &v) { 01267 v0 = _mm_sub_epi16(v0, v.v0); 01268 v1 = _mm_sub_epi16(v1, v.v1); 01269 return *this; 01270 } 01271 01272 inline void 
store(__m128i *v) const { 01273 _mm_store_si128(v, v0); 01274 _mm_store_si128(v + 1, v1); 01275 } 01276 01277 inline void storeu(__m128i *v) const { 01278 _mm_storeu_si128(v, v0); 01279 _mm_storeu_si128(v + 1, v1); 01280 } 01281 01282 inline void store(icl16s *v) const { 01283 _mm_store_si128((__m128i*)v, v0); 01284 _mm_store_si128((__m128i*)(v + 8), v1); 01285 } 01286 01287 inline void storeu(icl16s *v) const { 01288 _mm_storeu_si128((__m128i*)v, v0); 01289 _mm_storeu_si128((__m128i*)(v + 8), v1); 01290 } 01291 01292 inline void store(icl16u *v) const { 01293 _mm_store_si128((__m128i*)v, v0); 01294 _mm_store_si128((__m128i*)(v + 8), v1); 01295 } 01296 01297 inline void storeu(icl16u *v) const { 01298 _mm_storeu_si128((__m128i*)v, v0); 01299 _mm_storeu_si128((__m128i*)(v + 8), v1); 01300 } 01301 }; 01302 01304 struct icl256i32s : Icl256i { 01305 01306 inline icl256i32s() { 01307 } 01308 01309 inline icl256i32s(const icl256i32s &v) { 01310 v0 = v.v0; 01311 v1 = v.v1; 01312 } 01313 01314 inline icl256i32s(const Icl256i &v) { 01315 v0 = v.v0; 01316 v1 = v.v1; 01317 } 01318 01319 inline icl256i32s(const __m128i &vl, const __m128i &vh) { 01320 v0 = vl; 01321 v1 = vh; 01322 } 01323 01324 inline icl256i32s(const __m128i *v) { 01325 v0 = *v; 01326 v1 = *(v + 1); 01327 } 01328 01329 inline icl256i32s(const icl32s *v) { 01330 v0 = _mm_loadu_si128((__m128i*)v); 01331 v1 = _mm_loadu_si128((__m128i*)(v + 4)); 01332 } 01333 01334 inline icl256i32s(const icl32s v) { 01335 v0 = _mm_set1_epi32(v); 01336 v1 = _mm_set1_epi32(v); 01337 } 01338 01339 inline icl256i32s(const Icl256 &v) { 01340 //__m128 vMin = _mm_set1_ps(-2147483520.f); 01341 //__m128 vMax = _mm_set1_ps(2147483520.f); 01342 //v0 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v0, vMin), vMax)); 01343 //v1 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v1, vMin), vMax)); 01344 v0 = _mm_cvtps_epi32(v.v0); 01345 v1 = _mm_cvtps_epi32(v.v1); 01346 } 01347 01348 inline icl256i32s& operator=(const icl256i32s &v) { 01349 v0 = v.v0; 01350 
v1 = v.v1; 01351 return *this; 01352 } 01353 01354 inline icl256i32s& operator=(const Icl256i &v) { 01355 v0 = v.v0; 01356 v1 = v.v1; 01357 return *this; 01358 } 01359 01360 inline icl256i32s& operator+=(const icl256i32s &v) { 01361 v0 = _mm_add_epi16(v0, v.v0); 01362 v1 = _mm_add_epi16(v1, v.v1); 01363 return *this; 01364 } 01365 01366 inline icl256i32s& operator-=(const icl256i32s &v) { 01367 v0 = _mm_sub_epi16(v0, v.v0); 01368 v1 = _mm_sub_epi16(v1, v.v1); 01369 return *this; 01370 } 01371 01372 inline void store(__m128i *v) const { 01373 _mm_store_si128(v, v0); 01374 _mm_store_si128(v + 1, v1); 01375 } 01376 01377 inline void storeu(__m128i *v) const { 01378 _mm_storeu_si128(v, v0); 01379 _mm_storeu_si128(v + 1, v1); 01380 } 01381 01382 inline void store(icl32s *v) const { 01383 _mm_store_si128((__m128i*)v, v0); 01384 _mm_store_si128((__m128i*)(v + 4), v1); 01385 } 01386 01387 inline void storeu(icl32s *v) const { 01388 _mm_storeu_si128((__m128i*)v, v0); 01389 _mm_storeu_si128((__m128i*)(v + 4), v1); 01390 } 01391 01392 inline void store(icl32u *v) const { 01393 _mm_store_si128((__m128i*)v, v0); 01394 _mm_store_si128((__m128i*)(v + 4), v1); 01395 } 01396 01397 inline void storeu(icl32u *v) const { 01398 _mm_storeu_si128((__m128i*)v, v0); 01399 _mm_storeu_si128((__m128i*)(v + 4), v1); 01400 } 01401 }; 01402 01404 struct icl512i32s : Icl512i { 01405 inline icl512i32s() { 01406 } 01407 01408 inline icl512i32s(const icl512i32s &v) { 01409 v0 = v.v0; 01410 v1 = v.v1; 01411 v2 = v.v2; 01412 v3 = v.v3; 01413 } 01414 01415 inline icl512i32s(const Icl512i &v) { 01416 v0 = v.v0; 01417 v1 = v.v1; 01418 v2 = v.v2; 01419 v3 = v.v3; 01420 } 01421 01422 inline icl512i32s(const __m128i &vll, const __m128i &vlh, 01423 const __m128i &vhl, const __m128i &vhh) { 01424 v0 = vll; 01425 v1 = vlh; 01426 v2 = vhl; 01427 v3 = vhh; 01428 } 01429 01430 inline icl512i32s(const icl32s *v) { 01431 v0 = _mm_loadu_si128((__m128i*)v); 01432 v1 = _mm_loadu_si128((__m128i*)(v + 4)); 01433 v2 = 
_mm_loadu_si128((__m128i*)(v + 8)); 01434 v3 = _mm_loadu_si128((__m128i*)(v + 12)); 01435 } 01436 01437 inline icl512i32s(const Icl256i &v) { 01438 const __m128i vk0 = _mm_setzero_si128(); 01439 v0 = _mm_unpacklo_epi16(v.v0, vk0); 01440 v1 = _mm_unpackhi_epi16(v.v0, vk0); 01441 v2 = _mm_unpacklo_epi16(v.v1, vk0); 01442 v3 = _mm_unpackhi_epi16(v.v1, vk0); 01443 } 01444 01445 inline icl512i32s(const icl32s v) { 01446 v0 = _mm_set1_epi32(v); 01447 v1 = _mm_set1_epi32(v); 01448 v2 = _mm_set1_epi32(v); 01449 v3 = _mm_set1_epi32(v); 01450 } 01451 01452 inline icl512i32s(const Icl512 &v) { 01453 //__m128 vMin = _mm_set1_ps(-2147483520.f); 01454 //__m128 vMax = _mm_set1_ps(2147483520.f); 01455 //v0 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v0, vMin), vMax)); 01456 //v1 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v1, vMin), vMax)); 01457 //v2 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v2, vMin), vMax)); 01458 //v3 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v3, vMin), vMax)); 01459 v0 = _mm_cvtps_epi32(v.v0); 01460 v1 = _mm_cvtps_epi32(v.v1); 01461 v2 = _mm_cvtps_epi32(v.v2); 01462 v3 = _mm_cvtps_epi32(v.v3); 01463 } 01464 01465 inline icl512i32s& operator=(const icl512i32s &v) { 01466 v0 = v.v0; 01467 v1 = v.v1; 01468 v2 = v.v2; 01469 v3 = v.v3; 01470 return *this; 01471 } 01472 01473 inline icl512i32s& operator=(const Icl512i &v) { 01474 v0 = v.v0; 01475 v1 = v.v1; 01476 v2 = v.v2; 01477 v3 = v.v3; 01478 return *this; 01479 } 01480 01481 inline icl512i32s& operator+=(const icl512i32s &v) { 01482 v0 = _mm_add_epi32(v0, v.v0); 01483 v1 = _mm_add_epi32(v1, v.v1); 01484 v2 = _mm_add_epi32(v2, v.v2); 01485 v3 = _mm_add_epi32(v3, v.v3); 01486 return *this; 01487 } 01488 01489 inline icl512i32s& operator-=(const icl512i32s &v) { 01490 v0 = _mm_sub_epi32(v0, v.v0); 01491 v1 = _mm_sub_epi32(v1, v.v1); 01492 v2 = _mm_sub_epi32(v2, v.v2); 01493 v3 = _mm_sub_epi32(v3, v.v3); 01494 return *this; 01495 } 01496 01497 inline void store(icl32s *v) const { 01498 _mm_store_si128((__m128i*)v, 
v0); 01499 _mm_store_si128((__m128i*)(v + 4), v1); 01500 _mm_store_si128((__m128i*)(v + 8), v1); 01501 _mm_store_si128((__m128i*)(v + 12), v1); 01502 } 01503 01504 inline void storeu(icl32s *v) const { 01505 _mm_storeu_si128((__m128i*)v, v0); 01506 _mm_storeu_si128((__m128i*)(v + 4), v1); 01507 _mm_storeu_si128((__m128i*)(v + 8), v1); 01508 _mm_storeu_si128((__m128i*)(v + 12), v1); 01509 } 01510 01511 inline void store(icl32u *v) const { 01512 _mm_store_si128((__m128i*)v, v0); 01513 _mm_store_si128((__m128i*)(v + 4), v1); 01514 _mm_store_si128((__m128i*)(v + 8), v1); 01515 _mm_store_si128((__m128i*)(v + 12), v1); 01516 } 01517 01518 inline void storeu(icl32u *v) const { 01519 _mm_storeu_si128((__m128i*)v, v0); 01520 _mm_storeu_si128((__m128i*)(v + 4), v1); 01521 _mm_storeu_si128((__m128i*)(v + 8), v1); 01522 _mm_storeu_si128((__m128i*)(v + 12), v1); 01523 } 01524 }; 01525 01527 struct icl128d : Icl128d { 01528 inline icl128d() { 01529 } 01530 01531 inline icl128d(const __m128d &v) { 01532 v0 = v; 01533 } 01534 01535 inline icl128d(const icl64f *v) { 01536 v0 = _mm_loadu_pd(v); 01537 } 01538 01539 inline icl128d(const icl64f v) { 01540 v0 = _mm_set1_pd(v); 01541 } 01542 01543 inline icl128d(const icl128d &v) { 01544 v0 = v.v0; 01545 } 01546 01547 inline icl128d& operator=(const __m128d &v) { 01548 v0 = v; 01549 return *this; 01550 } 01551 01552 inline icl128d& operator=(const icl64f *v) { 01553 v0 = _mm_loadu_pd(v); 01554 return *this; 01555 } 01556 01557 inline icl128d& operator=(const icl128d &v) { 01558 v0 = v.v0; 01559 return *this; 01560 } 01561 01562 inline operator __m128d () const { 01563 return v0; 01564 } 01565 01566 inline icl128d& operator+=(const Icl128d &v) { 01567 v0 = _mm_add_pd(v0, v.v0); 01568 return *this; 01569 } 01570 01571 inline icl128d& operator-=(const Icl128d &v) { 01572 v0 = _mm_sub_pd(v0, v.v0); 01573 return *this; 01574 } 01575 01576 inline icl128d& operator*=(const Icl128d &v) { 01577 v0 = _mm_mul_pd(v0, v.v0); 01578 return *this; 01579 
} 01580 01581 inline icl128d& operator/=(const Icl128d &v) { 01582 v0 = _mm_div_pd(v0, v.v0); 01583 return *this; 01584 } 01585 01586 inline icl128d& operator&=(const Icl128d &v) { 01587 v0 = _mm_and_pd(v0, v.v0); 01588 return *this; 01589 } 01590 01591 inline icl128d& operator|=(const Icl128d &v) { 01592 v0 = _mm_or_pd(v0, v.v0); 01593 return *this; 01594 } 01595 01596 inline icl128d& operator^=(const Icl128d &v) { 01597 v0 = _mm_xor_pd(v0, v.v0); 01598 return *this; 01599 } 01600 01601 inline icl128d& andnot(const Icl128d &v) { 01602 v0 = _mm_andnot_pd(v.v0, v0); 01603 return *this; 01604 } 01605 01606 inline void store(icl64f *v) const { 01607 _mm_store_pd(v, v0); 01608 } 01609 01610 inline void storeu(icl64f *v) const { 01611 _mm_storeu_pd(v, v0); 01612 } 01613 }; 01614 01616 struct icl256d : Icl512d { 01617 // TODO 01618 }; 01619 01620 // type for 8 icl64f values 01621 struct icl512d : Icl512d { 01622 // TODO 01623 }; 01624 01626 struct icl1024d : Icl1024d { 01627 // TODO 01628 }; 01629 01630 // -- advanced SSE types -- // 01631 01632 01633 // ++ operations on SSE types ++ // 01634 01635 // ++ arithmetic operations ++ // 01636 01637 inline icl128 operator+(const icl128 &lv, const icl128 &rv) { 01638 icl128 ret = lv; 01639 return ret += rv; 01640 } 01641 01642 inline icl128 operator-(const icl128 &lv, const icl128 &rv) { 01643 icl128 ret = lv; 01644 return ret -= rv; 01645 } 01646 01647 inline icl128 operator*(const icl128 &lv, const icl128 &rv) { 01648 icl128 ret = lv; 01649 return ret *= rv; 01650 } 01651 01652 inline icl128 operator/(const icl128 &lv, const icl128 &rv) { 01653 icl128 ret = lv; 01654 return ret /= rv; 01655 } 01656 01657 inline icl256 operator+(const icl256 &lv, const icl256 &rv) { 01658 icl256 ret = lv; 01659 return ret += rv; 01660 } 01661 01662 inline icl256 operator-(const icl256 &lv, const icl256 &rv) { 01663 icl256 ret = lv; 01664 return ret -= rv; 01665 } 01666 01667 inline icl256 operator*(const icl256 &lv, const icl256 &rv) { 01668 
icl256 ret = lv; 01669 return ret *= rv; 01670 } 01671 01672 inline icl256 operator/(const icl256 &lv, const icl256 &rv) { 01673 icl256 ret = lv; 01674 return ret /= rv; 01675 } 01676 01677 inline icl512 operator+(const icl512 &lv, const icl512 &rv) { 01678 icl512 ret = lv; 01679 return ret += rv; 01680 } 01681 01682 inline icl512 operator-(const icl512 &lv, const icl512 &rv) { 01683 icl512 ret = lv; 01684 return ret -= rv; 01685 } 01686 01687 inline icl512 operator*(const icl512 &lv, const icl512 &rv) { 01688 icl512 ret = lv; 01689 return ret *= rv; 01690 } 01691 01692 inline icl512 operator/(const icl512 &lv, const icl512 &rv) { 01693 icl512 ret = lv; 01694 return ret /= rv; 01695 } 01696 01697 // -- arithmetic operations -- // 01698 01699 // ++ comparison operations ++ // 01700 01701 inline icl128 operator==(const icl128 &lv, const icl128 &rv) { 01702 icl128 ret; 01703 ret.v0 = _mm_cmpeq_ps(lv.v0, rv.v0); 01704 return ret; 01705 } 01706 01707 inline icl128 operator!=(const icl128 &lv, const icl128 &rv) { 01708 icl128 ret; 01709 ret.v0 = _mm_cmpneq_ps(lv.v0, rv.v0); 01710 return ret; 01711 } 01712 01713 inline icl128 operator<(const icl128 &lv, const icl128 &rv) { 01714 icl128 ret; 01715 ret.v0 = _mm_cmplt_ps(lv.v0, rv.v0); 01716 return ret; 01717 } 01718 01719 inline icl128 operator>(const icl128 &lv, const icl128 &rv) { 01720 icl128 ret; 01721 ret.v0 = _mm_cmpgt_ps(lv.v0, rv.v0); 01722 return ret; 01723 } 01724 01725 inline icl128 operator<=(const icl128 &lv, const icl128 &rv) { 01726 icl128 ret; 01727 ret.v0 = _mm_cmple_ps(lv.v0, rv.v0); 01728 return ret; 01729 } 01730 01731 inline icl128 operator>=(const icl128 &lv, const icl128 &rv) { 01732 icl128 ret; 01733 ret.v0 = _mm_cmpge_ps(lv.v0, rv.v0); 01734 return ret; 01735 } 01736 01737 inline icl256 operator==(const icl256 &lv, const icl256 &rv) { 01738 icl256 ret; 01739 ret.v0 = _mm_cmpeq_ps(lv.v0, rv.v0); 01740 ret.v1 = _mm_cmpeq_ps(lv.v1, rv.v1); 01741 return ret; 01742 } 01743 01744 inline icl256 
operator!=(const icl256 &lv, const icl256 &rv) { 01745 icl256 ret; 01746 ret.v0 = _mm_cmpneq_ps(lv.v0, rv.v0); 01747 ret.v1 = _mm_cmpneq_ps(lv.v1, rv.v1); 01748 return ret; 01749 } 01750 01751 inline icl256 operator<(const icl256 &lv, const icl256 &rv) { 01752 icl256 ret; 01753 ret.v0 = _mm_cmplt_ps(lv.v0, rv.v0); 01754 ret.v1 = _mm_cmplt_ps(lv.v1, rv.v1); 01755 return ret; 01756 } 01757 01758 inline icl256 operator>(const icl256 &lv, const icl256 &rv) { 01759 icl256 ret; 01760 ret.v0 = _mm_cmpgt_ps(lv.v0, rv.v0); 01761 ret.v1 = _mm_cmpgt_ps(lv.v1, rv.v1); 01762 return ret; 01763 } 01764 01765 inline icl256 operator<=(const icl256 &lv, const icl256 &rv) { 01766 icl256 ret; 01767 ret.v0 = _mm_cmple_ps(lv.v0, rv.v0); 01768 ret.v1 = _mm_cmple_ps(lv.v1, rv.v1); 01769 return ret; 01770 } 01771 01772 inline icl256 operator>=(const icl256 &lv, const icl256 &rv) { 01773 icl256 ret; 01774 ret.v0 = _mm_cmpge_ps(lv.v0, rv.v0); 01775 ret.v1 = _mm_cmpge_ps(lv.v1, rv.v1); 01776 return ret; 01777 } 01778 01779 inline icl512 operator==(const icl512 &lv, const icl512 &rv) { 01780 icl512 ret; 01781 ret.v0 = _mm_cmpeq_ps(lv.v0, rv.v0); 01782 ret.v1 = _mm_cmpeq_ps(lv.v1, rv.v1); 01783 ret.v2 = _mm_cmpeq_ps(lv.v2, rv.v2); 01784 ret.v3 = _mm_cmpeq_ps(lv.v3, rv.v3); 01785 return ret; 01786 } 01787 01788 inline icl512 operator!=(const icl512 &lv, const icl512 &rv) { 01789 icl512 ret; 01790 ret.v0 = _mm_cmpneq_ps(lv.v0, rv.v0); 01791 ret.v1 = _mm_cmpneq_ps(lv.v1, rv.v1); 01792 ret.v2 = _mm_cmpneq_ps(lv.v2, rv.v2); 01793 ret.v3 = _mm_cmpneq_ps(lv.v3, rv.v3); 01794 return ret; 01795 } 01796 01797 inline icl512 operator<(const icl512 &lv, const icl512 &rv) { 01798 icl512 ret; 01799 ret.v0 = _mm_cmplt_ps(lv.v0, rv.v0); 01800 ret.v1 = _mm_cmplt_ps(lv.v1, rv.v1); 01801 ret.v2 = _mm_cmplt_ps(lv.v2, rv.v2); 01802 ret.v3 = _mm_cmplt_ps(lv.v3, rv.v3); 01803 return ret; 01804 } 01805 01806 inline icl512 operator>(const icl512 &lv, const icl512 &rv) { 01807 icl512 ret; 01808 ret.v0 = 
_mm_cmpgt_ps(lv.v0, rv.v0); 01809 ret.v1 = _mm_cmpgt_ps(lv.v1, rv.v1); 01810 ret.v2 = _mm_cmpgt_ps(lv.v2, rv.v2); 01811 ret.v3 = _mm_cmpgt_ps(lv.v3, rv.v3); 01812 return ret; 01813 } 01814 01815 inline icl512 operator<=(const icl512 &lv, const icl512 &rv) { 01816 icl512 ret; 01817 ret.v0 = _mm_cmple_ps(lv.v0, rv.v0); 01818 ret.v1 = _mm_cmple_ps(lv.v1, rv.v1); 01819 ret.v2 = _mm_cmple_ps(lv.v2, rv.v2); 01820 ret.v3 = _mm_cmple_ps(lv.v3, rv.v3); 01821 return ret; 01822 } 01823 01824 inline icl512 operator>=(const icl512 &lv, const icl512 &rv) { 01825 icl512 ret; 01826 ret.v0 = _mm_cmpge_ps(lv.v0, rv.v0); 01827 ret.v1 = _mm_cmpge_ps(lv.v1, rv.v1); 01828 ret.v2 = _mm_cmpge_ps(lv.v2, rv.v2); 01829 ret.v3 = _mm_cmpge_ps(lv.v3, rv.v3); 01830 return ret; 01831 } 01832 01833 // -- comparison operations -- // 01834 01835 // ++ logical operations ++ // 01836 01837 inline icl128 operator&(const icl128 &lv, const icl128 &rv) { 01838 icl128 ret; 01839 ret.v0 = _mm_and_ps(lv.v0, rv.v0); 01840 return ret; 01841 } 01842 01843 inline icl128 operator|(const icl128 &lv, const icl128 &rv) { 01844 icl128 ret; 01845 ret.v0 = _mm_or_ps(lv.v0, rv.v0); 01846 return ret; 01847 } 01848 01849 inline icl128 operator^(const icl128 &lv, const icl128 &rv) { 01850 icl128 ret; 01851 ret.v0 = _mm_xor_ps(lv.v0, rv.v0); 01852 return ret; 01853 } 01854 01855 inline icl128 andnot(const icl128 &lv, const icl128 &rv) { 01856 icl128 ret; 01857 ret.v0 = _mm_andnot_ps(rv.v0, lv.v0); 01858 return ret; 01859 } 01860 01861 inline Icl128i operator&(const Icl128i &lv, const Icl128i &rv) { 01862 Icl128i ret; 01863 ret.v0 = _mm_and_si128(lv.v0, rv.v0); 01864 return ret; 01865 } 01866 01867 inline Icl128i operator|(const Icl128i &lv, const Icl128i &rv) { 01868 Icl128i ret; 01869 ret.v0 = _mm_or_si128(lv.v0, rv.v0); 01870 return ret; 01871 } 01872 01873 inline Icl128i operator^(const Icl128i &lv, const Icl128i &rv) { 01874 Icl128i ret; 01875 ret.v0 = _mm_xor_si128(lv.v0, rv.v0); 01876 return ret; 01877 } 01878 01879 
inline Icl128i andnot(const Icl128i &lv, const Icl128i &rv) { 01880 Icl128i ret; 01881 ret.v0 = _mm_andnot_si128(rv.v0, lv.v0); 01882 return ret; 01883 } 01884 01885 inline icl256 operator&(const icl256 &lv, const icl256 &rv) { 01886 icl256 ret; 01887 ret.v0 = _mm_and_ps(lv.v0, rv.v0); 01888 ret.v1 = _mm_and_ps(lv.v1, rv.v1); 01889 return ret; 01890 } 01891 01892 inline icl256 operator|(const icl256 &lv, const icl256 &rv) { 01893 icl256 ret; 01894 ret.v0 = _mm_or_ps(lv.v0, rv.v0); 01895 ret.v1 = _mm_or_ps(lv.v1, rv.v1); 01896 return ret; 01897 } 01898 01899 inline icl256 operator^(const icl256 &lv, const icl256 &rv) { 01900 icl256 ret; 01901 ret.v0 = _mm_xor_ps(lv.v0, rv.v0); 01902 ret.v1 = _mm_xor_ps(lv.v1, rv.v1); 01903 return ret; 01904 } 01905 01906 inline icl256 andnot(const icl256 &lv, const icl256 &rv) { 01907 icl256 ret; 01908 ret.v0 = _mm_andnot_ps(rv.v0, lv.v0); 01909 ret.v1 = _mm_andnot_ps(rv.v1, lv.v1); 01910 return ret; 01911 } 01912 01913 inline Icl256i operator&(const Icl256i &lv, const Icl256i &rv) { 01914 Icl256i ret; 01915 ret.v0 = _mm_and_si128(lv.v0, rv.v0); 01916 ret.v1 = _mm_and_si128(lv.v1, rv.v1); 01917 return ret; 01918 } 01919 01920 inline Icl256i operator|(const Icl256i &lv, const Icl256i &rv) { 01921 Icl256i ret; 01922 ret.v0 = _mm_or_si128(lv.v0, rv.v0); 01923 ret.v1 = _mm_or_si128(lv.v1, rv.v1); 01924 return ret; 01925 } 01926 01927 inline Icl256i operator^(const Icl256i &lv, const Icl256i &rv) { 01928 Icl256i ret; 01929 ret.v0 = _mm_xor_si128(lv.v0, rv.v0); 01930 ret.v1 = _mm_xor_si128(lv.v1, rv.v1); 01931 return ret; 01932 } 01933 01934 inline Icl256i andnot(const Icl256i &lv, const Icl256i &rv) { 01935 Icl256i ret; 01936 ret.v0 = _mm_andnot_si128(rv.v0, lv.v0); 01937 ret.v1 = _mm_andnot_si128(rv.v1, lv.v1); 01938 return ret; 01939 } 01940 01941 inline icl512 operator&(const icl512 &lv, const icl512 &rv) { 01942 icl512 ret; 01943 ret.v0 = _mm_and_ps(lv.v0, rv.v0); 01944 ret.v1 = _mm_and_ps(lv.v1, rv.v1); 01945 ret.v2 = 
_mm_and_ps(lv.v2, rv.v2); 01946 ret.v3 = _mm_and_ps(lv.v3, rv.v3); 01947 return ret; 01948 } 01949 01950 inline icl512 operator|(const icl512 &lv, const icl512 &rv) { 01951 icl512 ret; 01952 ret.v0 = _mm_or_ps(lv.v0, rv.v0); 01953 ret.v1 = _mm_or_ps(lv.v1, rv.v1); 01954 ret.v2 = _mm_or_ps(lv.v2, rv.v2); 01955 ret.v3 = _mm_or_ps(lv.v3, rv.v3); 01956 return ret; 01957 } 01958 01959 inline icl512 operator^(const icl512 &lv, const icl512 &rv) { 01960 icl512 ret; 01961 ret.v0 = _mm_xor_ps(lv.v0, rv.v0); 01962 ret.v1 = _mm_xor_ps(lv.v1, rv.v1); 01963 ret.v2 = _mm_xor_ps(lv.v2, rv.v2); 01964 ret.v3 = _mm_xor_ps(lv.v3, rv.v3); 01965 return ret; 01966 } 01967 01968 inline icl512 andnot(const icl512 &lv, const icl512 &rv) { 01969 icl512 ret; 01970 ret.v0 = _mm_andnot_ps(rv.v0, lv.v0); 01971 ret.v1 = _mm_andnot_ps(rv.v1, lv.v1); 01972 ret.v2 = _mm_andnot_ps(rv.v2, lv.v2); 01973 ret.v3 = _mm_andnot_ps(rv.v3, lv.v3); 01974 return ret; 01975 } 01976 01977 inline Icl512i operator&(const Icl512i &lv, const Icl512i &rv) { 01978 Icl512i ret; 01979 ret.v0 = _mm_and_si128(lv.v0, rv.v0); 01980 ret.v1 = _mm_and_si128(lv.v1, rv.v1); 01981 ret.v2 = _mm_and_si128(lv.v2, rv.v2); 01982 ret.v3 = _mm_and_si128(lv.v3, rv.v3); 01983 return ret; 01984 } 01985 01986 inline Icl512i operator|(const Icl512i &lv, const Icl512i &rv) { 01987 Icl512i ret; 01988 ret.v0 = _mm_or_si128(lv.v0, rv.v0); 01989 ret.v1 = _mm_or_si128(lv.v1, rv.v1); 01990 ret.v2 = _mm_or_si128(lv.v2, rv.v2); 01991 ret.v3 = _mm_or_si128(lv.v3, rv.v3); 01992 return ret; 01993 } 01994 01995 inline Icl512i operator^(const Icl512i &lv, const Icl512i &rv) { 01996 Icl512i ret; 01997 ret.v0 = _mm_xor_si128(lv.v0, rv.v0); 01998 ret.v1 = _mm_xor_si128(lv.v1, rv.v1); 01999 ret.v2 = _mm_xor_si128(lv.v2, rv.v2); 02000 ret.v3 = _mm_xor_si128(lv.v3, rv.v3); 02001 return ret; 02002 } 02003 02004 inline Icl512i andnot(const Icl512i &lv, const Icl512i &rv) { 02005 Icl512i ret; 02006 ret.v0 = _mm_andnot_si128(rv.v0, lv.v0); 02007 ret.v1 = 
_mm_andnot_si128(rv.v1, lv.v1); 02008 ret.v2 = _mm_andnot_si128(rv.v2, lv.v2); 02009 ret.v3 = _mm_andnot_si128(rv.v3, lv.v3); 02010 return ret; 02011 } 02012 02013 // -- logical operations -- // 02014 02015 // ++ shift operetions ++ // 02016 02017 inline Icl128i& operator<<(Icl128i &v, const int i) { 02018 v.v0 = _mm_slli_epi32(v.v0, i); 02019 return v; 02020 } 02021 02022 inline Icl128i& operator>>(Icl128i &v, const int i) { 02023 v.v0 = _mm_srai_epi32(v.v0, i); 02024 return v; 02025 } 02026 02027 inline Icl256i& operator<<(Icl256i &v, const int i) { 02028 v.v0 = _mm_slli_epi32(v.v0, i); 02029 v.v1 = _mm_slli_epi32(v.v1, i); 02030 return v; 02031 } 02032 02033 inline Icl256i& operator>>(Icl256i &v, const int i) { 02034 v.v0 = _mm_srai_epi32(v.v0, i); 02035 v.v1 = _mm_srai_epi32(v.v1, i); 02036 return v; 02037 } 02038 02039 inline Icl512i& operator<<(Icl512i &v, const int i) { 02040 v.v0 = _mm_slli_epi32(v.v0, i); 02041 v.v1 = _mm_slli_epi32(v.v1, i); 02042 v.v2 = _mm_slli_epi32(v.v2, i); 02043 v.v3 = _mm_slli_epi32(v.v3, i); 02044 return v; 02045 } 02046 02047 inline Icl512i& operator>>(Icl512i &v, const int i) { 02048 v.v0 = _mm_srai_epi32(v.v0, i); 02049 v.v1 = _mm_srai_epi32(v.v1, i); 02050 v.v2 = _mm_srai_epi32(v.v2, i); 02051 v.v3 = _mm_srai_epi32(v.v3, i); 02052 return v; 02053 } 02054 02055 // -- shift operations -- // 02056 02057 // ++ min-max operations ++ // 02058 02059 inline icl128i8u min(const icl128i8u &lv, const icl128i8u &rv) { 02060 icl128i8u ret; 02061 ret.v0 = _mm_min_epu8(lv.v0, rv.v0); 02062 return ret; 02063 } 02064 02065 inline icl128i8u max(const icl128i8u &lv, const icl128i8u &rv) { 02066 icl128i8u ret; 02067 ret.v0 = _mm_max_epu8(lv.v0, rv.v0); 02068 return ret; 02069 } 02070 02071 inline icl128i16s min(const icl128i16s &lv, const icl128i16s &rv) { 02072 icl128i16s ret; 02073 ret.v0 = _mm_min_epi16(lv.v0, rv.v0); 02074 return ret; 02075 } 02076 02077 inline icl128i16s max(const icl128i16s &lv, const icl128i16s &rv) { 02078 icl128i16s ret; 
02079 ret.v0 = _mm_max_epi16(lv.v0, rv.v0); 02080 return ret; 02081 } 02082 02083 inline icl256i16s min(const icl256i16s &lv, const icl256i16s &rv) { 02084 icl256i16s ret; 02085 ret.v0 = _mm_min_epi16(lv.v0, rv.v0); 02086 ret.v1 = _mm_min_epi16(lv.v1, rv.v1); 02087 return ret; 02088 } 02089 02090 inline icl256i16s max(const icl256i16s &lv, const icl256i16s &rv) { 02091 icl256i16s ret; 02092 ret.v0 = _mm_max_epi16(lv.v0, rv.v0); 02093 ret.v1 = _mm_max_epi16(lv.v1, rv.v1); 02094 return ret; 02095 } 02096 02097 inline icl128 min(const icl128 &lv, const icl128 &rv) { 02098 icl128 ret; 02099 ret.v0 = _mm_min_ps(lv.v0, rv.v0); 02100 return ret; 02101 } 02102 02103 inline icl128 max(const icl128 &lv, const icl128 &rv) { 02104 icl128 ret; 02105 ret.v0 = _mm_max_ps(lv.v0, rv.v0); 02106 return ret; 02107 } 02108 02109 inline icl256 min(const icl256 &lv, const icl256 &rv) { 02110 icl256 ret; 02111 ret.v0 = _mm_min_ps(lv.v0, rv.v0); 02112 ret.v1 = _mm_min_ps(lv.v1, rv.v1); 02113 return ret; 02114 } 02115 02116 inline icl256 max(const icl256 &lv, const icl256 &rv) { 02117 icl256 ret; 02118 ret.v0 = _mm_max_ps(lv.v0, rv.v0); 02119 ret.v1 = _mm_max_ps(lv.v1, rv.v1); 02120 return ret; 02121 } 02122 02123 inline icl512 min(const icl512 &lv, const icl512 &rv) { 02124 icl512 ret; 02125 ret.v0 = _mm_min_ps(lv.v0, rv.v0); 02126 ret.v1 = _mm_min_ps(lv.v1, rv.v1); 02127 ret.v2 = _mm_min_ps(lv.v2, rv.v2); 02128 ret.v3 = _mm_min_ps(lv.v3, rv.v3); 02129 return ret; 02130 } 02131 02132 inline icl512 max(const icl512 &lv, const icl512 &rv) { 02133 icl512 ret; 02134 ret.v0 = _mm_max_ps(lv.v0, rv.v0); 02135 ret.v1 = _mm_max_ps(lv.v1, rv.v1); 02136 ret.v2 = _mm_max_ps(lv.v2, rv.v2); 02137 ret.v3 = _mm_max_ps(lv.v3, rv.v3); 02138 return ret; 02139 } 02140 02141 // -- min-max operations -- // 02142 02143 02144 // ++ absosulte values ++ // 02145 02146 #ifdef ICL_HAVE_SSE3 02147 inline icl128i8u abs(const icl128i8u &v) { 02148 icl128i8u ret; 02149 ret.v0 = _mm_abs_epi8(v.v0); 02150 return ret; 02151 
} 02152 02153 inline icl128i16s abs(const icl128i16s &v) { 02154 icl128i16s ret; 02155 ret.v0 = _mm_abs_epi16(v.v0); 02156 return ret; 02157 } 02158 02159 inline icl128i32s abs(const icl128i32s &v) { 02160 icl128i32s ret; 02161 ret.v0 = _mm_abs_epi32(v.v0); 02162 return ret; 02163 } 02164 02165 inline icl256i16s abs(const icl256i16s &v) { 02166 icl256i16s ret; 02167 ret.v0 = _mm_abs_epi16(v.v0); 02168 ret.v1 = _mm_abs_epi16(v.v1); 02169 return ret; 02170 } 02171 02172 inline icl256i32s abs(const icl256i32s &v) { 02173 icl256i32s ret; 02174 ret.v0 = _mm_abs_epi32(v.v0); 02175 ret.v1 = _mm_abs_epi32(v.v1); 02176 return ret; 02177 } 02178 02179 inline icl512i32s abs(const icl512i32s &v) { 02180 icl512i32s ret; 02181 ret.v0 = _mm_abs_epi32(v.v0); 02182 ret.v1 = _mm_abs_epi32(v.v1); 02183 ret.v2 = _mm_abs_epi32(v.v2); 02184 ret.v3 = _mm_abs_epi32(v.v3); 02185 return ret; 02186 } 02187 #else 02188 // TODO: without SSE3 02189 #endif 02190 02191 inline icl128 abs(const icl128 &v) { 02192 icl128 ret; 02193 ret.v0 = _mm_andnot_ps(icl128(-0.0f), v.v0); 02194 return ret; 02195 } 02196 02197 inline icl256 abs(const icl256 &v) { 02198 icl128 tmp(-0.0f); 02199 icl256 ret; 02200 ret.v0 = _mm_andnot_ps(tmp.v0, v.v0); 02201 ret.v1 = _mm_andnot_ps(tmp.v0, v.v1); 02202 return ret; 02203 } 02204 02205 inline icl512 abs(const icl512 &v) { 02206 icl128 tmp(-0.0f); 02207 icl512 ret; 02208 ret.v0 = _mm_andnot_ps(tmp.v0, v.v0); 02209 ret.v1 = _mm_andnot_ps(tmp.v0, v.v1); 02210 ret.v2 = _mm_andnot_ps(tmp.v0, v.v2); 02211 ret.v3 = _mm_andnot_ps(tmp.v0, v.v3); 02212 return ret; 02213 } 02214 02215 // -- absosulte values -- // 02216 02217 02218 // ++ squared root ++ // 02219 02220 inline icl128 sqrt(const icl128 &v) { 02221 icl128 r; 02222 r.v0 = _mm_sqrt_ps(v.v0); 02223 return r; 02224 } 02225 02226 inline icl256 sqrt(const icl256 &v) { 02227 icl256 r; 02228 r.v0 = _mm_sqrt_ps(v.v0); 02229 r.v1 = _mm_sqrt_ps(v.v1); 02230 return r; 02231 } 02232 02233 inline icl512 sqrt(const icl512 &v) { 02234 
icl512 r; 02235 r.v0 = _mm_sqrt_ps(v.v0); 02236 r.v1 = _mm_sqrt_ps(v.v1); 02237 r.v2 = _mm_sqrt_ps(v.v2); 02238 r.v3 = _mm_sqrt_ps(v.v3); 02239 return r; 02240 } 02241 02242 inline icl128d sqrt(const icl128d &v) { 02243 icl128d r; 02244 r.v0 = _mm_sqrt_pd(v.v0); 02245 return r; 02246 } 02247 02248 inline icl256d sqrt(const icl256d &v) { 02249 icl256d r; 02250 r.v0 = _mm_sqrt_pd(v.v0); 02251 r.v1 = _mm_sqrt_pd(v.v1); 02252 return r; 02253 } 02254 02255 inline icl512d sqrt(const icl512d &v) { 02256 icl512d r; 02257 r.v0 = _mm_sqrt_pd(v.v0); 02258 r.v1 = _mm_sqrt_pd(v.v1); 02259 r.v2 = _mm_sqrt_pd(v.v2); 02260 r.v3 = _mm_sqrt_pd(v.v3); 02261 return r; 02262 } 02263 02264 inline icl1024d sqrt(const icl1024d &v) { 02265 icl1024d r; 02266 r.v0 = _mm_sqrt_pd(v.v0); 02267 r.v1 = _mm_sqrt_pd(v.v1); 02268 r.v2 = _mm_sqrt_pd(v.v2); 02269 r.v3 = _mm_sqrt_pd(v.v3); 02270 r.v4 = _mm_sqrt_pd(v.v4); 02271 r.v5 = _mm_sqrt_pd(v.v5); 02272 r.v6 = _mm_sqrt_pd(v.v6); 02273 r.v7 = _mm_sqrt_pd(v.v7); 02274 return r; 02275 } 02276 02277 // -- squared root -- // 02278 02279 02280 // ++ cube root ++ // 02281 02282 inline icl128 cbrt(const icl128 &v) { 02283 icl128i32s tmp = icl128i32s(_mm_castps_si128(v)); 02284 tmp = tmp / icl128i32s(3) + icl128i32s(709921077); 02285 icl128 a = icl128(_mm_castsi128_ps(tmp)); 02286 icl128 a3 = a * a * a; 02287 return a * (a3 + v + v) * (a3 + a3 + v).rcp(); 02288 } 02289 02290 inline icl256 cbrt(const icl256 &v) { 02291 __m128i t0 = _mm_castps_si128(v.v0); 02292 __m128i t1 = _mm_castps_si128(v.v1); 02293 icl256i32s tmp = icl256i32s(t0, t1); 02294 tmp = tmp / icl256i32s(3) + icl256i32s(709921077); 02295 icl256 a = icl256(_mm_castsi128_ps(tmp.v0), 02296 _mm_castsi128_ps(tmp.v1)); 02297 icl256 a3 = a * a * a; 02298 return a * (a3 + v + v) * (a3 + a3 + v).rcp(); 02299 } 02300 02301 inline icl512 cbrt(const icl512 &v) { 02302 __m128i t0 = _mm_castps_si128(v.v0); 02303 __m128i t1 = _mm_castps_si128(v.v1); 02304 __m128i t2 = _mm_castps_si128(v.v2); 02305 __m128i t3 
= _mm_castps_si128(v.v3); 02306 icl512i32s tmp = icl512i32s(t0, t1, t2, t3); 02307 tmp = tmp / icl512i32s(3) + icl512i32s(709921077); 02308 icl512 a = icl512(_mm_castsi128_ps(tmp.v0), 02309 _mm_castsi128_ps(tmp.v1), 02310 _mm_castsi128_ps(tmp.v2), 02311 _mm_castsi128_ps(tmp.v3)); 02312 icl512 a3 = a * a * a; 02313 return a * (a3 + v + v) * (a3 + a3 + v).rcp(); 02314 } 02315 02316 // -- cube root -- // 02317 02318 // -- operations on SSE types -- // 02319 02320 typedef icl128 icl32fx4; 02321 typedef icl256 icl32fx8; 02322 typedef icl512 icl32fx16; 02323 typedef icl128i8u icl8ux16; 02324 typedef icl128i16s icl16sx8; 02325 typedef icl128i32s icl32sx4; 02326 typedef icl256i16s icl16sx16; 02327 typedef icl256i32s icl32sx8; 02328 typedef icl512i32s icl32sx16; 02329 typedef icl128d icl64fx2; 02330 typedef icl256d icl64fx4; 02331 typedef icl512d icl64fx8; 02332 typedef icl1024d icl64fx16; 02333 02334 #endif 02335 02336 } // namespace utils 02337 }