Image Component Library (ICL): /home/jenkins/workspace/icl-manual-trunk/icl-manual/ICLUtils/src/ICLUtils/SSETypes.h Source File

Go to the documentation of this file.
00001 /********************************************************************
00002 **                Image Component Library (ICL)                    **
00003 **                                                                 **
00004 ** Copyright (C) 2006-2013 CITEC, University of Bielefeld          **
00005 **                         Neuroinformatics Group                  **
00006 ** Website: www.iclcv.org and                                      **
00007 **          http://opensource.cit-ec.de/projects/icl               **
00008 **                                                                 **
00009 ** File   : ICLUtils/src/ICLUtils/SSETypes.h                       **
00010 ** Module : ICLUtils                                               **
00011 ** Authors: Sergius Gaulik                                         **
00012 **                                                                 **
00013 **                                                                 **
00014 ** GNU LESSER GENERAL PUBLIC LICENSE                               **
00015 ** This file may be used under the terms of the GNU Lesser General **
00016 ** Public License version 3.0 as published by the                  **
00017 **                                                                 **
00018 ** Free Software Foundation and appearing in the file LICENSE.LGPL **
00019 ** included in the packaging of this file.  Please review the      **
00020 ** following information to ensure the license requirements will   **
00021 ** be met: http://www.gnu.org/licenses/lgpl-3.0.txt                **
00022 **                                                                 **
00023 ** The development of this software was supported by the           **
00024 ** Excellence Cluster EXC 277 Cognitive Interaction Technology.    **
00025 ** The Excellence Cluster EXC 277 is a grant of the Deutsche       **
00026 ** Forschungsgemeinschaft (DFG) in the context of the German       **
00027 ** Excellence Initiative.                                          **
00028 **                                                                 **
00029 ********************************************************************/
00030 
00031 #pragma once
00032 
00033 #ifdef ICL_USE_SSE
00034   #if defined __SSE2__ || defined _M_X64  || (defined _M_IX86_FP && _M_IX86_FP >= 2)
00035     #include "emmintrin.h"
00036     #define ICL_HAVE_SSE2
00037     #if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
00038       #include "pmmintrin.h"
00039       #define ICL_HAVE_SSE3
00040       #if defined __SSSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
00041         #include "tmmintrin.h"
00042         #define ICL_HAVE_SSSE3
00043       #endif
00044     #endif
00045   #endif
00046 #endif
00047 
00048 #include <ICLUtils/CompatMacros.h>
00049 #include <ICLUtils/BasicTypes.h>
00050 
00051 /* This header wraps the 128 bit SSE types and defines some basic
00052    functions for them. The idea is to create an easier and more intuitive
00053    way to work with SSE types.
00054    The type names are combinations of the basic type name and the number
00055    of these basic types in the SSE type. For example: icl8ux16 is a type
00056    with 16 icl8u values.
00057    The following examples will show some different ways to use the SSE types:
00058 
00059    // ++++++++++++ black and white example ++++++++++++ //
00060     #include <ICLUtils/SSETypes.h>
00061     #include <ICLUtils/Time.h>
00062 
00063     using namespace std;
00064     using namespace icl;
00065     using namespace icl::utils;
00066 
00067     #define T_SIZE 111111111
00068 
00069     // use a threshold to set values of an array to 0 or 255
00070     void createBinaryValues(icl8u *d, icl8u *dEnd, const icl8u threshold) {
00071       for (; d < dEnd; ++d) *d = (*d < threshold) ? 0 : 255;
00072     }
00073 
00074     // use a threshold to set values of an array to 0 or 255
00075     void createBinaryValuesSSE(icl8u *d, icl8u *dEnd, const icl8u threshold) {
00076       // if we end up with less than 16 values at the end
00077       // we have to convert them value by value to prevent memory access violation
00078       icl8u *dSSEEnd = dEnd - 15;
00079       // some constants
00080       const icl8ux16 c128 = icl8ux16((icl8s)128);
00081       const icl8ux16 cT = icl8ux16((icl8s)threshold - 129);
00082 
00083       // convert 16 values at the same time
00084       for (; d < dSSEEnd; d += 16) {
00085         // load the first 16 values
00086         icl8ux16 v = icl8ux16(d);
00087         // subtract 128 from every value in v
00088         v -= c128;
00089         // if a function for the SSE wrapper types does not exist
00090         // we can mix the wrapper types with the actual SSE functions
00091         v = _mm_cmpgt_epi8(v, cT);
00092         // storeu stores the values from v in d
00093         // (store works only with 16 aligned memory,
00094         // but storeu does not have this restriction)
00095         v.storeu(d);
00096       }
00097 
00098       // convert 1 value at a time
00099       for (; d < dEnd; ++d) {
00100         *d = (*d < threshold) ? 0 : 255;
00101       }
00102     }
00103 
00104     int main(int n, char **ppc){
00105       icl::utils::Time t;
00106       icl8u *c = new icl8u[T_SIZE];
00107 
00108       for (unsigned int i = 0; i < T_SIZE; ++i)
00109         c[i] = rand() % 256;
00110 
00111       t = icl::utils::Time::now();
00112       createBinaryValues(c, c + T_SIZE, 123);
00113       t.showAge("without SSE");
00114 
00115       for (unsigned int i = 0; i < T_SIZE; ++i)
00116         c[i] = rand() % 256;
00117 
00118       t = icl::utils::Time::now();
00119       createBinaryValuesSSE(c, c + T_SIZE, 123);
00120       t.showAge("with SSE");
00121 
00122       delete[] c;
00123 
00124       return 0;
00125     }
00126    // ------------ black and white example ------------ //
00127 
00128    // ++++++++++++ rgb to gray ++++++++++++ //
00129    #include <ICLUtils/SSETypes.h>
00130    #include <ICLUtils/ClippedCast.h>
00131    #include <ICLUtils/Time.h>
00132 
00133    using namespace std;
00134    using namespace icl;
00135    using namespace icl::utils;
00136 
00137    #define T_SIZE 111111111
00138 
00139    void RGBtoGray(const icl8u *r, const icl8u *g, const icl8u *b, icl8u *gr, icl8u *grEnd) {
00140    for (; gr != grEnd; ++gr, ++r, ++g, ++b) *gr = clipped_cast<icl32f, icl8u>((*r + *g + *b) / 3.0f + 0.5f);
00141    }
00142    void SSERGBtoGray(const icl8u *r, const icl8u *g, const icl8u *b, icl8u *gr, icl8u *grEnd) {
00143    icl8u *grSSEEnd = grEnd - 15;
00144 
00145    for (; gr < grSSEEnd; gr += 16, r += 16, g += 16, b += 16) {
00146    // convert to icl16s for number higher than 255
00147    icl16sx16 vR(r);
00148    icl16sx16 vG(g);
00149    icl16sx16 vB(b);
00150 
00151    vR += vB;
00152    vR += vG;
00153 
00154    // convert to icl32s and then to icl32f type for floating point operations
00155    icl32fx16 vRes = icl32sx16(vR);
00156 
00157    vRes *= icl32fx16(1.0f / 3.0f);
00158    vRes.storeu(gr);
00159    }
00160 
00161    for (; gr != grEnd; ++gr, ++r, ++g, ++b) *gr = clipped_cast<icl32f, icl8u>((*r + *g + *b) / 3.0f + 0.5f);
00162    }
00163 
00164    int main(int n, char **ppc){
00165      icl::utils::Time t;
00166      icl8u *r  = new icl8u[T_SIZE];
00167      icl8u *g  = new icl8u[T_SIZE];
00168      icl8u *b  = new icl8u[T_SIZE];
00169      icl8u *gr = new icl8u[T_SIZE];
00170 
00171      for (unsigned int i = 0; i < T_SIZE; ++i) {
00172      r[i] = rand() % 256;
00173      g[i] = rand() % 256;
00174      b[i] = rand() % 256;
00175      }
00176 
00177      t = icl::utils::Time::now();
00178      RGBtoGray(r, g, b, gr, gr + T_SIZE);
00179      t.showAge("without SSE");
00180 
00181      t = icl::utils::Time::now();
00182      SSERGBtoGray(r, g, b, gr, gr + T_SIZE);
00183      t.showAge("with SSE");
00184 
00185      delete[] r;
00186      delete[] g;
00187      delete[] b;
00188      delete[] gr;
00189 
00190      return 0;
00191    }
00192    // ------------ rgb to gray ------------ //
00193 
00194 */
00195 
00196 namespace icl{
00197   namespace utils{
00198 
00199     #ifdef ICL_HAVE_SSE2
00200 
00201       // ++ basic SSE types ++ //
00202 
/// Plain storage for one 128-bit SSE float register (__m128).
struct Icl128
{
  __m128 v0;  ///< the wrapped register
};
00206 
/// Thin wrapper around a single 128-bit integer SSE register (__m128i).
struct Icl128i {
  __m128i v0;  ///< the wrapped register

  /// Default constructor; v0 is left uninitialized.
  Icl128i() {}

  /// Copies the register held by another wrapper.
  Icl128i(const Icl128i &v) : v0(v.v0) {}

  /// Wraps an existing register value.
  Icl128i(const __m128i &v) : v0(v) {}

  /// Loads 128 bits from v with the unaligned load intrinsic.
  Icl128i(const __m128i *v) : v0(_mm_loadu_si128(v)) {}

  /// Copy assignment.
  Icl128i &operator=(const Icl128i &v) {
    v0 = v.v0;
    return *this;
  }

  /// Implicit access to the raw register so intrinsics can be applied directly.
  operator __m128i() const { return v0; }

  /// Bitwise AND with v.
  Icl128i &operator&=(const Icl128i &v) {
    v0 = _mm_and_si128(v0, v.v0);
    return *this;
  }

  /// Bitwise OR with v.
  Icl128i &operator|=(const Icl128i &v) {
    v0 = _mm_or_si128(v0, v.v0);
    return *this;
  }

  /// Bitwise XOR with v.
  Icl128i &operator^=(const Icl128i &v) {
    v0 = _mm_xor_si128(v0, v.v0);
    return *this;
  }

  /// Clears every bit of this value that is set in v (this = ~v & this).
  Icl128i &andnot(const Icl128i &v) {
    v0 = _mm_andnot_si128(v.v0, v0);
    return *this;
  }

  /// Writes the register to v with the aligned store intrinsic.
  void store(__m128i *v) const { _mm_store_si128(v, v0); }

  /// Writes the register to v with the unaligned store intrinsic.
  void storeu(__m128i *v) const { _mm_storeu_si128(v, v0); }
};
00262 
/// Plain storage for one 128-bit SSE double register (__m128d).
struct Icl128d
{
  __m128d v0;  ///< the wrapped register
};
00266 
/// Plain storage for two 128-bit SSE float registers (256 bits in total).
struct Icl256
{
  __m128 v0;  ///< first register
  __m128 v1;  ///< second register
};
00270 
/// Wrapper around two 128-bit integer SSE registers (256 bits in total).
struct Icl256i {
  __m128i v0;  ///< first register
  __m128i v1;  ///< second register

  /// Default constructor; both registers are left uninitialized.
  Icl256i() {}

  /// Copies both registers of another wrapper.
  Icl256i(const Icl256i &v) : v0(v.v0), v1(v.v1) {}

  /// Builds the value from two registers (vl -> v0, vh -> v1).
  Icl256i(const __m128i &vl, const __m128i &vh) : v0(vl), v1(vh) {}

  /// Reads two consecutive __m128i values starting at v by plain dereference.
  Icl256i(const __m128i *v) : v0(v[0]), v1(v[1]) {}

  /// Copy assignment.
  Icl256i &operator=(const Icl256i &v) {
    v0 = v.v0;
    v1 = v.v1;
    return *this;
  }

  /// Bitwise AND with v, applied per register.
  Icl256i &operator&=(const Icl256i &v) {
    v0 = _mm_and_si128(v0, v.v0);
    v1 = _mm_and_si128(v1, v.v1);
    return *this;
  }

  /// Bitwise OR with v, applied per register.
  Icl256i &operator|=(const Icl256i &v) {
    v0 = _mm_or_si128(v0, v.v0);
    v1 = _mm_or_si128(v1, v.v1);
    return *this;
  }

  /// Bitwise XOR with v, applied per register.
  Icl256i &operator^=(const Icl256i &v) {
    v0 = _mm_xor_si128(v0, v.v0);
    v1 = _mm_xor_si128(v1, v.v1);
    return *this;
  }

  /// Clears every bit of this value that is set in v (this = ~v & this).
  Icl256i &andnot(const Icl256i &v) {
    v0 = _mm_andnot_si128(v.v0, v0);
    v1 = _mm_andnot_si128(v.v1, v1);
    return *this;
  }

  /// Writes both registers to v and v + 1 with the aligned store intrinsic.
  void store(__m128i *v) const {
    _mm_store_si128(v, v0);
    _mm_store_si128(v + 1, v1);
  }

  /// Writes both registers to v and v + 1 with the unaligned store intrinsic.
  void storeu(__m128i *v) const {
    _mm_storeu_si128(v, v0);
    _mm_storeu_si128(v + 1, v1);
  }
};
00332 
/// Plain storage for two 128-bit SSE double registers (256 bits in total).
struct Icl256d
{
  __m128d v0;  ///< first register
  __m128d v1;  ///< second register
};
00336 
/// Plain storage for four 128-bit SSE float registers (512 bits in total).
struct Icl512
{
  __m128 v0;  ///< first register
  __m128 v1;  ///< second register
  __m128 v2;  ///< third register
  __m128 v3;  ///< fourth register
};
00340 
/// Wrapper around four 128-bit integer SSE registers (512 bits in total).
struct Icl512i {
  __m128i v0;  ///< first register
  __m128i v1;  ///< second register
  __m128i v2;  ///< third register
  __m128i v3;  ///< fourth register

  /// Default constructor; all registers are left uninitialized.
  inline Icl512i() {
  }

  /// Copies all four registers of another wrapper.
  inline Icl512i(const Icl512i &v) {
    v0 = v.v0;
    v1 = v.v1;
    v2 = v.v2;
    v3 = v.v3;
  }

  /// Builds the value from four registers (vll -> v0, vlh -> v1, vhl -> v2, vhh -> v3).
  inline Icl512i(const __m128i &vll, const __m128i &vlh,
    const __m128i &vhl, const __m128i &vhh) {
    v0 = vll;
    v1 = vlh;
    v2 = vhl;
    v3 = vhh;
  }

  /// Reads four consecutive __m128i values starting at v by plain dereference.
  inline Icl512i(const __m128i *v) {
    v0 = *v;
    v1 = *(v + 1);
    // The third and fourth source values belong in v2/v3; writing them to
    // v0/v1 again would leave v2/v3 uninitialized and clobber the first two.
    v2 = *(v + 2);
    v3 = *(v + 3);
  }

  /// Copy assignment.
  inline Icl512i& operator=(const Icl512i &v) {
    v0 = v.v0;
    v1 = v.v1;
    v2 = v.v2;
    v3 = v.v3;
    return *this;
  }

  /// Bitwise AND with v, applied per register.
  inline Icl512i& operator&=(const Icl512i &v) {
    v0 = _mm_and_si128(v0, v.v0);
    v1 = _mm_and_si128(v1, v.v1);
    v2 = _mm_and_si128(v2, v.v2);
    v3 = _mm_and_si128(v3, v.v3);
    return *this;
  }

  /// Bitwise OR with v, applied per register.
  inline Icl512i& operator|=(const Icl512i &v) {
    v0 = _mm_or_si128(v0, v.v0);
    v1 = _mm_or_si128(v1, v.v1);
    v2 = _mm_or_si128(v2, v.v2);
    v3 = _mm_or_si128(v3, v.v3);
    return *this;
  }

  /// Bitwise XOR with v, applied per register.
  inline Icl512i& operator^=(const Icl512i &v) {
    v0 = _mm_xor_si128(v0, v.v0);
    v1 = _mm_xor_si128(v1, v.v1);
    v2 = _mm_xor_si128(v2, v.v2);
    v3 = _mm_xor_si128(v3, v.v3);
    return *this;
  }

  /// Clears every bit of this value that is set in v (this = ~v & this).
  inline Icl512i& andnot(const Icl512i &v) {
    v0 = _mm_andnot_si128(v.v0, v0);
    v1 = _mm_andnot_si128(v.v1, v1);
    v2 = _mm_andnot_si128(v.v2, v2);
    v3 = _mm_andnot_si128(v.v3, v3);
    return *this;
  }

  /// Writes the four registers to v .. v + 3 with the aligned store intrinsic.
  inline void store(__m128i *v) const {
    _mm_store_si128(v, v0);
    _mm_store_si128(v + 1, v1);
    _mm_store_si128(v + 2, v2);
    _mm_store_si128(v + 3, v3);
  }

  /// Writes the four registers to v .. v + 3 with the unaligned store intrinsic.
  inline void storeu(__m128i *v) const {
    _mm_storeu_si128(v, v0);
    _mm_storeu_si128(v + 1, v1);
    _mm_storeu_si128(v + 2, v2);
    _mm_storeu_si128(v + 3, v3);
  }
};
00423 
/// Plain storage for four 128-bit SSE double registers (512 bits in total).
struct Icl512d
{
  __m128d v0;  ///< first register
  __m128d v1;  ///< second register
  __m128d v2;  ///< third register
  __m128d v3;  ///< fourth register
};
00427 
/// Plain storage for eight 128-bit SSE double registers (1024 bits in total).
struct Icl1024d
{
  __m128d v0;
  __m128d v1;
  __m128d v2;
  __m128d v3;
  __m128d v4;
  __m128d v5;
  __m128d v6;
  __m128d v7;
};
00432 
00433       // -- basic SSE types -- //
00434 
00435 
00436       // ++ advanced SSE types ++ //
00437 
00439       struct icl128 : Icl128 {
00440         inline icl128() {
00441         }
00442 
00443         inline icl128(const icl128 &v) {
00444           v0 = v.v0;
00445         }
00446 
00447         inline icl128(const __m128 &v) {
00448           v0 = v;
00449         }
00450 
00451         inline icl128(const icl32f *v) {
00452           v0 = _mm_loadu_ps(v);
00453         }
00454 
00455         inline icl128(const icl32f v) {
00456           v0 = _mm_set1_ps(v);
00457         }
00458 
00459         inline icl128(const Icl128 &v) {
00460           v0 = v.v0;
00461         }
00462 
00463         inline icl128(const Icl128i &v) {
00464           v0 = _mm_cvtepi32_ps(v.v0);
00465         }
00466 /*
00467         inline icl128& operator=(const __m128 &v) {
00468           v0 = v;
00469           return *this;
00470         }
00471 
00472         inline icl128& operator=(const icl32f *v) {
00473           v0 = _mm_loadu_ps(v);
00474           return *this;
00475         }
00476 
00477         inline icl128& operator=(const icl32f v) {
00478           v0 = _mm_set1_ps(v);
00479           return *this;
00480         }
00481 */
00482         inline icl128& operator=(const icl128 &v) {
00483           v0 = v.v0;
00484           return *this;
00485         }
00486 
00487         inline icl128& operator=(const Icl128 &v) {
00488           v0 = v.v0;
00489           return *this;
00490         }
00491 /*
00492         inline icl128& operator=(const Icl128i &v) {
00493           v0 = _mm_cvtepi32_ps(v.v0);
00494           return *this;
00495         }
00496 */
00497         inline operator __m128 () const {
00498           return v0;
00499         }
00500 
00501         inline icl128& operator+=(const Icl128 &v) {
00502           v0 = _mm_add_ps(v0, v.v0);
00503           return *this;
00504         }
00505 
00506         inline icl128& operator-=(const Icl128 &v) {
00507           v0 = _mm_sub_ps(v0, v.v0);
00508           return *this;
00509         }
00510 
00511         inline icl128& operator*=(const Icl128 &v) {
00512           v0 = _mm_mul_ps(v0, v.v0);
00513           return *this;
00514         }
00515 
00516         inline icl128& operator/=(const Icl128 &v) {
00517           v0 = _mm_div_ps(v0, v.v0);
00518           return *this;
00519         }
00520 
00521         inline icl128& operator&=(const Icl128 &v) {
00522           v0 = _mm_and_ps(v0, v.v0);
00523           return *this;
00524         }
00525 
00526         inline icl128& operator|=(const Icl128 &v) {
00527           v0 = _mm_or_ps(v0, v.v0);
00528           return *this;
00529         }
00530 
00531         inline icl128& operator^=(const Icl128 &v) {
00532           v0 = _mm_xor_ps(v0, v.v0);
00533           return *this;
00534         }
00535 
00536         inline icl128& andnot(const Icl128 &v) {
00537           v0 = _mm_andnot_ps(v.v0, v0);
00538           return *this;
00539         }
00540 
00541         inline icl128& rcp() {
00542                       v0 = _mm_rcp_ps(v0);
00543           return *this;
00544         }
00545 
00546         inline void store(icl32f *v) const {
00547                       _mm_store_ps(v, v0);
00548         }
00549 
00550         inline void storeu(icl32f *v) const {
00551                       _mm_storeu_ps(v, v0);
00552         }
00553       };
00554 
00556       struct icl256 : Icl256 {
00557         inline icl256() {
00558         }
00559 
00560         inline icl256(const icl256 &v) {
00561           v0 = v.v0;
00562           v1 = v.v1;
00563         }
00564 
00565         inline icl256(const __m128 &vl, const __m128 &vh) {
00566           v0 = vl;
00567           v1 = vh;
00568         }
00569 
00570         inline icl256(const __m128 *v) {
00571           v0 = *v;
00572           v1 = *(v + 1);
00573         }
00574 
00575         inline icl256(const icl32f v) {
00576           v0 = _mm_set1_ps(v);
00577           v1 = _mm_set1_ps(v);
00578         }
00579 
00580         inline icl256(const Icl256 &v) {
00581           v0 = v.v0;
00582           v1 = v.v1;
00583         }
00584 
00585         inline icl256(const Icl256i &v) {
00586           v0 = _mm_cvtepi32_ps(v.v0);
00587           v1 = _mm_cvtepi32_ps(v.v1);
00588         }
00589 /*
00590         inline icl256& operator=(const __m128 *v) {
00591           v0 = *v;
00592           v1 = *(v + 1);
00593           return *this;
00594         }
00595 
00596         inline icl256& operator=(const icl32f v) {
00597           v0 = _mm_set1_ps(v);
00598           v1 = _mm_set1_ps(v);
00599           return *this;
00600         }
00601 */
00602         inline icl256& operator=(const icl256 &v) {
00603           v0 = v.v0;
00604           v1 = v.v1;
00605           return *this;
00606         }
00607 
00608         inline icl256& operator=(const Icl256 &v) {
00609           v0 = v.v0;
00610           v1 = v.v1;
00611           return *this;
00612         }
00613 /*
00614         inline icl256& operator=(const Icl256i &v) {
00615           v0 = _mm_cvtepi32_ps(v.v0);
00616           v1 = _mm_cvtepi32_ps(v.v1);
00617           return *this;
00618         }
00619 */
00620         inline icl256& operator+=(const Icl256 &v) {
00621           v0 = _mm_add_ps(v0, v.v0);
00622           v1 = _mm_add_ps(v1, v.v1);
00623           return *this;
00624         }
00625 
00626         inline icl256& operator-=(const Icl256 &v) {
00627           v0 = _mm_sub_ps(v0, v.v0);
00628           v1 = _mm_sub_ps(v1, v.v1);
00629           return *this;
00630         }
00631 
00632         inline icl256& operator*=(const Icl256 &v) {
00633           v0 = _mm_mul_ps(v0, v.v0);
00634           v1 = _mm_mul_ps(v1, v.v1);
00635           return *this;
00636         }
00637 
00638         inline icl256& operator/=(const Icl256 &v) {
00639           v0 = _mm_div_ps(v0, v.v0);
00640           v1 = _mm_div_ps(v1, v.v1);
00641           return *this;
00642         }
00643 
00644         inline icl256& operator&=(const Icl256 &v) {
00645           v0 = _mm_and_ps(v0, v.v0);
00646           v1 = _mm_and_ps(v1, v.v1);
00647           return *this;
00648         }
00649 
00650         inline icl256& operator|=(const Icl256 &v) {
00651           v0 = _mm_or_ps(v0, v.v0);
00652           v1 = _mm_or_ps(v1, v.v1);
00653           return *this;
00654         }
00655 
00656         inline icl256& operator^=(const Icl256 &v) {
00657           v0 = _mm_xor_ps(v0, v.v0);
00658           v1 = _mm_xor_ps(v1, v.v1);
00659           return *this;
00660         }
00661 
00662         inline icl256& andnot(const Icl256 &v) {
00663           v0 = _mm_andnot_ps(v.v0, v0);
00664           v1 = _mm_andnot_ps(v.v1, v1);
00665           return *this;
00666         }
00667 
00668         inline icl256& rcp() {
00669           v0 = _mm_rcp_ps(v0);
00670           v1 = _mm_rcp_ps(v1);
00671           return *this;
00672         }
00673 
00674         inline void store(icl32f *v) const {
00675           _mm_store_ps(v, v0);
00676           _mm_store_ps(v + 4, v1);
00677         }
00678 
00679         inline void storeu(icl32f *v) const {
00680           _mm_storeu_ps(v, v0);
00681           _mm_storeu_ps(v + 4, v1);
00682         }
00683       };
00684 
00686       struct icl512 : Icl512 {
00687         inline icl512() {
00688         }
00689 
00690         inline icl512(const icl512 &v) {
00691           v0 = v.v0;
00692           v1 = v.v1;
00693           v2 = v.v2;
00694           v3 = v.v3;
00695         }
00696 
00697         inline icl512(const __m128 &vll, const __m128 &vlh,
00698           const __m128 &vhl, const __m128 &vhh) {
00699           v0 = vll;
00700           v1 = vlh;
00701           v2 = vhl;
00702           v3 = vhh;
00703         }
00704 
00705         inline icl512(const __m128 *v) {
00706           v0 = *v;
00707           v1 = *(v + 1);
00708           v2 = *(v + 2);
00709           v3 = *(v + 3);
00710         }
00711 
00712         inline icl512(const icl8u *v) {
00713           const __m128i vk0 = _mm_setzero_si128();
00714           __m128i vt0, vt1, vt2, vt3;
00715 
00716           vt3 = _mm_loadu_si128((__m128i*)v);
00717 
00718           vt1 = _mm_unpacklo_epi8(vt3, vk0);
00719           vt3 = _mm_unpackhi_epi8(vt3, vk0);
00720 
00721           vt0 = _mm_unpacklo_epi16(vt1, vk0);
00722           vt1 = _mm_unpackhi_epi16(vt1, vk0);
00723           vt2 = _mm_unpacklo_epi16(vt3, vk0);
00724           vt3 = _mm_unpackhi_epi16(vt3, vk0);
00725 
00726           v0 = _mm_cvtepi32_ps(vt0);
00727           v1 = _mm_cvtepi32_ps(vt1);
00728           v2 = _mm_cvtepi32_ps(vt2);
00729           v3 = _mm_cvtepi32_ps(vt3);
00730         }
00731 
00732         inline icl512(const icl32f *v) {
00733           v0 = _mm_loadu_ps(v);
00734           v1 = _mm_loadu_ps(v + 4);
00735           v2 = _mm_loadu_ps(v + 8);
00736           v3 = _mm_loadu_ps(v + 12);
00737         }
00738 
00739         inline icl512(const Icl512 &v) {
00740           v0 = v.v0;
00741           v1 = v.v1;
00742           v2 = v.v2;
00743           v3 = v.v3;
00744         }
00745 
00746         inline icl512(const Icl512i &v) {
00747           v0 = _mm_cvtepi32_ps(v.v0);
00748           v1 = _mm_cvtepi32_ps(v.v1);
00749           v2 = _mm_cvtepi32_ps(v.v2);
00750           v3 = _mm_cvtepi32_ps(v.v3);
00751         }
00752 
00753         inline icl512(const icl32f v) {
00754           v0 = _mm_set1_ps(v);
00755           v1 = _mm_set1_ps(v);
00756           v2 = _mm_set1_ps(v);
00757           v3 = _mm_set1_ps(v);
00758         }
00759 /*
00760         inline icl512& operator=(const __m128 *v) {
00761           v0 = *v;
00762           v1 = *(v + 1);
00763           v2 = *(v + 2);
00764           v3 = *(v + 3);
00765           return *this;
00766         }
00767 
00768         inline icl512& operator=(const icl32f *v) {
00769           v0 = _mm_loadu_ps(v);
00770           v1 = _mm_loadu_ps(v + 4);
00771           v2 = _mm_loadu_ps(v + 8);
00772           v3 = _mm_loadu_ps(v + 12);
00773           return *this;
00774         }
00775 */
00776         inline icl512& operator=(const icl512 &v) {
00777           v0 = v.v0;
00778           v1 = v.v1;
00779           v2 = v.v2;
00780           v3 = v.v3;
00781           return *this;
00782         }
00783 
00784         inline icl512& operator=(const Icl512 &v) {
00785           v0 = v.v0;
00786           v1 = v.v1;
00787           v2 = v.v2;
00788           v3 = v.v3;
00789           return *this;
00790         }
00791 /*
00792         inline icl512& operator=(const Icl512i &v) {
00793           v0 = _mm_cvtepi32_ps(v.v0);
00794           v1 = _mm_cvtepi32_ps(v.v1);
00795           v2 = _mm_cvtepi32_ps(v.v2);
00796           v3 = _mm_cvtepi32_ps(v.v3);
00797           return *this;
00798         }
00799 
00800         inline icl512& operator=(const icl32f v) {
00801           v0 = _mm_set1_ps(v);
00802           v1 = _mm_set1_ps(v);
00803           v2 = _mm_set1_ps(v);
00804           v3 = _mm_set1_ps(v);
00805           return *this;
00806         }
00807 */
00808         inline icl512& operator+=(const Icl512 &v) {
00809           v0 = _mm_add_ps(v0, v.v0);
00810           v1 = _mm_add_ps(v1, v.v1);
00811           v2 = _mm_add_ps(v2, v.v2);
00812           v3 = _mm_add_ps(v3, v.v3);
00813           return *this;
00814         }
00815 
00816         inline icl512& operator-=(const Icl512 &v) {
00817           v0 = _mm_sub_ps(v0, v.v0);
00818           v1 = _mm_sub_ps(v1, v.v1);
00819           v2 = _mm_sub_ps(v2, v.v2);
00820           v3 = _mm_sub_ps(v3, v.v3);
00821           return *this;
00822         }
00823 
00824         inline icl512& operator*=(const Icl512 &v) {
00825           v0 = _mm_mul_ps(v0, v.v0);
00826           v1 = _mm_mul_ps(v1, v.v1);
00827           v2 = _mm_mul_ps(v2, v.v2);
00828           v3 = _mm_mul_ps(v3, v.v3);
00829           return *this;
00830         }
00831 
00832         inline icl512& operator/=(const Icl512 &v) {
00833           v0 = _mm_div_ps(v0, v.v0);
00834           v1 = _mm_div_ps(v1, v.v1);
00835           v2 = _mm_div_ps(v2, v.v2);
00836           v3 = _mm_div_ps(v3, v.v3);
00837           return *this;
00838         }
00839 
00840         inline icl512& operator&=(const Icl512 &v) {
00841           v0 = _mm_and_ps(v0, v.v0);
00842           v1 = _mm_and_ps(v1, v.v1);
00843           v2 = _mm_and_ps(v2, v.v2);
00844           v3 = _mm_and_ps(v3, v.v3);
00845           return *this;
00846         }
00847 
00848         inline icl512& operator|=(const Icl512 &v) {
00849           v0 = _mm_or_ps(v0, v.v0);
00850           v1 = _mm_or_ps(v1, v.v1);
00851           v2 = _mm_or_ps(v2, v.v2);
00852           v3 = _mm_or_ps(v3, v.v3);
00853           return *this;
00854         }
00855 
00856         inline icl512& operator^=(const Icl512 &v) {
00857           v0 = _mm_xor_ps(v0, v.v0);
00858           v1 = _mm_xor_ps(v1, v.v1);
00859           v2 = _mm_xor_ps(v2, v.v2);
00860           v3 = _mm_xor_ps(v3, v.v3);
00861           return *this;
00862         }
00863 
00864         inline icl512& andnot(const Icl512 &v) {
00865           v0 = _mm_andnot_ps(v.v0, v0);
00866           v1 = _mm_andnot_ps(v.v1, v1);
00867           v2 = _mm_andnot_ps(v.v2, v2);
00868           v3 = _mm_andnot_ps(v.v3, v3);
00869           return *this;
00870         }
00871 
00872         inline icl512& rcp() {
00873           v0 = _mm_rcp_ps(v0);
00874           v1 = _mm_rcp_ps(v1);
00875           v2 = _mm_rcp_ps(v2);
00876           v3 = _mm_rcp_ps(v3);
00877           return *this;
00878         }
00879 
00880         inline void store(icl8u *v) const {
00881           //__m128 vMin = _mm_set1_ps(-2147483520.f);
00882           //__m128 vMax = _mm_set1_ps(2147483520.f);
00883           //v0 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v0, vMin), vMax));
00884           //v1 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v1, vMin), vMax));
00885           //v2 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v2, vMin), vMax));
00886           //v3 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v3, vMin), vMax));
00887           __m128i vt0 = _mm_cvtps_epi32(v0);
00888           __m128i vt1 = _mm_cvtps_epi32(v1);
00889           __m128i vt2 = _mm_cvtps_epi32(v2);
00890           __m128i vt3 = _mm_cvtps_epi32(v3);
00891 
00892           vt0 = _mm_packus_epi16(_mm_packs_epi32(vt0, vt1), _mm_packs_epi32(vt2, vt3));
00893           _mm_store_si128((__m128i*)v, vt0);
00894         }
00895 
00896         inline void storeu(icl8u *v) const {
00897           //__m128 vMin = _mm_set1_ps(-2147483520.f);
00898           //__m128 vMax = _mm_set1_ps(2147483520.f);
00899           //v0 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v0, vMin), vMax));
00900           //v1 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v1, vMin), vMax));
00901           //v2 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v2, vMin), vMax));
00902           //v3 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v3, vMin), vMax));
00903           __m128i vt0 = _mm_cvtps_epi32(v0);
00904           __m128i vt1 = _mm_cvtps_epi32(v1);
00905           __m128i vt2 = _mm_cvtps_epi32(v2);
00906           __m128i vt3 = _mm_cvtps_epi32(v3);
00907 
00908           vt0 = _mm_packus_epi16(_mm_packs_epi32(vt0, vt1), _mm_packs_epi32(vt2, vt3));
00909           _mm_storeu_si128((__m128i*)v, vt0);
00910         }
00911 
00912         inline void store(icl32f *v) const {
00913           _mm_store_ps(v, v0);
00914           _mm_store_ps(v + 4, v1);
00915           _mm_store_ps(v + 8, v2);
00916           _mm_store_ps(v + 12, v3);
00917         }
00918 
00919         inline void storeu(icl32f *v) const {
00920           _mm_storeu_ps(v, v0);
00921           _mm_storeu_ps(v + 4, v1);
00922           _mm_storeu_ps(v + 8, v2);
00923           _mm_storeu_ps(v + 12, v3);
00924         }
00925       };
00926 
00928       struct icl128i8u : Icl128i {
00929         inline icl128i8u() {
00930         }
00931 
00932         inline icl128i8u(const Icl128i &v) {
00933           v0 = v.v0;
00934         }
00935 
00936         inline icl128i8u(const icl128i8u &v) {
00937           v0 = v.v0;
00938         }
00939 
00940         inline icl128i8u(const __m128i &v) {
00941           v0 = v;
00942         }
00943 
00944         inline icl128i8u(const __m128i *v) {
00945           v0 = _mm_loadu_si128(v);
00946         }
00947 
00948         inline icl128i8u(const icl8s *v) {
00949           v0 = _mm_loadu_si128((__m128i*)v);
00950         }
00951 
00952         inline icl128i8u(const icl8u *v) {
00953           v0 = _mm_loadu_si128((__m128i*)v);
00954         }
00955 
00956         inline icl128i8u(const icl8s v) {
00957           v0 = _mm_set1_epi8(v);
00958         }
00959 
00960         inline icl128i8u(const Icl256i &v) {
00961           //v0 = _mm_packs_epi16(v.v0, v.v1); // for icl8s
00962           v0 = _mm_packus_epi16(v.v0, v.v1);
00963         }
00964 
00965         inline icl128i8u(const Icl512i &v) {
00966           //v0 = _mm_packs_epi16(_mm_packs_epi32(v.v0, v.v1), _mm_packs_epi32(v.v2, v.v3)); // for icl8s
00967           v0 = _mm_packus_epi16(_mm_packs_epi32(v.v0, v.v1), _mm_packs_epi32(v.v2, v.v3));
00968         }
00969 
00970         inline operator Icl128i () const {
00971           return *this;
00972         }
00973 
00974         inline icl128i8u& operator=(const icl128i8u &v) {
00975           v0 = v.v0;
00976           return *this;
00977         }
00978 
00979         inline icl128i8u& operator=(const Icl128i &v) {
00980           v0 = v.v0;
00981           return *this;
00982         }
00983 
00984         inline icl128i8u& operator+=(const icl128i8u &v) {
00985           v0 = _mm_add_epi8(v0, v.v0);
00986           return *this;
00987         }
00988 
00989         inline icl128i8u& operator-=(const icl128i8u &v) {
00990           v0 = _mm_sub_epi8(v0, v.v0);
00991           return *this;
00992         }
00993 
00994         inline void store(__m128i *v) const {
00995           _mm_store_si128(v, v0);
00996         }
00997 
00998         inline void storeu(__m128i *v) const {
00999           _mm_storeu_si128(v, v0);
01000         }
01001 
01002         inline void store(icl8s *v) const {
01003           _mm_store_si128((__m128i*)v, v0);
01004         }
01005 
01006         inline void storeu(icl8s *v) const {
01007           _mm_storeu_si128((__m128i*)v, v0);
01008         }
01009 
01010         inline void store(icl8u *v) const {
01011           _mm_store_si128((__m128i*)v, v0);
01012         }
01013 
01014         inline void storeu(icl8u *v) const {
01015           _mm_storeu_si128((__m128i*)v, v0);
01016         }
01017       };
01018 
01020       struct icl128i16s : Icl128i {
01021         inline icl128i16s() {
01022         }
01023 
01024         inline icl128i16s(const icl128i16s &v) {
01025           v0 = v.v0;
01026         }
01027 
01028         inline icl128i16s(const Icl128i &v) {
01029           v0 = v.v0;
01030         }
01031 
01032         inline icl128i16s(const __m128i &v) {
01033           v0 = v;
01034         }
01035 
01036         inline icl128i16s(const __m128i *v) {
01037           v0 = _mm_loadu_si128(v);
01038         }
01039 
01040         inline icl128i16s(const icl16s *v) {
01041           v0 = _mm_loadu_si128((__m128i*)v);
01042         }
01043 
01044         inline icl128i16s(const icl16u *v) {
01045           v0 = _mm_loadu_si128((__m128i*)v);
01046         }
01047 
01048         inline icl128i16s(const icl16s v) {
01049           v0 = _mm_set1_epi16(v);
01050         }
01051 
01052         inline icl128i16s(const Icl256i &v) {
01053           v0 = _mm_packs_epi32(v.v0, v.v1);
01054         }
01055 
01056         inline operator Icl128i () const {
01057           return *this;
01058         }
01059 
01060         inline icl128i16s& operator=(const icl128i16s &v) {
01061           v0 = v.v0;
01062           return *this;
01063         }
01064 
01065         inline icl128i16s& operator=(const Icl128i &v) {
01066           v0 = v.v0;
01067           return *this;
01068         }
01069 
01070         inline icl128i16s& operator+=(const icl128i16s &v) {
01071           v0 = _mm_add_epi16(v0, v.v0);
01072           return *this;
01073         }
01074 
01075         inline icl128i16s& operator-=(const icl128i16s &v) {
01076           v0 = _mm_sub_epi16(v0, v.v0);
01077           return *this;
01078         }
01079 
01080         inline void store(__m128i *v) const {
01081           _mm_store_si128(v, v0);
01082         }
01083 
01084         inline void storeu(__m128i *v) const {
01085           _mm_storeu_si128(v, v0);
01086         }
01087 
01088         inline void store(icl16s *v) const {
01089           _mm_store_si128((__m128i*)v, v0);
01090         }
01091 
01092         inline void storeu(icl16s *v) const {
01093           _mm_storeu_si128((__m128i*)v, v0);
01094         }
01095 
01096         inline void store(icl16u *v) const {
01097           _mm_store_si128((__m128i*)v, v0);
01098         }
01099 
01100         inline void storeu(icl16u *v) const {
01101           _mm_storeu_si128((__m128i*)v, v0);
01102         }
01103       };
01104 
01106       struct icl128i32s : Icl128i {
01107         inline icl128i32s() {
01108         }
01109 
01110         inline icl128i32s(const icl128i32s &v) {
01111           v0 = v.v0;
01112         }
01113 
01114         inline icl128i32s(const Icl128i &v) {
01115           v0 = v.v0;
01116         }
01117 
01118         inline icl128i32s(const __m128i &v) {
01119           v0 = v;
01120         }
01121 
01122         inline icl128i32s(const __m128i *v) {
01123           v0 = _mm_loadu_si128(v);
01124         }
01125 
01126         inline icl128i32s(const icl32s *v) {
01127           v0 = _mm_loadu_si128((__m128i*)v);
01128         }
01129 
01130         inline icl128i32s(const icl32u *v) {
01131           v0 = _mm_loadu_si128((__m128i*)v);
01132         }
01133 
01134         inline icl128i32s(const icl32s i0, const icl32s i1, const icl32s i2, const icl32s i3) {
01135           v0 = _mm_set_epi32(i3, i2, i1, i0);
01136         }
01137 
01138         inline icl128i32s(const icl32s v) {
01139           v0 = _mm_set1_epi32(v);
01140         }
01141 
01142         inline icl128i32s(const Icl128 &v) {
01143           //__m128 vMin = _mm_set1_ps(-2147483520.f);
01144           //__m128 vMax = _mm_set1_ps(2147483520.f);
01145           //v0 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v0, vMin), vMax));
01146           v0 = _mm_cvtps_epi32(v.v0);
01147         }
01148 
01149         inline operator Icl128i () const {
01150           return *this;
01151         }
01152 
01153         inline icl128i32s& operator=(const icl128i32s &v) {
01154           v0 = v.v0;
01155           return *this;
01156         }
01157 
01158         inline icl128i32s& operator=(const Icl128i &v) {
01159           v0 = v.v0;
01160           return *this;
01161         }
01162 
01163         inline icl128i32s& operator+=(const icl128i32s &v) {
01164           v0 = _mm_add_epi32(v0, v.v0);
01165           return *this;
01166         }
01167 
01168         inline icl128i32s& operator-=(const icl128i32s &v) {
01169           v0 = _mm_sub_epi32(v0, v.v0);
01170           return *this;
01171         }
01172 
01173         inline void store(__m128i *v) const {
01174           _mm_store_si128(v, v0);
01175         }
01176 
01177         inline void storeu(__m128i *v) const {
01178           _mm_storeu_si128(v, v0);
01179         }
01180 
01181         inline void store(icl32s *v) const {
01182           _mm_store_si128((__m128i*)v, v0);
01183         }
01184 
01185         inline void storeu(icl32s *v) const {
01186           _mm_storeu_si128((__m128i*)v, v0);
01187         }
01188 
01189         inline void store(icl32u *v) const {
01190           _mm_store_si128((__m128i*)v, v0);
01191         }
01192 
01193         inline void storeu(icl32u *v) const {
01194           _mm_storeu_si128((__m128i*)v, v0);
01195         }
01196       };
01197 
01199       struct icl256i16s : Icl256i {
01200         inline icl256i16s() {
01201         }
01202 
01203         inline icl256i16s(const icl256i16s &v) {
01204           v0 = v.v0;
01205           v1 = v.v0;
01206         }
01207 
01208         inline icl256i16s(const Icl256i &v) {
01209           v0 = v.v0;
01210           v1 = v.v0;
01211         }
01212 
01213         inline icl256i16s(const __m128i &vl, const __m128i &vh) {
01214           v0 = vl;
01215           v1 = vh;
01216         }
01217 
01218         inline icl256i16s(const __m128i *v) {
01219           v0 = *v;
01220           v1 = *(v + 1);
01221         }
01222 
01223         inline icl256i16s(const icl16s *v) {
01224           v0 = _mm_loadu_si128((__m128i*)v);
01225           v1 = _mm_loadu_si128((__m128i*)(v + 8));
01226         }
01227 
01228         inline icl256i16s(const icl16s v) {
01229           v0 = _mm_set1_epi16(v);
01230           v1 = _mm_set1_epi16(v);
01231         }
01232 
01233         inline icl256i16s(const icl128i8u &v) {
01234           const __m128i vk0 = _mm_setzero_si128();
01235           v0 = _mm_unpacklo_epi8(v.v0, vk0);
01236           v1 = _mm_unpackhi_epi8(v.v0, vk0);
01237         }
01238 
01239         inline icl256i16s(const Icl512i &v) {
01240           v0 = _mm_packs_epi32(v.v0, v.v1);
01241           v1 = _mm_packs_epi32(v.v2, v.v3);
01242         }
01243 
01244         inline operator Icl256i () const {
01245           return *this;
01246         }
01247 
01248         inline icl256i16s& operator=(const icl256i16s &v) {
01249           v0 = v.v0;
01250           v1 = v.v1;
01251           return *this;
01252         }
01253 
01254         inline icl256i16s& operator=(const Icl256i &v) {
01255           v0 = v.v0;
01256           v1 = v.v1;
01257           return *this;
01258         }
01259 
01260         inline icl256i16s& operator+=(const icl256i16s &v) {
01261           v0 = _mm_add_epi16(v0, v.v0);
01262           v1 = _mm_add_epi16(v1, v.v1);
01263           return *this;
01264         }
01265 
01266         inline icl256i16s& operator-=(const icl256i16s &v) {
01267           v0 = _mm_sub_epi16(v0, v.v0);
01268           v1 = _mm_sub_epi16(v1, v.v1);
01269           return *this;
01270         }
01271 
01272         inline void store(__m128i *v) const {
01273           _mm_store_si128(v, v0);
01274           _mm_store_si128(v + 1, v1);
01275         }
01276 
01277         inline void storeu(__m128i *v) const {
01278           _mm_storeu_si128(v, v0);
01279           _mm_storeu_si128(v + 1, v1);
01280         }
01281 
01282         inline void store(icl16s *v) const {
01283           _mm_store_si128((__m128i*)v, v0);
01284           _mm_store_si128((__m128i*)(v + 8), v1);
01285         }
01286 
01287         inline void storeu(icl16s *v) const {
01288           _mm_storeu_si128((__m128i*)v, v0);
01289           _mm_storeu_si128((__m128i*)(v + 8), v1);
01290         }
01291 
01292         inline void store(icl16u *v) const {
01293           _mm_store_si128((__m128i*)v, v0);
01294           _mm_store_si128((__m128i*)(v + 8), v1);
01295         }
01296 
01297         inline void storeu(icl16u *v) const {
01298           _mm_storeu_si128((__m128i*)v, v0);
01299           _mm_storeu_si128((__m128i*)(v + 8), v1);
01300         }
01301       };
01302 
01304       struct icl256i32s : Icl256i {
01305 
01306         inline icl256i32s() {
01307         }
01308 
01309         inline icl256i32s(const icl256i32s &v) {
01310           v0 = v.v0;
01311           v1 = v.v1;
01312         }
01313 
01314         inline icl256i32s(const Icl256i &v) {
01315           v0 = v.v0;
01316           v1 = v.v1;
01317         }
01318 
01319         inline icl256i32s(const __m128i &vl, const __m128i &vh) {
01320           v0 = vl;
01321           v1 = vh;
01322         }
01323 
01324         inline icl256i32s(const __m128i *v) {
01325           v0 = *v;
01326           v1 = *(v + 1);
01327         }
01328 
01329         inline icl256i32s(const icl32s *v) {
01330           v0 = _mm_loadu_si128((__m128i*)v);
01331           v1 = _mm_loadu_si128((__m128i*)(v + 4));
01332         }
01333 
01334         inline icl256i32s(const icl32s v) {
01335           v0 = _mm_set1_epi32(v);
01336           v1 = _mm_set1_epi32(v);
01337         }
01338 
01339         inline icl256i32s(const Icl256 &v) {
01340           //__m128 vMin = _mm_set1_ps(-2147483520.f);
01341           //__m128 vMax = _mm_set1_ps(2147483520.f);
01342           //v0 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v0, vMin), vMax));
01343           //v1 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v1, vMin), vMax));
01344           v0 = _mm_cvtps_epi32(v.v0);
01345           v1 = _mm_cvtps_epi32(v.v1);
01346         }
01347 
01348         inline icl256i32s& operator=(const icl256i32s &v) {
01349           v0 = v.v0;
01350           v1 = v.v1;
01351           return *this;
01352         }
01353 
01354         inline icl256i32s& operator=(const Icl256i &v) {
01355           v0 = v.v0;
01356           v1 = v.v1;
01357           return *this;
01358         }
01359 
01360         inline icl256i32s& operator+=(const icl256i32s &v) {
01361           v0 = _mm_add_epi16(v0, v.v0);
01362           v1 = _mm_add_epi16(v1, v.v1);
01363           return *this;
01364         }
01365 
01366         inline icl256i32s& operator-=(const icl256i32s &v) {
01367           v0 = _mm_sub_epi16(v0, v.v0);
01368           v1 = _mm_sub_epi16(v1, v.v1);
01369           return *this;
01370         }
01371 
01372         inline void store(__m128i *v) const {
01373           _mm_store_si128(v, v0);
01374           _mm_store_si128(v + 1, v1);
01375         }
01376 
01377         inline void storeu(__m128i *v) const {
01378           _mm_storeu_si128(v, v0);
01379           _mm_storeu_si128(v + 1, v1);
01380         }
01381 
01382         inline void store(icl32s *v) const {
01383           _mm_store_si128((__m128i*)v, v0);
01384           _mm_store_si128((__m128i*)(v + 4), v1);
01385         }
01386 
01387         inline void storeu(icl32s *v) const {
01388           _mm_storeu_si128((__m128i*)v, v0);
01389           _mm_storeu_si128((__m128i*)(v + 4), v1);
01390         }
01391 
01392         inline void store(icl32u *v) const {
01393           _mm_store_si128((__m128i*)v, v0);
01394           _mm_store_si128((__m128i*)(v + 4), v1);
01395         }
01396 
01397         inline void storeu(icl32u *v) const {
01398           _mm_storeu_si128((__m128i*)v, v0);
01399           _mm_storeu_si128((__m128i*)(v + 4), v1);
01400         }
01401       };
01402 
01404       struct icl512i32s : Icl512i {
01405         inline icl512i32s() {
01406         }
01407 
01408         inline icl512i32s(const icl512i32s &v) {
01409           v0 = v.v0;
01410           v1 = v.v1;
01411           v2 = v.v2;
01412           v3 = v.v3;
01413         }
01414 
01415         inline icl512i32s(const Icl512i &v) {
01416           v0 = v.v0;
01417           v1 = v.v1;
01418           v2 = v.v2;
01419           v3 = v.v3;
01420         }
01421 
01422         inline icl512i32s(const __m128i &vll, const __m128i &vlh,
01423           const __m128i &vhl, const __m128i &vhh) {
01424           v0 = vll;
01425           v1 = vlh;
01426           v2 = vhl;
01427           v3 = vhh;
01428         }
01429 
01430         inline icl512i32s(const icl32s *v) {
01431           v0 = _mm_loadu_si128((__m128i*)v);
01432           v1 = _mm_loadu_si128((__m128i*)(v + 4));
01433           v2 = _mm_loadu_si128((__m128i*)(v + 8));
01434           v3 = _mm_loadu_si128((__m128i*)(v + 12));
01435         }
01436 
01437         inline icl512i32s(const Icl256i &v) {
01438           const __m128i vk0 = _mm_setzero_si128();
01439           v0 = _mm_unpacklo_epi16(v.v0, vk0);
01440           v1 = _mm_unpackhi_epi16(v.v0, vk0);
01441           v2 = _mm_unpacklo_epi16(v.v1, vk0);
01442           v3 = _mm_unpackhi_epi16(v.v1, vk0);
01443         }
01444 
01445         inline icl512i32s(const icl32s v) {
01446           v0 = _mm_set1_epi32(v);
01447           v1 = _mm_set1_epi32(v);
01448           v2 = _mm_set1_epi32(v);
01449           v3 = _mm_set1_epi32(v);
01450         }
01451 
01452         inline icl512i32s(const Icl512 &v) {
01453           //__m128 vMin = _mm_set1_ps(-2147483520.f);
01454           //__m128 vMax = _mm_set1_ps(2147483520.f);
01455           //v0 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v0, vMin), vMax));
01456           //v1 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v1, vMin), vMax));
01457           //v2 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v2, vMin), vMax));
01458           //v3 = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v.v3, vMin), vMax));
01459           v0 = _mm_cvtps_epi32(v.v0);
01460           v1 = _mm_cvtps_epi32(v.v1);
01461           v2 = _mm_cvtps_epi32(v.v2);
01462           v3 = _mm_cvtps_epi32(v.v3);
01463         }
01464 
01465         inline icl512i32s& operator=(const icl512i32s &v) {
01466           v0 = v.v0;
01467           v1 = v.v1;
01468           v2 = v.v2;
01469           v3 = v.v3;
01470           return *this;
01471         }
01472 
01473         inline icl512i32s& operator=(const Icl512i &v) {
01474           v0 = v.v0;
01475           v1 = v.v1;
01476           v2 = v.v2;
01477           v3 = v.v3;
01478           return *this;
01479         }
01480 
01481         inline icl512i32s& operator+=(const icl512i32s &v) {
01482           v0 = _mm_add_epi32(v0, v.v0);
01483           v1 = _mm_add_epi32(v1, v.v1);
01484           v2 = _mm_add_epi32(v2, v.v2);
01485           v3 = _mm_add_epi32(v3, v.v3);
01486           return *this;
01487         }
01488 
01489         inline icl512i32s& operator-=(const icl512i32s &v) {
01490           v0 = _mm_sub_epi32(v0, v.v0);
01491           v1 = _mm_sub_epi32(v1, v.v1);
01492           v2 = _mm_sub_epi32(v2, v.v2);
01493           v3 = _mm_sub_epi32(v3, v.v3);
01494           return *this;
01495         }
01496 
01497         inline void store(icl32s *v) const {
01498           _mm_store_si128((__m128i*)v, v0);
01499           _mm_store_si128((__m128i*)(v + 4), v1);
01500           _mm_store_si128((__m128i*)(v + 8), v1);
01501           _mm_store_si128((__m128i*)(v + 12), v1);
01502         }
01503 
01504         inline void storeu(icl32s *v) const {
01505           _mm_storeu_si128((__m128i*)v, v0);
01506           _mm_storeu_si128((__m128i*)(v + 4), v1);
01507           _mm_storeu_si128((__m128i*)(v + 8), v1);
01508           _mm_storeu_si128((__m128i*)(v + 12), v1);
01509         }
01510 
01511         inline void store(icl32u *v) const {
01512           _mm_store_si128((__m128i*)v, v0);
01513           _mm_store_si128((__m128i*)(v + 4), v1);
01514           _mm_store_si128((__m128i*)(v + 8), v1);
01515           _mm_store_si128((__m128i*)(v + 12), v1);
01516         }
01517 
01518         inline void storeu(icl32u *v) const {
01519           _mm_storeu_si128((__m128i*)v, v0);
01520           _mm_storeu_si128((__m128i*)(v + 4), v1);
01521           _mm_storeu_si128((__m128i*)(v + 8), v1);
01522           _mm_storeu_si128((__m128i*)(v + 12), v1);
01523         }
01524       };
01525 
01527       struct icl128d : Icl128d {
01528         inline icl128d() {
01529         }
01530 
01531         inline icl128d(const __m128d &v) {
01532           v0 = v;
01533         }
01534 
01535         inline icl128d(const icl64f *v) {
01536           v0 = _mm_loadu_pd(v);
01537         }
01538 
01539         inline icl128d(const icl64f v) {
01540           v0 = _mm_set1_pd(v);
01541         }
01542 
01543         inline icl128d(const icl128d &v) {
01544           v0 = v.v0;
01545         }
01546 
01547         inline icl128d& operator=(const __m128d &v) {
01548           v0 = v;
01549           return *this;
01550         }
01551 
01552         inline icl128d& operator=(const icl64f *v) {
01553           v0 = _mm_loadu_pd(v);
01554           return *this;
01555         }
01556 
01557         inline icl128d& operator=(const icl128d &v) {
01558           v0 = v.v0;
01559           return *this;
01560         }
01561 
01562         inline operator __m128d () const {
01563           return v0;
01564         }
01565 
01566         inline icl128d& operator+=(const Icl128d &v) {
01567           v0 = _mm_add_pd(v0, v.v0);
01568           return *this;
01569         }
01570 
01571         inline icl128d& operator-=(const Icl128d &v) {
01572           v0 = _mm_sub_pd(v0, v.v0);
01573           return *this;
01574         }
01575 
01576         inline icl128d& operator*=(const Icl128d &v) {
01577           v0 = _mm_mul_pd(v0, v.v0);
01578           return *this;
01579         }
01580 
01581         inline icl128d& operator/=(const Icl128d &v) {
01582           v0 = _mm_div_pd(v0, v.v0);
01583           return *this;
01584         }
01585 
01586         inline icl128d& operator&=(const Icl128d &v) {
01587           v0 = _mm_and_pd(v0, v.v0);
01588           return *this;
01589         }
01590 
01591         inline icl128d& operator|=(const Icl128d &v) {
01592           v0 = _mm_or_pd(v0, v.v0);
01593           return *this;
01594         }
01595 
01596         inline icl128d& operator^=(const Icl128d &v) {
01597           v0 = _mm_xor_pd(v0, v.v0);
01598           return *this;
01599         }
01600 
01601         inline icl128d& andnot(const Icl128d &v) {
01602           v0 = _mm_andnot_pd(v.v0, v0);
01603           return *this;
01604         }
01605 
01606         inline void store(icl64f *v) const {
01607                       _mm_store_pd(v, v0);
01608         }
01609 
01610         inline void storeu(icl64f *v) const {
01611                       _mm_storeu_pd(v, v0);
01612         }
01613       };
01614 
01616       struct icl256d : Icl512d {
01617         // TODO
01618       };
01619 
01620       // type for 8 icl64f values
01621       struct icl512d : Icl512d {
01622         // TODO
01623       };
01624 
01626       struct icl1024d : Icl1024d {
01627         // TODO
01628       };
01629 
01630       // -- advanced SSE types -- //
01631 
01632 
01633       // ++ operations on SSE types ++ //
01634 
01635       // ++ arithmetic operations ++ //
01636 
01637       inline icl128 operator+(const icl128 &lv, const icl128 &rv) {
01638         icl128 ret = lv;
01639         return ret += rv;
01640       }
01641 
01642       inline icl128 operator-(const icl128 &lv, const icl128 &rv) {
01643         icl128 ret = lv;
01644         return ret -= rv;
01645       }
01646 
01647       inline icl128 operator*(const icl128 &lv, const icl128 &rv) {
01648         icl128 ret = lv;
01649         return ret *= rv;
01650       }
01651 
01652       inline icl128 operator/(const icl128 &lv, const icl128 &rv) {
01653         icl128 ret = lv;
01654         return ret /= rv;
01655       }
01656 
01657       inline icl256 operator+(const icl256 &lv, const icl256 &rv) {
01658         icl256 ret = lv;
01659         return ret += rv;
01660       }
01661 
01662       inline icl256 operator-(const icl256 &lv, const icl256 &rv) {
01663         icl256 ret = lv;
01664         return ret -= rv;
01665       }
01666 
01667       inline icl256 operator*(const icl256 &lv, const icl256 &rv) {
01668         icl256 ret = lv;
01669         return ret *= rv;
01670       }
01671 
01672       inline icl256 operator/(const icl256 &lv, const icl256 &rv) {
01673         icl256 ret = lv;
01674         return ret /= rv;
01675       }
01676 
01677       inline icl512 operator+(const icl512 &lv, const icl512 &rv) {
01678         icl512 ret = lv;
01679         return ret += rv;
01680       }
01681 
01682       inline icl512 operator-(const icl512 &lv, const icl512 &rv) {
01683         icl512 ret = lv;
01684         return ret -= rv;
01685       }
01686 
01687       inline icl512 operator*(const icl512 &lv, const icl512 &rv) {
01688         icl512 ret = lv;
01689         return ret *= rv;
01690       }
01691 
01692       inline icl512 operator/(const icl512 &lv, const icl512 &rv) {
01693         icl512 ret = lv;
01694         return ret /= rv;
01695       }
01696 
01697       // -- arithmetic operations -- //
01698 
01699       // ++ comparison operations ++ //
01700 
01701       inline icl128 operator==(const icl128 &lv, const icl128 &rv) {
01702         icl128 ret;
01703         ret.v0 = _mm_cmpeq_ps(lv.v0, rv.v0);
01704         return ret;
01705       }
01706 
01707       inline icl128 operator!=(const icl128 &lv, const icl128 &rv) {
01708         icl128 ret;
01709         ret.v0 = _mm_cmpneq_ps(lv.v0, rv.v0);
01710         return ret;
01711       }
01712 
01713       inline icl128 operator<(const icl128 &lv, const icl128 &rv) {
01714         icl128 ret;
01715         ret.v0 = _mm_cmplt_ps(lv.v0, rv.v0);
01716         return ret;
01717       }
01718 
01719       inline icl128 operator>(const icl128 &lv, const icl128 &rv) {
01720         icl128 ret;
01721         ret.v0 = _mm_cmpgt_ps(lv.v0, rv.v0);
01722         return ret;
01723       }
01724 
01725       inline icl128 operator<=(const icl128 &lv, const icl128 &rv) {
01726         icl128 ret;
01727         ret.v0 = _mm_cmple_ps(lv.v0, rv.v0);
01728         return ret;
01729       }
01730 
01731       inline icl128 operator>=(const icl128 &lv, const icl128 &rv) {
01732         icl128 ret;
01733         ret.v0 = _mm_cmpge_ps(lv.v0, rv.v0);
01734         return ret;
01735       }
01736 
01737       inline icl256 operator==(const icl256 &lv, const icl256 &rv) {
01738         icl256 ret;
01739         ret.v0 = _mm_cmpeq_ps(lv.v0, rv.v0);
01740         ret.v1 = _mm_cmpeq_ps(lv.v1, rv.v1);
01741         return ret;
01742       }
01743 
01744       inline icl256 operator!=(const icl256 &lv, const icl256 &rv) {
01745         icl256 ret;
01746         ret.v0 = _mm_cmpneq_ps(lv.v0, rv.v0);
01747         ret.v1 = _mm_cmpneq_ps(lv.v1, rv.v1);
01748         return ret;
01749       }
01750 
01751       inline icl256 operator<(const icl256 &lv, const icl256 &rv) {
01752         icl256 ret;
01753         ret.v0 = _mm_cmplt_ps(lv.v0, rv.v0);
01754         ret.v1 = _mm_cmplt_ps(lv.v1, rv.v1);
01755         return ret;
01756       }
01757 
01758       inline icl256 operator>(const icl256 &lv, const icl256 &rv) {
01759         icl256 ret;
01760         ret.v0 = _mm_cmpgt_ps(lv.v0, rv.v0);
01761         ret.v1 = _mm_cmpgt_ps(lv.v1, rv.v1);
01762         return ret;
01763       }
01764 
01765       inline icl256 operator<=(const icl256 &lv, const icl256 &rv) {
01766         icl256 ret;
01767         ret.v0 = _mm_cmple_ps(lv.v0, rv.v0);
01768         ret.v1 = _mm_cmple_ps(lv.v1, rv.v1);
01769         return ret;
01770       }
01771 
01772       inline icl256 operator>=(const icl256 &lv, const icl256 &rv) {
01773         icl256 ret;
01774         ret.v0 = _mm_cmpge_ps(lv.v0, rv.v0);
01775         ret.v1 = _mm_cmpge_ps(lv.v1, rv.v1);
01776         return ret;
01777       }
01778 
01779       inline icl512 operator==(const icl512 &lv, const icl512 &rv) {
01780         icl512 ret;
01781         ret.v0 = _mm_cmpeq_ps(lv.v0, rv.v0);
01782         ret.v1 = _mm_cmpeq_ps(lv.v1, rv.v1);
01783         ret.v2 = _mm_cmpeq_ps(lv.v2, rv.v2);
01784         ret.v3 = _mm_cmpeq_ps(lv.v3, rv.v3);
01785         return ret;
01786       }
01787 
01788       inline icl512 operator!=(const icl512 &lv, const icl512 &rv) {
01789         icl512 ret;
01790         ret.v0 = _mm_cmpneq_ps(lv.v0, rv.v0);
01791         ret.v1 = _mm_cmpneq_ps(lv.v1, rv.v1);
01792         ret.v2 = _mm_cmpneq_ps(lv.v2, rv.v2);
01793         ret.v3 = _mm_cmpneq_ps(lv.v3, rv.v3);
01794         return ret;
01795       }
01796 
01797       inline icl512 operator<(const icl512 &lv, const icl512 &rv) {
01798         icl512 ret;
01799         ret.v0 = _mm_cmplt_ps(lv.v0, rv.v0);
01800         ret.v1 = _mm_cmplt_ps(lv.v1, rv.v1);
01801         ret.v2 = _mm_cmplt_ps(lv.v2, rv.v2);
01802         ret.v3 = _mm_cmplt_ps(lv.v3, rv.v3);
01803         return ret;
01804       }
01805 
01806       inline icl512 operator>(const icl512 &lv, const icl512 &rv) {
01807         icl512 ret;
01808         ret.v0 = _mm_cmpgt_ps(lv.v0, rv.v0);
01809         ret.v1 = _mm_cmpgt_ps(lv.v1, rv.v1);
01810         ret.v2 = _mm_cmpgt_ps(lv.v2, rv.v2);
01811         ret.v3 = _mm_cmpgt_ps(lv.v3, rv.v3);
01812         return ret;
01813       }
01814 
01815       inline icl512 operator<=(const icl512 &lv, const icl512 &rv) {
01816         icl512 ret;
01817         ret.v0 = _mm_cmple_ps(lv.v0, rv.v0);
01818         ret.v1 = _mm_cmple_ps(lv.v1, rv.v1);
01819         ret.v2 = _mm_cmple_ps(lv.v2, rv.v2);
01820         ret.v3 = _mm_cmple_ps(lv.v3, rv.v3);
01821         return ret;
01822       }
01823 
01824       inline icl512 operator>=(const icl512 &lv, const icl512 &rv) {
01825         icl512 ret;
01826         ret.v0 = _mm_cmpge_ps(lv.v0, rv.v0);
01827         ret.v1 = _mm_cmpge_ps(lv.v1, rv.v1);
01828         ret.v2 = _mm_cmpge_ps(lv.v2, rv.v2);
01829         ret.v3 = _mm_cmpge_ps(lv.v3, rv.v3);
01830         return ret;
01831       }
01832 
01833       // -- comparison operations -- //
01834 
01835       // ++ logical operations ++ //
01836 
01837       inline icl128 operator&(const icl128 &lv, const icl128 &rv) {
01838         icl128 ret;
01839         ret.v0 = _mm_and_ps(lv.v0, rv.v0);
01840         return ret;
01841       }
01842 
01843       inline icl128 operator|(const icl128 &lv, const icl128 &rv) {
01844         icl128 ret;
01845         ret.v0 = _mm_or_ps(lv.v0, rv.v0);
01846         return ret;
01847       }
01848 
01849       inline icl128 operator^(const icl128 &lv, const icl128 &rv) {
01850         icl128 ret;
01851         ret.v0 = _mm_xor_ps(lv.v0, rv.v0);
01852         return ret;
01853       }
01854 
01855       inline icl128 andnot(const icl128 &lv, const icl128 &rv) {
01856         icl128 ret;
01857         ret.v0 = _mm_andnot_ps(rv.v0, lv.v0);
01858         return ret;
01859       }
01860 
01861       inline Icl128i operator&(const Icl128i &lv, const Icl128i &rv) {
01862         Icl128i ret;
01863         ret.v0 = _mm_and_si128(lv.v0, rv.v0);
01864         return ret;
01865       }
01866 
01867       inline Icl128i operator|(const Icl128i &lv, const Icl128i &rv) {
01868         Icl128i ret;
01869         ret.v0 = _mm_or_si128(lv.v0, rv.v0);
01870         return ret;
01871       }
01872 
01873       inline Icl128i operator^(const Icl128i &lv, const Icl128i &rv) {
01874         Icl128i ret;
01875         ret.v0 = _mm_xor_si128(lv.v0, rv.v0);
01876         return ret;
01877       }
01878 
01879       inline Icl128i andnot(const Icl128i &lv, const Icl128i &rv) {
01880         Icl128i ret;
01881         ret.v0 = _mm_andnot_si128(rv.v0, lv.v0);
01882         return ret;
01883       }
01884 
01885       inline icl256 operator&(const icl256 &lv, const icl256 &rv) {
01886         icl256 ret;
01887         ret.v0 = _mm_and_ps(lv.v0, rv.v0);
01888         ret.v1 = _mm_and_ps(lv.v1, rv.v1);
01889         return ret;
01890       }
01891 
01892       inline icl256 operator|(const icl256 &lv, const icl256 &rv) {
01893         icl256 ret;
01894         ret.v0 = _mm_or_ps(lv.v0, rv.v0);
01895         ret.v1 = _mm_or_ps(lv.v1, rv.v1);
01896         return ret;
01897       }
01898 
01899       inline icl256 operator^(const icl256 &lv, const icl256 &rv) {
01900         icl256 ret;
01901         ret.v0 = _mm_xor_ps(lv.v0, rv.v0);
01902         ret.v1 = _mm_xor_ps(lv.v1, rv.v1);
01903         return ret;
01904       }
01905 
01906       inline icl256 andnot(const icl256 &lv, const icl256 &rv) {
01907         icl256 ret;
01908         ret.v0 = _mm_andnot_ps(rv.v0, lv.v0);
01909         ret.v1 = _mm_andnot_ps(rv.v1, lv.v1);
01910         return ret;
01911       }
01912 
01913       inline Icl256i operator&(const Icl256i &lv, const Icl256i &rv) {
01914         Icl256i ret;
01915         ret.v0 = _mm_and_si128(lv.v0, rv.v0);
01916         ret.v1 = _mm_and_si128(lv.v1, rv.v1);
01917         return ret;
01918       }
01919 
01920       inline Icl256i operator|(const Icl256i &lv, const Icl256i &rv) {
01921         Icl256i ret;
01922         ret.v0 = _mm_or_si128(lv.v0, rv.v0);
01923         ret.v1 = _mm_or_si128(lv.v1, rv.v1);
01924         return ret;
01925       }
01926 
01927       inline Icl256i operator^(const Icl256i &lv, const Icl256i &rv) {
01928         Icl256i ret;
01929         ret.v0 = _mm_xor_si128(lv.v0, rv.v0);
01930         ret.v1 = _mm_xor_si128(lv.v1, rv.v1);
01931         return ret;
01932       }
01933 
01934       inline Icl256i andnot(const Icl256i &lv, const Icl256i &rv) {
01935         Icl256i ret;
01936         ret.v0 = _mm_andnot_si128(rv.v0, lv.v0);
01937         ret.v1 = _mm_andnot_si128(rv.v1, lv.v1);
01938         return ret;
01939       }
01940 
01941       inline icl512 operator&(const icl512 &lv, const icl512 &rv) {
01942         icl512 ret;
01943         ret.v0 = _mm_and_ps(lv.v0, rv.v0);
01944         ret.v1 = _mm_and_ps(lv.v1, rv.v1);
01945         ret.v2 = _mm_and_ps(lv.v2, rv.v2);
01946         ret.v3 = _mm_and_ps(lv.v3, rv.v3);
01947         return ret;
01948       }
01949 
01950       inline icl512 operator|(const icl512 &lv, const icl512 &rv) {
01951         icl512 ret;
01952         ret.v0 = _mm_or_ps(lv.v0, rv.v0);
01953         ret.v1 = _mm_or_ps(lv.v1, rv.v1);
01954         ret.v2 = _mm_or_ps(lv.v2, rv.v2);
01955         ret.v3 = _mm_or_ps(lv.v3, rv.v3);
01956         return ret;
01957       }
01958 
01959       inline icl512 operator^(const icl512 &lv, const icl512 &rv) {
01960         icl512 ret;
01961         ret.v0 = _mm_xor_ps(lv.v0, rv.v0);
01962         ret.v1 = _mm_xor_ps(lv.v1, rv.v1);
01963         ret.v2 = _mm_xor_ps(lv.v2, rv.v2);
01964         ret.v3 = _mm_xor_ps(lv.v3, rv.v3);
01965         return ret;
01966       }
01967 
01968       inline icl512 andnot(const icl512 &lv, const icl512 &rv) {
01969         icl512 ret;
01970         ret.v0 = _mm_andnot_ps(rv.v0, lv.v0);
01971         ret.v1 = _mm_andnot_ps(rv.v1, lv.v1);
01972         ret.v2 = _mm_andnot_ps(rv.v2, lv.v2);
01973         ret.v3 = _mm_andnot_ps(rv.v3, lv.v3);
01974         return ret;
01975       }
01976 
01977       inline Icl512i operator&(const Icl512i &lv, const Icl512i &rv) {
01978         Icl512i ret;
01979         ret.v0 = _mm_and_si128(lv.v0, rv.v0);
01980         ret.v1 = _mm_and_si128(lv.v1, rv.v1);
01981         ret.v2 = _mm_and_si128(lv.v2, rv.v2);
01982         ret.v3 = _mm_and_si128(lv.v3, rv.v3);
01983         return ret;
01984       }
01985 
01986       inline Icl512i operator|(const Icl512i &lv, const Icl512i &rv) {
01987         Icl512i ret;
01988         ret.v0 = _mm_or_si128(lv.v0, rv.v0);
01989         ret.v1 = _mm_or_si128(lv.v1, rv.v1);
01990         ret.v2 = _mm_or_si128(lv.v2, rv.v2);
01991         ret.v3 = _mm_or_si128(lv.v3, rv.v3);
01992         return ret;
01993       }
01994 
01995       inline Icl512i operator^(const Icl512i &lv, const Icl512i &rv) {
01996         Icl512i ret;
01997         ret.v0 = _mm_xor_si128(lv.v0, rv.v0);
01998         ret.v1 = _mm_xor_si128(lv.v1, rv.v1);
01999         ret.v2 = _mm_xor_si128(lv.v2, rv.v2);
02000         ret.v3 = _mm_xor_si128(lv.v3, rv.v3);
02001         return ret;
02002       }
02003 
02004       inline Icl512i andnot(const Icl512i &lv, const Icl512i &rv) {
02005         Icl512i ret;
02006         ret.v0 = _mm_andnot_si128(rv.v0, lv.v0);
02007         ret.v1 = _mm_andnot_si128(rv.v1, lv.v1);
02008         ret.v2 = _mm_andnot_si128(rv.v2, lv.v2);
02009         ret.v3 = _mm_andnot_si128(rv.v3, lv.v3);
02010         return ret;
02011       }
02012 
02013       // -- logical operations -- //
02014 
02015       // ++ shift operations ++ //
02016 
02017       inline Icl128i& operator<<(Icl128i &v, const int i) {
02018         v.v0 = _mm_slli_epi32(v.v0, i);
02019         return v;
02020       }
02021 
02022       inline Icl128i& operator>>(Icl128i &v, const int i) {
02023         v.v0 = _mm_srai_epi32(v.v0, i);
02024         return v;
02025       }
02026 
02027       inline Icl256i& operator<<(Icl256i &v, const int i) {
02028         v.v0 = _mm_slli_epi32(v.v0, i);
02029         v.v1 = _mm_slli_epi32(v.v1, i);
02030         return v;
02031       }
02032 
02033       inline Icl256i& operator>>(Icl256i &v, const int i) {
02034         v.v0 = _mm_srai_epi32(v.v0, i);
02035         v.v1 = _mm_srai_epi32(v.v1, i);
02036         return v;
02037       }
02038 
02039       inline Icl512i& operator<<(Icl512i &v, const int i) {
02040         v.v0 = _mm_slli_epi32(v.v0, i);
02041         v.v1 = _mm_slli_epi32(v.v1, i);
02042         v.v2 = _mm_slli_epi32(v.v2, i);
02043         v.v3 = _mm_slli_epi32(v.v3, i);
02044         return v;
02045       }
02046 
02047       inline Icl512i& operator>>(Icl512i &v, const int i) {
02048         v.v0 = _mm_srai_epi32(v.v0, i);
02049         v.v1 = _mm_srai_epi32(v.v1, i);
02050         v.v2 = _mm_srai_epi32(v.v2, i);
02051         v.v3 = _mm_srai_epi32(v.v3, i);
02052         return v;
02053       }
02054 
02055       // -- shift operations -- //
02056 
02057       // ++ min-max operations ++ //
02058 
02059       inline icl128i8u min(const icl128i8u &lv, const icl128i8u &rv) {
02060         icl128i8u ret;
02061         ret.v0 = _mm_min_epu8(lv.v0, rv.v0);
02062         return ret;
02063       }
02064 
02065       inline icl128i8u max(const icl128i8u &lv, const icl128i8u &rv) {
02066         icl128i8u ret;
02067         ret.v0 = _mm_max_epu8(lv.v0, rv.v0);
02068         return ret;
02069       }
02070 
02071       inline icl128i16s min(const icl128i16s &lv, const icl128i16s &rv) {
02072         icl128i16s ret;
02073         ret.v0 = _mm_min_epi16(lv.v0, rv.v0);
02074         return ret;
02075       }
02076 
02077       inline icl128i16s max(const icl128i16s &lv, const icl128i16s &rv) {
02078         icl128i16s ret;
02079         ret.v0 = _mm_max_epi16(lv.v0, rv.v0);
02080         return ret;
02081       }
02082 
02083       inline icl256i16s min(const icl256i16s &lv, const icl256i16s &rv) {
02084         icl256i16s ret;
02085         ret.v0 = _mm_min_epi16(lv.v0, rv.v0);
02086         ret.v1 = _mm_min_epi16(lv.v1, rv.v1);
02087         return ret;
02088       }
02089 
02090       inline icl256i16s max(const icl256i16s &lv, const icl256i16s &rv) {
02091         icl256i16s ret;
02092         ret.v0 = _mm_max_epi16(lv.v0, rv.v0);
02093         ret.v1 = _mm_max_epi16(lv.v1, rv.v1);
02094         return ret;
02095       }
02096 
02097       inline icl128 min(const icl128 &lv, const icl128 &rv) {
02098         icl128 ret;
02099         ret.v0 = _mm_min_ps(lv.v0, rv.v0);
02100         return ret;
02101       }
02102 
02103       inline icl128 max(const icl128 &lv, const icl128 &rv) {
02104         icl128 ret;
02105         ret.v0 = _mm_max_ps(lv.v0, rv.v0);
02106         return ret;
02107       }
02108 
02109       inline icl256 min(const icl256 &lv, const icl256 &rv) {
02110         icl256 ret;
02111         ret.v0 = _mm_min_ps(lv.v0, rv.v0);
02112         ret.v1 = _mm_min_ps(lv.v1, rv.v1);
02113         return ret;
02114       }
02115 
02116       inline icl256 max(const icl256 &lv, const icl256 &rv) {
02117         icl256 ret;
02118         ret.v0 = _mm_max_ps(lv.v0, rv.v0);
02119         ret.v1 = _mm_max_ps(lv.v1, rv.v1);
02120         return ret;
02121       }
02122 
02123       inline icl512 min(const icl512 &lv, const icl512 &rv) {
02124         icl512 ret;
02125         ret.v0 = _mm_min_ps(lv.v0, rv.v0);
02126         ret.v1 = _mm_min_ps(lv.v1, rv.v1);
02127         ret.v2 = _mm_min_ps(lv.v2, rv.v2);
02128         ret.v3 = _mm_min_ps(lv.v3, rv.v3);
02129         return ret;
02130       }
02131 
02132       inline icl512 max(const icl512 &lv, const icl512 &rv) {
02133         icl512 ret;
02134         ret.v0 = _mm_max_ps(lv.v0, rv.v0);
02135         ret.v1 = _mm_max_ps(lv.v1, rv.v1);
02136         ret.v2 = _mm_max_ps(lv.v2, rv.v2);
02137         ret.v3 = _mm_max_ps(lv.v3, rv.v3);
02138         return ret;
02139       }
02140 
02141       // -- min-max operations -- //
02142 
02143 
02144       // ++ absolute values ++ //
02145 
02146     #ifdef ICL_HAVE_SSE3
02147       inline icl128i8u abs(const icl128i8u &v) {
02148         icl128i8u ret;
02149         ret.v0 = _mm_abs_epi8(v.v0);
02150         return ret;
02151       }
02152 
02153       inline icl128i16s abs(const icl128i16s &v) {
02154         icl128i16s ret;
02155         ret.v0 = _mm_abs_epi16(v.v0);
02156         return ret;
02157       }
02158 
02159       inline icl128i32s abs(const icl128i32s &v) {
02160         icl128i32s ret;
02161         ret.v0 = _mm_abs_epi32(v.v0);
02162         return ret;
02163       }
02164 
02165       inline icl256i16s abs(const icl256i16s &v) {
02166         icl256i16s ret;
02167         ret.v0 = _mm_abs_epi16(v.v0);
02168         ret.v1 = _mm_abs_epi16(v.v1);
02169         return ret;
02170       }
02171 
02172       inline icl256i32s abs(const icl256i32s &v) {
02173         icl256i32s ret;
02174         ret.v0 = _mm_abs_epi32(v.v0);
02175         ret.v1 = _mm_abs_epi32(v.v1);
02176         return ret;
02177       }
02178 
02179       inline icl512i32s abs(const icl512i32s &v) {
02180         icl512i32s ret;
02181         ret.v0 = _mm_abs_epi32(v.v0);
02182         ret.v1 = _mm_abs_epi32(v.v1);
02183         ret.v2 = _mm_abs_epi32(v.v2);
02184         ret.v3 = _mm_abs_epi32(v.v3);
02185         return ret;
02186       }
02187     #else
02188       // TODO: without SSE3
02189     #endif
02190 
02191       inline icl128 abs(const icl128 &v) {
02192         icl128 ret;
02193         ret.v0 = _mm_andnot_ps(icl128(-0.0f), v.v0);
02194         return ret;
02195       }
02196 
02197       inline icl256 abs(const icl256 &v) {
02198         icl128 tmp(-0.0f);
02199         icl256 ret;
02200         ret.v0 = _mm_andnot_ps(tmp.v0, v.v0);
02201         ret.v1 = _mm_andnot_ps(tmp.v0, v.v1);
02202         return ret;
02203       }
02204 
02205       inline icl512 abs(const icl512 &v) {
02206         icl128 tmp(-0.0f);
02207         icl512 ret;
02208         ret.v0 = _mm_andnot_ps(tmp.v0, v.v0);
02209         ret.v1 = _mm_andnot_ps(tmp.v0, v.v1);
02210         ret.v2 = _mm_andnot_ps(tmp.v0, v.v2);
02211         ret.v3 = _mm_andnot_ps(tmp.v0, v.v3);
02212         return ret;
02213       }
02214 
02215       // -- absolute values -- //
02216 
02217 
02218       // ++ square root ++ //
02219 
02220       inline icl128 sqrt(const icl128 &v) {
02221         icl128 r;
02222         r.v0 = _mm_sqrt_ps(v.v0);
02223         return r;
02224       }
02225 
02226       inline icl256 sqrt(const icl256 &v) {
02227         icl256 r;
02228         r.v0 = _mm_sqrt_ps(v.v0);
02229         r.v1 = _mm_sqrt_ps(v.v1);
02230         return r;
02231       }
02232 
02233       inline icl512 sqrt(const icl512 &v) {
02234         icl512 r;
02235         r.v0 = _mm_sqrt_ps(v.v0);
02236         r.v1 = _mm_sqrt_ps(v.v1);
02237         r.v2 = _mm_sqrt_ps(v.v2);
02238         r.v3 = _mm_sqrt_ps(v.v3);
02239         return r;
02240       }
02241 
02242       inline icl128d sqrt(const icl128d &v) {
02243         icl128d r;
02244         r.v0 = _mm_sqrt_pd(v.v0);
02245         return r;
02246       }
02247 
02248       inline icl256d sqrt(const icl256d &v) {
02249         icl256d r;
02250         r.v0 = _mm_sqrt_pd(v.v0);
02251         r.v1 = _mm_sqrt_pd(v.v1);
02252         return r;
02253       }
02254 
02255       inline icl512d sqrt(const icl512d &v) {
02256         icl512d r;
02257         r.v0 = _mm_sqrt_pd(v.v0);
02258         r.v1 = _mm_sqrt_pd(v.v1);
02259         r.v2 = _mm_sqrt_pd(v.v2);
02260         r.v3 = _mm_sqrt_pd(v.v3);
02261         return r;
02262       }
02263 
02264       inline icl1024d sqrt(const icl1024d &v) {
02265         icl1024d r;
02266         r.v0 = _mm_sqrt_pd(v.v0);
02267         r.v1 = _mm_sqrt_pd(v.v1);
02268         r.v2 = _mm_sqrt_pd(v.v2);
02269         r.v3 = _mm_sqrt_pd(v.v3);
02270         r.v4 = _mm_sqrt_pd(v.v4);
02271         r.v5 = _mm_sqrt_pd(v.v5);
02272         r.v6 = _mm_sqrt_pd(v.v6);
02273         r.v7 = _mm_sqrt_pd(v.v7);
02274         return r;
02275       }
02276 
02277       // -- square root -- //
02278 
02279 
02280       // ++ cube root ++ //
02281 
02282       inline icl128 cbrt(const icl128 &v) {
02283         icl128i32s tmp = icl128i32s(_mm_castps_si128(v));
02284         tmp = tmp / icl128i32s(3) + icl128i32s(709921077);
02285         icl128 a = icl128(_mm_castsi128_ps(tmp));
02286         icl128 a3 = a * a * a;
02287         return a * (a3 + v + v) * (a3 + a3 + v).rcp();
02288       }
02289 
02290       inline icl256 cbrt(const icl256 &v) {
02291         __m128i t0 = _mm_castps_si128(v.v0);
02292         __m128i t1 = _mm_castps_si128(v.v1);
02293         icl256i32s tmp = icl256i32s(t0, t1);
02294         tmp = tmp / icl256i32s(3) + icl256i32s(709921077);
02295         icl256 a = icl256(_mm_castsi128_ps(tmp.v0),
02296                           _mm_castsi128_ps(tmp.v1));
02297         icl256 a3 = a * a * a;
02298         return a * (a3 + v + v) * (a3 + a3 + v).rcp();
02299       }
02300 
02301       inline icl512 cbrt(const icl512 &v) {
02302         __m128i t0 = _mm_castps_si128(v.v0);
02303         __m128i t1 = _mm_castps_si128(v.v1);
02304         __m128i t2 = _mm_castps_si128(v.v2);
02305         __m128i t3 = _mm_castps_si128(v.v3);
02306         icl512i32s tmp = icl512i32s(t0, t1, t2, t3);
02307         tmp = tmp / icl512i32s(3) + icl512i32s(709921077);
02308         icl512 a = icl512(_mm_castsi128_ps(tmp.v0),
02309                           _mm_castsi128_ps(tmp.v1),
02310                           _mm_castsi128_ps(tmp.v2),
02311                           _mm_castsi128_ps(tmp.v3));
02312         icl512 a3 = a * a * a;
02313         return a * (a3 + v + v) * (a3 + a3 + v).rcp();
02314       }
02315 
02316       // -- cube root -- //
02317 
02318       // -- operations on SSE types -- //
02319 
02320       typedef icl128 icl32fx4;
02321       typedef icl256 icl32fx8;
02322       typedef icl512 icl32fx16;
02323       typedef icl128i8u icl8ux16;
02324       typedef icl128i16s icl16sx8;
02325       typedef icl128i32s icl32sx4;
02326       typedef icl256i16s icl16sx16;
02327       typedef icl256i32s icl32sx8;
02328       typedef icl512i32s icl32sx16;
02329       typedef icl128d icl64fx2;
02330       typedef icl256d icl64fx4;
02331       typedef icl512d icl64fx8;
02332       typedef icl1024d icl64fx16;
02333 
02334     #endif
02335 
02336   } // namespace utils
02337 }