arithmetic_sse_double.h

/*
 *      SSE2 implementation of vector operations (64bit double).
 *
 * Copyright (c) 2007, Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id: arithmetic_sse_double.h 2 2007-10-20 01:38:42Z naoaki $ */

#include <stdlib.h>
#include <malloc.h>
#include <memory.h>

#if     1400 <= _MSC_VER
#include <intrin.h>
#else
#include <emmintrin.h>  /* SSE2 intrinsics (__m128d, _mm_*_pd) for compilers without <intrin.h> */
#endif

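/*
 * Allocate a zero-initialized block of memory aligned on a 16-byte boundary,
 * as required by the aligned loads/stores (_mm_load_pd/_mm_store_pd) below.
 */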
inline static void* vecalloc(size_t size)
{
    void *memblock = _aligned_malloc(size, 16);
    if (memblock != NULL) {
        memset(memblock, 0, size);
    }
    return memblock;
}

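/* Release a block of memory obtained from vecalloc(). */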
inline static void vecfree(void *memblock)
{
    _aligned_free(memblock);
}

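/*
 * Evaluates to non-zero if and only if *x and *y have different signs:
 * _mm_movemask_pd() packs the two sign bits into bits 0 (*y) and 1 (*x),
 * and adding 1 leaves bit 1 set exactly when the two sign bits differ.
 */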
#define fsigndiff(x, y) \
    ((_mm_movemask_pd(_mm_set_pd(*(x), *(y))) + 1) & 0x002)

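/*
 * x[i] = c for 0 <= i < n.  The loop is unrolled by 8, so n is assumed to be
 * a multiple of 8; x must be 16-byte aligned (e.g. allocated with vecalloc).
 */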
#define vecset(x, c, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 8) { \
        _mm_store_pd((x)+i  , XMM0); \
        _mm_store_pd((x)+i+2, XMM0); \
        _mm_store_pd((x)+i+4, XMM0); \
        _mm_store_pd((x)+i+6, XMM0); \
    } \
}

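/* y[i] = x[i] for 0 <= i < n (unrolled by 8). */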
#define veccpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        _mm_store_pd((y)+i  , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
        _mm_store_pd((y)+i+4, XMM2); \
        _mm_store_pd((y)+i+6, XMM3); \
    } \
}

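/* y[i] = -x[i] for 0 <= i < n (unrolled by 8). */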
#define vecncpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_setzero_pd(); \
        __m128d XMM1 = _mm_setzero_pd(); \
        __m128d XMM2 = _mm_setzero_pd(); \
        __m128d XMM3 = _mm_setzero_pd(); \
        __m128d XMM4 = _mm_load_pd((x)+i  ); \
        __m128d XMM5 = _mm_load_pd((x)+i+2); \
        __m128d XMM6 = _mm_load_pd((x)+i+4); \
        __m128d XMM7 = _mm_load_pd((x)+i+6); \
        XMM0 = _mm_sub_pd(XMM0, XMM4); \
        XMM1 = _mm_sub_pd(XMM1, XMM5); \
        XMM2 = _mm_sub_pd(XMM2, XMM6); \
        XMM3 = _mm_sub_pd(XMM3, XMM7); \
        _mm_store_pd((y)+i  , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
        _mm_store_pd((y)+i+4, XMM2); \
        _mm_store_pd((y)+i+6, XMM3); \
    } \
}

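/* y[i] += c * x[i] for 0 <= i < n (unrolled by 4). */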
#define vecadd(y, x, c, n) \
{ \
    int i; \
    __m128d XMM7 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 4) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((y)+i  ); \
        __m128d XMM3 = _mm_load_pd((y)+i+2); \
        XMM0 = _mm_mul_pd(XMM0, XMM7); \
        XMM1 = _mm_mul_pd(XMM1, XMM7); \
        XMM2 = _mm_add_pd(XMM2, XMM0); \
        XMM3 = _mm_add_pd(XMM3, XMM1); \
        _mm_store_pd((y)+i  , XMM2); \
        _mm_store_pd((y)+i+2, XMM3); \
    } \
}

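/* z[i] = x[i] - y[i] for 0 <= i < n (unrolled by 8). */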
#define vecdiff(z, x, y, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        __m128d XMM4 = _mm_load_pd((y)+i  ); \
        __m128d XMM5 = _mm_load_pd((y)+i+2); \
        __m128d XMM6 = _mm_load_pd((y)+i+4); \
        __m128d XMM7 = _mm_load_pd((y)+i+6); \
        XMM0 = _mm_sub_pd(XMM0, XMM4); \
        XMM1 = _mm_sub_pd(XMM1, XMM5); \
        XMM2 = _mm_sub_pd(XMM2, XMM6); \
        XMM3 = _mm_sub_pd(XMM3, XMM7); \
        _mm_store_pd((z)+i  , XMM0); \
        _mm_store_pd((z)+i+2, XMM1); \
        _mm_store_pd((z)+i+4, XMM2); \
        _mm_store_pd((z)+i+6, XMM3); \
    } \
}

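/* y[i] *= c for 0 <= i < n (unrolled by 4). */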
#define vecscale(y, c, n) \
{ \
    int i; \
    __m128d XMM7 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 4) { \
        __m128d XMM0 = _mm_load_pd((y)+i  ); \
        __m128d XMM1 = _mm_load_pd((y)+i+2); \
        XMM0 = _mm_mul_pd(XMM0, XMM7); \
        XMM1 = _mm_mul_pd(XMM1, XMM7); \
        _mm_store_pd((y)+i  , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
    } \
}

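/* y[i] *= x[i] for 0 <= i < n (unrolled by 8). */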
#define vecmul(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        __m128d XMM4 = _mm_load_pd((y)+i  ); \
        __m128d XMM5 = _mm_load_pd((y)+i+2); \
        __m128d XMM6 = _mm_load_pd((y)+i+4); \
        __m128d XMM7 = _mm_load_pd((y)+i+6); \
        XMM4 = _mm_mul_pd(XMM4, XMM0); \
        XMM5 = _mm_mul_pd(XMM5, XMM1); \
        XMM6 = _mm_mul_pd(XMM6, XMM2); \
        XMM7 = _mm_mul_pd(XMM7, XMM3); \
        _mm_store_pd((y)+i  , XMM4); \
        _mm_store_pd((y)+i+2, XMM5); \
        _mm_store_pd((y)+i+4, XMM6); \
        _mm_store_pd((y)+i+6, XMM7); \
    } \
}

#if     3 <= __SSE__
/*
    Horizontal add with the haddps SSE3 instruction. The work register (rw)
    is unused.  This helper operates on single-precision registers and is not
    used by the double-precision routines below.
 */
#define __horizontal_sum(r, rw) \
    r = _mm_hadd_ps(r, r); \
    r = _mm_hadd_ps(r, r);

#else
/*
    Horizontal add with SSE instructions. The work register (rw) is used.
 */
#define __horizontal_sum(r, rw) \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
    r = _mm_add_ps(r, rw); \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
    r = _mm_add_ps(r, rw);

#endif

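/*
 * *s = sum_i x[i]*y[i] for 0 <= i < n (unrolled by 4).  Two partial sums are
 * kept in XMM0/XMM1 and reduced to a scalar with a final shuffle and add.
 */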
#define vecdot(s, x, y, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i  ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = _mm_load_pd((y)+i  ); \
        XMM5 = _mm_load_pd((y)+i+2); \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    _mm_store_sd((s), XMM0); \
}

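/* *s = sqrt(sum_i x[i]*x[i]), the Euclidean norm of x (unrolled by 4). */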
#define vecnorm(s, x, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i  ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = XMM2; \
        XMM5 = XMM3; \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM0 = _mm_sqrt_pd(XMM0); \
    _mm_store_sd((s), XMM0); \
}

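/* *s = 1 / sqrt(sum_i x[i]*x[i]), the reciprocal of the Euclidean norm of x. */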
#define vecrnorm(s, x, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i  ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = XMM2; \
        XMM5 = XMM3; \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM2 = _mm_set1_pd(1.0); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM0 = _mm_sqrt_pd(XMM0); \
    XMM2 = _mm_div_pd(XMM2, XMM0); \
    _mm_store_sd((s), XMM2); \
}
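
/*
 * Usage sketch (illustrative only, not part of the original interface; the
 * function name is hypothetical).  Buffers are assumed to be 16-byte aligned,
 * e.g. obtained from vecalloc(), and n is assumed to be a multiple of 8 as
 * required by the unrolled loops above.
 */
inline static double example_dot_over_norm(const double *x, const double *y, int n)
{
    double dot, rnorm;
    vecdot(&dot, x, y, n);      /* dot   = x . y       */
    vecrnorm(&rnorm, x, n);     /* rnorm = 1 / ||x||   */
    return dot * rnorm;
}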
