SSE Intrinsics各函数介绍

2023-11-01

SIMD相关头文件包括：

//#include <ivec.h>//MMX
//#include <fvec.h>//SSE(also include ivec.h)
//#include <dvec.h>//SSE2(also include fvec.h)


#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h>//SSSE3(include pmmintrin.h)
#include <smmintrin.h>//SSE4.1(include tmmintrin.h)
#include <nmmintrin.h>//SSE4.2(include smmintrin.h)
#include <wmmintrin.h>//AES(include nmmintrin.h)
#include <immintrin.h>//AVX(include wmmintrin.h)
#include <intrin.h>//(include immintrin.h)

mmintrin.h为MMX头文件，其中__m64的定义为：

/*
 * __m64: the 64-bit MMX vector type as declared in mmintrin.h (MSVC dialect).
 * It is a union, so every member below is a different view of the same
 * 8 bytes of storage. _CRT_ALIGN(8) requests 8-byte alignment for the type.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;       /* whole value as one unsigned 64-bit integer */
    float               m64_f32[2];    /* two single-precision floats */
    __int8              m64_i8[8];     /* eight signed 8-bit integers */
    __int16             m64_i16[4];    /* four signed 16-bit integers */
    __int32             m64_i32[2];    /* two signed 32-bit integers */
    __int64             m64_i64;       /* whole value as one signed 64-bit integer */
    unsigned __int8     m64_u8[8];     /* eight unsigned 8-bit integers */
    unsigned __int16    m64_u16[4];    /* four unsigned 16-bit integers */
    unsigned __int32    m64_u32[2];    /* two unsigned 32-bit integers */
} __m64;

xmmintrin.h为SSE头文件，此头文件里包含MMX头文件，其中__m128的定义为：

/*
 * __m128: the 128-bit SSE vector type as declared in xmmintrin.h (MSVC dialect).
 * It is a union, so every member below is a different view of the same
 * 16 bytes of storage. _CRT_ALIGN(16) requests 16-byte alignment for the type.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
     float               m128_f32[4];   /* four single-precision floats */
     unsigned __int64    m128_u64[2];   /* two unsigned 64-bit integers */
     __int8              m128_i8[16];   /* sixteen signed 8-bit integers */
     __int16             m128_i16[8];   /* eight signed 16-bit integers */
     __int32             m128_i32[4];   /* four signed 32-bit integers */
     __int64             m128_i64[2];   /* two signed 64-bit integers */
     unsigned __int8     m128_u8[16];   /* sixteen unsigned 8-bit integers */
     unsigned __int16    m128_u16[8];   /* eight unsigned 16-bit integers */
     unsigned __int32    m128_u32[4];   /* four unsigned 32-bit integers */
 } __m128;

xmmintrin.h文件中各函数的介绍：

	/*----------Floating Point Intrinsics Using Streaming SIMD Extensions------------*/
	//Arithmetic Operations(Floating Point ):add、sub、mul、div、sqrt、rcp、min、max
	//---------------------说明：_ps结尾的指令表示对4个单精度浮点数同时进行运算，
	//_ss结尾的指令表示仅对4个单精度浮点数最低位的浮点数进行运算---------------------
	//返回一个__m128的寄存器，仅将寄存器_A和寄存器_B最低对应位置的32bit单精度浮点数相加，
	//其余位置取寄存器_A中的数据,例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器为r=(_A0+_B0, _A1, _A2, _A3)
	extern __m128 _mm_add_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，将寄存器_A和_B的对应位置的32bit单精度浮点数相加，
	//例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器r0=_A0+_B0, r1=_A1+_B1, r2=_A2+_B2, r3=_A3+_B3
	extern __m128 _mm_add_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，仅将寄存器_A和寄存器_B最低对应位置的32bit单精度浮点数相减，
	//其余位置取寄存器_A中的数据,例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器为r=(_A0-_B0, _A1, _A2, _A3)
	extern __m128 _mm_sub_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，将寄存器_A和_B的对应位置的32bit单精度浮点数相减，
	//例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器r0=_A0-_B0, r1=_A1-_B1, r2=_A2-_B2, r3=_A3-_B3
	extern __m128 _mm_sub_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，仅将寄存器_A和寄存器_B最低对应位置的32bit单精度浮点数相乘，
	//其余位置取寄存器_A中的数据,例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器为r=(_A0*_B0, _A1, _A2, _A3)
	extern __m128 _mm_mul_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，将寄存器_A和_B的对应位置的32bit单精度浮点数相乘，
	//例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器r0=_A0*_B0, r1=_A1*_B1, r2=_A2*_B2, r3=_A3*_B3
	extern __m128 _mm_mul_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，仅将寄存器_A和寄存器_B最低对应位置的32bit单精度浮点数相除，
	//其余位置取寄存器_A中的数据,例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器为r=(_A0/_B0, _A1, _A2, _A3)
	extern __m128 _mm_div_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，将寄存器_A和_B的对应位置的32bit单精度浮点数相除，
	//例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器r0=_A0/_B0, r1=_A1/_B1, r2=_A2/_B2, r3=_A3/_B3
	extern __m128 _mm_div_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，仅将寄存器_A最低对应位置的32bit单精度浮点数开平方，
	//其余位置取寄存器_A中的数据,例如_A=(_A0,_A1,_A2,_A3)
	//则返回寄存器为r=(sqrt(_A0), _A1, _A2, _A3)
	extern __m128 _mm_sqrt_ss(__m128 _A);
	//返回一个__m128的寄存器，将寄存器_A中4个32bit单精度浮点数开平方，
	//例如_A=(_A0,_A1,_A2,_A3)，则返回寄存器为
	//r=(sqrt(_A0), sqrt(_A1), sqrt(_A2), sqrt(_A3))
	extern __m128 _mm_sqrt_ps(__m128 _A);
	//返回一个__m128的寄存器，仅将寄存器_A最低对应位置的32bit单精度浮点数取倒数，
	//其余位置取寄存器_A中的数据,例如_A=(_A0,_A1,_A2,_A3)
	//则返回寄存器为r=(recip(_A0), _A1, _A2, _A3)
	//注意：结果是硬件近似值(RCPSS指令，最大相对误差不超过1.5*2^-12)，并非精确的1.0/_A0
	extern __m128 _mm_rcp_ss(__m128 _A);
	//返回一个__m128的寄存器，将寄存器_A中4个32bit单精度浮点数取倒数，
	//例如_A=(_A0,_A1,_A2,_A3)，则返回寄存器为
	//r=(recip(_A0), recip(_A1), recip(_A2), recip(_A3))
	//注意：结果是硬件近似值(RCPPS指令，最大相对误差不超过1.5*2^-12)，并非精确的倒数
	extern __m128 _mm_rcp_ps(__m128 _A);
	//返回一个__m128的寄存器，仅将寄存器_A最低对应位置的32bit单精度浮点数取平方根的倒数，
	//其余位置取寄存器_A中的数据,例如_A=(_A0,_A1,_A2,_A3)
	//则返回寄存器为r=(recip(sqrt(_A0)), _A1, _A2, _A3)
	//注意：结果是硬件近似值(RSQRTSS指令，最大相对误差不超过1.5*2^-12)，并非精确值
	extern __m128 _mm_rsqrt_ss(__m128 _A);
	//返回一个__m128的寄存器，将寄存器_A中4个32bit单精度浮点数取平方根的倒数，
	//例如_A=(_A0,_A1,_A2,_A3)，则返回寄存器为
	//r=(recip(sqrt(_A0)), recip(sqrt(_A1)), recip(sqrt(_A2)), recip(sqrt(_A3)))
	//注意：结果是硬件近似值(RSQRTPS指令，最大相对误差不超过1.5*2^-12)，并非精确值
	extern __m128 _mm_rsqrt_ps(__m128 _A);
	//返回一个__m128的寄存器，仅将寄存器_A和寄存器_B最低对应位置的32bit单精度浮点数取最小值，
	//其余位置取寄存器_A中的数据,例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器为r=(min(_A0,_B0), _A1, _A2, _A3)
	extern __m128 _mm_min_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，将寄存器_A和_B的对应位置的32bit单精度浮点数取最小值，
	//例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器r0=min(_A0,_B0), r1=min(_A1,_B1), r2=min(_A2,_B2), r3=min(_A3,_B3)
	extern __m128 _mm_min_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，仅将寄存器_A和寄存器_B最低对应位置的32bit单精度浮点数取最大值，
	//其余位置取寄存器_A中的数据,例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器为r=(max(_A0,_B0), _A1, _A2, _A3)
	extern __m128 _mm_max_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，将寄存器_A和_B的对应位置的32bit单精度浮点数取最大值，
	//例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器r0=max(_A0,_B0), r1=max(_A1,_B1), r2=max(_A2,_B2), r3=max(_A3,_B3)
	extern __m128 _mm_max_ps(__m128 _A, __m128 _B);

	//Logical Operations(SSE)：and、andnot、or、xor
	//返回一个__m128的寄存器，将寄存器_A和_B的对应位置的32bit单精度浮点数分别进行按位与运算，
	//例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器r0=_A0 & _B0, r1=_A1 & _B1, r2=_A2 & _B2, r3=_A3 & _B3
	extern __m128 _mm_and_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，将寄存器_A对应位置的32bit单精度浮点数的非和寄存器_B对应位置的32bit
	//单精度浮点数分别进行按位与运算，例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器r0=~_A0 & _B0, r1=~_A1 & _B1, r2=~_A2 & _B2, r3=~_A3 & _B3
	extern __m128 _mm_andnot_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，将寄存器_A和_B的对应位置的32bit单精度浮点数分别进行按位或运算，
	//例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器r0=_A0 | _B0, r1=_A1 | _B1, r2=_A2 | _B2, r3=_A3 | _B3
	extern __m128 _mm_or_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，将寄存器_A和_B的对应位置的32bit单精度浮点数分别进行按位异或运算，
	//例如_A=(_A0,_A1,_A2,_A3), _B=(_B0,_B1,_B2,_B3),
	//则返回寄存器r0=_A0 ^ _B0, r1=_A1 ^ _B1, r2=_A2 ^ _B2, r3=_A3 ^ _B3
	extern __m128 _mm_xor_ps(__m128 _A, __m128 _B);

	//Comparison Intrinsics(SSE):==、<、<=、>、>=、!=、不小于、不小于等于、不大于、不大于等于
	//返回一个__m128的寄存器，Compares for equality,
	//r0=(_A0 == _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmpeq_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for equality,
	//r0=(_A0 == _B0) ? 0xffffffff : 0x0, r1=(_A1 == _B1) ? 0xffffffff : 0x0, 
	//r2=(_A2 == _B2) ? 0xffffffff : 0x0, r3=(_A3 == _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmpeq_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for less than,
	//r0=(_A0 < _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmplt_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for less than,
	//r0=(_A0 < _B0) ? 0xffffffff : 0x0, r1=(_A1 < _B1) ? 0xffffffff : 0x0, 
	//r2=(_A2 < _B2) ? 0xffffffff : 0x0, r3=(_A3 < _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmplt_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for less than or equal,
	//r0=(_A0 <= _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmple_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for less than or equal,
	//r0=(_A0 <= _B0) ? 0xffffffff : 0x0, r1=(_A1 <= _B1) ? 0xffffffff : 0x0, 
	//r2=(_A2 <= _B2) ? 0xffffffff : 0x0, r3=(_A3 <= _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmple_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for greater than,
	//r0=(_A0 > _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmpgt_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for greater than,
	//r0=(_A0 > _B0) ? 0xffffffff : 0x0, r1=(_A1 > _B1) ? 0xffffffff : 0x0, 
	//r2=(_A2 > _B2) ? 0xffffffff : 0x0, r3=(_A3 > _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmpgt_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for greater than or equal,
	//r0=(_A0 >= _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmpge_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for greater than or equal,
	//r0=(_A0 >= _B0) ? 0xffffffff : 0x0, r1=(_A1 >= _B1) ? 0xffffffff : 0x0, 
	//r2=(_A2 >= _B2) ? 0xffffffff : 0x0, r3=(_A3 >= _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmpge_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for inequality,
	//r0=(_A0 != _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmpneq_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for inequality,
	//r0=(_A0 != _B0) ? 0xffffffff : 0x0, r1=(_A1 != _B1) ? 0xffffffff : 0x0, 
	//r2=(_A2 != _B2) ? 0xffffffff : 0x0, r3=(_A3 != _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmpneq_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for not less than,
	//r0= !(_A0 < _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmpnlt_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for not less than,
	//r0=!(_A0 < _B0) ? 0xffffffff : 0x0, r1=!(_A1 < _B1) ? 0xffffffff : 0x0, 
	//r2=!(_A2 < _B2) ? 0xffffffff : 0x0, r3=!(_A3 < _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmpnlt_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for not less than or equal
	//r0= !(_A0 <= _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmpnle_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for not less than or equal
	//r0=!(_A0 <= _B0) ? 0xffffffff : 0x0, r1=!(_A1 <= _B1) ? 0xffffffff : 0x0, 
	//r2=!(_A2 <= _B2) ? 0xffffffff : 0x0, r3=!(_A3 <= _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmpnle_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for not greater than,
	//r0=!(_A0 > _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmpngt_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for not greater than,
	//r0=!(_A0 > _B0) ? 0xffffffff : 0x0, r1=!(_A1 > _B1) ? 0xffffffff : 0x0, 
	//r2=!(_A2 > _B2) ? 0xffffffff : 0x0, r3=!(_A3 > _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmpngt_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for not greater than or equal,
	//r0=!(_A0 >= _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmpnge_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for not greater than or equal,
	//r0=!(_A0 >= _B0) ? 0xffffffff : 0x0, r1=!(_A1 >= _B1) ? 0xffffffff : 0x0, 
	//r2=!(_A2 >= _B2) ? 0xffffffff : 0x0, r3=!(_A3 >= _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmpnge_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for ordered,
	//r0=(_A0 ord? _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cmpord_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for ordered,
	//r0=(_A0 ord? _B0) ? 0xffffffff : 0x0, r1=(_A1 ord? _B1) ? 0xffffffff : 0x0, 
	//r2=(_A2 ord? _B2) ? 0xffffffff : 0x0, r3=(_A3 ord? _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmpord_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for unordered,
	//r0=(_A0 unord? _B0) ? 0xffffffff : 0x0, r1=_A1, r2=_A2, r3=_A3	
	extern __m128 _mm_cmpunord_ss(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Compares for unordered,
	//r0=(_A0 unord? _B0) ? 0xffffffff : 0x0, r1=(_A1 unord? _B1) ? 0xffffffff : 0x0, 
	//r2=(_A2 unord? _B2) ? 0xffffffff : 0x0, r3=(_A3 unord? _B3) ? 0xffffffff : 0x0
	extern __m128 _mm_cmpunord_ps(__m128 _A, __m128 _B);
	//返回一个0或1的整数，Compares the lower single-precision, floating-point value of
	//a and b for a equal to b,If a and b are equal, 1 is returned. Otherwise,
	//0 is returned. If a or b is a NaN, 1 is returned
	//(注意：NaN时的返回值与编译器实现有关；按Intel Intrinsics Guide的定义，
	//_mm_comieq_ss在任一操作数为NaN时返回0，使用前请对照所用编译器的文档)
	//r=(_A0 == _B0) ? 0x1 : 0x0
	extern int _mm_comieq_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a is less than b, 1 is returned. Otherwise, 
	//0 is returned. If a or b is a NaN, 1 is returned,
	//r=(_A0 < _B0) ? 0x1 : 0x0
	extern int _mm_comilt_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a is less than or equal to b, 1 is returned. 
	//Otherwise, 0 is returned. If a or b is a NaN, 1 is returned,
	//r=(_A0 <= _B0) ? 0x1 : 0x0
	extern int _mm_comile_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a is greater than b, 1 is returned.
	//Otherwise, 0 is returned. If a or b is a NaN, 1 is returned,
	//r=(_A0 > _B0) ? 0x1 : 0x0
	extern int _mm_comigt_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a is greater than or equal to b, 1 is returned. 
	//Otherwise, 0 is returned. If a or b is a NaN, 1 is returned,
	//r=(_A0 >= _B0) ? 0x1 : 0x0
	extern int _mm_comige_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a and b are not equal, 1 is returned. 
	//Otherwise, 0 is returned. If a or b is a NaN, 1 is returned,
	//r=(_A0 != _B0) ? 0x1 : 0x0
	extern int _mm_comineq_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a and b are equal, 1 is returned. 
	//Otherwise, 0 is returned. If a or b is a NaN, 1 is returned,
	//r=(_A0 == _B0) ? 0x1 : 0x0	
	extern int _mm_ucomieq_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a is less than b , 1 is returned. 
	//Otherwise, 0 is returned. If a or b is a NaN, 1 is returned,
	//r=(_A0 < _B0) ? 0x1 : 0x0
	extern int _mm_ucomilt_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a is less than or equal to b, 1 is returned. 
	//Otherwise, 0 is returned. If a or b is a NaN, 1 is returned,
	//r=(_A0 <= _B0) ? 0x1 : 0x0
	extern int _mm_ucomile_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a is greater than b, 1 is returned. 
	//Otherwise, 0 is returned. If a or b is a NaN, 1 is returned,
	//r=(_A0 > _B0) ? 0x1 : 0x0
	extern int _mm_ucomigt_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a is greater than or equal to b, 1 is returned.
	//Otherwise, 0 is returned,r=(_A0 >= _B0) ? 0x1 : 0x0
	extern int _mm_ucomige_ss(__m128 _A, __m128 _B);
	//返回一个0或1的整数，If a and b are not equal, 1 is returned. 
	//Otherwise, 0 is returned. If a or b is a NaN, 1 is returned,
	//r=(_A0 != _B0) ? 0x1 : 0x0
	extern int _mm_ucomineq_ss(__m128 _A, __m128 _B);

	//Conversion Operations(SSE)
	//返回一个32bit的整数，Converts the lower single-precision, floating-point value
	//of a to a 32-bit integer according to the current rounding mode, r=(int)_A0
	extern int _mm_cvt_ss2si(__m128 _A);//=_mm_cvtss_si32
	//返回一个__m64寄存器，Converts the two lower single-precision, floating-point 
	//values of a to two 32-bit integers according to the current rounding mode, 
	//returning the integers in packed form, r0=(int)_A0, r1=(int)_A1
	extern __m64 _mm_cvt_ps2pi(__m128 _A);//=_mm_cvtps_pi32
	//返回一个32bit的整数，Converts the lower single-precision, floating-point value
	//of a to a 32-bit integer with truncation, r=(int)_A0
	extern int _mm_cvtt_ss2si(__m128 _A);//=_mm_cvttss_si32
	//返回一个__m64寄存器，Converts the two lower single-precision, floating-point 
	//values of a to two 32-bit integer with truncation, returning the integers 
	//in packed form, r0=(int)_A0, r1=(int)_A1
	extern __m64 _mm_cvtt_ps2pi(__m128 _A);//=_mm_cvttps_pi32
	//返回一个__m128的寄存器，Converts the 32-bit integer value b to a single-precision,
	//floating-point value; the upper three single-precision, floating-point values are
	//passed through from a, r0=(float)_B, r1=_A1, r2=_A2, r3=_A3
	extern __m128 _mm_cvt_si2ss(__m128 _A, int _B);//=_mm_cvtsi32_ss 
	//返回一个__m128的寄存器，Converts the two 32-bit integer values in packed form in b
	//to two single-precision, floating-point values; the upper two single-precision, 
	//floating-point values are passed through from a
	//r0=(float)_B0, r1=(float)_B1, r2=_A2, r3=_A3
	extern __m128 _mm_cvt_pi2ps(__m128 _A, __m64 _B);//=_mm_cvtpi32_ps
	//返回一个__m128的寄存器，Converts the four 16-bit signed integer values in a to 
	//four single-precision, floating-point values
	//r0=(float)_A0, r1=(float)_A1, r2=(float)_A2, r3=(float)_A3
	__inline __m128 _mm_cvtpi16_ps(__m64 _A);
	//返回一个__m128的寄存器，Converts the four 16-bit unsigned integer values in a
	//to four single-precision, floating-point values
	//r0=(float)_A0, r1=(float)_A1, r2=(float)_A2, r3=(float)_A3
	__inline __m128 _mm_cvtpu16_ps(__m64 _A);
	//返回一个__m64的寄存器，Converts the four single-precision, floating-point values
	//in a to four signed 16-bit integer values
	//r0=(short)_A0, r1=(short)_A1, r2=(short)_A2, r3=(short)_A3
	__inline __m64 _mm_cvtps_pi16(__m128 _A);
	//返回一个__m128的寄存器，Converts the lower four 8-bit signed integer values in a 
	//to four single-precision, floating-point values
	//r0=(float)_A0, r1=(float)_A1, r2=(float)_A2, r3=(float)_A3
	__inline __m128 _mm_cvtpi8_ps(__m64 _A);
	//返回一个__m128的寄存器，Converts the lower four 8-bit unsigned integer values in a
	//to four single-precision, floating-point values
	//r0=(float)_A0, r1=(float)_A1, r2=(float)_A2, r3=(float)_A3
	__inline __m128 _mm_cvtpu8_ps(__m64 _A);
	//返回一个__m64的寄存器，Converts the four single-precision, floating-point values 
	//in a to the lower four signed 8-bit integer values of the result
	//r0=(char)_A0, r1=(char)_A1, r2=(char)_A2, r3=(char)_A3
	__inline __m64 _mm_cvtps_pi8(__m128 _A);
	//返回一个__m128的寄存器，Converts the two 32-bit signed integer values in a and the
	//two 32-bit signed integer values in b to four single-precision, floating-point values
	//r0=(float)_A0, r1=(float)_A1, r2=(float)_B0, r3=(float)_B1
	__inline __m128 _mm_cvtpi32x2_ps(__m64 _A, __m64 _B);
	//返回一个32bit浮点数，Extracts the lower order floating point value from the parameter
	//r=_A0
	extern float _mm_cvtss_f32(__m128 _A);

	//Miscellaneous Instructions That Use Streaming SIMD Extensions:
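	//返回一个__m128的寄存器，Selects four specific single-precision, floating-point 
	//values from a and b, based on the mask _Imm8 (must be an immediate):
	//r0=_A[_Imm8 & 0x3], r1=_A[(_Imm8>>2) & 0x3],
	//r2=_B[(_Imm8>>4) & 0x3], r3=_B[(_Imm8>>6) & 0x3]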
	extern __m128 _mm_shuffle_ps(__m128 _A, __m128 _B, unsigned int _Imm8);
	//返回一个__m128的寄存器，Selects and interleaves the upper two single-precision,
	//floating-point values from a and b
	//r0=_A2, r1=_B2, r2=_A3, r3=_B3
	extern __m128 _mm_unpackhi_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Selects and interleaves the lower two single-precision,
	//floating-point values from a and b
	//r0=_A0, r1=_B0, r2=_A1, r3=_B1
	extern __m128 _mm_unpacklo_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Sets the upper two single-precision, floating-point 
	//values with 64 bits of data loaded from the address p; the lower two values
	//are passed through from a
	//r0=_A0, r1=_A1, r2=*_P0, r3=*_P1
	extern __m128 _mm_loadh_pi(__m128 _A, __m64 const* _P);
	//返回一个__m128的寄存器，Moves the upper two single-precision, floating-point
	//values of b to the lower two single-precision, floating-point values of the result
	//r3=_A3, r2=_A2, r1=_B3, r0=_B2
	extern __m128 _mm_movehl_ps(__m128 _A, __m128 _B);
	//返回一个__m128的寄存器，Moves the lower two single-precision, floating-point 
	//values of b to the upper two single-precision, floating-point values of the result
	//r3=_B1, r2=_B0, r1=_A1, r0=_A0
	extern __m128 _mm_movelh_ps(__m128 _A, __m128 _B);
	//返回为空，Stores the upper two single-precision, floating-point values of a 
	//to the address p, *_P0=_A2, *_P1=_A3
	extern void _mm_storeh_pi(__m64 *_P, __m128 _A);
	//返回一个__m128的寄存器，Sets the lower two single-precision, floating-point
	//values with 64 bits of data loaded from the address p; the upper two values
	//are passed through from a
	//r0=*_P0, r1=*_P1, r2=_A2, r3=_A3	
	extern __m128 _mm_loadl_pi(__m128 _A, __m64 const* _P);
	//返回为空，Stores the lower two single-precision, floating-point values of a
	//to the address p, *_P0=_A0, *_P1=_A1
	extern void _mm_storel_pi(__m64 *_P, __m128 _A);
	//返回一个整数，Creates a 4-bit mask from the most significant bits of the
	//four single-precision, floating-point values	
	//r=sign(_A3)<<3 | sign(_A2)<<2 | sign(_A1)<<1 | sign(_A0)
	extern int _mm_movemask_ps(__m128 _A);
	//返回一个无符号整数，Returns the contents of the control register
	extern unsigned int _mm_getcsr(void);
	//返回为空，Sets the control register to the value specified
	extern void _mm_setcsr(unsigned int);

	//Memory and Initialization Using Streaming SIMD Extensions
	//Load Operations(SSE)
	//返回一个__m128的寄存器，Loads a single-precision, floating-point value into
	//the low word and clears the upper three words
	//r0=*_P, r1=0.0, r2=0.0, r3=0.0
	extern __m128 _mm_load_ss(float const* _P);
	//返回一个__m128的寄存器，Loads a single single-precision, floating-point value,
	//copying it into all four words
	//r0=r1=r2=r3=*_P
	extern __m128 _mm_load_ps1(float const* _P);//=_mm_load1_ps
	//返回一个__m128的寄存器，Loads four single-precision, floating-point values
	//The address must be 16-byte aligned
	//r0=_P[0], r1=_P[1], r2=_P[2], r3=_P[3]
	extern __m128 _mm_load_ps(float const* _P);
	//返回一个__m128的寄存器，Loads four single-precision, floating-point values 
	//in reverse order, The address must be 16-byte aligned
	//r0=_P[3], r1=_P[2], r2=_P[1], r3=_P[0]
	extern __m128 _mm_loadr_ps(float const* _P);
	//返回一个__m128的寄存器，Loads four single-precision, floating-point values
	//The address does not need to be 16-byte aligned
	//r0=_P[0], r1=_P[1], r2=_P[2], r3=_P[3]
	extern __m128 _mm_loadu_ps(float const* _P);

	//Set Operations(SSE)
	//返回一个__m128的寄存器，Sets the low word of a single-precision, 
	//floating-point value to w and clears the upper three words
	//r0=_W, r1=r2=r3=0.0
	extern __m128 _mm_set_ss(float _W);
	//返回一个__m128的寄存器，Sets the four single-precision, floating-point values to w
	//r0=r1=r2=r3=_W
	extern __m128 _mm_set_ps1(float _W);//=_mm_set1_ps
	//返回一个__m128的寄存器，Sets the four single-precision, floating-point values to 
	//the four inputs, r0=_D, r1=_C, r2=_B, r3=_A
	extern __m128 _mm_set_ps(float _A, float _B, float _C, float _D);
	//返回一个__m128的寄存器，Sets the four single-precision, floating-point values to
	//the four inputs in reverse order, r0=_A, r1=_B, r2=_C, r3=_D
	extern __m128 _mm_setr_ps(float _A, float _B, float _C, float _D);
	//返回一个__m128的寄存器，Clears the four single-precision, floating-point values
	//r0=r1=r2=r3=0.0
	extern __m128 _mm_setzero_ps(void);

	//Store Operations(SSE)
	//返回为空，Stores the lower single-precision, floating-point value，*_V=_A0
	extern void _mm_store_ss(float *_V, __m128 _A);
	//返回为空，Stores the lower single-precision, floating-point value across four words
	//_V[0]=_A0, _V[1]=_A0, _V[2]=_A0, _V[3]=_A0
	extern void _mm_store_ps1(float *_V, __m128 _A);//=_mm_store1_ps
	//返回为空，Stores four single-precision, floating-point values
	//The address must be 16-byte aligned
	//_V[0]=_A0, _V[1]=_A1, _V[2]=_A2, _V[3]=_A3
	extern void _mm_store_ps(float *_V, __m128 _A);
	//返回为空，Stores four single-precision, floating-point values in reverse order
	//The address must be 16-byte aligned,
	//_V[0]=_A3, _V[1]=_A2, _V[2]=_A1, _V[3]=_A0
	extern void _mm_storer_ps(float *_V, __m128 _A);
	//返回为空，Stores four single-precision, floating-point values,
	//The address does not need to be 16-byte aligned
	//_V[0]=_A0, _V[1]=_A1, _V[2]=_A2, _V[3]=_A3
	extern void _mm_storeu_ps(float *_V, __m128 _A);
	//返回一个__m128的寄存器，Sets the low word to the single-precision, floating-point
	//value of b,The upper 3 single-precision, floating-point values are passed through 
	//from a, r0=_B0, r1=_A1, r2=_A2, r3=_A3		
	extern __m128 _mm_move_ss(__m128 _A, __m128 _B);

	//Integer Intrinsics Using Streaming SIMD Extensions
	//返回一个16bit整数，Extracts one of the four words of a，
	//The selector n must be an immediate,
	//r=(_Imm == 0) ? _A0 : ((_Imm==1) ? _A1 : ((_Imm==2) ? _A2 : _A3))
	extern int _m_pextrw(__m64 _A, int _Imm);//=_mm_extract_pi16
	//返回一个__m64的寄存器,Inserts word d into one of four words of a,
	//The selector n must be an immediate
	//r0=(_Imm==0)? _D : _A0, r1=(_Imm==1)? _D : _A1,
	//r2=(_Imm==2)? _D : _A2, r3=(_Imm==3)? _D : _A3
	extern __m64 _m_pinsrw(__m64 _A, int _D, int _Imm);//=_mm_insert_pi16
	//返回一个__m64的寄存器,Computes the element-wise maximum of the words in a and b,
	//r0=max(_A0, _B0), r1=max(_A1, _B1), r2=max(_A2, _B2), r3=max(_A3, _B3)
	extern __m64 _m_pmaxsw(__m64 _A, __m64 _B);//=_mm_max_pi16
	//返回一个__m64的寄存器,Computes the element-wise maximum of the unsigned bytes in
	//a and b, r0=max(_A0, _B0), r1=max(_A1, _B1), ... r7=max(_A7, _B7)
	extern __m64 _m_pmaxub(__m64 _A, __m64 _B);//=_mm_max_pu8
	//返回一个__m64的寄存器,Computes the element-wise minimum of the words in a and b
	//r0=min(_A0, _B0), r1=min(_A1, _B1), r2=min(_A2, _B2), r3=min(_A3, _B3)
	extern __m64 _m_pminsw(__m64 _A, __m64 _B);//=_mm_min_pi16
	//返回一个__m64的寄存器,Computes the element-wise minimum of the unsigned bytes
	//in a and b, r0=min(_A0, _B0), r1=min(_A1, _B1), ... r7=min(_A7, _B7)
	extern __m64 _m_pminub(__m64 _A, __m64 _B);//=_mm_min_pu8
	//返回一个整数，Creates an 8-bit mask from the most significant bits of the
	//bytes in a, r=sign(_A7)<<7 | sign(_A6)<<6 | ... | sign(_A0)
	extern int _m_pmovmskb(__m64 _A);//=_mm_movemask_pi8
	//返回一个__m64的寄存器,Multiplies the unsigned words in a and b, returning the
	//upper 16 bits of the 32-bit intermediate results,
	//r0=hiword(_A0 * _B0), r1=hiword(_A1 * _B1), r2=hiword(_A2 * _B2), r3=hiword(_A3 * _B3)
	extern __m64 _m_pmulhuw(__m64 _A, __m64 _B);//=_mm_mulhi_pu16
	//返回为空，Conditionally stores byte elements of _A to address _P,The high bit of 
	//each byte in the selector _B determines whether the corresponding byte in _A 
	//will be stored, if (sign(_B0)) _P[0]=_A0, if (sign(_B1)) _P[1]=_A1, ...
	//if (sign(_B7)) _P[7]=_A7
	extern void _m_maskmovq(__m64 _A, __m64 _B, char * _P);//=_mm_maskmove_si64
	//返回一个__m64的寄存器,Computes the (rounded) averages of the unsigned bytes 
	//in a and b, t=(unsigned short)_A0 + (unsigned short)_B0, r0=(t + 1) >> 1,
	//..., t=(unsigned short)_A7 + (unsigned short)_B7, r7=(t + 1) >> 1
	extern __m64 _m_pavgb(__m64 _A, __m64 _B);//=_mm_avg_pu8
	//返回一个__m64的寄存器,Computes the (rounded) averages of the unsigned words
	//in a and b, t=(unsigned int)_A0 + (unsigned int)_B0, r0=(t + 1) >> 1,
	//..., t=(unsigned int)_A3 + (unsigned int)_B3, r3=(t + 1) >> 1
	extern __m64 _m_pavgw(__m64 _A, __m64 _B);//=_mm_avg_pu16
	//返回一个__m64的寄存器,Computes the sum of the absolute differences of the unsigned
	//bytes in a and b, returning the value in the lower word
	//The upper three words are cleared
	//r0=abs(_A0-_B0) + ... + abs(_A7-_B7), r1=r2=r3=0
	extern __m64 _m_psadbw(__m64, __m64);//=_mm_sad_pu8
	//返回一个__m64的寄存器,Returns a combination of the four words of a.
	//The selector _Imm must be an immediate
	//r0=word(_Imm & 0x03) of _A, r1=word((_Imm>>2) & 0x03) of _A, 
	//r2=word((_Imm>>4) & 0x03) of _A, r3=word((_Imm>>6) & 0x03) of _A
	extern __m64 _m_pshufw(__m64 _A, int _Imm);//=_mm_shuffle_pi16

	//Streaming SIMD Extensions that Support the Cache
	//返回为空，Loads one cache line of data from address p to a location closer
	//to the processor, The value _Sel specifies the type of prefetch operation
	extern void _mm_prefetch(char const*_A, int _Sel);
	//返回为空，Stores the data in a to the address p without polluting the caches
	//This intrinsic requires you to empty the multimedia state for the MMX register
	extern void _mm_stream_pi(__m64 * _P, __m64 _A);
	//返回为空，Stores the data in a to the address p without polluting the caches,
	//The address must be 16-byte aligned
	extern void _mm_stream_ps(float *, __m128 _A);
	//返回为空，Guarantees that every preceding store is globally visible 
	//before any subsequent store
	extern void _mm_sfence(void);

	/*
	 * Alternate intrinsic names.
	 * Each macro makes the name on the left expand to the intrinsic named on
	 * the right, so either spelling can be used to call the same intrinsic.
	 */
	#define _mm_cvtss_si32    _mm_cvt_ss2si
	#define _mm_cvtps_pi32    _mm_cvt_ps2pi
	#define _mm_cvttss_si32   _mm_cvtt_ss2si
	#define _mm_cvttps_pi32   _mm_cvtt_ps2pi
	#define _mm_cvtsi32_ss    _mm_cvt_si2ss
	#define _mm_cvtpi32_ps    _mm_cvt_pi2ps
	#define _mm_extract_pi16  _m_pextrw
	#define _mm_insert_pi16   _m_pinsrw
	#define _mm_max_pi16      _m_pmaxsw
	#define _mm_max_pu8       _m_pmaxub
	#define _mm_min_pi16      _m_pminsw
	#define _mm_min_pu8       _m_pminub
	#define _mm_movemask_pi8  _m_pmovmskb
	#define _mm_mulhi_pu16    _m_pmulhuw
	#define _mm_shuffle_pi16  _m_pshufw
	#define _mm_maskmove_si64 _m_maskmovq
	#define _mm_avg_pu8       _m_pavgb
	#define _mm_avg_pu16      _m_pavgw
	#define _mm_sad_pu8       _m_psadbw
	#define _mm_set1_ps       _mm_set_ps1
	#define _mm_load1_ps      _mm_load_ps1
	#define _mm_store1_ps     _mm_store_ps1

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

SIMDAssembly Language

SSE Intrinsics各函数介绍的相关文章

未找到 van-toast 节点，请确认 selector 及 context 是否正确

1 json文件引入 van toast vant weapp toast index 2 js文件引入 import Toast from vant weapp toast toast 3 wxml写入
微信小程序蓝牙BLE开发实战——遇到问题及踩坑(三)

微信小程序蓝牙BLE开发实战三对于我这种小白遇到问题是常见的哈这里记录下避免日后再踩坑文章目录微信小程序蓝牙BLE开发实战三 1 iPhone6及6plus无法搜索到设备解决方案 2 IOS无法获取 mac 地址如何连接
分布式任务调度平台xxl-job

一 java的集中式任务调度 while true Thread sleep 轮询线程休眠的方式实现定时任务 java util Timer java util TimerTask Timer是一种定时器工具用于使用后台线程计划执行指定
数字IC设计流程学习笔记

一规格定制 IC的规格定制包括物理指标性能指标和功能指标物理指标封装工艺芯片面积性能指标功耗速度功能指标接口芯片功能二系统设计系统设计是确定IC的算法模型和系统架构等并通过一些高级语言 matlab等对算法模
【tensorflow基础】读取mnist数据

转载于 MNIST手写数字数据集读取方法 TensorFlow的封装让使用MNIST数据集变得更加方便 MNIST数据集是NIST数据集的一个子集它包含了60000张图片作为训练数据 10000张图片作为测试数据在MNIST数据集中的每
spring-security

文章目录 csrf remember me 密码存储权限继承应要求添加的代码白名单相关说明 csrf A网站登录 B网站使用 Copyright C
传染病模型（4）——SIRS模型和SIER模型及matlab具体程序

前言常见的传染病模型按照具体的传染病的特点可分为 SI SIS SIR SIRS SEIR 模型其中 S E I R 的现实含义如下 S Susceptible 易感者指缺乏免疫能力健康人与感染者接触后容易受到感染 E Expose
一文了解亚马逊云科技适用于 Amazon Lightsail 的托管数据库

Amazon Lightsail 是亚马逊云科技提供的一种易上手使用月度价格经济实惠并包括了计算实例容器存储数据库的虚拟专用服务器在创建时可以进行业务蓝图选择可选择包含多种操作系统 Linux Windows 等或操作系统加

随机推荐

C++中定义常量的几种方式

概述在程序运行过程中始终不发生改变的量称之为常量在 C 语言中常量是个固定值也就是说常量值在定义后不能进行修改 define 宏常量 define 是 C 语言中定义常量的方式在 C 中也可以使用 define 的使用 defi
RocketMQ安装与启动

分享知识传递快乐官网 https rocketmq apache org 1 准备环境系统 Centos7 jdk 1 8 2 环境部署解压 rocketmq 并进入 rocketmq 下的 bin 目录调整启动内存 vim bi
C++ 函数模板

函数模板是通用的函数描述它们使用泛型来定义函数其中的泛型可用具体的类型替换通过将类型作为参数传递给模板可使编译器生成该类型的函数由于模板允许以泛型而不是具体类型的方式编写程序因此有时候也被称为通用编程在标准C 98添加关键
ubuntu14.04安装wireshakes

网络攻防这课要做一个嗅探器的大作业想在linux是实现于是先在ubuntu上下一个wiresharks看看它的一些功能和废话少说直接上安装过程与期间遇到的问题安装编译工具 sudo apt get install build e
Spring Gateway集成 Nacos注册中心不能够发现服务的问题解决

一问题描述我们现在是在用Nacos替换Eureka 原来Eureka和Spring gateway运行正常可以通过Spring gateway调用注册到Eureka中的服务当前Spring cloud的版本是Hoxton SR8 N
顺序表基本操作

文章目录 1 顺序表插入元素 2 顺序表删除元素 3 顺序表查找元素 4 顺序表更改元素 1 顺序表插入元素向顺序表中插入数据元素根据插入位置的不同可分为以下 3 种情况插入到顺序表的表头在表的中间位置插入元素尾随顺序表中已有元
TCP/IP详解卷1:协议学习笔记第十七章 TCP:传输控制协议

TCP提供一种面向连接的可靠的字节流服务面向连接意味着两个使用TCP的应用通常是一个客户一个服务器在彼此交换数据前必须先建立一个TCP连接在一个TCP连接中仅有两方进行彼此通信广播和多播不能用于TCP TCP提供可靠性的方法
Unity3D Animation、Animator和AnimationClip

文章目录 Animation 字段方法 Animator 字段方法 AnimationClip 字段方法 Animation 单一动画一般使用在单一动画播放占用资源小字段名称作用 animatePhysics 如果打开这个选
qt中connect函数探究

综合了一下网上资源整理得出 QT 是一个跨平台的 C GUI 应用构架它提供了丰富的窗口部件集具有面向对象易于扩展真正的组件编程等特点更为引人注目的是目前 Linux 上最为流行的 KDE 桌面环境就是建立在 QT 库的基础之上
Linux OOM killer机制介绍

1 概念描述 Linux内核内存管理使用OOM killer Out Of Memory killer 机制在系统内存不足时选择性杀死一些进程以释放内存以使系统继续运行 2 OOM killer产生的原因 2 1 malloc 内存分
解决js的小数点问题

diff fee should total fee add fee toFixed 2 console log diff fee
MATLAB----矩阵的运算

文章目录 1 获取矩阵的行列数 1 1 获取矩阵的行和列 1 2 把矩阵的行和列分别赋值给变量 2 矩阵的转置和逆矩阵 2 1 矩阵的转置 2 2 矩阵的逆矩阵 3 特征值和特征向量 4 加减乘除乘方运算 4 1 加法 4 2 减
Blender学习笔记 —— 资源整理（持续更新）

从准备入门到跨过门槛 3D神器Blender 教程来了一篇对Blender 3D功能入门介绍操作性质的文章斑斓中国 Blender中文社区关于Blender的各类资源很全斑斓中国的优酷频道都是Blender教学视频创意齿轮CGD
Android开发从初级到高级学习路线

Android开发需要具备的知识编程基础数据结构 C语言 Java语法初级首先需要购买一本Android入门的书籍把Android官方文档中的training和guide看一遍技术要求比如四大组件如何使用如何创建Servic
微信小程序开发（七） view 组件

view 组件的示例代码之横向布局和纵向布局 wxml
ggplot2读书笔记13：第十章数据变换

Data Transformation 10 1 简介通常情况下除了整理数据之外我们还需要把原始数据做一些数据变换聚合等这时就要使用到dplyr包本章中我们学习dplyr中四个重要的函数的用法 filter mutate gro
如何选择合适的渗压计？

渗压计的用途渗压计是测量构筑物内部渗透孔隙水压力的传感器一般直接测得水压力 kPa 再根据液体压强公式可换算为水位渗压计应用非常广泛可用于测量大坝坝体渗流压力浸润线绕坝渗流压力坝基扬压力尾矿库测压管水位干孔深度边坡
osg学习（七十四）Type mismatch in arithmetic operation between ‘vec2‘ and ‘int‘

可能是手机端语法检查更严格 glsl语句是这样的再桌面端执行没有问题在手机端执行会提示上述错误 vec3 tmpNormal osg NormalMatrix osg Normal tmpNormal normalize tmpNorm
Redis02-高级使用

本文基于 Redis6 2 7 和 CentOS 7 一事务首先要告诉大家 redis的事务和mysql的事务是不一样 1 1 事务指令 multi 开启事务 exec 提交事务 discard 回滚事务一个事务从开始到执行会经历以下
SSE Intrinsics各函数介绍

SIMD相关头文件包括 include

SSE Intrinsics各函数介绍

SSE Intrinsics各函数介绍 的相关文章

随机推荐

热门标签

SSE Intrinsics各函数介绍的相关文章