Study notes on TMS320C66x intrinsics

Jacktang

Study notes on TMS320C66x intrinsics [Copy link]

/********************************************************************************/
/* C6X.H v7.4.12 */
/********************************************************************************/
#include "vect.h"
/* __float2_t is carried in a double-sized (64-bit) container. The aliases below
 * build it from two floats (_ftof2 -> _ftod) and read a float back from its
 * upper/lower half (_hif2 -> _hif, _lof2 -> _lof), so it is used as a pair of
 * floats rather than as one double-precision value. */
typedef double __float2_t;

/* Each _*f2 name is a plain alias for the corresponding double-based intrinsic. */
#define _lltof2 _lltod //Interpret a long long as a __float2_t
#define _f2toll _dtoll //Interpret a __float2_t as a long long
#define _ftof2 _ftod //Build a __float2_t from two floats
#define _hif2 _hif //Upper 32 bits of a __float2_t as a float
#define _lof2 _lof //Lower 32 bits of a __float2_t as a float
#define _f2tol _dtol //Interpret a __float2_t as a __int40_t
#define _ltof2 _ltod //Interpret a __int40_t as a __float2_t

#define _amem8_f2 _amemd8 //Load and store 8 bytes, the pointer must be 8-byte aligned, and c6x.h must be included
#define _amem8_f2_const _amemd8_const //Load 8 bytes, the pointer must be 8-byte aligned, and c6x.h must be included
#define _mem8_f2 _memd8 //Load a 64-bit value from memory
#define _mem8_f2_const _memd8_const //NOTE(review): presumably the load-only (const pointer) form of _mem8_f2 -- confirm
#define _fdmv_f2 _fdmv
#define _hif2_128 _hid128
#define _lof2_128 _lod128
#define _f2to128 _dto128
#define _fdmvd_f2 _fdmvd

/*Extract the bit field of src2 selected by csta and cstb and sign-extend it to 32 bits. The field is isolated by shifting left and then arithmetically shifting right. */
int _ext (int src2, unsigned csta, unsigned cstb);
/*Same as _ext, except that the left and right shift amounts are taken from the lower 10 bits of src1*/
int _extr (int src2, int src1);
/*Same as _ext, except that the extracted field is zero-extended to 32 bits*/
unsigned _extu (unsigned src2, unsigned csta, unsigned cstb);
/*Same as _extu, except that the left and right shift amounts are taken from the lower 10 bits of src1*/
unsigned _extur (unsigned src2, int src1);

/*Set to 1 the bits of src in the field whose first and last bit positions are given by csta and cstb*/
unsigned _set (unsigned src, unsigned csta, unsigned cstb);
/*Set to 1 a field of src2; the first and last bit positions of the field are given by the lower 10 bits of src1*/
unsigned _setr (unsigned src2, int src1);

/*Clear to 0 the bits of src in the field whose first and last bit positions are given by csta and cstb*/
unsigned _clr (unsigned src, unsigned csta, unsigned cstb);
/*Clear to 0 a field of src2; the first and last bit positions of the field are given by the lower 10 bits of src1*/
unsigned _clrr (unsigned src2, int src1);

/*Add src1 and src2 and saturate the result*/
int _sadd (int, int);
__int40_t _lsadd (int, __int40_t);
/*Subtract src2 from src1 and saturate the result*/
int _ssub (int src1, int src2);
__int40_t _lssub (int, __int40_t);

/*Convert a 40-bit long to a 32-bit signed int, saturating the result if necessary*/
int _sat (__int40_t);

/*Shift src2 left by src1, saturating the result to 32 bits*/
int _sshl (int src2, unsigned src1);
/*Add the high and low 16 bits of src1 to the high and low 16 bits of src2, respectively, and put them into the high and low 16 bits of the result*/
int _add2 (int, int);
int _sub2 (int, int);/*Subtract the high and low 16 bits of src2 from the high and low 16 bits of src1, and put them into the high and low 16 bits of the result*/

/*Conditional subtraction and left shift (commonly used in division)*/
unsigned _subc (unsigned, unsigned);
/*Leftmost bit detection: scan src2 from the most significant bit for the first 1 or 0 (which of the two is selected by the LSB of src1) and return the number of bits skipped before it is found*/
unsigned _lmbd (unsigned src1, unsigned src2);
/*Returns the absolute value of src*/
int _abs (int src);
__int40_t _labs (__int40_t src);

/*Returns the number of redundant sign bits of src, i.e. how many bits directly below the sign bit (bit 31) repeat it. Reading from bit 31 downward: 01b... returns 0, 001b... returns 1, 0001b... returns 2, 00001b... returns 3*/
unsigned _norm (int);
unsigned _lnorm (__int40_t);

//16 LSBs * 16 LSBs
int _mpy (int src1, int src2);/*Multiply src1 and src2, the operand is signed by default*/
int _mpyus (unsigned src1, int src2);/*Multiply unsigned src1 and signed src2, S is used to determine which is signed (S) and which is unsigned (U)*/
int _mpysu (int, unsigned);
unsigned _mpyu (unsigned, unsigned);
//16 MSBs * 16 MSBs
int _mpyh (int, int);
int _mpyhus (unsigned, int);
int _mpyhsu (int, unsigned);
unsigned _mpyhu (unsigned, unsigned);
//16 MSBs * 16 LSBs
int _mpyhl (int, int);
int _mpyhuls (unsigned, int);
int _mpyhslu (int, unsigned);
//16 LSBs * 16 MSBs
int _mpylh (int, int);
int _mpyluhs (unsigned, int);
int _mpylshu (int, unsigned);
unsigned _mpylhu (unsigned, unsigned);
//Difference from the plain _mpy family above: the product is shifted left by 1 bit and then saturated
/*Multiply the signed lower 16 bits of src1 by the signed lower 16 bits of src2, shift the product left by one bit, and saturate the result*/
int _smpy (int src1, int src2);
int _smpyhl (int, int);//High 16 bits * low 16 bits
int _smpylh (int, int);//Low 16 bits * high 16 bits
int _smpyh (int, int);//Multiply the upper 16 bits of src1 and the upper 16 bits of src2
/*Multiply the two pairs of 16-bit signed numbers in src1 and src2, then shift left by 1 bit, and then saturate*/
long long _smpy2ll (int, int);
/*32-bit signed number multiplied by 32-bit signed number, the 64-bit result is shifted left by 1 bit and then saturated, and then the upper 32 bits of the result are written to dst*/
int _smpy32 (int, int);

/*Return the upper 32 bits of a double register (odd bit register) as an int type*/
unsigned _hi(double);
/*Return the upper 32 bits of a double register (odd bit register) as a float type*/
float _hif(double);
/*Return the upper 32 bits of a long long register (odd bit register) as an int type*/
unsigned _hill(long long);
/*Return the lower 32 bits of a double register (even bit register) as an int type*/
unsigned _lo(double);
/*Return the lower 32 bits of a double register (even bit register) as a float type*/
float _lof(double);
/*Return the lower 32 bits of a long long register (even bit register) as an int type*/
unsigned _loll(long long);

/*Create a new double register to hold the value of 2 unsigned int, where src2 is the high (odd) register and src1 is the low (even) register*/
double _itod(unsigned, unsigned);
/*Create a new double register to hold the value of 2 float, where src2 is the high (odd) register and src1 is the low (even) register*/
double _ftod(float, float);
/*Create a new long long register to hold the value of 2 unsigned int, where src2 is the high (odd) register and src1 is the low (even) register*/
long long _itoll(unsigned src2, unsigned src1);

/*Note that it is not a conversion, but a direct interpretation of the value in the register as an integer or floating point type*/
float _itof(unsigned); /* Reinterpret an int register as a float */
unsigned _ftoi(float); /* Interpret the bits of float as unsigned int */
__int40_t _dtol(double); /* Reinterpret a double register as an _int40_t */
double _ltod(__int40_t); /* Reinterpret a __int40_t register as a double */
long long _dtoll(double); /* Reinterpret a double register as a long long */
double _lltod(long long); /* Reinterpret a long long register as a double */

/*
 * Pseudo intrinsics for pseudo instructions.
 *
 * Each macro below maps onto an existing intrinsic with its two operands
 * swapped (e.g. "a < b" is evaluated as "b > a"); _swap2 instead passes the
 * same operand twice to _packlh2. Every definition is guarded by #ifndef so
 * that an earlier definition of the same name takes precedence.
 */
#ifndef _cmplt2
#define _cmplt2(src1, src2) _cmpgt2((src2), (src1))
#endif
#ifndef _cmpltu4
#define _cmpltu4(src1, src2) _cmpgtu4((src2), (src1))
#endif
#ifndef _dotpnrus2
#define _dotpnrus2(src1, src2) _dotpnrsu2((src2), (src1))
#endif
#ifndef _dotpus4
#define _dotpus4(src1, src2) _dotpsu4((src2), (src1))
#endif
#ifndef _mpyihll
#define _mpyihll(src1, src2) _mpyhill((src2), (src1))
#endif
#ifndef _mpyihr
#define _mpyihr(src1, src2) _mpyhir((src2), (src1))
#endif
#ifndef _mpyilll
#define _mpyilll(src1, src2) _mpylill((src2), (src1))
#endif
#ifndef _mpyilr
#define _mpyilr(src1, src2) _mpylir((src2), (src1))
#endif
#ifndef _mpyus4ll
#define _mpyus4ll(src1, src2) _mpysu4ll((src2), (src1))
#endif
#ifndef _saddsu2
#define _saddsu2(src1, src2) _saddus2((src2), (src1))
#endif
/* Swap the two 16-bit halves of src: _packlh2 returns a_lo|b_hi, here with a == b == src */
#ifndef _swap2
#define _swap2(src) _packlh2((src), (src))
#endif
/*Add 4 pairs of 8bits of src1 and src2, no saturation, carry will not affect other 8-bit numbers*/
int _add4 (int, int);
/*Calculate the average of 2 pairs of signed 16-bit numbers*/
int _avg2 (int, int);
/*Calculate the average of 4 pairs of unsigned 8-bit numbers*/
unsigned _avgu4 (unsigned, unsigned);
/*Compare 2 pairs of 16-bit numbers for equality, put the result into the lowest 2 bits of dst, return 1 if equal*/
int _cmpeq2 (int, int);
/*Compare 4 pairs of 8-bit numbers for equality, put the result into the lowest 4 bits of dst, set to 1 if equal, otherwise 0*/
int _cmpeq4 (int, int);
/*Compare 2 pairs of signed 16-bit numbers, src1 > src2, set to 1; otherwise 0. Put the result into the lowest 2 bits of dst*/
int _cmpgt2 (int src1, int src2);
/*Compare 4 pairs of unsigned 8-bit numbers. If src1 > src2, set to 1; otherwise, set to 0. The result is placed in the lowest 4 bits of dst*/
unsigned _cmpgtu4 (unsigned src1, unsigned src2);

/*Dot product (add) 2 pairs of 16-bit signed numbers in src1 and src2. The result is written as a signed 32-bit int or sign-extended to 64 bits*/
int _dotp2 (int , int);
__int40_t _ldotp2 (int, int);
/*Subtract the dot product of the 16-bit signed numbers in src1 and src2*/
int _dotpn2 (int, int);
/*The dot product of the upper 16 bits of src1 and src2 is "subtracted" from the dot product of the lower 16 bits. The number in src1 is treated as a signed number, the number in src2 is treated as an unsigned number, and 2^15 is added, and the result is signed and right-shifted 16 bits*/
int _dotpnrsu2 (int src1, unsigned src2);
/*The dot product of the upper 16 bits of src1 and src2 is "added" to the dot product of the lower 16 bits. The number in src1 is treated as a signed number, the number in src2 is treated as an unsigned number, and 2^15 is added, and the result is signed and right-shifted 16 bits*/
int _dotprsu2 (int, unsigned);
/*Multiply 4 pairs of 8-bit numbers in src1 and src2 and sum them. Each 8-bit number in src1 is treated as a signed number, and each 8-bit number in src2 is treated as an unsigned number*/
int _dotpsu4 (int, unsigned);
unsigned _dotpu4 (unsigned, unsigned);//Same as above, all are treated as unsigned numbers

/*Perform Galois field multiplication on 4 pairs of 8-bit unsigned numbers in src1 and src2*/
int _gmpy4 (int, int);

/*Compare 2 pairs of 16-bit signed numbers in src1 and src2 and take the larger value*/
int _max2 (int, int);
/*Compare 4 pairs of 8-bit unsigned numbers in src1 and src2, and take the larger value*/
unsigned _maxu4 (unsigned, unsigned);
/*Compare 2 pairs of 16-bit signed numbers in src1 and src2, and take the smaller value*/
int _min2 (int, int);
/*Compare 4 pairs of 8-bit unsigned numbers in src1 and src2, and take the smaller value*/
unsigned _minu4 (unsigned, unsigned);

/*Multiply 2 pairs of 16-bit signed numbers in src1 and src2 respectively, and write the two 32-bit results into long long*/
long long _mpy2ll (int, int);
/*Multiply the high 16 bits of src1 as a 16-bit signed number by the 32-bit signed number of src2, and write the result into the low 48 bits of long long*/
long long _mpyhill (int src1, int src2);
/*Multiply the lower 16 bits of src1 as a 16-bit signed number by the 32-bit signed number of src2, and write the result to the lower 48 bits of long long*/
long long _mpylill (int, int);
/*Multiply the upper 16 bits of src1 as a 16-bit signed number by the 32-bit signed number of src2. The product is converted to 32 bits by adding 2^14 using the round mode, and then right-shifted 15 bits*/
int _mpyhir (int, int);
/*Multiply the lower 16 bits of src1 as a 16-bit signed number by the 32-bit signed number of src2. The product is converted to 32 bits by adding 2^14 using round mode, and then right-shifted 15 bits*/
int _mpylir (int, int);
/*Multiply the 4 8-bit signed numbers of src1 by the 4 8-bit unsigned numbers of src2 to get 4 16-bit signed numbers to form a 64-bit number*/
long long _mpysu4ll (int src1, unsigned src2);
long long _mpyu4ll (unsigned, unsigned);//At the same time, both are unsigned numbers

/*Put the low 16 bits of src1 into the high 16 bits of the return value, and the low 16 bits of src2 into the low 16 bits of the return value*/
unsigned _pack2 (unsigned src1, unsigned src2);
/*Put the high 16 bits of src1 into the high 16 bits of the return value, and the high 16 bits of src2 into the low 16 bits of the return value*/
unsigned _packh2 (unsigned, unsigned);
/*Split src1 (a_3|a_2|a_1|a_0) and src2 (b_3|b_2|b_1|b_0) into 4 pairs of 8-bit unsigned numbers, return a_3|a_1|b_3|b_1*/
unsigned _packh4 (unsigned, unsigned);
/*Split src1 (a_3|a_2|a_1|a_0) and src2 (b_3|b_2|b_1|b_0) into 4 pairs of 8-bit unsigned numbers, return a_2|a_0|b_2|b_0*/
unsigned _packl4 (unsigned, unsigned);
/*Split src1 (a_hi|a_lo) and src2 (b_hi|b_lo) into 2 pairs of 16-bit unsigned numbers, return a_hi|b_lo*/
unsigned _packhl2 (unsigned, unsigned);
/*Split src1(a_hi|a_lo) and src2(b_hi|b_lo) into 2 pairs of 16-bit unsigned numbers, and return a_lo|b_hi*/
unsigned _packlh2 (unsigned, unsigned);
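/*Rotate a 32-bit value left. Only the lowest 5 bits of the rotate amount are used; its remaining upper bits (5-31) are ignored. These notes describe src2 as the value rotated and src1 as the amount - confirm the argument order against the compiler manual*/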
unsigned _rotl (unsigned src1, unsigned src2);
/*Add the two 16-bit signed numbers in src1 and src2 to generate two 16-bit signed numbers and saturate*/
int _sadd2 (int, int);
/*Add 4 pairs of 8-bit unsigned numbers in src1 and src2 and saturate*/
unsigned _saddu4 (unsigned, unsigned);
/*Add the two 16-bit unsigned numbers in src1 and the two 16-bit signed numbers in src2 to get two 16-bit unsigned numbers and saturate them*/
int _saddus2 (unsigned src1, int src2);

/*Shift src2 left by 1 byte, and then fill the highest 1 byte of src1 into the extra position of src2 after the left shift*/
unsigned _shlmb (unsigned src1, unsigned src2);
/*Shift src2 right by 1 byte, and then fill the lowest 1 byte of src1 into the vacated top byte of src2 after the right shift*/
unsigned _shrmb (unsigned src1, unsigned src2);
/*Shift the two 16-bit signed numbers of src2 right respectively. The number of right shifts is determined by the lower 5 bits of src1, and the extra position is extended by the sign bit*/
int _shr2 (int src1, unsigned src2);
/* Shift the two 16-bit signed numbers in src2 right respectively. The number of right shifts is determined by the lower 5 bits of src1, and the extra positions are extended by 0*/
unsigned _shru2 (unsigned src1, unsigned src2);
/* Saturate a 32-bit signed number in src1 and src2 to signed 16 bits, and then put the high 16 bits of src1 and the saturated result of src2 into the low 16 bits of dst. If you don't understand, read the instruction manual*/
int _spack2 (int src1, int src2);
/* Saturate the two pairs of 16-bit signed numbers in src1 and src2 into 8-bit unsigned numbers*/
unsigned _spacku4 (int, int);
/* *
* Shift the 32-bit signed number in src2 left or right. The number of shifts is determined by the number of bits specified by src1.
* src1 is between [-31,31]. If src1 is positive, src2 is shifted left; if src1 is negative, src2 is shifted right by |src1| and the sign bit is extended.
* */
int _sshvl (int src2, int src1);
/* *
* Shift the 32-bit signed number in src2 left or right. The number of shifts is determined by the number of bits specified by src1.
* src1 is between [-31,31]. If src1 is positive, src2 is right-shifted and sign-extended. If src1 is negative, src2 is left-shifted by |src1|
* */
int _sshvr (int src2, int src1);
/*Subtract 4 pairs of 8-bit numbers from src1 and src2 without saturation*/
int _sub4 (int src1, int src2);
/*Subtract 4 pairs of 8-bit unsigned numbers from src1 and src2 to find the absolute value*/
int _subabs4 (int src1, int src2);

/*Calculate the absolute value of the high and low 16 bits respectively*/
int _abs2 (int);
/*For each of the 4 bytes of src, count the bits that are 1 and write the count to the corresponding byte of the result*/
unsigned _bitc4 (unsigned);
/*Reverse the order of bits*/
unsigned _bitr (unsigned);
/* Extract the odd and even bits of the bits in src and reorganize them, put the even bits in the lower 16 bits and the odd bits in the upper 16 bits*/
unsigned _deal (unsigned);
/* Move the data of src into the return value, using the multiplication pipeline (delay 4 cycles)*/
int _mvd (int src2);
/* Interleave the upper 16 and lower 16 bits of src2 abcdefghijklmnop|ABCDEFGHIJKLMNOP, and return aAbBcCdDeEfFgGhH|iIjJkKlLmMnNoOpP*/
unsigned _shfl (unsigned src2);
/* Swap the 4 8-bit unsigned numbers (ub_3|ub_2|ub_1|ub_0|) in src and return (ub_2|ub_3|ub_0|ub_1|)*/
unsigned _swap4 (unsigned);
/*Extend 0 (upper 16 bits, 2 8-bit numbers), (ub_3|ub_2|ub_1|ub_0|)-->(0|ub_3|0|ub_2|), each separator represents 1 byte*/
unsigned _unpkhu4 (unsigned);
/*Extend 0 (lower 16 bits, 2 8-bit numbers), (ub_3|ub_2|ub_1|ub_0|)-->(0|ub_1|0|ub_0|), each separator represents 1 byte*/
unsigned _unpklu4 (unsigned);
/*Extend according to the lowest 2 bits of src, bit1 extends to the upper 16 bits, bit0 extends to the lower 16 bits*/
unsigned _xpnd2 (unsigned);
unsigned _xpnd4 (unsigned);//Same as above, expand according to the lowest 4 bits of src, for example, bit0 extends to the lower 8 bits.

/*Do 2 steps in parallel, 1.src1+src2->dst_o 2.src1-src2->dst_e, dst_o represents odd register, dst_e represents even register*/
long long _addsub (int src1, int src2);
/* Same as above, divide into 2 pairs of 16-bit signed numbers for ADD2 and SUB2*/
long long _addsub2 (unsigned, unsigned);
/* *
* Saturate (dot product of low 16 bits of src1 and high 16 bits of src2 plus dot product of high 16 bits of src1 and low 16 bits of src2) to dst_e
* sat((lsb16(src1) × msb16(src2)) + (msb16(src1) × lsb16(src2))) → dst_e
* Signed 16-bit src1 and src2 high 16 bits dot product minus src1 and src2 low 16 bits dot product to dst_o
* (msb16(src1) × msb16(src2)) - (lsb16(src1) × lsb16(src2)) → dst_o
* */
long long _cmpy (unsigned src1, unsigned src2);
/* *
* sat((lsb16(src1) × msb16(src2)) + (msb16(src1) × lsb16(src2))) → tmp_e
* msb16(sat(tmp_e + 00008000h)) → lsb16(dst)
* sat((msb16(src1) × msb16(src2)) - (lsb16(src1) × lsb16(src2))) → tmp_o
* msb16(sat(tmp_o + 00008000h)) → msb16(dst)
* */
unsigned _cmpyr (unsigned, unsigned);
/* *
* sat((lsb16(src1) × msb16(src2)) + (msb16(src1) × lsb16(src2))) → tmp_e
* msb16(sat((tmp_e + 00004000h) << 1)) → lsb16(dst)
* sat((msb16(src1) × msb16(src2)) - (lsb16(src1) × lsb16(src2))) → tmp_o
* msb16(sat((tmp_o + 00004000h) << 1)) → msb16(dst)
* */
unsigned _cmpyr1 (unsigned, unsigned);
/*Double dot products: see the diagram on page 35 of the TMS320C6000 intrinsics/assembly document for how the products are formed and summed*/
long long _ddotph2 (long long, unsigned);
unsigned _ddotph2r (long long, unsigned);
long long _ddotpl2 (long long, unsigned);
unsigned _ddotpl2r (long long, unsigned);
long long _ddotp4 (unsigned src1, unsigned src2);
/*Data packing, see the diagram on page 37*/
long long _dpack2 (unsigned src1, unsigned src2);
long long _dpackx2 (unsigned, unsigned);
/*Move two registers into one register at once*/
long long _dmv (unsigned, unsigned);
double _fdmv (float, float);
/*Multiplication on the Galois field*/
unsigned _gmpy (unsigned, unsigned);
/*32-bit multiplication by 32-bit. All are signed numbers, and the 64-bit results are all written to dst*/
long long _mpy32ll (int, int);
/*32-bit multiplication by 32-bit. All are signed numbers, and the lower 32 bits of the 64-bit result are written to dst*/
int _mpy32 (int, int);
/*src1 signed 32-bit multiplied by src2 unsigned 32-bit = signed 64-bit*/
long long _mpy32su (int, unsigned);
long long _mpy32us (unsigned, int); //Same as above, unsigned multiplied by signed
long long _mpy32u (unsigned, unsigned);//Same as above, both are unsigned
/* *
* Perform 16-bit multiplication by 32-bit. Treat the upper and lower 16 bits of src1 as signed 16 bits; treat the value of src2 as signed 32 bits.
* The product is rounded to 32 bits by adding 2^14, and then the result is right-shifted 15 bits. The lower 32 bits of the two results are written to dst_o:dst_e
* */
long long _mpy2ir (unsigned src1, int src2);
/*The upper 16 bits of src1 and src2 are saturated after being right shifted 1 bit respectively, and put into the 32-bit result*/
unsigned _rpack2 (unsigned src1, unsigned src2);
/*Parallel 1. Saturation (src1+src2)->dst_o 2. Saturation (src1-src2)->dst_e*/
long long _saddsub (int, int);
long long _saddsub2 (unsigned, unsigned);//Parallel SADD2 and SSUB2 instructions
/*I don't know what they are used for, see page 39*/
long long _shfl3 (unsigned, unsigned);
/* Two 16-bit signed numbers in src1 minus two 16-bit signed numbers in src2*/
int _ssub2 (int src1, int src2);
/* Galois multiplication*/
unsigned _xormpy (unsigned, unsigned );

long long _dcmpyr1 (long long, long long);
long long _dccmpyr1 (long long, long long);
long long _cmpy32r1 (long long, long long);
long long _ccmpy32r1 (long long, long long);
long long _mpyu2 ( unsigned, unsigned);
/*Dot product of the 4 pairs of 16-bit signed numbers in src1 and src2, giving a 32-bit dot product sum*/
int _dotp4h (long long, long long);
/*Same as _dotp4h, giving a 64-bit dot product sum*/
long long _dotp4hll (long long, long long);
/*4 16-bit signed numbers in src1 times 4 16-bit unsigned numbers in src2, giving a 32-bit dot product sum*/
int _dotpsu4h (long long, long long);
/*4 16-bit signed numbers in src1 times 4 16-bit unsigned numbers in src2, giving a 64-bit dot product sum*/
long long _dotpsu4hll (long long, long long);
/*2 32-bit signed numbers in src1 Add the two 32-bit signed numbers of src2*/
long long _dadd (long long src1, long long src2);
long long _dadd_c (int, long long);

long long _dsadd (long long, long long);
/*4 Add 16-bit signed numbers*/
long long _dadd2 (long long, long long);
long long _dsadd2 (long long, long long);
long long _dsub (long long, long long);
long long _dssub (long long, long long);
long long _dssub2 (long long, long long);

long long _dapys2 (long long, long long);
/*
 * 64-bit (paired) shifts.
 * NOTE(review): the _dshru, _dshl and _dshr2 prototypes were fused together in
 * the source text. _dshru's parameter list is restored as (long long, unsigned)
 * to match its siblings _dshr and _dshru2 -- confirm against the toolchain's c6x.h.
 */
long long _dshr (long long, unsigned);
long long _dshru (long long, unsigned);
long long _dshl (long long, unsigned);
long long _dshr2 (long long, unsigned);
long long _dshru2 (long long, unsigned);
unsigned _shl2 (unsigned , unsigned);
long long _dshl2 (long long , unsigned);
long long _dxpnd4 (unsigned);
long long _dxpnd2 (unsigned);
int _crot90 (int);
long long _dcrot90 (long long);
int _crot270 (int);
long long _dcrot270 (long long);
/*Compare 4 pairs of 16-bit signed numbers in src1 and src2, and put the larger one into dst*/
long long _dmax2 (long long, long long);
long long _dmin2 (long long, long long);
/*Compare 8 pairs of 8-bit unsigned numbers in src1 and src2, and put the larger one of each pair into dst*/
long long _dmaxu4 (long long, long long);
long long _dminu4 (long long, long long);

/*4 pairs of 16-bit comparisons, Return 1 if equal, 0 if not equal*/
unsigned _dcmpeq2 (long long, long long);
/*8-bit comparison, return 1 if equal, 0 if not equal*/
unsigned _dcmpeq4 (long long, long long);
/* 4 pairs of 16-bit comparisons, return 1 if greater than, return 0 if not greater than*/
unsigned _dcmpgt2 (long long, long long);
/*8 pairs of 8-bit comparisons, return 1 if greater than, return 0 if not greater than*/
unsigned _dcmpgtu4 (long long, long long);

/*Calculate the 4 averages of 4 pairs of 16-bit signed numbers, rounded: "(a+b+1)/2"*/
long long _davg2 (long long, long long);
/*Calculate the 8 averages of 8 pairs of 8-bit unsigned numbers*/
long long _davgu4 (long long, long long);
/*Signed 16-bit, no rounding: 4 averages "(a+b)/2"*/
long long _davgnr2 (long long, long long);
/*Unsigned 8-bit, no rounding: 8 averages*/
long long _davgnru4 (long long, long long);
long long _unpkbu4 (unsigned);
long long _unpkh2 (unsigned);
long long _unpkhu2 (unsigned);

/*Execute 2 PACKL2 operations in parallel*/
long long _dpackl2 (long long, long long);
/*Execute 2 PACKH2 operations in parallel*/
long long _dpackh2 (long long, long long);
long long _dpackhl2 (long long, long long);
/*Execute PACKH4 and PACKL4 in parallel*/
long long _dpacklh4 (unsigned, unsigned);
long long _dpackl4 (long long, long long);
long long _dpackh4 (long long, long long);
long long _dspacku4 (long long, long long);

void _mfence ();
__float2_t _dmpysp (__float2_t, __float2_t);
/*2-way float addition*/
__float2_t _daddsp (__float2_t, __float2_t);
__float2_t _dsubsp (__float2_t, __float2_t);
/*Convert the 16-bit signed numbers in src to single-precision floats, placed in dst_e and dst_o*/
__float2_t _dinthsp (unsigned src);
/*Convert the 16-bit unsigned numbers in src to single-precision floats, placed in dst_e and dst_o*/
__float2_t _dinthspu (unsigned);
/*Convert the 32-bit signed numbers in src to single-precision floats, placed in dst_e and dst_o*/
__float2_t _dintsp (long long);
/*Convert the 32-bit unsigned numbers in src to single-precision floats, placed in dst_e and dst_o*/
__float2_t _dintspu (long long);
unsigned _dspinth (__float2_t);
long long _dspint (__float2_t);

int _land (int, int);
int _landn (int, int);
int _lor (int, int);
/*Move 2 registers into 1 register (pair). Two moves are done at once, which is useful when handling many double words and reduces register pressure*/
long long _dmvd (int, int);
double _fdmvd (float, float);

double _complex_mpysp (double, double); /* CMPYSP then DADDSP */
double _complex_conjugate_mpysp (double, double); /* CMPYSP then DSUBSP */

long long _xorll_c (int, long long);

__x128_t __BUILTIN _dcmpy (long long, long long);
__x128_t __BUILTIN _dccmpy (long long, long long);
long long __BUILTIN _cmatmpyr1 (long long, __x128_t);
long long __BUILTIN _ccmatmpyr1 (long long, __x128_t);
__x128_t __BUILTIN _cmatmpy (long long, __x128_t);
__x128_t __BUILTIN _ccmatmpy (long long, __x128_t);
__x128_t __BUILTIN _qsmpy32r1 (__x128_t, __x128_t);
__x128_t __BUILTIN _qmpy32 (__x128_t, __x128_t);
__x128_t __BUILTIN _dsmpy2 (long long, long long);
/*Multiply 4 pairs of 16-bit signed numbers to get a 32-bit signed number and put it into a 128-bit register*/
__x128_t __BUILTIN _dmpy2 (long long, long long);
/*Multiply 4 pairs of 16-bit unsigned numbers to get 4 32-bit unsigned products and put them into a 128-bit register*/
__x128_t __BUILTIN _dmpyu2 (long long, long long);
/*Multiply the 8 8-bit signed numbers in src1 by the 8 8-bit unsigned numbers in src2 to get 8 16-bit signed numbers*/
__x128_t __BUILTIN _dmpysu4 (long long src1, long long src2);
__x128_t __BUILTIN _dmpyu4 (long long, long long);//Same as above, all unsigned
__x128_t __BUILTIN _cmpysp (__float2_t, __float2_t);
__x128_t __BUILTIN _qmpysp (__x128_t, __x128_t);
/* Execute 2 dotp4h, both signed */
long long __BUILTIN _ddotp4h (__x128_t, __x128_t);
/* Execute 2 dotpsu4h, one signed, one unsigned */
long long __BUILTIN _ddotpsu4h (__x128_t, __x128_t);

/* Build a 128-bit __x128_t from four 32-bit integers, four floats, two long longs, or two doubles */
__x128_t __BUILTIN _ito128 (unsigned, unsigned, unsigned, unsigned);
__x128_t __BUILTIN _fto128 (float, float, float, float);
__x128_t __BUILTIN _llto128 (long long, long long);
__x128_t __BUILTIN _dto128 (double, double);

/* Read one 64-bit half of a __x128_t as a long long or as a double
 * (presumably _hi* = upper half, _lo* = lower half, as with _hi/_lo -- confirm) */
long long __BUILTIN _hi128 (__x128_t);
double __BUILTIN _hid128 (__x128_t);
long long __BUILTIN _lo128 (__x128_t);
double __BUILTIN _lod128 (__x128_t);

/* Read one 32-bit element of a __x128_t as an unsigned or as a float; the index
 * argument carries the __CONST(0,3) annotation (presumably a constant in 0..3 -- confirm) */
unsigned __BUILTIN _get32_128 (__x128_t, __CONST(0,3) unsigned);
float __BUILTIN _get32f_128 (__x128_t, __CONST(0,3) unsigned);

/* NOTE(review): judging by the name, replicates one 32-bit value across the 128-bit result -- confirm */
__x128_t __BUILTIN _dup32_128 (unsigned);

/*
 * Control registers, declared with the TI compiler's __cregister keyword.
 *
 * NOTE(review): this run was garbled in the source text: a dangling `extern`,
 * a declaration cut off after the letter "I", a doubled `extern`, and
 * "un signed". The syntax is repaired here and the truncated name is restored
 * as IRP (presumed, since it sits between ISTP and NRP). The dangling and
 * doubled `extern` tokens suggest other declarations were lost as well
 * (possibly AMR, CSR, IER, ARP, DNUM) -- confirm against the toolchain's c6x.h
 * before relying on this list being complete.
 */
extern __cregister volatile unsigned int IFR;
extern __cregister volatile unsigned int ISR;
extern __cregister volatile unsigned int ICR;
extern __cregister volatile unsigned int ISTP;
extern __cregister volatile unsigned int IRP;
extern __cregister volatile unsigned int NRP;
extern __cregister volatile unsigned int GFPGFR;
extern __cregister volatile unsigned int DIER;
extern __cregister volatile unsigned int REP;
extern __cregister volatile unsigned int TSCL;
extern __cregister volatile unsigned int TSCH;
extern __cregister volatile unsigned int ILC;
extern __cregister volatile unsigned int RILC;
extern __cregister volatile unsigned int PCE1;
extern __cregister volatile unsigned int SSR;
extern __cregister volatile unsigned int GPLYA;
extern __cregister volatile unsigned int GPLYB;
extern __cregister volatile unsigned int TSR;
extern __cregister volatile unsigned int ITSR;

/* Further control registers; every one is an `extern __cregister volatile unsigned int`. */
extern __cregister volatile unsigned int NTSR;
extern __cregister volatile unsigned int ECR;
extern __cregister volatile unsigned int EFR;
extern __cregister volatile unsigned int IERR;

/* NOTE(review): judging by their names (RTDX_*, DBG_STAT, HWBP*, BRK_EN), the
 * registers below look debug/emulation related -- confirm before using them
 * in application code. */
extern __cregister volatile unsigned int DMSG;
extern __cregister volatile unsigned int CMSG;
extern __cregister volatile unsigned int DT_DMA_ADDR;
extern __cregister volatile unsigned int DT_DMA_DATA;
extern __cregister volatile unsigned int DT_DMA_CNTL;
extern __cregister volatile unsigned int TCU_CNTL;
extern __cregister volatile unsigned int RTDX_REC_CNTL;
extern __cregister volatile unsigned int RTDX_XMT_CNTL;
extern __cregister volatile unsigned int RTDX_CFG;
extern __cregister volatile unsigned int RTDX_RDATA;
extern __cregister volatile unsigned int RTDX_WDATA;
extern __cregister volatile unsigned int RTDX_RADDR;
extern __cregister volatile unsigned int RTDX_WADDR;
extern __cregister volatile unsigned int MFREG0;
extern __cregister volatile unsigned int DBG_STAT;
extern __cregister volatile unsigned int BRK_EN;
extern __cregister volatile unsigned int HWBP0_CNT;
extern __cregister volatile unsigned int HWBP0;
extern __cregister volatile unsigned int HWBP1;
extern __cregister volatile unsigned int HWBP2;
extern __cregister volatile unsigned int HWBP3;
extern __cregister volatile unsigned int OVERLAY;
extern __cregister volatile unsigned int PC_PROF;
extern __cregister volatile unsigned int ATSR;
extern __cregister volatile unsigned int TRR;
extern __cregister volatile unsigned int TCRR;