| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488 |
- ; from a new GOGO-no-coda (1999/09)
- ; Copyright (C) 1999 shigeo
- ; special thanks to Keiichi SAKAI, URURI
- ; hacked and back-ported to LAME
- ; by Takehiro TOMINAGA Nov 2000
- %include "nasm.h"
- globaldef fht_3DN
- segment_data
- align 16
- costab dd 0x80000000, 0
- dd 1.414213562,1.414213562
- dd 9.238795283293805e-01, 9.238795283293805e-01
- dd 3.826834424611044e-01, 3.826834424611044e-01
- dd 9.951847264044178e-01, 9.951847264044178e-01
- dd 9.801714304836734e-02, 9.801714304836734e-02
- dd 9.996988186794428e-01, 9.996988186794428e-01
- dd 2.454122920569705e-02, 2.454122920569705e-02
- dd 9.999811752815535e-01, 9.999811752815535e-01
- dd 6.135884819898878e-03, 6.135884819898878e-03
- D_1_0_0_0 dd 0.0 , 1.0
- segment_code
- PIC_OFFSETTABLE
- ;void fht_3DN(float *fz, int nn);
- proc fht_3DN
- pushd ebp, ebx, esi, edi
- sub esp, 20
- call get_pc.bp
- add ebp, PIC_BASE()
- mov r0, [esp+40] ;fi
- mov r1, [esp+44] ;r1 = nn
- lea r3, [PIC_EBP_REL(costab)] ;tri = costab
- lea r4, [r0+r1*8] ;r4 = fn = &fz[n]
- mov [esp+16], r4
- mov r4, 8 ;kx = k1/2
- pmov mm7, [r3]
- loopalign 16
- .do1
- lea r3, [r3+16] ;tri += 2;
- pmov mm6, [PIC_EBP_REL(costab+8)]
- lea r2, [r4+r4*2] ;k3*fsize/2
- mov r5, 4 ;i = 1*fsize
- loopalign 16
- .do2:
- lea r1, [r0+r4] ;gi = fi + kx
- ;f
- pmov mm0, [r0] ;fi0
- pmov mm1, [r0+r4*2] ;fi1
- pmov mm2, [r0+r2*2] ;fi3
- pmov mm3, [r0+r4*4] ;fi2
- pupldq mm0, mm0 ;fi0 | fi0
- pupldq mm1, mm1 ;fi1 | fi1
- pupldq mm2, mm2 ;fi2 | fi2
- pupldq mm3, mm3 ;fi3 | fi3
- pxor mm1, mm7 ;fi1 | -fi1
- pxor mm3, mm7 ;fi3 | -fi3
- pfsub mm0, mm1 ;f1 | f0
- pfsub mm2, mm3 ;f3 | f2
- pmov mm4, mm0
- pfadd mm0, mm2 ;f1+f3|f0+f2 = fi1 | fi0
- pfsub mm4, mm2 ;f1-f3|f0-f2 = fi3 | fi2
- pmovd [r0], mm0 ;fi[0]
- puphdq mm0, mm0
- pmovd [r0+r4*4], mm4 ;fi[k2]
- puphdq mm4, mm4
- pmovd [r0+r4*2], mm4 ;fi[k1]
- pmovd [r0+r2*2], mm0 ;fi[k3]
- lea r0, [r0+r4*8]
- ;g
- pmov mm0, [r1] ;gi0
- pmov mm1, [r1+r4*2] ;gi1
- pmov mm2, [r1+r4*4] ;gi2
- pmov mm3, [r1+r2*2] ;gi3
- pupldq mm1, mm1
- pupldq mm0, mm0 ;gi0 | gi0
- pupldq mm2, mm3 ;gi3 | gi2
- pxor mm1, mm7 ;gi1 | -gi1
- pfsub mm0, mm1 ;gi0-gi1|gi0+gi1 = g1 | g0
- pfmul mm2, mm6 ;gi3*SQRT2|gi2*SQRT2 = g3 | g2
- pmov mm4, mm0
- pfadd mm0, mm2 ;g1+g3|g0+g2 = gi1 | gi0
- pfsub mm4, mm2 ;g1-g3|g0-g2 = gi3 | gi2
- pmovd [r1], mm0 ;gi[0]
- puphdq mm0, mm0
- pmovd [r1+r4*4], mm4 ;gi[k2]
- puphdq mm4, mm4
- cmp r0, [esp + 16]
- pmovd [r1+r4*2], mm0 ;gi[k1]
- pmovd [r1+r2*2], mm4 ;gi[k3]
- jb near .do2
- pmov mm6, [r3+r5] ; this is not aligned address!!
- loopalign 16
- .for:
- ;
- ; mm6 = c1 | s1
- ; mm7 = 0x800000000 | 0
- ;
- pmov mm1, mm6
- mov r0, [esp+40] ; fz
- puphdq mm1, mm1 ; c1 | c1
- lea r1, [r0+r4*2]
- pfadd mm1, mm1 ; c1+c1 | c1+c1
- pfmul mm1, mm6 ; 2*c1*c1 | 2*c1*s1
- pfsub mm1, [PIC_EBP_REL(D_1_0_0_0)] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2
- pmov mm0, mm1
- pxor mm7, mm6 ; c1 | -s1
- pupldq mm2, mm0
- pupldq mm3, mm6 ; ** | c1
- puphdq mm0, mm2 ; s2 | c2
- puphdq mm6, mm3 ;-s1 | c1
- pxor mm0, [PIC_EBP_REL(costab)] ; c2 | -s2
- ; mm0 = s2| c2
- ; mm1 = -c2| s2
- ; mm6 = c1| s1
- ; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
- pmov [esp], mm0
- pmov [esp+8], mm1
- sub r1, r5 ;r1 = gi
- add r0, r5 ;r0 = fi
- loopalign 16
- .do3:
- pmov mm2, [r0+r4*2] ; fi[k1]
- pmov mm4, [r1+r4*2] ; gi[k1]
- pmov mm3, [r0+r2*2] ; fi[k3]
- pmov mm5, [r1+r2*2] ; gi[k3]
- pupldq mm2, mm2 ; fi1 | fi1
- pupldq mm4, mm4 ; gi1 | gi1
- pupldq mm3, mm3 ; fi3 | fi3
- pupldq mm5, mm5 ; gi3 | gi3
- pfmul mm2, mm0 ; s2 * fi1 | c2 * fi1
- pfmul mm4, mm1 ;-c2 * gi1 | s2 * gi1
- pfmul mm3, mm0 ; s2 * fi3 | c2 * fi3
- pfmul mm5, mm1 ;-c2 * gi3 | s2 * gi3
- pfadd mm2, mm4 ;b | a
- pfadd mm3, mm5 ;d | c
- pmov mm0, [r0]
- pmov mm4, [r1]
- pmov mm1, [r0+r4*4]
- pmov mm5, [r1+r4*4]
- pupldq mm0, mm4 ;gi0 | fi0
- pupldq mm1, mm5 ;gi2 | fi2
- pmov mm4, mm2
- pmov mm5, mm3
- pfadd mm2, mm0 ;g0 | f0
- pfadd mm3, mm1 ;g2 | f2
- pfsub mm0, mm4 ;g1 | f1
- pfsub mm1, mm5 ;g3 | f3
- pmov mm4, mm3
- pmov mm5, mm1
- pupldq mm4, mm4 ;f2 | f2
- puphdq mm5, mm5 ;g3 | g3
- puphdq mm3, mm3 ;g2 | g2
- pupldq mm1, mm1 ;f3 | f3
- pfmul mm4, mm6 ;f2 * c1 | f2 * s1
- pfmul mm5, mm7 ;g3 * s1 | g3 *-c1
- pfmul mm3, mm6 ;g2 * c1 | g2 * s1
- pfmul mm1, mm7 ;f3 * s1 | f3 *-c1
- pfadd mm4, mm5 ;a | b
- pfsub mm3, mm1 ;d | c
- pmov mm5, mm2
- pmov mm1, mm0
- pupldq mm2, mm2 ;f0 | f0
- pupldq mm0, mm0 ;f1 | f1
- puphdq mm1, mm2 ;f0 | g1
- puphdq mm5, mm0 ;f1 | g0
- pmov mm2, mm4
- pmov mm0, mm3
- pfadd mm4, mm1 ;fi0 | gi1
- pfadd mm3, mm5 ;fi1 | gi0
- pfsub mm1, mm2 ;fi2 | gi3
- pfsub mm5, mm0 ;fi3 | gi2
- pmovd [r1+r4*2], mm4 ;gi[k1]
- puphdq mm4, mm4
- pmovd [r1], mm3 ;gi[0]
- puphdq mm3, mm3
- pmovd [r1+r2*2], mm1 ;gi[k3]
- puphdq mm1, mm1
- pmovd [r1+r4*4], mm5 ;gi[k2]
- puphdq mm5, mm5
- pmovd [r0], mm4 ;fi[0]
- pmovd [r0+r4*2], mm3 ;fi[k1]
- pmovd [r0+r4*4], mm1 ;fi[k2]
- pmovd [r0+r2*2], mm5 ;fi[k3]
- lea r0, [r0+r4*8]
- lea r1, [r1+r4*8]
- cmp r0, [esp + 16]
- pmov mm0, [esp]
- pmov mm1, [esp+8]
- jb near .do3
- add r5, 4
- ; mm6 = c1| s1
- ; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
- pfmul mm6, [r3] ; c1*a | s1*a
- pfmul mm7, [r3+8] ; s1*b |-c1*b
- cmp r5, r4
- pfsub mm6, mm7 ; c1*a-s1*b | s1*a+c1*b
- pupldq mm7,mm6
- puphdq mm6,mm7
- pmov mm7, [PIC_EBP_REL(costab)]
- jb near .for
- mov r0, [esp+40] ;fi
- cmp r4, [esp+40+4]
- lea r4, [r4*4] ;kx *= 4
- jb near .do1
- .exitttt
- femms
- add esp,20
- popd ebp, ebx, esi, edi
- endproc
- ;void fht_E3DN(float *fz, int nn);
- proc fht_E3DN
- pushd ebp, ebx, esi, edi
- sub esp, 20
- call get_pc.bp
- add ebp, PIC_BASE()
- mov r0, [esp+40] ;fi
- mov r1, [esp+44] ;r1 = nn
- lea r3, [PIC_EBP_REL(costab)] ;tri = costab
- lea r4, [r0+r1*8] ;r4 = fn = &fz[n]
- mov [esp+16], r4
- mov r4, 8 ;kx = k1/2
- pmov mm7, [r3]
- loopalign 16
- .do1
- lea r3, [r3+16] ;tri += 2;
- pmov mm6, [PIC_EBP_REL(costab+8)]
- lea r2, [r4+r4*2] ;k3*fsize/2
- mov r5, 4 ;i = 1*fsize
- loopalign 16
- .do2:
- lea r1, [r0+r4] ;gi = fi + kx
- ;f
- pmov mm0, [r0] ; X | fi0
- pmov mm1, [r0+r4*4] ; X | fi2
- pupldq mm0, [r0+r4*2] ;fi1 | fi0
- pupldq mm1, [r0+r2*2] ;fi3 | fi2
- pfpnacc mm0, mm0 ;fi0+fi1 | fi0-fi1 = f0|f1
- pfpnacc mm1, mm1 ;fi2+fi3 | fi2-fi3 = f2|f3
- pmov mm2, mm0
- pfadd mm0, mm1 ;f0+f2|f1+f3 = fi0 | fi1
- pfsub mm2, mm1 ;f0-f2|f1-f3 = fi2 | fi3
- pmovd [r0+r4*2], mm0 ;fi[k1]
- pmovd [r0+r2*2], mm2 ;fi[k3]
- puphdq mm0, mm0
- puphdq mm2, mm2
- pmovd [r0], mm0 ;fi[0]
- pmovd [r0+r4*4], mm2 ;fi[k2]
- lea r0, [r0+r4*8]
- ;g
- pmov mm3, [r1] ; gi0
- pmov mm4, [r1+r2*2] ; gi3
- pupldq mm3, [r1+r4*2] ;gi1|gi0
- pupldq mm4, [r1+r4*4] ;gi2|gi3
- pfpnacc mm3, mm3 ;gi0+gi1 |gi0-gi1 = f0|f1
- pfmul mm4, mm6 ;gi2*SQRT2|gi3*SQRT2 = f2|f3
- pmov mm5, mm3
- pfadd mm3, mm4 ;f0+f2|f1+f3
- pfsub mm5, mm4 ;f0-f2|f1-f3
- cmp r0, [esp + 16]
- pmovd [r1+r4*2], mm3 ;gi[k1]
- pmovd [r1+r2*2], mm5 ;gi[k3]
- puphdq mm3, mm3
- puphdq mm5, mm5
- pmovd [r1], mm3 ;gi[0]
- pmovd [r1+r4*4], mm5 ;gi[k2]
- jb near .do2
- pmov mm6, [r3+r5] ; this is not aligned address!!
- loopalign 16
- .for:
- ;
- ; mm6 = c1 | s1
- ; mm7 = 0x800000000 | 0
- ;
- pmov mm5, mm6
- mov r0, [esp+40] ; fz
- puphdq mm5, mm5 ; c1 | c1
- lea r1, [r0+r4*2]
- pfadd mm5, mm5 ; c1+c1 | c1+c1
- pfmul mm5, mm6 ; 2*c1*c1 | 2*c1*s1
- pfsub mm5, [PIC_EBP_REL(D_1_0_0_0)] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2
- pswapd mm4, mm5 ; s2 |-c2
- pxor mm4, mm7 ; s2 | c2
- pxor mm7, mm6 ; c1 |-s1
- pswapd mm6, mm6 ; s1 | c1
- ; mm4 = s2| c2
- ; mm5 = -c2| s2
- ; mm6 = c1| s1
- ; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
- pmov [esp], mm4
- pmov [esp+8], mm5
- sub r1, r5 ;r1 = gi
- add r0, r5 ;r0 = fi
- loopalign 16
- .do3:
- pmov mm0, [r0+r2*2] ; fi[k1]
- pmov mm2, [r1+r2*2] ; gi[k1]
- pmov mm1, [r0+r4*2] ; fi[k3]
- pmov mm3, [r1+r4*2] ; gi[k3]
- pupldq mm0, mm0
- pupldq mm2, mm2
- pupldq mm1, mm1
- pupldq mm3, mm3
- pfmul mm0, mm4
- pfmul mm2, mm5
- pfmul mm1, mm4
- pfmul mm3, mm5
- pfadd mm0, mm2 ;d | c
- pfadd mm1, mm3 ;b | a
- pmov mm2, [r0+r4*4] ;fi2
- pupldq mm3, [r1+r4*4] ;gi2 | -
- pmov mm4, [r0] ;fi0
- pupldq mm5, [r1] ;gi0 | -
- pupldq mm2, mm0 ;c | fi2
- puphdq mm3, mm0 ;d | gi2
- pupldq mm4, mm1 ;a | fi0
- puphdq mm5, mm1 ;b | gi0
- pfpnacc mm2, mm2 ;f2 | f3
- pfpnacc mm3, mm3 ;g2 | g3
- pfpnacc mm4, mm4 ;f0 | f1
- pfpnacc mm5, mm5 ;g0 | g1
- pmov mm0, mm2
- pmov mm1, mm3
- pupldq mm2, mm2 ;f3 | f3
- pupldq mm3, mm3 ;g3 | g3
- puphdq mm0, mm0 ;f2 | f2
- puphdq mm1, mm1 ;g2 | g2
- pswapd mm4, mm4 ;f1 | f0
- pswapd mm5, mm5 ;g1 | g0
- pfmul mm0, mm7 ;f2 * s1 | f2 *-c1
- pfmul mm3, mm6 ;g3 * c1 | g3 * s1
- pfmul mm1, mm6 ;g2 * c1 | g2 * s1
- pfmul mm2, mm7 ;f3 * s1 | f3 *-c1
- pfsub mm0, mm3 ; b |-a
- pfsub mm1, mm2 ; d | c
- pmov mm2, mm5
- pmov mm3, mm4
- pupldq mm4, mm0 ;-a | f0
- pupldq mm5, mm1 ; c | g0
- puphdq mm2, mm0 ; b | g1
- puphdq mm3, mm1 ; d | f1
- pfpnacc mm4, mm4 ;fi2 | fi0
- pfpnacc mm5, mm5 ;gi0 | gi2
- pfpnacc mm2, mm2 ;gi1 | gi3
- pfpnacc mm3, mm3 ;fi1 | fi3
- pmovd [r0], mm4 ;fi[0]
- pmovd [r1+r4*4], mm5 ;gi[k2]
- pmovd [r1+r2*2], mm2 ;gi[k3]
- pmovd [r0+r2*2], mm3 ;fi[k3]
- puphdq mm4, mm4
- puphdq mm5, mm5
- puphdq mm2, mm2
- puphdq mm3, mm3
- pmovd [r0+r4*4], mm4 ;fi[k2]
- pmovd [r1], mm5 ;gi[0]
- pmovd [r1+r4*2], mm2 ;gi[k1]
- pmovd [r0+r4*2], mm3 ;fi[k1]
- lea r0, [r0+r4*8]
- lea r1, [r1+r4*8]
- cmp r0, [esp + 16]
- pmov mm4, [esp]
- pmov mm5, [esp+8]
- jb near .do3
- add r5, 4
- ; mm6 = c1| s1
- ; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
- pfmul mm6, [r3] ; c1*a | s1*a
- pfmul mm7, [r3+8] ; s1*b |-c1*b
- cmp r5, r4
- pfsub mm6, mm7 ; c1*a-s1*b | s1*a+c1*b
- pswapd mm6, mm6 ; ??? ; s1*a+c1*b | c1*a-s1*b
- pmov mm7, [PIC_EBP_REL(costab)]
- jb near .for
- mov r0, [esp+40] ;fi
- cmp r4, [esp+40+4]
- lea r4, [r4*4] ;kx *= 4
- jb near .do1
- .exitttt
- femms
- add esp,20
- popd ebp, ebx, esi, edi
- endproc
|