; fft3dn.nas
;
; from a new GOGO-no-coda (1999/09)
; Copyright (C) 1999 shigeo
; special thanks to Keiichi SAKAI, URURI
; hacked and back-ported to LAME
;	by Takehiro TOMINAGA Nov 2000
  6. %include "nasm.h"
  7. globaldef fht_3DN
  8. segment_data
  9. align 16
  10. costab dd 0x80000000, 0
  11. dd 1.414213562,1.414213562
  12. dd 9.238795283293805e-01, 9.238795283293805e-01
  13. dd 3.826834424611044e-01, 3.826834424611044e-01
  14. dd 9.951847264044178e-01, 9.951847264044178e-01
  15. dd 9.801714304836734e-02, 9.801714304836734e-02
  16. dd 9.996988186794428e-01, 9.996988186794428e-01
  17. dd 2.454122920569705e-02, 2.454122920569705e-02
  18. dd 9.999811752815535e-01, 9.999811752815535e-01
  19. dd 6.135884819898878e-03, 6.135884819898878e-03
  20. D_1_0_0_0 dd 0.0 , 1.0
  21. segment_code
  22. PIC_OFFSETTABLE
  23. ;void fht_3DN(float *fz, int nn);
  24. proc fht_3DN
  25. pushd ebp, ebx, esi, edi
  26. sub esp, 20
  27. call get_pc.bp
  28. add ebp, PIC_BASE()
  29. mov r0, [esp+40] ;fi
  30. mov r1, [esp+44] ;r1 = nn
  31. lea r3, [PIC_EBP_REL(costab)] ;tri = costab
  32. lea r4, [r0+r1*8] ;r4 = fn = &fz[n]
  33. mov [esp+16], r4
  34. mov r4, 8 ;kx = k1/2
  35. pmov mm7, [r3]
  36. loopalign 16
  37. .do1
  38. lea r3, [r3+16] ;tri += 2;
  39. pmov mm6, [PIC_EBP_REL(costab+8)]
  40. lea r2, [r4+r4*2] ;k3*fsize/2
  41. mov r5, 4 ;i = 1*fsize
  42. loopalign 16
  43. .do2:
  44. lea r1, [r0+r4] ;gi = fi + kx
  45. ;f
  46. pmov mm0, [r0] ;fi0
  47. pmov mm1, [r0+r4*2] ;fi1
  48. pmov mm2, [r0+r2*2] ;fi3
  49. pmov mm3, [r0+r4*4] ;fi2
  50. pupldq mm0, mm0 ;fi0 | fi0
  51. pupldq mm1, mm1 ;fi1 | fi1
  52. pupldq mm2, mm2 ;fi2 | fi2
  53. pupldq mm3, mm3 ;fi3 | fi3
  54. pxor mm1, mm7 ;fi1 | -fi1
  55. pxor mm3, mm7 ;fi3 | -fi3
  56. pfsub mm0, mm1 ;f1 | f0
  57. pfsub mm2, mm3 ;f3 | f2
  58. pmov mm4, mm0
  59. pfadd mm0, mm2 ;f1+f3|f0+f2 = fi1 | fi0
  60. pfsub mm4, mm2 ;f1-f3|f0-f2 = fi3 | fi2
  61. pmovd [r0], mm0 ;fi[0]
  62. puphdq mm0, mm0
  63. pmovd [r0+r4*4], mm4 ;fi[k2]
  64. puphdq mm4, mm4
  65. pmovd [r0+r4*2], mm4 ;fi[k1]
  66. pmovd [r0+r2*2], mm0 ;fi[k3]
  67. lea r0, [r0+r4*8]
  68. ;g
  69. pmov mm0, [r1] ;gi0
  70. pmov mm1, [r1+r4*2] ;gi1
  71. pmov mm2, [r1+r4*4] ;gi2
  72. pmov mm3, [r1+r2*2] ;gi3
  73. pupldq mm1, mm1
  74. pupldq mm0, mm0 ;gi0 | gi0
  75. pupldq mm2, mm3 ;gi3 | gi2
  76. pxor mm1, mm7 ;gi1 | -gi1
  77. pfsub mm0, mm1 ;gi0-gi1|gi0+gi1 = g1 | g0
  78. pfmul mm2, mm6 ;gi3*SQRT2|gi2*SQRT2 = g3 | g2
  79. pmov mm4, mm0
  80. pfadd mm0, mm2 ;g1+g3|g0+g2 = gi1 | gi0
  81. pfsub mm4, mm2 ;g1-g3|g0-g2 = gi3 | gi2
  82. pmovd [r1], mm0 ;gi[0]
  83. puphdq mm0, mm0
  84. pmovd [r1+r4*4], mm4 ;gi[k2]
  85. puphdq mm4, mm4
  86. cmp r0, [esp + 16]
  87. pmovd [r1+r4*2], mm0 ;gi[k1]
  88. pmovd [r1+r2*2], mm4 ;gi[k3]
  89. jb near .do2
  90. pmov mm6, [r3+r5] ; this is not aligned address!!
  91. loopalign 16
  92. .for:
  93. ;
  94. ; mm6 = c1 | s1
  95. ; mm7 = 0x800000000 | 0
  96. ;
  97. pmov mm1, mm6
  98. mov r0, [esp+40] ; fz
  99. puphdq mm1, mm1 ; c1 | c1
  100. lea r1, [r0+r4*2]
  101. pfadd mm1, mm1 ; c1+c1 | c1+c1
  102. pfmul mm1, mm6 ; 2*c1*c1 | 2*c1*s1
  103. pfsub mm1, [PIC_EBP_REL(D_1_0_0_0)] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2
  104. pmov mm0, mm1
  105. pxor mm7, mm6 ; c1 | -s1
  106. pupldq mm2, mm0
  107. pupldq mm3, mm6 ; ** | c1
  108. puphdq mm0, mm2 ; s2 | c2
  109. puphdq mm6, mm3 ;-s1 | c1
  110. pxor mm0, [PIC_EBP_REL(costab)] ; c2 | -s2
  111. ; mm0 = s2| c2
  112. ; mm1 = -c2| s2
  113. ; mm6 = c1| s1
  114. ; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
  115. pmov [esp], mm0
  116. pmov [esp+8], mm1
  117. sub r1, r5 ;r1 = gi
  118. add r0, r5 ;r0 = fi
  119. loopalign 16
  120. .do3:
  121. pmov mm2, [r0+r4*2] ; fi[k1]
  122. pmov mm4, [r1+r4*2] ; gi[k1]
  123. pmov mm3, [r0+r2*2] ; fi[k3]
  124. pmov mm5, [r1+r2*2] ; gi[k3]
  125. pupldq mm2, mm2 ; fi1 | fi1
  126. pupldq mm4, mm4 ; gi1 | gi1
  127. pupldq mm3, mm3 ; fi3 | fi3
  128. pupldq mm5, mm5 ; gi3 | gi3
  129. pfmul mm2, mm0 ; s2 * fi1 | c2 * fi1
  130. pfmul mm4, mm1 ;-c2 * gi1 | s2 * gi1
  131. pfmul mm3, mm0 ; s2 * fi3 | c2 * fi3
  132. pfmul mm5, mm1 ;-c2 * gi3 | s2 * gi3
  133. pfadd mm2, mm4 ;b | a
  134. pfadd mm3, mm5 ;d | c
  135. pmov mm0, [r0]
  136. pmov mm4, [r1]
  137. pmov mm1, [r0+r4*4]
  138. pmov mm5, [r1+r4*4]
  139. pupldq mm0, mm4 ;gi0 | fi0
  140. pupldq mm1, mm5 ;gi2 | fi2
  141. pmov mm4, mm2
  142. pmov mm5, mm3
  143. pfadd mm2, mm0 ;g0 | f0
  144. pfadd mm3, mm1 ;g2 | f2
  145. pfsub mm0, mm4 ;g1 | f1
  146. pfsub mm1, mm5 ;g3 | f3
  147. pmov mm4, mm3
  148. pmov mm5, mm1
  149. pupldq mm4, mm4 ;f2 | f2
  150. puphdq mm5, mm5 ;g3 | g3
  151. puphdq mm3, mm3 ;g2 | g2
  152. pupldq mm1, mm1 ;f3 | f3
  153. pfmul mm4, mm6 ;f2 * c1 | f2 * s1
  154. pfmul mm5, mm7 ;g3 * s1 | g3 *-c1
  155. pfmul mm3, mm6 ;g2 * c1 | g2 * s1
  156. pfmul mm1, mm7 ;f3 * s1 | f3 *-c1
  157. pfadd mm4, mm5 ;a | b
  158. pfsub mm3, mm1 ;d | c
  159. pmov mm5, mm2
  160. pmov mm1, mm0
  161. pupldq mm2, mm2 ;f0 | f0
  162. pupldq mm0, mm0 ;f1 | f1
  163. puphdq mm1, mm2 ;f0 | g1
  164. puphdq mm5, mm0 ;f1 | g0
  165. pmov mm2, mm4
  166. pmov mm0, mm3
  167. pfadd mm4, mm1 ;fi0 | gi1
  168. pfadd mm3, mm5 ;fi1 | gi0
  169. pfsub mm1, mm2 ;fi2 | gi3
  170. pfsub mm5, mm0 ;fi3 | gi2
  171. pmovd [r1+r4*2], mm4 ;gi[k1]
  172. puphdq mm4, mm4
  173. pmovd [r1], mm3 ;gi[0]
  174. puphdq mm3, mm3
  175. pmovd [r1+r2*2], mm1 ;gi[k3]
  176. puphdq mm1, mm1
  177. pmovd [r1+r4*4], mm5 ;gi[k2]
  178. puphdq mm5, mm5
  179. pmovd [r0], mm4 ;fi[0]
  180. pmovd [r0+r4*2], mm3 ;fi[k1]
  181. pmovd [r0+r4*4], mm1 ;fi[k2]
  182. pmovd [r0+r2*2], mm5 ;fi[k3]
  183. lea r0, [r0+r4*8]
  184. lea r1, [r1+r4*8]
  185. cmp r0, [esp + 16]
  186. pmov mm0, [esp]
  187. pmov mm1, [esp+8]
  188. jb near .do3
  189. add r5, 4
  190. ; mm6 = c1| s1
  191. ; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
  192. pfmul mm6, [r3] ; c1*a | s1*a
  193. pfmul mm7, [r3+8] ; s1*b |-c1*b
  194. cmp r5, r4
  195. pfsub mm6, mm7 ; c1*a-s1*b | s1*a+c1*b
  196. pupldq mm7,mm6
  197. puphdq mm6,mm7
  198. pmov mm7, [PIC_EBP_REL(costab)]
  199. jb near .for
  200. mov r0, [esp+40] ;fi
  201. cmp r4, [esp+40+4]
  202. lea r4, [r4*4] ;kx *= 4
  203. jb near .do1
  204. .exitttt
  205. femms
  206. add esp,20
  207. popd ebp, ebx, esi, edi
  208. endproc
  209. ;void fht_E3DN(float *fz, int nn);
  210. proc fht_E3DN
  211. pushd ebp, ebx, esi, edi
  212. sub esp, 20
  213. call get_pc.bp
  214. add ebp, PIC_BASE()
  215. mov r0, [esp+40] ;fi
  216. mov r1, [esp+44] ;r1 = nn
  217. lea r3, [PIC_EBP_REL(costab)] ;tri = costab
  218. lea r4, [r0+r1*8] ;r4 = fn = &fz[n]
  219. mov [esp+16], r4
  220. mov r4, 8 ;kx = k1/2
  221. pmov mm7, [r3]
  222. loopalign 16
  223. .do1
  224. lea r3, [r3+16] ;tri += 2;
  225. pmov mm6, [PIC_EBP_REL(costab+8)]
  226. lea r2, [r4+r4*2] ;k3*fsize/2
  227. mov r5, 4 ;i = 1*fsize
  228. loopalign 16
  229. .do2:
  230. lea r1, [r0+r4] ;gi = fi + kx
  231. ;f
  232. pmov mm0, [r0] ; X | fi0
  233. pmov mm1, [r0+r4*4] ; X | fi2
  234. pupldq mm0, [r0+r4*2] ;fi1 | fi0
  235. pupldq mm1, [r0+r2*2] ;fi3 | fi2
  236. pfpnacc mm0, mm0 ;fi0+fi1 | fi0-fi1 = f0|f1
  237. pfpnacc mm1, mm1 ;fi2+fi3 | fi2-fi3 = f2|f3
  238. pmov mm2, mm0
  239. pfadd mm0, mm1 ;f0+f2|f1+f3 = fi0 | fi1
  240. pfsub mm2, mm1 ;f0-f2|f1-f3 = fi2 | fi3
  241. pmovd [r0+r4*2], mm0 ;fi[k1]
  242. pmovd [r0+r2*2], mm2 ;fi[k3]
  243. puphdq mm0, mm0
  244. puphdq mm2, mm2
  245. pmovd [r0], mm0 ;fi[0]
  246. pmovd [r0+r4*4], mm2 ;fi[k2]
  247. lea r0, [r0+r4*8]
  248. ;g
  249. pmov mm3, [r1] ; gi0
  250. pmov mm4, [r1+r2*2] ; gi3
  251. pupldq mm3, [r1+r4*2] ;gi1|gi0
  252. pupldq mm4, [r1+r4*4] ;gi2|gi3
  253. pfpnacc mm3, mm3 ;gi0+gi1 |gi0-gi1 = f0|f1
  254. pfmul mm4, mm6 ;gi2*SQRT2|gi3*SQRT2 = f2|f3
  255. pmov mm5, mm3
  256. pfadd mm3, mm4 ;f0+f2|f1+f3
  257. pfsub mm5, mm4 ;f0-f2|f1-f3
  258. cmp r0, [esp + 16]
  259. pmovd [r1+r4*2], mm3 ;gi[k1]
  260. pmovd [r1+r2*2], mm5 ;gi[k3]
  261. puphdq mm3, mm3
  262. puphdq mm5, mm5
  263. pmovd [r1], mm3 ;gi[0]
  264. pmovd [r1+r4*4], mm5 ;gi[k2]
  265. jb near .do2
  266. pmov mm6, [r3+r5] ; this is not aligned address!!
  267. loopalign 16
  268. .for:
  269. ;
  270. ; mm6 = c1 | s1
  271. ; mm7 = 0x800000000 | 0
  272. ;
  273. pmov mm5, mm6
  274. mov r0, [esp+40] ; fz
  275. puphdq mm5, mm5 ; c1 | c1
  276. lea r1, [r0+r4*2]
  277. pfadd mm5, mm5 ; c1+c1 | c1+c1
  278. pfmul mm5, mm6 ; 2*c1*c1 | 2*c1*s1
  279. pfsub mm5, [PIC_EBP_REL(D_1_0_0_0)] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2
  280. pswapd mm4, mm5 ; s2 |-c2
  281. pxor mm4, mm7 ; s2 | c2
  282. pxor mm7, mm6 ; c1 |-s1
  283. pswapd mm6, mm6 ; s1 | c1
  284. ; mm4 = s2| c2
  285. ; mm5 = -c2| s2
  286. ; mm6 = c1| s1
  287. ; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
  288. pmov [esp], mm4
  289. pmov [esp+8], mm5
  290. sub r1, r5 ;r1 = gi
  291. add r0, r5 ;r0 = fi
  292. loopalign 16
  293. .do3:
  294. pmov mm0, [r0+r2*2] ; fi[k1]
  295. pmov mm2, [r1+r2*2] ; gi[k1]
  296. pmov mm1, [r0+r4*2] ; fi[k3]
  297. pmov mm3, [r1+r4*2] ; gi[k3]
  298. pupldq mm0, mm0
  299. pupldq mm2, mm2
  300. pupldq mm1, mm1
  301. pupldq mm3, mm3
  302. pfmul mm0, mm4
  303. pfmul mm2, mm5
  304. pfmul mm1, mm4
  305. pfmul mm3, mm5
  306. pfadd mm0, mm2 ;d | c
  307. pfadd mm1, mm3 ;b | a
  308. pmov mm2, [r0+r4*4] ;fi2
  309. pupldq mm3, [r1+r4*4] ;gi2 | -
  310. pmov mm4, [r0] ;fi0
  311. pupldq mm5, [r1] ;gi0 | -
  312. pupldq mm2, mm0 ;c | fi2
  313. puphdq mm3, mm0 ;d | gi2
  314. pupldq mm4, mm1 ;a | fi0
  315. puphdq mm5, mm1 ;b | gi0
  316. pfpnacc mm2, mm2 ;f2 | f3
  317. pfpnacc mm3, mm3 ;g2 | g3
  318. pfpnacc mm4, mm4 ;f0 | f1
  319. pfpnacc mm5, mm5 ;g0 | g1
  320. pmov mm0, mm2
  321. pmov mm1, mm3
  322. pupldq mm2, mm2 ;f3 | f3
  323. pupldq mm3, mm3 ;g3 | g3
  324. puphdq mm0, mm0 ;f2 | f2
  325. puphdq mm1, mm1 ;g2 | g2
  326. pswapd mm4, mm4 ;f1 | f0
  327. pswapd mm5, mm5 ;g1 | g0
  328. pfmul mm0, mm7 ;f2 * s1 | f2 *-c1
  329. pfmul mm3, mm6 ;g3 * c1 | g3 * s1
  330. pfmul mm1, mm6 ;g2 * c1 | g2 * s1
  331. pfmul mm2, mm7 ;f3 * s1 | f3 *-c1
  332. pfsub mm0, mm3 ; b |-a
  333. pfsub mm1, mm2 ; d | c
  334. pmov mm2, mm5
  335. pmov mm3, mm4
  336. pupldq mm4, mm0 ;-a | f0
  337. pupldq mm5, mm1 ; c | g0
  338. puphdq mm2, mm0 ; b | g1
  339. puphdq mm3, mm1 ; d | f1
  340. pfpnacc mm4, mm4 ;fi2 | fi0
  341. pfpnacc mm5, mm5 ;gi0 | gi2
  342. pfpnacc mm2, mm2 ;gi1 | gi3
  343. pfpnacc mm3, mm3 ;fi1 | fi3
  344. pmovd [r0], mm4 ;fi[0]
  345. pmovd [r1+r4*4], mm5 ;gi[k2]
  346. pmovd [r1+r2*2], mm2 ;gi[k3]
  347. pmovd [r0+r2*2], mm3 ;fi[k3]
  348. puphdq mm4, mm4
  349. puphdq mm5, mm5
  350. puphdq mm2, mm2
  351. puphdq mm3, mm3
  352. pmovd [r0+r4*4], mm4 ;fi[k2]
  353. pmovd [r1], mm5 ;gi[0]
  354. pmovd [r1+r4*2], mm2 ;gi[k1]
  355. pmovd [r0+r4*2], mm3 ;fi[k1]
  356. lea r0, [r0+r4*8]
  357. lea r1, [r1+r4*8]
  358. cmp r0, [esp + 16]
  359. pmov mm4, [esp]
  360. pmov mm5, [esp+8]
  361. jb near .do3
  362. add r5, 4
  363. ; mm6 = c1| s1
  364. ; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
  365. pfmul mm6, [r3] ; c1*a | s1*a
  366. pfmul mm7, [r3+8] ; s1*b |-c1*b
  367. cmp r5, r4
  368. pfsub mm6, mm7 ; c1*a-s1*b | s1*a+c1*b
  369. pswapd mm6, mm6 ; ??? ; s1*a+c1*b | c1*a-s1*b
  370. pmov mm7, [PIC_EBP_REL(costab)]
  371. jb near .for
  372. mov r0, [esp+40] ;fi
  373. cmp r4, [esp+40+4]
  374. lea r4, [r4*4] ;kx *= 4
  375. jb near .do1
  376. .exitttt
  377. femms
  378. add esp,20
  379. popd ebp, ebx, esi, edi
  380. endproc