fftfpu.nas 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619
  1. ; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA
  2. ; GOGO-no-coda
  3. ; Copyright (C) 1999 shigeo
  4. ; special thanks to URURI
  5. %include "nasm.h"
  ; External tables and local single-precision constants used by the routines
  ; in this file. externdef / segment_data / segment_code are not defined here;
  ; presumably they are macros from nasm.h -- confirm.
  6. externdef costab_fft ; read as dword [costab_fft + k*4] at the start of each twiddle pass
  7. externdef sintab_fft ; read as dword [sintab_fft + k*4] at the start of each twiddle pass
  8. segment_data
  9. align 32
  10. D_1_41421 dd 1.41421356 ; approx. sqrt(2); multiplies gi[k2] and gi[k3] in the first butterfly loop
  11. D_1_0 dd 1.0 ; initial value stored to the local c1
  12. D_0_5 dd 0.5 ; not referenced by the code visible in this file
  13. D_0_25 dd 0.25 ; not referenced by the code visible in this file
  14. D_0_0005 dd 0.0005 ; not referenced by the code visible in this file
  15. D_0_0 dd 0.0 ; initial value stored to the local s1
  16. segment_code
  17. ;void fht(float *fz, int n);
  ; fht_FPU -- transforms the float array at fz in place using plain x87 code
  ; (presumably a fast Hartley transform, going by the name -- confirm).
  ;
  ; Arguments (stack, accessed through sp(...)):
  ;   fz : pointer to the float data; every result is stored back into it
  ;   n  : compared (unsigned) against r3 after each outer pass to decide exit
  ;
  ; proc/arg/local/alloc/pushd/popd/endproc/sp() and the register names
  ; r0..r6 are not defined in this file; presumably they come from nasm.h.
  ;
  ; Register roles as used below (fsize = 4 bytes, one float):
  ;   r0 = fi            r1 = gi
  ;   r2 = k3*fsize      r3 = k1*fsize      r4 = kx*fsize
  ;   r5 = i*fsize       r6 = upper bound for fi in the .do2 / .do3 loops
  ;
  ; NOTE(review): r6 is never assigned anywhere in this procedure as shown,
  ; yet both inner loops terminate on "cmp r0, r6". Whatever value r6 holds on
  ; entry is used as the bound. The three *_part labels below are empty, so
  ; the setup may have been dropped with the code that used to live there --
  ; confirm where r6 (presumably the end of the fz buffer) is meant to be set.
  ;
  ; Each outer pass (.do) does:
  ;   1. .do2 : butterflies on fi[0],fi[k1],fi[k2],fi[k3] and the matching gi
  ;             elements, stepping both pointers by 4*k1 floats per iteration.
  ;   2. .for : for i = 1 .. kx-1, advance (c1,s1) by (t_c,t_s), derive
  ;             (c2,s2), then run the .do3 butterflies with fi = fz+i and
  ;             gi = fz+k1-i.
  ;   3. Exit if r3 >= n, else k += 2 and k1, k3, kx are multiplied by 4.
  18. proc fht_FPU
  19. %$fz arg 4 ; pointer to the float data
  20. %$n arg 4 ; size value tested against r3 at the end of each outer pass
  21. %$k local 4 ; index into costab_fft / sintab_fft; starts at 2, +2 per outer pass
  22. %$f0 local 4 ; f0..f3 and g0..g3: butterfly temporaries used inside .do3
  23. %$f1 local 4
  24. %$f2 local 4
  25. %$f3 local 4
  26. %$g0 local 4
  27. %$g1 local 4
  28. %$g2 local 4
  29. %$g3 local 4
  30. %$s1 local 4 ; s1, c1: updated once per .for iteration from t_s, t_c
  31. %$c1 local 4
  32. %$s2 local 4 ; s2 = 2*c1*s1, c2 = c1*c1 - s1*s1 (from the updated s1, c1)
  33. %$c2 local 4
  34. %$t_s local 4 ; sintab_fft[k] for the current outer pass
  35. %$t_c local 4 ; costab_fft[k] for the current outer pass
  36. alloc
  37. pushd ebp, ebx, esi, edi
  38. fht_FPU_1st_part: ; the three *_part labels are adjacent; no code between them
  39. fht_FPU_2nd_part:
  40. fht_FPU_3rd_part:
  41. .do_init:
  42. mov r3, 16 ;k1*fsize = 4*fsize = k4
  43. mov r4, 8 ;kx = k1/2
  44. mov r2, 48 ;k3*fsize
  45. mov dword [sp(%$k)], 2 ;k = 2
  46. mov r0, [sp(%$fz)] ;fi
  47. lea r1, [r0+8] ;gi = fi + kx
  48. .do: ; top of one outer pass; r0 = fz, r1 = fz + kx on entry
  49. .do2: ; first butterfly loop over the whole buffer
  50. ;f
  51. fld dword [r0]
  52. fsub dword [r0+r3] ;f1 = fi[0] - fi[k1]
  53. fld dword [r0]
  54. fadd dword [r0+r3] ;f0 = fi[0] + fi[k1]
  55. fld dword [r0+r3*2]
  56. fsub dword [r0+r2] ;f3 = fi[k2] - fi[k3]
  57. fld dword [r0+r3*2]
  58. fadd dword [r0+r2] ;f2 f3 f0 f1
  59. fld st2 ;f0 f2 f3 f0 f1
  60. fadd st0, st1 ;f0+f2
  61. fstp dword [r0] ;fi[0]
  62. fld st3 ;f1 f2 f3 f0 f1
  63. fadd st0, st2 ;f1+f3
  64. fstp dword [r0+r3] ;fi[k1]
  65. fsubr st0, st2 ;f0-f2 f3 f0 f1
  66. fstp dword [r0+r3*2] ;fi[k2]
  67. fsubr st0, st2 ;f1-f3 f0 f1
  68. fstp dword [r0+r2] ;fi[k3]
  69. fcompp ; pops the two leftover values (f0, f1); the compare result is not read
  70. ;g
  71. fld dword [r1]
  72. fsub dword [r1+r3] ;g1 = gi[0] - gi[k1]
  73. fld dword [r1]
  74. fadd dword [r1+r3] ;g0 = gi[0] + gi[k1]
  75. fld dword [D_1_41421]
  76. fmul dword [r1+r2] ;g3 = D_1_41421 * gi[k3]
  77. fld dword [D_1_41421]
  78. fmul dword [r1+r3*2] ;g2 g3 g0 g1
  79. fld st2 ;g0 g2 g3 g0 g1
  80. fadd st0, st1 ;g0+g2
  81. fstp dword [r1] ;gi[0]
  82. fld st3 ;g1 g2 g3 g0 g1
  83. fadd st0, st2 ;g1+g3
  84. fstp dword [r1+r3] ;gi[k1]
  85. fsubr st0, st2 ;g0-g2 g3 g0 g1
  86. fstp dword [r1+r3*2] ;gi[k2]
  87. fsubr st0, st2 ;g1-g3 g0 g1
  88. fstp dword [r1+r2] ;gi[k3]
  89. fcompp ; pops the two leftover values (g0, g1); the compare result is not read
  90. lea r0, [r0+r3*4] ;fi += 4*k1 floats
  91. lea r1, [r1+r3*4] ;gi += 4*k1 floats
  92. cmp r0, r6 ; r6 = loop bound; see NOTE(review) in the header
  93. jb .do2 ; unsigned: repeat while fi < r6
  94. mov r0, [sp(%$k)]
  95. fld dword [costab_fft +r0*4]
  96. fstp dword [sp(%$t_c)] ;t_c = costab_fft[k]
  97. fld dword [sintab_fft +r0*4]
  98. fstp dword [sp(%$t_s)] ;t_s = sintab_fft[k]
  99. fld dword [D_1_0]
  100. fstp dword [sp(%$c1)] ;c1 = 1.0
  101. fld dword [D_0_0]
  102. fstp dword [sp(%$s1)] ;s1 = 0.0
  103. .for_init:
  104. mov r5, 4 ;i = 1*fsize
  105. .for: ; body runs before the i < kx test, so it executes at least once per pass
  106. fld dword [sp(%$c1)]
  107. fmul dword [sp(%$t_c)]
  108. fld dword [sp(%$s1)]
  109. fmul dword [sp(%$t_s)]
  110. fsubp st1, st0 ;c1 (new) = c1*t_c - s1*t_s
  111. fld dword [sp(%$c1)]
  112. fmul dword [sp(%$t_s)]
  113. fld dword [sp(%$s1)]
  114. fmul dword [sp(%$t_c)]
  115. faddp st1, st0 ;s1 c1   (new s1 = c1*t_s + s1*t_c)
  116. fld st1
  117. fmul st0, st0 ;c1c1 s1 c1
  118. fld st1
  119. fmul st0, st0 ;s1s1 c1c1 s1 c1
  120. fsubp st1, st0 ;c2 s1 c1   (c2 = c1*c1 - s1*s1)
  121. fstp dword [sp(%$c2)] ;s1 c1
  122. fld st1 ;c1 s1 c1
  123. fmul st0, st1 ;c1s1 s1 c1
  124. fadd st0, st0 ;s2 s1 c1   (s2 = 2*c1*s1)
  125. fstp dword [sp(%$s2)] ;s1 c1
  126. fstp dword [sp(%$s1)] ;c1
  127. fstp dword [sp(%$c1)] ;
  128. mov r0, [sp(%$fz)]
  129. add r0, r5 ;r0 = fi
  130. mov r1, [sp(%$fz)]
  131. add r1, r3
  132. sub r1, r5 ;r1 = gi
  133. .do3: ; second butterfly loop; fi = fz + i, gi = fz + k1 - i on entry
  134. fld dword [sp(%$s2)]
  135. fmul dword [r0+r3]
  136. fld dword [sp(%$c2)]
  137. fmul dword [r1+r3]
  138. fsubp st1, st0 ;b = s2*fi[k1] - c2*gi[k1]
  139. fld dword [sp(%$c2)]
  140. fmul dword [r0+r3]
  141. fld dword [sp(%$s2)]
  142. fmul dword [r1+r3]
  143. faddp st1, st0 ;a = c2*fi[k1] + s2*gi[k1] b
  144. fld dword [r0]
  145. fsub st0, st1 ;f1 a b   (f1 = fi[0] - a)
  146. fstp dword [sp(%$f1)] ;a b
  147. fadd dword [r0] ;f0 b   (f0 = fi[0] + a)
  148. fstp dword [sp(%$f0)] ;b
  149. fld dword [r1]
  150. fsub st0, st1 ;g1 b   (g1 = gi[0] - b)
  151. fstp dword [sp(%$g1)] ;b
  152. fadd dword [r1] ;g0   (g0 = gi[0] + b)
  153. fstp dword [sp(%$g0)] ;
  154. fld dword [sp(%$s2)]
  155. fmul dword [r0+r2]
  156. fld dword [sp(%$c2)]
  157. fmul dword [r1+r2]
  158. fsubp st1, st0 ;b = s2*fi[k3] - c2*gi[k3]
  159. fld dword [sp(%$c2)]
  160. fmul dword [r0+r2]
  161. fld dword [sp(%$s2)]
  162. fmul dword [r1+r2]
  163. faddp st1, st0 ;a = c2*fi[k3] + s2*gi[k3] b
  164. fld dword [r0+r3*2]
  165. fsub st0, st1 ;f3 a b   (f3 = fi[k2] - a)
  166. fstp dword [sp(%$f3)] ;a b
  167. fadd dword [r0+r3*2] ;f2 b   (f2 = fi[k2] + a)
  168. fstp dword [sp(%$f2)] ;b
  169. fld dword [r1+r3*2]
  170. fsub st0, st1 ;g3 b   (g3 = gi[k2] - b)
  171. fstp dword [sp(%$g3)] ;b
  172. fadd dword [r1+r3*2] ;g2   (g2 = gi[k2] + b)
  173. fstp dword [sp(%$g2)] ;
  174. fld dword [sp(%$s1)]
  175. fmul dword [sp(%$f2)]
  176. fld dword [sp(%$c1)]
  177. fmul dword [sp(%$g3)]
  178. fsubp st1, st0 ;b = s1*f2 - c1*g3
  179. fld dword [sp(%$c1)]
  180. fmul dword [sp(%$f2)]
  181. fld dword [sp(%$s1)]
  182. fmul dword [sp(%$g3)]
  183. faddp st1, st0 ;a = c1*f2 + s1*g3 b
  184. fld dword [sp(%$f0)]
  185. fsub st0, st1 ;fi[k2] a b   (= f0 - a)
  186. fstp dword [r0+r3*2]
  187. fadd dword [sp(%$f0)] ;fi[0] b   (= f0 + a)
  188. fstp dword [r0]
  189. fld dword [sp(%$g1)]
  190. fsub st0, st1 ;gi[k3] b   (= g1 - b)
  191. fstp dword [r1+r2]
  192. fadd dword [sp(%$g1)] ;gi[k1]   (= g1 + b)
  193. fstp dword [r1+r3]
  194. fld dword [sp(%$c1)]
  195. fmul dword [sp(%$g2)]
  196. fld dword [sp(%$s1)]
  197. fmul dword [sp(%$f3)]
  198. fsubp st1, st0 ;b = c1*g2 - s1*f3
  199. fld dword [sp(%$s1)]
  200. fmul dword [sp(%$g2)]
  201. fld dword [sp(%$c1)]
  202. fmul dword [sp(%$f3)]
  203. faddp st1, st0 ;a = s1*g2 + c1*f3 b
  204. fld dword [sp(%$g0)]
  205. fsub st0, st1 ;gi[k2] a b   (= g0 - a)
  206. fstp dword [r1+r3*2]
  207. fadd dword [sp(%$g0)] ;gi[0] b   (= g0 + a)
  208. fstp dword [r1]
  209. fld dword [sp(%$f1)]
  210. fsub st0, st1 ;fi[k3] b   (= f1 - b)
  211. fstp dword [r0+r2]
  212. fadd dword [sp(%$f1)] ;fi[k1]   (= f1 + b)
  213. fstp dword [r0+r3]
  214. lea r0, [r0+r3*4] ;fi += 4*k1 floats
  215. lea r1, [r1+r3*4] ;gi += 4*k1 floats
  216. cmp r0, r6 ; r6 = loop bound; see NOTE(review) in the header
  217. jb near .do3 ; unsigned: repeat while fi < r6
  218. add r5, 4 ;i++ (scaled by fsize)
  219. cmp r5, r4
  220. jb near .for ; repeat while i < kx
  221. cmp r3, [sp(%$n)] ; r3 = k1*fsize, numerically equal to k4 (see .do_init)
  222. jae .exit ; unsigned: finished once r3 >= n
  223. add dword [sp(%$k)], 2 ;k += 2;
  224. lea r3, [r3*4] ;k1 *= 4
  225. lea r2, [r2*4] ;k3 *= 4
  226. lea r4, [r4*4] ;kx *= 4
  227. mov r0, [sp(%$fz)] ;fi
  228. lea r1, [r0+r4] ;gi = fi + kx
  229. jmp .do
  230. .exit:
  231. popd ebp, ebx, esi, edi
  232. endproc
  233. ;*************************************************************
  234. ;void fht_FPU_FXCH(float *fz, int n);
  ; fht_FPU_FXCH -- same in-place transform, arguments, locals and loop
  ; structure as fht_FPU, but each butterfly loads all of its operands first
  ; and uses FXCH to bring the next operand to st0, so independent
  ; adds/subtracts/multiplies are issued back to back.
  ; NOTE(review): presumably an instruction-scheduling variant for CPUs where
  ; FXCH is cheap -- confirm; nothing in this file selects between the two.
  ;
  ; Arguments (stack, accessed through sp(...)):
  ;   fz : pointer to the float data; every result is stored back into it
  ;   n  : compared (unsigned) against r3 after each outer pass to decide exit
  ;
  ; proc/arg/local/alloc/pushd/popd/endproc/sp() and the register names
  ; r0..r6 are not defined in this file; presumably they come from nasm.h.
  ;
  ; Register roles as used below (fsize = 4 bytes, one float):
  ;   r0 = fi            r1 = gi
  ;   r2 = k3*fsize      r3 = k1*fsize      r4 = kx*fsize
  ;   r5 = i*fsize       r6 = upper bound for fi in the .do2 / .do3 loops
  ;
  ; NOTE(review): r6 is never assigned anywhere in this procedure as shown,
  ; yet both inner loops terminate on "cmp r0, r6". The three *_part labels
  ; below are empty, so the setup may have been dropped with code that used to
  ; live there -- confirm where r6 (presumably the end of the fz buffer) is
  ; meant to be set.
  235. proc fht_FPU_FXCH
  236. %$fz arg 4 ; pointer to the float data
  237. %$n arg 4 ; size value tested against r3 at the end of each outer pass
  238. %$k local 4 ; index into costab_fft / sintab_fft; starts at 2, +2 per outer pass
  239. %$f0 local 4 ; f0..f3 and g0..g3: butterfly temporaries used inside .do3
  240. %$f1 local 4
  241. %$f2 local 4
  242. %$f3 local 4
  243. %$g0 local 4
  244. %$g1 local 4
  245. %$g2 local 4
  246. %$g3 local 4
  247. %$s1 local 4 ; s1, c1: updated once per .for iteration from t_s, t_c
  248. %$c1 local 4
  249. %$s2 local 4 ; s2 = 2*c1*s1, c2 = c1*c1 - s1*s1 (from the updated s1, c1)
  250. %$c2 local 4
  251. %$t_s local 4 ; sintab_fft[k] for the current outer pass
  252. %$t_c local 4 ; costab_fft[k] for the current outer pass
  253. alloc
  254. pushd ebp, ebx, esi, edi
  255. fht_FPU_FXCH_1st_part: ; the three *_part labels are adjacent; no code between them
  256. fht_FPU_FXCH_2nd_part:
  257. fht_FPU_FXCH_3rd_part:
  258. .do_init:
  259. mov r3, 16 ;k1*fsize = 4*fsize = k4
  260. mov r4, 8 ;kx = k1/2
  261. mov r2, 48 ;k3*fsize
  262. mov dword [sp(%$k)], 2 ;k = 2
  263. mov r0, [sp(%$fz)] ;fi
  264. lea r1, [r0+8] ;gi = fi + kx
  265. .do: ; top of one outer pass; r0 = fz, r1 = fz + kx on entry
  266. .do2: ; first butterfly loop over the whole buffer
  267. ;f
  268. fld dword [r0]
  269. fsub dword [r0+r3] ;f1 = fi[0] - fi[k1]
  270. fld dword [r0]
  271. fadd dword [r0+r3] ;f0 = fi[0] + fi[k1]
  272. fld dword [r0+r3*2]
  273. fsub dword [r0+r2] ;f3 = fi[k2] - fi[k3]
  274. fld dword [r0+r3*2]
  275. fadd dword [r0+r2] ;f2 f3 f0 f1
  276. fld st3 ;f1 f2 f3 f0 f1
  277. fld st3 ;f0 f1 f2 f3 f0 f1
  278. fxch st5 ;f1 f1 f2 f3 f0 f0
  279. fadd st0, st3 ;f1+f3 f1 f2 f3 f0 f0
  280. fxch st4 ;f0 f1 f2 f3 f1+f3 f0
  281. fadd st0, st2 ;f0+f2 f1 f2 f3 f1+f3 f0
  282. fxch st3 ;f3 f1 f2 f0+f2 f1+f3 f0
  283. fsubp st1, st0 ;f1-f3 f2 f0+f2 f1+f3 f0
  284. fxch st1 ;f2 f1-f3 f0+f2 f1+f3 f0
  285. fsubp st4, st0 ;f1-f3 f0+f2 f1+f3 f0-f2
  286. fxch st2 ;f1+f3 f0+f2 f1-f3 f0-f2
  287. fstp dword [r0+r3] ;fi[k1]
  288. fstp dword [r0] ;fi[0]
  289. fstp dword [r0+r2] ;fi[k3]
  290. fstp dword [r0+r3*2] ;fi[k2]
  291. ;g
  292. fld dword [r1]
  293. fsub dword [r1+r3] ;g1 = gi[0] - gi[k1]
  294. fld dword [r1]
  295. fadd dword [r1+r3] ;g0 = gi[0] + gi[k1]
  296. fld dword [D_1_41421]
  297. fmul dword [r1+r2] ;g3 = D_1_41421 * gi[k3]
  298. fld dword [D_1_41421]
  299. fmul dword [r1+r3*2] ;g2 g3 g0 g1
  300. fld st3 ;g1 g2 g3 g0 g1
  301. fld st3 ;g0 g1 g2 g3 g0 g1
  302. fxch st5 ;g1 g1 g2 g3 g0 g0
  303. fadd st0, st3 ;g1+g3 g1 g2 g3 g0 g0
  304. fxch st4 ;g0 g1 g2 g3 g1+g3 g0
  305. fadd st0, st2 ;g0+g2 g1 g2 g3 g1+g3 g0
  306. fxch st3 ;g3 g1 g2 g0+g2 g1+g3 g0
  307. fsubp st1, st0 ;g1-g3 g2 g0+g2 g1+g3 g0
  308. fxch st1 ;g2 g1-g3 g0+g2 g1+g3 g0
  309. fsubp st4, st0 ;g1-g3 g0+g2 g1+g3 g0-g2
  310. fxch st2 ;g1+g3 g0+g2 g1-g3 g0-g2
  311. fstp dword [r1+r3] ;gi[k1]
  312. fstp dword [r1] ;gi[0]
  313. fstp dword [r1+r2] ;gi[k3]
  314. fstp dword [r1+r3*2] ;gi[k2]
  315. lea r0, [r0+r3*4] ;fi += 4*k1 floats
  316. lea r1, [r1+r3*4] ;gi += 4*k1 floats
  317. cmp r0, r6 ; r6 = loop bound; see NOTE(review) in the header
  318. jb .do2 ; unsigned: repeat while fi < r6
  319. mov r0, [sp(%$k)]
  320. fld dword [costab_fft +r0*4] ;cos
  321. fld dword [sintab_fft +r0*4] ;sin cos
  322. fld dword [D_1_0] ;1.0 sin cos
  323. fld dword [D_0_0] ;0.0 1.0 sin cos
  324. fxch st3 ;cos 1.0 sin 0.0
  325. fstp dword [sp(%$t_c)] ;t_c = costab_fft[k]
  326. fxch st1 ;sin 1.0 0.0
  327. fstp dword [sp(%$t_s)] ;t_s = sintab_fft[k]
  328. fstp dword [sp(%$c1)] ;c1 = 1.0
  329. fstp dword [sp(%$s1)] ;s1 = 0.0
  330. .for_init:
  331. mov r5, 4 ;i = 1*fsize
  332. .for: ; body runs before the i < kx test, so it executes at least once per pass
  333. fld dword [sp(%$c1)]
  334. fmul dword [sp(%$t_c)] ;c1*t_c
  335. fld dword [sp(%$s1)]
  336. fmul dword [sp(%$t_s)] ;s1*t_s c1*t_c
  337. fld dword [sp(%$c1)]
  338. fmul dword [sp(%$t_s)] ;c1*t_s s1*t_s c1*t_c
  339. fld dword [sp(%$s1)]
  340. fmul dword [sp(%$t_c)] ;s1*t_c c1*t_s s1*t_s c1*t_c
  341. fxch st2 ;s1*t_s c1*t_s s1*t_c c1*t_c
  342. fsubp st3, st0 ;c1   (new c1 = c1*t_c - s1*t_s, left at the bottom)
  343. faddp st1, st0 ;s1 c1   (new s1 = c1*t_s + s1*t_c)
  344. fld st1 ;c1 s1 c1
  345. fxch st2 ; swaps two copies of c1; stack contents unchanged
  346. fmul st0, st0 ;c1c1 s1 c1
  347. fld st1 ;s1 c1c1 s1 c1
  348. fxch st2 ; swaps two copies of s1; stack contents unchanged
  349. fmul st0, st0 ;s1s1 c1c1 s1 c1
  350. fxch st3 ;c1 c1c1 s1 s1s1
  351. fst dword [sp(%$c1)] ;c1
  352. fxch st2
  353. fst dword [sp(%$s1)] ;s1 c1c1 c1 s1s1
  354. fmulp st2, st0 ;c1c1 c1s1 s1s1
  355. fsubrp st2, st0 ;c1s1 c2   (c2 = c1c1 - s1s1)
  356. fadd st0, st0 ;s2 c2
  357. fxch st1 ;c2 s2
  358. fstp dword [sp(%$c2)]
  359. fstp dword [sp(%$s2)]
  360. mov r0, [sp(%$fz)]
  361. mov r1, [sp(%$fz)]
  362. add r0, r5 ;r0 = fi
  363. add r1, r3
  364. sub r1, r5 ;r1 = gi
  365. .do3: ; second butterfly loop; fi = fz + i, gi = fz + k1 - i on entry
  366. fld dword [sp(%$s2)]
  367. fmul dword [r0+r3]
  368. fld dword [sp(%$c2)]
  369. fmul dword [r1+r3]
  370. fld dword [sp(%$c2)]
  371. fmul dword [r0+r3]
  372. fld dword [sp(%$s2)]
  373. fmul dword [r1+r3]
  374. fxch st2
  375. fsubp st3, st0 ;b = s2*fi[k1] - c2*gi[k1]
  376. faddp st1, st0 ;a = c2*fi[k1] + s2*gi[k1] b
  377. fld dword [r1]
  378. fsub st0, st2 ;g1 a b   (g1 = gi[0] - b)
  379. fxch st2 ;b a g1
  380. fadd dword [r1] ;g0 a g1   (g0 = gi[0] + b)
  381. fld dword [r0]
  382. fsub st0, st2 ;f1 g0 a g1   (f1 = fi[0] - a)
  383. fxch st2 ;a g0 f1 g1
  384. fadd dword [r0] ;f0 g0 f1 g1   (f0 = fi[0] + a)
  385. fxch st3 ;g1 g0 f1 f0
  386. fstp dword [sp(%$g1)]
  387. fstp dword [sp(%$g0)]
  388. fstp dword [sp(%$f1)]
  389. fstp dword [sp(%$f0)]
  390. fld dword [sp(%$s2)]
  391. fmul dword [r0+r2]
  392. fld dword [sp(%$c2)]
  393. fmul dword [r1+r2]
  394. fld dword [sp(%$c2)]
  395. fmul dword [r0+r2]
  396. fld dword [sp(%$s2)]
  397. fmul dword [r1+r2]
  398. fxch st2
  399. fsubp st3, st0 ;b = s2*fi[k3] - c2*gi[k3]
  400. faddp st1, st0 ;a = c2*fi[k3] + s2*gi[k3] b
  401. fld dword [r1+r3*2]
  402. fsub st0, st2 ;g3 a b   (g3 = gi[k2] - b)
  403. fxch st2 ;b a g3
  404. fadd dword [r1+r3*2] ;g2 a g3   (g2 = gi[k2] + b)
  405. fld dword [r0+r3*2]
  406. fsub st0, st2 ;f3 g2 a g3   (f3 = fi[k2] - a)
  407. fxch st2 ;a g2 f3 g3
  408. fadd dword [r0+r3*2] ;f2 g2 f3 g3   (f2 = fi[k2] + a)
  409. fxch st3 ;g3 g2 f3 f2
  410. fstp dword [sp(%$g3)]
  411. fstp dword [sp(%$g2)]
  412. fstp dword [sp(%$f3)]
  413. fstp dword [sp(%$f2)]
  414. fld dword [sp(%$s1)]
  415. fmul dword [sp(%$f2)]
  416. fld dword [sp(%$c1)]
  417. fmul dword [sp(%$g3)]
  418. fld dword [sp(%$c1)]
  419. fmul dword [sp(%$f2)]
  420. fld dword [sp(%$s1)]
  421. fmul dword [sp(%$g3)]
  422. fxch st2
  423. fsubp st3, st0 ;b = s1*f2 - c1*g3
  424. faddp st1, st0 ;a = c1*f2 + s1*g3 b
  425. fld dword [sp(%$g1)]
  426. fsub st0, st2 ;gi[k3] a b   (= g1 - b)
  427. fxch st2 ;b a gi[k3]
  428. fadd dword [sp(%$g1)] ;gi[k1] a gi[k3]   (= g1 + b)
  429. fld dword [sp(%$f0)]
  430. fsub st0, st2 ;fi[k2] gi[k1] a gi[k3]   (= f0 - a)
  431. fxch st2 ;a gi[k1] fi[k2] gi[k3]
  432. fadd dword [sp(%$f0)] ;fi[0] gi[k1] fi[k2] gi[k3]   (= f0 + a)
  433. fxch st3 ;gi[k3] gi[k1] fi[k2] fi[0]
  434. fstp dword [r1+r2] ;gi[k3]
  435. fstp dword [r1+r3] ;gi[k1]
  436. fstp dword [r0+r3*2] ;fi[k2]
  437. fstp dword [r0] ;fi[0]
  438. fld dword [sp(%$c1)]
  439. fmul dword [sp(%$g2)]
  440. fld dword [sp(%$s1)]
  441. fmul dword [sp(%$f3)]
  442. fld dword [sp(%$s1)]
  443. fmul dword [sp(%$g2)]
  444. fld dword [sp(%$c1)]
  445. fmul dword [sp(%$f3)]
  446. fxch st2
  447. fsubp st3, st0 ;b = c1*g2 - s1*f3
  448. faddp st1, st0 ;a = s1*g2 + c1*f3 b
  449. fld dword [sp(%$f1)]
  450. fsub st0, st2 ;fi[k3] a b   (= f1 - b)
  451. fxch st2 ;b a fi[k3]
  452. fadd dword [sp(%$f1)] ;fi[k1] a fi[k3]   (= f1 + b)
  453. fld dword [sp(%$g0)]
  454. fsub st0, st2 ;gi[k2] fi[k1] a fi[k3]   (= g0 - a)
  455. fxch st2 ;a fi[k1] gi[k2] fi[k3]
  456. fadd dword [sp(%$g0)] ;gi[0] fi[k1] gi[k2] fi[k3]   (= g0 + a)
  457. fxch st3 ;fi[k3] fi[k1] gi[k2] gi[0]
  458. fstp dword [r0+r2] ;fi[k3]
  459. fstp dword [r0+r3] ;fi[k1]
  460. fstp dword [r1+r3*2] ;gi[k2]
  461. fstp dword [r1] ;gi[0]
  462. lea r0, [r0+r3*4] ;fi += 4*k1 floats
  463. lea r1, [r1+r3*4] ;gi += 4*k1 floats
  464. cmp r0, r6 ; r6 = loop bound; see NOTE(review) in the header
  465. jb near .do3 ; unsigned: repeat while fi < r6
  466. add r5, 4 ;i++ (scaled by fsize)
  467. cmp r5, r4
  468. jb near .for ; repeat while i < kx
  469. cmp r3, [sp(%$n)] ; r3 = k1*fsize, numerically equal to k4 (see .do_init)
  470. jae .exit ; unsigned: finished once r3 >= n
  471. add dword [sp(%$k)], 2 ;k += 2;
  472. lea r3, [r3*4] ;k1 *= 4
  473. lea r2, [r2*4] ;k3 *= 4
  474. lea r4, [r4*4] ;kx *= 4
  475. mov r0, [sp(%$fz)] ;fi
  476. lea r1, [r0+r4] ;gi = fi + kx
  477. jmp .do
  478. .exit:
  479. popd ebp, ebx, esi, edi
  480. endproc
  481. end