fftsse.nas 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. ; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA
  2. ; GOGO-no-coda
  3. ; Copyright (C) 1999 shigeo
  4. ; special thanks to Keiichi SAKAI
  5. %include "nasm.h"
  6. globaldef fht_SSE
  7. segment_data
  8. align 16
  9. Q_MMPP dd 0x0,0x0,0x80000000,0x80000000
  10. Q_MPMP dd 0x0,0x80000000,0x0,0x80000000
  11. D_1100 dd 0.0, 0.0, 1.0, 1.0
  12. costab_fft:
  13. dd 9.238795325112867e-01
  14. dd 3.826834323650898e-01
  15. dd 9.951847266721969e-01
  16. dd 9.801714032956060e-02
  17. dd 9.996988186962042e-01
  18. dd 2.454122852291229e-02
  19. dd 9.999811752836011e-01
  20. dd 6.135884649154475e-03
  21. S_SQRT2 dd 1.414213562
  22. segment_code
  23. PIC_OFFSETTABLE
  24. ;------------------------------------------------------------------------
  25. ; by K. SAKAI
  26. ; 99/08/18 PIII 23k[clk]
  27. ; 99/08/19 命令順序入れ換え PIII 22k[clk]
  28. ; 99/08/20 bit reversal を旧午後から移植した PIII 17k[clk]
  29. ; 99/08/23 一部 unroll PIII 14k[clk]
  30. ; 99/11/12 clean up
  31. ;
  32. ;void fht_SSE(float *fz, int n);
  33. align 16
  34. fht_SSE:
  35. push ebx
  36. push esi
  37. push edi
  38. push ebp
  39. %assign _P 4*5
  40. ;2つ目のループ
  41. mov eax,[esp+_P+0] ;eax=fz
  42. mov ebp,[esp+_P+4] ;=n
  43. shl ebp,3
  44. add ebp,eax ; fn = fz + n, この関数終了まで不変
  45. push ebp
  46. call get_pc.bp
  47. add ebp, PIC_BASE()
  48. lea ecx,[PIC_EBP_REL(costab_fft)]
  49. xor eax,eax
  50. mov al,8 ; =k1=1*(sizeof float) // 4, 16, 64, 256,...
  51. .lp2: ; do{
  52. mov esi,[esp+_P+4] ; esi=fi=fz
  53. lea edx,[eax+eax*2]
  54. mov ebx, esi
  55. ; たかだか2並列しか期待できない部分はFPUのほうが速い。
  56. loopalign 16
  57. .lp20: ; do{
  58. ; f0 = fi[0 ] + fi[k1];
  59. ; f2 = fi[k2] + fi[k3];
  60. ; f1 = fi[0 ] - fi[k1];
  61. ; f3 = fi[k2] - fi[k3];
  62. ; fi[0 ] = f0 + f2;
  63. ; fi[k1] = f1 + f3;
  64. ; fi[k2] = f0 - f2;
  65. ; fi[k3] = f1 - f3;
  66. lea edi,[ebx+eax] ; edi=gi=fi+ki/2
  67. fld dword [ebx]
  68. fadd dword [ebx+eax*2]
  69. fld dword [ebx+eax*4]
  70. fadd dword [ebx+edx*2]
  71. fld dword [ebx]
  72. fsub dword [ebx+eax*2]
  73. fld dword [ebx+eax*4]
  74. fsub dword [ebx+edx*2]
  75. fld st1
  76. fadd st0,st1
  77. fstp dword [ebx+eax*2]
  78. fsubp st1,st0
  79. fstp dword [ebx+edx*2]
  80. fld st1
  81. fadd st0,st1
  82. fstp dword [ebx]
  83. fsubp st1,st0
  84. fstp dword [ebx+eax*4]
  85. lea ebx,[ebx + eax*8] ; = fi += (k1 * 4);
  86. ; g0 = gi[0 ] + gi[k1];
  87. ; g2 = SQRT2 * gi[k2];
  88. ; g1 = gi[0 ] - gi[k1];
  89. ; g3 = SQRT2 * gi[k3];
  90. ; gi[0 ] = g0 + g2;
  91. ; gi[k2] = g0 - g2;
  92. ; gi[k1] = g1 + g3;
  93. ; gi[k3] = g1 - g3;
  94. fld dword [edi]
  95. fadd dword [edi+eax*2]
  96. fld dword [PIC_EBP_REL(S_SQRT2)]
  97. fmul dword [edi+eax*4]
  98. fld dword [edi]
  99. fsub dword [edi+eax*2]
  100. fld dword [PIC_EBP_REL(S_SQRT2)]
  101. fmul dword [edi+edx*2]
  102. fld st1
  103. fadd st0,st1
  104. fstp dword [edi+eax*2]
  105. fsubp st1,st0
  106. fstp dword [edi+edx*2]
  107. fld st1
  108. fadd st0,st1
  109. fstp dword [edi]
  110. fsubp st1,st0
  111. fstp dword [edi+eax*4]
  112. cmp ebx,[esp]
  113. jl near .lp20 ; while (fi<fn);
  114. ; i = 1; //for (i=1;i<kx;i++){
  115. ; c1 = 1.0*t_c - 0.0*t_s;
  116. ; s1 = 0.0*t_c + 1.0*t_s;
  117. movlps xmm6,[ecx] ; = { --, --, s1, c1}
  118. movaps xmm7,xmm6
  119. shufps xmm6,xmm6,R4(0,1,1,0) ; = {+c1, +s1, +s1, +c1} -> 必要
  120. ; c2 = c1*c1 - s1*s1 = 1 - (2*s1)*s1;
  121. ; s2 = c1*s1 + s1*c1 = 2*s1*c1;
  122. shufps xmm7,xmm7,R4(1,0,0,1)
  123. movss xmm5,xmm7 ; = { --, --, --, s1}
  124. xorps xmm7,[PIC_EBP_REL(Q_MMPP)] ; = {-s1, -c1, +c1, +s1} -> 必要
  125. addss xmm5,xmm5 ; = (--, --, --, 2*s1)
  126. add esi,4 ; esi = fi = fz + i
  127. shufps xmm5,xmm5,R4(0,0,0,0) ; = (2*s1, 2*s1, 2*s1, 2*s1)
  128. mulps xmm5,xmm6 ; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1)
  129. subps xmm5,[PIC_EBP_REL(D_1100)] ; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2}
  130. movaps xmm4,xmm5
  131. shufps xmm5,xmm5,R4(2,0,2,0) ; = {-c2, s2, -c2, s2} -> 必要
  132. xorps xmm4,[PIC_EBP_REL(Q_MMPP)] ; = {--, c2, --, s2}
  133. shufps xmm4,xmm4,R4(0,2,0,2) ; = {s2, c2, s2, c2} -> 必要
  134. loopalign 16
  135. .lp21: ; do{
  136. ; a = c2*fi[k1] + s2*gi[k1];
  137. ; b = s2*fi[k1] - c2*gi[k1];
  138. ; c = c2*fi[k3] + s2*gi[k3];
  139. ; d = s2*fi[k3] - c2*gi[k3];
  140. ; f0 = fi[0 ] + a;
  141. ; g0 = gi[0 ] + b;
  142. ; f2 = fi[k1 * 2] + c;
  143. ; g2 = gi[k1 * 2] + d;
  144. ; f1 = fi[0 ] - a;
  145. ; g1 = gi[0 ] - b;
  146. ; f3 = fi[k1 * 2] - c;
  147. ; g3 = gi[k1 * 2] - d;
  148. lea edi,[esi + eax*2 - 8] ; edi = gi = fz +k1-i
  149. movss xmm0,[esi + eax*2] ; = fi[k1]
  150. movss xmm2,[esi + edx*2] ; = fi[k3]
  151. shufps xmm0,xmm2,0x00 ; = {fi[k3], fi[k3], fi[k1], fi[k1]}
  152. movss xmm1,[edi + eax*2] ; = fi[k1]
  153. movss xmm3,[edi + edx*2] ; = fi[k3]
  154. shufps xmm1,xmm3,0x00 ; = {gi[k3], gi[k3], gi[k1], gi[k1]}
  155. movss xmm2,[esi] ; = fi[0]
  156. mulps xmm0,xmm4 ; *= {+s2, +c2, +s2, +c2}
  157. movss xmm3,[esi + eax*4] ; = fi[k2]
  158. unpcklps xmm2,xmm3 ; = {--, --, fi[k2], fi[0]}
  159. mulps xmm1,xmm5 ; *= {-c2, +s2, -c2, +s2}
  160. movss xmm3,[edi + eax*4] ; = gi[k2]
  161. addps xmm0,xmm1 ; = {d, c, b, a}
  162. movss xmm1,[edi] ; = gi[0]
  163. unpcklps xmm1,xmm3 ; = {--, --, gi[k2], gi[0]}
  164. unpcklps xmm2,xmm1 ; = {gi[k2], fi[k2], gi[0], fi[0]}
  165. movaps xmm1,xmm2
  166. addps xmm1,xmm0 ; = {g2, f2, g0, f0}
  167. subps xmm2,xmm0 ; = {g3, f3, g1, f1}
  168. ; a = c1*f2 + s1*g3;
  169. ; c = s1*g2 + c1*f3;
  170. ; b = s1*f2 - c1*g3;
  171. ; d = c1*g2 - s1*f3;
  172. ; fi[0 ] = f0 + a;
  173. ; gi[0 ] = g0 + c;
  174. ; gi[k1] = g1 + b;
  175. ; fi[k1] = f1 + d;
  176. ; fi[k1 * 2] = f0 - a;
  177. ; gi[k1 * 2] = g0 - c;
  178. ; gi[k3] = g1 - b;
  179. ; fi[k3] = f1 - d;
  180. movaps xmm3,xmm1
  181. movhlps xmm1,xmm1 ; = {g2, f2, g2, f2}
  182. shufps xmm3,xmm2,0x14 ; = {f1, g1, g0, f0}
  183. mulps xmm1,xmm6 ; *= {+c1, +s1, +s1, +c1}
  184. shufps xmm2,xmm2,0xBB ; = {f3, g3, f3, g3}
  185. mulps xmm2,xmm7 ; *= {-s1, -c1, +c1, +s1}
  186. addps xmm1,xmm2 ; = {d, b, c, a}
  187. movaps xmm2,xmm3
  188. addps xmm3,xmm1 ; = {fi[k1], gi[k1], gi[0], fi[0]}
  189. subps xmm2,xmm1 ; = {fi[k3], gi[k3], gi[k1*2], fi[k1*2]}
  190. movhlps xmm0,xmm3
  191. movss [esi],xmm3
  192. shufps xmm3,xmm3,0x55
  193. movss [edi+eax*2],xmm0
  194. shufps xmm0,xmm0,0x55
  195. movss [edi],xmm3
  196. movss [esi+eax*2],xmm0
  197. movhlps xmm0,xmm2
  198. movss [esi+eax*4],xmm2
  199. shufps xmm2,xmm2,0x55
  200. movss [edi+edx*2],xmm0
  201. shufps xmm0,xmm0,0x55
  202. movss [edi+eax*4],xmm2
  203. movss [esi+edx*2],xmm0
  204. lea esi,[esi + eax*8] ; fi += (k1 * 4);
  205. cmp esi,[esp]
  206. jl near .lp21 ; while (fi<fn);
  207. ; unroll前のdo loopは43+4命令
  208. ; 最内周ではないforループのi=2から先をunrollingした
  209. ; kx= 2, 8, 32, 128
  210. ; k4= 16, 64, 256, 1024
  211. ; 0, 6/2,30/2,126/2
  212. xor ebx,ebx
  213. mov bl, 4*2 ; = i = 4
  214. cmp ebx,eax ; i < k1
  215. jnl near .F22
  216. ; for (i=2;i<kx;i+=2){
  217. loopalign 16
  218. .lp22:
  219. ; at here, xmm6 is {c3, s3, s3, c3}
  220. ; c1 = c3*t_c - s3*t_s;
  221. ; s1 = c3*t_s + s3*t_c;
  222. movlps xmm0,[ecx]
  223. shufps xmm0,xmm0,R4(1,1,0,0) ; = {t_s, t_s, t_c, t_c}
  224. mulps xmm6,xmm0 ; = {c3*ts, s3*ts, s3*tc, c3*tc}
  225. movhlps xmm4,xmm6 ; = {--, --, c3*ts, s3*ts}
  226. xorps xmm4,[PIC_EBP_REL(Q_MPMP)] ; = {--, --, -c3*ts, s3*ts}
  227. subps xmm6,xmm4 ; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1}
  228. ; c3 = c1*t_c - s1*t_s;
  229. ; s3 = s1*t_c + c1*t_s;
  230. shufps xmm6,xmm6,0x14 ; = {c1, s1, s1, c1}
  231. mulps xmm0,xmm6 ; = {ts*c1 ts*s1 tc*s1 tc*c1}
  232. movhlps xmm3,xmm0
  233. xorps xmm3,[PIC_EBP_REL(Q_MPMP)]
  234. subps xmm0,xmm3 ; = {--, --, s3, c3}
  235. ; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1}
  236. unpcklps xmm6,xmm0 ; xmm6 = {s3, s1, c3, c1}
  237. movaps xmm7, xmm6
  238. shufps xmm6,xmm6,R4(2,3,1,0) ; xmm6 = {s1, s3, c3, c1}
  239. addps xmm7, xmm7 ; {s3*2, s1*2, --, --}
  240. mov edi,[esp+_P+4] ; = fz
  241. shufps xmm7, xmm7, R4(2,3,3,2) ; {s1*2, s3*2, s3*2, s1*2}
  242. sub edi,ebx ; edi = fz - i/2
  243. mulps xmm7, xmm6 ; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2}
  244. lea esi,[edi + ebx*2] ; esi = fi = fz +i/2
  245. subps xmm7, [PIC_EBP_REL(D_1100)] ; {-c2, -c4, s4, s2}
  246. lea edi,[edi + eax*2-4] ; edi = gi = fz +k1-i/2
  247. ; fi = fz +i;
  248. ; gi = fz +k1-i;
  249. ; do{
  250. .lp220:
  251. ; unroll後のdo loopは51+4命令
  252. ; a = c2*fi[k1 ] + s2*gi[k1 ];
  253. ; e = c4*fi[k1+1] + s4*gi[k1-1];
  254. ; f = s4*fi[k1+1] - c4*gi[k1-1];
  255. ; b = s2*fi[k1 ] - c2*gi[k1 ];
  256. ; c = c2*fi[k3 ] + s2*gi[k3 ];
  257. ; g = c4*fi[k3+1] + s4*gi[k3-1];
  258. ; h = s4*fi[k3+1] - c4*gi[k3-1];
  259. ; d = s2*fi[k3 ] - c2*gi[k3 ];
  260. movaps xmm4,xmm7 ; = {-c2 -c4 s4 s2}
  261. xorps xmm4,[PIC_EBP_REL(Q_MMPP)] ; = { c2 c4 s4 s2}
  262. shufps xmm4,xmm4,0x1B ; = { s2 s4 c4 c2}
  263. movlps xmm0,[esi+eax*2]
  264. movlps xmm1,[edi+eax*2]
  265. movlps xmm2,[esi+edx*2]
  266. movlps xmm3,[edi+edx*2]
  267. shufps xmm0,xmm0,0x14
  268. shufps xmm1,xmm1,0x41
  269. shufps xmm2,xmm2,0x14
  270. shufps xmm3,xmm3,0x41
  271. mulps xmm0,xmm4
  272. mulps xmm1,xmm7
  273. mulps xmm2,xmm4
  274. mulps xmm3,xmm7
  275. addps xmm0,xmm1 ; xmm0 = {b, f, e, a}
  276. addps xmm2,xmm3 ; xmm2 = {d, h, g, c}
  277. ;17
  278. ; f0 = fi[0 ] + a;
  279. ; f4 = fi[0 +1] + e;
  280. ; g4 = gi[0 -1] + f;
  281. ; g0 = gi[0 ] + b;
  282. ; f1 = fi[0 ] - a;
  283. ; f5 = fi[0 +1] - e;
  284. ; g5 = gi[0 -1] - f;
  285. ; g1 = gi[0 ] - b;
  286. ; f2 = fi[k2 ] + c;
  287. ; f6 = fi[k2+1] + g;
  288. ; g6 = gi[k2-1] + h;
  289. ; g2 = gi[k2 ] + d;
  290. ; f3 = fi[k2 ] - c;
  291. ; f7 = fi[k2+1] - g;
  292. ; g7 = gi[k2-1] - h;
  293. ; g3 = gi[k2 ] - d;
  294. movlps xmm1,[esi ]
  295. movhps xmm1,[edi ]
  296. movaps xmm4,xmm1
  297. subps xmm1,xmm0 ; xmm1 = {g1, g5, f5, f1}
  298. movlps xmm3,[esi+eax*4]
  299. movhps xmm3,[edi+eax*4]
  300. movaps xmm5,xmm3
  301. subps xmm3,xmm2 ; xmm3 = {g3, g7, f7, f3}
  302. addps xmm0,xmm4 ; xmm0 = {g0, g4, f4, f0}
  303. addps xmm2,xmm5 ; xmm2 = {g2, g6, f6, f2}
  304. ;10
  305. ; a = c1*f2 + s1*g3; 順*順 + 逆*逆
  306. ; e = c3*f6 + s3*g7;
  307. ; g = s3*g6 + c3*f7;
  308. ; c = s1*g2 + c1*f3;
  309. ; d = c1*g2 - s1*f3; 順*逆 - 逆*順
  310. ; h = c3*g6 - s3*f7;
  311. ; f = s3*f6 - c3*g7;
  312. ; b = s1*f2 - c1*g3;
  313. movaps xmm5,xmm6 ; xmm6 = {s1, s3, c3, c1}
  314. shufps xmm5,xmm5,0x1B ; = {c1, c3, s3, s1}
  315. movaps xmm4,xmm2
  316. mulps xmm4,xmm6
  317. shufps xmm2,xmm2,0x1B ; xmm2 = {f2, f6, g6, g2}
  318. mulps xmm2,xmm6
  319. mulps xmm5,xmm3
  320. mulps xmm3,xmm6
  321. shufps xmm3,xmm3,0x1B
  322. addps xmm4,xmm3 ; = {c, g, e, a}
  323. subps xmm2,xmm5 ; = {b, f, h, d}
  324. ;10
  325. ; fi[0 ] = f0 + a;
  326. ; fi[0 +1] = f4 + e;
  327. ; gi[0 -1] = g4 + g;
  328. ; gi[0 ] = g0 + c;
  329. ; fi[k2 ] = f0 - a;
  330. ; fi[k2+1] = f4 - e;
  331. ; gi[k2-1] = g4 - g;
  332. ; gi[k2 ] = g0 - c;
  333. ; fi[k1 ] = f1 + d;
  334. ; fi[k1+1] = f5 + h;
  335. ; gi[k1-1] = g5 + f;
  336. ; gi[k1 ] = g1 + b;
  337. ; fi[k3 ] = f1 - d;
  338. ; fi[k3+1] = f5 - h;
  339. ; gi[k3-1] = g5 - f;
  340. ; gi[k3 ] = g1 - b;
  341. movaps xmm3,xmm0
  342. subps xmm0,xmm4
  343. movlps [esi+eax*4],xmm0
  344. movhps [edi+eax*4],xmm0
  345. addps xmm4,xmm3
  346. movlps [esi ],xmm4
  347. movhps [edi ],xmm4
  348. movaps xmm5,xmm1
  349. subps xmm1,xmm2
  350. movlps [esi+edx*2],xmm1
  351. movhps [edi+edx*2],xmm1
  352. addps xmm2,xmm5
  353. movlps [esi+eax*2],xmm2
  354. movhps [edi+eax*2],xmm2
  355. ; 14
  356. ; gi += k4;
  357. ; fi += k4;
  358. lea edi,[edi + eax*8] ; gi += (k1 * 4);
  359. lea esi,[esi + eax*8] ; fi += (k1 * 4);
  360. cmp esi,[esp]
  361. jl near .lp220 ; while (fi<fn);
  362. ; } while (fi<fn);
  363. add ebx,byte 2*4 ; i+= 4
  364. cmp ebx,eax ; i < k1
  365. shufps xmm6,xmm6,R4(1,2,2,1) ; (--,s3,c3,--) => {c3, s3, s3, c3}
  366. jl near .lp22
  367. ; }
  368. .F22:
  369. shl eax,2
  370. add ecx, byte 8
  371. cmp eax,[esp+_P+8] ; while ((k1 * 4)<n);
  372. jle near .lp2
  373. pop ebp
  374. pop ebp
  375. pop edi
  376. pop esi
  377. pop ebx
  378. ret
  379. end