; choose_table.nas
  1. ; new count bit routine
  2. ; part of this code originated from
  3. ; new GOGO-no-coda (1999, 2000)
  4. ; Copyright (C) 1999 shigeo
  5. ; modified by Keiichi SAKAI
  6. %include "nasm.h"
  7. globaldef choose_table_MMX
  8. globaldef MMX_masking
  9. externdef largetbl
  10. externdef t1l
  11. externdef table23
  12. externdef table56
  ; Packed 16-bit constants loaded into MMX registers by the routines below.
  ; (segment_data comes from nasm.h, which is not shown here.)
  13. segment_data
  14. align 16
  ; 14 in each of the four 16-bit lanes; pcmpgtw operand used to flag
  ; values greater than 14.
  15. D14_14_14_14 dd 0x000E000E, 0x000E000E
  ; 0xfff0 in each lane; added with unsigned saturation (paddusw) so that a
  ; lane becomes min(ix, 15) + 0xfff0.
  16. D15_15_15_15 dd 0xfff0fff0, 0xfff0fff0
  ; pmaddwd weights: low word 16, high word 1, i.e. index = 16*x + y for
  ; each (x, y) pair of lanes.
  17. mul_add dd 0x00010010, 0x00010010
  ; pmaddwd weights {3, 1}: index = 3*x + y (loaded together with table23).
  18. mul_add23 dd 0x00010003, 0x00010003
  ; pmaddwd weights {4, 1}: index = 4*x + y (loaded together with table56).
  19. mul_add56 dd 0x00010004, 0x00010004
  ; tableDEF: 256 entries of two dwords (8 bytes), addressed by from3 as
  ; [tableDEF + (16*x + y)*8]; selected by table_MMX.L_case_8_15 together
  ; with base table number 13.
  ; Per entry, as consumed by from3:
  ;   first dword : high 16 bits = cost added for table base+1 ("sum1"),
  ;                 low  16 bits = cost added for table base+2 ("sum2")
  ;   second dword: cost added for table base+0 ("sum")
  ; NOTE(review): the costs are presumably code lengths in bits - confirm
  ; against the C tables these were generated from.
  20. tableDEF
  21. dd 0x00010003,0x01,0x00050005,0x05,0x00070006,0x07,0x00090008,0x08,0x000a0008, 0x09
  22. dd 0x000a0009,0x0a,0x000b000a,0x0a,0x000b000a,0x0b,0x000c000a,0x0a,0x000c000b, 0x0b
  23. dd 0x000c000b,0x0c,0x000d000c,0x0c,0x000d000c,0x0d,0x000d000c,0x0d,0x000e000d, 0x0e
  24. dd 0x000b000e,0x0e,0x00040005,0x04,0x00060005,0x06,0x00080007,0x08,0x00090008, 0x09
  25. dd 0x000a0009,0x0a,0x000b0009,0x0a,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0b
  26. dd 0x000c000b,0x0b,0x000c000b,0x0c,0x000d000c,0x0c,0x000e000c,0x0d,0x000d000c, 0x0e
  27. dd 0x000e000d,0x0e,0x000b000d,0x0e,0x00070006,0x07,0x00080007,0x08,0x00090007, 0x09
  28. dd 0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
  29. dd 0x000d000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000c,0x0d,0x000d000c, 0x0d
  30. dd 0x000e000d,0x0e,0x000e000d,0x0f,0x000c000d,0x0f,0x00090007,0x08,0x00090008, 0x09
  31. dd 0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
  32. dd 0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000c,0x0d,0x000e000c, 0x0d
  33. dd 0x000e000c,0x0d,0x000f000d,0x0e,0x000f000d,0x0f,0x000d000d,0x0f,0x000a0008, 0x09
  34. dd 0x000a0008,0x09,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
  35. dd 0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0c,0x000e000b,0x0d,0x000e000c, 0x0d
  36. dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d,0x0f,0x000c000d, 0x10
  37. dd 0x000a0009,0x0a,0x000a0009,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
  38. dd 0x000d000a,0x0c,0x000d000b,0x0d,0x000e000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
  39. dd 0x000e000c,0x0e,0x000f000c,0x0d,0x000f000d,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
  40. dd 0x000d000e,0x10,0x000b000a,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
  41. dd 0x000d000a,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
  42. dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
  43. dd 0x0010000e,0x10,0x000d000e,0x10,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0c
  44. dd 0x000c000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0e,0x000e000c, 0x0e
  45. dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0f,0x000f000c,0x0f,0x000f000d, 0x0f
  46. dd 0x0011000d,0x10,0x0011000d,0x12,0x000d000e,0x12,0x000b000a,0x0a,0x000c000a, 0x0a
  47. dd 0x000c000a,0x0b,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
  48. dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000d, 0x0f
  49. dd 0x0010000d,0x0f,0x0010000e,0x10,0x0010000e,0x11,0x000d000e,0x11,0x000c000a, 0x0b
  50. dd 0x000c000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
  51. dd 0x000e000c,0x0d,0x000f000c,0x0f,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
  52. dd 0x0010000d,0x10,0x000f000d,0x10,0x0010000e,0x10,0x000f000e,0x12,0x000e000e, 0x11
  53. dd 0x000c000b,0x0b,0x000d000b,0x0c,0x000c000b,0x0c,0x000d000b,0x0d,0x000e000c, 0x0d
  54. dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0e,0x0010000d, 0x0f
  55. dd 0x0010000d,0x10,0x0010000d,0x0f,0x0011000d,0x10,0x0011000e,0x11,0x0010000f, 0x12
  56. dd 0x000d000e,0x13,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b, 0x0d
  57. dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0e,0x0010000c,0x0e,0x0010000d, 0x0f
  58. dd 0x0010000d,0x0f,0x0010000d,0x0f,0x0010000d,0x10,0x0010000e,0x11,0x000f000e, 0x11
  59. dd 0x0010000e,0x11,0x000e000f,0x12,0x000d000c,0x0c,0x000e000c,0x0d,0x000e000b, 0x0d
  60. dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0f,0x000f000d,0x0e,0x000f000d, 0x0f
  61. dd 0x000f000d,0x10,0x0011000d,0x10,0x0010000d,0x11,0x0010000d,0x11,0x0010000e, 0x11
  62. dd 0x0010000e,0x12,0x0012000f,0x12,0x000e000f,0x12,0x000f000c,0x0d,0x000e000c, 0x0d
  63. dd 0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
  64. dd 0x0010000d,0x10,0x0010000d,0x10,0x0012000e,0x10,0x0011000e,0x10,0x0011000e, 0x11
  65. dd 0x0011000e,0x12,0x0013000e,0x11,0x0011000f,0x12,0x000e000f,0x12,0x000e000d, 0x0e
  66. dd 0x000f000d,0x0e,0x000d000d,0x0e,0x000e000d,0x0f,0x0010000d,0x0f,0x0010000d, 0x0f
  67. dd 0x000f000d,0x11,0x0010000d,0x10,0x0010000e,0x10,0x0011000e,0x13,0x0012000e, 0x11
  68. dd 0x0011000e,0x11,0x0013000f,0x11,0x0011000f,0x13,0x0010000e,0x12,0x000e000f, 0x12
  69. dd 0x000b000d,0x0d,0x000b000d,0x0e,0x000b000d,0x0f,0x000c000d,0x10,0x000c000d, 0x10
  70. dd 0x000d000d,0x10,0x000d000d,0x11,0x000d000e,0x10,0x000e000e,0x11,0x000e000e, 0x11
  71. dd 0x000e000e,0x12,0x000e000e,0x12,0x000e000f,0x15,0x000e000f,0x14,0x000e000f, 0x15
  72. dd 0x000c000f,0x12
  ; tableABC: two-dword (8-byte) entries in the same format as tableDEF,
  ; addressed by from3 as [base + (16*x + y)*8].  Two lookups share it:
  ;   table_MMX.L_case_67 uses base = tableABC        (base table number 10)
  ;   table_MMX.L_case_45 uses base = tableABC + 9*8  (base table number 7)
  ; so the second lookup starts nine entries into each 16-entry row.
  ; All-zero entries are filler between the two interleaved lookups.
  73. tableABC
  74. dd 0x00020004,0x1,0x00040004,0x4,0x00060006,0x7,0x00080008,0x9,0x00090009,0xa,0x000a000a,0xa
  75. dd 0x0009000a,0xa,0x000a000a,0xb,0x00000000,0x0,0x00020003,0x1,0x00040004,0x4,0x00070006,0x7
  76. dd 0x00090007,0x9,0x00090009,0x9,0x000a000a,0xa,0x00000000,0x0,0x00040004,0x4,0x00050005,0x6
  77. dd 0x00060006,0x8,0x00080007,0x9,0x000a0009,0xa,0x000a0009,0xb,0x0009000a,0xa,0x000a000a,0xa
  78. dd 0x00000000,0x0,0x00040004,0x4,0x00040005,0x6,0x00060006,0x8,0x000a0007,0x9,0x000a0008,0x9
  79. dd 0x000a000a,0xa,0x00000000,0x0,0x00060006,0x7,0x00070006,0x8,0x00080007,0x9,0x00090008,0xa
  80. dd 0x000a0009,0xb,0x000b000a,0xc,0x000a0009,0xb,0x000a000a,0xb,0x00000000,0x0,0x00070005,0x7
  81. dd 0x00060006,0x7,0x00080007,0x9,0x000a0008,0xa,0x000a0009,0xa,0x000b000a,0xb,0x00000000,0x0
  82. dd 0x00080007,0x8,0x00080007,0x9,0x00090008,0xa,0x000b0008,0xb,0x000a0009,0xc,0x000c000a,0xc
  83. dd 0x000a000a,0xb,0x000b000a,0xc,0x00000000,0x0,0x00090007,0x8,0x000a0007,0x9,0x000a0008,0xa
  84. dd 0x000b0009,0xb,0x000b0009,0xb,0x000c000a,0xb,0x00000000,0x0,0x00090008,0x9,0x000a0008,0xa
  85. dd 0x000a0009,0xb,0x000b0009,0xc,0x000b000a,0xc,0x000c000a,0xc,0x000b000a,0xc,0x000c000b,0xc
  86. dd 0x00000000,0x0,0x00090008,0x8,0x00090008,0x9,0x000a0009,0xa,0x000b0009,0xb,0x000c000a,0xb
  87. dd 0x000c000b,0xc,0x00000000,0x0,0x00090009,0xa,0x000a0009,0xb,0x000b000a,0xc,0x000c000a,0xc
  88. dd 0x000c000a,0xd,0x000d000b,0xd,0x000c000a,0xc,0x000d000b,0xd,0x00000000,0x0,0x000a0009,0x9
  89. dd 0x000a0009,0xa,0x000b000a,0xb,0x000b000a,0xc,0x000d000b,0xc,0x000d000b,0xc,0x00000000,0x0
  90. dd 0x00090009,0x9,0x00090009,0xa,0x00090009,0xb,0x000a000a,0xc,0x000b000a,0xc,0x000c000b,0xc
  91. dd 0x000c000b,0xd,0x000c000c,0xd,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
  92. dd 0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x0009000a,0xa,0x0009000a,0xa
  93. dd 0x000a000a,0xb,0x000b000b,0xc,0x000c000b,0xc,0x000c000b,0xd,0x000c000b,0xd,0x000c000c,0xd
  94. dd 0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
  95. dd 0x0,0x00000000, 0x0,0x00000000
  ; linbits32: 13 entries of two dwords, indexed by bsr(max - 15) in the
  ; .with_ESC path of choose_table_MMX and used as a pmaddwd operand against
  ; the count of values greater than 14.  Each dword repeats one 16-bit
  ; factor in both words: the product with the first dword is added to the
  ; low 16 bits of the packed total, the product with the second dword to
  ; the high 16 bits.
  ; NOTE(review): the source comment calls these factors "linbits".
  96. linbits32
  97. dd 0x00040004,0x10001,0x00040004,0x20002,0x00040004,0x30003,0x00040004,0x40004
  98. dd 0x00050005,0x60006,0x00060006,0x60006,0x00070007,0x80008,0x00080008,0x80008
  99. dd 0x00090009,0xa000a,0x000b000b,0xa000a,0x000b000b,0xd000d,0x000d000d,0xd000d
  100. dd 0x000d000d,0xd000d
  ; choose_table_H: 13 words indexed by bsr(max - 15).  The low byte is the
  ; value returned when the high-half total is not greater than the low-half
  ; total; otherwise the high byte is returned (see .chooseE_s1).
  101. choose_table_H
  102. dw 0x1810, 0x1811, 0x1812, 0x1813, 0x1914, 0x1a14, 0x1b15, 0x1c15
  103. dw 0x1d16, 0x1e16, 0x1e17, 0x1f17, 0x1f17
  ; choose_jump_table_L: 16 dword offsets, indexed by the scanned maximum
  ; (0..15).  Each entry is the distance from choose_table_MMX to a handler;
  ; choose_table_MMX adds its own address back before jumping, so the table
  ; holds no absolute addresses.
  104. choose_jump_table_L:
  105. dd table_MMX.L_case_0 - choose_table_MMX
  106. dd table_MMX.L_case_1 - choose_table_MMX
  107. dd table_MMX.L_case_2 - choose_table_MMX
  108. dd table_MMX.L_case_3 - choose_table_MMX
  ; max 4 and 5 share one handler
  109. dd table_MMX.L_case_45 - choose_table_MMX
  110. dd table_MMX.L_case_45 - choose_table_MMX
  ; max 6 and 7 share one handler
  111. dd table_MMX.L_case_67 - choose_table_MMX
  112. dd table_MMX.L_case_67 - choose_table_MMX
  ; max 8..15 share one handler
  113. dd table_MMX.L_case_8_15 - choose_table_MMX
  114. dd table_MMX.L_case_8_15 - choose_table_MMX
  115. dd table_MMX.L_case_8_15 - choose_table_MMX
  116. dd table_MMX.L_case_8_15 - choose_table_MMX
  117. dd table_MMX.L_case_8_15 - choose_table_MMX
  118. dd table_MMX.L_case_8_15 - choose_table_MMX
  119. dd table_MMX.L_case_8_15 - choose_table_MMX
  120. dd table_MMX.L_case_8_15 - choose_table_MMX
  segment_code
  ;
  ; use MMX
  ;
  PIC_OFFSETTABLE
        align   16
  ;-----------------------------------------------------------------------
  ; int choose_table(int *ix, int *end, int *s)
  ;
  ; 32-bit x86, NASM syntax, arguments on the stack.
  ;
  ; Scans the 32-bit values in [ix, end) for their maximum, then:
  ;   empty range       -> returns 0, *s untouched
  ;   max 0..15         -> jumps through choose_jump_table_L to one of the
  ;                        table_MMX.L_case_* handlers, entered with
  ;                        edx = end, ebp = PIC base, eax = max,
  ;                        [esp] = saved ebp
  ;   max 16..8191+15   -> .with_ESC: adds the smaller of two packed 16-bit
  ;                        totals to *s and returns the matching byte of
  ;                        choose_table_H
  ;   max > 8191+15     -> .with_ESC1: stores max in *s and returns -1
  ;
  ; In:    [esp+4] = ix, [esp+8] = end, [esp+12] = s  (at entry)
  ; Out:   eax
  ; Saves: ebp, ebx, esi.  Uses ecx, edx, flags, mm0-mm7; emms is executed
  ;        before every return in this block that follows MMX use.
  ; The loops consume 8 bytes (one pair) or 16 bytes per step and stop when
  ; the offset reaches exactly zero, so end - ix must be a multiple of
  ; 8 bytes.
  ; get_pc.bp, PIC_BASE(), PIC_EBP_REL() and pmov come from nasm.h (not
  ; shown here); presumably they set up and use ebp as the PIC data base.
  ;-----------------------------------------------------------------------
  choose_table_MMX:
        push    ebp
        call    get_pc.bp
        add     ebp, PIC_BASE()
        mov     ecx, [esp+8]            ; ecx = ix (begin)
        mov     edx, [esp+12]           ; edx = end
        sub     ecx, edx                ; ecx = ix - end: negative byte offset from end
        jz      .empty                  ; nothing to scan: do not read at [end]
        test    ecx, 8                  ; odd number of 8-byte pairs?
        pxor    mm0, mm0                ; mm0 = running max A = 0
        movq    mm1, [edx+ecx]          ; mm1 = running max B = first pair
        jz      .lp                     ; ZF still from TEST: MMX ops leave EFLAGS alone
        add     ecx, 8                  ; odd: first pair is already held in mm1
        jz      .exit                   ; ... and it was the only pair
        align   4
  .lp:
        movq    mm4, [edx+ecx]
        movq    mm5, [edx+ecx+8]
        add     ecx, 16                 ; sets ZF for the JNZ below
        ; max(a, b) per unsigned 16-bit lane = b + saturating(a - b).
        ; Author's note: strictly this should be a dword operation, but there
        ; is no such instruction :-p - it does no harm here because the
        ; values that matter are at most 8191+15.
        psubusw mm4, mm0
        psubusw mm5, mm1
        paddw   mm0, mm4                ; mm0 = max(mm0, pair at offset)
        paddw   mm1, mm5                ; mm1 = max(mm1, pair at offset+8)
        jnz     .lp
  .exit:
        psubusw mm1, mm0                ; (again word-wise where dword is meant)
        paddw   mm0, mm1                ; mm0 = max(mm0, mm1)
        movq    mm4, mm0
        punpckhdq mm4, mm4              ; mm4 = upper dword of mm0 in both halves
        psubusw mm4, mm0                ; (again word-wise where dword is meant)
        paddw   mm0, mm4                ; low dword of mm0 = max of both dwords
        movd    eax, mm0                ; eax = max
        cmp     eax, 15
        ja      .with_ESC
        ; max 0..15: handler address = choose_table_MMX + table offset
        lea     ecx, [PIC_EBP_REL(choose_table_MMX)]
        add     ecx, [PIC_EBP_REL(choose_jump_table_L+eax*4)]
        jmp     ecx
  .empty:
        xor     eax, eax                ; return 0; no MMX register was touched, so no emms
        pop     ebp
        ret
  .with_ESC1:
        ; max > 8191+15
        emms
        mov     ecx, [esp+16]           ; ecx = s
        mov     [ecx], eax              ; *s = max
        or      eax, -1                 ; return -1
        pop     ebp
        ret
  .with_ESC:
        cmp     eax, 8191+15
        ja      .with_ESC1
        sub     eax, 15                 ; 1..8191
        push    ebx
        push    esi
        bsr     eax, eax                ; eax = index of highest set bit of max-15 (0..12)
  ; _P = bytes pushed since the prologue (ebx, esi)
  %assign _P 4*2
        movq    mm5, [PIC_EBP_REL(D15_15_15_15)]
        movq    mm6, [PIC_EBP_REL(D14_14_14_14)]
        movq    mm3, [PIC_EBP_REL(mul_add)]
        mov     ecx, [esp+_P+8]         ; ecx = ix
        ; edx still holds end from the scan above
        sub     ecx, edx                ; ecx = negative byte offset from end
        xor     esi, esi                ; esi = packed sum = 0
        test    ecx, 8                  ; odd number of pairs?
        pxor    mm7, mm7                ; mm7 = linbits_sum: per-lane count of values above 14
        jz      .H_dual_lp1
        ; odd length: handle the first pair alone
        movq    mm0, [edx+ecx]
        add     ecx, 8                  ; ZF is consumed by the JZ after the table load
        packssdw mm0, mm7               ; dwords -> words, upper two lanes = 0
        movq    mm2, mm0
        paddusw mm0, mm5                ; mm0 = min(ix, 15) + 0xfff0, i.e. min(ix,15) - 16 as signed
        pcmpgtw mm2, mm6                ; lane = -1 where ix > 14
        psubw   mm7, mm2                ; linbits_sum++ for each such lane
        pmaddwd mm0, mm3                ; {0, 0, y, x}*{1, 16, 1, 16} = 16*x + y - (16*16+16)
        movd    ebx, mm0
        ; the displacement (16*16+16)*4 cancels the -16 bias of both lanes,
        ; so this reads the dword largetbl[16*x + y]
        mov     esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]
        jz      .H_dual_exit
        align   4
  .H_dual_lp1:
        movq    mm0, [edx+ecx]
        movq    mm1, [edx+ecx+8]
        packssdw mm0, mm1               ; four dwords -> four words
        movq    mm2, mm0
        paddusw mm0, mm5                ; mm0 = min(ix, 15) + 0xfff0
        pcmpgtw mm2, mm6                ; lane = -1 where ix > 14
        pmaddwd mm0, mm3                ; {y, x, y, x}*{1, 16, 1, 16}
        movd    ebx, mm0                ; index of the first pair
        punpckhdq mm0, mm0
        add     esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]
        movd    ebx, mm0                ; index of the second pair
        add     esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]
        add     ecx, 16                 ; sets ZF for the JNZ below
        psubw   mm7, mm2                ; linbits_sum++ for each lane above 14
        jnz     .H_dual_lp1
  .H_dual_exit:
        pmov    mm1, mm7
        punpckhdq mm7, mm7
        paddd   mm7, mm1                ; low dword = lanes 0,1 + lanes 2,3
        punpckldq mm7, mm7              ; replicate it into both dwords
        pmaddwd mm7, [PIC_EBP_REL(linbits32+eax*8)] ; count * linbits factors
        ; eax is 0..12 here, so a zero-extending load yields the same value
        ; as writing only ax, without the partial-register write
        movzx   eax, word [PIC_EBP_REL(choose_table_H+eax*2)]
        movd    ecx, mm7                ; product with the first dword of the entry
        punpckhdq mm7, mm7
        movd    edx, mm7                ; product with the second dword of the entry
        emms
        shl     edx, 16
        add     ecx, edx                ; pack both products like a largetbl entry
        add     ecx, esi                ; + accumulated table sums
        pop     esi
        pop     ebx
        mov     edx, ecx
        and     ecx, 0xffff             ; ecx = sum2 (low half)
        shr     edx, 16                 ; edx = sum  (high half)
        cmp     edx, ecx
        jle     .chooseE_s1             ; sum <= sum2: keep the low byte of the table word
        mov     edx, ecx                ; otherwise take sum2 ...
        shr     eax, 8                  ; ... and the high byte
  .chooseE_s1:
        mov     ecx, [esp+16]           ; ecx = s
        and     eax, 0xff               ; return value = selected byte
        add     [ecx], edx              ; *s += smaller total
        pop     ebp
        ret
  ; Handler for max == 0 (entry 0 of choose_jump_table_L).
  ; Entered by jmp from choose_table_MMX with eax = max (0) and the saved
  ; ebp on top of the stack.  Returns eax unchanged (0) and does not
  ; touch *s.
  246. table_MMX.L_case_0:
  247. emms
  248. pop ebp
  249. ret
  ;-----------------------------------------------------------------------
  ; Handler for max == 1 (entry 1 of choose_jump_table_L).
  ; Entered by jmp from choose_table_MMX with:
  ;   edx = end, ebp = PIC base,
  ;   [esp] = saved ebp, [esp+8] = ix, [esp+16] = s
  ; For every pair (x, y) of 32-bit values in [ix, end) adds the byte
  ; t1l[2*x + y] to *s.  Returns 1.
  ; Saves ebx; uses eax, ecx, flags.
  ;-----------------------------------------------------------------------
  table_MMX.L_case_1:
        push    ebx                     ; arguments are now 4 bytes further up
        mov     ecx, [esp+4+8]          ; ecx = ix
        mov     eax, [esp+4+16]         ; eax = s
        sub     ecx, edx                ; ecx = negative byte offset from end
        emms                            ; leave MMX state (only integer code follows)
  .next_pair:
        mov     ebx, [edx+ecx]          ; x
        lea     ebx, [ebx+ebx]          ; 2*x
        add     ebx, [edx+ecx+4]        ; 2*x + y
        movzx   ebx, byte [PIC_EBP_REL(ebx+t1l)]
        add     [eax], ebx              ; *s += t1l[2*x + y]
        add     ecx, 8                  ; next pair; zero once end is reached
        jnz     .next_pair
        pop     ebx
        pop     ebp
        mov     eax, 1                  ; return value
        ret
  ;-----------------------------------------------------------------------
  ; Handlers for max 4..15 (entries 4..15 of choose_jump_table_L): choose
  ; the cheapest of three consecutive tables.
  ;
  ; Entered by jmp from choose_table_MMX with:
  ;   edx = end, ebp = PIC base,
  ;   [esp] = saved ebp, [esp+8] = ix, [esp+16] = s
  ; Each stub pushes its base table number, points ecx at its lookup table
  ; and continues in from3.
  ;
  ; from3 sums, over every pair (x, y) of 32-bit values in [ix, end), the
  ; 8-byte entry at [ecx + (16*x + y)*8]:
  ;   low dword : high 16 bits -> sum1 (table base+1)
  ;               low  16 bits -> sum2 (table base+2)
  ;   high dword:              -> sum  (table base+0)
  ; It adds the smallest total to *s and returns base, base+1 or base+2.
  ; Saves ebx and ebp; uses eax, ecx, edx, flags, mm0, mm1, mm2, mm5
  ; (emms before returning).
  ;-----------------------------------------------------------------------
  table_MMX.L_case_45:
        push    dword 7                 ; base table number
        lea     ecx, [PIC_EBP_REL(tableABC+9*8)] ; lookup starts 9 entries into tableABC
        jmp     from3
  table_MMX.L_case_67:
        push    dword 10                ; base table number
        lea     ecx, [PIC_EBP_REL(tableABC)]
        jmp     from3
  table_MMX.L_case_8_15:
        push    dword 13                ; base table number
        lea     ecx, [PIC_EBP_REL(tableDEF)]
  from3:
        mov     eax, [esp+12]           ; eax = ix (arguments sit 4 bytes higher after the push)
        ; edx still holds end
        push    ebx
        sub     eax, edx                ; eax = negative byte offset from end
        movq    mm5, [PIC_EBP_REL(mul_add)] ; pmaddwd weights {16, 1}
        pxor    mm2, mm2                ; mm2 = packed sums = 0
        test    eax, 8                  ; odd number of pairs?
        jz      .choose3_lp1
        ; odd length: handle the first pair alone
        movq    mm0, [edx+eax]          ; mm0 = ix[0] | ix[1]
        add     eax, 8                  ; ZF is consumed by the JZ below (MMX ops keep EFLAGS)
        packssdw mm0, mm2               ; dwords -> words, upper lanes 0
        pmaddwd mm0, mm5                ; low dword = 16*ix[0] + ix[1]
        movd    ebx, mm0
        movq    mm2, [ecx+ebx*8]        ; sums = entry of the first pair
        jz      .choose3_exit
        align   4
  .choose3_lp1:
        movq    mm0, [edx+eax]
        movq    mm1, [edx+eax+8]
        add     eax, 16                 ; sets ZF for the JNZ at the bottom
        packssdw mm0, mm1               ; mm0 = ix[0]|ix[1]|ix[2]|ix[3]
        pmaddwd mm0, mm5                ; two indices: 16*ix[0]+ix[1], 16*ix[2]+ix[3]
        movd    ebx, mm0
        punpckhdq mm0, mm0
        paddd   mm2, [ecx+ebx*8]
        movd    ebx, mm0
        paddd   mm2, [ecx+ebx*8]
        jnz     .choose3_lp1
  .choose3_exit:
        ; eax == 0 here (the offset has reached end); it doubles as the
        ; initial "table offset 0" of the selection below.
        movd    ebx, mm2                ; ebx = sum1:sum2
        punpckhdq mm2, mm2
        mov     ecx, ebx
        and     ecx, 0xffff             ; ecx = sum2
        shr     ebx, 16                 ; ebx = sum1
        movd    edx, mm2                ; edx = sum
        cmp     edx, ebx
        jle     .choose3_s1             ; sum <= sum1: stay on offset 0
        mov     edx, ebx                ; best = sum1
        inc     eax                     ; offset 1
  .choose3_s1:
        emms
        pop     ebx
        cmp     edx, ecx
        jle     .choose3_s2             ; best <= sum2: keep the current offset
        mov     edx, ecx                ; best = sum2
        mov     eax, 2                  ; offset 2
  .choose3_s2:
        pop     ecx                     ; ecx = base table number pushed by the stub
        add     eax, ecx                ; return base + offset
        mov     ecx, [esp+16]           ; ecx = s
        add     [ecx], edx              ; *s += best total
        pop     ebp
        ret
  ;-----------------------------------------------------------------------
  ; Handlers for max 2 and 3 (entries 2 and 3 of choose_jump_table_L):
  ; choose the cheaper of two consecutive tables.
  ;
  ; Entered by jmp from choose_table_MMX with:
  ;   edx = end, ebp = PIC base,
  ;   [esp] = saved ebp, [esp+8] = ix, [esp+16] = s
  ; Each stub pushes its base table number, points ecx at its lookup table
  ; (table23 / table56, defined elsewhere), loads the pmaddwd weights into
  ; mm5 and continues in from2.
  ;
  ; from2 sums, over every pair (x, y) of 32-bit values in [ix, end), the
  ; dword at [ecx + index*4], where index = 3*x + y (table23) or 4*x + y
  ; (table56).  The high 16 bits of the total are sum1 (table base), the
  ; low 16 bits sum2 (table base+1).  It adds the smaller one to *s and
  ; returns base or base+1.
  ; Saves ebx, edi and ebp; uses eax, ecx, edx, flags, mm0, mm1, mm2, mm5
  ; (emms before returning).
  ;-----------------------------------------------------------------------
  table_MMX.L_case_2:
        push    dword 2                 ; base table number
        lea     ecx, [PIC_EBP_REL(table23)]
        pmov    mm5, [PIC_EBP_REL(mul_add23)] ; weights {3, 1}
        jmp     from2
  table_MMX.L_case_3:
        push    dword 5                 ; base table number
        lea     ecx, [PIC_EBP_REL(table56)]
        pmov    mm5, [PIC_EBP_REL(mul_add56)] ; weights {4, 1}
  from2:
        mov     eax, [esp+12]           ; eax = ix (arguments sit 4 bytes higher after the push)
        ; edx still holds end
        push    ebx
        push    edi
        sub     eax, edx                ; eax = negative byte offset from end
        xor     edi, edi                ; edi = packed sums = 0
        test    eax, 8                  ; odd number of pairs?
        jz      .choose2_lp1
        ; odd length: handle the first pair alone
        movq    mm0, [edx+eax]          ; mm0 = ix[0] | ix[1]
        pxor    mm2, mm2                ; zero, used to pad the pack below
        packssdw mm0, mm2               ; dwords -> words, upper lanes 0
        pmaddwd mm0, mm5                ; low dword = table index of the pair
        movd    ebx, mm0
        mov     edi, [ecx+ebx*4]        ; sums = entry of the first pair
        add     eax, 8
        jz      .choose2_exit           ; it was the only pair
        align   4
  .choose2_lp1:
        movq    mm0, [edx+eax]
        movq    mm1, [edx+eax+8]
        packssdw mm0, mm1               ; mm0 = ix[0]|ix[1]|ix[2]|ix[3]
        pmaddwd mm0, mm5                ; two table indices
        movd    ebx, mm0
        punpckhdq mm0, mm0
        add     edi, [ecx+ebx*4]
        movd    ebx, mm0
        add     edi, [ecx+ebx*4]
        add     eax, 16
        jnc     .choose2_lp1            ; carry is set when the negative offset crosses zero
  .choose2_exit:
        mov     ecx, edi
        pop     edi
        pop     ebx
        pop     eax                     ; eax = base table number pushed by the stub
        emms
        mov     edx, ecx
        and     ecx, 0xffff             ; ecx = sum2
        shr     edx, 16                 ; edx = sum1
        cmp     edx, ecx
        jle     .choose2_s1             ; sum1 <= sum2: keep the base table
        mov     edx, ecx                ; best = sum2
        inc     eax                     ; table base+1
  .choose2_s1:
        mov     ecx, [esp+16]           ; ecx = s
        add     [ecx], edx              ; *s += best total
        pop     ebp
        ret
  ; NOTE(review): trailing "end" kept from the original source; presumably
  ; it is handled by nasm.h - confirm, since NASM itself has no END directive.
  end