| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447 |
- ; new count bit routine
- ; part of this code originates from
- ; new GOGO-no-coda (1999, 2000)
- ; Copyright (C) 1999 shigeo
- ; modified by Keiichi SAKAI
- %include "nasm.h"
- globaldef choose_table_MMX
- globaldef MMX_masking
- externdef largetbl
- externdef t1l
- externdef table23
- externdef table56
- segment_data
- align 16
- ; Packed-word constants loaded into MMX registers by choose_table_MMX.
- ; Four words of 14: pcmpgtw threshold used to flag values greater than 14.
- D14_14_14_14 dd 0x000E000E, 0x000E000E
- ; Four words of 0xfff0 (despite the name): added with unsigned saturation
- ; (paddusw) so that each word becomes min(value, 15) + 0xfff0.
- D15_15_15_15 dd 0xfff0fff0, 0xfff0fff0
- ; pmaddwd multipliers, low word first: (16, 1) gives x*16 + y per word pair.
- mul_add dd 0x00010010, 0x00010010
- ; (3, 1) gives x*3 + y; loaded together with table23.
- mul_add23 dd 0x00010003, 0x00010003
- ; (4, 1) gives x*4 + y; loaded together with table56.
- mul_add56 dd 0x00010004, 0x00010004
- ; tableDEF: entries of two dwords (8 bytes), read by from3 as
- ; [tableDEF + index*8] with index = x*16 + y when the maximum value is 8..15
- ; (base number 13).
- ; Entry layout as consumed by from3: the high dword is the count for the base
- ; number, the high word of the low dword is the count for base + 1, and the
- ; low word of the low dword is the count for base + 2.
- ; NOTE(review): presumably Huffman code lengths per (x, y) pair -- confirm
- ; against the C-side tables.
- tableDEF
- dd 0x00010003,0x01,0x00050005,0x05,0x00070006,0x07,0x00090008,0x08,0x000a0008, 0x09
- dd 0x000a0009,0x0a,0x000b000a,0x0a,0x000b000a,0x0b,0x000c000a,0x0a,0x000c000b, 0x0b
- dd 0x000c000b,0x0c,0x000d000c,0x0c,0x000d000c,0x0d,0x000d000c,0x0d,0x000e000d, 0x0e
- dd 0x000b000e,0x0e,0x00040005,0x04,0x00060005,0x06,0x00080007,0x08,0x00090008, 0x09
- dd 0x000a0009,0x0a,0x000b0009,0x0a,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0b
- dd 0x000c000b,0x0b,0x000c000b,0x0c,0x000d000c,0x0c,0x000e000c,0x0d,0x000d000c, 0x0e
- dd 0x000e000d,0x0e,0x000b000d,0x0e,0x00070006,0x07,0x00080007,0x08,0x00090007, 0x09
- dd 0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
- dd 0x000d000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000c,0x0d,0x000d000c, 0x0d
- dd 0x000e000d,0x0e,0x000e000d,0x0f,0x000c000d,0x0f,0x00090007,0x08,0x00090008, 0x09
- dd 0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
- dd 0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000c,0x0d,0x000e000c, 0x0d
- dd 0x000e000c,0x0d,0x000f000d,0x0e,0x000f000d,0x0f,0x000d000d,0x0f,0x000a0008, 0x09
- dd 0x000a0008,0x09,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
- dd 0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0c,0x000e000b,0x0d,0x000e000c, 0x0d
- dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d,0x0f,0x000c000d, 0x10
- dd 0x000a0009,0x0a,0x000a0009,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
- dd 0x000d000a,0x0c,0x000d000b,0x0d,0x000e000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
- dd 0x000e000c,0x0e,0x000f000c,0x0d,0x000f000d,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
- dd 0x000d000e,0x10,0x000b000a,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
- dd 0x000d000a,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
- dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
- dd 0x0010000e,0x10,0x000d000e,0x10,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0c
- dd 0x000c000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0e,0x000e000c, 0x0e
- dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0f,0x000f000c,0x0f,0x000f000d, 0x0f
- dd 0x0011000d,0x10,0x0011000d,0x12,0x000d000e,0x12,0x000b000a,0x0a,0x000c000a, 0x0a
- dd 0x000c000a,0x0b,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
- dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000d, 0x0f
- dd 0x0010000d,0x0f,0x0010000e,0x10,0x0010000e,0x11,0x000d000e,0x11,0x000c000a, 0x0b
- dd 0x000c000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
- dd 0x000e000c,0x0d,0x000f000c,0x0f,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
- dd 0x0010000d,0x10,0x000f000d,0x10,0x0010000e,0x10,0x000f000e,0x12,0x000e000e, 0x11
- dd 0x000c000b,0x0b,0x000d000b,0x0c,0x000c000b,0x0c,0x000d000b,0x0d,0x000e000c, 0x0d
- dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0e,0x0010000d, 0x0f
- dd 0x0010000d,0x10,0x0010000d,0x0f,0x0011000d,0x10,0x0011000e,0x11,0x0010000f, 0x12
- dd 0x000d000e,0x13,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b, 0x0d
- dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0e,0x0010000c,0x0e,0x0010000d, 0x0f
- dd 0x0010000d,0x0f,0x0010000d,0x0f,0x0010000d,0x10,0x0010000e,0x11,0x000f000e, 0x11
- dd 0x0010000e,0x11,0x000e000f,0x12,0x000d000c,0x0c,0x000e000c,0x0d,0x000e000b, 0x0d
- dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0f,0x000f000d,0x0e,0x000f000d, 0x0f
- dd 0x000f000d,0x10,0x0011000d,0x10,0x0010000d,0x11,0x0010000d,0x11,0x0010000e, 0x11
- dd 0x0010000e,0x12,0x0012000f,0x12,0x000e000f,0x12,0x000f000c,0x0d,0x000e000c, 0x0d
- dd 0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
- dd 0x0010000d,0x10,0x0010000d,0x10,0x0012000e,0x10,0x0011000e,0x10,0x0011000e, 0x11
- dd 0x0011000e,0x12,0x0013000e,0x11,0x0011000f,0x12,0x000e000f,0x12,0x000e000d, 0x0e
- dd 0x000f000d,0x0e,0x000d000d,0x0e,0x000e000d,0x0f,0x0010000d,0x0f,0x0010000d, 0x0f
- dd 0x000f000d,0x11,0x0010000d,0x10,0x0010000e,0x10,0x0011000e,0x13,0x0012000e, 0x11
- dd 0x0011000e,0x11,0x0013000f,0x11,0x0011000f,0x13,0x0010000e,0x12,0x000e000f, 0x12
- dd 0x000b000d,0x0d,0x000b000d,0x0e,0x000b000d,0x0f,0x000c000d,0x10,0x000c000d, 0x10
- dd 0x000d000d,0x10,0x000d000d,0x11,0x000d000e,0x10,0x000e000e,0x11,0x000e000e, 0x11
- dd 0x000e000e,0x12,0x000e000e,0x12,0x000e000f,0x15,0x000e000f,0x14,0x000e000f, 0x15
- dd 0x000c000f,0x12
- ; tableABC: entries of two dwords (8 bytes) with the same layout that from3
- ; expects (high dword, then high and low words of the low dword, matching
- ; base + 0, + 1 and + 2). It is read through two base addresses, both indexed
- ; by x*16 + y:
- ;   tableABC        with base number 10 (maximum value 6..7)
- ;   tableABC + 9*8  with base number 7  (maximum value 4..5)
- ; so the two sets of entries are interleaved, separated by zero entries.
- ; NOTE(review): presumably Huffman code lengths -- confirm against the C tables.
- tableABC
- dd 0x00020004,0x1,0x00040004,0x4,0x00060006,0x7,0x00080008,0x9,0x00090009,0xa,0x000a000a,0xa
- dd 0x0009000a,0xa,0x000a000a,0xb,0x00000000,0x0,0x00020003,0x1,0x00040004,0x4,0x00070006,0x7
- dd 0x00090007,0x9,0x00090009,0x9,0x000a000a,0xa,0x00000000,0x0,0x00040004,0x4,0x00050005,0x6
- dd 0x00060006,0x8,0x00080007,0x9,0x000a0009,0xa,0x000a0009,0xb,0x0009000a,0xa,0x000a000a,0xa
- dd 0x00000000,0x0,0x00040004,0x4,0x00040005,0x6,0x00060006,0x8,0x000a0007,0x9,0x000a0008,0x9
- dd 0x000a000a,0xa,0x00000000,0x0,0x00060006,0x7,0x00070006,0x8,0x00080007,0x9,0x00090008,0xa
- dd 0x000a0009,0xb,0x000b000a,0xc,0x000a0009,0xb,0x000a000a,0xb,0x00000000,0x0,0x00070005,0x7
- dd 0x00060006,0x7,0x00080007,0x9,0x000a0008,0xa,0x000a0009,0xa,0x000b000a,0xb,0x00000000,0x0
- dd 0x00080007,0x8,0x00080007,0x9,0x00090008,0xa,0x000b0008,0xb,0x000a0009,0xc,0x000c000a,0xc
- dd 0x000a000a,0xb,0x000b000a,0xc,0x00000000,0x0,0x00090007,0x8,0x000a0007,0x9,0x000a0008,0xa
- dd 0x000b0009,0xb,0x000b0009,0xb,0x000c000a,0xb,0x00000000,0x0,0x00090008,0x9,0x000a0008,0xa
- dd 0x000a0009,0xb,0x000b0009,0xc,0x000b000a,0xc,0x000c000a,0xc,0x000b000a,0xc,0x000c000b,0xc
- dd 0x00000000,0x0,0x00090008,0x8,0x00090008,0x9,0x000a0009,0xa,0x000b0009,0xb,0x000c000a,0xb
- dd 0x000c000b,0xc,0x00000000,0x0,0x00090009,0xa,0x000a0009,0xb,0x000b000a,0xc,0x000c000a,0xc
- dd 0x000c000a,0xd,0x000d000b,0xd,0x000c000a,0xc,0x000d000b,0xd,0x00000000,0x0,0x000a0009,0x9
- dd 0x000a0009,0xa,0x000b000a,0xb,0x000b000a,0xc,0x000d000b,0xc,0x000d000b,0xc,0x00000000,0x0
- dd 0x00090009,0x9,0x00090009,0xa,0x00090009,0xb,0x000a000a,0xc,0x000b000a,0xc,0x000c000b,0xc
- dd 0x000c000b,0xd,0x000c000c,0xd,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
- dd 0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x0009000a,0xa,0x0009000a,0xa
- dd 0x000a000a,0xb,0x000b000b,0xc,0x000c000b,0xc,0x000c000b,0xd,0x000c000b,0xd,0x000c000c,0xd
- dd 0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
- dd 0x0,0x00000000, 0x0,0x00000000
- ; linbits32: 13 entries of two dwords, read as [linbits32 + k*8] with
- ; k = bsr(max - 15) in the .with_ESC path. Each dword repeats one 16-bit factor
- ; in both words so that pmaddwd multiplies and sums the two per-lane counters
- ; of values greater than 14. The low-dword product goes into the low 16 bits
- ; of the packed sum (paired with the high byte of choose_table_H[k]); the
- ; high-dword product goes into the high 16 bits (paired with the low byte).
- linbits32
- dd 0x00040004,0x10001,0x00040004,0x20002,0x00040004,0x30003,0x00040004,0x40004
- dd 0x00050005,0x60006,0x00060006,0x60006,0x00070007,0x80008,0x00080008,0x80008
- dd 0x00090009,0xa000a,0x000b000b,0xa000a,0x000b000b,0xd000d,0x000d000d,0xd000d
- dd 0x000d000d,0xd000d
- ; choose_table_H: 13 words indexed by k = bsr(max - 15) in the .with_ESC path.
- ; The low byte is the value returned when the high-half sum is not greater
- ; than the low-half sum; otherwise the high byte is returned.
- choose_table_H
- dw 0x1810, 0x1811, 0x1812, 0x1813, 0x1914, 0x1a14, 0x1b15, 0x1c15
- dw 0x1d16, 0x1e16, 0x1e17, 0x1f17, 0x1f17
- ; choose_jump_table_L: 16 dword offsets, relative to choose_table_MMX, of the
- ; handler for each maximum value 0..15. choose_table_MMX adds the selected
- ; offset to its own address and jumps there, which keeps the table
- ; position-independent.
- choose_jump_table_L:
- dd table_MMX.L_case_0 - choose_table_MMX
- dd table_MMX.L_case_1 - choose_table_MMX
- dd table_MMX.L_case_2 - choose_table_MMX
- dd table_MMX.L_case_3 - choose_table_MMX
- dd table_MMX.L_case_45 - choose_table_MMX
- dd table_MMX.L_case_45 - choose_table_MMX
- dd table_MMX.L_case_67 - choose_table_MMX
- dd table_MMX.L_case_67 - choose_table_MMX
- dd table_MMX.L_case_8_15 - choose_table_MMX
- dd table_MMX.L_case_8_15 - choose_table_MMX
- dd table_MMX.L_case_8_15 - choose_table_MMX
- dd table_MMX.L_case_8_15 - choose_table_MMX
- dd table_MMX.L_case_8_15 - choose_table_MMX
- dd table_MMX.L_case_8_15 - choose_table_MMX
- dd table_MMX.L_case_8_15 - choose_table_MMX
- dd table_MMX.L_case_8_15 - choose_table_MMX
- segment_code
- ;
- ; use MMX
- ;
- PIC_OFFSETTABLE
- align 16
- ; int choose_table(int *ix, int *end, int *s)
- ;
- ; Scans the dwords in [ix, end) for their maximum value and dispatches on it:
- ;   maximum at most 15      : jumps through choose_jump_table_L to one of the
- ;                             table_MMX.L_case_* handlers
- ;   maximum at most 8191+15 : continues at .with_ESC
- ;   anything larger         : stores the maximum to *s and returns -1
- ; Arguments are read from the stack: after ebp is pushed, [esp+8] = ix,
- ; [esp+12] = end, [esp+16] = s. edx (= end) and ebp stay live for the handlers.
- ; NOTE(review): the range is walked 8 bytes at a time, so it presumably holds
- ; an even number of dwords; ix == end looks unhandled (the loop would start at
- ; offset 0 and step to 16) -- confirm callers never pass an empty range.
- choose_table_MMX:
- push ebp
- call get_pc.bp ; get_pc.bp / PIC_BASE() are not defined here (see nasm.h); presumably they leave the PIC base in ebp
- add ebp, PIC_BASE()
- mov ecx,[esp+8] ;ecx = begin
- mov edx,[esp+12] ;edx = end
- sub ecx,edx ;ecx = begin-end (negative byte offset, counted up to 0)
- test ecx,8 ; ZF clear means an odd number of 8-byte pairs
- pxor mm0,mm0 ;mm0=[0:0]
- movq mm1,[edx+ecx] ; mm1 = first pair (MMX ops leave the flags from test intact)
- jz .lp
- add ecx,8 ; odd count: the first pair is already in mm1, skip it
- jz .exit
- align 4
- .lp:
- movq mm4,[edx+ecx]
- movq mm5,[edx+ecx+8]
- add ecx,16
- psubusw mm4,mm0 ; psubusw + paddw = per-word unsigned max; strictly this should be a dword operation,
- psubusw mm5,mm1 ; but no such instruction exists :-p
- paddw mm0,mm4 ; the values handled here are at most 8191+15, so word-wise is not a problem
- paddw mm1,mm5
- jnz .lp ; flags are still those of add ecx,16
- .exit:
- psubusw mm1,mm0 ; mm0 = max(mm0, mm1); this too should strictly be a dword operation
- paddw mm0,mm1
- movq mm4,mm0
- punpckhdq mm4,mm4 ; copy the high dword over the low dword
- psubusw mm4,mm0 ; fold the two halves; this too should strictly be a dword operation
- paddw mm0,mm4
- movd eax,mm0 ; eax = maximum value found
- cmp eax,15
- ja .with_ESC
- lea ecx,[PIC_EBP_REL(choose_table_MMX)]
- add ecx,[PIC_EBP_REL(choose_jump_table_L+eax*4)] ; table holds offsets relative to choose_table_MMX
- jmp ecx
- .with_ESC1:
- emms
- mov ecx, [esp+16] ; *s
- mov [ecx], eax ; *s = maximum (stored, not accumulated)
- or eax,-1 ; return -1
- pop ebp
- ret
- ; Maximum above 15. On entry: eax = maximum, edx = end, ebp = PIC base.
- ; For every pair (x, y), with both values clamped to 15, the largetbl entry at
- ; index x*16 + y is added to a running sum, and the values greater than 14 are
- ; counted. The counts are multiplied by the two factors of
- ; linbits32[bsr(max-15)] and added to the two 16-bit halves of the packed sum.
- ; The smaller half is added to *s and the matching byte of
- ; choose_table_H[bsr(max-15)] is returned in eax.
- ; NOTE(review): largetbl is external; its dword entries are presumably two
- ; packed 16-bit bit counts -- confirm.
- .with_ESC:
- cmp eax, 8191+15
- ja .with_ESC1 ; too large: report the maximum through *s and return -1
- sub eax,15
- push ebx
- push esi
- bsr eax, eax ; eax = index of the highest set bit of (max-15); indexes linbits32 and choose_table_H
- ; _P = bytes pushed since entry to this path (ebx and esi)
- %assign _P 4*2
- movq mm5, [PIC_EBP_REL(D15_15_15_15)]
- movq mm6, [PIC_EBP_REL(D14_14_14_14)]
- movq mm3, [PIC_EBP_REL(mul_add)]
- mov ecx, [esp+_P+8] ; = ix
- ; mov edx, [esp+_P+12] ; = end
- sub ecx, edx
- xor esi, esi ; sum = 0
- test ecx, 8
- pxor mm7, mm7 ; linbits_sum: per-word count of values exceeding 14
- jz .H_dual_lp1
- movq mm0, [edx+ecx] ; odd number of pairs: handle the first pair on its own
- add ecx,8 ; sets the flags consumed by jz .H_dual_exit below
- packssdw mm0,mm7
- movq mm2, mm0
- paddusw mm0, mm5 ; mm0 = min(ix, 15)+0xfff0
- pcmpgtw mm2, mm6 ; greater than 14?
- psubw mm7, mm2 ; when greater than 14, linbits_sum++;
- pmaddwd mm0, mm3 ; {0, 0, y, x}*{1, 16, 1, 16}
- movd ebx, mm0 ; biased index: the 0xfff0 words act as -16, so ebx = x*16 + y - (16*16+16)
- mov esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)] ; the displacement undoes the bias
- jz .H_dual_exit
- align 4
- .H_dual_lp1:
- movq mm0, [edx+ecx]
- movq mm1, [edx+ecx+8]
- packssdw mm0,mm1 ; four dwords packed to four words (two pairs)
- movq mm2, mm0
- paddusw mm0, mm5 ; mm0 = min(ix, 15)+0xfff0
- pcmpgtw mm2, mm6 ; greater than 14?
- pmaddwd mm0, mm3 ; {y, x, y, x}*{1, 16, 1, 16}
- movd ebx, mm0
- punpckhdq mm0,mm0
- add esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]
- movd ebx, mm0
- add esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]
- add ecx, 16
- psubw mm7, mm2 ; when greater than 14, linbits_sum++;
- jnz .H_dual_lp1 ; flags are still those of add ecx, 16
- .H_dual_exit:
- pmov mm1,mm7 ; pmov is not defined in this file; presumably a nasm.h macro for a 64-bit move
- punpckhdq mm7,mm7
- paddd mm7,mm1 ; low dword = sum of the low and high counter halves
- punpckldq mm7,mm7 ; replicate the combined counters into both dwords
- pmaddwd mm7, [PIC_EBP_REL(linbits32+eax*8)] ; linbits
- mov ax, [PIC_EBP_REL(choose_table_H+eax*2)] ; ax = two candidate return values (low byte, high byte)
- movd ecx, mm7
- punpckhdq mm7,mm7
- movd edx,mm7
- emms
- shl edx, 16
- add ecx, edx ; pack: high-dword product in bits 31..16, low-dword product in bits 15..0
- add ecx, esi ; add the packed largetbl sum
- pop esi
- pop ebx
- mov edx, ecx
- and ecx, 0xffff ; ecx = sum2
- shr edx, 16 ; edx = sum
- cmp edx, ecx
- jle .chooseE_s1
- mov edx, ecx ; sum2 is smaller: use it and select the high byte of ax
- shr eax, 8
- .chooseE_s1:
- mov ecx, [esp+16] ; *s
- and eax, 0xff ; return value = selected byte of the choose_table_H entry
- add [ecx], edx ; *s += smaller sum
- pop ebp
- ret
- ; Maximum is 0: returns with eax still holding the maximum (0) and leaves *s
- ; untouched.
- table_MMX.L_case_0:
- emms
- pop ebp
- ret
- ; Maximum is 1: for each pair adds the byte t1l[ix[0]*2 + ix[1]] to *s, then
- ; returns 1. On entry edx = end and ebp = PIC base, both set up by
- ; choose_table_MMX. t1l is an external byte table.
- table_MMX.L_case_1:
- emms
- mov eax, [esp+16] ; *s
- mov ecx, [esp+8] ; *ix
- sub ecx, edx ; negative byte offset from end, counted up to 0
- push ebx
- .lp:
- mov ebx, [edx+ecx]
- add ebx, ebx
- add ebx, [edx+ecx+4] ; ebx = ix[0]*2 + ix[1]
- movzx ebx, byte [PIC_EBP_REL(ebx+t1l)]
- add [eax], ebx
- add ecx, 8
- jnz .lp
- pop ebx
- mov eax, 1 ; return 1
- pop ebp
- ret
- ; Maximum 4..15. Each entry point pushes a base number and selects a table of
- ; 8-byte entries; from3 then sums the entries at index ix[0]*16 + ix[1] over
- ; all pairs. Every entry carries three counts (high dword, high word of the
- ; low dword, low word of the low dword). The smallest of the three totals is
- ; added to *s and base + 0, + 1 or + 2 is returned (the earlier candidate wins
- ; a tie). On entry edx = end and ebp = PIC base, set up by choose_table_MMX.
- table_MMX.L_case_45:
- push dword 7
- lea ecx, [PIC_EBP_REL(tableABC+9*8)] ; same table as L_case_67, entered 9 entries in
- jmp from3
- table_MMX.L_case_67:
- push dword 10
- lea ecx, [PIC_EBP_REL(tableABC)]
- jmp from3
- table_MMX.L_case_8_15:
- push dword 13
- lea ecx, [PIC_EBP_REL(tableDEF)]
- from3:
- mov eax,[esp+12] ;eax = *begin (the stack also holds the pushed base number and ebp)
- ; mov edx,[esp+16] ;edx = *end
- push ebx
- sub eax, edx
- movq mm5,[PIC_EBP_REL(mul_add)]
- pxor mm2,mm2 ;mm2 = sum
- test eax, 8
- jz .choose3_lp1
- ; odd length
- movq mm0,[edx+eax] ;mm0 = ix[0] | ix[1]
- add eax,8 ; sets the flags consumed by jz .choose3_exit below
- packssdw mm0,mm2
- pmaddwd mm0,mm5 ; low dword = ix[0]*16 + ix[1]
- movd ebx,mm0
- movq mm2, [ecx+ebx*8]
- jz .choose3_exit
- align 4
- .choose3_lp1
- movq mm0,[edx+eax]
- movq mm1,[edx+eax+8]
- add eax,16 ; these flags survive the MMX ops and movd until jnz
- packssdw mm0,mm1 ;mm0 = ix[0]|ix[1]|ix[2]|ix[3]
- pmaddwd mm0,mm5
- movd ebx,mm0
- punpckhdq mm0,mm0
- paddd mm2, [ecx+ebx*8]
- movd ebx,mm0
- paddd mm2, [ecx+ebx*8]
- jnz .choose3_lp1
- .choose3_exit
- ; xor eax,eax (not needed: eax is already 0 when this label is reached)
- movd ebx, mm2
- punpckhdq mm2,mm2
- mov ecx, ebx
- and ecx, 0xffff ; ecx = sum2
- shr ebx, 16 ; ebx = sum1
- movd edx, mm2 ; edx = sum
- cmp edx, ebx
- jle .choose3_s1
- mov edx, ebx
- inc eax ; eax = 1: sum1 is smaller than sum
- .choose3_s1:
- emms
- pop ebx
- cmp edx, ecx
- jle .choose3_s2
- mov edx, ecx
- mov eax, 2 ; sum2 is the smallest
- .choose3_s2:
- pop ecx ; base number pushed by the entry point
- add eax, ecx ; return base + index of the smallest total
- mov ecx, [esp+16] ; *s
- add [ecx], edx ; *s += smallest total
- pop ebp
- ret
- ; Maximum 2 or 3. Each entry point pushes a base number, selects an external
- ; dword table and a pmaddwd multiplier (x*3 + y for table23, x*4 + y for
- ; table56); from2 then sums the table entries over all pairs. Each sum packs
- ; two 16-bit totals; the smaller one is added to *s and base (high half) or
- ; base + 1 (low half) is returned, the high half winning a tie.
- ; On entry edx = end and ebp = PIC base, set up by choose_table_MMX.
- ; pmov is not defined in this file; presumably a nasm.h macro for a 64-bit move.
- table_MMX.L_case_2:
- push dword 2
- lea ecx,[PIC_EBP_REL(table23)]
- pmov mm5,[PIC_EBP_REL(mul_add23)]
- jmp from2
- table_MMX.L_case_3:
- push dword 5
- lea ecx,[PIC_EBP_REL(table56)]
- pmov mm5,[PIC_EBP_REL(mul_add56)]
- from2:
- mov eax,[esp+12] ;eax = *begin (the stack also holds the pushed base number and ebp)
- ; mov edx,[esp+16] ;edx = *end
- push ebx
- push edi
- sub eax, edx
- xor edi, edi ; edi = packed running sum
- test eax, 8
- jz .choose2_lp1
- ; odd length
- movq mm0,[edx+eax] ;mm0 = ix[0] | ix[1]
- pxor mm2,mm2 ;mm2 = sum
- packssdw mm0,mm2
- pmaddwd mm0,mm5 ; low dword = table index for the first pair
- movd ebx,mm0
- mov edi, [ecx+ebx*4]
- add eax,8
- jz .choose2_exit
- align 4
- .choose2_lp1
- movq mm0,[edx+eax]
- movq mm1,[edx+eax+8]
- packssdw mm0,mm1 ;mm0 = ix[0]|ix[1]|ix[2]|ix[3]
- pmaddwd mm0,mm5
- movd ebx,mm0
- punpckhdq mm0,mm0
- add edi, [ecx+ebx*4]
- movd ebx, mm0
- add edi, [ecx+ebx*4]
- add eax,16
- jnc .choose2_lp1 ; eax is a negative offset, so the add carries when it wraps to 0 at the end of the range
- .choose2_exit
- mov ecx, edi
- pop edi
- pop ebx
- pop eax ; table num.
- emms
- mov edx, ecx
- and ecx, 0xffff ; ecx = sum2
- shr edx, 16 ; edx = sum1
- cmp edx, ecx
- jle .choose2_s1
- mov edx, ecx
- inc eax ; sum2 is smaller: return base + 1
- .choose2_s1:
- mov ecx, [esp+16] ; *s
- add [ecx], edx ; *s += smaller total
- pop ebp
- ret
- end
|