From 07162fa301835387e4da4278326ec1ff45d1dc3e Mon Sep 17 00:00:00 2001 From: Aki Date: Wed, 29 Sep 2021 21:52:57 +0200 Subject: Squashed 'zlib/' content from commit cacf7f1 git-subtree-dir: zlib git-subtree-split: cacf7f1d4e3d44d871b605da3b647f07d718623f --- contrib/masmx86/inffas32.asm | 1080 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1080 insertions(+) create mode 100644 contrib/masmx86/inffas32.asm (limited to 'contrib/masmx86/inffas32.asm') diff --git a/contrib/masmx86/inffas32.asm b/contrib/masmx86/inffas32.asm new file mode 100644 index 0000000..03d20f8 --- /dev/null +++ b/contrib/masmx86/inffas32.asm @@ -0,0 +1,1080 @@ +;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding +; * +; * inffas32.asm is derivated from inffas86.c, with translation of assembly code +; * +; * Copyright (C) 1995-2003 Mark Adler +; * For conditions of distribution and use, see copyright notice in zlib.h +; * +; * Copyright (C) 2003 Chris Anderson +; * Please use the copyright conditions above. +; * +; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from +; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at +; * the moment. I have successfully compiled and tested this code with gcc2.96, +; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S +; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX +; * enabled. I will attempt to merge the MMX code into this version. Newer +; * versions of this and inffast.S can be found at +; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ +; * +; * 2005 : modification by Gilles Vollant +; */ +; For Visual C++ 4.x and higher and ML 6.x and higher +; ml.exe is in directory \MASM611C of Win95 DDK +; ml.exe is also distributed in http://www.masm32.com/masmdl.htm +; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/ +; +; +; compile with command line option +; ml /coff /Zi /c /Flinffas32.lst inffas32.asm + +; if you define NO_GZIP (see inflate.h), compile with +; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm + + +; zlib122sup is 0 fort zlib 1.2.2.1 and lower +; zlib122sup is 8 fort zlib 1.2.2.2 and more (with addition of dmax and head +; in inflate_state in inflate.h) +zlib1222sup equ 8 + + +IFDEF GUNZIP + INFLATE_MODE_TYPE equ 11 + INFLATE_MODE_BAD equ 26 +ELSE + IFNDEF NO_GUNZIP + INFLATE_MODE_TYPE equ 11 + INFLATE_MODE_BAD equ 26 + ELSE + INFLATE_MODE_TYPE equ 3 + INFLATE_MODE_BAD equ 17 + ENDIF +ENDIF + + +; 75 "inffast.S" +;FILE "inffast.S" + +;;;GLOBAL _inflate_fast + +;;;SECTION .text + + + + .586p + .mmx + + name inflate_fast_x86 + .MODEL FLAT + +_DATA segment +inflate_fast_use_mmx: + dd 1 + + +_TEXT segment + + + +ALIGN 4 + db 'Fast decoding Code from Chris Anderson' + db 0 + +ALIGN 4 +invalid_literal_length_code_msg: + db 'invalid literal/length code' + db 0 + +ALIGN 4 +invalid_distance_code_msg: + db 'invalid distance code' + db 0 + +ALIGN 4 +invalid_distance_too_far_msg: + db 'invalid distance too far back' + db 0 + + +ALIGN 4 +inflate_fast_mask: +dd 0 +dd 1 +dd 3 +dd 7 +dd 15 +dd 31 +dd 63 +dd 127 +dd 255 +dd 511 +dd 1023 +dd 2047 +dd 4095 +dd 8191 +dd 16383 +dd 32767 +dd 65535 +dd 131071 +dd 262143 +dd 524287 +dd 1048575 +dd 2097151 +dd 4194303 +dd 8388607 +dd 16777215 +dd 33554431 +dd 67108863 +dd 134217727 +dd 268435455 +dd 536870911 +dd 1073741823 +dd 2147483647 +dd 4294967295 + + +mode_state equ 0 ;/* state->mode */ +wsize_state equ (32+zlib1222sup) ;/* state->wsize */ +write_state equ (36+4+zlib1222sup) ;/* state->write */ +window_state equ (40+4+zlib1222sup) ;/* state->window */ +hold_state equ (44+4+zlib1222sup) ;/* state->hold */ +bits_state equ (48+4+zlib1222sup) ;/* state->bits */ +lencode_state equ (64+4+zlib1222sup) ;/* state->lencode */ +distcode_state equ (68+4+zlib1222sup) ;/* state->distcode */ +lenbits_state equ (72+4+zlib1222sup) ;/* state->lenbits */ +distbits_state equ (76+4+zlib1222sup) ;/* state->distbits */ + + +;;SECTION .text +; 205 "inffast.S" +;GLOBAL inflate_fast_use_mmx + +;SECTION .data + + +; GLOBAL inflate_fast_use_mmx:object +;.size inflate_fast_use_mmx, 4 +; 226 "inffast.S" +;SECTION .text + +ALIGN 4 +_inflate_fast proc near +.FPO (16, 4, 0, 0, 1, 0) + push edi + push esi + push ebp + push ebx + pushfd + sub esp,64 + cld + + + + + mov esi, [esp+88] + mov edi, [esi+28] + + + + + + + + mov edx, [esi+4] + mov eax, [esi+0] + + add edx,eax + sub edx,11 + + mov [esp+44],eax + mov [esp+20],edx + + mov ebp, [esp+92] + mov ecx, [esi+16] + mov ebx, [esi+12] + + sub ebp,ecx + neg ebp + add ebp,ebx + + sub ecx,257 + add ecx,ebx + + mov [esp+60],ebx + mov [esp+40],ebp + mov [esp+16],ecx +; 285 "inffast.S" + mov eax, [edi+lencode_state] + mov ecx, [edi+distcode_state] + + mov [esp+8],eax + mov [esp+12],ecx + + mov eax,1 + mov ecx, [edi+lenbits_state] + shl eax,cl + dec eax + mov [esp+0],eax + + mov eax,1 + mov ecx, [edi+distbits_state] + shl eax,cl + dec eax + mov [esp+4],eax + + mov eax, [edi+wsize_state] + mov ecx, [edi+write_state] + mov edx, [edi+window_state] + + mov [esp+52],eax + mov [esp+48],ecx + mov [esp+56],edx + + mov ebp, [edi+hold_state] + mov ebx, [edi+bits_state] +; 321 "inffast.S" + mov esi, [esp+44] + mov ecx, [esp+20] + cmp ecx,esi + ja L_align_long + + add ecx,11 + sub ecx,esi + mov eax,12 + sub eax,ecx + lea edi, [esp+28] + rep movsb + mov ecx,eax + xor eax,eax + rep stosb + lea esi, [esp+28] + mov [esp+20],esi + jmp L_is_aligned + + +L_align_long: + test esi,3 + jz L_is_aligned + xor eax,eax + mov al, [esi] + inc esi + mov ecx,ebx + add ebx,8 + shl eax,cl + or ebp,eax + jmp L_align_long + +L_is_aligned: + mov edi, [esp+60] +; 366 "inffast.S" +L_check_mmx: + cmp dword ptr [inflate_fast_use_mmx],2 + je L_init_mmx + ja L_do_loop + + push eax + push ebx + push ecx + push edx + pushfd + mov eax, [esp] + xor dword ptr [esp],0200000h + + + + + popfd + pushfd + pop edx + xor edx,eax + jz L_dont_use_mmx + xor eax,eax + cpuid + cmp ebx,0756e6547h + jne L_dont_use_mmx + cmp ecx,06c65746eh + jne L_dont_use_mmx + cmp edx,049656e69h + jne L_dont_use_mmx + mov eax,1 + cpuid + shr eax,8 + and eax,15 + cmp eax,6 + jne L_dont_use_mmx + test edx,0800000h + jnz L_use_mmx + jmp L_dont_use_mmx +L_use_mmx: + mov dword ptr [inflate_fast_use_mmx],2 + jmp L_check_mmx_pop +L_dont_use_mmx: + mov dword ptr [inflate_fast_use_mmx],3 +L_check_mmx_pop: + pop edx + pop ecx + pop ebx + pop eax + jmp L_check_mmx +; 426 "inffast.S" +ALIGN 4 +L_do_loop: +; 437 "inffast.S" + cmp bl,15 + ja L_get_length_code + + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + +L_get_length_code: + mov edx, [esp+0] + mov ecx, [esp+8] + and edx,ebp + mov eax, [ecx+edx*4] + +L_dolen: + + + + + + + mov cl,ah + sub bl,ah + shr ebp,cl + + + + + + + test al,al + jnz L_test_for_length_base + + shr eax,16 + stosb + +L_while_test: + + + cmp [esp+16],edi + jbe L_break_loop + + cmp [esp+20],esi + ja L_do_loop + jmp L_break_loop + +L_test_for_length_base: +; 502 "inffast.S" + mov edx,eax + shr edx,16 + mov cl,al + + test al,16 + jz L_test_for_second_level_length + and cl,15 + jz L_save_len + cmp bl,cl + jae L_add_bits_to_len + + mov ch,cl + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + mov cl,ch + +L_add_bits_to_len: + mov eax,1 + shl eax,cl + dec eax + sub bl,cl + and eax,ebp + shr ebp,cl + add edx,eax + +L_save_len: + mov [esp+24],edx + + +L_decode_distance: +; 549 "inffast.S" + cmp bl,15 + ja L_get_distance_code + + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + +L_get_distance_code: + mov edx, [esp+4] + mov ecx, [esp+12] + and edx,ebp + mov eax, [ecx+edx*4] + + +L_dodist: + mov edx,eax + shr edx,16 + mov cl,ah + sub bl,ah + shr ebp,cl +; 584 "inffast.S" + mov cl,al + + test al,16 + jz L_test_for_second_level_dist + and cl,15 + jz L_check_dist_one + cmp bl,cl + jae L_add_bits_to_dist + + mov ch,cl + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + mov cl,ch + +L_add_bits_to_dist: + mov eax,1 + shl eax,cl + dec eax + sub bl,cl + and eax,ebp + shr ebp,cl + add edx,eax + jmp L_check_window + +L_check_window: +; 625 "inffast.S" + mov [esp+44],esi + mov eax,edi + sub eax, [esp+40] + + cmp eax,edx + jb L_clip_window + + mov ecx, [esp+24] + mov esi,edi + sub esi,edx + + sub ecx,3 + mov al, [esi] + mov [edi],al + mov al, [esi+1] + mov dl, [esi+2] + add esi,3 + mov [edi+1],al + mov [edi+2],dl + add edi,3 + rep movsb + + mov esi, [esp+44] + jmp L_while_test + +ALIGN 4 +L_check_dist_one: + cmp edx,1 + jne L_check_window + cmp [esp+40],edi + je L_check_window + + dec edi + mov ecx, [esp+24] + mov al, [edi] + sub ecx,3 + + mov [edi+1],al + mov [edi+2],al + mov [edi+3],al + add edi,4 + rep stosb + + jmp L_while_test + +ALIGN 4 +L_test_for_second_level_length: + + + + + test al,64 + jnz L_test_for_end_of_block + + mov eax,1 + shl eax,cl + dec eax + and eax,ebp + add eax,edx + mov edx, [esp+8] + mov eax, [edx+eax*4] + jmp L_dolen + +ALIGN 4 +L_test_for_second_level_dist: + + + + + test al,64 + jnz L_invalid_distance_code + + mov eax,1 + shl eax,cl + dec eax + and eax,ebp + add eax,edx + mov edx, [esp+12] + mov eax, [edx+eax*4] + jmp L_dodist + +ALIGN 4 +L_clip_window: +; 721 "inffast.S" + mov ecx,eax + mov eax, [esp+52] + neg ecx + mov esi, [esp+56] + + cmp eax,edx + jb L_invalid_distance_too_far + + add ecx,edx + cmp dword ptr [esp+48],0 + jne L_wrap_around_window + + sub eax,ecx + add esi,eax +; 749 "inffast.S" + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + +L_wrap_around_window: +; 793 "inffast.S" + mov eax, [esp+48] + cmp ecx,eax + jbe L_contiguous_in_window + + add esi, [esp+52] + add esi,eax + sub esi,ecx + sub ecx,eax + + + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi, [esp+56] + mov ecx, [esp+48] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + +L_contiguous_in_window: +; 836 "inffast.S" + add esi,eax + sub esi,ecx + + + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + +L_do_copy1: +; 862 "inffast.S" + mov ecx,eax + rep movsb + + mov esi, [esp+44] + jmp L_while_test +; 878 "inffast.S" +ALIGN 4 +L_init_mmx: + emms + + + + + + movd mm0,ebp + mov ebp,ebx +; 896 "inffast.S" + movd mm4,dword ptr [esp+0] + movq mm3,mm4 + movd mm5,dword ptr [esp+4] + movq mm2,mm5 + pxor mm1,mm1 + mov ebx, [esp+8] + jmp L_do_loop_mmx + +ALIGN 4 +L_do_loop_mmx: + psrlq mm0,mm1 + + cmp ebp,32 + ja L_get_length_code_mmx + + movd mm6,ebp + movd mm7,dword ptr [esi] + add esi,4 + psllq mm7,mm6 + add ebp,32 + por mm0,mm7 + +L_get_length_code_mmx: + pand mm4,mm0 + movd eax,mm4 + movq mm4,mm3 + mov eax, [ebx+eax*4] + +L_dolen_mmx: + movzx ecx,ah + movd mm1,ecx + sub ebp,ecx + + test al,al + jnz L_test_for_length_base_mmx + + shr eax,16 + stosb + +L_while_test_mmx: + + + cmp [esp+16],edi + jbe L_break_loop + + cmp [esp+20],esi + ja L_do_loop_mmx + jmp L_break_loop + +L_test_for_length_base_mmx: + + mov edx,eax + shr edx,16 + + test al,16 + jz L_test_for_second_level_length_mmx + and eax,15 + jz L_decode_distance_mmx + + psrlq mm0,mm1 + movd mm1,eax + movd ecx,mm0 + sub ebp,eax + and ecx, [inflate_fast_mask+eax*4] + add edx,ecx + +L_decode_distance_mmx: + psrlq mm0,mm1 + + cmp ebp,32 + ja L_get_dist_code_mmx + + movd mm6,ebp + movd mm7,dword ptr [esi] + add esi,4 + psllq mm7,mm6 + add ebp,32 + por mm0,mm7 + +L_get_dist_code_mmx: + mov ebx, [esp+12] + pand mm5,mm0 + movd eax,mm5 + movq mm5,mm2 + mov eax, [ebx+eax*4] + +L_dodist_mmx: + + movzx ecx,ah + mov ebx,eax + shr ebx,16 + sub ebp,ecx + movd mm1,ecx + + test al,16 + jz L_test_for_second_level_dist_mmx + and eax,15 + jz L_check_dist_one_mmx + +L_add_bits_to_dist_mmx: + psrlq mm0,mm1 + movd mm1,eax + movd ecx,mm0 + sub ebp,eax + and ecx, [inflate_fast_mask+eax*4] + add ebx,ecx + +L_check_window_mmx: + mov [esp+44],esi + mov eax,edi + sub eax, [esp+40] + + cmp eax,ebx + jb L_clip_window_mmx + + mov ecx,edx + mov esi,edi + sub esi,ebx + + sub ecx,3 + mov al, [esi] + mov [edi],al + mov al, [esi+1] + mov dl, [esi+2] + add esi,3 + mov [edi+1],al + mov [edi+2],dl + add edi,3 + rep movsb + + mov esi, [esp+44] + mov ebx, [esp+8] + jmp L_while_test_mmx + +ALIGN 4 +L_check_dist_one_mmx: + cmp ebx,1 + jne L_check_window_mmx + cmp [esp+40],edi + je L_check_window_mmx + + dec edi + mov ecx,edx + mov al, [edi] + sub ecx,3 + + mov [edi+1],al + mov [edi+2],al + mov [edi+3],al + add edi,4 + rep stosb + + mov ebx, [esp+8] + jmp L_while_test_mmx + +ALIGN 4 +L_test_for_second_level_length_mmx: + test al,64 + jnz L_test_for_end_of_block + + and eax,15 + psrlq mm0,mm1 + movd ecx,mm0 + and ecx, [inflate_fast_mask+eax*4] + add ecx,edx + mov eax, [ebx+ecx*4] + jmp L_dolen_mmx + +ALIGN 4 +L_test_for_second_level_dist_mmx: + test al,64 + jnz L_invalid_distance_code + + and eax,15 + psrlq mm0,mm1 + movd ecx,mm0 + and ecx, [inflate_fast_mask+eax*4] + mov eax, [esp+12] + add ecx,ebx + mov eax, [eax+ecx*4] + jmp L_dodist_mmx + +ALIGN 4 +L_clip_window_mmx: + + mov ecx,eax + mov eax, [esp+52] + neg ecx + mov esi, [esp+56] + + cmp eax,ebx + jb L_invalid_distance_too_far + + add ecx,ebx + cmp dword ptr [esp+48],0 + jne L_wrap_around_window_mmx + + sub eax,ecx + add esi,eax + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + +L_wrap_around_window_mmx: + + mov eax, [esp+48] + cmp ecx,eax + jbe L_contiguous_in_window_mmx + + add esi, [esp+52] + add esi,eax + sub esi,ecx + sub ecx,eax + + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi, [esp+56] + mov ecx, [esp+48] + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + +L_contiguous_in_window_mmx: + + add esi,eax + sub esi,ecx + + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + +L_do_copy1_mmx: + + + mov ecx,edx + rep movsb + + mov esi, [esp+44] + mov ebx, [esp+8] + jmp L_while_test_mmx +; 1174 "inffast.S" +L_invalid_distance_code: + + + + + + mov ecx, invalid_distance_code_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_test_for_end_of_block: + + + + + + test al,32 + jz L_invalid_literal_length_code + + mov ecx,0 + mov edx,INFLATE_MODE_TYPE + jmp L_update_stream_state + +L_invalid_literal_length_code: + + + + + + mov ecx, invalid_literal_length_code_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_invalid_distance_too_far: + + + + mov esi, [esp+44] + mov ecx, invalid_distance_too_far_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_update_stream_state: + + mov eax, [esp+88] + test ecx,ecx + jz L_skip_msg + mov [eax+24],ecx +L_skip_msg: + mov eax, [eax+28] + mov [eax+mode_state],edx + jmp L_break_loop + +ALIGN 4 +L_break_loop: +; 1243 "inffast.S" + cmp dword ptr [inflate_fast_use_mmx],2 + jne L_update_next_in + + + + mov ebx,ebp + +L_update_next_in: +; 1266 "inffast.S" + mov eax, [esp+88] + mov ecx,ebx + mov edx, [eax+28] + shr ecx,3 + sub esi,ecx + shl ecx,3 + sub ebx,ecx + mov [eax+12],edi + mov [edx+bits_state],ebx + mov ecx,ebx + + lea ebx, [esp+28] + cmp [esp+20],ebx + jne L_buf_not_used + + sub esi,ebx + mov ebx, [eax+0] + mov [esp+20],ebx + add esi,ebx + mov ebx, [eax+4] + sub ebx,11 + add [esp+20],ebx + +L_buf_not_used: + mov [eax+0],esi + + mov ebx,1 + shl ebx,cl + dec ebx + + + + + + cmp dword ptr [inflate_fast_use_mmx],2 + jne L_update_hold + + + + psrlq mm0,mm1 + movd ebp,mm0 + + emms + +L_update_hold: + + + + and ebp,ebx + mov [edx+hold_state],ebp + + + + + mov ebx, [esp+20] + cmp ebx,esi + jbe L_last_is_smaller + + sub ebx,esi + add ebx,11 + mov [eax+4],ebx + jmp L_fixup_out +L_last_is_smaller: + sub esi,ebx + neg esi + add esi,11 + mov [eax+4],esi + + + + +L_fixup_out: + + mov ebx, [esp+16] + cmp ebx,edi + jbe L_end_is_smaller + + sub ebx,edi + add ebx,257 + mov [eax+16],ebx + jmp L_done +L_end_is_smaller: + sub edi,ebx + neg edi + add edi,257 + mov [eax+16],edi + + + + + +L_done: + add esp,64 + popfd + pop ebx + pop ebp + pop esi + pop edi + ret +_inflate_fast endp + +_TEXT ends +end -- cgit v1.1