From f2e1338eba568ac7caa0051d57f1f6a10ae81801 Mon Sep 17 00:00:00 2001 From: Yasha Jannoo Date: Sat, 14 Sep 2019 12:15:04 +0100 Subject: Update zlib 3rd party code to v1.2.11 zlib source code, version 1.2.11. https://www.zlib.net/. Include all the gymnastics involved in: - making sure we compile zlibstat and set the output files locations and filename to those expected by the starshatter setup. - turning off the ASM includes as they are invalid for the new versions of zlib (and crash when running). - matching the calling conventions of the Starshatter code by omitting ZLIB_WINAPI from the zlib preprocessor defines. - fix debug build compile error in zlibstatic: https://stackoverflow.com/questions/31867018/unresolved-external-symbol-except-handler4-common-in-visual-studio-2015 This fixes some string warnings. --- zlib/contrib/masmx86/bld_ml32.bat | 2 + zlib/contrib/masmx86/inffas32.asm | 1080 ++++++++++++++++++++++++++++++++ zlib/contrib/masmx86/inffas32.lst | 1224 +++++++++++++++++++++++++++++++++++++ zlib/contrib/masmx86/match686.asm | 479 +++++++++++++++ zlib/contrib/masmx86/match686.lst | 624 +++++++++++++++++++ zlib/contrib/masmx86/readme.txt | 27 + 6 files changed, 3436 insertions(+) create mode 100644 zlib/contrib/masmx86/bld_ml32.bat create mode 100644 zlib/contrib/masmx86/inffas32.asm create mode 100644 zlib/contrib/masmx86/inffas32.lst create mode 100644 zlib/contrib/masmx86/match686.asm create mode 100644 zlib/contrib/masmx86/match686.lst create mode 100644 zlib/contrib/masmx86/readme.txt (limited to 'zlib/contrib/masmx86') diff --git a/zlib/contrib/masmx86/bld_ml32.bat b/zlib/contrib/masmx86/bld_ml32.bat new file mode 100644 index 0000000..67e6a6a --- /dev/null +++ b/zlib/contrib/masmx86/bld_ml32.bat @@ -0,0 +1,2 @@ +ml /safeseh /coff /Zi /c /Flmatch686.lst match686.asm +ml /safeseh /coff /Zi /c /Flinffas32.lst inffas32.asm diff --git a/zlib/contrib/masmx86/inffas32.asm b/zlib/contrib/masmx86/inffas32.asm new file mode 100644 index 0000000..cb37a81 --- 
/dev/null +++ b/zlib/contrib/masmx86/inffas32.asm @@ -0,0 +1,1080 @@ +;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding +; * +; * inffas32.asm is derived from inffas86.c, with translation of assembly code +; * +; * Copyright (C) 1995-2003 Mark Adler +; * For conditions of distribution and use, see copyright notice in zlib.h +; * +; * Copyright (C) 2003 Chris Anderson +; * Please use the copyright conditions above. +; * +; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from +; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at +; * the moment. I have successfully compiled and tested this code with gcc2.96, +; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S +; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX +; * enabled. I will attempt to merge the MMX code into this version. Newer +; * versions of this and inffast.S can be found at +; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ +; * +; * 2005 : modification by Gilles Vollant +; */ +; For Visual C++ 4.x and higher and ML 6.x and higher +; ml.exe is in directory \MASM611C of Win95 DDK +; ml.exe is also distributed in http://www.masm32.com/masmdl.htm +; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/ +; +; +; compile with command line option +; ml /coff /Zi /c /Flinffas32.lst inffas32.asm + +; if you define NO_GZIP (see inflate.h), compile with +; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm + + +; zlib1222sup is 0 for zlib 1.2.2.1 and lower +; zlib1222sup is 8 for zlib 1.2.2.2 and more (with addition of dmax and head +; in inflate_state in inflate.h) +zlib1222sup equ 8 + + +; INFLATE_MODE_TYPE and INFLATE_MODE_BAD are the values this file writes to state->mode at end of block and on a decoding error. +; A build with NO_GUNZIP defined (and GUNZIP undefined) uses the pair 3/17; every other build uses 11/26. +; NOTE(review): presumably these mirror the inflate_mode enum in inflate.h -- confirm for the zlib version being built. +IFDEF GUNZIP + INFLATE_MODE_TYPE equ 11 + INFLATE_MODE_BAD equ 26 +ELSE + IFNDEF NO_GUNZIP + INFLATE_MODE_TYPE equ 11 + INFLATE_MODE_BAD equ 26 + ELSE + INFLATE_MODE_TYPE equ 3 + INFLATE_MODE_BAD equ 17 + ENDIF +ENDIF + + +; 75 "inffast.S" 
+;FILE "inffast.S" + +;;;GLOBAL _inflate_fast + +;;;SECTION .text + + + + .586p + .mmx + + name inflate_fast_x86 + .MODEL FLAT + +; MMX selection flag tested at L_check_mmx: below 2 (initially 1) = CPU not probed yet, +; 2 = use the MMX decode loop, 3 = use the integer decode loop. +_DATA segment +inflate_fast_use_mmx: + dd 1 + + +_TEXT segment + + + +ALIGN 4 + db 'Fast decoding Code from Chris Anderson' + db 0 + +ALIGN 4 +invalid_literal_length_code_msg: + db 'invalid literal/length code' + db 0 + +ALIGN 4 +invalid_distance_code_msg: + db 'invalid distance code' + db 0 + +ALIGN 4 +invalid_distance_too_far_msg: + db 'invalid distance too far back' + db 0 + + +; inflate_fast_mask[n] = (1 shl n) - 1 for n = 0..32; the MMX path indexes it to keep the low n bits of the accumulator. +ALIGN 4 +inflate_fast_mask: +dd 0 +dd 1 +dd 3 +dd 7 +dd 15 +dd 31 +dd 63 +dd 127 +dd 255 +dd 511 +dd 1023 +dd 2047 +dd 4095 +dd 8191 +dd 16383 +dd 32767 +dd 65535 +dd 131071 +dd 262143 +dd 524287 +dd 1048575 +dd 2097151 +dd 4194303 +dd 8388607 +dd 16777215 +dd 33554431 +dd 67108863 +dd 134217727 +dd 268435455 +dd 536870911 +dd 1073741823 +dd 2147483647 +dd 4294967295 + + +; Byte offsets of the inflate_state fields that the code below reads and writes. +; NOTE(review): these offsets are hard-coded, so they must match the inflate_state layout of the inflate.h being built. +; The commit message of this patch reports that the ASM sources are invalid for the new zlib version and crash -- confirm the layout before enabling this file. +mode_state equ 0 ;/* state->mode */ +wsize_state equ (32+zlib1222sup) ;/* state->wsize */ +write_state equ (36+4+zlib1222sup) ;/* state->write */ +window_state equ (40+4+zlib1222sup) ;/* state->window */ +hold_state equ (44+4+zlib1222sup) ;/* state->hold */ +bits_state equ (48+4+zlib1222sup) ;/* state->bits */ +lencode_state equ (64+4+zlib1222sup) ;/* state->lencode */ +distcode_state equ (68+4+zlib1222sup) ;/* state->distcode */ +lenbits_state equ (72+4+zlib1222sup) ;/* state->lenbits */ +distbits_state equ (76+4+zlib1222sup) ;/* state->distbits */ + + +;;SECTION .text +; 205 "inffast.S" +;GLOBAL inflate_fast_use_mmx + +;SECTION .data + + +; GLOBAL inflate_fast_use_mmx:object +;.size inflate_fast_use_mmx, 4 +; 226 "inffast.S" +;SECTION .text + +ALIGN 4 +; _inflate_fast -- hand tuned fast decoding loop (assembler version of inffast.c, see the file header). +; Stack arguments as seen after the prologue (5 pushes, 64 bytes of locals and the return address make 88): +; [esp+88] first argument, called strm in the comments below; [strm+28] is the inflate_state pointer. +; [esp+92] second argument (presumably the 'start' value of inffast.c -- confirm against the C prototype). +; strm fields touched: [strm+0], [strm+4] (input pointer and count), [strm+12], [strm+16] (output pointer and count), +; [strm+24] (receives the error message). NOTE(review): field names assumed from usage -- confirm against zlib.h. +; edi, esi, ebp, ebx and EFLAGS are saved here and restored at L_done; cld selects forward string operations. +_inflate_fast proc near +.FPO (16, 4, 0, 0, 1, 0) + push edi + push esi + push ebp + push ebx + pushfd + sub esp,64 + cld +; Local frame as used by the code below: +; [esp+0] mask (1 shl lenbits) - 1 / [esp+4] mask (1 shl distbits) - 1 +; [esp+8] state->lencode / [esp+12] state->distcode +; [esp+16] output limit = [strm+12] plus [strm+16] minus 257 +; [esp+20] input limit = [strm+0] plus [strm+4] minus 11 +; [esp+24] length stored at L_save_len / [esp+28] 12-byte scratch input buffer +; [esp+40] [strm+12] minus ([esp+92] minus [strm+16]); subtracted from edi before the distance check +; [esp+44] input pointer, also used to save esi during copies +; [esp+48] state->write / [esp+52] state->wsize / [esp+56] state->window +; [esp+60] initial output pointer ([strm+12]) + + + + + mov esi, [esp+88] + mov edi, [esi+28] + + + + + + + + mov edx, [esi+4] + mov eax, [esi+0] + + add edx,eax + sub edx,11 + + mov [esp+44],eax + mov [esp+20],edx + + mov ebp, [esp+92] + mov ecx, [esi+16] + mov ebx, 
[esi+12] + + sub ebp,ecx + neg ebp + add ebp,ebx + + sub ecx,257 + add ecx,ebx + + mov [esp+60],ebx + mov [esp+40],ebp + mov [esp+16],ecx +; 285 "inffast.S" +; Cache the two code table pointers and build the (1 shl bits) - 1 index masks for them. + mov eax, [edi+lencode_state] + mov ecx, [edi+distcode_state] + + mov [esp+8],eax + mov [esp+12],ecx + + mov eax,1 + mov ecx, [edi+lenbits_state] + shl eax,cl + dec eax + mov [esp+0],eax + + mov eax,1 + mov ecx, [edi+distbits_state] + shl eax,cl + dec eax + mov [esp+4],eax + +; Cache the window fields, then load the bit accumulator: ebp = state->hold, ebx = state->bits. + mov eax, [edi+wsize_state] + mov ecx, [edi+write_state] + mov edx, [edi+window_state] + + mov [esp+52],eax + mov [esp+48],ecx + mov [esp+56],edx + + mov ebp, [edi+hold_state] + mov ebx, [edi+bits_state] +; 321 "inffast.S" +; If the input limit is not above the input pointer, copy the remaining input bytes into the 12-byte +; scratch buffer at [esp+28], zero-fill the rest of it, and decode from that buffer instead. + mov esi, [esp+44] + mov ecx, [esp+20] + cmp ecx,esi + ja L_align_long + + add ecx,11 + sub ecx,esi + mov eax,12 + sub eax,ecx + lea edi, [esp+28] + rep movsb + mov ecx,eax + xor eax,eax + rep stosb + lea esi, [esp+28] + mov [esp+20],esi + jmp L_is_aligned + + +; Otherwise feed single input bytes into the accumulator until esi is 4-byte aligned. +L_align_long: + test esi,3 + jz L_is_aligned + xor eax,eax + mov al, [esi] + inc esi + mov ecx,ebx + add ebx,8 + shl eax,cl + or ebp,eax + jmp L_align_long + +L_is_aligned: + mov edi, [esp+60] +; 366 "inffast.S" +; Dispatch on inflate_fast_use_mmx: 2 selects the MMX loop, above 2 the integer loop, anything else probes the CPU first. +L_check_mmx: + cmp dword ptr [inflate_fast_use_mmx],2 + je L_init_mmx + ja L_do_loop + + push eax + push ebx + push ecx + push edx +; Try to toggle EFLAGS bit 21 (0200000h); if the bit does not change, CPUID is skipped and MMX is not used. + pushfd + mov eax, [esp] + xor dword ptr [esp],0200000h + + + + + popfd + pushfd + pop edx + xor edx,eax + jz L_dont_use_mmx +; CPUID leaf 0: require the vendor string 'GenuineIntel' (ebx 'Genu', edx 'ineI', ecx 'ntel'). + xor eax,eax + cpuid + cmp ebx,0756e6547h + jne L_dont_use_mmx + cmp ecx,06c65746eh + jne L_dont_use_mmx + cmp edx,049656e69h + jne L_dont_use_mmx +; CPUID leaf 1: require (eax shr 8) and 15 = 6 (the CPUID family field) and edx bit 23 (0800000h, the MMX feature flag). + mov eax,1 + cpuid + shr eax,8 + and eax,15 + cmp eax,6 + jne L_dont_use_mmx + test edx,0800000h + jnz L_use_mmx + jmp L_dont_use_mmx +L_use_mmx: + mov dword ptr [inflate_fast_use_mmx],2 + jmp L_check_mmx_pop +L_dont_use_mmx: + mov dword ptr [inflate_fast_use_mmx],3 +L_check_mmx_pop: + pop edx + pop ecx + pop ebx + pop eax + jmp L_check_mmx +; 426 "inffast.S" +ALIGN 4 +L_do_loop: +; 437 "inffast.S" +; Integer decode loop: ebp = bit accumulator, bl = bit count, esi = input pointer, edi = output pointer. +; When 15 or fewer bits are held, 16 more are loaded with lodsw before the length table lookup. + cmp bl,15 + ja L_get_length_code + + xor eax,eax + 
lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + +L_get_length_code: + mov edx, [esp+0] + mov ecx, [esp+8] + and edx,ebp + mov eax, [ecx+edx*4] + +L_dolen: + + + + + + + mov cl,ah + sub bl,ah + shr ebp,cl + + + + + + + test al,al + jnz L_test_for_length_base + + shr eax,16 + stosb + +L_while_test: + + + cmp [esp+16],edi + jbe L_break_loop + + cmp [esp+20],esi + ja L_do_loop + jmp L_break_loop + +L_test_for_length_base: +; 502 "inffast.S" +; Table entry in eax: al = operation byte, ah = code bit count (already consumed at L_dolen), upper 16 bits = value. +; al and 16 set: edx is a length base and (al and 15) extra bits are added to it, refilling the accumulator first if needed. + mov edx,eax + shr edx,16 + mov cl,al + + test al,16 + jz L_test_for_second_level_length + and cl,15 + jz L_save_len + cmp bl,cl + jae L_add_bits_to_len + + mov ch,cl + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + mov cl,ch + +L_add_bits_to_len: + mov eax,1 + shl eax,cl + dec eax + sub bl,cl + and eax,ebp + shr ebp,cl + add edx,eax + +L_save_len: + mov [esp+24],edx + + +L_decode_distance: +; 549 "inffast.S" +; Same refill and lookup sequence for the distance code, using the distance mask [esp+4] and table [esp+12]. + cmp bl,15 + ja L_get_distance_code + + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + +L_get_distance_code: + mov edx, [esp+4] + mov ecx, [esp+12] + and edx,ebp + mov eax, [ecx+edx*4] + + +L_dodist: + mov edx,eax + shr edx,16 + mov cl,ah + sub bl,ah + shr ebp,cl +; 584 "inffast.S" +; al and 16 set: edx is the distance base; add (al and 15) extra bits, refilling the accumulator first if needed. + mov cl,al + + test al,16 + jz L_test_for_second_level_dist + and cl,15 + jz L_check_dist_one + cmp bl,cl + jae L_add_bits_to_dist + + mov ch,cl + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + mov cl,ch + +L_add_bits_to_dist: + mov eax,1 + shl eax,cl + dec eax + sub bl,cl + and eax,ebp + shr ebp,cl + add edx,eax + jmp L_check_window + +L_check_window: +; 625 "inffast.S" +; Save the input pointer. If edi minus [esp+40] is at least the distance, copy directly from edi minus distance: +; the first three bytes are moved by hand and the remaining (length - 3) bytes with rep movsb. + mov [esp+44],esi + mov eax,edi + sub eax, [esp+40] + + cmp eax,edx + jb L_clip_window + + mov ecx, [esp+24] + mov esi,edi + sub esi,edx + + sub ecx,3 + mov al, [esi] + mov [edi],al + mov al, [esi+1] + mov dl, [esi+2] + add esi,3 + mov [edi+1],al + mov [edi+2],dl + add edi,3 + rep movsb + + mov esi, [esp+44] + jmp L_while_test + +ALIGN 4 +; Distance code without extra bits: when the distance is 1 and edi differs from [esp+40], +; the previous output byte is replicated (three stores, then rep stosb for length - 3). +L_check_dist_one: + cmp edx,1 + jne L_check_window + cmp 
[esp+40],edi + je L_check_window + + dec edi + mov ecx, [esp+24] + mov al, [edi] + sub ecx,3 + + mov [edi+1],al + mov [edi+2],al + mov [edi+3],al + add edi,4 + rep stosb + + jmp L_while_test + +ALIGN 4 +L_test_for_second_level_length: + + + + + test al,64 + jnz L_test_for_end_of_block + + mov eax,1 + shl eax,cl + dec eax + and eax,ebp + add eax,edx + mov edx, [esp+8] + mov eax, [edx+eax*4] + jmp L_dolen + +ALIGN 4 +L_test_for_second_level_dist: + + + + + test al,64 + jnz L_invalid_distance_code + + mov eax,1 + shl eax,cl + dec eax + and eax,ebp + add eax,edx + mov edx, [esp+12] + mov eax, [edx+eax*4] + jmp L_dodist + +ALIGN 4 +L_clip_window: +; 721 "inffast.S" +; The copy source starts in the sliding window: ecx becomes distance minus (edi minus [esp+40]) bytes to take from it. +; A distance larger than state->wsize ([esp+52]) is rejected as too far back. + mov ecx,eax + mov eax, [esp+52] + neg ecx + mov esi, [esp+56] + + cmp eax,edx + jb L_invalid_distance_too_far + + add ecx,edx + cmp dword ptr [esp+48],0 + jne L_wrap_around_window + + sub eax,ecx + add esi,eax +; 749 "inffast.S" + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + +; NOTE(review): the instructions from here up to L_wrap_around_window follow an unconditional jmp and carry no label, so they look unreachable. + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + +L_wrap_around_window: +; 793 "inffast.S" +; state->write ([esp+48]) is non-zero: when ecx exceeds it the copy takes up to two window segments +; (the end of the window, then its start) before continuing from the output buffer. + mov eax, [esp+48] + cmp ecx,eax + jbe L_contiguous_in_window + + add esi, [esp+52] + add esi,eax + sub esi,ecx + sub ecx,eax + + + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi, [esp+56] + mov ecx, [esp+48] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + +L_contiguous_in_window: +; 836 "inffast.S" + add esi,eax + sub esi,ecx + + + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + +L_do_copy1: +; 862 "inffast.S" +; Copy the remaining eax bytes, then restore the input pointer saved at L_check_window. + mov ecx,eax + rep movsb + + mov esi, [esp+44] + jmp L_while_test +; 878 "inffast.S" +; MMX decode loop setup: mm0 = bit accumulator, ebp = bit count, mm4 (copy in mm3) = length mask, +; mm5 (copy in mm2) = distance mask, mm1 = shift count applied to mm0 by the next psrlq. +; ebx = length table; it is reloaded from [esp+8] after each match because ebx also carries the distance. +ALIGN 4 +L_init_mmx: + emms + + + + + + movd mm0,ebp + mov ebp,ebx +; 896 "inffast.S" + movd mm4,dword ptr [esp+0] + movq mm3,mm4 + movd mm5,dword ptr [esp+4] + movq 
mm2,mm5 + pxor mm1,mm1 + mov ebx, [esp+8] + jmp L_do_loop_mmx + +ALIGN 4 +L_do_loop_mmx: + psrlq mm0,mm1 + + cmp ebp,32 + ja L_get_length_code_mmx + + movd mm6,ebp + movd mm7,dword ptr [esi] + add esi,4 + psllq mm7,mm6 + add ebp,32 + por mm0,mm7 + +L_get_length_code_mmx: + pand mm4,mm0 + movd eax,mm4 + movq mm4,mm3 + mov eax, [ebx+eax*4] + +L_dolen_mmx: + movzx ecx,ah + movd mm1,ecx + sub ebp,ecx + + test al,al + jnz L_test_for_length_base_mmx + + shr eax,16 + stosb + +L_while_test_mmx: + + + cmp [esp+16],edi + jbe L_break_loop + + cmp [esp+20],esi + ja L_do_loop_mmx + jmp L_break_loop + +L_test_for_length_base_mmx: + + mov edx,eax + shr edx,16 + + test al,16 + jz L_test_for_second_level_length_mmx + and eax,15 + jz L_decode_distance_mmx + + psrlq mm0,mm1 + movd mm1,eax + movd ecx,mm0 + sub ebp,eax + and ecx, [inflate_fast_mask+eax*4] + add edx,ecx + +L_decode_distance_mmx: + psrlq mm0,mm1 + + cmp ebp,32 + ja L_get_dist_code_mmx + + movd mm6,ebp + movd mm7,dword ptr [esi] + add esi,4 + psllq mm7,mm6 + add ebp,32 + por mm0,mm7 + +L_get_dist_code_mmx: + mov ebx, [esp+12] + pand mm5,mm0 + movd eax,mm5 + movq mm5,mm2 + mov eax, [ebx+eax*4] + +L_dodist_mmx: + + movzx ecx,ah + mov ebx,eax + shr ebx,16 + sub ebp,ecx + movd mm1,ecx + + test al,16 + jz L_test_for_second_level_dist_mmx + and eax,15 + jz L_check_dist_one_mmx + +L_add_bits_to_dist_mmx: + psrlq mm0,mm1 + movd mm1,eax + movd ecx,mm0 + sub ebp,eax + and ecx, [inflate_fast_mask+eax*4] + add ebx,ecx + +L_check_window_mmx: + mov [esp+44],esi + mov eax,edi + sub eax, [esp+40] + + cmp eax,ebx + jb L_clip_window_mmx + + mov ecx,edx + mov esi,edi + sub esi,ebx + + sub ecx,3 + mov al, [esi] + mov [edi],al + mov al, [esi+1] + mov dl, [esi+2] + add esi,3 + mov [edi+1],al + mov [edi+2],dl + add edi,3 + rep movsb + + mov esi, [esp+44] + mov ebx, [esp+8] + jmp L_while_test_mmx + +ALIGN 4 +L_check_dist_one_mmx: + cmp ebx,1 + jne L_check_window_mmx + cmp [esp+40],edi + je L_check_window_mmx + + dec edi + mov ecx,edx + mov al, 
[edi] + sub ecx,3 + + mov [edi+1],al + mov [edi+2],al + mov [edi+3],al + add edi,4 + rep stosb + + mov ebx, [esp+8] + jmp L_while_test_mmx + +ALIGN 4 +L_test_for_second_level_length_mmx: + test al,64 + jnz L_test_for_end_of_block + + and eax,15 + psrlq mm0,mm1 + movd ecx,mm0 + and ecx, [inflate_fast_mask+eax*4] + add ecx,edx + mov eax, [ebx+ecx*4] + jmp L_dolen_mmx + +ALIGN 4 +L_test_for_second_level_dist_mmx: + test al,64 + jnz L_invalid_distance_code + + and eax,15 + psrlq mm0,mm1 + movd ecx,mm0 + and ecx, [inflate_fast_mask+eax*4] + mov eax, [esp+12] + add ecx,ebx + mov eax, [eax+ecx*4] + jmp L_dodist_mmx + +ALIGN 4 +L_clip_window_mmx: + + mov ecx,eax + mov eax, [esp+52] + neg ecx + mov esi, [esp+56] + + cmp eax,ebx + jb L_invalid_distance_too_far + + add ecx,ebx + cmp dword ptr [esp+48],0 + jne L_wrap_around_window_mmx + + sub eax,ecx + add esi,eax + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + +L_wrap_around_window_mmx: + + mov eax, [esp+48] + cmp ecx,eax + jbe L_contiguous_in_window_mmx + + add esi, [esp+52] + add esi,eax + sub esi,ecx + sub ecx,eax + + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi, [esp+56] + mov ecx, [esp+48] + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + +L_contiguous_in_window_mmx: + + add esi,eax + sub esi,ecx + + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + +L_do_copy1_mmx: + + + mov ecx,edx + rep movsb + + mov esi, [esp+44] + mov ebx, [esp+8] + jmp L_while_test_mmx +; 1174 "inffast.S" +; Exit paths shared by both loops: ecx = error message (0 for none), edx = value to store in state->mode. +; L_update_stream_state stores ecx to [strm+24] when it is non-zero and edx to state->mode, then joins L_break_loop. +L_invalid_distance_code: + + + + + + mov ecx, invalid_distance_code_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +; al and 32 set: end of block (INFLATE_MODE_TYPE, no message); otherwise the literal/length code is invalid. +L_test_for_end_of_block: + + + + + + test al,32 + jz L_invalid_literal_length_code + + mov ecx,0 + mov 
edx,INFLATE_MODE_TYPE + jmp L_update_stream_state + +L_invalid_literal_length_code: + + + + + + mov ecx, invalid_literal_length_code_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_invalid_distance_too_far: + + + + mov esi, [esp+44] + mov ecx, invalid_distance_too_far_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_update_stream_state: + + mov eax, [esp+88] + test ecx,ecx + jz L_skip_msg + mov [eax+24],ecx +L_skip_msg: + mov eax, [eax+28] + mov [eax+mode_state],edx + jmp L_break_loop + +ALIGN 4 +L_break_loop: +; 1243 "inffast.S" +; The MMX loop keeps the bit count in ebp; move it to ebx so the common epilogue below can use it. + cmp dword ptr [inflate_fast_use_mmx],2 + jne L_update_next_in + + + + mov ebx,ebp + +L_update_next_in: +; 1266 "inffast.S" +; Give back whole unused bytes: esi is reduced by (bits shr 3) and the bit count by the same number of bits. +; Then store the output pointer to [strm+12] and the bit count to state->bits. + mov eax, [esp+88] + mov ecx,ebx + mov edx, [eax+28] + shr ecx,3 + sub esi,ecx + shl ecx,3 + sub ebx,ecx + mov [eax+12],edi + mov [edx+bits_state],ebx + mov ecx,ebx + +; If the input limit points at the scratch buffer [esp+28], that buffer was in use: translate esi and the limit back to the caller's input. + lea ebx, [esp+28] + cmp [esp+20],ebx + jne L_buf_not_used + + sub esi,ebx + mov ebx, [eax+0] + mov [esp+20],ebx + add esi,ebx + mov ebx, [eax+4] + sub ebx,11 + add [esp+20],ebx + +L_buf_not_used: + mov [eax+0],esi + +; ebx = (1 shl bits) - 1, used at L_update_hold to keep only the valid low bits of the accumulator. + mov ebx,1 + shl ebx,cl + dec ebx + + + + + + cmp dword ptr [inflate_fast_use_mmx],2 + jne L_update_hold + + + + psrlq mm0,mm1 + movd ebp,mm0 + + emms + +L_update_hold: + + + + and ebp,ebx + mov [edx+hold_state],ebp + + + + +; [strm+4] = input limit minus esi plus 11 (both branches compute the same value). + mov ebx, [esp+20] + cmp ebx,esi + jbe L_last_is_smaller + + sub ebx,esi + add ebx,11 + mov [eax+4],ebx + jmp L_fixup_out +L_last_is_smaller: + sub esi,ebx + neg esi + add esi,11 + mov [eax+4],esi + + + + +; [strm+16] = output limit minus edi plus 257 (both branches compute the same value). +L_fixup_out: + + mov ebx, [esp+16] + cmp ebx,edi + jbe L_end_is_smaller + + sub ebx,edi + add ebx,257 + mov [eax+16],ebx + jmp L_done +L_end_is_smaller: + sub edi,ebx + neg edi + add edi,257 + mov [eax+16],edi + + + + + +; Release the 64 bytes of locals, then restore EFLAGS and the saved registers. +L_done: + add esp,64 + popfd + pop ebx + pop ebp + pop esi + pop edi + ret +_inflate_fast endp + +_TEXT ends +end diff --git a/zlib/contrib/masmx86/inffas32.lst b/zlib/contrib/masmx86/inffas32.lst new file mode 100644 index 0000000..025627c --- /dev/null +++ 
b/zlib/contrib/masmx86/inffas32.lst @@ -0,0 +1,1224 @@ +Microsoft (R) Macro Assembler Version 14.16.27031.1 09/14/19 11:35:23 +inffas32.asm Page 1 - 1 + + + ;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding + ; * + ; * inffas32.asm is derivated from inffas86.c, with translation of assembly code + ; * + ; * Copyright (C) 1995-2003 Mark Adler + ; * For conditions of distribution and use, see copyright notice in zlib.h + ; * + ; * Copyright (C) 2003 Chris Anderson + ; * Please use the copyright conditions above. + ; * + ; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from + ; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at + ; * the moment. I have successfully compiled and tested this code with gcc2.96, + ; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S + ; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX + ; * enabled. I will attempt to merge the MMX code into this version. 
Newer + ; * versions of this and inffast.S can be found at + ; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ + ; * + ; * 2005 : modification by Gilles Vollant + ; */ + ; For Visual C++ 4.x and higher and ML 6.x and higher + ; ml.exe is in directory \MASM611C of Win95 DDK + ; ml.exe is also distributed in http://www.masm32.com/masmdl.htm + ; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/ + ; + ; + ; compile with command line option + ; ml /coff /Zi /c /Flinffas32.lst inffas32.asm + + ; if you define NO_GZIP (see inflate.h), compile with + ; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm + + + ; zlib122sup is 0 fort zlib 1.2.2.1 and lower + ; zlib122sup is 8 fort zlib 1.2.2.2 and more (with addition of dmax and head + ; in inflate_state in inflate.h) + = 00000008 zlib1222sup equ 8 + + + IFDEF GUNZIP + ELSE + IFNDEF NO_GUNZIP + = 0000000B INFLATE_MODE_TYPE equ 11 + = 0000001A INFLATE_MODE_BAD equ 26 + ELSE + ENDIF + ENDIF + + + ; 75 "inffast.S" + ;FILE "inffast.S" + + ;;;GLOBAL _inflate_fast + + ;;;SECTION .text + + + + .586p + .mmx + + name inflate_fast_x86 + .MODEL FLAT + + 00000000 _DATA segment + 00000000 inflate_fast_use_mmx: + 00000000 00000001 dd 1 + + + 00000000 _TEXT segment + + + + ALIGN 4 + 00000000 46 61 73 74 20 db 'Fast decoding Code from Chris Anderson' + 64 65 63 6F 64 + 69 6E 67 20 43 + 6F 64 65 20 66 + 72 6F 6D 20 43 + 68 72 69 73 20 + 41 6E 64 65 72 + 73 6F 6E + 00000026 00 db 0 + + ALIGN 4 + 00000028 invalid_literal_length_code_msg: + 00000028 69 6E 76 61 6C db 'invalid literal/length code' + 69 64 20 6C 69 + 74 65 72 61 6C + 2F 6C 65 6E 67 + 74 68 20 63 6F + 64 65 + 00000043 00 db 0 + + ALIGN 4 + 00000044 invalid_distance_code_msg: + 00000044 69 6E 76 61 6C db 'invalid distance code' + 69 64 20 64 69 + 73 74 61 6E 63 + 65 20 63 6F 64 + 65 + 00000059 00 db 0 + + ALIGN 4 + 0000005C invalid_distance_too_far_msg: + 0000005C 69 6E 76 61 6C db 'invalid distance too far back' + 69 64 20 
64 69 + 73 74 61 6E 63 + 65 20 74 6F 6F + 20 66 61 72 20 + 62 61 63 6B + 00000079 00 db 0 + + + ALIGN 4 + 0000007C inflate_fast_mask: + 0000007C 00000000 dd 0 + 00000080 00000001 dd 1 + 00000084 00000003 dd 3 + 00000088 00000007 dd 7 + 0000008C 0000000F dd 15 + 00000090 0000001F dd 31 + 00000094 0000003F dd 63 + 00000098 0000007F dd 127 + 0000009C 000000FF dd 255 + 000000A0 000001FF dd 511 + 000000A4 000003FF dd 1023 + 000000A8 000007FF dd 2047 + 000000AC 00000FFF dd 4095 + 000000B0 00001FFF dd 8191 + 000000B4 00003FFF dd 16383 + 000000B8 00007FFF dd 32767 + 000000BC 0000FFFF dd 65535 + 000000C0 0001FFFF dd 131071 + 000000C4 0003FFFF dd 262143 + 000000C8 0007FFFF dd 524287 + 000000CC 000FFFFF dd 1048575 + 000000D0 001FFFFF dd 2097151 + 000000D4 003FFFFF dd 4194303 + 000000D8 007FFFFF dd 8388607 + 000000DC 00FFFFFF dd 16777215 + 000000E0 01FFFFFF dd 33554431 + 000000E4 03FFFFFF dd 67108863 + 000000E8 07FFFFFF dd 134217727 + 000000EC 0FFFFFFF dd 268435455 + 000000F0 1FFFFFFF dd 536870911 + 000000F4 3FFFFFFF dd 1073741823 + 000000F8 7FFFFFFF dd 2147483647 + 000000FC FFFFFFFF dd 4294967295 + + + = 00000000 mode_state equ 0 ;/* state->mode */ + = 00000028 wsize_state equ (32+zlib1222sup) ;/* state->wsize */ + = 00000030 write_state equ (36+4+zlib1222sup) ;/* state->write */ + = 00000034 window_state equ (40+4+zlib1222sup) ;/* state->window */ + = 00000038 hold_state equ (44+4+zlib1222sup) ;/* state->hold */ + = 0000003C bits_state equ (48+4+zlib1222sup) ;/* state->bits */ + = 0000004C lencode_state equ (64+4+zlib1222sup) ;/* state->lencode */ + = 00000050 distcode_state equ (68+4+zlib1222sup) ;/* state->distcode */ + = 00000054 lenbits_state equ (72+4+zlib1222sup) ;/* state->lenbits */ + = 00000058 distbits_state equ (76+4+zlib1222sup) ;/* state->distbits */ + + + ;;SECTION .text + ; 205 "inffast.S" + ;GLOBAL inflate_fast_use_mmx + + ;SECTION .data + + + ; GLOBAL inflate_fast_use_mmx:object + ;.size inflate_fast_use_mmx, 4 + ; 226 "inffast.S" + ;SECTION .text + + ALIGN 
4 + 00000100 _inflate_fast proc near + 00000100 .FPO (16, 4, 0, 0, 1, 0) + 00000100 57 push edi + 00000101 56 push esi + 00000102 55 push ebp + 00000103 53 push ebx + 00000104 9C pushfd + 00000105 83 EC 40 sub esp,64 + 00000108 FC cld + + + + + 00000109 8B 74 24 58 mov esi, [esp+88] + 0000010D 8B 7E 1C mov edi, [esi+28] + + + + + + + + 00000110 8B 56 04 mov edx, [esi+4] + 00000113 8B 06 mov eax, [esi+0] + + 00000115 03 D0 add edx,eax + 00000117 83 EA 0B sub edx,11 + + 0000011A 89 44 24 2C mov [esp+44],eax + 0000011E 89 54 24 14 mov [esp+20],edx + + 00000122 8B 6C 24 5C mov ebp, [esp+92] + 00000126 8B 4E 10 mov ecx, [esi+16] + 00000129 8B 5E 0C mov ebx, [esi+12] + + 0000012C 2B E9 sub ebp,ecx + 0000012E F7 DD neg ebp + 00000130 03 EB add ebp,ebx + + 00000132 81 E9 00000101 sub ecx,257 + 00000138 03 CB add ecx,ebx + + 0000013A 89 5C 24 3C mov [esp+60],ebx + 0000013E 89 6C 24 28 mov [esp+40],ebp + 00000142 89 4C 24 10 mov [esp+16],ecx + ; 285 "inffast.S" + 00000146 8B 47 4C mov eax, [edi+lencode_state] + 00000149 8B 4F 50 mov ecx, [edi+distcode_state] + + 0000014C 89 44 24 08 mov [esp+8],eax + 00000150 89 4C 24 0C mov [esp+12],ecx + + 00000154 B8 00000001 mov eax,1 + 00000159 8B 4F 54 mov ecx, [edi+lenbits_state] + 0000015C D3 E0 shl eax,cl + 0000015E 48 dec eax + 0000015F 89 04 24 mov [esp+0],eax + + 00000162 B8 00000001 mov eax,1 + 00000167 8B 4F 58 mov ecx, [edi+distbits_state] + 0000016A D3 E0 shl eax,cl + 0000016C 48 dec eax + 0000016D 89 44 24 04 mov [esp+4],eax + + 00000171 8B 47 28 mov eax, [edi+wsize_state] + 00000174 8B 4F 30 mov ecx, [edi+write_state] + 00000177 8B 57 34 mov edx, [edi+window_state] + + 0000017A 89 44 24 34 mov [esp+52],eax + 0000017E 89 4C 24 30 mov [esp+48],ecx + 00000182 89 54 24 38 mov [esp+56],edx + + 00000186 8B 6F 38 mov ebp, [edi+hold_state] + 00000189 8B 5F 3C mov ebx, [edi+bits_state] + ; 321 "inffast.S" + 0000018C 8B 74 24 2C mov esi, [esp+44] + 00000190 8B 4C 24 14 mov ecx, [esp+20] + 00000194 3B CE cmp ecx,esi + 00000196 77 22 
ja L_align_long + + 00000198 83 C1 0B add ecx,11 + 0000019B 2B CE sub ecx,esi + 0000019D B8 0000000C mov eax,12 + 000001A2 2B C1 sub eax,ecx + 000001A4 8D 7C 24 1C lea edi, [esp+28] + 000001A8 F3/ A4 rep movsb + 000001AA 8B C8 mov ecx,eax + 000001AC 33 C0 xor eax,eax + 000001AE F3/ AA rep stosb + 000001B0 8D 74 24 1C lea esi, [esp+28] + 000001B4 89 74 24 14 mov [esp+20],esi + 000001B8 EB 18 jmp L_is_aligned + + + 000001BA L_align_long: + 000001BA F7 C6 00000003 test esi,3 + 000001C0 74 10 jz L_is_aligned + 000001C2 33 C0 xor eax,eax + 000001C4 8A 06 mov al, [esi] + 000001C6 46 inc esi + 000001C7 8B CB mov ecx,ebx + 000001C9 83 C3 08 add ebx,8 + 000001CC D3 E0 shl eax,cl + 000001CE 0B E8 or ebp,eax + 000001D0 EB E8 jmp L_align_long + + 000001D2 L_is_aligned: + 000001D2 8B 7C 24 3C mov edi, [esp+60] + ; 366 "inffast.S" + 000001D6 L_check_mmx: + 000001D6 83 3D 00000000 R cmp dword ptr [inflate_fast_use_mmx],2 + 02 + 000001DD 0F 84 00000289 je L_init_mmx + 000001E3 77 6B ja L_do_loop + + 000001E5 50 push eax + 000001E6 53 push ebx + 000001E7 51 push ecx + 000001E8 52 push edx + 000001E9 9C pushfd + 000001EA 8B 04 24 mov eax, [esp] + 000001ED 81 34 24 xor dword ptr [esp],0200000h + 00200000 + + + + + 000001F4 9D popfd + 000001F5 9C pushfd + 000001F6 5A pop edx + 000001F7 33 D0 xor edx,eax + 000001F9 74 44 jz L_dont_use_mmx + 000001FB 33 C0 xor eax,eax + 000001FD 0F A2 cpuid + 000001FF 81 FB 756E6547 cmp ebx,0756e6547h + 00000205 75 38 jne L_dont_use_mmx + 00000207 81 F9 6C65746E cmp ecx,06c65746eh + 0000020D 75 30 jne L_dont_use_mmx + 0000020F 81 FA 49656E69 cmp edx,049656e69h + 00000215 75 28 jne L_dont_use_mmx + 00000217 B8 00000001 mov eax,1 + 0000021C 0F A2 cpuid + 0000021E C1 E8 08 shr eax,8 + 00000221 83 E0 0F and eax,15 + 00000224 83 F8 06 cmp eax,6 + 00000227 75 16 jne L_dont_use_mmx + 00000229 F7 C2 00800000 test edx,0800000h + 0000022F 75 02 jnz L_use_mmx + 00000231 EB 0C jmp L_dont_use_mmx + 00000233 L_use_mmx: + 00000233 C7 05 00000000 R mov dword ptr 
[inflate_fast_use_mmx],2 + 00000002 + 0000023D EB 0A jmp L_check_mmx_pop + 0000023F L_dont_use_mmx: + 0000023F C7 05 00000000 R mov dword ptr [inflate_fast_use_mmx],3 + 00000003 + 00000249 L_check_mmx_pop: + 00000249 5A pop edx + 0000024A 59 pop ecx + 0000024B 5B pop ebx + 0000024C 58 pop eax + 0000024D EB 87 jmp L_check_mmx + ; 426 "inffast.S" + ALIGN 4 + 00000250 L_do_loop: + ; 437 "inffast.S" + 00000250 80 FB 0F cmp bl,15 + 00000253 77 0D ja L_get_length_code + + 00000255 33 C0 xor eax,eax + 00000257 66| AD lodsw + 00000259 8A CB mov cl,bl + 0000025B 80 C3 10 add bl,16 + 0000025E D3 E0 shl eax,cl + 00000260 0B E8 or ebp,eax + + 00000262 L_get_length_code: + 00000262 8B 14 24 mov edx, [esp+0] + 00000265 8B 4C 24 08 mov ecx, [esp+8] + 00000269 23 D5 and edx,ebp + 0000026B 8B 04 91 mov eax, [ecx+edx*4] + + 0000026E L_dolen: + + + + + + + 0000026E 8A CC mov cl,ah + 00000270 2A DC sub bl,ah + 00000272 D3 ED shr ebp,cl + + + + + + + 00000274 84 C0 test al,al + 00000276 75 19 jnz L_test_for_length_base + + 00000278 C1 E8 10 shr eax,16 + 0000027B AA stosb + + 0000027C L_while_test: + + + 0000027C 39 7C 24 10 cmp [esp+16],edi + 00000280 0F 86 00000462 jbe L_break_loop + + 00000286 39 74 24 14 cmp [esp+20],esi + 0000028A 77 C4 ja L_do_loop + 0000028C E9 00000457 jmp L_break_loop + + 00000291 L_test_for_length_base: + ; 502 "inffast.S" + 00000291 8B D0 mov edx,eax + 00000293 C1 EA 10 shr edx,16 + 00000296 8A C8 mov cl,al + + 00000298 A8 10 test al,16 + 0000029A 0F 84 000000F4 jz L_test_for_second_level_length + 000002A0 80 E1 0F and cl,15 + 000002A3 74 25 jz L_save_len + 000002A5 3A D9 cmp bl,cl + 000002A7 73 11 jae L_add_bits_to_len + + 000002A9 8A E9 mov ch,cl + 000002AB 33 C0 xor eax,eax + 000002AD 66| AD lodsw + 000002AF 8A CB mov cl,bl + 000002B1 80 C3 10 add bl,16 + 000002B4 D3 E0 shl eax,cl + 000002B6 0B E8 or ebp,eax + 000002B8 8A CD mov cl,ch + + 000002BA L_add_bits_to_len: + 000002BA B8 00000001 mov eax,1 + 000002BF D3 E0 shl eax,cl + 000002C1 48 dec eax + 
000002C2 2A D9 sub bl,cl + 000002C4 23 C5 and eax,ebp + 000002C6 D3 ED shr ebp,cl + 000002C8 03 D0 add edx,eax + + 000002CA L_save_len: + 000002CA 89 54 24 18 mov [esp+24],edx + + + 000002CE L_decode_distance: + ; 549 "inffast.S" + 000002CE 80 FB 0F cmp bl,15 + 000002D1 77 0D ja L_get_distance_code + + 000002D3 33 C0 xor eax,eax + 000002D5 66| AD lodsw + 000002D7 8A CB mov cl,bl + 000002D9 80 C3 10 add bl,16 + 000002DC D3 E0 shl eax,cl + 000002DE 0B E8 or ebp,eax + + 000002E0 L_get_distance_code: + 000002E0 8B 54 24 04 mov edx, [esp+4] + 000002E4 8B 4C 24 0C mov ecx, [esp+12] + 000002E8 23 D5 and edx,ebp + 000002EA 8B 04 91 mov eax, [ecx+edx*4] + + + 000002ED L_dodist: + 000002ED 8B D0 mov edx,eax + 000002EF C1 EA 10 shr edx,16 + 000002F2 8A CC mov cl,ah + 000002F4 2A DC sub bl,ah + 000002F6 D3 ED shr ebp,cl + ; 584 "inffast.S" + 000002F8 8A C8 mov cl,al + + 000002FA A8 10 test al,16 + 000002FC 0F 84 000000B2 jz L_test_for_second_level_dist + 00000302 80 E1 0F and cl,15 + 00000305 74 65 jz L_check_dist_one + 00000307 3A D9 cmp bl,cl + 00000309 73 11 jae L_add_bits_to_dist + + 0000030B 8A E9 mov ch,cl + 0000030D 33 C0 xor eax,eax + 0000030F 66| AD lodsw + 00000311 8A CB mov cl,bl + 00000313 80 C3 10 add bl,16 + 00000316 D3 E0 shl eax,cl + 00000318 0B E8 or ebp,eax + 0000031A 8A CD mov cl,ch + + 0000031C L_add_bits_to_dist: + 0000031C B8 00000001 mov eax,1 + 00000321 D3 E0 shl eax,cl + 00000323 48 dec eax + 00000324 2A D9 sub bl,cl + 00000326 23 C5 and eax,ebp + 00000328 D3 ED shr ebp,cl + 0000032A 03 D0 add edx,eax + 0000032C EB 00 jmp L_check_window + + 0000032E L_check_window: + ; 625 "inffast.S" + 0000032E 89 74 24 2C mov [esp+44],esi + 00000332 8B C7 mov eax,edi + 00000334 2B 44 24 28 sub eax, [esp+40] + + 00000338 3B C2 cmp eax,edx + 0000033A 0F 82 00000094 jb L_clip_window + + 00000340 8B 4C 24 18 mov ecx, [esp+24] + 00000344 8B F7 mov esi,edi + 00000346 2B F2 sub esi,edx + + 00000348 83 E9 03 sub ecx,3 + 0000034B 8A 06 mov al, [esi] + 0000034D 88 07 mov 
[edi],al + 0000034F 8A 46 01 mov al, [esi+1] + 00000352 8A 56 02 mov dl, [esi+2] + 00000355 83 C6 03 add esi,3 + 00000358 88 47 01 mov [edi+1],al + 0000035B 88 57 02 mov [edi+2],dl + 0000035E 83 C7 03 add edi,3 + 00000361 F3/ A4 rep movsb + + 00000363 8B 74 24 2C mov esi, [esp+44] + 00000367 E9 FFFFFF10 jmp L_while_test + + ALIGN 4 + 0000036C L_check_dist_one: + 0000036C 83 FA 01 cmp edx,1 + 0000036F 75 BD jne L_check_window + 00000371 39 7C 24 28 cmp [esp+40],edi + 00000375 74 B7 je L_check_window + + 00000377 4F dec edi + 00000378 8B 4C 24 18 mov ecx, [esp+24] + 0000037C 8A 07 mov al, [edi] + 0000037E 83 E9 03 sub ecx,3 + + 00000381 88 47 01 mov [edi+1],al + 00000384 88 47 02 mov [edi+2],al + 00000387 88 47 03 mov [edi+3],al + 0000038A 83 C7 04 add edi,4 + 0000038D F3/ AA rep stosb + + 0000038F E9 FFFFFEE8 jmp L_while_test + + ALIGN 4 + 00000394 L_test_for_second_level_length: + + + + + 00000394 A8 40 test al,64 + 00000396 0F 85 0000030E jnz L_test_for_end_of_block + + 0000039C B8 00000001 mov eax,1 + 000003A1 D3 E0 shl eax,cl + 000003A3 48 dec eax + 000003A4 23 C5 and eax,ebp + 000003A6 03 C2 add eax,edx + 000003A8 8B 54 24 08 mov edx, [esp+8] + 000003AC 8B 04 82 mov eax, [edx+eax*4] + 000003AF E9 FFFFFEBA jmp L_dolen + + ALIGN 4 + 000003B4 L_test_for_second_level_dist: + + + + + 000003B4 A8 40 test al,64 + 000003B6 0F 85 000002E2 jnz L_invalid_distance_code + + 000003BC B8 00000001 mov eax,1 + 000003C1 D3 E0 shl eax,cl + 000003C3 48 dec eax + 000003C4 23 C5 and eax,ebp + 000003C6 03 C2 add eax,edx + 000003C8 8B 54 24 0C mov edx, [esp+12] + 000003CC 8B 04 82 mov eax, [edx+eax*4] + 000003CF E9 FFFFFF19 jmp L_dodist + + ALIGN 4 + 000003D4 L_clip_window: + ; 721 "inffast.S" + 000003D4 8B C8 mov ecx,eax + 000003D6 8B 44 24 34 mov eax, [esp+52] + 000003DA F7 D9 neg ecx + 000003DC 8B 74 24 38 mov esi, [esp+56] + + 000003E0 3B C2 cmp eax,edx + 000003E2 0F 82 000002DE jb L_invalid_distance_too_far + + 000003E8 03 CA add ecx,edx + 000003EA 83 7C 24 30 00 cmp dword ptr 
[esp+48],0 + 000003EF 75 24 jne L_wrap_around_window + + 000003F1 2B C1 sub eax,ecx + 000003F3 03 F0 add esi,eax + ; 749 "inffast.S" + 000003F5 8B 44 24 18 mov eax, [esp+24] + 000003F9 3B C1 cmp eax,ecx + 000003FB 76 60 jbe L_do_copy1 + + 000003FD 2B C1 sub eax,ecx + 000003FF F3/ A4 rep movsb + 00000401 8B F7 mov esi,edi + 00000403 2B F2 sub esi,edx + 00000405 EB 56 jmp L_do_copy1 + + 00000407 3B C1 cmp eax,ecx + 00000409 76 52 jbe L_do_copy1 + + 0000040B 2B C1 sub eax,ecx + 0000040D F3/ A4 rep movsb + 0000040F 8B F7 mov esi,edi + 00000411 2B F2 sub esi,edx + 00000413 EB 48 jmp L_do_copy1 + + 00000415 L_wrap_around_window: + ; 793 "inffast.S" + 00000415 8B 44 24 30 mov eax, [esp+48] + 00000419 3B C8 cmp ecx,eax + 0000041B 76 2C jbe L_contiguous_in_window + + 0000041D 03 74 24 34 add esi, [esp+52] + 00000421 03 F0 add esi,eax + 00000423 2B F1 sub esi,ecx + 00000425 2B C8 sub ecx,eax + + + 00000427 8B 44 24 18 mov eax, [esp+24] + 0000042B 3B C1 cmp eax,ecx + 0000042D 76 2E jbe L_do_copy1 + + 0000042F 2B C1 sub eax,ecx + 00000431 F3/ A4 rep movsb + 00000433 8B 74 24 38 mov esi, [esp+56] + 00000437 8B 4C 24 30 mov ecx, [esp+48] + 0000043B 3B C1 cmp eax,ecx + 0000043D 76 1E jbe L_do_copy1 + + 0000043F 2B C1 sub eax,ecx + 00000441 F3/ A4 rep movsb + 00000443 8B F7 mov esi,edi + 00000445 2B F2 sub esi,edx + 00000447 EB 14 jmp L_do_copy1 + + 00000449 L_contiguous_in_window: + ; 836 "inffast.S" + 00000449 03 F0 add esi,eax + 0000044B 2B F1 sub esi,ecx + + + 0000044D 8B 44 24 18 mov eax, [esp+24] + 00000451 3B C1 cmp eax,ecx + 00000453 76 08 jbe L_do_copy1 + + 00000455 2B C1 sub eax,ecx + 00000457 F3/ A4 rep movsb + 00000459 8B F7 mov esi,edi + 0000045B 2B F2 sub esi,edx + + 0000045D L_do_copy1: + ; 862 "inffast.S" + 0000045D 8B C8 mov ecx,eax + 0000045F F3/ A4 rep movsb + + 00000461 8B 74 24 2C mov esi, [esp+44] + 00000465 E9 FFFFFE12 jmp L_while_test + ; 878 "inffast.S" + ALIGN 4 + 0000046C L_init_mmx: + 0000046C 0F 77 emms + + + + + + 0000046E 0F 6E C5 movd mm0,ebp + 
00000471 8B EB mov ebp,ebx + ; 896 "inffast.S" + 00000473 0F 6E 24 24 movd mm4,dword ptr [esp+0] + 00000477 0F 7F E3 movq mm3,mm4 + 0000047A 0F 6E 6C 24 04 movd mm5,dword ptr [esp+4] + 0000047F 0F 7F EA movq mm2,mm5 + 00000482 0F EF C9 pxor mm1,mm1 + 00000485 8B 5C 24 08 mov ebx, [esp+8] + 00000489 EB 01 jmp L_do_loop_mmx + + ALIGN 4 + 0000048C L_do_loop_mmx: + 0000048C 0F D3 C1 psrlq mm0,mm1 + + 0000048F 83 FD 20 cmp ebp,32 + 00000492 77 12 ja L_get_length_code_mmx + + 00000494 0F 6E F5 movd mm6,ebp + 00000497 0F 6E 3E movd mm7,dword ptr [esi] + 0000049A 83 C6 04 add esi,4 + 0000049D 0F F3 FE psllq mm7,mm6 + 000004A0 83 C5 20 add ebp,32 + 000004A3 0F EB C7 por mm0,mm7 + + 000004A6 L_get_length_code_mmx: + 000004A6 0F DB E0 pand mm4,mm0 + 000004A9 0F 7E E0 movd eax,mm4 + 000004AC 0F 7F DC movq mm4,mm3 + 000004AF 8B 04 83 mov eax, [ebx+eax*4] + + 000004B2 L_dolen_mmx: + 000004B2 0F B6 CC movzx ecx,ah + 000004B5 0F 6E C9 movd mm1,ecx + 000004B8 2B E9 sub ebp,ecx + + 000004BA 84 C0 test al,al + 000004BC 75 19 jnz L_test_for_length_base_mmx + + 000004BE C1 E8 10 shr eax,16 + 000004C1 AA stosb + + 000004C2 L_while_test_mmx: + + + 000004C2 39 7C 24 10 cmp [esp+16],edi + 000004C6 0F 86 0000021C jbe L_break_loop + + 000004CC 39 74 24 14 cmp [esp+20],esi + 000004D0 77 BA ja L_do_loop_mmx + 000004D2 E9 00000211 jmp L_break_loop + + 000004D7 L_test_for_length_base_mmx: + + 000004D7 8B D0 mov edx,eax + 000004D9 C1 EA 10 shr edx,16 + + 000004DC A8 10 test al,16 + 000004DE 0F 84 000000E0 jz L_test_for_second_level_length_mmx + 000004E4 83 E0 0F and eax,15 + 000004E7 74 14 jz L_decode_distance_mmx + + 000004E9 0F D3 C1 psrlq mm0,mm1 + 000004EC 0F 6E C8 movd mm1,eax + 000004EF 0F 7E C1 movd ecx,mm0 + 000004F2 2B E8 sub ebp,eax + 000004F4 23 0C 85 and ecx, [inflate_fast_mask+eax*4] + 0000007C R + 000004FB 03 D1 add edx,ecx + + 000004FD L_decode_distance_mmx: + 000004FD 0F D3 C1 psrlq mm0,mm1 + + 00000500 83 FD 20 cmp ebp,32 + 00000503 77 12 ja L_get_dist_code_mmx + + 00000505 0F 6E 
F5 movd mm6,ebp + 00000508 0F 6E 3E movd mm7,dword ptr [esi] + 0000050B 83 C6 04 add esi,4 + 0000050E 0F F3 FE psllq mm7,mm6 + 00000511 83 C5 20 add ebp,32 + 00000514 0F EB C7 por mm0,mm7 + + 00000517 L_get_dist_code_mmx: + 00000517 8B 5C 24 0C mov ebx, [esp+12] + 0000051B 0F DB E8 pand mm5,mm0 + 0000051E 0F 7E E8 movd eax,mm5 + 00000521 0F 7F D5 movq mm5,mm2 + 00000524 8B 04 83 mov eax, [ebx+eax*4] + + 00000527 L_dodist_mmx: + + 00000527 0F B6 CC movzx ecx,ah + 0000052A 8B D8 mov ebx,eax + 0000052C C1 EB 10 shr ebx,16 + 0000052F 2B E9 sub ebp,ecx + 00000531 0F 6E C9 movd mm1,ecx + + 00000534 A8 10 test al,16 + 00000536 0F 84 000000AC jz L_test_for_second_level_dist_mmx + 0000053C 83 E0 0F and eax,15 + 0000053F 74 57 jz L_check_dist_one_mmx + + 00000541 L_add_bits_to_dist_mmx: + 00000541 0F D3 C1 psrlq mm0,mm1 + 00000544 0F 6E C8 movd mm1,eax + 00000547 0F 7E C1 movd ecx,mm0 + 0000054A 2B E8 sub ebp,eax + 0000054C 23 0C 85 and ecx, [inflate_fast_mask+eax*4] + 0000007C R + 00000553 03 D9 add ebx,ecx + + 00000555 L_check_window_mmx: + 00000555 89 74 24 2C mov [esp+44],esi + 00000559 8B C7 mov eax,edi + 0000055B 2B 44 24 28 sub eax, [esp+40] + + 0000055F 3B C3 cmp eax,ebx + 00000561 0F 82 000000A9 jb L_clip_window_mmx + + 00000567 8B CA mov ecx,edx + 00000569 8B F7 mov esi,edi + 0000056B 2B F3 sub esi,ebx + + 0000056D 83 E9 03 sub ecx,3 + 00000570 8A 06 mov al, [esi] + 00000572 88 07 mov [edi],al + 00000574 8A 46 01 mov al, [esi+1] + 00000577 8A 56 02 mov dl, [esi+2] + 0000057A 83 C6 03 add esi,3 + 0000057D 88 47 01 mov [edi+1],al + 00000580 88 57 02 mov [edi+2],dl + 00000583 83 C7 03 add edi,3 + 00000586 F3/ A4 rep movsb + + 00000588 8B 74 24 2C mov esi, [esp+44] + 0000058C 8B 5C 24 08 mov ebx, [esp+8] + 00000590 E9 FFFFFF2D jmp L_while_test_mmx + + ALIGN 4 + 00000598 L_check_dist_one_mmx: + 00000598 83 FB 01 cmp ebx,1 + 0000059B 75 B8 jne L_check_window_mmx + 0000059D 39 7C 24 28 cmp [esp+40],edi + 000005A1 74 B2 je L_check_window_mmx + + 000005A3 4F dec edi + 
000005A4 8B CA mov ecx,edx + 000005A6 8A 07 mov al, [edi] + 000005A8 83 E9 03 sub ecx,3 + + 000005AB 88 47 01 mov [edi+1],al + 000005AE 88 47 02 mov [edi+2],al + 000005B1 88 47 03 mov [edi+3],al + 000005B4 83 C7 04 add edi,4 + 000005B7 F3/ AA rep stosb + + 000005B9 8B 5C 24 08 mov ebx, [esp+8] + 000005BD E9 FFFFFF00 jmp L_while_test_mmx + + ALIGN 4 + 000005C4 L_test_for_second_level_length_mmx: + 000005C4 A8 40 test al,64 + 000005C6 0F 85 000000DE jnz L_test_for_end_of_block + + 000005CC 83 E0 0F and eax,15 + 000005CF 0F D3 C1 psrlq mm0,mm1 + 000005D2 0F 7E C1 movd ecx,mm0 + 000005D5 23 0C 85 and ecx, [inflate_fast_mask+eax*4] + 0000007C R + 000005DC 03 CA add ecx,edx + 000005DE 8B 04 8B mov eax, [ebx+ecx*4] + 000005E1 E9 FFFFFECC jmp L_dolen_mmx + + ALIGN 4 + 000005E8 L_test_for_second_level_dist_mmx: + 000005E8 A8 40 test al,64 + 000005EA 0F 85 000000AE jnz L_invalid_distance_code + + 000005F0 83 E0 0F and eax,15 + 000005F3 0F D3 C1 psrlq mm0,mm1 + 000005F6 0F 7E C1 movd ecx,mm0 + 000005F9 23 0C 85 and ecx, [inflate_fast_mask+eax*4] + 0000007C R + 00000600 8B 44 24 0C mov eax, [esp+12] + 00000604 03 CB add ecx,ebx + 00000606 8B 04 88 mov eax, [eax+ecx*4] + 00000609 E9 FFFFFF19 jmp L_dodist_mmx + + ALIGN 4 + 00000610 L_clip_window_mmx: + + 00000610 8B C8 mov ecx,eax + 00000612 8B 44 24 34 mov eax, [esp+52] + 00000616 F7 D9 neg ecx + 00000618 8B 74 24 38 mov esi, [esp+56] + + 0000061C 3B C3 cmp eax,ebx + 0000061E 0F 82 000000A2 jb L_invalid_distance_too_far + + 00000624 03 CB add ecx,ebx + 00000626 83 7C 24 30 00 cmp dword ptr [esp+48],0 + 0000062B 75 20 jne L_wrap_around_window_mmx + + 0000062D 2B C1 sub eax,ecx + 0000062F 03 F0 add esi,eax + + 00000631 3B D1 cmp edx,ecx + 00000633 76 58 jbe L_do_copy1_mmx + + 00000635 2B D1 sub edx,ecx + 00000637 F3/ A4 rep movsb + 00000639 8B F7 mov esi,edi + 0000063B 2B F3 sub esi,ebx + 0000063D EB 4E jmp L_do_copy1_mmx + + 0000063F 3B D1 cmp edx,ecx + 00000641 76 4A jbe L_do_copy1_mmx + + 00000643 2B D1 sub edx,ecx + 00000645 
F3/ A4 rep movsb + 00000647 8B F7 mov esi,edi + 00000649 2B F3 sub esi,ebx + 0000064B EB 40 jmp L_do_copy1_mmx + + 0000064D L_wrap_around_window_mmx: + + 0000064D 8B 44 24 30 mov eax, [esp+48] + 00000651 3B C8 cmp ecx,eax + 00000653 76 28 jbe L_contiguous_in_window_mmx + + 00000655 03 74 24 34 add esi, [esp+52] + 00000659 03 F0 add esi,eax + 0000065B 2B F1 sub esi,ecx + 0000065D 2B C8 sub ecx,eax + + + 0000065F 3B D1 cmp edx,ecx + 00000661 76 2A jbe L_do_copy1_mmx + + 00000663 2B D1 sub edx,ecx + 00000665 F3/ A4 rep movsb + 00000667 8B 74 24 38 mov esi, [esp+56] + 0000066B 8B 4C 24 30 mov ecx, [esp+48] + 0000066F 3B D1 cmp edx,ecx + 00000671 76 1A jbe L_do_copy1_mmx + + 00000673 2B D1 sub edx,ecx + 00000675 F3/ A4 rep movsb + 00000677 8B F7 mov esi,edi + 00000679 2B F3 sub esi,ebx + 0000067B EB 10 jmp L_do_copy1_mmx + + 0000067D L_contiguous_in_window_mmx: + + 0000067D 03 F0 add esi,eax + 0000067F 2B F1 sub esi,ecx + + + 00000681 3B D1 cmp edx,ecx + 00000683 76 08 jbe L_do_copy1_mmx + + 00000685 2B D1 sub edx,ecx + 00000687 F3/ A4 rep movsb + 00000689 8B F7 mov esi,edi + 0000068B 2B F3 sub esi,ebx + + 0000068D L_do_copy1_mmx: + + + 0000068D 8B CA mov ecx,edx + 0000068F F3/ A4 rep movsb + + 00000691 8B 74 24 2C mov esi, [esp+44] + 00000695 8B 5C 24 08 mov ebx, [esp+8] + 00000699 E9 FFFFFE24 jmp L_while_test_mmx + ; 1174 "inffast.S" + 0000069E L_invalid_distance_code: + + + + + + 0000069E B9 00000044 R mov ecx, invalid_distance_code_msg + 000006A3 BA 0000001A mov edx,INFLATE_MODE_BAD + 000006A8 EB 2C jmp L_update_stream_state + + 000006AA L_test_for_end_of_block: + + + + + + 000006AA A8 20 test al,32 + 000006AC 74 0C jz L_invalid_literal_length_code + + 000006AE B9 00000000 mov ecx,0 + 000006B3 BA 0000000B mov edx,INFLATE_MODE_TYPE + 000006B8 EB 1C jmp L_update_stream_state + + 000006BA L_invalid_literal_length_code: + + + + + + 000006BA B9 00000028 R mov ecx, invalid_literal_length_code_msg + 000006BF BA 0000001A mov edx,INFLATE_MODE_BAD + 000006C4 EB 10 jmp 
L_update_stream_state + + 000006C6 L_invalid_distance_too_far: + + + + 000006C6 8B 74 24 2C mov esi, [esp+44] + 000006CA B9 0000005C R mov ecx, invalid_distance_too_far_msg + 000006CF BA 0000001A mov edx,INFLATE_MODE_BAD + 000006D4 EB 00 jmp L_update_stream_state + + 000006D6 L_update_stream_state: + + 000006D6 8B 44 24 58 mov eax, [esp+88] + 000006DA 85 C9 test ecx,ecx + 000006DC 74 03 jz L_skip_msg + 000006DE 89 48 18 mov [eax+24],ecx + 000006E1 L_skip_msg: + 000006E1 8B 40 1C mov eax, [eax+28] + 000006E4 89 10 mov [eax+mode_state],edx + 000006E6 EB 00 jmp L_break_loop + + ALIGN 4 + 000006E8 L_break_loop: + ; 1243 "inffast.S" + 000006E8 83 3D 00000000 R cmp dword ptr [inflate_fast_use_mmx],2 + 02 + 000006EF 75 02 jne L_update_next_in + + + + 000006F1 8B DD mov ebx,ebp + + 000006F3 L_update_next_in: + ; 1266 "inffast.S" + 000006F3 8B 44 24 58 mov eax, [esp+88] + 000006F7 8B CB mov ecx,ebx + 000006F9 8B 50 1C mov edx, [eax+28] + 000006FC C1 E9 03 shr ecx,3 + 000006FF 2B F1 sub esi,ecx + 00000701 C1 E1 03 shl ecx,3 + 00000704 2B D9 sub ebx,ecx + 00000706 89 78 0C mov [eax+12],edi + 00000709 89 5A 3C mov [edx+bits_state],ebx + 0000070C 8B CB mov ecx,ebx + + 0000070E 8D 5C 24 1C lea ebx, [esp+28] + 00000712 39 5C 24 14 cmp [esp+20],ebx + 00000716 75 14 jne L_buf_not_used + + 00000718 2B F3 sub esi,ebx + 0000071A 8B 18 mov ebx, [eax+0] + 0000071C 89 5C 24 14 mov [esp+20],ebx + 00000720 03 F3 add esi,ebx + 00000722 8B 58 04 mov ebx, [eax+4] + 00000725 83 EB 0B sub ebx,11 + 00000728 01 5C 24 14 add [esp+20],ebx + + 0000072C L_buf_not_used: + 0000072C 89 30 mov [eax+0],esi + + 0000072E BB 00000001 mov ebx,1 + 00000733 D3 E3 shl ebx,cl + 00000735 4B dec ebx + + + + + + 00000736 83 3D 00000000 R cmp dword ptr [inflate_fast_use_mmx],2 + 02 + 0000073D 75 08 jne L_update_hold + + + + 0000073F 0F D3 C1 psrlq mm0,mm1 + 00000742 0F 7E C5 movd ebp,mm0 + + 00000745 0F 77 emms + + 00000747 L_update_hold: + + + + 00000747 23 EB and ebp,ebx + 00000749 89 6A 38 mov [edx+hold_state],ebp 
+ + + + + 0000074C 8B 5C 24 14 mov ebx, [esp+20] + 00000750 3B DE cmp ebx,esi + 00000752 76 0A jbe L_last_is_smaller + + 00000754 2B DE sub ebx,esi + 00000756 83 C3 0B add ebx,11 + 00000759 89 58 04 mov [eax+4],ebx + 0000075C EB 0A jmp L_fixup_out + 0000075E L_last_is_smaller: + 0000075E 2B F3 sub esi,ebx + 00000760 F7 DE neg esi + 00000762 83 C6 0B add esi,11 + 00000765 89 70 04 mov [eax+4],esi + + + + + 00000768 L_fixup_out: + + 00000768 8B 5C 24 10 mov ebx, [esp+16] + 0000076C 3B DF cmp ebx,edi + 0000076E 76 0D jbe L_end_is_smaller + + 00000770 2B DF sub ebx,edi + 00000772 81 C3 00000101 add ebx,257 + 00000778 89 58 10 mov [eax+16],ebx + 0000077B EB 0D jmp L_done + 0000077D L_end_is_smaller: + 0000077D 2B FB sub edi,ebx + 0000077F F7 DF neg edi + 00000781 81 C7 00000101 add edi,257 + 00000787 89 78 10 mov [eax+16],edi + + + + + + 0000078A L_done: + 0000078A 83 C4 40 add esp,64 + 0000078D 9D popfd + 0000078E 5B pop ebx + 0000078F 5D pop ebp + 00000790 5E pop esi + 00000791 5F pop edi + 00000792 C3 ret + 00000793 _inflate_fast endp + + 00000004 _TEXT ends + end + Microsoft (R) Macro Assembler Version 14.16.27031.1 09/14/19 11:35:23 +inffas32.asm Symbols 2 - 1 + + + + +Segments and Groups: + + N a m e Size Length Align Combine Class + +FLAT . . . . . . . . . . . . . . GROUP +_DATA . . . . . . . . . . . . . 32 Bit 00000004 Para Public 'DATA' +_TEXT . . . . . . . . . . . . . 32 Bit 00000793 Para Public 'CODE' + + +Procedures, parameters, and locals: + + N a m e Type Value Attr + +_inflate_fast . . . . . . . . . P Near 00000100 _TEXT Length= 00000693 Public + L_align_long . . . . . . . . . L Near 000001BA _TEXT + L_is_aligned . . . . . . . . . L Near 000001D2 _TEXT + L_check_mmx . . . . . . . . . L Near 000001D6 _TEXT + L_use_mmx . . . . . . . . . . L Near 00000233 _TEXT + L_dont_use_mmx . . . . . . . . L Near 0000023F _TEXT + L_check_mmx_pop . . . . . . . L Near 00000249 _TEXT + L_do_loop . . . . . . . . . . L Near 00000250 _TEXT + L_get_length_code . . . . . . 
L Near 00000262 _TEXT + L_dolen . . . . . . . . . . . L Near 0000026E _TEXT + L_while_test . . . . . . . . . L Near 0000027C _TEXT + L_test_for_length_base . . . . L Near 00000291 _TEXT + L_add_bits_to_len . . . . . . L Near 000002BA _TEXT + L_save_len . . . . . . . . . . L Near 000002CA _TEXT + L_decode_distance . . . . . . L Near 000002CE _TEXT + L_get_distance_code . . . . . L Near 000002E0 _TEXT + L_dodist . . . . . . . . . . . L Near 000002ED _TEXT + L_add_bits_to_dist . . . . . . L Near 0000031C _TEXT + L_check_window . . . . . . . . L Near 0000032E _TEXT + L_check_dist_one . . . . . . . L Near 0000036C _TEXT + L_test_for_second_level_length . L Near 00000394 _TEXT + L_test_for_second_level_dist . L Near 000003B4 _TEXT + L_clip_window . . . . . . . . L Near 000003D4 _TEXT + L_wrap_around_window . . . . . L Near 00000415 _TEXT + L_contiguous_in_window . . . . L Near 00000449 _TEXT + L_do_copy1 . . . . . . . . . . L Near 0000045D _TEXT + L_init_mmx . . . . . . . . . . L Near 0000046C _TEXT + L_do_loop_mmx . . . . . . . . L Near 0000048C _TEXT + L_get_length_code_mmx . . . . L Near 000004A6 _TEXT + L_dolen_mmx . . . . . . . . . L Near 000004B2 _TEXT + L_while_test_mmx . . . . . . . L Near 000004C2 _TEXT + L_test_for_length_base_mmx . . L Near 000004D7 _TEXT + L_decode_distance_mmx . . . . L Near 000004FD _TEXT + L_get_dist_code_mmx . . . . . L Near 00000517 _TEXT + L_dodist_mmx . . . . . . . . . L Near 00000527 _TEXT + L_add_bits_to_dist_mmx . . . . L Near 00000541 _TEXT + L_check_window_mmx . . . . . . L Near 00000555 _TEXT + L_check_dist_one_mmx . . . . . L Near 00000598 _TEXT + L_test_for_second_level_length_mmx . L Near 000005C4 _TEXT + L_test_for_second_level_dist_mmx . L Near 000005E8 _TEXT + L_clip_window_mmx . . . . . . L Near 00000610 _TEXT + L_wrap_around_window_mmx . . . L Near 0000064D _TEXT + L_contiguous_in_window_mmx . . L Near 0000067D _TEXT + L_do_copy1_mmx . . . . . . . . L Near 0000068D _TEXT + L_invalid_distance_code . . . 
L Near 0000069E _TEXT + L_test_for_end_of_block . . . L Near 000006AA _TEXT + L_invalid_literal_length_code L Near 000006BA _TEXT + L_invalid_distance_too_far . . L Near 000006C6 _TEXT + L_update_stream_state . . . . L Near 000006D6 _TEXT + L_skip_msg . . . . . . . . . . L Near 000006E1 _TEXT + L_break_loop . . . . . . . . . L Near 000006E8 _TEXT + L_update_next_in . . . . . . . L Near 000006F3 _TEXT + L_buf_not_used . . . . . . . . L Near 0000072C _TEXT + L_update_hold . . . . . . . . L Near 00000747 _TEXT + L_last_is_smaller . . . . . . L Near 0000075E _TEXT + L_fixup_out . . . . . . . . . L Near 00000768 _TEXT + L_end_is_smaller . . . . . . . L Near 0000077D _TEXT + L_done . . . . . . . . . . . . L Near 0000078A _TEXT + + +Symbols: + + N a m e Type Value Attr + +@CodeSize . . . . . . . . . . . Number 00000000h +@DataSize . . . . . . . . . . . Number 00000000h +@Interface . . . . . . . . . . . Number 00000000h +@Model . . . . . . . . . . . . . Number 00000007h +@code . . . . . . . . . . . . . Text _TEXT +@data . . . . . . . . . . . . . Text FLAT +@fardata? . . . . . . . . . . . Text FLAT +@fardata . . . . . . . . . . . . Text FLAT +@stack . . . . . . . . . . . . . Text FLAT +INFLATE_MODE_BAD . . . . . . . . Number 0000001Ah +INFLATE_MODE_TYPE . . . . . . . Number 0000000Bh +bits_state . . . . . . . . . . . Number 0000003Ch +distbits_state . . . . . . . . . Number 00000058h +distcode_state . . . . . . . . . Number 00000050h +hold_state . . . . . . . . . . . Number 00000038h +inflate_fast_mask . . . . . . . L Near 0000007C _TEXT +inflate_fast_use_mmx . . . . . . L Near 00000000 _DATA +invalid_distance_code_msg . . . L Near 00000044 _TEXT +invalid_distance_too_far_msg . . L Near 0000005C _TEXT +invalid_literal_length_code_msg L Near 00000028 _TEXT +lenbits_state . . . . . . . . . Number 00000054h +lencode_state . . . . . . . . . Number 0000004Ch +mode_state . . . . . . . . . . . Number 00000000h +window_state . . . . . . . . . . Number 00000034h +write_state . . . . 
. . . . . . Number 00000030h +wsize_state . . . . . . . . . . Number 00000028h +zlib1222sup . . . . . . . . . . Number 00000008h + + 0 Warnings + 0 Errors diff --git a/zlib/contrib/masmx86/match686.asm b/zlib/contrib/masmx86/match686.asm new file mode 100644 index 0000000..69e0eed --- /dev/null +++ b/zlib/contrib/masmx86/match686.asm @@ -0,0 +1,479 @@ +; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86 +; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant. +; File written by Gilles Vollant, by converting match686.S from Brian Raiter +; for MASM. This is an assembly version of longest_match +; from Jean-loup Gailly in deflate.c +; +; http://www.zlib.net +; http://www.winimage.com/zLibDll +; http://www.muppetlabs.com/~breadbox/software/assembly.html +; +; For Visual C++ 4.x and higher and ML 6.x and higher +; ml.exe is distributed in +; http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64 +; +; this file contain two implementation of longest_match +; +; this longest_match was written by Brian Raiter (1998), optimized for Pentium Pro +; (and the faster known version of match_init on modern Core 2 Duo and AMD Phenom) +; +; for using an assembly version of longest_match, you need to define ASMV in project +; +; compile the asm file running +; ml /coff /Zi /c /Flmatch686.lst match686.asm +; and do not include match686.obj in your project +; +; note: contrib of zLib 1.2.3 and earlier contained both a deprecated version for +; Pentium (prior Pentium Pro) and this version for Pentium Pro and modern processor +; with autoselect (with cpu detection code) +; if you want to support the old Pentium optimization, you can still use those versions +; +; this file is not optimized for old pentium, but it is compatible with all x86 32 bits +; processor (starting 80386) +; +; +; see below : zlib1222add must be adjusted if you use a zlib version < 1.2.2.2 + +;uInt longest_match(s, cur_match) +; 
deflate_state *s; +; IPos cur_match; /* current match */ + +; Stack-frame equates expressed relative to NbStack (size of the frame in bytes). +; NOTE(review): of the names below, the longest_match body in this file only +; uses "window" and "scan", and both are re-equated further down before that +; use; the rest look like leftovers from an earlier implementation -- confirm +; before removing. + NbStack equ 76 + cur_match equ dword ptr[esp+NbStack-0] + str_s equ dword ptr[esp+NbStack-4] +; 5 dwords on top (ret,ebp,esi,edi,ebx) + adrret equ dword ptr[esp+NbStack-8] + pushebp equ dword ptr[esp+NbStack-12] + pushedi equ dword ptr[esp+NbStack-16] + pushesi equ dword ptr[esp+NbStack-20] + pushebx equ dword ptr[esp+NbStack-24] + + chain_length equ dword ptr [esp+NbStack-28] + limit equ dword ptr [esp+NbStack-32] + best_len equ dword ptr [esp+NbStack-36] + window equ dword ptr [esp+NbStack-40] + prev equ dword ptr [esp+NbStack-44] + scan_start equ word ptr [esp+NbStack-48] + wmask equ dword ptr [esp+NbStack-52] + match_start_ptr equ dword ptr [esp+NbStack-56] + nice_match equ dword ptr [esp+NbStack-60] + scan equ dword ptr [esp+NbStack-64] + + windowlen equ dword ptr [esp+NbStack-68] + match_start equ dword ptr [esp+NbStack-72] + strend equ dword ptr [esp+NbStack-76] + NbStackAdd equ (NbStack-24) + + .386p + + name gvmatch + .MODEL FLAT + + + +; All the +zlib1222add offsets are due to the addition of fields +; to the deflate_state structure in zlib since the asm code was first written: +; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). +; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). +; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8"). 
+ + zlib1222add equ 8 + +; Note: these values are valid when deflate_state is packed on 8-byte boundaries. +; NOTE(review): the dep_* equates are not referenced by the longest_match code in +; this file, which reads the structure through the ds* equates -- confirm before +; relying on them. + dep_chain_length equ 74h+zlib1222add + dep_window equ 30h+zlib1222add + dep_strstart equ 64h+zlib1222add + dep_prev_length equ 70h+zlib1222add + dep_nice_match equ 88h+zlib1222add + dep_w_size equ 24h+zlib1222add + dep_prev equ 38h+zlib1222add + dep_w_mask equ 2ch+zlib1222add + dep_good_match equ 84h+zlib1222add + dep_match_start equ 68h+zlib1222add + dep_lookahead equ 6ch+zlib1222add + + +_TEXT segment + +IFDEF NOUNDERLINE + public longest_match + public match_init +ELSE + public _longest_match + public _match_init +ENDIF + +;;; Match-length limits. + + MAX_MATCH equ 258 + MIN_MATCH equ 3 + MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) + +;;; MAX_MATCH rounded up to a multiple of 8. The mask clears only the low three +;;; bits, so the result is 0108h -- the same value the string-compare loop +;;; hard-codes (a mask of 0FFF0h would round down to 0100h instead). + +MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF8h) + + +;;; stack frame offsets (text equates, used as [name] after the prologue) + +chainlenwmask equ esp + 0 ; high word: current chain len + ; low word: s->wmask +window equ esp + 4 ; local copy of s->window +windowbestlen equ esp + 8 ; s->window + bestlen +scanstart equ esp + 16 ; first two bytes of string +scanend equ esp + 12 ; last two bytes of string +scanalign equ esp + 20 ; dword-misalignment of string +nicematch equ esp + 24 ; a good enough match size +bestlen equ esp + 28 ; size of best match so far +scan equ esp + 32 ; ptr to string wanting match + +LocalVarsSize equ 36 ; bytes of locals above (9 dwords) +;;; Above the locals, in the order left by the prologue (push ebp, edi, esi, ebx): +; saved ebx at esp + 36 +; saved esi at esp + 40 +; saved edi at esp + 44 +; saved ebp at esp + 48 +; return address at esp + 52 +deflatestate equ esp + 56 ; the function arguments +curmatch equ esp + 60 + +;;; Offsets for fields in the deflate_state structure. These numbers +;;; are calculated from the definition of deflate_state, with the +;;; assumption that the compiler will dword-align the fields. (Thus, +;;; changing the definition of deflate_state could easily cause this +;;; program to crash horribly, without so much as a warning at +;;; compile time. Sigh.) 
+ +dsWSize equ 36+zlib1222add +dsWMask equ 44+zlib1222add +dsWindow equ 48+zlib1222add +dsPrev equ 56+zlib1222add +dsMatchLen equ 88+zlib1222add +dsPrevMatch equ 92+zlib1222add +dsStrStart equ 100+zlib1222add +dsMatchStart equ 104+zlib1222add +dsLookahead equ 108+zlib1222add +dsPrevLen equ 112+zlib1222add +dsMaxChainLen equ 116+zlib1222add +dsGoodMatch equ 132+zlib1222add +dsNiceMatch equ 136+zlib1222add + + +;;; match686.asm -- Pentium-Pro-optimized version of longest_match() +;;; Written for zlib 1.1.2 +;;; Copyright (C) 1998 Brian Raiter +;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html +;;; +;; +;; This software is provided 'as-is', without any express or implied +;; warranty. In no event will the authors be held liable for any damages +;; arising from the use of this software. +;; +;; Permission is granted to anyone to use this software for any purpose, +;; including commercial applications, and to alter it and redistribute it +;; freely, subject to the following restrictions: +;; +;; 1. The origin of this software must not be misrepresented; you must not +;; claim that you wrote the original software. If you use this software +;; in a product, an acknowledgment in the product documentation would be +;; appreciated but is not required. +;; 2. Altered source versions must be plainly marked as such, and must not be +;; misrepresented as being the original software +;; 3. This notice may not be removed or altered from any source distribution. +;; + +;GLOBAL _longest_match, _match_init + + +;SECTION .text + +;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch) +;;; +;;; Walks the hash chain starting at curmatch looking for the longest string +;;; in the window that matches the string at s->window + s->strstart. +;;; In: both arguments on the stack, read through the deflatestate and +;;; curmatch equates once the prologue has run. +;;; Out: eax = the smaller of the best length found and s->lookahead; +;;; s->match_start is written whenever a longer match is accepted. +;;; ebx, esi, edi and ebp are pushed on entry and popped before ret. + +;_longest_match: + IFDEF NOUNDERLINE + longest_match proc near + ELSE + _longest_match proc near + ENDIF +.FPO (9, 4, 0, 0, 1, 0) + +;;; Save registers that the compiler may be using, and adjust esp to +;;; make room for our stack frame. + + push ebp + push edi + push esi + push ebx + sub esp, LocalVarsSize + +;;; Retrieve the function arguments. 
ecx will hold cur_match +;;; throughout the entire function. edx will hold the pointer to the +;;; deflate_state structure during the function's setup (before +;;; entering the main loop). + + mov edx, [deflatestate] + mov ecx, [curmatch] + +;;; uInt wmask = s->w_mask; +;;; unsigned chain_length = s->max_chain_length; +;;; if (s->prev_length >= s->good_match) { +;;; chain_length >>= 2; +;;; } + + mov eax, [edx + dsPrevLen] + mov ebx, [edx + dsGoodMatch] + cmp eax, ebx + mov eax, [edx + dsWMask] + mov ebx, [edx + dsMaxChainLen] + jl LastMatchGood ; still tests the cmp above: the two movs leave the flags alone + shr ebx, 2 +LastMatchGood: + +;;; chainlen is decremented once beforehand so that the function can +;;; use the sign flag instead of the zero flag for the exit test. +;;; It is then shifted into the high word, to make room for the wmask +;;; value, which it will always accompany. + + dec ebx + shl ebx, 16 + or ebx, eax + mov [chainlenwmask], ebx + +;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; +;;; NOTE(review): the C line compares as unsigned, but jl below is a signed +;;; test; the two agree only while both values stay below 2^31 -- confirm. + + mov eax, [edx + dsNiceMatch] + mov ebx, [edx + dsLookahead] + cmp ebx, eax + jl LookaheadLess + mov ebx, eax +LookaheadLess: mov [nicematch], ebx + +;;; register Bytef *scan = s->window + s->strstart; + + mov esi, [edx + dsWindow] + mov [window], esi + mov ebp, [edx + dsStrStart] + lea edi, [esi + ebp] + mov [scan], edi + +;;; Determine how many bytes the scan ptr is off from being +;;; dword-aligned. + + mov eax, edi + neg eax + and eax, 3 ; eax = (-scan) & 3, i.e. bytes up to the next dword boundary + mov [scanalign], eax + +;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? +;;; s->strstart - (IPos)MAX_DIST(s) : NIL; + + mov eax, [edx + dsWSize] + sub eax, MIN_LOOKAHEAD + sub ebp, eax + jg LimitPositive + xor ebp, ebp ; limit = 0 when strstart does not exceed w_size - MIN_LOOKAHEAD +LimitPositive: + +;;; int best_len = s->prev_length; + + mov eax, [edx + dsPrevLen] + mov [bestlen], eax + +;;; Store the sum of s->window + best_len in windowbestlen locally, and in esi. 
+ + add esi, eax + mov [windowbestlen], esi + +;;; register ush scan_start = *(ushf*)scan; +;;; register ush scan_end = *(ushf*)(scan+best_len-1); +;;; Posf *prev = s->prev; + + movzx ebx, word ptr [edi] + mov [scanstart], ebx + movzx ebx, word ptr [edi + eax - 1] + mov [scanend], ebx + mov edi, [edx + dsPrev] + +;;; Jump into the main loop. + + mov edx, [chainlenwmask] + jmp short LoopEntry + +align 4 + +;;; do { +;;; match = s->window + cur_match; +;;; if (*(ushf*)(match+best_len-1) != scan_end || +;;; *(ushf*)match != scan_start) continue; +;;; [...] +;;; } while ((cur_match = prev[cur_match & wmask]) > limit +;;; && --chain_length != 0); +;;; +;;; Here is the inner loop of the function. The function will spend the +;;; majority of its time in this loop, and majority of that time will +;;; be spent in the first ten instructions. +;;; +;;; Within this loop: +;;; ebx = scanend +;;; ecx = curmatch +;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) +;;; esi = windowbestlen - i.e., (window + bestlen) +;;; edi = prev +;;; ebp = limit + +LookupLoop: + and ecx, edx + movzx ecx, word ptr [edi + ecx*2] ; cur_match = prev[cur_match & wmask] + cmp ecx, ebp + jbe LeaveNow ; stop once cur_match <= limit + sub edx, 00010000h ; --chainlen (kept in the high word of edx) + js LeaveNow ; chain budget exhausted +LoopEntry: movzx eax, word ptr [esi + ecx - 1] + cmp eax, ebx + jnz LookupLoop + mov eax, [window] + movzx eax, word ptr [eax + ecx] + cmp eax, [scanstart] + jnz LookupLoop + +;;; Store the current value of chainlen. + + mov [chainlenwmask], edx + +;;; Point edi to the string under scrutiny, and esi to the string we +;;; are hoping to match it up with. In actuality, esi and edi are +;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is +;;; initialized to -(MAX_MATCH_8 - scanalign). +;;; NOTE(review): the code below adds scanalign to both pointers and loads +;;; edx with -0108h, so the pointers end up (0108h + scanalign) ahead; the +;;; signs in the paragraph above look inconsistent with that -- confirm. + + mov esi, [window] + mov edi, [scan] + add esi, ecx + mov eax, [scanalign] + mov edx, 0fffffef8h; -(MAX_MATCH_8) + lea edi, [edi + eax + 0108h] ;MAX_MATCH_8] + lea esi, [esi + eax + 0108h] ;MAX_MATCH_8] + +;;; Test the strings for equality, 8 bytes at a time. 
At the end, +;;; adjust edx so that it is offset to the exact byte that mismatched. +;;; +;;; We already know at this point that the first three bytes of the +;;; strings match each other, and they can be safely passed over before +;;; starting the compare loop. So what this code does is skip over 0-3 +;;; bytes, as much as necessary in order to dword-align the edi +;;; pointer. (esi will still be misaligned three times out of four.) +;;; +;;; It should be confessed that this loop usually does not represent +;;; much of the total running time. Replacing it with a more +;;; straightforward "rep cmpsb" would not drastically degrade +;;; performance. + +LoopCmps: + mov eax, [esi + edx] + xor eax, [edi + edx] ; eax = 0 iff the two dwords are equal + jnz LeaveLoopCmps + mov eax, [esi + edx + 4] + xor eax, [edi + edx + 4] + jnz LeaveLoopCmps4 + add edx, 8 + jnz LoopCmps ; edx counts up from -0108h; zero means every dword matched + jmp short LenMaximum +LeaveLoopCmps4: add edx, 4 +LeaveLoopCmps: test eax, 0000FFFFh ; low half of the XOR zero => first difference is in bytes 2-3 + jnz LenLower + add edx, 2 + shr eax, 16 +LenLower: sub al, 1 ; borrows (CF=1) only when al was 0, i.e. this byte still matched + adc edx, 0 ; ...in which case step one byte further + +;;; Calculate the length of the match. If it is longer than MAX_MATCH, +;;; then automatically accept it as the best possible match and leave. + + lea eax, [edi + edx] + mov edi, [scan] + sub eax, edi ; eax = address of first mismatch - scan = match length + cmp eax, MAX_MATCH + jge LenMaximum + +;;; If the length of the match is not longer than the best match we +;;; have so far, then forget it and return to the lookup loop. 
+ + mov edx, [deflatestate] + mov ebx, [bestlen] + cmp eax, ebx + jg LongerMatch + mov esi, [windowbestlen] ; reload the loop registers clobbered by the compare + mov edi, [edx + dsPrev] + mov ebx, [scanend] + mov edx, [chainlenwmask] + jmp LookupLoop + +;;; s->match_start = cur_match; +;;; best_len = len; +;;; if (len >= nice_match) break; +;;; scan_end = *(ushf*)(scan+best_len-1); + +LongerMatch: mov ebx, [nicematch] + mov [bestlen], eax + mov [edx + dsMatchStart], ecx + cmp eax, ebx + jge LeaveNow + mov esi, [window] + add esi, eax + mov [windowbestlen], esi + movzx ebx, word ptr [edi + eax - 1] ; edi = scan here (reloaded before the length test) + mov edi, [edx + dsPrev] + mov [scanend], ebx + mov edx, [chainlenwmask] + jmp LookupLoop + +;;; Accept the current string, with the maximum possible length. + +LenMaximum: mov edx, [deflatestate] + mov dword ptr [bestlen], MAX_MATCH + mov [edx + dsMatchStart], ecx + +;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; +;;; return s->lookahead; + +LeaveNow: + mov edx, [deflatestate] + mov ebx, [bestlen] + mov eax, [edx + dsLookahead] + cmp ebx, eax + jg LookaheadRet + mov eax, ebx +LookaheadRet: + +;;; Restore the stack and return from whence we came. + + add esp, LocalVarsSize + pop ebx + pop esi + pop edi + pop ebp + + ret +; please don't remove this string ! +; You can freely use match686 in any free or commercial app if you don't remove the string in the binary! 
+ db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah + + + IFDEF NOUNDERLINE + longest_match endp + ELSE + _longest_match endp + ENDIF + +; match_init is an empty stub in this file: its body is a single ret. + + IFDEF NOUNDERLINE + match_init proc near + ret + match_init endp + ELSE + _match_init proc near + ret + _match_init endp + ENDIF + + +_TEXT ends +end diff --git a/zlib/contrib/masmx86/match686.lst b/zlib/contrib/masmx86/match686.lst new file mode 100644 index 0000000..315ad87 --- /dev/null +++ b/zlib/contrib/masmx86/match686.lst @@ -0,0 +1,624 @@ +Microsoft (R) Macro Assembler Version 14.16.27031.1 09/14/19 11:35:23 +match686.asm Page 1 - 1 + + + ; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86 + ; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant. + ; File written by Gilles Vollant, by converting match686.S from Brian Raiter + ; for MASM. This is as assembly version of longest_match + ; from Jean-loup Gailly in deflate.c + ; + ; http://www.zlib.net + ; http://www.winimage.com/zLibDll + ; http://www.muppetlabs.com/~breadbox/software/assembly.html + ; + ; For Visual C++ 4.x and higher and ML 6.x and higher + ; ml.exe is distributed in + ; http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64 + ; + ; this file contain two implementation of longest_match + ; + ; this longest_match was written by Brian raiter (1998), optimized for Pentium Pro + ; (and the faster known version of match_init on modern Core 2 Duo and AMD Phenom) + ; + ; for using an assembly version of longest_match, you need define ASMV in project + ; + ; compile the asm file running + ; ml /coff /Zi /c /Flmatch686.lst match686.asm + ; and do not include match686.obj in your project + ; + ; note: contrib of zLib 1.2.3 and earlier contained both a deprecated version for + ; Pentium (prior Pentium Pro) and this version for Pentium Pro and modern processor + ; with autoselect (with cpu detection code) + ; if you want support the old 
pentium optimization, you can still use these version + ; + ; this file is not optimized for old pentium, but it compatible with all x86 32 bits + ; processor (starting 80386) + ; + ; + ; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2 + + ;uInt longest_match(s, cur_match) + ; deflate_state *s; + ; IPos cur_match; /* current match */ + + = 0000004C NbStack equ 76 + = dword ptr[esp+NbStack-0] cur_match equ dword ptr[esp+NbStack-0] + = dword ptr[esp+NbStack-4] str_s equ dword ptr[esp+NbStack-4] + ; 5 dword on top (ret,ebp,esi,edi,ebx) + = dword ptr[esp+NbStack-8] adrret equ dword ptr[esp+NbStack-8] + = dword ptr[esp+NbStack-12 pushebp equ dword ptr[esp+NbStack-12] + ] + = dword ptr[esp+NbStack-16 pushedi equ dword ptr[esp+NbStack-16] + ] + = dword ptr[esp+NbStack-20 pushesi equ dword ptr[esp+NbStack-20] + ] + = dword ptr[esp+NbStack-24 pushebx equ dword ptr[esp+NbStack-24] + ] + + = dword ptr [esp+NbStack-2 chain_length equ dword ptr [esp+NbStack-28] + 8] + = dword ptr [esp+NbStack-3 limit equ dword ptr [esp+NbStack-32] + 2] + = dword ptr [esp+NbStack-3 best_len equ dword ptr [esp+NbStack-36] + 6] + = dword ptr [esp+NbStack-4 window equ dword ptr [esp+NbStack-40] + 0] + = dword ptr [esp+NbStack-4 prev equ dword ptr [esp+NbStack-44] + 4] + = word ptr [esp+NbStack-48 scan_start equ word ptr [esp+NbStack-48] + ] + = dword ptr [esp+NbStack-5 wmask equ dword ptr [esp+NbStack-52] + 2] + = dword ptr [esp+NbStack-5 match_start_ptr equ dword ptr [esp+NbStack-56] + 6] + = dword ptr [esp+NbStack-6 nice_match equ dword ptr [esp+NbStack-60] + 0] + = dword ptr [esp+NbStack-6 scan equ dword ptr [esp+NbStack-64] + 4] + + = dword ptr [esp+NbStack-6 windowlen equ dword ptr [esp+NbStack-68] + 8] + = dword ptr [esp+NbStack-7 match_start equ dword ptr [esp+NbStack-72] + 2] + = dword ptr [esp+NbStack-7 strend equ dword ptr [esp+NbStack-76] + 6] + = 00000034 NbStackAdd equ (NbStack-24) + + .386p + + name gvmatch + .MODEL FLAT + + + + ; all the +zlib1222add 
offsets are due to the addition of fields + ; in zlib in the deflate_state structure since the asm code was first written + ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). + ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). + ; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). + + = 00000008 zlib1222add equ 8 + + ; Note : these value are good with a 8 bytes boundary pack structure + = 0000007C dep_chain_length equ 74h+zlib1222add + = 00000038 dep_window equ 30h+zlib1222add + = 0000006C dep_strstart equ 64h+zlib1222add + = 00000078 dep_prev_length equ 70h+zlib1222add + = 00000090 dep_nice_match equ 88h+zlib1222add + = 0000002C dep_w_size equ 24h+zlib1222add + = 00000040 dep_prev equ 38h+zlib1222add + = 00000034 dep_w_mask equ 2ch+zlib1222add + = 0000008C dep_good_match equ 84h+zlib1222add + = 00000070 dep_match_start equ 68h+zlib1222add + = 00000074 dep_lookahead equ 6ch+zlib1222add + + + 00000000 _TEXT segment + + IFDEF NOUNDERLINE + ELSE + public _longest_match + public _match_init + ENDIF + + = 00000102 MAX_MATCH equ 258 + = 00000003 MIN_MATCH equ 3 + = 00000106 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) + + + + = 00000102 MAX_MATCH equ 258 + = 00000003 MIN_MATCH equ 3 + = 00000106 MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1) + = 00000100 MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h) + + + ;;; stack frame offsets + + = esp + 0 chainlenwmask equ esp + 0 ; high word: current chain len + ; low word: s->wmask + = esp + 4 window equ esp + 4 ; local copy of s->window + = esp + 8 windowbestlen equ esp + 8 ; s->window + bestlen + = esp + 16 scanstart equ esp + 16 ; first two bytes of string + = esp + 12 scanend equ esp + 12 ; last two bytes of string + = esp + 20 scanalign equ esp + 20 ; dword-misalignment of string + = esp + 24 nicematch equ esp + 24 ; a good enough match size + = esp + 28 bestlen equ esp + 28 ; size of best match so far + = esp + 32 scan equ esp + 32 ; ptr to string wanting 
match + + = 00000024 LocalVarsSize equ 36 + ; saved ebx byte esp + 36 + ; saved edi byte esp + 40 + ; saved esi byte esp + 44 + ; saved ebp byte esp + 48 + ; return address byte esp + 52 + = esp + 56 deflatestate equ esp + 56 ; the function arguments + = esp + 60 curmatch equ esp + 60 + + ;;; Offsets for fields in the deflate_state structure. These numbers + ;;; are calculated from the definition of deflate_state, with the + ;;; assumption that the compiler will dword-align the fields. (Thus, + ;;; changing the definition of deflate_state could easily cause this + ;;; program to crash horribly, without so much as a warning at + ;;; compile time. Sigh.) + + = 0000002C dsWSize equ 36+zlib1222add + = 00000034 dsWMask equ 44+zlib1222add + = 00000038 dsWindow equ 48+zlib1222add + = 00000040 dsPrev equ 56+zlib1222add + = 00000060 dsMatchLen equ 88+zlib1222add + = 00000064 dsPrevMatch equ 92+zlib1222add + = 0000006C dsStrStart equ 100+zlib1222add + = 00000070 dsMatchStart equ 104+zlib1222add + = 00000074 dsLookahead equ 108+zlib1222add + = 00000078 dsPrevLen equ 112+zlib1222add + = 0000007C dsMaxChainLen equ 116+zlib1222add + = 0000008C dsGoodMatch equ 132+zlib1222add + = 00000090 dsNiceMatch equ 136+zlib1222add + + + ;;; match686.asm -- Pentium-Pro-optimized version of longest_match() + ;;; Written for zlib 1.1.2 + ;;; Copyright (C) 1998 Brian Raiter + ;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html + ;;; + ;; + ;; This software is provided 'as-is', without any express or implied + ;; warranty. In no event will the authors be held liable for any damages + ;; arising from the use of this software. + ;; + ;; Permission is granted to anyone to use this software for any purpose, + ;; including commercial applications, and to alter it and redistribute it + ;; freely, subject to the following restrictions: + ;; + ;; 1. The origin of this software must not be misrepresented; you must not + ;; claim that you wrote the original software. 
If you use this software + ;; in a product, an acknowledgment in the product documentation would be + ;; appreciated but is not required. + ;; 2. Altered source versions must be plainly marked as such, and must not be + ;; misrepresented as being the original software + ;; 3. This notice may not be removed or altered from any source distribution. + ;; + + ;GLOBAL _longest_match, _match_init + + + ;SECTION .text + + ;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch) + + ;_longest_match: + IFDEF NOUNDERLINE + ELSE + 00000000 _longest_match proc near + ENDIF + 00000000 .FPO (9, 4, 0, 0, 1, 0) + + ;;; Save registers that the compiler may be using, and adjust esp to + ;;; make room for our stack frame. + + 00000000 55 push ebp + 00000001 57 push edi + 00000002 56 push esi + 00000003 53 push ebx + 00000004 83 EC 24 sub esp, LocalVarsSize + + ;;; Retrieve the function arguments. ecx will hold cur_match + ;;; throughout the entire function. edx will hold the pointer to the + ;;; deflate_state structure during the function's setup (before + ;;; entering the main loop. + + 00000007 8B 54 24 38 mov edx, [deflatestate] + 0000000B 8B 4C 24 3C mov ecx, [curmatch] + + ;;; uInt wmask = s->w_mask; + ;;; unsigned chain_length = s->max_chain_length; + ;;; if (s->prev_length >= s->good_match) { + ;;; chain_length >>= 2; + ;;; } + + 0000000F 8B 42 78 mov eax, [edx + dsPrevLen] + 00000012 8B 9A 0000008C mov ebx, [edx + dsGoodMatch] + 00000018 3B C3 cmp eax, ebx + 0000001A 8B 42 34 mov eax, [edx + dsWMask] + 0000001D 8B 5A 7C mov ebx, [edx + dsMaxChainLen] + 00000020 7C 03 jl LastMatchGood + 00000022 C1 EB 02 shr ebx, 2 + 00000025 LastMatchGood: + + ;;; chainlen is decremented once beforehand so that the function can + ;;; use the sign flag instead of the zero flag for the exit test. + ;;; It is then shifted into the high word, to make room for the wmask + ;;; value, which it will always accompany. 
+ + 00000025 4B dec ebx + 00000026 C1 E3 10 shl ebx, 16 + 00000029 0B D8 or ebx, eax + 0000002B 89 1C 24 mov [chainlenwmask], ebx + + ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; + + 0000002E 8B 82 00000090 mov eax, [edx + dsNiceMatch] + 00000034 8B 5A 74 mov ebx, [edx + dsLookahead] + 00000037 3B D8 cmp ebx, eax + 00000039 7C 02 jl LookaheadLess + 0000003B 8B D8 mov ebx, eax + 0000003D 89 5C 24 18 LookaheadLess: mov [nicematch], ebx + + ;;; register Bytef *scan = s->window + s->strstart; + + 00000041 8B 72 38 mov esi, [edx + dsWindow] + 00000044 89 74 24 04 mov [window], esi + 00000048 8B 6A 6C mov ebp, [edx + dsStrStart] + 0000004B 8D 3C 2E lea edi, [esi + ebp] + 0000004E 89 7C 24 20 mov [scan], edi + + ;;; Determine how many bytes the scan ptr is off from being + ;;; dword-aligned. + + 00000052 8B C7 mov eax, edi + 00000054 F7 D8 neg eax + 00000056 83 E0 03 and eax, 3 + 00000059 89 44 24 14 mov [scanalign], eax + + ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? + ;;; s->strstart - (IPos)MAX_DIST(s) : NIL; + + 0000005D 8B 42 2C mov eax, [edx + dsWSize] + 00000060 2D 00000106 sub eax, MIN_LOOKAHEAD + 00000065 2B E8 sub ebp, eax + 00000067 7F 02 jg LimitPositive + 00000069 33 ED xor ebp, ebp + 0000006B LimitPositive: + + ;;; int best_len = s->prev_length; + + 0000006B 8B 42 78 mov eax, [edx + dsPrevLen] + 0000006E 89 44 24 1C mov [bestlen], eax + + ;;; Store the sum of s->window + best_len in esi locally, and in esi. + + 00000072 03 F0 add esi, eax + 00000074 89 74 24 08 mov [windowbestlen], esi + + ;;; register ush scan_start = *(ushf*)scan; + ;;; register ush scan_end = *(ushf*)(scan+best_len-1); + ;;; Posf *prev = s->prev; + + 00000078 0F B7 1F movzx ebx, word ptr [edi] + 0000007B 89 5C 24 10 mov [scanstart], ebx + 0000007F 0F B7 5C 07 FF movzx ebx, word ptr [edi + eax - 1] + 00000084 89 5C 24 0C mov [scanend], ebx + 00000088 8B 7A 40 mov edi, [edx + dsPrev] + + ;;; Jump into the main loop. 
+ + 0000008B 8B 14 24 mov edx, [chainlenwmask] + 0000008E EB 1A jmp short LoopEntry + + align 4 + + ;;; do { + ;;; match = s->window + cur_match; + ;;; if (*(ushf*)(match+best_len-1) != scan_end || + ;;; *(ushf*)match != scan_start) continue; + ;;; [...] + ;;; } while ((cur_match = prev[cur_match & wmask]) > limit + ;;; && --chain_length != 0); + ;;; + ;;; Here is the inner loop of the function. The function will spend the + ;;; majority of its time in this loop, and majority of that time will + ;;; be spent in the first ten instructions. + ;;; + ;;; Within this loop: + ;;; ebx = scanend + ;;; ecx = curmatch + ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) + ;;; esi = windowbestlen - i.e., (window + bestlen) + ;;; edi = prev + ;;; ebp = limit + + 00000090 LookupLoop: + 00000090 23 CA and ecx, edx + 00000092 0F B7 0C 4F movzx ecx, word ptr [edi + ecx*2] + 00000096 3B CD cmp ecx, ebp + 00000098 0F 86 000000E0 jbe LeaveNow + 0000009E 81 EA 00010000 sub edx, 00010000h + 000000A4 0F 88 000000D4 js LeaveNow + 000000AA 0F B7 44 0E FF LoopEntry: movzx eax, word ptr [esi + ecx - 1] + 000000AF 3B C3 cmp eax, ebx + 000000B1 75 DD jnz LookupLoop + 000000B3 8B 44 24 04 mov eax, [window] + 000000B7 0F B7 04 08 movzx eax, word ptr [eax + ecx] + 000000BB 3B 44 24 10 cmp eax, [scanstart] + 000000BF 75 CF jnz LookupLoop + + ;;; Store the current value of chainlen. + + 000000C1 89 14 24 mov [chainlenwmask], edx + + ;;; Point edi to the string under scrutiny, and esi to the string we + ;;; are hoping to match it up with. In actuality, esi and edi are + ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is + ;;; initialized to -(MAX_MATCH_8 - scanalign). 
+ + 000000C4 8B 74 24 04 mov esi, [window] + 000000C8 8B 7C 24 20 mov edi, [scan] + 000000CC 03 F1 add esi, ecx + 000000CE 8B 44 24 14 mov eax, [scanalign] + 000000D2 BA FFFFFEF8 mov edx, 0fffffef8h; -(MAX_MATCH_8) + 000000D7 8D BC 07 lea edi, [edi + eax + 0108h] ;MAX_MATCH_8] + 00000108 + 000000DE 8D B4 06 lea esi, [esi + eax + 0108h] ;MAX_MATCH_8] + 00000108 + + ;;; Test the strings for equality, 8 bytes at a time. At the end, + ;;; adjust edx so that it is offset to the exact byte that mismatched. + ;;; + ;;; We already know at this point that the first three bytes of the + ;;; strings match each other, and they can be safely passed over before + ;;; starting the compare loop. So what this code does is skip over 0-3 + ;;; bytes, as much as necessary in order to dword-align the edi + ;;; pointer. (esi will still be misaligned three times out of four.) + ;;; + ;;; It should be confessed that this loop usually does not represent + ;;; much of the total running time. Replacing it with a more + ;;; straightforward "rep cmpsb" would not drastically degrade + ;;; performance. + + 000000E5 LoopCmps: + 000000E5 8B 04 16 mov eax, [esi + edx] + 000000E8 33 04 17 xor eax, [edi + edx] + 000000EB 75 14 jnz LeaveLoopCmps + 000000ED 8B 44 16 04 mov eax, [esi + edx + 4] + 000000F1 33 44 17 04 xor eax, [edi + edx + 4] + 000000F5 75 07 jnz LeaveLoopCmps4 + 000000F7 83 C2 08 add edx, 8 + 000000FA 75 E9 jnz LoopCmps + 000000FC EB 71 jmp short LenMaximum + 000000FE 83 C2 04 LeaveLoopCmps4: add edx, 4 + 00000101 A9 0000FFFF LeaveLoopCmps: test eax, 0000FFFFh + 00000106 75 06 jnz LenLower + 00000108 83 C2 02 add edx, 2 + 0000010B C1 E8 10 shr eax, 16 + 0000010E 2C 01 LenLower: sub al, 1 + 00000110 83 D2 00 adc edx, 0 + + ;;; Calculate the length of the match. If it is longer than MAX_MATCH, + ;;; then automatically accept it as the best possible match and leave. 
+ + 00000113 8D 04 17 lea eax, [edi + edx] + 00000116 8B 7C 24 20 mov edi, [scan] + 0000011A 2B C7 sub eax, edi + 0000011C 3D 00000102 cmp eax, MAX_MATCH + 00000121 7D 4C jge LenMaximum + + ;;; If the length of the match is not longer than the best match we + ;;; have so far, then forget it and return to the lookup loop. + + 00000123 8B 54 24 38 mov edx, [deflatestate] + 00000127 8B 5C 24 1C mov ebx, [bestlen] + 0000012B 3B C3 cmp eax, ebx + 0000012D 7F 13 jg LongerMatch + 0000012F 8B 74 24 08 mov esi, [windowbestlen] + 00000133 8B 7A 40 mov edi, [edx + dsPrev] + 00000136 8B 5C 24 0C mov ebx, [scanend] + 0000013A 8B 14 24 mov edx, [chainlenwmask] + 0000013D E9 FFFFFF4E jmp LookupLoop + + ;;; s->match_start = cur_match; + ;;; best_len = len; + ;;; if (len >= nice_match) break; + ;;; scan_end = *(ushf*)(scan+best_len-1); + + 00000142 8B 5C 24 18 LongerMatch: mov ebx, [nicematch] + 00000146 89 44 24 1C mov [bestlen], eax + 0000014A 89 4A 70 mov [edx + dsMatchStart], ecx + 0000014D 3B C3 cmp eax, ebx + 0000014F 7D 2D jge LeaveNow + 00000151 8B 74 24 04 mov esi, [window] + 00000155 03 F0 add esi, eax + 00000157 89 74 24 08 mov [windowbestlen], esi + 0000015B 0F B7 5C 07 FF movzx ebx, word ptr [edi + eax - 1] + 00000160 8B 7A 40 mov edi, [edx + dsPrev] + 00000163 89 5C 24 0C mov [scanend], ebx + 00000167 8B 14 24 mov edx, [chainlenwmask] + 0000016A E9 FFFFFF21 jmp LookupLoop + + ;;; Accept the current string, with the maximum possible length. 
+ + 0000016F 8B 54 24 38 LenMaximum: mov edx, [deflatestate] + 00000173 C7 44 24 1C mov dword ptr [bestlen], MAX_MATCH + 00000102 + 0000017B 89 4A 70 mov [edx + dsMatchStart], ecx + + ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; + ;;; return s->lookahead; + + 0000017E LeaveNow: + 0000017E 8B 54 24 38 mov edx, [deflatestate] + 00000182 8B 5C 24 1C mov ebx, [bestlen] + 00000186 8B 42 74 mov eax, [edx + dsLookahead] + 00000189 3B D8 cmp ebx, eax + 0000018B 7F 02 jg LookaheadRet + 0000018D 8B C3 mov eax, ebx + 0000018F LookaheadRet: + + ;;; Restore the stack and return from whence we came. + + 0000018F 83 C4 24 add esp, LocalVarsSize + 00000192 5B pop ebx + 00000193 5E pop esi + 00000194 5F pop edi + 00000195 5D pop ebp + + 00000196 C3 ret + ; please don't remove this string ! + ; Your can freely use match686 in any free or commercial app if you don't remove the string in the binary! + 00000197 0D 0A 61 73 6D db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah + 36 38 36 20 77 + 69 74 68 20 6D + 61 73 6D 2C 20 + 6F 70 74 69 6D + 69 73 65 64 20 + 61 73 73 65 6D + 62 6C 79 20 63 + 6F 64 65 20 66 + 72 6F 6D 20 42 + 72 69 61 6E 20 + 52 61 69 74 65 + 72 2C 20 77 72 + 69 74 74 65 6E + 20 31 39 39 38 + 0D 0A + + + IFDEF NOUNDERLINE + ELSE + 000001E4 _longest_match endp + ENDIF + + IFDEF NOUNDERLINE + ELSE + 000001E4 _match_init proc near + 000001E4 C3 ret + 000001E5 _match_init endp + ENDIF + + + 000001E5 _TEXT ends + end + Microsoft (R) Macro Assembler Version 14.16.27031.1 09/14/19 11:35:23 +match686.asm Symbols 2 - 1 + + + + +Segments and Groups: + + N a m e Size Length Align Combine Class + +FLAT . . . . . . . . . . . . . . GROUP +_DATA . . . . . . . . . . . . . 32 Bit 00000000 Para Public 'DATA' +_TEXT . . . . . . . . . . . . . 32 Bit 000001E5 Para Public 'CODE' + + +Procedures, parameters, and locals: + + N a m e Type Value Attr + +_longest_match . . . . . . . . . 
P Near 00000000 _TEXT Length= 000001E4 Public + LastMatchGood . . . . . . . . L Near 00000025 _TEXT + LookaheadLess . . . . . . . . L Near 0000003D _TEXT + LimitPositive . . . . . . . . L Near 0000006B _TEXT + LookupLoop . . . . . . . . . . L Near 00000090 _TEXT + LoopEntry . . . . . . . . . . L Near 000000AA _TEXT + LoopCmps . . . . . . . . . . . L Near 000000E5 _TEXT + LeaveLoopCmps4 . . . . . . . . L Near 000000FE _TEXT + LeaveLoopCmps . . . . . . . . L Near 00000101 _TEXT + LenLower . . . . . . . . . . . L Near 0000010E _TEXT + LongerMatch . . . . . . . . . L Near 00000142 _TEXT + LenMaximum . . . . . . . . . . L Near 0000016F _TEXT + LeaveNow . . . . . . . . . . . L Near 0000017E _TEXT + LookaheadRet . . . . . . . . . L Near 0000018F _TEXT +_match_init . . . . . . . . . . P Near 000001E4 _TEXT Length= 00000001 Public + + +Symbols: + + N a m e Type Value Attr + +@CodeSize . . . . . . . . . . . Number 00000000h +@DataSize . . . . . . . . . . . Number 00000000h +@Interface . . . . . . . . . . . Number 00000000h +@Model . . . . . . . . . . . . . Number 00000007h +@code . . . . . . . . . . . . . Text _TEXT +@data . . . . . . . . . . . . . Text FLAT +@fardata? . . . . . . . . . . . Text FLAT +@fardata . . . . . . . . . . . . Text FLAT +@stack . . . . . . . . . . . . . Text FLAT +LocalVarsSize . . . . . . . . . Number 00000024h +MAX_MATCH_8_ . . . . . . . . . . Number 00000100h +MAX_MATCH . . . . . . . . . . . Number 00000102h +MIN_LOOKAHEAD . . . . . . . . . Number 00000106h +MIN_MATCH . . . . . . . . . . . Number 00000003h +NbStackAdd . . . . . . . . . . . Number 00000034h +NbStack . . . . . . . . . . . . Number 0000004Ch +adrret . . . . . . . . . . . . . Text dword ptr[esp+NbStack-8] +best_len . . . . . . . . . . . . Text dword ptr [esp+NbStack-36] +bestlen . . . . . . . . . . . . Text esp + 28 +chain_length . . . . . . . . . . Text dword ptr [esp+NbStack-28] +chainlenwmask . . . . . . . . . Text esp + 0 +cur_match . . . . . . . . . . . 
Text dword ptr[esp+NbStack-0] +curmatch . . . . . . . . . . . . Text esp + 60 +deflatestate . . . . . . . . . . Text esp + 56 +dep_chain_length . . . . . . . . Number 0000007Ch +dep_good_match . . . . . . . . . Number 0000008Ch +dep_lookahead . . . . . . . . . Number 00000074h +dep_match_start . . . . . . . . Number 00000070h +dep_nice_match . . . . . . . . . Number 00000090h +dep_prev_length . . . . . . . . Number 00000078h +dep_prev . . . . . . . . . . . . Number 00000040h +dep_strstart . . . . . . . . . . Number 0000006Ch +dep_w_mask . . . . . . . . . . . Number 00000034h +dep_w_size . . . . . . . . . . . Number 0000002Ch +dep_window . . . . . . . . . . . Number 00000038h +dsGoodMatch . . . . . . . . . . Number 0000008Ch +dsLookahead . . . . . . . . . . Number 00000074h +dsMatchLen . . . . . . . . . . . Number 00000060h +dsMatchStart . . . . . . . . . . Number 00000070h +dsMaxChainLen . . . . . . . . . Number 0000007Ch +dsNiceMatch . . . . . . . . . . Number 00000090h +dsPrevLen . . . . . . . . . . . Number 00000078h +dsPrevMatch . . . . . . . . . . Number 00000064h +dsPrev . . . . . . . . . . . . . Number 00000040h +dsStrStart . . . . . . . . . . . Number 0000006Ch +dsWMask . . . . . . . . . . . . Number 00000034h +dsWSize . . . . . . . . . . . . Number 0000002Ch +dsWindow . . . . . . . . . . . . Number 00000038h +limit . . . . . . . . . . . . . Text dword ptr [esp+NbStack-32] +match_start_ptr . . . . . . . . Text dword ptr [esp+NbStack-56] +match_start . . . . . . . . . . Text dword ptr [esp+NbStack-72] +nice_match . . . . . . . . . . . Text dword ptr [esp+NbStack-60] +nicematch . . . . . . . . . . . Text esp + 24 +prev . . . . . . . . . . . . . . Text dword ptr [esp+NbStack-44] +pushebp . . . . . . . . . . . . Text dword ptr[esp+NbStack-12] +pushebx . . . . . . . . . . . . Text dword ptr[esp+NbStack-24] +pushedi . . . . . . . . . . . . Text dword ptr[esp+NbStack-16] +pushesi . . . . . . . . . . . . Text dword ptr[esp+NbStack-20] +scan_start . . . . . . . . . 
. . Text word ptr [esp+NbStack-48] +scanalign . . . . . . . . . . . Text esp + 20 +scanend . . . . . . . . . . . . Text esp + 12 +scanstart . . . . . . . . . . . Text esp + 16 +scan . . . . . . . . . . . . . . Text esp + 32 +str_s . . . . . . . . . . . . . Text dword ptr[esp+NbStack-4] +strend . . . . . . . . . . . . . Text dword ptr [esp+NbStack-76] +windowbestlen . . . . . . . . . Text esp + 8 +windowlen . . . . . . . . . . . Text dword ptr [esp+NbStack-68] +window . . . . . . . . . . . . . Text esp + 4 +wmask . . . . . . . . . . . . . Text dword ptr [esp+NbStack-52] +zlib1222add . . . . . . . . . . Number 00000008h + + 0 Warnings + 0 Errors diff --git a/zlib/contrib/masmx86/readme.txt b/zlib/contrib/masmx86/readme.txt new file mode 100644 index 0000000..3f88886 --- /dev/null +++ b/zlib/contrib/masmx86/readme.txt @@ -0,0 +1,27 @@ + +Summary +------- +This directory contains ASM implementations of the functions +longest_match() and inflate_fast(). + + +Use instructions +---------------- +Assemble using MASM, and copy the object files into the zlib source +directory, then run the appropriate makefile, as suggested below. You can +download MASM from here: + + http://www.microsoft.com/downloads/details.aspx?displaylang=en&FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64 + +You can also get object files here: + + http://www.winimage.com/zLibDll/zlib124_masm_obj.zip + +Build instructions +------------------ +* With Microsoft C and MASM: +nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj" + +* With Borland C and TASM: +make -f win32/Makefile.bor LOCAL_ZLIB="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj" OBJPA="+match686c.obj+match686.obj+inffas32.obj" + -- cgit v1.1