Mercurial > hg > CbC > CbC_gcc
comparison zlib/contrib/masmx86/gvmat32.asm @ 51:ae3a4bfb450b
add some files of version 4.4.3 that have been forgotten.
author | kent <kent@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Sun, 07 Feb 2010 18:27:48 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
47:3bfb6c00c1e0 | 51:ae3a4bfb450b |
---|---|
1 ; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86 | |
2 ; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant. | |
3 ; File written by Gilles Vollant, by modifiying the longest_match | |
4 ; from Jean-loup Gailly in deflate.c | |
5 ; | |
6 ; http://www.zlib.net | |
7 ; http://www.winimage.com/zLibDll | |
8 ; http://www.muppetlabs.com/~breadbox/software/assembly.html | |
9 ; | |
10 ; For Visual C++ 4.x and higher and ML 6.x and higher | |
11 ; ml.exe is in directory \MASM611C of Win95 DDK | |
12 ; ml.exe is also distributed in http://www.masm32.com/masmdl.htm | |
13 ; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/ | |
14 ; | |
15 ; this file contains two implementations of longest_match | |
16 ; | |
17 ; longest_match_7fff : written 1996 by Gilles Vollant optimized for | |
18 ; first Pentium. Assume s->w_mask == 0x7fff | |
19 ; longest_match_686 : written by Brian raiter (1998), optimized for Pentium Pro | |
20 ; | |
21 ; to use an assembly version of longest_match, you need to define ASMV in your project | |
22 ; There are two ways of using gvmat32.asm | |
23 ; | |
24 ; A) Suggested method | |
25 ; if you want to include both longest_match_7fff and longest_match_686 | |
26 ; compile the asm file running | |
27 ; ml /coff /Zi /Flgvmat32.lst /c gvmat32.asm | |
28 ; and include gvmat32c.c in your project | |
29 ; if you have an old cpu (386,486 or first Pentium) and s->w_mask==0x7fff, | |
30 ; longest_match_7fff will be used | |
31 ; if you have a more modern CPU (Pentium Pro, II and higher) | |
32 ; longest_match_686 will be used | |
33 ; on old cpu with s->w_mask!=0x7fff, longest_match_686 will be used, | |
34 ; but this is not a situation you'll find often | |
35 ; | |
36 ; B) Alternative | |
37 ; if you are not interested in old cpu performance and want the smallest | |
38 ; possible binaries | |
39 ; | |
40 ; compile the asm file running | |
41 ; ml /coff /Zi /c /Flgvmat32.lst /DNOOLDPENTIUMCODE gvmat32.asm | |
42 ; and do not include gvmat32c.c in your project (or also define | |
43 ; NOOLDPENTIUMCODE) | |
44 ; | |
45 ; note : as far as I know, longest_match_686 is much faster than longest_match_7fff | |
46 ; on Pentium Pro/II/III, faster (but less so) on P4, but it seems | |
47 ; longest_match_7fff can be faster (very very little) on AMD Athlon64/K8 | |
48 ; | |
49 ; see below : zlib1222add must be adjusted if you use a zlib version < 1.2.2.2 | |
50 | |
51 ;uInt longest_match_7fff(s, cur_match) | |
52 ; deflate_state *s; | |
53 ; IPos cur_match; /* current match */ | |
54 | |
; Stack frame of longest_match_7fff.
; Every equate below is an esp-relative operand that is only valid AFTER the
; prologue has pushed ebp, edi, esi, ebx and executed "sub esp,NbStackAdd".
; With that frame in place [esp+NbStack] is the second argument (cur_match)
; and [esp+0] is the last local (strend).
55 NbStack equ 76 | |
56 cur_match equ dword ptr[esp+NbStack-0] | |
57 str_s equ dword ptr[esp+NbStack-4] | |
58 ; 5 dwords on top, in the order they sit on the stack: ret,ebp,edi,esi,ebx | |
59 adrret equ dword ptr[esp+NbStack-8] | |
60 pushebp equ dword ptr[esp+NbStack-12] | |
61 pushedi equ dword ptr[esp+NbStack-16] | |
62 pushesi equ dword ptr[esp+NbStack-20] | |
63 pushebx equ dword ptr[esp+NbStack-24] | |
64 | |
; Locals: 13 dword slots, reserved by "sub esp,NbStackAdd".
; scan_start is accessed as a 16-bit word; all other slots are dwords.
; wmask and match_start_ptr are not referenced by the code in this file
; (the 7fff routine hard-codes the mask as 7fffh).
65 chain_length equ dword ptr [esp+NbStack-28] | |
66 limit equ dword ptr [esp+NbStack-32] | |
67 best_len equ dword ptr [esp+NbStack-36] | |
68 window equ dword ptr [esp+NbStack-40] | |
69 prev equ dword ptr [esp+NbStack-44] | |
70 scan_start equ word ptr [esp+NbStack-48] | |
71 wmask equ dword ptr [esp+NbStack-52] | |
72 match_start_ptr equ dword ptr [esp+NbStack-56] | |
73 nice_match equ dword ptr [esp+NbStack-60] | |
74 scan equ dword ptr [esp+NbStack-64] | |
75 | |
76 windowlen equ dword ptr [esp+NbStack-68] | |
77 match_start equ dword ptr [esp+NbStack-72] | |
78 strend equ dword ptr [esp+NbStack-76] | |
; NbStackAdd = bytes of locals only (NbStack minus the 24 bytes taken by the
; two arguments' top slot, the return address and the four saved registers).
79 NbStackAdd equ (NbStack-24) | |
80 | |
; Assembler setup: 386 instruction set (privileged forms enabled), flat model.
81 .386p | |
82 | |
83 name gvmatch | |
84 .MODEL FLAT | |
85 | |
86 | |
87 | |
88 ; all the +zlib1222add offsets are due to the addition of fields | |
89 ; in zlib in the deflate_state structure since the asm code was first written | |
90 ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). | |
91 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). | |
92 ; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8"). | |
93 | |
94 zlib1222add equ 8 | |
95 | |
96 ; Note : these values are correct for a structure packed on 8-byte boundaries | |
; Byte offsets of deflate_state fields, used by longest_match_7fff.
; They are the same numeric offsets as the decimal ds* constants that
; longest_match_686 uses further down (e.g. 74h = 116 = dsMaxChainLen).
; dep_chain_length is the offset the code loads as s->max_chain_length.
97 dep_chain_length equ 74h+zlib1222add | |
98 dep_window equ 30h+zlib1222add | |
99 dep_strstart equ 64h+zlib1222add | |
100 dep_prev_length equ 70h+zlib1222add | |
101 dep_nice_match equ 88h+zlib1222add | |
102 dep_w_size equ 24h+zlib1222add | |
103 dep_prev equ 38h+zlib1222add | |
104 dep_w_mask equ 2ch+zlib1222add | |
105 dep_good_match equ 84h+zlib1222add | |
106 dep_match_start equ 68h+zlib1222add | |
107 dep_lookahead equ 6ch+zlib1222add | |
108 | |
109 | |
110 _TEXT segment | |
111 | |
; Exported symbols.
;   NOUNDERLINE      : export names without the leading underscore.
;   NOOLDPENTIUMCODE : export only longest_match and match_init (the 686
;                      routine under the generic name); otherwise export
;                      longest_match_7fff, cpudetect32 and longest_match_686.
112 IFDEF NOUNDERLINE | |
113 IFDEF NOOLDPENTIUMCODE | |
114 public longest_match | |
115 public match_init | |
116 ELSE | |
117 public longest_match_7fff | |
118 public cpudetect32 | |
119 public longest_match_686 | |
120 ENDIF | |
121 ELSE | |
122 IFDEF NOOLDPENTIUMCODE | |
123 public _longest_match | |
124 public _match_init | |
125 ELSE | |
126 public _longest_match_7fff | |
127 public _cpudetect32 | |
128 public _longest_match_686 | |
129 ENDIF | |
130 ENDIF | |
131 | |
; Match length limits; MIN_LOOKAHEAD is subtracted from w_size to get MAX_DIST.
132 MAX_MATCH equ 258 | |
133 MIN_MATCH equ 3 | |
134 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) | |
135 | |
136 | |
137 | |
;-----------------------------------------------------------------------
; uInt longest_match_7fff(deflate_state *s, IPos cur_match)
; Both arguments are read from the stack: s from [esp+4] on entry,
; cur_match through the cur_match equate once the frame is built.
; Saves/restores ebp, edi, esi, ebx; uses eax, ecx, edx as scratch.
; The window mask is hard-coded as 7fffh inside the search loop.
; This first part builds the frame and loads the loop registers:
;   eax = cur_match, ecx = limit, edx = window, esi = prev,
;   edi = window + best_len - 1, bx = scan_end, bp = scan_start.
;-----------------------------------------------------------------------
138 IFNDEF NOOLDPENTIUMCODE | |
139 IFDEF NOUNDERLINE | |
140 longest_match_7fff proc near | |
141 ELSE | |
142 _longest_match_7fff proc near | |
143 ENDIF | |
144 | |
; edx = s (first argument), fetched before the pushes move esp
145 mov edx,[esp+4] | |
146 | |
147 | |
148 | |
149 push ebp | |
150 push edi | |
151 push esi | |
152 push ebx | |
153 | |
154 sub esp,NbStackAdd | |
155 | |
156 ; initialize or check the variables used in match.asm. | |
157 mov ebp,edx | |
158 | |
159 ; chain_length = s->max_chain_length | |
160 ; if (prev_length>=good_match) chain_length >>= 2 | |
161 mov edx,[ebp+dep_chain_length] | |
162 mov ebx,[ebp+dep_prev_length] | |
163 cmp [ebp+dep_good_match],ebx | |
164 ja noshr | |
165 shr edx,2 | |
166 noshr: | |
167 ; we increment chain_length because in the asm, the --chain_length is done at the beginning of the loop | |
168 inc edx | |
169 mov edi,[ebp+dep_nice_match] | |
170 mov chain_length,edx | |
171 mov eax,[ebp+dep_lookahead] | |
172 cmp eax,edi | |
173 ; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; | |
174 jae nolookaheadnicematch | |
175 mov edi,eax | |
176 nolookaheadnicematch: | |
177 ; best_len = s->prev_length | |
178 mov best_len,ebx | |
179 | |
180 ; window = s->window | |
181 mov esi,[ebp+dep_window] | |
182 mov ecx,[ebp+dep_strstart] | |
183 mov window,esi | |
184 | |
185 mov nice_match,edi | |
186 ; scan = window + strstart | |
187 add esi,ecx | |
188 mov scan,esi | |
189 ; dx = *(ushf*)scan  (esi holds scan here) | |
190 mov dx,word ptr [esi] | |
191 ; bx = *(ushf*)(scan+best_len-1)  (ebx still holds best_len as the index) | |
192 mov bx,word ptr [esi+ebx-1] | |
193 add esi,MAX_MATCH-1 | |
194 ; scan_start = *scan | |
195 mov scan_start,dx | |
196 ; strend = scan + MAX_MATCH-1 | |
197 mov strend,esi | |
198 ; bx = scan_end = *(ushf*)(scan+best_len-1) | |
199 | |
200 ; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? | |
201 ; s->strstart - (IPos)MAX_DIST(s) : NIL; | |
202 | |
203 mov esi,[ebp+dep_w_size] | |
204 sub esi,MIN_LOOKAHEAD | |
205 ; here esi = MAX_DIST(s) | |
206 sub ecx,esi | |
207 ja nodist | |
208 xor ecx,ecx | |
209 nodist: | |
210 mov limit,ecx | |
211 | |
212 ; prev = s->prev | |
213 mov edx,[ebp+dep_prev] | |
214 mov prev,edx | |
215 | |
216 ; match_start = s->match_start; then bp = scan_start and eax = cur_match | |
; Loading bp overwrites the low half of ebp, so s is no longer available
; in a register; the exit path reloads it from str_s.
217 mov edx,dword ptr [ebp+dep_match_start] | |
218 mov bp,scan_start | |
219 mov eax,cur_match | |
220 mov match_start,edx | |
221 | |
222 mov edx,window | |
223 mov edi,edx | |
224 add edi,best_len | |
225 mov esi,prev | |
226 dec edi | |
227 ; windowlen = window + best_len -1 | |
228 mov windowlen,edi | |
229 | |
230 jmp beginloop2 | |
231 align 4 | |
232 | |
233 ; here, in the loop | |
234 ; eax = ax = cur_match | |
235 ; ecx = limit | |
236 ; bx = scan_end | |
237 ; bp = scan_start | |
238 ; edi = windowlen (window + best_len -1) | |
239 ; esi = prev | |
; edx = window (used as the base of the scan_start comparisons)
240 | |
241 | |
242 ;// here; chain_length <=16 | |
; Rolled (one candidate per iteration) loop. It is entered after
; beginloop2 / the unrolled loop has subtracted 16 from chain_length, so
; 16 is added back first; a result of zero means the chain budget is spent.
243 normalbeg0add16: | |
244 add chain_length,16 | |
245 jz exitloop | |
246 normalbeg0: | |
; quick reject: compare the word at match+best_len-1 with scan_end
247 cmp word ptr[edi+eax],bx | |
248 je normalbeg2noroll | |
249 rcontlabnoroll: | |
250 ; cur_match = prev[cur_match & wmask]   (wmask is the constant 7fffh) | |
251 and eax,7fffh | |
252 mov ax,word ptr[esi+eax*2] | |
253 ; if cur_match <= limit (unsigned), go to exitloop | |
254 cmp ecx,eax | |
255 jnb exitloop | |
256 ; if --chain_length != 0, go back to normalbeg0; otherwise exit | |
257 dec chain_length | |
258 jnz normalbeg0 | |
259 jmp exitloop | |
260 | |
261 normalbeg2noroll: | |
262 ; if (scan_start==*(cur_match+window)) goto normalbeg2 | |
263 cmp bp,word ptr[edx+eax] | |
264 jne rcontlabnoroll | |
265 jmp normalbeg2 | |
266 | |
; Re-entry point after a candidate has been fully compared: edi was used
; as a string pointer, so reload it before following the chain.
267 contloop3: | |
268 mov edi,windowlen | |
269 | |
270 ; cur_match = prev[cur_match & wmask]   (wmask is the constant 7fffh) | |
271 and eax,7fffh | |
272 mov ax,word ptr[esi+eax*2] | |
273 ; if cur_match <= limit (unsigned), go to exitloop | |
274 cmp ecx,eax | |
275 jnbexitloopshort1: | |
276 jnb exitloop | |
277 ; fall through: the chain_length decrement is part of the sub below | |
278 | |
279 | |
280 ; begin the main loop | |
281 beginloop2: | |
; subtract one block of 16 candidates plus the 1 for the current step
282 sub chain_length,16+1 | |
283 ; if chain_length <=16, don't use the unrolled loop | |
284 jna normalbeg0add16 | |
285 | |
; Unrolled search loop: tests 16 chain candidates per pass without touching
; chain_length in between. Register roles are the same as in the rolled loop
; (eax=cur_match, ecx=limit, bx=scan_end, edi=windowlen, esi=prev).
286 do16: | |
287 cmp word ptr[edi+eax],bx | |
288 je normalbeg2dc0 | |
289 | |
; maccn lab : one unrolled step.
;   cur_match = prev[cur_match & 7fffh]; exit if cur_match <= limit;
;   if the word at windowlen+cur_match equals scan_end, jump to lab.
; The argument is pasted after "je", so callers may pass "short <label>".
290 maccn MACRO lab | |
291 and eax,7fffh | |
292 mov ax,word ptr[esi+eax*2] | |
293 cmp ecx,eax | |
294 jnb exitloop | |
295 cmp word ptr[edi+eax],bx | |
296 je lab | |
297 ENDM | |
298 | |
; rcontloopN is also the place normalbeg2dcN returns to when the
; scan_start check fails for candidate N.
299 rcontloop0: | |
300 maccn normalbeg2dc1 | |
301 | |
302 rcontloop1: | |
303 maccn normalbeg2dc2 | |
304 | |
305 rcontloop2: | |
306 maccn normalbeg2dc3 | |
307 | |
308 rcontloop3: | |
309 maccn normalbeg2dc4 | |
310 | |
311 rcontloop4: | |
312 maccn normalbeg2dc5 | |
313 | |
314 rcontloop5: | |
315 maccn normalbeg2dc6 | |
316 | |
317 rcontloop6: | |
318 maccn normalbeg2dc7 | |
319 | |
320 rcontloop7: | |
321 maccn normalbeg2dc8 | |
322 | |
323 rcontloop8: | |
324 maccn normalbeg2dc9 | |
325 | |
326 rcontloop9: | |
327 maccn normalbeg2dc10 | |
328 | |
329 rcontloop10: | |
330 maccn short normalbeg2dc11 | |
331 | |
332 rcontloop11: | |
333 maccn short normalbeg2dc12 | |
334 | |
335 rcontloop12: | |
336 maccn short normalbeg2dc13 | |
337 | |
338 rcontloop13: | |
339 maccn short normalbeg2dc14 | |
340 | |
341 rcontloop14: | |
342 maccn short normalbeg2dc15 | |
343 | |
; Last step of the block: only follow the chain; the scan_end test for the
; new candidate is done at do16 (or normalbeg0) on the next pass.
344 rcontloop15: | |
345 and eax,7fffh | |
346 mov ax,word ptr[esi+eax*2] | |
347 cmp ecx,eax | |
348 jnb exitloop | |
349 | |
; account for the 16 candidates just tested; stay unrolled while more
; than 16 remain, otherwise finish in the rolled loop
350 sub chain_length,16 | |
351 ja do16 | |
352 jmp normalbeg0add16 | |
353 | |
354 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
355 | |
; normbeg rcontlab,valsub : landing stub for candidate number valsub of the
; unrolled block. If the first word of the candidate differs from scan_start
; it resumes the unrolled loop at rcontlab; otherwise it adds 16-valsub to
; chain_length (the unrolled block had pre-subtracted 16) and goes to iseq
; for the full string comparison.
356 normbeg MACRO rcontlab,valsub | |
357 ; if we are here, we know that *(match+best_len-1) == scan_end | |
358 cmp bp,word ptr[edx+eax] | |
359 ; if (*(ushf*)match != scan_start) goto rcontlab | |
360 jne rcontlab | |
361 ; restore the real chain_length, then compare the scan and match strings | |
362 add chain_length,16-valsub | |
363 jmp iseq | |
364 ENDM | |
365 | |
366 | |
; The stubs for candidates 11..15 are emitted first; those for 12..15 pass a
; "short" override as part of the first macro argument.
367 normalbeg2dc11: | |
368 normbeg rcontloop11,11 | |
369 | |
370 normalbeg2dc12: | |
371 normbeg short rcontloop12,12 | |
372 | |
373 normalbeg2dc13: | |
374 normbeg short rcontloop13,13 | |
375 | |
376 normalbeg2dc14: | |
377 normbeg short rcontloop14,14 | |
378 | |
379 normalbeg2dc15: | |
380 normbeg short rcontloop15,15 | |
381 | |
382 normalbeg2dc10: | |
383 normbeg rcontloop10,10 | |
384 | |
385 normalbeg2dc9: | |
386 normbeg rcontloop9,9 | |
387 | |
388 normalbeg2dc8: | |
389 normbeg rcontloop8,8 | |
390 | |
391 normalbeg2dc7: | |
392 normbeg rcontloop7,7 | |
393 | |
394 normalbeg2dc6: | |
395 normbeg rcontloop6,6 | |
396 | |
397 normalbeg2dc5: | |
398 normbeg rcontloop5,5 | |
399 | |
400 normalbeg2dc4: | |
401 normbeg rcontloop4,4 | |
402 | |
403 normalbeg2dc3: | |
404 normbeg rcontloop3,3 | |
405 | |
406 normalbeg2dc2: | |
407 normbeg rcontloop2,2 | |
408 | |
409 normalbeg2dc1: | |
410 normbeg rcontloop1,1 | |
411 | |
412 normalbeg2dc0: | |
413 normbeg rcontloop0,0 | |
414 | |
415 | |
416 ; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end | |
417 | |
418 normalbeg2: | |
419 mov edi,window | |
420 | |
421 cmp bp,word ptr[edi+eax] | |
422 jne contloop3 ; if *(ushf*)match != scan_start, continue | |
423 | |
; Full comparison of scan against the candidate at window+cur_match.
; On entry edx = window and eax = cur_match. esi, edi, edx (and ecx on the
; long path) are clobbered here and reloaded before returning to contloop3.
; Result: esi = number of leading bytes that agree (len).
424 iseq: | |
425 ; if we are here, we know that *(ushf*)(match+best_len-1) == scan_end | |
426 ; and *(ushf*)match == scan_start | |
427 | |
428 mov edi,edx | |
429 mov esi,scan ; esi = scan | |
430 add edi,eax ; edi = window + cur_match = match | |
431 | |
432 mov edx,[esi+3] ; compare manually dword at match+3 | |
433 xor edx,[edi+3] ; and scan +3 | |
434 | |
435 jz begincompare ; if equal, go to long compare | |
436 | |
437 ; find the first mismatching byte of that dword and set len (in esi) | |
; edx holds the xor of bytes 3..6: the lowest non-zero byte gives len 3..6
438 or dl,dl | |
439 je eq1rr | |
440 mov esi,3 | |
441 jmp trfinval | |
442 eq1rr: | |
443 or dx,dx | |
444 je eq1 | |
445 | |
446 mov esi,4 | |
447 jmp trfinval | |
448 eq1: | |
449 and edx,0ffffffh | |
450 jz eq11 | |
451 mov esi,5 | |
452 jmp trfinval | |
453 eq11: | |
454 mov esi,6 | |
455 jmp trfinval | |
456 | |
457 begincompare: | |
458 ; here we know that bytes 0-1 and 3-6 of scan and match are equal | |
; NOTE(review): byte 2 is never compared in this routine - presumably it is
; guaranteed equal by the caller's hash lookup; confirm against deflate.c.
; The dword compare restarts at offset 6 and covers offsets 6..257.
459 add edi,6 | |
460 add esi,6 | |
461 mov ecx,(MAX_MATCH-(2+4))/4 ; scan for at most MAX_MATCH bytes | |
462 repe cmpsd ; loop until mismatch | |
463 | |
464 je trfin ; go to trfin if not unmatch | |
465 ; a dword differed: step back to it and find the first mismatching byte | |
466 sub esi,4 | |
467 mov edx,[edi-4] | |
468 xor edx,[esi] | |
469 | |
; advance esi by one for every leading byte of the dword that still matches
470 or dl,dl | |
471 jnz trfin | |
472 inc esi | |
473 | |
474 or dx,dx | |
475 jnz trfin | |
476 inc esi | |
477 | |
478 and edx,0ffffffh | |
479 jnz trfin | |
480 inc esi | |
481 | |
482 trfin: | |
483 sub esi,scan ; esi = len | |
484 trfinval: | |
485 ; here we have finished the compare, and esi contains the length of the equal string | |
486 cmp esi,best_len ; if len > best_len, go newbestlen | |
487 ja short newbestlen | |
488 ; now we restore edx, ecx and esi, for the big loop | |
489 mov esi,prev | |
490 mov ecx,limit | |
491 mov edx,window | |
492 jmp contloop3 | |
493 | |
494 newbestlen: | |
495 mov best_len,esi ; len become best_len | |
496 | |
497 mov match_start,eax ; save new position as match_start | |
498 cmp esi,nice_match ; if best_len >= nice_match, exit | |
499 jae exitloop | |
500 mov ecx,scan | |
501 mov edx,window ; restore edx=window | |
502 add ecx,esi | |
503 add esi,edx | |
504 | |
505 dec esi | |
506 mov windowlen,esi ; windowlen = window + best_len-1 | |
507 mov bx,[ecx-1] ; bx = *(scan+best_len-1) = scan_end | |
508 | |
509 ; now we restore ecx and esi, for the big loop : | |
510 mov esi,prev | |
511 mov ecx,limit | |
512 jmp contloop3 | |
513 | |
; Common exit of longest_match_7fff: publish match_start into the state
; structure and return min(best_len, s->lookahead) in eax.
514 exitloop: | |
515 ; exit : s->match_start=match_start | |
516 mov ebx,match_start | |
; reload s from its stack slot (ebp's low half was reused for scan_start)
517 mov ebp,str_s | |
518 mov ecx,best_len | |
519 mov dword ptr [ebp+dep_match_start],ebx | |
520 mov eax,dword ptr [ebp+dep_lookahead] | |
521 cmp ecx,eax | |
522 ja minexlo | |
523 mov eax,ecx | |
524 minexlo: | |
525 ; return min(best_len,s->lookahead) | |
526 | |
527 ; restore stack and register ebx,esi,edi,ebp | |
528 add esp,NbStackAdd | |
529 | |
530 pop ebx | |
531 pop esi | |
532 pop edi | |
533 pop ebp | |
534 ret | |
; Identification string stored in the code segment right after the ret;
; it is data and is never executed.
535 InfoAuthor: | |
536 ; please don't remove this string ! | |
537 ; You are free to use gvmat32 in any free or commercial app if you don't remove the string in the binary! | |
538 db 0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah | |
539 | |
540 | |
541 | |
542 IFDEF NOUNDERLINE | |
543 longest_match_7fff endp | |
544 ELSE | |
545 _longest_match_7fff endp | |
546 ENDIF | |
547 | |
548 | |
;-----------------------------------------------------------------------
; cpudetect32 : classify the processor. Takes no arguments.
; Out (eax):
;   0300h  if the AC bit (40000h) of EFLAGS cannot be toggled (80386)
;   0400h  if AC toggles but the ID bit (200000h) cannot (old 486)
;   otherwise whatever CPUID leaves in eax when called with eax = 1
; ebx is saved and restored around the routine (CPUID overwrites it);
; ecx is used as scratch, and CPUID also writes ecx and edx.
; EFLAGS is put back to its original value on the paths that changed it.
;-----------------------------------------------------------------------
549 IFDEF NOUNDERLINE | |
550 cpudetect32 proc near | |
551 ELSE | |
552 _cpudetect32 proc near | |
553 ENDIF | |
554 | |
555 push ebx | |
556 | |
557 pushfd ; push original EFLAGS | |
558 pop eax ; get original EFLAGS | |
559 mov ecx, eax ; save original EFLAGS | |
560 xor eax, 40000h ; flip AC bit in EFLAGS | |
561 push eax ; save new EFLAGS value on stack | |
562 popfd ; replace current EFLAGS value | |
563 pushfd ; get new EFLAGS | |
564 pop eax ; store new EFLAGS in EAX | |
565 xor eax, ecx ; can't toggle AC bit, processor=80386 | |
566 jz end_cpu_is_386 ; jump if 80386 processor | |
567 push ecx | |
568 popfd ; restore AC bit in EFLAGS first | |
569 | |
; Two copies of EFLAGS are pushed: one is popped into ecx right away, the
; other stays on the stack until the "popfd ; restore original EFLAGS" below.
570 pushfd | |
571 pushfd | |
572 pop ecx | |
573 | |
574 mov eax, ecx ; get original EFLAGS | |
575 xor eax, 200000h ; flip ID bit in EFLAGS | |
576 push eax ; save new EFLAGS value on stack | |
577 popfd ; replace current EFLAGS value | |
578 pushfd ; get new EFLAGS | |
579 pop eax ; store new EFLAGS in EAX | |
580 popfd ; restore original EFLAGS | |
581 xor eax, ecx ; can't toggle ID bit, | |
582 je is_old_486 ; processor=old | |
583 | |
; CPUID is emitted as raw bytes because the .386p instruction set selected
; for this file has no mnemonic for it.
584 mov eax,1 | |
585 db 0fh,0a2h ;CPUID | |
586 | |
587 exitcpudetect: | |
588 pop ebx | |
589 ret | |
590 | |
591 end_cpu_is_386: | |
592 mov eax,0300h | |
593 jmp exitcpudetect | |
594 | |
595 is_old_486: | |
596 mov eax,0400h | |
597 jmp exitcpudetect | |
598 | |
599 IFDEF NOUNDERLINE | |
600 cpudetect32 endp | |
601 ELSE | |
602 _cpudetect32 endp | |
603 ENDIF | |
; closes the IFNDEF NOOLDPENTIUMCODE that guards longest_match_7fff and cpudetect32
604 ENDIF | |
605 | |
; Constants for the longest_match_686 implementation that follows.
; MAX_MATCH, MIN_MATCH and MIN_LOOKAHEAD repeat the values defined earlier
; in the file so this part also works when the old-Pentium code is excluded.
606 MAX_MATCH equ 258 | |
607 MIN_MATCH equ 3 | |
608 MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1) | |
; MAX_MATCH rounded up to the next multiple of 8: (258+7) AND NOT 7 = 264 = 0108h.
; This is the value the string-compare loop hard-codes as 0108h / 0fffffef8h.
; The mask must clear only the low three bits (0FFF8h); a 0FFF0h mask would
; round to a multiple of 16 and give 256, which is smaller than MAX_MATCH.
609 MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF8h) | |
610 | |
611 | |
612 ;;; stack frame offsets | |
;;; (for longest_match_686; each name expands to "esp + n" and is used
;;; inside brackets, e.g. [window]. "window" and "scan" replace the
;;; equates of the same name defined for longest_match_7fff.)
613 | |
614 chainlenwmask equ esp + 0 ; high word: current chain len | |
615 ; low word: s->wmask | |
616 window equ esp + 4 ; local copy of s->window | |
617 windowbestlen equ esp + 8 ; s->window + bestlen | |
618 scanstart equ esp + 16 ; first two bytes of string | |
619 scanend equ esp + 12 ; last two bytes of string | |
620 scanalign equ esp + 20 ; dword-misalignment of string | |
621 nicematch equ esp + 24 ; a good enough match size | |
622 bestlen equ esp + 28 ; size of best match so far | |
623 scan equ esp + 32 ; ptr to string wanting match | |
624 | |
625 LocalVarsSize equ 36 | |
;;; Layout above the locals, following the prologue's push order
;;; (ebp, edi, esi, ebx):
626 ; saved ebx at esp + 36 | |
627 ; saved esi at esp + 40 | |
628 ; saved edi at esp + 44 | |
629 ; saved ebp at esp + 48 | |
630 ; return address at esp + 52 | |
631 deflatestate equ esp + 56 ; the function arguments | |
632 curmatch equ esp + 60 | |
633 | |
634 ;;; Offsets for fields in the deflate_state structure. These numbers | |
635 ;;; are calculated from the definition of deflate_state, with the | |
636 ;;; assumption that the compiler will dword-align the fields. (Thus, | |
637 ;;; changing the definition of deflate_state could easily cause this | |
638 ;;; program to crash horribly, without so much as a warning at | |
639 ;;; compile time. Sigh.) | |
;;; zlib1222add (defined near the top of the file) shifts every offset to
;;; match the zlib version being built. dsMatchLen and dsPrevMatch are not
;;; referenced by the code in this file.
640 | |
641 dsWSize equ 36+zlib1222add | |
642 dsWMask equ 44+zlib1222add | |
643 dsWindow equ 48+zlib1222add | |
644 dsPrev equ 56+zlib1222add | |
645 dsMatchLen equ 88+zlib1222add | |
646 dsPrevMatch equ 92+zlib1222add | |
647 dsStrStart equ 100+zlib1222add | |
648 dsMatchStart equ 104+zlib1222add | |
649 dsLookahead equ 108+zlib1222add | |
650 dsPrevLen equ 112+zlib1222add | |
651 dsMaxChainLen equ 116+zlib1222add | |
652 dsGoodMatch equ 132+zlib1222add | |
653 dsNiceMatch equ 136+zlib1222add | |
654 | |
655 | |
656 ;;; match.asm -- Pentium-Pro-optimized version of longest_match() | |
657 ;;; Written for zlib 1.1.2 | |
658 ;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> | |
659 ;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html | |
660 ;;; | |
661 ;;; This is free software; you can redistribute it and/or modify it | |
662 ;;; under the terms of the GNU General Public License. | |
663 | |
664 ;GLOBAL _longest_match, _match_init | |
665 | |
666 | |
667 ;SECTION .text | |
668 | |
669 ;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch) | |
670 | |
671 ;_longest_match: | |
;;; The procedure is exported as longest_match when NOOLDPENTIUMCODE is
;;; defined, and as longest_match_686 otherwise (with a leading underscore
;;; unless NOUNDERLINE is defined).
;;; Arguments are read from the stack through the deflatestate/curmatch
;;; equates; the result is returned in eax. ebp, edi, esi and ebx are saved
;;; and restored; eax, ecx and edx are used as scratch.
672 IFDEF NOOLDPENTIUMCODE | |
673 IFDEF NOUNDERLINE | |
674 longest_match proc near | |
675 ELSE | |
676 _longest_match proc near | |
677 ENDIF | |
678 ELSE | |
679 IFDEF NOUNDERLINE | |
680 longest_match_686 proc near | |
681 ELSE | |
682 _longest_match_686 proc near | |
683 ENDIF | |
684 ENDIF | |
685 | |
686 ;;; Save registers that the compiler may be using, and adjust esp to | |
687 ;;; make room for our stack frame. | |
688 | |
689 push ebp | |
690 push edi | |
691 push esi | |
692 push ebx | |
693 sub esp, LocalVarsSize | |
694 | |
695 ;;; Retrieve the function arguments. ecx will hold cur_match | |
696 ;;; throughout the entire function. edx will hold the pointer to the | |
697 ;;; deflate_state structure during the function's setup (before | |
698 ;;; entering the main loop). | |
699 | |
700 mov edx, [deflatestate] | |
701 mov ecx, [curmatch] | |
702 | |
703 ;;; uInt wmask = s->w_mask; | |
704 ;;; unsigned chain_length = s->max_chain_length; | |
705 ;;; if (s->prev_length >= s->good_match) { | |
706 ;;; chain_length >>= 2; | |
707 ;;; } | |
708 | |
;;; The two loads between cmp and jl do not change the flags, so jl still
;;; tests prev_length against good_match.
;;; NOTE(review): jl is a signed comparison, whereas the C reference compares
;;; unsigned values; the results agree as long as both are below 2^31.
709 mov eax, [edx + dsPrevLen] | |
710 mov ebx, [edx + dsGoodMatch] | |
711 cmp eax, ebx | |
712 mov eax, [edx + dsWMask] | |
713 mov ebx, [edx + dsMaxChainLen] | |
714 jl LastMatchGood | |
715 shr ebx, 2 | |
716 LastMatchGood: | |
717 | |
718 ;;; chainlen is decremented once beforehand so that the function can | |
719 ;;; use the sign flag instead of the zero flag for the exit test. | |
720 ;;; It is then shifted into the high word, to make room for the wmask | |
721 ;;; value, which it will always accompany. | |
722 | |
723 dec ebx | |
724 shl ebx, 16 | |
725 or ebx, eax | |
726 mov [chainlenwmask], ebx | |
727 | |
728 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; | |
729 | |
730 mov eax, [edx + dsNiceMatch] | |
731 mov ebx, [edx + dsLookahead] | |
732 cmp ebx, eax | |
733 jl LookaheadLess | |
734 mov ebx, eax | |
735 LookaheadLess: mov [nicematch], ebx | |
736 | |
737 ;;; register Bytef *scan = s->window + s->strstart; | |
738 | |
739 mov esi, [edx + dsWindow] | |
740 mov [window], esi | |
741 mov ebp, [edx + dsStrStart] | |
742 lea edi, [esi + ebp] | |
743 mov [scan], edi | |
744 | |
745 ;;; Determine how many bytes the scan ptr is off from being | |
746 ;;; dword-aligned. | |
;;; scanalign = (-scan) AND 3, i.e. the number of bytes (0-3) to add to scan
;;; to reach the next dword boundary.
747 | |
748 mov eax, edi | |
749 neg eax | |
750 and eax, 3 | |
751 mov [scanalign], eax | |
752 | |
753 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? | |
754 ;;; s->strstart - (IPos)MAX_DIST(s) : NIL; | |
755 | |
756 mov eax, [edx + dsWSize] | |
757 sub eax, MIN_LOOKAHEAD | |
758 sub ebp, eax | |
759 jg LimitPositive | |
760 xor ebp, ebp | |
761 LimitPositive: | |
762 | |
763 ;;; int best_len = s->prev_length; | |
764 | |
765 mov eax, [edx + dsPrevLen] | |
766 mov [bestlen], eax | |
767 | |
768 ;;; Compute s->window + best_len in esi, and store a copy in windowbestlen. | |
769 | |
770 add esi, eax | |
771 mov [windowbestlen], esi | |
772 | |
773 ;;; register ush scan_start = *(ushf*)scan; | |
774 ;;; register ush scan_end = *(ushf*)(scan+best_len-1); | |
775 ;;; Posf *prev = s->prev; | |
776 | |
777 movzx ebx, word ptr [edi] | |
778 mov [scanstart], ebx | |
779 movzx ebx, word ptr [edi + eax - 1] | |
780 mov [scanend], ebx | |
781 mov edi, [edx + dsPrev] | |
782 | |
783 ;;; Jump into the main loop. | |
784 | |
785 mov edx, [chainlenwmask] | |
786 jmp short LoopEntry | |
787 | |
788 align 4 | |
789 | |
790 ;;; do { | |
791 ;;; match = s->window + cur_match; | |
792 ;;; if (*(ushf*)(match+best_len-1) != scan_end || | |
793 ;;; *(ushf*)match != scan_start) continue; | |
794 ;;; [...] | |
795 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit | |
796 ;;; && --chain_length != 0); | |
797 ;;; | |
798 ;;; Here is the inner loop of the function. The function will spend the | |
799 ;;; majority of its time in this loop, and majority of that time will | |
800 ;;; be spent in the first ten instructions. | |
801 ;;; | |
802 ;;; Within this loop: | |
803 ;;; ebx = scanend | |
804 ;;; ecx = curmatch | |
805 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) | |
806 ;;; esi = windowbestlen - i.e., (window + bestlen) | |
807 ;;; edi = prev | |
808 ;;; ebp = limit | |
809 | |
;;; "and ecx, edx" applies wmask (the low word of edx); subtracting 00010000h
;;; decrements the chain length held in the high word, and the sign flag
;;; signals that the chain budget is exhausted.
810 LookupLoop: | |
811 and ecx, edx | |
812 movzx ecx, word ptr [edi + ecx*2] | |
813 cmp ecx, ebp | |
814 jbe LeaveNow | |
815 sub edx, 00010000h | |
816 js LeaveNow | |
817 LoopEntry: movzx eax, word ptr [esi + ecx - 1] | |
818 cmp eax, ebx | |
819 jnz LookupLoop | |
820 mov eax, [window] | |
821 movzx eax, word ptr [eax + ecx] | |
822 cmp eax, [scanstart] | |
823 jnz LookupLoop | |
824 | |
825 ;;; Store the current value of chainlen. | |
826 | |
827 mov [chainlenwmask], edx | |
828 | |
829 ;;; Point edi to the string under scrutiny, and esi to the string we | |
830 ;;; are hoping to match it up with. In actuality, esi and edi are | |
831 ;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and edx is | |
832 ;;; initialized to -(MAX_MATCH_8), with MAX_MATCH_8 = 0108h (264). | |
833 | |
834 mov esi, [window] | |
835 mov edi, [scan] | |
836 add esi, ecx | |
837 mov eax, [scanalign] | |
838 mov edx, 0fffffef8h; -(MAX_MATCH_8) | |
839 lea edi, [edi + eax + 0108h] ;MAX_MATCH_8] | |
840 lea esi, [esi + eax + 0108h] ;MAX_MATCH_8] | |
841 | |
842 ;;; Test the strings for equality, 8 bytes at a time. At the end, | |
843 ;;; adjust edx so that it is offset to the exact byte that mismatched. | |
844 ;;; | |
845 ;;; We already know at this point that the first three bytes of the | |
846 ;;; strings match each other, and they can be safely passed over before | |
847 ;;; starting the compare loop. So what this code does is skip over 0-3 | |
848 ;;; bytes, as much as necessary in order to dword-align the edi | |
849 ;;; pointer. (esi will still be misaligned three times out of four.) | |
850 ;;; | |
851 ;;; It should be confessed that this loop usually does not represent | |
852 ;;; much of the total running time. Replacing it with a more | |
853 ;;; straightforward "rep cmpsb" would not drastically degrade | |
854 ;;; performance. | |
855 | |
;;; edx counts up from -MAX_MATCH_8 towards zero; reaching zero without a
;;; mismatch means the maximum length was matched.
856 LoopCmps: | |
857 mov eax, [esi + edx] | |
858 xor eax, [edi + edx] | |
859 jnz LeaveLoopCmps | |
860 mov eax, [esi + edx + 4] | |
861 xor eax, [edi + edx + 4] | |
862 jnz LeaveLoopCmps4 | |
863 add edx, 8 | |
864 jnz LoopCmps | |
865 jmp short LenMaximum | |
;;; eax holds the xor of the mismatching dword. Skip its low word if that
;;; half matched, then "sub al, 1" sets carry only when the low byte is zero
;;; (i.e. matched), so "adc edx, 0" adds one more for that byte.
866 LeaveLoopCmps4: add edx, 4 | |
867 LeaveLoopCmps: test eax, 0000FFFFh | |
868 jnz LenLower | |
869 add edx, 2 | |
870 shr eax, 16 | |
871 LenLower: sub al, 1 | |
872 adc edx, 0 | |
873 | |
874 ;;; Calculate the length of the match. If it is at least MAX_MATCH, | |
875 ;;; then automatically accept it as the best possible match and leave. | |
876 | |
877 lea eax, [edi + edx] | |
878 mov edi, [scan] | |
879 sub eax, edi | |
880 cmp eax, MAX_MATCH | |
881 jge LenMaximum | |
882 | |
883 ;;; If the length of the match is not longer than the best match we | |
884 ;;; have so far, then forget it and return to the lookup loop. | |
;;; (the loop registers esi, edi, ebx and edx are reloaded first)
885 | |
886 mov edx, [deflatestate] | |
887 mov ebx, [bestlen] | |
888 cmp eax, ebx | |
889 jg LongerMatch | |
890 mov esi, [windowbestlen] | |
891 mov edi, [edx + dsPrev] | |
892 mov ebx, [scanend] | |
893 mov edx, [chainlenwmask] | |
894 jmp LookupLoop | |
895 | |
896 ;;; s->match_start = cur_match; | |
897 ;;; best_len = len; | |
898 ;;; if (len >= nice_match) break; | |
899 ;;; scan_end = *(ushf*)(scan+best_len-1); | |
900 | |
;;; On entry: eax = len, ecx = cur_match, edx = deflate_state pointer,
;;; edi = scan. Before looping again, windowbestlen/scanend are refreshed
;;; and the loop registers (esi, ebx, edi, edx) are reloaded.
901 LongerMatch: mov ebx, [nicematch] | |
902 mov [bestlen], eax | |
903 mov [edx + dsMatchStart], ecx | |
904 cmp eax, ebx | |
905 jge LeaveNow | |
906 mov esi, [window] | |
907 add esi, eax | |
908 mov [windowbestlen], esi | |
909 movzx ebx, word ptr [edi + eax - 1] | |
910 mov edi, [edx + dsPrev] | |
911 mov [scanend], ebx | |
912 mov edx, [chainlenwmask] | |
913 jmp LookupLoop | |
914 | |
915 ;;; Accept the current string, with the maximum possible length. | |
916 | |
917 LenMaximum: mov edx, [deflatestate] | |
918 mov dword ptr [bestlen], MAX_MATCH | |
919 mov [edx + dsMatchStart], ecx | |
920 | |
921 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; | |
922 ;;; return s->lookahead; | |
923 | |
;;; Common exit: eax = the smaller of bestlen and s->lookahead.
924 LeaveNow: | |
925 mov edx, [deflatestate] | |
926 mov ebx, [bestlen] | |
927 mov eax, [edx + dsLookahead] | |
928 cmp ebx, eax | |
929 jg LookaheadRet | |
930 mov eax, ebx | |
931 LookaheadRet: | |
932 | |
933 ;;; Restore the stack and return from whence we came. | |
934 | |
935 add esp, LocalVarsSize | |
936 pop ebx | |
937 pop esi | |
938 pop edi | |
939 pop ebp | |
940 | |
941 ret | |
; Identification string stored in the code segment after the ret; it is
; data and is never executed.
942 ; please don't remove this string ! | |
943 ; You can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary! | |
944 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah | |
945 | |
946 | |
947 IFDEF NOOLDPENTIUMCODE | |
948 IFDEF NOUNDERLINE | |
949 longest_match endp | |
950 ELSE | |
951 _longest_match endp | |
952 ENDIF | |
953 | |
; match_init : exported only in the NOOLDPENTIUMCODE build; it does nothing
; and returns immediately.
954 IFDEF NOUNDERLINE | |
955 match_init proc near | |
956 ret | |
957 match_init endp | |
958 ELSE | |
959 _match_init proc near | |
960 ret | |
961 _match_init endp | |
962 ENDIF | |
963 ELSE | |
964 IFDEF NOUNDERLINE | |
965 longest_match_686 endp | |
966 ELSE | |
967 _longest_match_686 endp | |
968 ENDIF | |
969 ENDIF | |
970 | |
971 _TEXT ends | |
972 end | |