dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd. dnl Copyright 2000-2002, 2004, 2014 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of either: dnl dnl * the GNU Lesser General Public License as published by the Free dnl Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl or dnl dnl * the GNU General Public License as published by the Free Software dnl Foundation; either version 2 of the License, or (at your option) any dnl later version. dnl dnl or both in parallel, as here. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License dnl for more details. dnl dnl You should have received copies of the GNU General Public License and the dnl GNU Lesser General Public License along with the GNU MP Library. If not, dnl see https://www.gnu.org/licenses/. include(`../config.m4') C K6: 9.5 cycles/bit (approx) 1x1 gcd C 11.0 cycles/limb Nx1 reduction (modexact_1_odd) C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y); C C This code is nothing very special, but offers a speedup over what gcc 2.95 C can do with mpn/generic/gcd_1.c. C C Future: C C Using a lookup table to count trailing zeros seems a touch quicker, but C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used C it too. dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits dnl bigger than y, then a division x%y is done to reduce it. dnl dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so dnl there should be an advantage in the divl at about 4 or 5 bits, which is dnl what's found. deflit(DIV_THRESHOLD, 5) defframe(PARAM_LIMB, 12) defframe(PARAM_SIZE, 8) defframe(PARAM_SRC, 4) TEXT ALIGN(16) PROLOGUE(mpn_gcd_1) deflit(`FRAME',0) ASSERT(ne, `cmpl $0, PARAM_LIMB') ASSERT(ae, `cmpl $1, PARAM_SIZE') movl PARAM_SRC, %eax pushl %ebx FRAME_pushl() movl PARAM_LIMB, %edx movl $-1, %ecx movl (%eax), %ebx C src low limb movl %ebx, %eax C src low limb orl %edx, %ebx L(common_twos): shrl %ebx incl %ecx jnc L(common_twos) C 1/4 chance on random data shrl %cl, %edx C y cmpl $1, PARAM_SIZE ja L(size_two_or_more) ASSERT(nz, `orl %eax, %eax') C should have src limb != 0 shrl %cl, %eax C x C Swap if necessary to make x>=y. Measures a touch quicker as a C jump than a branch free calculation. C C eax x C ebx C ecx common twos C edx y movl %eax, %ebx cmpl %eax, %edx jb L(noswap) movl %edx, %eax movl %ebx, %edx movl %eax, %ebx L(noswap): C See if it's worth reducing x with a divl. C C eax x C ebx x C ecx common twos C edx y shrl $DIV_THRESHOLD, %ebx cmpl %ebx, %edx ja L(nodiv) C Reduce x to x%y. C C eax x C ebx C ecx common twos C edx y movl %edx, %ebx xorl %edx, %edx divl %ebx orl %edx, %edx C y nop C code alignment movl %ebx, %eax C x jz L(done_shll) L(nodiv): C eax x C ebx C ecx common twos C edx y C esi C edi C ebp L(strip_y): shrl %edx jnc L(strip_y) leal 1(%edx,%edx), %edx movl %ecx, %ebx C common twos leal 1(%eax), %ecx jmp L(strip_x_and) C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is C only 1/4. C C A second computed %cl shift was tried, but that measured a touch slower C than branching back. C C A branch-free abs(x-y) and min(x,y) calculation was tried, but that C measured about 1 cycle/bit slower. C eax x C ebx common twos C ecx scratch C edx y ALIGN(4) L(swap): addl %eax, %edx C x-y+y = x negl %eax C -(x-y) = y-x L(strip_x): shrl %eax C odd-odd = even, so always one to strip ASSERT(nz) L(strip_x_leal): leal 1(%eax), %ecx L(strip_x_and): andl $1, %ecx C (x^1)&1 shrl %cl, %eax C shift if x even testb $1, %al jz L(strip_x) ASSERT(nz,`testl $1, %eax') C x, y odd ASSERT(nz,`testl $1, %edx') subl %edx, %eax jb L(swap) ja L(strip_x) movl %edx, %eax movl %ebx, %ecx L(done_shll): shll %cl, %eax popl %ebx ret C ----------------------------------------------------------------------------- C Two or more limbs. C C x={src,size} is reduced modulo y using either a plain mod_1 style C remainder, or a modexact_1 style exact division. deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4)) ALIGN(8) L(size_two_or_more): C eax C ebx C ecx common twos C edx y, without common twos C esi C edi C ebp deflit(FRAME_TWO_OR_MORE, FRAME) pushl %edi defframe_pushl(SAVE_EDI) movl PARAM_SRC, %ebx L(y_twos): shrl %edx jnc L(y_twos) movl %ecx, %edi C common twos movl PARAM_SIZE, %ecx pushl %esi defframe_pushl(SAVE_ESI) leal 1(%edx,%edx), %esi C y (odd) movl -4(%ebx,%ecx,4), %eax C src high limb cmpl %edx, %eax C carry if high