dnl PowerPC-64 mpn_mod_1_1p dnl Copyright 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of either: dnl dnl * the GNU Lesser General Public License as published by the Free dnl Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl or dnl dnl * the GNU General Public License as published by the Free Software dnl Foundation; either version 2 of the License, or (at your option) any dnl later version. dnl dnl or both in parallel, as here. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License dnl for more details. dnl dnl You should have received copies of the GNU General Public License and the dnl GNU Lesser General Public License along with the GNU MP Library. If not, dnl see https://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb C POWER3/PPC630 ? C POWER4/PPC970 17 C POWER5 16 C POWER6 30 C POWER7 10.2 C TODO C * Optimise, in particular the cps function. This was compiler-generated and C then hand optimised. 
C INPUT PARAMETERS
define(`ap',  `r3')
define(`n',   `r4')
define(`d',   `r5')
define(`cps', `r6')

ASM_START()

EXTERN_FUNC(mpn_invert_limb)

C mpn_mod_1_1p
C
C Entry:  ap  (r3) = pointer to the limb array
C         n   (r4) = number of limbs
C         d   (r5) = divisor
C         cps (r6) = four-doubleword table, read here with the layout that
C                    mpn_mod_1_1p_cps below stores:
C                      0(cps)   multiplier returned by mpn_invert_limb
C                      8(cps)   shift count (only the low 32 bits are read)
C                      16(cps)  B1modb
C                      24(cps)  B2modb
C Exit:   r3 = remainder
C
C The two most significant limbs are loaded unconditionally and the loop is
C skipped only when n - 1 = 1, so the code relies on n >= 2.
C
C NOTE(review): the residue is shifted left by the count before it is
C compared with d, and the result is shifted right again.  Presumably the
C caller passes d already shifted left by the same count -- confirm against
C the callers.
C
C Register use: r0 = B1modb (later the shift count), r12 = B2modb,
C r9:r11 = running two-limb residue (high:low), CTR = limbs still to fold.

PROLOGUE(mpn_mod_1_1p)
	sldi	r10, n, 3		C r10 = 8*n, size of the operand in bytes
	addi	n, n, -1
	add	ap, ap, r10		C ap = address just past the top limb
	ld	r0, 16(cps)		C B1modb
	ld	r12, 24(cps)		C B2modb
	ld	r9, -8(ap)		C most significant limb
	ld	r10, -16(ap)		C second most significant limb
	mtctr	n			C CTR = n - 1
	mulhdu	r8, r9, r0
	mulld	r7, r9, r0		C r8:r7 = top limb * B1modb
	addc	r11, r7, r10
	addze	r9, r8			C r9:r11 = top limb * B1modb + next limb
	bdz	L(norm)			C n = 2: nothing left to fold

C Each pass folds one more limb into the residue:
C   new rh:rl = limb + rl * B1modb + rh * B2modb
	ALIGN(16)
L(fold):
	ld	r4, -24(ap)		C next lower limb (r4 is free, n is in CTR)
	addi	ap, ap, -8
	nop
	mulld	r10, r11, r0		C lo (rl * B1modb)
	mulld	r8, r9, r12		C lo (rh * B2modb)
	mulhdu	r11, r11, r0		C hi (rl * B1modb)
	mulhdu	r9, r9, r12		C hi (rh * B2modb)
	addc	r7, r10, r4
	addze	r10, r11		C r10:r7 = rl * B1modb + limb
	addc	r11, r8, r7
	adde	r9, r9, r10		C r9:r11 = r10:r7 + rh * B2modb
	bdnz	L(fold)

C Final reduction of the two-limb residue r9:r11.
C r0 = shift count.  The count was stored as a doubleword at 8(cps); its low
C 32-bit word sits at byte offset 8 on little-endian and 12 otherwise.
L(norm):
ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
`	lwz	r0, 8(cps)',
`	lwz	r0, 12(cps)')
	ld	r3, 0(cps)		C multiplier for the reduction below
	cmpdi	cr7, r0, 0
	beq-	cr7, L(nosh)		C count = 0: skip the shift by 64 - count
	subfic	r10, r0, 64		C r10 = 64 - count
	sld	r9, r9, r0
	srd	r10, r11, r10
	or	r9, r10, r9		C r9 = high limb of (r9:r11 << count)
L(nosh):
	subfc	r10, d, r9		C CA = 1 iff r9 >= d (unsigned)
	subfe	r10, r10, r10		C r10 = 0 if r9 >= d, else -1
	nand	r10, r10, r10		C invert: -1 if r9 >= d, else 0
	sld	r11, r11, r0		C low limb shifted by count
	and	r10, r10, d
	subf	r9, r10, r9		C r9 -= d when r9 >= d
	mulhdu	r10, r9, r3
	mulld	r3, r9, r3		C r10:r3 = r9 * multiplier
	addi	r9, r9, 1
	addc	r8, r3, r11		C r8 = low limb of the quotient estimate
	adde	r3, r10, r9		C r3 = high limb: r10 + r9 + 1 + carry
	mulld	r3, r3, d
	subf	r3, r3, r11		C r3 = r11 - r3 * d, candidate remainder
	cmpld	cr7, r8, r3
	bge	cr7, L(chk)		C FIXME: Make branch-less
	add	r3, r3, d		C candidate > r8 (unsigned): add d
L(chk):
	cmpld	cr7, r3, d
	bge-	cr7, L(subd)		C unlikely: one more subtraction needed
	srd	r3, r3, r0		C shift the remainder back down
	blr

L(subd):
	subf	r3, d, r3		C r3 -= d
	srd	r3, r3, r0		C shift the remainder back down
	blr
EPILOGUE()

C mpn_mod_1_1p_cps
C
C Entry:  r3 = pointer to a four-doubleword table to fill in
C         r4 = divisor
C Stores: 0(r3)   value returned by mpn_invert_limb for the shifted divisor
C         8(r3)   count of leading zero bits of the divisor
C         16(r3)  first precomputed residue, shifted right by the count
C         24(r3)  second precomputed residue, shifted right by the count
C
C Non-leaf: r29-r31 and LR are saved and a 144-byte frame is opened around
C the call.  r29 = table pointer, r30 = shifted divisor, r31 = count.

PROLOGUE(mpn_mod_1_1p_cps,toc)
	mflr	r0
	std	r29, -24(r1)		C saved below the incoming stack pointer,
	std	r30, -16(r1)		C covered by the frame opened by the
	std	r31, -8(r1)		C stdu below
	cntlzd	r31, r4			C r31 = leading zero count of the divisor
	std	r0, 16(r1)		C return address
	extsw	r31, r31
	mr	r29, r3			C keep the table pointer across the call
	stdu	r1, -144(r1)
	sld	r30, r4, r31		C r30 = divisor << count
	mr	r3, r30
	CALL(	mpn_invert_limb)	C r3 = result for the shifted divisor
	cmpdi	cr7, r31, 0
	neg	r0, r30			C count = 0 case: r0 = -r30
	beq-	cr7, L(b2)
	subfic	r11, r31, 64		C r11 = 64 - count
	li	r0, 1
	neg	r9, r30
	srd	r11, r3, r11		C top count bits of the result
	sld	r0, r0, r31		C 1 << count
	or	r0, r11, r0
	mulld	r0, r0, r9		C r0 = -r30 * ((r3 >> (64-count)) | (1 << count))
L(b2):
	mulhdu	r9, r0, r3
	mulld	r11, r0, r3		C r9:r11 = r0 * r3
	add	r9, r0, r9
	nor	r9, r9, r9		C r9 = NOT (r0 + high limb)
	mulld	r9, r9, r30
	cmpld	cr7, r11, r9
	bge	cr7, L(wr)
	add	r9, r9, r30		C r9 > r11 (unsigned): add r30
L(wr):
	addi	r1, r1, 144		C close the frame
	srd	r0, r0, r31
	std	r31, 8(r29)		C table[1] = count
	std	r3, 0(r29)		C table[0] = mpn_invert_limb result
	std	r0, 16(r29)		C table[2] = r0 >> count
	ld	r0, 16(r1)		C return address
	srd	r9, r9, r31
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	std	r9, 24(r29)		C table[3] = r9 >> count
	ld	r29, -24(r1)
	mtlr	r0
	blr
EPILOGUE()