index 2aaf041..66cf82b 100644 (file)
.frame  \$30,0,\$26,0

-       ldq     \$3,0(\$17)
-       ldq     \$4,0(\$18)
-
-       subq    \$19,1,\$19
-       and     \$19,4-1,\$2      # number of limbs in first loop
-       bis     \$31,\$31,\$0
-       beq     \$2,.L0          # if multiple of 4 limbs, skip first loop
-
-       subq    \$19,\$2,\$19
-
-.Loop0:        subq    \$2,1,\$2
+       or      \$31,\$31,\$25             # clear cy
+       subq    \$19,4,\$19               # decr loop cnt
+       blt     \$19,.Lend2              # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+       ldq     \$0,0(\$18)
+       ldq     \$1,8(\$18)
+       ldq     \$4,0(\$17)
ldq     \$5,8(\$17)
-       ldq     \$6,8(\$18)
-       cmpult  \$4,\$0,\$1
-       cmpult  \$4,\$3,\$0
-       stq     \$4,0(\$16)
-       or      \$0,\$1,\$0
-
-       bis     \$5,\$5,\$3
-       bis     \$6,\$6,\$4
-       bne     \$2,.Loop0
-
-.L0:   beq     \$19,.Lend
-
+       addq    \$17,32,\$17              # update s1_ptr
+       ldq     \$2,16(\$18)
+       ldq     \$3,24(\$18)
+       subq    \$19,4,\$19               # decr loop cnt
+       ldq     \$6,-16(\$17)
+       cmpult  \$20,\$0,\$25              # compute cy from last add
+       ldq     \$7,-8(\$17)
+       addq    \$18,32,\$18              # update s2_ptr
+       cmpult  \$28,\$25,\$8              # compute cy from last add
+       blt     \$19,.Lend1              # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
.align  4
-.Loop: subq    \$19,4,\$19
-       unop
-
-       ldq     \$6,8(\$18)
+.Loop: cmpult  \$21,\$28,\$25             # compute cy from last add
+       ldq     \$0,0(\$18)
+       or      \$8,\$25,\$25              # combine cy from the two adds
+       ldq     \$1,8(\$18)
+       ldq     \$4,0(\$17)
ldq     \$5,8(\$17)
-       cmpult  \$0,\$4,\$1
-       ldq     \$4,16(\$18)
-       cmpult  \$20,\$3,\$0
-       ldq     \$3,16(\$17)
-       or      \$0,\$1,\$0
-       cmpult  \$0,\$6,\$1
-       ldq     \$6,24(\$18)
-       cmpult  \$21,\$5,\$0
-       ldq     \$5,24(\$17)
-       or      \$0,\$1,\$0
-       cmpult  \$0,\$4,\$1
-       ldq     \$4,32(\$18)
-       cmpult  \$22,\$3,\$0
-       ldq     \$3,32(\$17)
-       or      \$0,\$1,\$0
-       cmpult  \$0,\$6,\$1
-       cmpult  \$23,\$5,\$0
-       or      \$0,\$1,\$0
-
+       cmpult  \$28,\$25,\$8              # compute cy from last add
+       cmpult  \$22,\$28,\$25             # compute cy from last add
stq     \$20,0(\$16)
+       or      \$8,\$25,\$25              # combine cy from the two adds
stq     \$21,8(\$16)
-       stq     \$22,16(\$16)
-       stq     \$23,24(\$16)
-
-       bne     \$19,.Loop
+       cmpult  \$28,\$25,\$8              # compute cy from last add
+       cmpult  \$23,\$28,\$25             # compute cy from last add
+       addq    \$17,32,\$17              # update s1_ptr
+       or      \$8,\$25,\$25              # combine cy from the two adds
+       addq    \$16,32,\$16              # update res_ptr
+       ldq     \$2,16(\$18)
+       ldq     \$3,24(\$18)
+       cmpult  \$28,\$25,\$8              # compute cy from last add
+       ldq     \$6,-16(\$17)
+       cmpult  \$20,\$28,\$25             # compute cy from last add
+       ldq     \$7,-8(\$17)
+       or      \$8,\$25,\$25              # combine cy from the two adds
+       subq    \$19,4,\$19               # decr loop cnt
+       stq     \$22,-16(\$16)
+       stq     \$23,-8(\$16)
+       addq    \$18,32,\$18              # update s2_ptr
+       cmpult  \$28,\$25,\$8              # compute cy from last add
+       bge     \$19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1:        cmpult  \$21,\$28,\$25             # compute cy from last add
+       or      \$8,\$25,\$25              # combine cy from the two adds
+       cmpult  \$28,\$25,\$8              # compute cy from last add
+       cmpult  \$22,\$28,\$25             # compute cy from last add
+       stq     \$20,0(\$16)
+       or      \$8,\$25,\$25              # combine cy from the two adds
+       stq     \$21,8(\$16)
+       cmpult  \$28,\$25,\$8              # compute cy from last add
+       cmpult  \$23,\$28,\$25             # compute cy from last add
+       or      \$8,\$25,\$25              # combine cy from the two adds
+       addq    \$16,32,\$16              # update res_ptr
+       stq     \$22,-16(\$16)
+       stq     \$23,-8(\$16)
+.Lend2:        addq    \$19,4,\$19               # restore loop cnt
+       beq     \$19,.Lret
+ # Start software pipeline for 2nd loop
+       ldq     \$0,0(\$18)
+       ldq     \$4,0(\$17)
+       subq    \$19,1,\$19
+       beq     \$19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+       .align  4
+       ldq     \$0,8(\$18)
+       ldq     \$4,8(\$17)
+       cmpult  \$28,\$25,\$8              # compute cy from last add
+       stq     \$20,0(\$16)
+       cmpult  \$20,\$28,\$25             # compute cy from last add
+       subq    \$19,1,\$19               # decr loop cnt
+       or      \$8,\$25,\$25              # combine cy from the two adds
+       bne     \$19,.Loop0
+       cmpult  \$28,\$25,\$8              # compute cy from last add
+       cmpult  \$20,\$28,\$25             # compute cy from last add
+       stq     \$20,0(\$16)
+       or      \$8,\$25,\$25              # combine cy from the two adds