Updated from /src/gmp-1.937
[kopensolaris-gnu/glibc.git] / sysdeps / alpha / alphaev5 / add_n.s
index 2aaf041..66cf82b 100644 (file)
 __mpn_add_n:                           # add two vectors of 8-byte limbs: $16 = res_ptr, $17 = s1_ptr, $18 = s2_ptr, $19 = limb count; carry-out returned in $0
        .frame  $30,0,$26,0             # frame reg $30, frame size 0, return address in $26
 
-       ldq     $3,0($17)
-       ldq     $4,0($18)
-
-       subq    $19,1,$19
-       and     $19,4-1,$2      # number of limbs in first loop
-       bis     $31,$31,$0
-       beq     $2,.L0          # if multiple of 4 limbs, skip first loop
-
-       subq    $19,$2,$19
-
-.Loop0:        subq    $2,1,$2
+       or      $31,$31,$25             # clear cy ($25 holds the running carry throughout)
+       subq    $19,4,$19               # decr loop cnt
+       blt     $19,.Lend2              # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+       ldq     $0,0($18)               # s2 limb 0
+       ldq     $1,8($18)               # s2 limb 1
+       ldq     $4,0($17)               # s1 limb 0
        ldq     $5,8($17)               # s1 limb 1
-       addq    $4,$0,$4
-       ldq     $6,8($18)
-       cmpult  $4,$0,$1
-       addq    $3,$4,$4
-       cmpult  $4,$3,$0
-       stq     $4,0($16)
-       or      $0,$1,$0
-
-       addq    $17,8,$17
-       addq    $18,8,$18
-       bis     $5,$5,$3
-       bis     $6,$6,$4
-       addq    $16,8,$16
-       bne     $2,.Loop0
-
-.L0:   beq     $19,.Lend
-
+       addq    $17,32,$17              # update s1_ptr
+       ldq     $2,16($18)              # s2 limb 2
+       addq    $0,$4,$20               # 1st main add
+       ldq     $3,24($18)              # s2 limb 3
+       subq    $19,4,$19               # decr loop cnt
+       ldq     $6,-16($17)             # s1 limb 2 (s1_ptr was already advanced by 32)
+       cmpult  $20,$0,$25              # cy out of the 1st main add (sum < addend means wrap)
+       ldq     $7,-8($17)              # s1 limb 3
+       addq    $1,$25,$28              # cy add
+       addq    $18,32,$18              # update s2_ptr
+       addq    $5,$28,$21              # 2nd main add
+       cmpult  $28,$25,$8              # cy out of the cy add (not of the main add just above)
+       blt     $19,.Lend1              # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
        .align  4
-.Loop: subq    $19,4,$19
-       unop
-
-       ldq     $6,8($18)
-       addq    $4,$0,$0
+.Loop: cmpult  $21,$28,$25             # cy out of the 2nd main add
+       ldq     $0,0($18)               # next group: s2 limb 0
+       or      $8,$25,$25              # combine cy from the two adds
+       ldq     $1,8($18)               # next group: s2 limb 1
+       addq    $2,$25,$28              # cy add
+       ldq     $4,0($17)               # next group: s1 limb 0
+       addq    $28,$6,$22              # 3rd main add
        ldq     $5,8($17)               # next group: s1 limb 1
-       cmpult  $0,$4,$1
-       ldq     $4,16($18)
-       addq    $3,$0,$20
-       cmpult  $20,$3,$0
-       ldq     $3,16($17)
-       or      $0,$1,$0
-       addq    $6,$0,$0
-       cmpult  $0,$6,$1
-       ldq     $6,24($18)
-       addq    $5,$0,$21
-       cmpult  $21,$5,$0
-       ldq     $5,24($17)
-       or      $0,$1,$0
-       addq    $4,$0,$0
-       cmpult  $0,$4,$1
-       ldq     $4,32($18)
-       addq    $3,$0,$22
-       cmpult  $22,$3,$0
-       ldq     $3,32($17)
-       or      $0,$1,$0
-       addq    $6,$0,$0
-       cmpult  $0,$6,$1
-       addq    $5,$0,$23
-       cmpult  $23,$5,$0
-       or      $0,$1,$0
-
+       cmpult  $28,$25,$8              # cy out of the cy add
+       cmpult  $22,$28,$25             # cy out of the 3rd main add
        stq     $20,0($16)              # store 1st sum of the current group
+       or      $8,$25,$25              # combine cy from the two adds
        stq     $21,8($16)              # store 2nd sum of the current group
-       stq     $22,16($16)
-       stq     $23,24($16)
-
-       addq    $17,32,$17
-       addq    $18,32,$18
-       addq    $16,32,$16
-       bne     $19,.Loop
+       addq    $3,$25,$28              # cy add
+       addq    $28,$7,$23              # 4th main add
+       cmpult  $28,$25,$8              # cy out of the cy add
+       cmpult  $23,$28,$25             # cy out of the 4th main add
+       addq    $17,32,$17              # update s1_ptr
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $16,32,$16              # update res_ptr
+       addq    $0,$25,$28              # cy add
+       ldq     $2,16($18)              # next group: s2 limb 2
+       addq    $4,$28,$20              # 1st main add
+       ldq     $3,24($18)              # next group: s2 limb 3
+       cmpult  $28,$25,$8              # cy out of the cy add
+       ldq     $6,-16($17)             # next group: s1 limb 2 (s1_ptr already advanced)
+       cmpult  $20,$28,$25             # cy out of the 1st main add
+       ldq     $7,-8($17)              # next group: s1 limb 3
+       or      $8,$25,$25              # combine cy from the two adds
+       subq    $19,4,$19               # decr loop cnt
+       stq     $22,-16($16)            # store 3rd sum (res_ptr already advanced by 32)
+       addq    $1,$25,$28              # cy add
+       stq     $23,-8($16)             # store 4th sum
+       addq    $5,$28,$21              # 2nd main add
+       addq    $18,32,$18              # update s2_ptr
+       cmpult  $28,$25,$8              # cy out of the cy add
+       bge     $19,.Loop               # repeat while another full group of 4 limbs remains
+ # Finish software pipeline for 1st loop
+.Lend1:        cmpult  $21,$28,$25             # cy out of the 2nd main add
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $2,$25,$28              # cy add
+       addq    $28,$6,$22              # 3rd main add
+       cmpult  $28,$25,$8              # cy out of the cy add
+       cmpult  $22,$28,$25             # cy out of the 3rd main add
+       stq     $20,0($16)              # store 1st sum of the final group
+       or      $8,$25,$25              # combine cy from the two adds
+       stq     $21,8($16)              # store 2nd sum of the final group
+       addq    $3,$25,$28              # cy add
+       addq    $28,$7,$23              # 4th main add
+       cmpult  $28,$25,$8              # cy out of the cy add
+       cmpult  $23,$28,$25             # cy out of the 4th main add
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $16,32,$16              # update res_ptr
+       stq     $22,-16($16)            # store 3rd sum (res_ptr already advanced by 32)
+       stq     $23,-8($16)             # store 4th sum
+.Lend2:        addq    $19,4,$19               # restore loop cnt
+       beq     $19,.Lret               # no limbs left: just return the carry
+ # Start software pipeline for 2nd loop
+       ldq     $0,0($18)               # s2 limb
+       ldq     $4,0($17)               # s1 limb
+       subq    $19,1,$19               # decr loop cnt
+       beq     $19,.Lend0              # exactly one limb left: skip the loop body
+ # 2nd loop handles remaining 1-3 limbs
+       .align  4
+.Loop0:        addq    $0,$25,$28              # cy add
+       ldq     $0,8($18)               # preload next s2 limb
+       addq    $4,$28,$20              # main add
+       ldq     $4,8($17)               # preload next s1 limb
+       addq    $18,8,$18               # update s2_ptr
+       cmpult  $28,$25,$8              # cy out of the cy add
+       addq    $17,8,$17               # update s1_ptr
+       stq     $20,0($16)              # store sum
+       cmpult  $20,$28,$25             # cy out of the main add
+       subq    $19,1,$19               # decr loop cnt
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $16,8,$16               # update res_ptr
+       bne     $19,.Loop0              # loop until only the preloaded limb pair remains
+.Lend0:        addq    $0,$25,$28              # cy add
+       addq    $4,$28,$20              # main add
+       cmpult  $28,$25,$8              # cy out of the cy add
+       cmpult  $20,$28,$25             # cy out of the main add
+       stq     $20,0($16)              # store last sum
+       or      $8,$25,$25              # combine cy from the two adds
 
-.Lend: addq    $4,$0,$4
-       cmpult  $4,$0,$1
-       addq    $3,$4,$4
-       cmpult  $4,$3,$0
-       stq     $4,0($16)
-       or      $0,$1,$0
+.Lret: or      $25,$31,$0              # return cy
        ret     $31,($26),1             # return to the address held in $26
-
        .end    __mpn_add_n