2005-04-05 H.J. Lu <hongjiu.lu@intel.com>
[kopensolaris-gnu/glibc.git] / sysdeps / ia64 / fpu / s_tanh.S
1 .file "tanh.s"
2
3
4 // Copyright (c) 2001 - 2005, Intel Corporation
5 // All rights reserved.
6 //
7 // Contributed 2001 by the Intel Numerics Group, Intel Corporation
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
22 // permission.
23
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
35 // 
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at 
38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
39 //
40 // History
41 //==============================================================================
42 // 05/30/01  Initial version
43 // 12/04/01  Rewritten version with erf-like algorithm.
44 //           Performance improved.
45 // 05/20/02  Cleaned up namespace and sf0 syntax
46 // 08/14/02  Changed mli templates to mlx
47 // 02/10/03  Reordered header: .section, .global, .proc, .align
48 // 03/31/05  Reformatted delimiters between data tables
49 //
50 // API
51 //==============================================================================
52 // double tanh(double)
53 //
54 // Overview of operation
55 //==============================================================================
56 //
57 // Algorithm description
58 // ---------------------
59 //
60 // There are 4 paths:
61 //
62 // 1. Special path: x = 0, Inf, NaNs, denormals
63 //    Return tanh(x) = +/-0.0 for zeros
64 //    Return tanh(x) = QNaN for NaNs
65 //    Return tanh(x) = sign(x)*1.0 for Inf
66 //    Return tanh(x) = x + x^2   for - denormals
67 //    Return tanh(x) = x - x^2   for + denormals
68 //
69 // 2. Near zero path: 0.0 < |x| < 0.25
70 //    Return tanh(x) = x + x^3*A3 + ... + x^19*A19
71 //
72 // 3. Main path: 0.25 <= |x| < 19.0625
73 //    For several ranges of 0.25 <= |x| < 19.0625
74 //    Return tanh(x) = sign(x)*(A0 + y*A1 + y^2*A2 + 
75 //                                       + y^3*A3 + ... + y^19*A19)
76 //    where y = (|x|/a) - b
77 //    
78 //    For each range there is particular set of coefficients.
79 //    Below is the list of ranges:
80 //    1/4  <= |x| < 1/2     a = 0.25, b = 1.0
81 //    1/2  <= |x| < 1.0     a = 0.5,  b = 1.0
82 //    1.0  <= |x| < 2.0     a = 1.0,  b = 1.0
83 //    2.0  <= |x| < 3.25    a = 2.0,  b = 1.0
84 //    3.25 <= |x| < 4.0     a = 2.0,  b = 2.0
85 //    4.0  <= |x| < 6.5     a = 4.0,  b = 1.0
86 //    6.5  <= |x| < 8.0     a = 4.0,  b = 2.0
87 //    8.0  <= |x| < 13.0    a = 8.0,  b = 1.0
88 //    13.0 <= |x| < 16.0    a = 8.0,  b = 2.0
89 //    16.0 <= |x| < 19.0625 a = 16.0, b = 1.0
90 //    ( the [3.25;4.0], [6.5;8.0] and [13.0;16.0] subranges are handled
91 //      separately to resolve monotonicity issues )
92 //
93 // 4. Saturation path: 19.0625 <= |x| < +INF 
94 //    Return tanh(x) = sign(x)*(1.0 - tiny_value)
95 //    (tiny_value = 2^(-64): the table constant is 0xFFFFFFFFFFFFFFFF * 2^(-64))
96 //
97 // Registers used
98 //==============================================================================
99 // Floating Point registers used: 
100 // f8 = input, output
101 // f32 -> f64
102 //
103 // General registers used:  
104 // r32 -> r51, r2, r3
105 //
106 // Predicate registers used:
107 // p6, p7, p8, p10, p11, p12, p14, p15
108 // p6           arg is zero, denormal or special IEEE (special path: p6 = Inf, p7 = zero or NaN)
109 // p8           to filter out case when significand(x) > 1.625 
110 // p10          to filter out case when |x| < 0.25
111 // p11          to filter out case when significand(x) <= 1.625 
112 // p12          to filter out case when |x| >= 19.0625
113 // p14          set to 1 for positive x
114 // p15          set to 1 for negative x
115
116 // Assembly macros
117 //==============================================================================
118 rDataPtr           = r2   // address of tanh_data (loaded through its @ltoff entry)
119 rDataPtr1          = r3   // no use visible in this file
120
121 rBias              = r33  // 0x3FD00: exponent field of 0.25, shifted left by 8
122 rCoeffAddr3        = r34  // walks the "tail" tables: A3, A1
123 rThreeAndQ         = r35  // no use visible in this file
124 rCoeffAddr2        = r36  // walks A18, A16, ..., A4 (main path); A7/A6, A3/A2 (near zero)
125 rMask              = r37  // 0x7FF00: selects the exponent field of (x bits >> 44)
126 rArg               = r38  // raw IEEE double bits of x (getf.d)
127 rSignBit           = r39  // 1 << 63
128 rAbsArg            = r40  // rArg with the sign bit cleared
129 rSaturation        = r41  // 0x40331: (bits of 19.0625) >> 44
130 rIndex             = r42  // byte offset of the selected coefficient table
131 rCoeffAddr1        = r43  // walks A19, A17, ..., A5 (main path); A9/A8, A5/A4, A1 (near zero)
132 rCoeffAddr4        = r44  // walks the "tail" tables: A2, A0
133 rShiftedArg        = r45  // rArg >> 44 (unsigned shift)
134 rShiftedArgMasked  = r46  // rShiftedArg & rMask
135 rBiasedExpOf4      = r47  // no use visible in this file
136 rShiftedAbsArg     = r48  // rAbsArg >> 44
137 rArgSgnd           = r49  // first the sign+exponent mask, then the fraction bits of x
138 r1625Sgnd          = r50  // 0xA000000000000: fraction bits of 1.625
139 rTwo               = r51  // 0x4000000000000000: bits of 2.0
140
141 //==============================================================================
142 fA0                = f32  // fA0..fA19 hold the polynomial coefficients A0..A19
143 fA1                = f33
144 fA2                = f34
145 fA3                = f35
146 fA4                = f36
147 fA5                = f37
148 fA6                = f38
149 fA7                = f39
150 fA8                = f40
151 fA9                = f41
152 fA10               = f42
153 fA11               = f43
154 fA12               = f44
155 fA13               = f45
156 fA14               = f46
157 fA15               = f47
158 fA16               = f48
159 fA17               = f49
160 fA18               = f50
161 fA19               = f51
162 fArgSqr            = f52  // x^2
163 fArgAbsNorm        = f53  // significand of x with sign/exponent of 1.0, then y
164 fSignumX           = f54  // 1.0 carrying the sign of x
165 fRes               = f55  // polynomial accumulator
166 fThreeAndQ         = f56  // no use visible in this file
167 fArgAbs            = f57  // |x| (written once, not read in the visible code)
168 fTSqr              = f58  // y^2 on the main path, x^4 on the near-zero path
169 fTQuadr            = f59  // y^4 on the main path, x^8 on the near-zero path
170 fTDeg3             = f60  // y^3
171 fTDeg7             = f61  // y^7
172 fArgAbsNormSgn     = f62                          // sign(x)*y
173 fTQuadrSgn         = f63  // x^9 (written on the near-zero path, not read afterwards)
174 fTwo               = f64  // 2.0
175
176 // Data tables
177 //==============================================================================
178 RODATA
179
180 .align 16
181
182 LOCAL_OBJECT_START(tanh_data)
183 // CAUTION: The order of these table coefficients shouldn't be changed!
184 // The code addresses every table by a fixed byte offset from tanh_data.
185 // Main path coefficients:
186 // Coefficients A19..A4 ("main" coefficient tables): 16 slots of 16 bytes = 0x100 bytes per range
187 // Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5  (offset 0x000)
188 data8 0xE9D218BC9A3FB55A, 0x00003FC7 //A19
189 data8 0xC8C0D38687F36EBA, 0x00003FCE //A18
190 data8 0xA2663E519FAC8A43, 0x0000BFD2 //A17
191 data8 0xD913F0490674B0DF, 0x00003FD3 //A16
192 data8 0xF75D84789DE0AE52, 0x00003FD6 //A15
193 data8 0xACB3C40EEF3A06F0, 0x0000BFD9 //A14
194 data8 0xEBD7F5DC02CFD5BA, 0x0000BFDB //A13
195 data8 0x8B52CDF66D709E2A, 0x00003FDF //A12
196 data8 0x9EC21F28E05C4A3E, 0x00003FE0 //A11
197 data8 0xC412B44D0176F3ED, 0x0000BFE4 //A10
198 data8 0x97BF35A34DD1EA4C, 0x0000BFE0 //A9
199 data8 0xF89F5B39E3A3AA36, 0x00003FE9 //A8
200 data8 0xF2BA654BCEEBA433, 0x0000BFEA //A7
201 data8 0x8E1C15876AA589AD, 0x0000BFEF //A6
202 data8 0x942226246A8C2A86, 0x00003FF1 //A5
203 data8 0x8F06D9FF7DB47261, 0x00003FF4 //A4
204 //
205 // Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0  (offset 0x100)
206 data8 0xC4A7B8FB672A8520, 0x00003FDC //A19
207 data8 0xA20724B847E13499, 0x0000BFE0 //A18
208 data8 0xE17DB53F02E4D340, 0x00003FE2 //A17
209 data8 0x90264A1012F4CA6F, 0x0000BFE4 //A16
210 data8 0xEBEC9F776F0BF415, 0x0000BFE0 //A15
211 data8 0x89AF912B305B45A4, 0x00003FE7 //A14
212 data8 0xB4A960B81F5EC36A, 0x0000BFE7 //A13
213 data8 0x969A4E95B2DA86B5, 0x0000BFEA //A12
214 data8 0x8A3FC0EC082305CB, 0x00003FEC //A11
215 data8 0x83D7795BCBE24373, 0x00003FEC //A10
216 data8 0xDCBF42AEB82932EC, 0x0000BFEF //A9
217 data8 0x83318E61ECAFD804, 0x00003FF0 //A8
218 data8 0xEA4DE5746975A914, 0x00003FF2 //A7
219 data8 0xCE63E8FA6B96480B, 0x0000BFF4 //A6
220 data8 0xDF017BE0D4FE45D8, 0x0000BFF4 //A5
221 data8 0xA8A0C6E2226DF3CD, 0x00003FF8 //A4
222 //
223 // Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0  (offset 0x200)
224 data8 0x8E89D2EBFDAA160B, 0x00003FE9 //A19
225 data8 0xDD9226310A272046, 0x0000BFEC //A18
226 data8 0xA038042D28B0D665, 0x00003FEF //A17
227 data8 0x8C04796F03516306, 0x0000BFF1 //A16
228 data8 0x9CD6A9CB4E90A2FD, 0x00003FF2 //A15
229 data8 0xC8980E166F5A84FD, 0x0000BFF2 //A14
230 data8 0x9ADFE65F56B7BCFD, 0x00003FED //A13
231 data8 0x8B11FDFB5D0A7B96, 0x00003FF4 //A12
232 data8 0x8209A125E829CBFA, 0x0000BFF5 //A11
233 data8 0xCF38AAC17B85BD76, 0x00003FF1 //A10
234 data8 0xD5C2E248D8AB99AB, 0x00003FF6 //A9
235 data8 0xE12BE2785727F2D6, 0x0000BFF7 //A8
236 data8 0x9FC9EF90F87BF1E2, 0x00003FF6 //A7
237 data8 0x9B02FE0DAF42C08F, 0x00003FF9 //A6
238 data8 0xBDACE06F531D9491, 0x0000BFFA //A5
239 data8 0xE3048AD1DB2F648C, 0x00003FF9 //A4
240 //
241 // Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25  (offset 0x300)
242 data8 0x856EC3B0330A385A, 0x00003FEB //A19
243 data8 0xC641D69DAE2D429C, 0x0000BFF2 //A18
244 data8 0xC683EB0BE1343FFF, 0x00003FF5 //A17
245 data8 0xC358954224E4E823, 0x0000BFF7 //A16
246 data8 0xF813A8D6D396BC5F, 0x00003FF8 //A15
247 data8 0xE0ECDFED078D37D6, 0x0000BFF9 //A14
248 data8 0x950E4E619855E316, 0x00003FFA //A13
249 data8 0x8453B8F93370FB58, 0x0000BFFA //A12
250 data8 0xFDBA28430AEC95BA, 0x00003FF7 //A11
251 data8 0x9371AAC1FDB1E664, 0x00003FFA //A10
252 data8 0xAC972DA97782D88A, 0x0000BFFB //A9
253 data8 0xE18F47B10B9CE1BC, 0x00003FFB //A8
254 data8 0xAB7C81230BF13BC6, 0x0000BFFB //A7
255 data8 0xA6CAAD4A3E31A7D5, 0x0000BFF8 //A6
256 data8 0x9CABD76D1D5C3878, 0x00003FFC //A5
257 data8 0x92906D077941CAA9, 0x0000BFFD //A4
258 //
259 // Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5  (offset 0x400)
260 data8 0x9232D19F71709AC9, 0x0000BFF5 //A19
261 data8 0x819E31323F5DD3F8, 0x00003FF8 //A18
262 data8 0xDA8E1CDB8D23DC29, 0x0000BFF9 //A17
263 data8 0xE97C7CD8FC0486D8, 0x00003FFA //A16
264 data8 0xB0C4AD234D88C9F2, 0x0000BFFB //A15
265 data8 0xC5989BFB28FDE267, 0x00003FFB //A14
266 data8 0x9B26520EC4EFEE8E, 0x0000BFFB //A13
267 data8 0xC4B6F758AD21E574, 0x00003FF9 //A12
268 data8 0xCC36E3FFA10D2CFF, 0x00003FFA //A11
269 data8 0x8738696FB06A5CED, 0x0000BFFC //A10
270 data8 0xD31981825BF39228, 0x00003FFC //A9
271 data8 0x82C58FB9BEE43992, 0x0000BFFD //A8
272 data8 0x88D5AAE49164B6F3, 0x00003FFD //A7
273 data8 0xF4CA0B968AF2DDE2, 0x0000BFFC //A6
274 data8 0xB99874B482BD17EE, 0x00003FFC //A5
275 data8 0xE93FB2F99431DC1D, 0x0000BFFB //A4
276 //
277 // Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0  (offset 0x500)
278 data8 0xAAA9EB7EADA85CEC, 0x00003FF5 //A19
279 data8 0x980C80EE05A6BE78, 0x0000BFF8 //A18
280 data8 0x818DA9F5396390A5, 0x00003FFA //A17
281 data8 0x8D8CC21E23D8A6A2, 0x0000BFFB //A16
282 data8 0xE0EC19E55A886765, 0x00003FFB //A15
283 data8 0x8C11197A7E6244C5, 0x0000BFFC //A14
284 data8 0x901D2BF203C2F7F3, 0x00003FFC //A13
285 data8 0xFEACAEE66EE803E5, 0x0000BFFB //A12
286 data8 0xC684E4925E318C3F, 0x00003FFB //A11
287 data8 0x8A9D8A970565F28D, 0x0000BFFB //A10
288 data8 0xAE34C61DE5CEA4D4, 0x00003FFA //A9
289 data8 0xC44C5714BD6208A0, 0x0000BFF9 //A8
290 data8 0xC4612F7D6C8BDB79, 0x00003FF8 //A7
291 data8 0xABD91DCE40D5EECB, 0x0000BFF7 //A6
292 data8 0x80E375C1B847B72F, 0x00003FF6 //A5
293 data8 0xA11C7DD978CF700A, 0x0000BFF4 //A4
294 //
295 // Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625  (offset 0x600)
296 data8 0xE29D17C510F86F6B, 0x00003FF3 //A19
297 data8 0x88FE52EB39A3A98C, 0x0000BFF5 //A18
298 data8 0xA406547E50360693, 0x00003FF5 //A17
299 data8 0x83E6260B71C6D7DE, 0x0000BFF5 //A16
300 data8 0xA36AB5B0CBC97B85, 0x00003FF4 //A15
301 data8 0xA94931E0B7BA6C14, 0x0000BFF3 //A14
302 data8 0x9A4596DAF350AD63, 0x00003FF2 //A13
303 data8 0xFE47643F375AECA5, 0x0000BFF0 //A12
304 data8 0xBF8433C5ABEE63B1, 0x00003FEF //A11
305 data8 0x83CEE05D7AE90A0A, 0x0000BFEE //A10
306 data8 0xA4CC45480BCEB02D, 0x00003FEC //A9
307 data8 0xB967CBDCBC16CB10, 0x0000BFEA //A8
308 data8 0xB9681B214EDC098D, 0x00003FE8 //A7
309 data8 0xA23B20D87B80DFA8, 0x0000BFE6 //A6
310 data8 0xF358B2C46F10CBAF, 0x00003FE3 //A5
311 data8 0x98176FD06229A385, 0x0000BFE1 //A4
312 //
313 // Binary subranges (reached by adding 0x400 to the main-table offset)
314 // Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0  (offset 0x700)
315 data8 0xEF2EE841288F6706, 0x00003FE9 //A19
316 data8 0xE65D5B74B85F82A6, 0x00003FEB //A18
317 data8 0xE495FC21E42A79FF, 0x00003FEA //A17
318 data8 0xF99B267A913CF3E5, 0x00003FEC //A16
319 data8 0xFE3D700F4A0A0FDE, 0x0000BFEC //A15
320 data8 0x8F91BB4EE4E4EA52, 0x00003FEE //A14
321 data8 0xBCA9F41A5C6EF8BA, 0x0000BFEE //A13
322 data8 0xF93E00884027A9CF, 0x00003FED //A12
323 data8 0xC4D4036A61BABC2F, 0x00003FEF //A11
324 data8 0x86CC2AD1AD47C7D5, 0x0000BFF2 //A10
325 data8 0xD3065DEF4CE9AD32, 0x00003FF3 //A9
326 data8 0x82C44125F568D54E, 0x0000BFF5 //A8
327 data8 0x88D588729BAF14CA, 0x00003FF6 //A7
328 data8 0xF4CA0661307243C7, 0x0000BFF6 //A6
329 data8 0xB998746D57061F74, 0x00003FF7 //A5
330 data8 0xE93FB2F482327C19, 0x0000BFF7 //A4
331 //
332 // Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0  (offset 0x800)
333 data8 0xEB189B71ADC40BE2, 0x00003FEA //A19
334 data8 0xA60B46F9FF6DC2DF, 0x00003FEA //A18
335 data8 0xBB061CDD9F368B9D, 0x00003FEC //A17
336 data8 0x841E08BDF5429991, 0x0000BFEC //A16
337 data8 0xDD33990B433F25BE, 0x00003FED //A15
338 data8 0xBA5DE6B870F0A2BB, 0x0000BFEE //A14
339 data8 0xA71D489AAA6DACF0, 0x00003FEF //A13
340 data8 0x874CCB2B8F3FBC0E, 0x0000BFF0 //A12
341 data8 0xCB1D2E9754EA534A, 0x00003FF0 //A11
342 data8 0x8BA5ABB53BA6ABCF, 0x0000BFF1 //A10
343 data8 0xAE91FD1C2391A32B, 0x00003FF1 //A9
344 data8 0xC465A74B798E5761, 0x0000BFF1 //A8
345 data8 0xC4666152397D15C1, 0x00003FF1 //A7
346 data8 0xABD9E63CA575B950, 0x0000BFF1 //A6
347 data8 0x80E38B18E8D0F460, 0x00003FF1 //A5
348 data8 0xA11C80E20AAFDD3C, 0x0000BFF0 //A4
349 //
350 // Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0  (offset 0x900)
351 data8 0xBECD0AF7E22E5594, 0x00003FE9 //A19
352 data8 0xE2834E2D68C1128C, 0x00003FEA //A18
353 data8 0x97B117611B317379, 0x00003FEB //A17
354 data8 0xEE91A0D39A772F6B, 0x00003FEA //A16
355 data8 0x92F6EC377DCADA4F, 0x00003FEA //A15
356 data8 0xD8FCCD6A3277FAB7, 0x00003FE8 //A14
357 data8 0xC15AB9CB0C3DCFE0, 0x00003FE7 //A13
358 data8 0xC3C659704A7147CD, 0x00003FE2 //A12
359 data8 0xFA17F09D27C97912, 0x00003FE4 //A11
360 data8 0xF664147182B94788, 0x0000BFE3 //A10
361 data8 0xA6C89FA741464DA1, 0x00003FE3 //A9
362 data8 0xB90FE464A825EFA8, 0x0000BFE2 //A8
363 data8 0xB973AE0FD86EC024, 0x00003FE1 //A7
364 data8 0xA23A087F96846951, 0x0000BFE0 //A6
365 data8 0xF358D8A7FC012D5D, 0x00003FDE //A5
366 data8 0x98176E2309B7C73A, 0x0000BFDD //A4
367 //
368 // Coefficients A3..A0 ("tail" coefficient tables): 4 slots = 0x40 bytes per range, starting at 0xA00
369 // Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5  (offset 0xA00)
370 data8 0x838F209ABB9BA7B3, 0x0000BFF7 //A3
371 data8 0xEBC0AC78DA4FC500, 0x0000BFF8 //A2
372 data8 0xF0A4D02960B60E69, 0x00003FFC //A1
373 data8 0xFACBF534D0E42F8A, 0x00003FFC //A0
374 //
375 // Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0  (offset 0xA40)
376 data8 0xC0ECBDC0A0D133A6, 0x0000BFF8 //A3
377 data8 0xBA13A076BF8E812F, 0x0000BFFB //A2
378 data8 0xC954A37D1A1CA070, 0x00003FFD //A1
379 data8 0xEC9A9EBAB4579B29, 0x00003FFD //A0
380 //
381 // Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0  (offset 0xA80)
382 data8 0xD42E9175A6EA1397, 0x00003FFB //A3
383 data8 0xA3C361378A55CF56, 0x0000BFFD //A2
384 data8 0xD706E07CC8622983, 0x00003FFD //A1
385 data8 0xC2F7D5A8A79CA2AC, 0x00003FFE //A0
386 //
387 // Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25  (offset 0xAC0)
388 data8 0xAC7A7F8776817C7E, 0x00003FFD //A3
389 data8 0x8B7CE95E69FCFE9A, 0x0000BFFD //A2
390 data8 0x90B161317028D995, 0x00003FFC //A1
391 data8 0xF6CA82F0DE1E9E9A, 0x00003FFE //A0
392 //
393 // Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5  (offset 0xB00)
394 data8 0xE9E072407BC22DC6, 0x00003FFA //A3
395 data8 0xAFA4A913D8E6BB4A, 0x0000BFF9 //A2
396 data8 0xAFC2D6A885BAA875, 0x00003FF7 //A1
397 data8 0xFFD40B84505A10B2, 0x00003FFE //A0
398 //
399 // Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0  (offset 0xB40)
400 data8 0xA11C8A1FED168CD5, 0x00003FF2 //A3
401 data8 0xF1AAD6B02063A5F5, 0x0000BFEF //A2
402 data8 0xF1AADA46AD341C34, 0x00003FEC //A1
403 data8 0xFFFFFC39548FC34B, 0x00003FFE //A0
404 //
405 // Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625  (offset 0xB80)
406 data8 0x98176FD1F0950C16, 0x00003FDE //A3
407 data8 0xE42327BB09C8B2A5, 0x0000BFDA //A2
408 data8 0xE42327BB0B154F13, 0x00003FD6 //A1
409 data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
410 //
411 // Binary subranges
412 // Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0  (offset 0xBC0)
413 data8 0xE9E072404329293B, 0x00003FF7 //A3
414 data8 0xAFA4A913D798300B, 0x0000BFF7 //A2
415 data8 0xAFC2D6A885B48567, 0x00003FF6 //A1
416 data8 0xFFD40B84505A10B4, 0x00003FFE //A0
417 //
418 // Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0  (offset 0xC00)
419 data8 0xA11C8A63815F7A28, 0x00003FEF //A3
420 data8 0xF1AAD6B65B0EBF53, 0x0000BFED //A2
421 data8 0xF1AADA46E799831F, 0x00003FEB //A1
422 data8 0xFFFFFC39548FC348, 0x00003FFE //A0
423 //
424 // Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0  (offset 0xC40)
425 data8 0x98176FE982140A59, 0x00003FDB //A3
426 data8 0xE42327B9B0D7202F, 0x0000BFD8 //A2
427 data8 0xE42327BB13076BD6, 0x00003FD5 //A1
428 data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
429 //
430 // Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.25  (offset 0xC80)
431 // ('tanh_near_zero' path; 8-byte doubles read in pairs by ldfpd, A1 is a 16-byte ldfe slot)
432 data8 0xBF2BA5D26E479D0C //A9
433 data8 0x3F4336D96F81EE26 //A8
434 data8 0xBF8226E34AE197B0 //A5
435 data8 0x3F9664F488148657 //A4
436 data8 0xAAAAAAAAAAAAAA99, 0x0000BFFD //A1
437 data8 0xBF57D91925BB5EE2 //A7  (offset 0xCB0)
438 data8 0x3F6D6D36C3D5B7A1 //A6
439 data8 0xBFABA1BA1BA19D32 //A3
440 data8 0x3FC1111111111108 //A2
441 //
442 // 1.0 - 2^(-64): all-ones significand with exponent 0x3FFE  (offset 0xCD0)
443 // ('tanh_saturation' path)
444 data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE 
445 LOCAL_OBJECT_END(tanh_data)
446
447 // CAUTION: The order of table coefficients shouldn't be changed!
448
449
450 .section .text
451 GLOBAL_LIBM_ENTRY(tanh)
452 { .mfi   // double tanh(double): x arrives in f8, the result is returned in f8
453       alloc          r32         = ar.pfs, 0, 20, 0, 0 // 20 locals: r32..r51
454       fmerge.se      fArgAbsNorm = f1, f8         // significand of x, sign/exp of 1.0
455       adds           rSignBit    = 0x1, r0        // Bit for sign removing
456 }
457 { .mfi
458       addl           rDataPtr    = @ltoff(tanh_data), gp // Data pointer
459       fma.s1         fTwo        = f1, f1, f1            // 2.0 construct
460       addl           rArgSgnd    = 0xfff, r0             // sign+exponent mask, unshifted
461 };;
462 // ---- classify x and fetch the table address
463 { .mfi
464       getf.d         rArg        = f8       // x in GR 
465       fclass.m       p6,p0       = f8, 0xEF // Filter 0, denormals and specials 
466                             // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
467       shl            rArgSgnd    = rArgSgnd, 52  // sign+exponent mask in place
468 }
469 { .mlx
470       ld8            rDataPtr    = [rDataPtr]        // Real data pointer
471       movl           r1625Sgnd   = 0xA000000000000   // fraction bits of 1.625
472       // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0
473       // to enter binary subranges
474 };;
475 // ---- integer constants used for the range selection
476 { .mfi
477       addl           rBias       = 0x3FD00, r0       // bias of 0.25 << 8
478       fma.s1         fArgSqr     = f8, f8, f0        // x^2
479       shl            rSignBit    = rSignBit, 63      // mask for sign bit
480 }
481 { .mlx
482       addl           rMask       = 0x7FF00, r0          // Mask for index bits
483       movl           rTwo        = 0x4000000000000000   // 2.0
484 };;
485 // ---- split the bits of x; leave for the special path when p6 is set
486 { .mfi
487       andcm          rArgSgnd    = rArg, rArgSgnd // keep only the fraction bits of x
488       nop.f          0
489       shr.u          rShiftedArg = rArg, 44 // Select only necessary bits of arg
490 }
491 { .mfb
492       andcm          rAbsArg     = rArg, rSignBit     // Remove sign
493       nop.f          0
494 (p6)  br.cond.spnt   _tanh_spec    // Branch to zero, denorm & specs
495 };;
496    
497 { .mfi
498       and            rShiftedArgMasked = rShiftedArg, rMask // exponent field of x << 8
499       fmerge.s       fArgAbs     = f1, f8                   // |x|
500       shr            rShiftedAbsArg    = rAbsArg, 44 // Select only necessary 
501                                                      // bits of absolute arg
502 }
503 { .mfi
504       cmp.gt         p8, p11     = rArgSgnd, r1625Sgnd // p8 = 1 if
505       // signd(x) > 1.625 - to filter values greater than 3.25, 6.5, 13.0
506       nop.f          0
507       nop.i          0
508 };;
509 // ---- table offset from the exponent; leave for the near-zero path when |x| < 0.25
510 { .mfi
511       sub            rIndex      = rShiftedArgMasked, rBias // index << 8
512       nop.f          0 
513       cmp.lt         p10, p0     = rShiftedArgMasked, rBias // p10=1 if |x|<0.25
514 }
515 { .mfb
516 (p8)  cmp.gt         p8, p11     = rAbsArg, rTwo // p8 stays set only if also |x| > 2.0
517                                        // (then we should use binary subranges)
518       nop.f          0 
519 (p10) br.cond.spnt   tanh_near_zero    // branch out if |x| < 0.25
520 };;
521 // ---- reduced argument y: significand - 1.0 (p11) or significand - 2.0 (p8)
522 .pred.rel "mutex",p8,p11
523 { .mfi
524 (p8)  add            rIndex      = 0x400, rIndex // Make pointer to binary 
525                                                  // subranges
526 (p11) fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f1     // |x|/b - 1.0
527       addl           rSaturation = 0x40331, r0 // shifted bits of 19.0625
528 }
529 { .mfi
530       nop.m          0 
531 (p8)  fms.s1         fArgAbsNorm = fArgAbsNorm, f1, fTwo // |x|/b - 2.0
532        // this is only for binary subranges [3.25;4], [6.5;8], [13.0;16]
533       nop.i          0 
534 }
535 ;;
536 // ---- coefficient pointers; leave for the saturation path when |x| >= 19.0625
537 { .mfi
538       add            rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14
539       nop.f          0
540       nop.i          0
541 };;
542
543 { .mfi
544       adds           rCoeffAddr2 = 16, rCoeffAddr1 // Shifted pointer to coeffs
545       fmerge.s       fSignumX    = f8, f1          // signum(x)
546       nop.i          0
547 }  // closes the .mfi bundle; the brace was missing, leaving the bundle unterminated
548 { .mfb
549       cmp.le         p12, p0     = rSaturation, rShiftedAbsArg // |x|>=19.0625?
550       nop.f          0
551 (p12) br.cond.spnt   tanh_saturation          // branch out if x |x| >= 19.0625
552 };;
553
554 {.mfi
555       ldfe           fA19        = [rCoeffAddr1], 32 // Load A19
556       nop.f          0
557       nop.i          0
558 }
559 {.mfi
560       ldfe           fA18        = [rCoeffAddr2], 32 // Load A18
561       nop.f          0
562       adds           rCoeffAddr3 = 0xA00, rDataPtr   // Pointer to "tail"
563                                                      // coefficients tables
564 };;
565
566 {.mfi
567       ldfe           fA17        = [rCoeffAddr1], 32 // Load A17
568       nop.f          0
569       nop.i          0
570 }
571 {.mfi
572       ldfe           fA16        = [rCoeffAddr2], 32 // Load A16
573       nop.f          0
574       nop.i          0
575 };;
576
577 {.mfi
578       ldfe           fA15        = [rCoeffAddr1], 32 // Load A15
579       fma.s1         fTSqr       = fArgAbsNorm, fArgAbsNorm, f0 // y^2
580       shr.u          rIndex      = rIndex, 2 // "tail" tables are 4x smaller (0x40 bytes)
581 }
582 {.mfi
583       ldfe           fA14        = [rCoeffAddr2], 32 // Load A14
584       nop.f          0
585       adds           rCoeffAddr4 = 16, r0            // Shifter pointer
586                                                      // to "tail" tables
587 };;
588
589 {.mfi
590       ldfe           fA13        = [rCoeffAddr1], 32   // Load A13
591       nop.f          0
592       add            rCoeffAddr3 = rCoeffAddr3, rIndex // address of A3 for this range
593                                                        // (then A1)
594 }
595 {.mfi
596       ldfe           fA12        = [rCoeffAddr2], 32 // Load A12
597       nop.f          0
598       cmp.lt         p15, p14    = rArg, r0          // Arg positive (p14) 
599                                                      // or negative (p15)?
600 };;
601
602 {.mfi
603       ldfe           fA11        = [rCoeffAddr1], 32        // Load A11
604       nop.f          0
605       add            rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // address of A2 for this
606                                                             // range (then A0)
607 }
608 {.mfi
609       ldfe           fA10        = [rCoeffAddr2], 32 // Load A10
610       nop.f          0
611       nop.i          0
612 };;
613
614 {.mfi
615       ldfe           fA9         = [rCoeffAddr1], 32 // Load A9
616       nop.f          0
617       nop.i          0
618 }
619 {.mfi
620       ldfe           fA8         = [rCoeffAddr2], 32 // Load A8
621       nop.f          0
622       nop.i          0
623 };;
624
625 {.mfi
626       ldfe           fA7         = [rCoeffAddr1], 32 // Load A7
627       nop.f          0
628       nop.i          0
629 }
630 {.mfi
631       ldfe           fA6         = [rCoeffAddr2], 32 // Load A6
632       nop.f          0
633       nop.i          0
634 };;
635
636 {.mfi
637       ldfe           fA5         = [rCoeffAddr1], 32 // Load A5
638       fma.s1         fTDeg3      = fArgAbsNorm, fTSqr, f0 // y^3
639       nop.i          0
640 }
641 {.mfi
642       ldfe           fA4         = [rCoeffAddr2], 32 // Load A4
643       fma.s1         fTQuadr     = fTSqr, fTSqr, f0  // y^4
644       nop.i          0
645 };;
646
647 // Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
648 {.mfi
649       ldfe           fA3         = [rCoeffAddr3], 32            // Load A3
650       fma.s1         fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0 // sign(x)*y
651       nop.i          0
652 }
653 {.mfi
654       ldfe           fA2         = [rCoeffAddr4], 32            // Load A2
655       nop.f          0
656       nop.i          0
657 };;
658
659 {.mfi
660       ldfe           fA1         = [rCoeffAddr3], 32       // Load A1
661       fma.s1         fRes        = fA19, fArgAbsNorm, fA18 // fRes = A19*y + A18
662       nop.i          0
663 }
664 {.mfi
665       ldfe           fA0         = [rCoeffAddr4], 32       // Load A0
666       nop.f          0
667       nop.i          0
668 };;
669
670 { .mfi
671       nop.m          0
672       fma.s1         fA17        = fA17, fArgAbsNorm, fA16  // fA17 = A17*y + A16
673       nop.i          0
674 };;
675
676 { .mfi
677       nop.m          0
678       fma.s1         fA15        = fA15, fArgAbsNorm, fA14  // fA15 = A15*y + A14
679       nop.i          0
680 };;
681
682 { .mfi
683       nop.m          0
684       fma.s1         fTDeg7      = fTDeg3, fTQuadr, f0     // y^7
685       nop.i          0
686 }
687 { .mfi
688       nop.m          0
689       fma.s1         fA13        = fA13, fArgAbsNorm, fA12 // fA13 = A13*y + A12
690       nop.i          0
691 };;
692
693 { .mfi
694       nop.m          0
695       fma.s1         fA11        = fA11, fArgAbsNorm, fA10 // fA11 = A11*y + A10
696       nop.i          0
697 };;
698
699 { .mfi
700       nop.m          0
701       fma.s1         fA9         = fA9, fArgAbsNorm, fA8   // fA9 = A9*y + A8
702       nop.i          0
703 };;
704
705 { .mfi
706       nop.m          0
707       fma.s1         fRes        = fRes, fTSqr, fA17       // fRes = A19*y^3 + ... + A16
708       nop.i          0
709 }
710 { .mfi
711       nop.m          0
712       fma.s1         fA7         = fA7, fArgAbsNorm, fA6 // fA7 = A7*y + A6
713       nop.i          0
714 };;
715
716 { .mfi
717       nop.m          0
718       fma.s1         fA5         = fA5, fArgAbsNorm, f0  // fA5 = A5*y
719       nop.i          0
720 };;
721
722 { .mfi
723       nop.m          0
724       fma.s1         fA15        = fA15, fTSqr, fA13     // fA15 = A15*y^3 + ... + A12
725       nop.i          0
726 }
727 { .mfi
728       nop.m          0
729       fma.s1         fA4         = fA4, fArgAbsNorm, fA3 // fA4 = A4*y + A3
730       nop.i          0
731 };;
732
733 { .mfi
734       nop.m          0
735       fma.s1         fA2         = fA2, fArgAbsNorm, fA1 // fA2 = A2*y + A1
736       nop.i          0
737 };;
738
739 { .mfi
740       nop.m          0
741       fma.s1         fA11        = fA11, fTSqr, fA9 // fA11 = A11*y^3 + ... + A8
742       nop.i          0
743 };;
744
745 { .mfi
746       nop.m          0                                       
747       fma.s1         fA7         = fA7, fTSqr, fA5  // fA7 = A7*y^3 + A6*y^2 + A5*y
748       nop.i          0
749 };;
750
751 { .mfi
752       nop.m          0                                       
753       fma.s1         fRes        = fRes, fTQuadr, fA15 // fRes = A19*y^7 + ... + A12
754       nop.i          0
755 };;
756
757 { .mfi
758       nop.m          0                                       
759       fma.s1         fA4         = fA4, fTSqr, fA2     // fA4 = A4*y^3 + ... + A1
760       nop.i          0
761 };;
762
763 { .mfi
764       nop.m          0
765       fma.s1         fRes        = fRes, fTQuadr, fA11 // fRes = A19*y^11 + ... + A8
766       nop.i          0
767 };;
768
769 { .mfi
770       nop.m          0                                       
771       fma.s1         fA4         = fA7, fTDeg3, fA4    // fA4 = A7*y^6 + ... + A1
772       nop.i          0
773 };;
774
775 { .mfi
776       nop.m          0
777       fma.s1         fRes        = fRes,  fTDeg7, fA4  // fRes = A19*y^18 + ... + A1
778       nop.i          0
779 };;
780
781 { .mfi
782       nop.m          0
783       // result for negative argument
784 (p15) fms.d.s0       f8          = fRes, fArgAbsNormSgn, fA0 // -(fRes*y + A0)
785       nop.i          0
786 }
787 { .mfb
788       nop.m          0
789       // result for positive argument
790 (p14) fma.d.s0       f8          = fRes, fArgAbsNormSgn, fA0 // fRes*y + A0
791       br.ret.sptk    b0
792 };;
793
794
795 // |x| < 0.25 Path /////////////////////////////////////////////////////////////
796 .align 32
797 tanh_near_zero:
798 { .mfi
799       adds           rCoeffAddr1 = 0xC80, rDataPtr      // address of A9
800       fma.s0         fTSqr       = fArgSqr, fArgSqr, f0 // x^4 
801       nop.i          0
802 }
803 { .mfi
804       adds           rCoeffAddr2 = 0xCB0, rDataPtr      // address of A7
805       nop.f          0
806       nop.i          0
807 };;
808
809 { .mfi
810       ldfpd          fA9, fA8    = [rCoeffAddr1], 16 // Load A9, A8
811       nop.f          0
812       nop.i          0
813 }
814 { .mfi
815       ldfpd          fA7, fA6    = [rCoeffAddr2], 16 // Load A7, A6
816       nop.f          0
817       nop.i          0
818 };;
819
820 { .mfi
821       ldfpd          fA5, fA4    = [rCoeffAddr1], 16 // Load A5, A4
822       nop.f          0
823       nop.i          0
824 }
825 { .mfi
826       ldfpd          fA3, fA2    = [rCoeffAddr2], 16 // Load A3, A2
827       nop.f          0
828       nop.i          0
829 };;
830
831 { .mfi
832       ldfe           fA1         = [rCoeffAddr1] // Load A1
833       nop.f          0
834       nop.i          0
835 };;
836
837 { .mfi
838       nop.m          0
839       fma.s1         fTQuadr     = fTSqr, fTSqr, f0 // x^8
840       nop.i          0
841 };;
842
843 { .mfi
844       nop.m          0
845       fma.s1         fRes        = fA9, fArgSqr, fA8 // fRes = A9*x^2 + A8
846       nop.i          0
847 }
848 { .mfi
849       nop.m          0
850       fma.s1         fA7         = fA7, fArgSqr, fA6 // fA7 = A7*x^2 + A6
851       nop.i          0
852 };;
853
854 { .mfi
855       nop.m          0
856       fma.s1         fA3         = fA3, fArgSqr, fA2 // fA3 = A3*x^2 + A2
857       nop.i          0
858 }
859 { .mfi
860       nop.m          0
861       fma.s1         fA5         = fA5, fArgSqr, fA4 // fA5 = A5*x^2 + A4
862       nop.i          0
863 };;
864
865 { .mfi
866       nop.m          0
867       fma.s1         fA1         = fA1, fArgSqr, f0 // fA1 = A1*x^2
868       nop.i          0
869 }
870 { .mfi
871       nop.m          0
872       fma.s1         fTQuadrSgn  = fTQuadr, f8, f0  // x^8 * x; not read again in the visible code
873       nop.i          0
874 };;
875
876 { .mfi
877       nop.m          0
878       fma.s1         fRes        = fRes, fTSqr, fA7 // fRes = A9*x^6 + A8*x^4 + A7*x^2 + A6
879       nop.i          0
880 };;
881
882 { .mfi
883       nop.m          0
884       fma.s1         fA1         = fA3, fTSqr, fA1 // fA1 = A3*x^6 + A2*x^4 + A1*x^2
885       nop.i          0
886 };;
887
888 { .mfi
889       nop.m          0
890       fma.s1         fRes        = fRes, fTSqr, fA5 // fRes = A9*x^10 + ... + A5*x^2 + A4
891       nop.i          0
892 };;
893
894 { .mfi
895       nop.m          0
896       fma.s1         fRes        = fRes, fTQuadr, fA1 // fRes = A9*x^18 + ... + A1*x^2
897       nop.i          0
898 };;
899
900 { .mfb
901       nop.m          0
902       fma.d.s0       f8          = fRes, f8, f8 // x+x*Polynomial
903       br.ret.sptk    b0                         // Exit for |x| < 0.25
904 };;
905
906
907
908
909
910 // 19.0625 <= |x| < +inf Saturation path ///////////////////////////////////////
911 .align 32
912 tanh_saturation:
913 { .mfi
914       adds           rDataPtr    = 0xCD0, rDataPtr  // address of the last tanh_data entry
915       nop.f          0
916       nop.i          0
917 };;
918
919 { .mfi
920       ldfe           fA0         = [rDataPtr]       // Load A0 = 1.0 - 2^(-64)
921       nop.f          0
922       nop.i          0
923 };;
924
925 { .mfb
926       nop.m          0
927       fma.d.s0       f8          = fA0, fSignumX, f0 // sign(x)*A0, rounded to double
928       br.ret.sptk    b0                       // Exit for 19.0625 <=|x|< +inf
929 };;
930
931
932
933
934       
935 //  0, denormals and special IEEE numbers path /////////////////////////////////
936 _tanh_spec:
937 // Reached from the entry code when fclass.m with mask 0xEF matched f8.
938 { .mfi 
939       cmp.lt         p15, p14    = rArg, r0 // Is arg negative (p15) 
940                                             // or positive p14)
941       fclass.m       p6,p0       = f8, 0x23 // To filter infinities
942                                           // 0x23 = @pos|@neg|@inf 
943       nop.i          0
944 };;
945
946 { .mfi 
947       nop.m          0
948       fclass.m       p7,p0       = f8, 0xC7 // To filter NaNs & Zeros
949                                  // 0xC7 = @pos|@neg|@zero|@qnan|@snan
950       nop.i          0
951 };;
952
953 { .mfb 
954       nop.m          0
955 (p6)  fmerge.s       f8          = f8, f1     // +/-1 for INF args 
956 (p6)  br.ret.spnt    b0                       // exit for x = INF
957 };;
958
959 { .mfb 
960       nop.m          0
961 (p7)  fma.d.s0       f8          = f8, f1, f8    // x*1.0 + x: +/-0 for 0 args 
962                                                  // and NaNs for NaNs
963 (p7)  br.ret.spnt    b0                          // exit for x = NaN or +/-0
964 };;
965 // Only the remaining inputs (neither Inf, NaN nor zero) get here.
966 { .mfi 
967       nop.m          0
968       fnorm.s0       f8          = f8            // Normalize arg
969       nop.i          0
970 };;
971
972 .pred.rel "mutex",p14,p15
973 { .mfi 
974       nop.m          0
975 (p14) fnma.d.s0      f8          = f8, f8, f8  // positive arg: res = r-r^2
976       nop.i          0
977 }
978 { .mfb 
979       nop.m          0
980 (p15) fma.d.s0       f8          = f8, f8, f8  // negative arg: res = r+r^2
981       br.ret.sptk    b0          // 0, denormals, specials return
982 };;
983
984 GLOBAL_LIBM_END(tanh)
985
986