shared/fpsp/setox.S

0001 #include "fpsp-namespace.h"
0002 //
0003 //
0004 //  setox.sa 3.1 12/10/90
0005 //
0006 //  The entry point setox computes the exponential of a value.
0007 //  setoxd does the same except the input value is a denormalized
0008 //  number. setoxm1 computes exp(X)-1, and setoxm1d computes
0009 //  exp(X)-1 for denormalized X.
0010 //
0011 //  INPUT
0012 //  -----
0013 //  Double-extended value in memory location pointed to by address
0014 //  register a0.
0015 //
0016 //  OUTPUT
0017 //  ------
0018 //  exp(X) or exp(X)-1 returned in floating-point register fp0.
0019 //
0020 //  ACCURACY and MONOTONICITY
0021 //  -------------------------
0022 //  The returned result is within 0.85 ulps in 64 significant bit, i.e.
0023 //  within 0.5001 ulp to 53 bits if the result is subsequently rounded
0024 //  to double precision. The result is provably monotonic in double
0025 //  precision.
0026 //
0027 //  SPEED
0028 //  -----
0029 //  Two timings are measured, both in the copy-back mode. The
0030 //  first one is measured when the function is invoked the first time
0031 //  (so the instructions and data are not in cache), and the
0032 //  second one is measured when the function is reinvoked at the same
0033 //  input argument.
0034 //
0035 //  The program setox takes approximately 210/190 cycles for input
0036 //  argument X whose magnitude is less than 16380 log2, which
0037 //  is the usual situation. For the less common arguments,
0038 //  depending on their values, the program may run faster or slower --
0039 //  but no worse than 10% slower even in the extreme cases.
0040 //
0041 //  The program setoxm1 takes approximately ???/??? cycles for input
0042 //  argument X, 0.25 <= |X| < 70log2. For |X| < 0.25, it takes
0043 //  approximately ???/??? cycles. For the less common arguments,
0044 //  depending on their values, the program may run faster or slower --
0045 //  but no worse than 10% slower even in the extreme cases.
0046 //
0047 //  ALGORITHM and IMPLEMENTATION NOTES
0048 //  ----------------------------------
0049 //
0050 //  setoxd
0051 //  ------
0052 //  Step 1. Set ans := 1.0
0053 //
0054 //  Step 2. Return  ans := ans + sign(X)*2^(-126). Exit.
0055 //  Notes:  This will always generate one exception -- inexact.
0056 //
0057 //
0058 //  setox
0059 //  -----
0060 //
0061 //  Step 1. Filter out extreme cases of input argument.
0062 //      1.1 If |X| >= 2^(-65), go to Step 1.3.
0063 //      1.2 Go to Step 7.
0064 //      1.3 If |X| < 16380 log(2), go to Step 2.
0065 //      1.4 Go to Step 8.
0066 //  Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
0067 //       To avoid the use of floating-point comparisons, a
0068 //       compact representation of |X| is used. This format is a
0069 //       32-bit integer, the upper (more significant) 16 bits are
0070 //       the sign and biased exponent field of |X|; the lower 16
0071 //       bits are the 16 most significant fraction (including the
0072 //       explicit bit) bits of |X|. Consequently, the comparisons
0073 //       in Steps 1.1 and 1.3 can be performed by integer comparison.
0074 //       Note also that the constant 16380 log(2) used in Step 1.3
0075 //       is also in the compact form. Thus taking the branch
0076 //       to Step 2 guarantees |X| < 16380 log(2). There is no harm
0077 //       to have a small number of cases where |X| is less than,
0078 //       but close to, 16380 log(2) and the branch to Step 9 is
0079 //       taken.
0080 //
0081 //  Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
0082 //      2.1 Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
0083 //      2.2 N := round-to-nearest-integer( X * 64/log2 ).
0084 //      2.3 Calculate   J = N mod 64; so J = 0,1,2,..., or 63.
0085 //      2.4 Calculate   M = (N - J)/64; so N = 64M + J.
0086 //      2.5 Calculate the address of the stored value of 2^(J/64).
0087 //      2.6 Create the value Scale = 2^M.
0088 //  Notes:  The calculation in 2.2 is really performed by
0089 //
0090 //          Z := X * constant
0091 //          N := round-to-nearest-integer(Z)
0092 //
0093 //       where
0094 //
0095 //          constant := single-precision( 64/log 2 ).
0096 //
0097 //       Using a single-precision constant avoids memory access.
0098 //       Another effect of using a single-precision "constant" is
0099 //       that the calculated value Z is
0100 //
0101 //          Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).
0102 //
0103 //       This error has to be considered later in Steps 3 and 4.
0104 //
0105 //  Step 3. Calculate X - N*log2/64.
0106 //      3.1 R := X + N*L1, where L1 := single-precision(-log2/64).
0107 //      3.2 R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
0108 //  Notes:  a) The way L1 and L2 are chosen ensures L1+L2 approximate
0109 //       the value  -log2/64    to 88 bits of accuracy.
0110 //       b) N*L1 is exact because N is no longer than 22 bits and
0111 //       L1 is no longer than 24 bits.
0112 //       c) The calculation X+N*L1 is also exact due to cancellation.
0113 //       Thus, R is practically X+N(L1+L2) to full 64 bits.
0114 //       d) It is important to estimate how large can |R| be after
0115 //       Step 3.2.
0116 //
0117 //          N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)
0118 //          X*64/log2 (1+eps)   =   N + f,  |f| <= 0.5
0119 //          X*64/log2 - N   =   f - eps*X 64/log2
0120 //          X - N*log2/64   =   f*log2/64 - eps*X
0121 //
0122 //
0123 //       Now |X| <= 16446 log2, thus
0124 //
0125 //          |X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64
0126 //                  <= 0.57 log2/64.
0127 //       This bound will be used in Step 4.
0128 //
0129 //  Step 4. Approximate exp(R)-1 by a polynomial
0130 //          p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
0131 //  Notes:  a) In order to reduce memory access, the coefficients are
0132 //       made as "short" as possible: A1 (which is 1/2), A4 and A5
0133 //       are single precision; A2 and A3 are double precision.
0134 //       b) Even with the restrictions above,
0135 //          |p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.
0136 //       Note that 0.0062 is slightly bigger than 0.57 log2/64.
0137 //       c) To fully utilize the pipeline, p is separated into
0138 //       two independent pieces of roughly equal complexities
0139 //          p = [ R + R*S*(A2 + S*A4) ] +
0140 //              [ S*(A1 + S*(A3 + S*A5)) ]
0141 //       where S = R*R.
0142 //
0143 //  Step 5. Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by
0144 //              ans := T + ( T*p + t)
0145 //       where T and t are the stored values for 2^(J/64).
0146 //  Notes:  2^(J/64) is stored as T and t where T+t approximates
0147 //       2^(J/64) to roughly 85 bits; T is in extended precision
0148 //       and t is in single precision. Note also that T is rounded
0149 //       to 62 bits so that the last two bits of T are zero. The
0150 //       reason for such a special form is that T-1, T-2, and T-8
0151 //       will all be exact --- a property that will give much
0152 //       more accurate computation of the function EXPM1.
0153 //
0154 //  Step 6. Reconstruction of exp(X)
0155 //          exp(X) = 2^M * 2^(J/64) * exp(R).
0156 //      6.1 If AdjFlag = 0, go to 6.3
0157 //      6.2 ans := ans * AdjScale
0158 //      6.3 Restore the user FPCR
0159 //      6.4 Return ans := ans * Scale. Exit.
0160 //  Notes:  If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
0161 //       |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will
0162 //       neither overflow nor underflow. If AdjFlag = 1, that
0163 //       means that
0164 //          X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380.
0165 //       Hence, exp(X) may overflow or underflow or neither.
0166 //       When that is the case, AdjScale = 2^(M1) where M1 is
0167 //       approximately M. Thus 6.2 will never cause over/underflow.
0168 //       Possible exception in 6.4 is overflow or underflow.
0169 //       The inexact exception is not generated in 6.4. Although
0170 //       one can argue that the inexact flag should always be
0171 //       raised, to simulate that exception cost to much than the
0172 //       flag is worth in practical uses.
0173 //
0174 //  Step 7. Return 1 + X.
0175 //      7.1 ans := X
0176 //      7.2 Restore user FPCR.
0177 //      7.3 Return ans := 1 + ans. Exit
0178 //  Notes:  For non-zero X, the inexact exception will always be
0179 //       raised by 7.3. That is the only exception raised by 7.3.
0180 //       Note also that we use the FMOVEM instruction to move X
0181 //       in Step 7.1 to avoid unnecessary trapping. (Although
0182 //       the FMOVEM may not seem relevant since X is normalized,
0183 //       the precaution will be useful in the library version of
0184 //       this code where the separate entry for denormalized inputs
0185 //       will be done away with.)
0186 //
0187 //  Step 8. Handle exp(X) where |X| >= 16380log2.
0188 //      8.1 If |X| > 16480 log2, go to Step 9.
0189 //      (mimic 2.2 - 2.6)
0190 //      8.2 N := round-to-integer( X * 64/log2 )
0191 //      8.3 Calculate J = N mod 64, J = 0,1,...,63
0192 //      8.4 K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
0193 //      8.5 Calculate the address of the stored value 2^(J/64).
0194 //      8.6 Create the values Scale = 2^M, AdjScale = 2^M1.
0195 //      8.7 Go to Step 3.
0196 //  Notes:  Refer to notes for 2.2 - 2.6.
0197 //
0198 //  Step 9. Handle exp(X), |X| > 16480 log2.
0199 //      9.1 If X < 0, go to 9.3
0200 //      9.2 ans := Huge, go to 9.4
0201 //      9.3 ans := Tiny.
0202 //      9.4 Restore user FPCR.
0203 //      9.5 Return ans := ans * ans. Exit.
0204 //  Notes:  Exp(X) will surely overflow or underflow, depending on
0205 //       X's sign. "Huge" and "Tiny" are respectively large/tiny
0206 //       extended-precision numbers whose square over/underflow
0207 //       with an inexact result. Thus, 9.5 always raises the
0208 //       inexact together with either overflow or underflow.
0209 //
0210 //
0211 //  setoxm1d
0212 //  --------
0213 //
0214 //  Step 1. Set ans := 0
0215 //
0216 //  Step 2. Return  ans := X + ans. Exit.
0217 //  Notes:  This will return X with the appropriate rounding
0218 //       precision prescribed by the user FPCR.
0219 //
0220 //  setoxm1
0221 //  -------
0222 //
0223 //  Step 1. Check |X|
0224 //      1.1 If |X| >= 1/4, go to Step 1.3.
0225 //      1.2 Go to Step 7.
0226 //      1.3 If |X| < 70 log(2), go to Step 2.
0227 //      1.4 Go to Step 10.
0228 //  Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
0229 //       However, it is conceivable |X| can be small very often
0230 //       because EXPM1 is intended to evaluate exp(X)-1 accurately
0231 //       when |X| is small. For further details on the comparisons,
0232 //       see the notes on Step 1 of setox.
0233 //
0234 //  Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
0235 //      2.1 N := round-to-nearest-integer( X * 64/log2 ).
0236 //      2.2 Calculate   J = N mod 64; so J = 0,1,2,..., or 63.
0237 //      2.3 Calculate   M = (N - J)/64; so N = 64M + J.
0238 //      2.4 Calculate the address of the stored value of 2^(J/64).
0239 //      2.5 Create the values Sc = 2^M and OnebySc := -2^(-M).
0240 //  Notes:  See the notes on Step 2 of setox.
0241 //
0242 //  Step 3. Calculate X - N*log2/64.
0243 //      3.1 R := X + N*L1, where L1 := single-precision(-log2/64).
0244 //      3.2 R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
0245 //  Notes:  Applying the analysis of Step 3 of setox in this case
0246 //       shows that |R| <= 0.0055 (note that |X| <= 70 log2 in
0247 //       this case).
0248 //
0249 //  Step 4. Approximate exp(R)-1 by a polynomial
0250 //          p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6)))))
0251 //  Notes:  a) In order to reduce memory access, the coefficients are
0252 //       made as "short" as possible: A1 (which is 1/2), A5 and A6
0253 //       are single precision; A2, A3 and A4 are double precision.
0254 //       b) Even with the restriction above,
0255 //          |p - (exp(R)-1)| <  |R| * 2^(-72.7)
0256 //       for all |R| <= 0.0055.
0257 //       c) To fully utilize the pipeline, p is separated into
0258 //       two independent pieces of roughly equal complexity
0259 //          p = [ R*S*(A2 + S*(A4 + S*A6)) ]    +
0260 //              [ R + S*(A1 + S*(A3 + S*A5)) ]
0261 //       where S = R*R.
0262 //
0263 //  Step 5. Compute 2^(J/64)*p by
0264 //              p := T*p
0265 //       where T and t are the stored values for 2^(J/64).
0266 //  Notes:  2^(J/64) is stored as T and t where T+t approximates
0267 //       2^(J/64) to roughly 85 bits; T is in extended precision
0268 //       and t is in single precision. Note also that T is rounded
0269 //       to 62 bits so that the last two bits of T are zero. The
0270 //       reason for such a special form is that T-1, T-2, and T-8
0271 //       will all be exact --- a property that will be exploited
0272 //       in Step 6 below. The total relative error in p is no
0273 //       bigger than 2^(-67.7) compared to the final result.
0274 //
0275 //  Step 6. Reconstruction of exp(X)-1
0276 //          exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
0277 //      6.1 If M <= 63, go to Step 6.3.
0278 //      6.2 ans := T + (p + (t + OnebySc)). Go to 6.6
0279 //      6.3 If M >= -3, go to 6.5.
0280 //      6.4 ans := (T + (p + t)) + OnebySc. Go to 6.6
0281 //      6.5 ans := (T + OnebySc) + (p + t).
0282 //      6.6 Restore user FPCR.
0283 //      6.7 Return ans := Sc * ans. Exit.
0284 //  Notes:  The various arrangements of the expressions give accurate
0285 //       evaluations.
0286 //
0287 //  Step 7. exp(X)-1 for |X| < 1/4.
0288 //      7.1 If |X| >= 2^(-65), go to Step 9.
0289 //      7.2 Go to Step 8.
0290 //
0291 //  Step 8. Calculate exp(X)-1, |X| < 2^(-65).
0292 //      8.1 If |X| < 2^(-16312), goto 8.3
0293 //      8.2 Restore FPCR; return ans := X - 2^(-16382). Exit.
0294 //      8.3 X := X * 2^(140).
0295 //      8.4 Restore FPCR; ans := ans - 2^(-16382).
0296 //       Return ans := ans*2^(140). Exit
0297 //  Notes:  The idea is to return "X - tiny" under the user
0298 //       precision and rounding modes. To avoid unnecessary
0299 //       inefficiency, we stay away from denormalized numbers the
0300 //       best we can. For |X| >= 2^(-16312), the straightforward
0301 //       8.2 generates the inexact exception as the case warrants.
0302 //
0303 //  Step 9. Calculate exp(X)-1, |X| < 1/4, by a polynomial
0304 //          p = X + X*X*(B1 + X*(B2 + ... + X*B12))
0305 //  Notes:  a) In order to reduce memory access, the coefficients are
0306 //       made as "short" as possible: B1 (which is 1/2), B9 to B12
0307 //       are single precision; B3 to B8 are double precision; and
0308 //       B2 is double extended.
0309 //       b) Even with the restriction above,
0310 //          |p - (exp(X)-1)| < |X| 2^(-70.6)
0311 //       for all |X| <= 0.251.
0312 //       Note that 0.251 is slightly bigger than 1/4.
0313 //       c) To fully preserve accuracy, the polynomial is computed
0314 //       as X + ( S*B1 +    Q ) where S = X*X and
0315 //          Q   =   X*S*(B2 + X*(B3 + ... + X*B12))
0316 //       d) To fully utilize the pipeline, Q is separated into
0317 //       two independent pieces of roughly equal complexity
0318 //          Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +
0319 //              [ S*S*(B3 + S*(B5 + ... + S*B11)) ]
0320 //
0321 //  Step 10.    Calculate exp(X)-1 for |X| >= 70 log 2.
0322 //      10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
0323 //       purposes. Therefore, go to Step 1 of setox.
0324 //      10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
0325 //       ans := -1
0326 //       Restore user FPCR
0327 //       Return ans := ans + 2^(-126). Exit.
0328 //  Notes:  10.2 will always create an inexact and return -1 + tiny
0329 //       in the user rounding precision and mode.
0330 //
0331 //
0332
0333 //      Copyright (C) Motorola, Inc. 1990
0334 //          All Rights Reserved
0335 //
0336 //  THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
0337 //  The copyright notice above does not evidence any
0338 //  actual or intended publication of such source code.
0339
0340 //setox idnt    2,1 | Motorola 040 Floating Point Software Package
0341
0342     |section    8
0343
0344 #include "fpsp.defs"
0345
0346 L2: .long   0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
0347
0348 EXPA3:  .long   0x3FA55555,0x55554431
0349 EXPA2:  .long   0x3FC55555,0x55554018
0350
0351 HUGE:   .long   0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
0352 TINY:   .long   0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
0353
0354 EM1A4:  .long   0x3F811111,0x11174385
0355 EM1A3:  .long   0x3FA55555,0x55554F5A
0356
0357 EM1A2:  .long   0x3FC55555,0x55555555,0x00000000,0x00000000
0358
0359 EM1B8:  .long   0x3EC71DE3,0xA5774682
0360 EM1B7:  .long   0x3EFA01A0,0x19D7CB68
0361
0362 EM1B6:  .long   0x3F2A01A0,0x1A019DF3
0363 EM1B5:  .long   0x3F56C16C,0x16C170E2
0364
0365 EM1B4:  .long   0x3F811111,0x11111111
0366 EM1B3:  .long   0x3FA55555,0x55555555
0367
0368 EM1B2:  .long   0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
0369     .long   0x00000000
0370
0371 TWO140: .long   0x48B00000,0x00000000
0372 TWON140:    .long   0x37300000,0x00000000
0373
0374 EXPTBL:
0375     .long   0x3FFF0000,0x80000000,0x00000000,0x00000000
0376     .long   0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
0377     .long   0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
0378     .long   0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
0379     .long   0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
0380     .long   0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
0381     .long   0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
0382     .long   0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
0383     .long   0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
0384     .long   0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
0385     .long   0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
0386     .long   0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
0387     .long   0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
0388     .long   0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
0389     .long   0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
0390     .long   0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
0391     .long   0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
0392     .long   0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
0393     .long   0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
0394     .long   0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
0395     .long   0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
0396     .long   0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
0397     .long   0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
0398     .long   0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
0399     .long   0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
0400     .long   0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
0401     .long   0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
0402     .long   0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
0403     .long   0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
0404     .long   0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
0405     .long   0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
0406     .long   0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
0407     .long   0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
0408     .long   0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
0409     .long   0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
0410     .long   0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
0411     .long   0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
0412     .long   0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
0413     .long   0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
0414     .long   0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
0415     .long   0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
0416     .long   0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
0417     .long   0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
0418     .long   0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
0419     .long   0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
0420     .long   0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
0421     .long   0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
0422     .long   0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
0423     .long   0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
0424     .long   0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
0425     .long   0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
0426     .long   0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
0427     .long   0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
0428     .long   0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
0429     .long   0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
0430     .long   0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
0431     .long   0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
0432     .long   0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
0433     .long   0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
0434     .long   0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
0435     .long   0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
0436     .long   0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
0437     .long   0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
0438     .long   0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
0439
0440     .set    ADJFLAG,L_SCR2
0441     .set    SCALE,FP_SCR1
0442     .set    ADJSCALE,FP_SCR2
0443     .set    SC,FP_SCR3
0444     .set    ONEBYSC,FP_SCR4
0445
0446     | xref  t_frcinx
0447     |xref   t_extdnrm
0448     |xref   t_unfl
0449     |xref   t_ovfl
0450
0451     .global setoxd
0452 setoxd:
0453 //--entry point for EXP(X), X is denormalized
0454     movel       (%a0),%d0
0455     andil       #0x80000000,%d0
0456     oril        #0x00800000,%d0     // ...sign(X)*2^(-126)
0457     movel       %d0,-(%sp)
0458     fmoves      #0x3F800000,%fp0
0459     fmovel      %d1,%fpcr
0460     fadds       (%sp)+,%fp0
0461     bra     t_frcinx
0462
0463     .global setox
0464 setox:
0465 //--entry point for EXP(X), here X is finite, non-zero, and not NaN's
0466
0467 //--Step 1.
0468     movel       (%a0),%d0    // ...load part of input X
0469     andil       #0x7FFF0000,%d0 // ...biased expo. of X
0470     cmpil       #0x3FBE0000,%d0 // ...2^(-65)
0471     bges        EXPC1       // ...normal case
0472     bra     EXPSM
0473
0474 EXPC1:
0475 //--The case |X| >= 2^(-65)
0476     movew       4(%a0),%d0  // ...expo. and partial sig. of |X|
0477     cmpil       #0x400CB167,%d0 // ...16380 log2 trunc. 16 bits
0478     blts        EXPMAIN  // ...normal case
0479     bra     EXPBIG
0480
0481 EXPMAIN:
0482 //--Step 2.
0483 //--This is the normal branch:  2^(-65) <= |X| < 16380 log2.
0484     fmovex      (%a0),%fp0  // ...load input from (a0)
0485
0486     fmovex      %fp0,%fp1
0487     fmuls       #0x42B8AA3B,%fp0    // ...64/log2 * X
0488     fmovemx %fp2-%fp2/%fp3,-(%a7)       // ...save fp2
0489     movel       #0,ADJFLAG(%a6)
0490     fmovel      %fp0,%d0        // ...N = int( X * 64/log2 )
0491     lea     EXPTBL,%a1
0492     fmovel      %d0,%fp0        // ...convert to floating-format
0493
0494     movel       %d0,L_SCR1(%a6) // ...save N temporarily
0495     andil       #0x3F,%d0       // ...D0 is J = N mod 64
0496     lsll        #4,%d0
0497     addal       %d0,%a1     // ...address of 2^(J/64)
0498     movel       L_SCR1(%a6),%d0
0499     asrl        #6,%d0      // ...D0 is M
0500     addiw       #0x3FFF,%d0 // ...biased expo. of 2^(M)
0501     movew       L2,L_SCR1(%a6)  // ...prefetch L2, no need in CB
0502
0503 EXPCONT1:
0504 //--Step 3.
0505 //--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
0506 //--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
0507     fmovex      %fp0,%fp2
0508     fmuls       #0xBC317218,%fp0    // ...N * L1, L1 = lead(-log2/64)
0509     fmulx       L2,%fp2     // ...N * L2, L1+L2 = -log2/64
0510     faddx       %fp1,%fp0       // ...X + N*L1
0511     faddx       %fp2,%fp0       // ...fp0 is R, reduced arg.
0512 //  MOVE.W      #$3FA5,EXPA3    ...load EXPA3 in cache
0513
0514 //--Step 4.
0515 //--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
0516 //-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
0517 //--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
0518 //--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]
0519
0520     fmovex      %fp0,%fp1
0521     fmulx       %fp1,%fp1       // ...fp1 IS S = R*R
0522
0523     fmoves      #0x3AB60B70,%fp2    // ...fp2 IS A5
0524 //  MOVE.W      #0,2(%a1)   ...load 2^(J/64) in cache
0525
0526     fmulx       %fp1,%fp2       // ...fp2 IS S*A5
0527     fmovex      %fp1,%fp3
0528     fmuls       #0x3C088895,%fp3    // ...fp3 IS S*A4
0529
0530     faddd       EXPA3,%fp2  // ...fp2 IS A3+S*A5
0531     faddd       EXPA2,%fp3  // ...fp3 IS A2+S*A4
0532
0533     fmulx       %fp1,%fp2       // ...fp2 IS S*(A3+S*A5)
0534     movew       %d0,SCALE(%a6)  // ...SCALE is 2^(M) in extended
0535     clrw        SCALE+2(%a6)
0536     movel       #0x80000000,SCALE+4(%a6)
0537     clrl        SCALE+8(%a6)
0538
0539     fmulx       %fp1,%fp3       // ...fp3 IS S*(A2+S*A4)
0540
0541     fadds       #0x3F000000,%fp2    // ...fp2 IS A1+S*(A3+S*A5)
0542     fmulx       %fp0,%fp3       // ...fp3 IS R*S*(A2+S*A4)
0543
0544     fmulx       %fp1,%fp2       // ...fp2 IS S*(A1+S*(A3+S*A5))
0545     faddx       %fp3,%fp0       // ...fp0 IS R+R*S*(A2+S*A4),
0546 //                  ...fp3 released
0547
0548     fmovex      (%a1)+,%fp1 // ...fp1 is lead. pt. of 2^(J/64)
0549     faddx       %fp2,%fp0       // ...fp0 is EXP(R) - 1
0550 //                  ...fp2 released
0551
0552 //--Step 5
0553 //--final reconstruction process
0554 //--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
0555
0556     fmulx       %fp1,%fp0       // ...2^(J/64)*(Exp(R)-1)
0557     fmovemx (%a7)+,%fp2-%fp2/%fp3   // ...fp2 restored
0558     fadds       (%a1),%fp0  // ...accurate 2^(J/64)
0559
0560     faddx       %fp1,%fp0       // ...2^(J/64) + 2^(J/64)*...
0561     movel       ADJFLAG(%a6),%d0
0562
0563 //--Step 6
0564     tstl        %d0
0565     beqs        NORMAL
0566 ADJUST:
0567     fmulx       ADJSCALE(%a6),%fp0
0568 NORMAL:
0569     fmovel      %d1,%FPCR       // ...restore user FPCR
0570     fmulx       SCALE(%a6),%fp0 // ...multiply 2^(M)
0571     bra     t_frcinx
0572
0573 EXPSM:
0574 //--Step 7
0575     fmovemx (%a0),%fp0-%fp0 // ...in case X is denormalized
0576     fmovel      %d1,%FPCR
0577     fadds       #0x3F800000,%fp0    // ...1+X in user mode
0578     bra     t_frcinx
0579
0580 EXPBIG:
0581 //--Step 8
0582     cmpil       #0x400CB27C,%d0 // ...16480 log2
0583     bgts        EXP2BIG
0584 //--Steps 8.2 -- 8.6
0585     fmovex      (%a0),%fp0  // ...load input from (a0)
0586
0587     fmovex      %fp0,%fp1
0588     fmuls       #0x42B8AA3B,%fp0    // ...64/log2 * X
0589     fmovemx  %fp2-%fp2/%fp3,-(%a7)      // ...save fp2
0590     movel       #1,ADJFLAG(%a6)
0591     fmovel      %fp0,%d0        // ...N = int( X * 64/log2 )
0592     lea     EXPTBL,%a1
0593     fmovel      %d0,%fp0        // ...convert to floating-format
0594     movel       %d0,L_SCR1(%a6)         // ...save N temporarily
0595     andil       #0x3F,%d0        // ...D0 is J = N mod 64
0596     lsll        #4,%d0
0597     addal       %d0,%a1         // ...address of 2^(J/64)
0598     movel       L_SCR1(%a6),%d0
0599     asrl        #6,%d0          // ...D0 is K
0600     movel       %d0,L_SCR1(%a6)         // ...save K temporarily
0601     asrl        #1,%d0          // ...D0 is M1
0602     subl        %d0,L_SCR1(%a6)         // ...a1 is M
0603     addiw       #0x3FFF,%d0     // ...biased expo. of 2^(M1)
0604     movew       %d0,ADJSCALE(%a6)       // ...ADJSCALE := 2^(M1)
0605     clrw        ADJSCALE+2(%a6)
0606     movel       #0x80000000,ADJSCALE+4(%a6)
0607     clrl        ADJSCALE+8(%a6)
0608     movel       L_SCR1(%a6),%d0         // ...D0 is M
0609     addiw       #0x3FFF,%d0     // ...biased expo. of 2^(M)
0610     bra     EXPCONT1        // ...go back to Step 3
0611
0612 EXP2BIG:
0613 //--Step 9
0614     fmovel      %d1,%FPCR
0615     movel       (%a0),%d0
0616     bclrb       #sign_bit,(%a0)     // ...setox always returns positive
0617     cmpil       #0,%d0
0618     blt     t_unfl
0619     bra     t_ovfl
0620
0621     .global setoxm1d
0622 setoxm1d:
0623 //--entry point for EXPM1(X), here X is denormalized
0624 //--Step 0.
0625     bra     t_extdnrm
0626
0627
0628     .global setoxm1
0629 setoxm1:
0630 //--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
0631
0632 //--Step 1.
0633 //--Step 1.1
0634     movel       (%a0),%d0    // ...load part of input X
0635     andil       #0x7FFF0000,%d0 // ...biased expo. of X
0636     cmpil       #0x3FFD0000,%d0 // ...1/4
0637     bges        EM1CON1  // ...|X| >= 1/4
0638     bra     EM1SM
0639
0640 EM1CON1:
0641 //--Step 1.3
0642 //--The case |X| >= 1/4
0643     movew       4(%a0),%d0  // ...expo. and partial sig. of |X|
0644     cmpil       #0x4004C215,%d0 // ...70log2 rounded up to 16 bits
0645     bles        EM1MAIN  // ...1/4 <= |X| <= 70log2
0646     bra     EM1BIG
0647
0648 EM1MAIN:
0649 //--Step 2.
0650 //--This is the case:   1/4 <= |X| <= 70 log2.
0651     fmovex      (%a0),%fp0  // ...load input from (a0)
0652
0653     fmovex      %fp0,%fp1
0654     fmuls       #0x42B8AA3B,%fp0    // ...64/log2 * X
0655     fmovemx %fp2-%fp2/%fp3,-(%a7)       // ...save fp2
0656 //  MOVE.W      #$3F81,EM1A4        ...prefetch in CB mode
0657     fmovel      %fp0,%d0        // ...N = int( X * 64/log2 )
0658     lea     EXPTBL,%a1
0659     fmovel      %d0,%fp0        // ...convert to floating-format
0660
0661     movel       %d0,L_SCR1(%a6)         // ...save N temporarily
0662     andil       #0x3F,%d0        // ...D0 is J = N mod 64
0663     lsll        #4,%d0
0664     addal       %d0,%a1         // ...address of 2^(J/64)
0665     movel       L_SCR1(%a6),%d0
0666     asrl        #6,%d0          // ...D0 is M
0667     movel       %d0,L_SCR1(%a6)         // ...save a copy of M
0668 //  MOVE.W      #$3FDC,L2       ...prefetch L2 in CB mode
0669
0670 //--Step 3.
0671 //--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
0672 //--a0 points to 2^(J/64), D0 and a1 both contain M
0673     fmovex      %fp0,%fp2
0674     fmuls       #0xBC317218,%fp0    // ...N * L1, L1 = lead(-log2/64)
0675     fmulx       L2,%fp2     // ...N * L2, L1+L2 = -log2/64
0676     faddx       %fp1,%fp0    // ...X + N*L1
0677     faddx       %fp2,%fp0    // ...fp0 is R, reduced arg.
0678 //  MOVE.W      #$3FC5,EM1A2        ...load EM1A2 in cache
0679     addiw       #0x3FFF,%d0     // ...D0 is biased expo. of 2^M
0680
0681 //--Step 4.
0682 //--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
0683 //-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
0684 //--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
0685 //--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]
0686
0687     fmovex      %fp0,%fp1
0688     fmulx       %fp1,%fp1       // ...fp1 IS S = R*R
0689
0690     fmoves      #0x3950097B,%fp2    // ...fp2 IS a6
0691 //  MOVE.W      #0,2(%a1)   ...load 2^(J/64) in cache
0692
0693     fmulx       %fp1,%fp2       // ...fp2 IS S*A6
0694     fmovex      %fp1,%fp3
0695     fmuls       #0x3AB60B6A,%fp3    // ...fp3 IS S*A5
0696
0697     faddd       EM1A4,%fp2  // ...fp2 IS A4+S*A6
0698     faddd       EM1A3,%fp3  // ...fp3 IS A3+S*A5
0699     movew       %d0,SC(%a6)     // ...SC is 2^(M) in extended
0700     clrw        SC+2(%a6)
0701     movel       #0x80000000,SC+4(%a6)
0702     clrl        SC+8(%a6)
0703
0704     fmulx       %fp1,%fp2       // ...fp2 IS S*(A4+S*A6)
0705     movel       L_SCR1(%a6),%d0     // ...D0 is M
0706     negw        %d0     // ...D0 is -M
0707     fmulx       %fp1,%fp3       // ...fp3 IS S*(A3+S*A5)
0708     addiw       #0x3FFF,%d0 // ...biased expo. of 2^(-M)
0709     faddd       EM1A2,%fp2  // ...fp2 IS A2+S*(A4+S*A6)
0710     fadds       #0x3F000000,%fp3    // ...fp3 IS A1+S*(A3+S*A5)
0711
0712     fmulx       %fp1,%fp2       // ...fp2 IS S*(A2+S*(A4+S*A6))
0713     oriw        #0x8000,%d0 // ...signed/expo. of -2^(-M)
0714     movew       %d0,ONEBYSC(%a6)    // ...OnebySc is -2^(-M)
0715     clrw        ONEBYSC+2(%a6)
0716     movel       #0x80000000,ONEBYSC+4(%a6)
0717     clrl        ONEBYSC+8(%a6)
0718     fmulx       %fp3,%fp1       // ...fp1 IS S*(A1+S*(A3+S*A5))
0719 //                  ...fp3 released
0720
0721     fmulx       %fp0,%fp2       // ...fp2 IS R*S*(A2+S*(A4+S*A6))
0722     faddx       %fp1,%fp0       // ...fp0 IS R+S*(A1+S*(A3+S*A5))
0723 //                  ...fp1 released
0724
0725     faddx       %fp2,%fp0       // ...fp0 IS EXP(R)-1
0726 //                  ...fp2 released
0727     fmovemx (%a7)+,%fp2-%fp2/%fp3   // ...fp2 restored
0728
0729 //--Step 5
0730 //--Compute 2^(J/64)*p
0731
0732     fmulx       (%a1),%fp0  // ...2^(J/64)*(Exp(R)-1)
0733
0734 //--Step 6
0735 //--Step 6.1
0736     movel       L_SCR1(%a6),%d0     // ...retrieve M
0737     cmpil       #63,%d0
0738     bles        MLE63
0739 //--Step 6.2    M >= 64
0740     fmoves      12(%a1),%fp1    // ...fp1 is t
0741     faddx       ONEBYSC(%a6),%fp1   // ...fp1 is t+OnebySc
0742     faddx       %fp1,%fp0       // ...p+(t+OnebySc), fp1 released
0743     faddx       (%a1),%fp0  // ...T+(p+(t+OnebySc))
0744     bras        EM1SCALE
0745 MLE63:
0746 //--Step 6.3    M <= 63
0747     cmpil       #-3,%d0
0748     bges        MGEN3
0749 MLTN3:
0750 //--Step 6.4    M <= -4
0751     fadds       12(%a1),%fp0    // ...p+t
0752     faddx       (%a1),%fp0  // ...T+(p+t)
0753     faddx       ONEBYSC(%a6),%fp0   // ...OnebySc + (T+(p+t))
0754     bras        EM1SCALE
0755 MGEN3:
0756 //--Step 6.5    -3 <= M <= 63
0757     fmovex      (%a1)+,%fp1 // ...fp1 is T
0758     fadds       (%a1),%fp0  // ...fp0 is p+t
0759     faddx       ONEBYSC(%a6),%fp1   // ...fp1 is T+OnebySc
0760     faddx       %fp1,%fp0       // ...(T+OnebySc)+(p+t)
0761
0762 EM1SCALE:
0763 //--Step 6.6
0764     fmovel      %d1,%FPCR
0765     fmulx       SC(%a6),%fp0
0766
0767     bra     t_frcinx
0768
0769 EM1SM:
0770 //--Step 7  |X| < 1/4.
0771     cmpil       #0x3FBE0000,%d0 // ...2^(-65)
0772     bges        EM1POLY
0773
0774 EM1TINY:
0775 //--Step 8  |X| < 2^(-65)
0776     cmpil       #0x00330000,%d0 // ...2^(-16312)
0777     blts        EM12TINY
0778 //--Step 8.2
0779     movel       #0x80010000,SC(%a6) // ...SC is -2^(-16382)
0780     movel       #0x80000000,SC+4(%a6)
0781     clrl        SC+8(%a6)
0782     fmovex      (%a0),%fp0
0783     fmovel      %d1,%FPCR
0784     faddx       SC(%a6),%fp0
0785
0786     bra     t_frcinx
0787
0788 EM12TINY:
0789 //--Step 8.3
0790     fmovex      (%a0),%fp0
0791     fmuld       TWO140,%fp0
0792     movel       #0x80010000,SC(%a6)
0793     movel       #0x80000000,SC+4(%a6)
0794     clrl        SC+8(%a6)
0795     faddx       SC(%a6),%fp0
0796     fmovel      %d1,%FPCR
0797     fmuld       TWON140,%fp0
0798
0799     bra     t_frcinx
0800
0801 EM1POLY:
0802 //--Step 9  exp(X)-1 by a simple polynomial
0803     fmovex      (%a0),%fp0  // ...fp0 is X
0804     fmulx       %fp0,%fp0       // ...fp0 is S := X*X
0805     fmovemx %fp2-%fp2/%fp3,-(%a7)   // ...save fp2
0806     fmoves      #0x2F30CAA8,%fp1    // ...fp1 is B12
0807     fmulx       %fp0,%fp1       // ...fp1 is S*B12
0808     fmoves      #0x310F8290,%fp2    // ...fp2 is B11
0809     fadds       #0x32D73220,%fp1    // ...fp1 is B10+S*B12
0810
0811     fmulx       %fp0,%fp2       // ...fp2 is S*B11
0812     fmulx       %fp0,%fp1       // ...fp1 is S*(B10 + ...
0813
0814     fadds       #0x3493F281,%fp2    // ...fp2 is B9+S*...
0815     faddd       EM1B8,%fp1  // ...fp1 is B8+S*...
0816
0817     fmulx       %fp0,%fp2       // ...fp2 is S*(B9+...
0818     fmulx       %fp0,%fp1       // ...fp1 is S*(B8+...
0819
0820     faddd       EM1B7,%fp2  // ...fp2 is B7+S*...
0821     faddd       EM1B6,%fp1  // ...fp1 is B6+S*...
0822
0823     fmulx       %fp0,%fp2       // ...fp2 is S*(B7+...
0824     fmulx       %fp0,%fp1       // ...fp1 is S*(B6+...
0825
0826     faddd       EM1B5,%fp2  // ...fp2 is B5+S*...
0827     faddd       EM1B4,%fp1  // ...fp1 is B4+S*...
0828
0829     fmulx       %fp0,%fp2       // ...fp2 is S*(B5+...
0830     fmulx       %fp0,%fp1       // ...fp1 is S*(B4+...
0831
0832     faddd       EM1B3,%fp2  // ...fp2 is B3+S*...
0833     faddx       EM1B2,%fp1  // ...fp1 is B2+S*...
0834
0835     fmulx       %fp0,%fp2       // ...fp2 is S*(B3+...
0836     fmulx       %fp0,%fp1       // ...fp1 is S*(B2+...
0837
0838     fmulx       %fp0,%fp2       // ...fp2 is S*S*(B3+...)
0839     fmulx       (%a0),%fp1  // ...fp1 is X*S*(B2...
0840
0841     fmuls       #0x3F000000,%fp0    // ...fp0 is S*B1
0842     faddx       %fp2,%fp1       // ...fp1 is Q
0843 //                  ...fp2 released
0844
0845     fmovemx (%a7)+,%fp2-%fp2/%fp3   // ...fp2 restored
0846
0847     faddx       %fp1,%fp0       // ...fp0 is S*B1+Q
0848 //                  ...fp1 released
0849
0850     fmovel      %d1,%FPCR
0851     faddx       (%a0),%fp0
0852
0853     bra     t_frcinx
0854
0855 EM1BIG:
0856 //--Step 10 |X| > 70 log2
0857     movel       (%a0),%d0
0858     cmpil       #0,%d0
0859     bgt     EXPC1
0860 //--Step 10.2
0861     fmoves      #0xBF800000,%fp0    // ...fp0 is -1
0862     fmovel      %d1,%FPCR
0863     fadds       #0x00800000,%fp0    // ...-1 + 2^(-126)
0864
0865     bra     t_frcinx
0866
0867     |end