File indexing completed on 2025-05-11 08:23:49
0001 #include "fpsp-namespace.h"
0002 //
0003 //
0004 // setox.sa 3.1 12/10/90
0005 //
0006 // The entry point setox computes the exponential of a value.
0007 // setoxd does the same except the input value is a denormalized
0008 // number. setoxm1 computes exp(X)-1, and setoxm1d computes
0009 // exp(X)-1 for denormalized X.
0010 //
0011 // INPUT
0012 // -----
0013 // Double-extended value in memory location pointed to by address
0014 // register a0.
0015 //
0016 // OUTPUT
0017 // ------
0018 // exp(X) or exp(X)-1 returned in floating-point register fp0.
0019 //
0020 // ACCURACY and MONOTONICITY
0021 // -------------------------
0022 // The returned result is within 0.85 ulps in 64 significant bit, i.e.
0023 // within 0.5001 ulp to 53 bits if the result is subsequently rounded
0024 // to double precision. The result is provably monotonic in double
0025 // precision.
0026 //
0027 // SPEED
0028 // -----
0029 // Two timings are measured, both in the copy-back mode. The
0030 // first one is measured when the function is invoked the first time
0031 // (so the instructions and data are not in cache), and the
0032 // second one is measured when the function is reinvoked at the same
0033 // input argument.
0034 //
0035 // The program setox takes approximately 210/190 cycles for input
0036 // argument X whose magnitude is less than 16380 log2, which
0037 // is the usual situation. For the less common arguments,
0038 // depending on their values, the program may run faster or slower --
0039 // but no worse than 10% slower even in the extreme cases.
0040 //
0041 // The program setoxm1 takes approximately ???/??? cycles for input
0042 // argument X, 0.25 <= |X| < 70log2. For |X| < 0.25, it takes
0043 // approximately ???/??? cycles. For the less common arguments,
0044 // depending on their values, the program may run faster or slower --
0045 // but no worse than 10% slower even in the extreme cases.
0046 //
0047 // ALGORITHM and IMPLEMENTATION NOTES
0048 // ----------------------------------
0049 //
0050 // setoxd
0051 // ------
0052 // Step 1. Set ans := 1.0
0053 //
0054 // Step 2. Return ans := ans + sign(X)*2^(-126). Exit.
0055 // Notes: This will always generate one exception -- inexact.
0056 //
0057 //
0058 // setox
0059 // -----
0060 //
0061 // Step 1. Filter out extreme cases of input argument.
0062 // 1.1 If |X| >= 2^(-65), go to Step 1.3.
0063 // 1.2 Go to Step 7.
0064 // 1.3 If |X| < 16380 log(2), go to Step 2.
0065 // 1.4 Go to Step 8.
0066 // Notes: The usual case should take the branches 1.1 -> 1.3 -> 2.
0067 // To avoid the use of floating-point comparisons, a
0068 // compact representation of |X| is used. This format is a
0069 // 32-bit integer, the upper (more significant) 16 bits are
0070 // the sign and biased exponent field of |X|; the lower 16
0071 // bits are the 16 most significant fraction (including the
0072 // explicit bit) bits of |X|. Consequently, the comparisons
0073 // in Steps 1.1 and 1.3 can be performed by integer comparison.
0074 // Note also that the constant 16380 log(2) used in Step 1.3
0075 // is also in the compact form. Thus taking the branch
0076 // to Step 2 guarantees |X| < 16380 log(2). There is no harm
0077 // to have a small number of cases where |X| is less than,
0078 // but close to, 16380 log(2) and the branch to Step 9 is
0079 // taken.
0080 //
0081 // Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
0082 // 2.1 Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
0083 // 2.2 N := round-to-nearest-integer( X * 64/log2 ).
0084 // 2.3 Calculate J = N mod 64; so J = 0,1,2,..., or 63.
0085 // 2.4 Calculate M = (N - J)/64; so N = 64M + J.
0086 // 2.5 Calculate the address of the stored value of 2^(J/64).
0087 // 2.6 Create the value Scale = 2^M.
0088 // Notes: The calculation in 2.2 is really performed by
0089 //
0090 // Z := X * constant
0091 // N := round-to-nearest-integer(Z)
0092 //
0093 // where
0094 //
0095 // constant := single-precision( 64/log 2 ).
0096 //
0097 // Using a single-precision constant avoids memory access.
0098 // Another effect of using a single-precision "constant" is
0099 // that the calculated value Z is
0100 //
0101 // Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).
0102 //
0103 // This error has to be considered later in Steps 3 and 4.
0104 //
0105 // Step 3. Calculate X - N*log2/64.
0106 // 3.1 R := X + N*L1, where L1 := single-precision(-log2/64).
0107 // 3.2 R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
0108 // Notes: a) The way L1 and L2 are chosen ensures L1+L2 approximate
0109 // the value -log2/64 to 88 bits of accuracy.
0110 // b) N*L1 is exact because N is no longer than 22 bits and
0111 // L1 is no longer than 24 bits.
0112 // c) The calculation X+N*L1 is also exact due to cancellation.
0113 // Thus, R is practically X+N(L1+L2) to full 64 bits.
0114 // d) It is important to estimate how large can |R| be after
0115 // Step 3.2.
0116 //
0117 // N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)
0118 // X*64/log2 (1+eps) = N + f, |f| <= 0.5
0119 // X*64/log2 - N = f - eps*X 64/log2
0120 // X - N*log2/64 = f*log2/64 - eps*X
0121 //
0122 //
0123 // Now |X| <= 16446 log2, thus
0124 //
0125 // |X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64
0126 // <= 0.57 log2/64.
0127 // This bound will be used in Step 4.
0128 //
0129 // Step 4. Approximate exp(R)-1 by a polynomial
0130 // p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
0131 // Notes: a) In order to reduce memory access, the coefficients are
0132 // made as "short" as possible: A1 (which is 1/2), A4 and A5
0133 // are single precision; A2 and A3 are double precision.
0134 // b) Even with the restrictions above,
0135 // |p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.
0136 // Note that 0.0062 is slightly bigger than 0.57 log2/64.
0137 // c) To fully utilize the pipeline, p is separated into
0138 // two independent pieces of roughly equal complexities
0139 // p = [ R + R*S*(A2 + S*A4) ] +
0140 // [ S*(A1 + S*(A3 + S*A5)) ]
0141 // where S = R*R.
0142 //
0143 // Step 5. Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by
0144 // ans := T + ( T*p + t)
0145 // where T and t are the stored values for 2^(J/64).
0146 // Notes: 2^(J/64) is stored as T and t where T+t approximates
0147 // 2^(J/64) to roughly 85 bits; T is in extended precision
0148 // and t is in single precision. Note also that T is rounded
0149 // to 62 bits so that the last two bits of T are zero. The
0150 // reason for such a special form is that T-1, T-2, and T-8
0151 // will all be exact --- a property that will give much
0152 // more accurate computation of the function EXPM1.
0153 //
0154 // Step 6. Reconstruction of exp(X)
0155 // exp(X) = 2^M * 2^(J/64) * exp(R).
0156 // 6.1 If AdjFlag = 0, go to 6.3
0157 // 6.2 ans := ans * AdjScale
0158 // 6.3 Restore the user FPCR
0159 // 6.4 Return ans := ans * Scale. Exit.
0160 // Notes: If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
0161 // |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will
0162 // neither overflow nor underflow. If AdjFlag = 1, that
0163 // means that
0164 // X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380.
0165 // Hence, exp(X) may overflow or underflow or neither.
0166 // When that is the case, AdjScale = 2^(M1) where M1 is
0167 // approximately M. Thus 6.2 will never cause over/underflow.
0168 // Possible exception in 6.4 is overflow or underflow.
0169 // The inexact exception is not generated in 6.4. Although
0170 // one can argue that the inexact flag should always be
0171 // raised, to simulate that exception cost to much than the
0172 // flag is worth in practical uses.
0173 //
0174 // Step 7. Return 1 + X.
0175 // 7.1 ans := X
0176 // 7.2 Restore user FPCR.
0177 // 7.3 Return ans := 1 + ans. Exit
0178 // Notes: For non-zero X, the inexact exception will always be
0179 // raised by 7.3. That is the only exception raised by 7.3.
0180 // Note also that we use the FMOVEM instruction to move X
0181 // in Step 7.1 to avoid unnecessary trapping. (Although
0182 // the FMOVEM may not seem relevant since X is normalized,
0183 // the precaution will be useful in the library version of
0184 // this code where the separate entry for denormalized inputs
0185 // will be done away with.)
0186 //
0187 // Step 8. Handle exp(X) where |X| >= 16380log2.
0188 // 8.1 If |X| > 16480 log2, go to Step 9.
0189 // (mimic 2.2 - 2.6)
0190 // 8.2 N := round-to-integer( X * 64/log2 )
0191 // 8.3 Calculate J = N mod 64, J = 0,1,...,63
0192 // 8.4 K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
0193 // 8.5 Calculate the address of the stored value 2^(J/64).
0194 // 8.6 Create the values Scale = 2^M, AdjScale = 2^M1.
0195 // 8.7 Go to Step 3.
0196 // Notes: Refer to notes for 2.2 - 2.6.
0197 //
0198 // Step 9. Handle exp(X), |X| > 16480 log2.
0199 // 9.1 If X < 0, go to 9.3
0200 // 9.2 ans := Huge, go to 9.4
0201 // 9.3 ans := Tiny.
0202 // 9.4 Restore user FPCR.
0203 // 9.5 Return ans := ans * ans. Exit.
0204 // Notes: Exp(X) will surely overflow or underflow, depending on
0205 // X's sign. "Huge" and "Tiny" are respectively large/tiny
0206 // extended-precision numbers whose square over/underflow
0207 // with an inexact result. Thus, 9.5 always raises the
0208 // inexact together with either overflow or underflow.
0209 //
0210 //
0211 // setoxm1d
0212 // --------
0213 //
0214 // Step 1. Set ans := 0
0215 //
0216 // Step 2. Return ans := X + ans. Exit.
0217 // Notes: This will return X with the appropriate rounding
0218 // precision prescribed by the user FPCR.
0219 //
0220 // setoxm1
0221 // -------
0222 //
0223 // Step 1. Check |X|
0224 // 1.1 If |X| >= 1/4, go to Step 1.3.
0225 // 1.2 Go to Step 7.
0226 // 1.3 If |X| < 70 log(2), go to Step 2.
0227 // 1.4 Go to Step 10.
0228 // Notes: The usual case should take the branches 1.1 -> 1.3 -> 2.
0229 // However, it is conceivable |X| can be small very often
0230 // because EXPM1 is intended to evaluate exp(X)-1 accurately
0231 // when |X| is small. For further details on the comparisons,
0232 // see the notes on Step 1 of setox.
0233 //
0234 // Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
0235 // 2.1 N := round-to-nearest-integer( X * 64/log2 ).
0236 // 2.2 Calculate J = N mod 64; so J = 0,1,2,..., or 63.
0237 // 2.3 Calculate M = (N - J)/64; so N = 64M + J.
0238 // 2.4 Calculate the address of the stored value of 2^(J/64).
0239 // 2.5 Create the values Sc = 2^M and OnebySc := -2^(-M).
0240 // Notes: See the notes on Step 2 of setox.
0241 //
0242 // Step 3. Calculate X - N*log2/64.
0243 // 3.1 R := X + N*L1, where L1 := single-precision(-log2/64).
0244 // 3.2 R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
0245 // Notes: Applying the analysis of Step 3 of setox in this case
0246 // shows that |R| <= 0.0055 (note that |X| <= 70 log2 in
0247 // this case).
0248 //
0249 // Step 4. Approximate exp(R)-1 by a polynomial
0250 // p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6)))))
0251 // Notes: a) In order to reduce memory access, the coefficients are
0252 // made as "short" as possible: A1 (which is 1/2), A5 and A6
0253 // are single precision; A2, A3 and A4 are double precision.
0254 // b) Even with the restriction above,
0255 // |p - (exp(R)-1)| < |R| * 2^(-72.7)
0256 // for all |R| <= 0.0055.
0257 // c) To fully utilize the pipeline, p is separated into
0258 // two independent pieces of roughly equal complexity
0259 // p = [ R*S*(A2 + S*(A4 + S*A6)) ] +
0260 // [ R + S*(A1 + S*(A3 + S*A5)) ]
0261 // where S = R*R.
0262 //
0263 // Step 5. Compute 2^(J/64)*p by
0264 // p := T*p
0265 // where T and t are the stored values for 2^(J/64).
0266 // Notes: 2^(J/64) is stored as T and t where T+t approximates
0267 // 2^(J/64) to roughly 85 bits; T is in extended precision
0268 // and t is in single precision. Note also that T is rounded
0269 // to 62 bits so that the last two bits of T are zero. The
0270 // reason for such a special form is that T-1, T-2, and T-8
0271 // will all be exact --- a property that will be exploited
0272 // in Step 6 below. The total relative error in p is no
0273 // bigger than 2^(-67.7) compared to the final result.
0274 //
0275 // Step 6. Reconstruction of exp(X)-1
0276 // exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
0277 // 6.1 If M <= 63, go to Step 6.3.
0278 // 6.2 ans := T + (p + (t + OnebySc)). Go to 6.6
0279 // 6.3 If M >= -3, go to 6.5.
0280 // 6.4 ans := (T + (p + t)) + OnebySc. Go to 6.6
0281 // 6.5 ans := (T + OnebySc) + (p + t).
0282 // 6.6 Restore user FPCR.
0283 // 6.7 Return ans := Sc * ans. Exit.
0284 // Notes: The various arrangements of the expressions give accurate
0285 // evaluations.
0286 //
0287 // Step 7. exp(X)-1 for |X| < 1/4.
0288 // 7.1 If |X| >= 2^(-65), go to Step 9.
0289 // 7.2 Go to Step 8.
0290 //
0291 // Step 8. Calculate exp(X)-1, |X| < 2^(-65).
0292 // 8.1 If |X| < 2^(-16312), goto 8.3
0293 // 8.2 Restore FPCR; return ans := X - 2^(-16382). Exit.
0294 // 8.3 X := X * 2^(140).
0295 // 8.4 Restore FPCR; ans := ans - 2^(-16382).
0296 // Return ans := ans*2^(140). Exit
0297 // Notes: The idea is to return "X - tiny" under the user
0298 // precision and rounding modes. To avoid unnecessary
0299 // inefficiency, we stay away from denormalized numbers the
0300 // best we can. For |X| >= 2^(-16312), the straightforward
0301 // 8.2 generates the inexact exception as the case warrants.
0302 //
0303 // Step 9. Calculate exp(X)-1, |X| < 1/4, by a polynomial
0304 // p = X + X*X*(B1 + X*(B2 + ... + X*B12))
0305 // Notes: a) In order to reduce memory access, the coefficients are
0306 // made as "short" as possible: B1 (which is 1/2), B9 to B12
0307 // are single precision; B3 to B8 are double precision; and
0308 // B2 is double extended.
0309 // b) Even with the restriction above,
0310 // |p - (exp(X)-1)| < |X| 2^(-70.6)
0311 // for all |X| <= 0.251.
0312 // Note that 0.251 is slightly bigger than 1/4.
0313 // c) To fully preserve accuracy, the polynomial is computed
0314 // as X + ( S*B1 + Q ) where S = X*X and
0315 // Q = X*S*(B2 + X*(B3 + ... + X*B12))
0316 // d) To fully utilize the pipeline, Q is separated into
0317 // two independent pieces of roughly equal complexity
0318 // Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +
0319 // [ S*S*(B3 + S*(B5 + ... + S*B11)) ]
0320 //
0321 // Step 10. Calculate exp(X)-1 for |X| >= 70 log 2.
0322 // 10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
0323 // purposes. Therefore, go to Step 1 of setox.
0324 // 10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
0325 // ans := -1
0326 // Restore user FPCR
0327 // Return ans := ans + 2^(-126). Exit.
0328 // Notes: 10.2 will always create an inexact and return -1 + tiny
0329 // in the user rounding precision and mode.
0330 //
0331 //
0332
0333 // Copyright (C) Motorola, Inc. 1990
0334 // All Rights Reserved
0335 //
0336 // THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
0337 // The copyright notice above does not evidence any
0338 // actual or intended publication of such source code.
0339
0340 //setox idnt 2,1 | Motorola 040 Floating Point Software Package
0341
0342 |section 8
0343
0344 #include "fpsp.defs"
0345
0346 L2: .long 0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
0347
0348 EXPA3: .long 0x3FA55555,0x55554431
0349 EXPA2: .long 0x3FC55555,0x55554018
0350
0351 HUGE: .long 0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
0352 TINY: .long 0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
0353
0354 EM1A4: .long 0x3F811111,0x11174385
0355 EM1A3: .long 0x3FA55555,0x55554F5A
0356
0357 EM1A2: .long 0x3FC55555,0x55555555,0x00000000,0x00000000
0358
0359 EM1B8: .long 0x3EC71DE3,0xA5774682
0360 EM1B7: .long 0x3EFA01A0,0x19D7CB68
0361
0362 EM1B6: .long 0x3F2A01A0,0x1A019DF3
0363 EM1B5: .long 0x3F56C16C,0x16C170E2
0364
0365 EM1B4: .long 0x3F811111,0x11111111
0366 EM1B3: .long 0x3FA55555,0x55555555
0367
0368 EM1B2: .long 0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
0369 .long 0x00000000
0370
0371 TWO140: .long 0x48B00000,0x00000000
0372 TWON140: .long 0x37300000,0x00000000
0373
0374 EXPTBL:
0375 .long 0x3FFF0000,0x80000000,0x00000000,0x00000000
0376 .long 0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
0377 .long 0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
0378 .long 0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
0379 .long 0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
0380 .long 0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
0381 .long 0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
0382 .long 0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
0383 .long 0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
0384 .long 0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
0385 .long 0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
0386 .long 0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
0387 .long 0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
0388 .long 0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
0389 .long 0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
0390 .long 0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
0391 .long 0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
0392 .long 0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
0393 .long 0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
0394 .long 0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
0395 .long 0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
0396 .long 0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
0397 .long 0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
0398 .long 0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
0399 .long 0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
0400 .long 0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
0401 .long 0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
0402 .long 0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
0403 .long 0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
0404 .long 0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
0405 .long 0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
0406 .long 0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
0407 .long 0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
0408 .long 0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
0409 .long 0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
0410 .long 0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
0411 .long 0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
0412 .long 0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
0413 .long 0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
0414 .long 0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
0415 .long 0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
0416 .long 0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
0417 .long 0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
0418 .long 0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
0419 .long 0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
0420 .long 0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
0421 .long 0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
0422 .long 0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
0423 .long 0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
0424 .long 0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
0425 .long 0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
0426 .long 0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
0427 .long 0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
0428 .long 0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
0429 .long 0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
0430 .long 0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
0431 .long 0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
0432 .long 0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
0433 .long 0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
0434 .long 0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
0435 .long 0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
0436 .long 0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
0437 .long 0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
0438 .long 0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
0439
0440 .set ADJFLAG,L_SCR2
0441 .set SCALE,FP_SCR1
0442 .set ADJSCALE,FP_SCR2
0443 .set SC,FP_SCR3
0444 .set ONEBYSC,FP_SCR4
0445
0446 | xref t_frcinx
0447 |xref t_extdnrm
0448 |xref t_unfl
0449 |xref t_ovfl
0450
0451 .global setoxd
0452 setoxd:
0453 //--entry point for EXP(X), X is denormalized
0454 movel (%a0),%d0
0455 andil #0x80000000,%d0
0456 oril #0x00800000,%d0 // ...sign(X)*2^(-126)
0457 movel %d0,-(%sp)
0458 fmoves #0x3F800000,%fp0
0459 fmovel %d1,%fpcr
0460 fadds (%sp)+,%fp0
0461 bra t_frcinx
0462
0463 .global setox
0464 setox:
0465 //--entry point for EXP(X), here X is finite, non-zero, and not NaN's
0466
0467 //--Step 1.
0468 movel (%a0),%d0 // ...load part of input X
0469 andil #0x7FFF0000,%d0 // ...biased expo. of X
0470 cmpil #0x3FBE0000,%d0 // ...2^(-65)
0471 bges EXPC1 // ...normal case
0472 bra EXPSM
0473
0474 EXPC1:
0475 //--The case |X| >= 2^(-65)
0476 movew 4(%a0),%d0 // ...expo. and partial sig. of |X|
0477 cmpil #0x400CB167,%d0 // ...16380 log2 trunc. 16 bits
0478 blts EXPMAIN // ...normal case
0479 bra EXPBIG
0480
0481 EXPMAIN:
0482 //--Step 2.
0483 //--This is the normal branch: 2^(-65) <= |X| < 16380 log2.
0484 fmovex (%a0),%fp0 // ...load input from (a0)
0485
0486 fmovex %fp0,%fp1
0487 fmuls #0x42B8AA3B,%fp0 // ...64/log2 * X
0488 fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2
0489 movel #0,ADJFLAG(%a6)
0490 fmovel %fp0,%d0 // ...N = int( X * 64/log2 )
0491 lea EXPTBL,%a1
0492 fmovel %d0,%fp0 // ...convert to floating-format
0493
0494 movel %d0,L_SCR1(%a6) // ...save N temporarily
0495 andil #0x3F,%d0 // ...D0 is J = N mod 64
0496 lsll #4,%d0
0497 addal %d0,%a1 // ...address of 2^(J/64)
0498 movel L_SCR1(%a6),%d0
0499 asrl #6,%d0 // ...D0 is M
0500 addiw #0x3FFF,%d0 // ...biased expo. of 2^(M)
0501 movew L2,L_SCR1(%a6) // ...prefetch L2, no need in CB
0502
0503 EXPCONT1:
0504 //--Step 3.
0505 //--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
0506 //--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
0507 fmovex %fp0,%fp2
0508 fmuls #0xBC317218,%fp0 // ...N * L1, L1 = lead(-log2/64)
0509 fmulx L2,%fp2 // ...N * L2, L1+L2 = -log2/64
0510 faddx %fp1,%fp0 // ...X + N*L1
0511 faddx %fp2,%fp0 // ...fp0 is R, reduced arg.
0512 // MOVE.W #$3FA5,EXPA3 ...load EXPA3 in cache
0513
0514 //--Step 4.
0515 //--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
0516 //-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
0517 //--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
0518 //--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]
0519
0520 fmovex %fp0,%fp1
0521 fmulx %fp1,%fp1 // ...fp1 IS S = R*R
0522
0523 fmoves #0x3AB60B70,%fp2 // ...fp2 IS A5
0524 // MOVE.W #0,2(%a1) ...load 2^(J/64) in cache
0525
0526 fmulx %fp1,%fp2 // ...fp2 IS S*A5
0527 fmovex %fp1,%fp3
0528 fmuls #0x3C088895,%fp3 // ...fp3 IS S*A4
0529
0530 faddd EXPA3,%fp2 // ...fp2 IS A3+S*A5
0531 faddd EXPA2,%fp3 // ...fp3 IS A2+S*A4
0532
0533 fmulx %fp1,%fp2 // ...fp2 IS S*(A3+S*A5)
0534 movew %d0,SCALE(%a6) // ...SCALE is 2^(M) in extended
0535 clrw SCALE+2(%a6)
0536 movel #0x80000000,SCALE+4(%a6)
0537 clrl SCALE+8(%a6)
0538
0539 fmulx %fp1,%fp3 // ...fp3 IS S*(A2+S*A4)
0540
0541 fadds #0x3F000000,%fp2 // ...fp2 IS A1+S*(A3+S*A5)
0542 fmulx %fp0,%fp3 // ...fp3 IS R*S*(A2+S*A4)
0543
0544 fmulx %fp1,%fp2 // ...fp2 IS S*(A1+S*(A3+S*A5))
0545 faddx %fp3,%fp0 // ...fp0 IS R+R*S*(A2+S*A4),
0546 // ...fp3 released
0547
0548 fmovex (%a1)+,%fp1 // ...fp1 is lead. pt. of 2^(J/64)
0549 faddx %fp2,%fp0 // ...fp0 is EXP(R) - 1
0550 // ...fp2 released
0551
0552 //--Step 5
0553 //--final reconstruction process
0554 //--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
0555
0556 fmulx %fp1,%fp0 // ...2^(J/64)*(Exp(R)-1)
0557 fmovemx (%a7)+,%fp2-%fp2/%fp3 // ...fp2 restored
0558 fadds (%a1),%fp0 // ...accurate 2^(J/64)
0559
0560 faddx %fp1,%fp0 // ...2^(J/64) + 2^(J/64)*...
0561 movel ADJFLAG(%a6),%d0
0562
0563 //--Step 6
0564 tstl %d0
0565 beqs NORMAL
0566 ADJUST:
0567 fmulx ADJSCALE(%a6),%fp0
0568 NORMAL:
0569 fmovel %d1,%FPCR // ...restore user FPCR
0570 fmulx SCALE(%a6),%fp0 // ...multiply 2^(M)
0571 bra t_frcinx
0572
0573 EXPSM:
0574 //--Step 7
0575 fmovemx (%a0),%fp0-%fp0 // ...in case X is denormalized
0576 fmovel %d1,%FPCR
0577 fadds #0x3F800000,%fp0 // ...1+X in user mode
0578 bra t_frcinx
0579
0580 EXPBIG:
0581 //--Step 8
0582 cmpil #0x400CB27C,%d0 // ...16480 log2
0583 bgts EXP2BIG
0584 //--Steps 8.2 -- 8.6
0585 fmovex (%a0),%fp0 // ...load input from (a0)
0586
0587 fmovex %fp0,%fp1
0588 fmuls #0x42B8AA3B,%fp0 // ...64/log2 * X
0589 fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2
0590 movel #1,ADJFLAG(%a6)
0591 fmovel %fp0,%d0 // ...N = int( X * 64/log2 )
0592 lea EXPTBL,%a1
0593 fmovel %d0,%fp0 // ...convert to floating-format
0594 movel %d0,L_SCR1(%a6) // ...save N temporarily
0595 andil #0x3F,%d0 // ...D0 is J = N mod 64
0596 lsll #4,%d0
0597 addal %d0,%a1 // ...address of 2^(J/64)
0598 movel L_SCR1(%a6),%d0
0599 asrl #6,%d0 // ...D0 is K
0600 movel %d0,L_SCR1(%a6) // ...save K temporarily
0601 asrl #1,%d0 // ...D0 is M1
0602 subl %d0,L_SCR1(%a6) // ...a1 is M
0603 addiw #0x3FFF,%d0 // ...biased expo. of 2^(M1)
0604 movew %d0,ADJSCALE(%a6) // ...ADJSCALE := 2^(M1)
0605 clrw ADJSCALE+2(%a6)
0606 movel #0x80000000,ADJSCALE+4(%a6)
0607 clrl ADJSCALE+8(%a6)
0608 movel L_SCR1(%a6),%d0 // ...D0 is M
0609 addiw #0x3FFF,%d0 // ...biased expo. of 2^(M)
0610 bra EXPCONT1 // ...go back to Step 3
0611
0612 EXP2BIG:
0613 //--Step 9
0614 fmovel %d1,%FPCR
0615 movel (%a0),%d0
0616 bclrb #sign_bit,(%a0) // ...setox always returns positive
0617 cmpil #0,%d0
0618 blt t_unfl
0619 bra t_ovfl
0620
0621 .global setoxm1d
0622 setoxm1d:
0623 //--entry point for EXPM1(X), here X is denormalized
0624 //--Step 0.
0625 bra t_extdnrm
0626
0627
0628 .global setoxm1
0629 setoxm1:
0630 //--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
0631
0632 //--Step 1.
0633 //--Step 1.1
0634 movel (%a0),%d0 // ...load part of input X
0635 andil #0x7FFF0000,%d0 // ...biased expo. of X
0636 cmpil #0x3FFD0000,%d0 // ...1/4
0637 bges EM1CON1 // ...|X| >= 1/4
0638 bra EM1SM
0639
0640 EM1CON1:
0641 //--Step 1.3
0642 //--The case |X| >= 1/4
0643 movew 4(%a0),%d0 // ...expo. and partial sig. of |X|
0644 cmpil #0x4004C215,%d0 // ...70log2 rounded up to 16 bits
0645 bles EM1MAIN // ...1/4 <= |X| <= 70log2
0646 bra EM1BIG
0647
0648 EM1MAIN:
0649 //--Step 2.
0650 //--This is the case: 1/4 <= |X| <= 70 log2.
0651 fmovex (%a0),%fp0 // ...load input from (a0)
0652
0653 fmovex %fp0,%fp1
0654 fmuls #0x42B8AA3B,%fp0 // ...64/log2 * X
0655 fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2
0656 // MOVE.W #$3F81,EM1A4 ...prefetch in CB mode
0657 fmovel %fp0,%d0 // ...N = int( X * 64/log2 )
0658 lea EXPTBL,%a1
0659 fmovel %d0,%fp0 // ...convert to floating-format
0660
0661 movel %d0,L_SCR1(%a6) // ...save N temporarily
0662 andil #0x3F,%d0 // ...D0 is J = N mod 64
0663 lsll #4,%d0
0664 addal %d0,%a1 // ...address of 2^(J/64)
0665 movel L_SCR1(%a6),%d0
0666 asrl #6,%d0 // ...D0 is M
0667 movel %d0,L_SCR1(%a6) // ...save a copy of M
0668 // MOVE.W #$3FDC,L2 ...prefetch L2 in CB mode
0669
0670 //--Step 3.
0671 //--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
0672 //--a0 points to 2^(J/64), D0 and a1 both contain M
0673 fmovex %fp0,%fp2
0674 fmuls #0xBC317218,%fp0 // ...N * L1, L1 = lead(-log2/64)
0675 fmulx L2,%fp2 // ...N * L2, L1+L2 = -log2/64
0676 faddx %fp1,%fp0 // ...X + N*L1
0677 faddx %fp2,%fp0 // ...fp0 is R, reduced arg.
0678 // MOVE.W #$3FC5,EM1A2 ...load EM1A2 in cache
0679 addiw #0x3FFF,%d0 // ...D0 is biased expo. of 2^M
0680
0681 //--Step 4.
0682 //--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
0683 //-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
0684 //--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
0685 //--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]
0686
0687 fmovex %fp0,%fp1
0688 fmulx %fp1,%fp1 // ...fp1 IS S = R*R
0689
0690 fmoves #0x3950097B,%fp2 // ...fp2 IS a6
0691 // MOVE.W #0,2(%a1) ...load 2^(J/64) in cache
0692
0693 fmulx %fp1,%fp2 // ...fp2 IS S*A6
0694 fmovex %fp1,%fp3
0695 fmuls #0x3AB60B6A,%fp3 // ...fp3 IS S*A5
0696
0697 faddd EM1A4,%fp2 // ...fp2 IS A4+S*A6
0698 faddd EM1A3,%fp3 // ...fp3 IS A3+S*A5
0699 movew %d0,SC(%a6) // ...SC is 2^(M) in extended
0700 clrw SC+2(%a6)
0701 movel #0x80000000,SC+4(%a6)
0702 clrl SC+8(%a6)
0703
0704 fmulx %fp1,%fp2 // ...fp2 IS S*(A4+S*A6)
0705 movel L_SCR1(%a6),%d0 // ...D0 is M
0706 negw %d0 // ...D0 is -M
0707 fmulx %fp1,%fp3 // ...fp3 IS S*(A3+S*A5)
0708 addiw #0x3FFF,%d0 // ...biased expo. of 2^(-M)
0709 faddd EM1A2,%fp2 // ...fp2 IS A2+S*(A4+S*A6)
0710 fadds #0x3F000000,%fp3 // ...fp3 IS A1+S*(A3+S*A5)
0711
0712 fmulx %fp1,%fp2 // ...fp2 IS S*(A2+S*(A4+S*A6))
0713 oriw #0x8000,%d0 // ...signed/expo. of -2^(-M)
0714 movew %d0,ONEBYSC(%a6) // ...OnebySc is -2^(-M)
0715 clrw ONEBYSC+2(%a6)
0716 movel #0x80000000,ONEBYSC+4(%a6)
0717 clrl ONEBYSC+8(%a6)
0718 fmulx %fp3,%fp1 // ...fp1 IS S*(A1+S*(A3+S*A5))
0719 // ...fp3 released
0720
0721 fmulx %fp0,%fp2 // ...fp2 IS R*S*(A2+S*(A4+S*A6))
0722 faddx %fp1,%fp0 // ...fp0 IS R+S*(A1+S*(A3+S*A5))
0723 // ...fp1 released
0724
0725 faddx %fp2,%fp0 // ...fp0 IS EXP(R)-1
0726 // ...fp2 released
0727 fmovemx (%a7)+,%fp2-%fp2/%fp3 // ...fp2 restored
0728
0729 //--Step 5
0730 //--Compute 2^(J/64)*p
0731
0732 fmulx (%a1),%fp0 // ...2^(J/64)*(Exp(R)-1)
0733
0734 //--Step 6
0735 //--Step 6.1
0736 movel L_SCR1(%a6),%d0 // ...retrieve M
0737 cmpil #63,%d0
0738 bles MLE63
0739 //--Step 6.2 M >= 64
0740 fmoves 12(%a1),%fp1 // ...fp1 is t
0741 faddx ONEBYSC(%a6),%fp1 // ...fp1 is t+OnebySc
0742 faddx %fp1,%fp0 // ...p+(t+OnebySc), fp1 released
0743 faddx (%a1),%fp0 // ...T+(p+(t+OnebySc))
0744 bras EM1SCALE
0745 MLE63:
0746 //--Step 6.3 M <= 63
0747 cmpil #-3,%d0
0748 bges MGEN3
0749 MLTN3:
0750 //--Step 6.4 M <= -4
0751 fadds 12(%a1),%fp0 // ...p+t
0752 faddx (%a1),%fp0 // ...T+(p+t)
0753 faddx ONEBYSC(%a6),%fp0 // ...OnebySc + (T+(p+t))
0754 bras EM1SCALE
0755 MGEN3:
0756 //--Step 6.5 -3 <= M <= 63
0757 fmovex (%a1)+,%fp1 // ...fp1 is T
0758 fadds (%a1),%fp0 // ...fp0 is p+t
0759 faddx ONEBYSC(%a6),%fp1 // ...fp1 is T+OnebySc
0760 faddx %fp1,%fp0 // ...(T+OnebySc)+(p+t)
0761
0762 EM1SCALE:
0763 //--Step 6.6
0764 fmovel %d1,%FPCR
0765 fmulx SC(%a6),%fp0
0766
0767 bra t_frcinx
0768
0769 EM1SM:
0770 //--Step 7 |X| < 1/4.
0771 cmpil #0x3FBE0000,%d0 // ...2^(-65)
0772 bges EM1POLY
0773
0774 EM1TINY:
0775 //--Step 8 |X| < 2^(-65)
0776 cmpil #0x00330000,%d0 // ...2^(-16312)
0777 blts EM12TINY
0778 //--Step 8.2
0779 movel #0x80010000,SC(%a6) // ...SC is -2^(-16382)
0780 movel #0x80000000,SC+4(%a6)
0781 clrl SC+8(%a6)
0782 fmovex (%a0),%fp0
0783 fmovel %d1,%FPCR
0784 faddx SC(%a6),%fp0
0785
0786 bra t_frcinx
0787
0788 EM12TINY:
0789 //--Step 8.3
0790 fmovex (%a0),%fp0
0791 fmuld TWO140,%fp0
0792 movel #0x80010000,SC(%a6)
0793 movel #0x80000000,SC+4(%a6)
0794 clrl SC+8(%a6)
0795 faddx SC(%a6),%fp0
0796 fmovel %d1,%FPCR
0797 fmuld TWON140,%fp0
0798
0799 bra t_frcinx
0800
0801 EM1POLY:
0802 //--Step 9 exp(X)-1 by a simple polynomial
0803 fmovex (%a0),%fp0 // ...fp0 is X
0804 fmulx %fp0,%fp0 // ...fp0 is S := X*X
0805 fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2
0806 fmoves #0x2F30CAA8,%fp1 // ...fp1 is B12
0807 fmulx %fp0,%fp1 // ...fp1 is S*B12
0808 fmoves #0x310F8290,%fp2 // ...fp2 is B11
0809 fadds #0x32D73220,%fp1 // ...fp1 is B10+S*B12
0810
0811 fmulx %fp0,%fp2 // ...fp2 is S*B11
0812 fmulx %fp0,%fp1 // ...fp1 is S*(B10 + ...
0813
0814 fadds #0x3493F281,%fp2 // ...fp2 is B9+S*...
0815 faddd EM1B8,%fp1 // ...fp1 is B8+S*...
0816
0817 fmulx %fp0,%fp2 // ...fp2 is S*(B9+...
0818 fmulx %fp0,%fp1 // ...fp1 is S*(B8+...
0819
0820 faddd EM1B7,%fp2 // ...fp2 is B7+S*...
0821 faddd EM1B6,%fp1 // ...fp1 is B6+S*...
0822
0823 fmulx %fp0,%fp2 // ...fp2 is S*(B7+...
0824 fmulx %fp0,%fp1 // ...fp1 is S*(B6+...
0825
0826 faddd EM1B5,%fp2 // ...fp2 is B5+S*...
0827 faddd EM1B4,%fp1 // ...fp1 is B4+S*...
0828
0829 fmulx %fp0,%fp2 // ...fp2 is S*(B5+...
0830 fmulx %fp0,%fp1 // ...fp1 is S*(B4+...
0831
0832 faddd EM1B3,%fp2 // ...fp2 is B3+S*...
0833 faddx EM1B2,%fp1 // ...fp1 is B2+S*...
0834
0835 fmulx %fp0,%fp2 // ...fp2 is S*(B3+...
0836 fmulx %fp0,%fp1 // ...fp1 is S*(B2+...
0837
0838 fmulx %fp0,%fp2 // ...fp2 is S*S*(B3+...)
0839 fmulx (%a0),%fp1 // ...fp1 is X*S*(B2...
0840
0841 fmuls #0x3F000000,%fp0 // ...fp0 is S*B1
0842 faddx %fp2,%fp1 // ...fp1 is Q
0843 // ...fp2 released
0844
0845 fmovemx (%a7)+,%fp2-%fp2/%fp3 // ...fp2 restored
0846
0847 faddx %fp1,%fp0 // ...fp0 is S*B1+Q
0848 // ...fp1 released
0849
0850 fmovel %d1,%FPCR
0851 faddx (%a0),%fp0
0852
0853 bra t_frcinx
0854
0855 EM1BIG:
0856 //--Step 10 |X| > 70 log2
0857 movel (%a0),%d0
0858 cmpil #0,%d0
0859 bgt EXPC1
0860 //--Step 10.2
0861 fmoves #0xBF800000,%fp0 // ...fp0 is -1
0862 fmovel %d1,%FPCR
0863 fadds #0x00800000,%fp0 // ...-1 + 2^(-126)
0864
0865 bra t_frcinx
0866
0867 |end