stan.s
上传用户:baixin
上传日期:2008-03-13
资源大小:4795k
文件大小:14k
开发平台:

MultiPlatform

  1. /* stan.s - Motorola 68040 FP tangent routines (EXC) */
  2. /* Copyright 1991-1993 Wind River Systems, Inc. */
  3. .data
  4. .globl _copyright_wind_river
  5. .long _copyright_wind_river
  6. /*
  7. modification history
  8. --------------------
  9. 01f,21jul93,kdl  added .text (SPR #2372).
  10. 01e,23aug92,jcf  changed bxxx to jxx.
  11. 01d,26may92,rrr  the tree shuffle
  12. 01c,10jan92,kdl  general cleanup.
  13. 01b,17dec91,kdl  put in changes from Motorola v3.3 (from FPSP 2.1):
  14.  reduce argument by one step before general reduction
  15.  loop.
  16. 01a,15aug91,kdl  original version, from Motorola FPSP v2.0.
  17. */
  18. /*
  19. DESCRIPTION
  20. stansa 3.2 12/18/90
  21. The entry point __x_stan computes the tangent of
  22. an input argument;
  23. __x_stand does the same except for denormalized input.
  24. Input: Double-extended number X in location pointed to
  25. by address register a0.
  26. Output: The value tan(X) returned in floating-point register Fp0.
  27. Accuracy and Monotonicity: The returned result is within 3 ulp in
  28. 64 significant bit, i.e. within 0.5001 ulp to 53 bits if the
  29. result is subsequently rounded to double precision. The
  30. result is provably monotonic in double precision.
  31. Speed: The program sTAN takes approximately 170 cycles for
  32. input argument X such that |X| < 15Pi, which is the the usual
  33. situation.
  34. Algorithm:
  35. 1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.
  36. 2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let
  37. k = N mod 2, so in particular, k = 0 or 1.
  38. 3. If k is odd, go to 5.
  39. 4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a
  40. rational function U/V where
  41. U = r + r*s*(P1 + s*(P2 + s*P3)), and
  42. V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))),  s = r*r.
  43. Exit.
  44. 4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a
  45. rational function U/V where
  46. U = r + r*s*(P1 + s*(P2 + s*P3)), and
  47. V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,
  48. -Cot(r) = -V/U. Exit.
  49. 6. If |X| > 1, go to 8.
  50. 7. (|X|<2**(-40)) Tan(X) = X. Exit.
  51. 8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2.
  52. Copyright (C) Motorola, Inc. 1990
  53. All Rights Reserved
  54. THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
  55. The copyright notice above does not evidence any
  56. actual or intended publication of such source code.
  57. STAN idnt 2,1 Motorola 040 Floating Point Software Package
  58. section 8
  59. NOMANUAL
  60. */
  61. #include "fpsp040E.h"
  62. BOUNDS1: .long 0x3FD78000,0x4004BC7E
  63. TWOBYPI: .long 0x3FE45F30,0x6DC9C883
  64. TANQ4: .long 0x3EA0B759,0xF50F8688
  65. TANP3: .long 0xBEF2BAA5,0xA8924F04
  66. TANQ3: .long 0xBF346F59,0xB39BA65F,0x00000000,0x00000000
  67. TANP2: .long 0x3FF60000,0xE073D3FC,0x199C4A00,0x00000000
  68. TANQ2: .long 0x3FF90000,0xD23CD684,0x15D95FA1,0x00000000
  69. TANP1: .long 0xBFFC0000,0x8895A6C5,0xFB423BCA,0x00000000
  70. TANQ1: .long 0xBFFD0000,0xEEF57E0D,0xA84BC8CE,0x00000000
  71. INVTWOPI: .long 0x3FFC0000,0xA2F9836E,0x4E44152A,0x00000000
  72. TWOPI1: .long 0x40010000,0xC90FDAA2,0x00000000,0x00000000
  73. TWOPI2: .long 0x3FDF0000,0x85A308D4,0x00000000,0x00000000
  74. |--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING
  75. |--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT
  76. |--MOST 69 BITS LONG.
  77. .globl __x_PITBL
  78. __x_PITBL:
  79.   .long  0xC0040000,0xC90FDAA2,0x2168C235,0x21800000
  80.   .long  0xC0040000,0xC2C75BCD,0x105D7C23,0xA0D00000
  81.   .long  0xC0040000,0xBC7EDCF7,0xFF523611,0xA1E80000
  82.   .long  0xC0040000,0xB6365E22,0xEE46F000,0x21480000
  83.   .long  0xC0040000,0xAFEDDF4D,0xDD3BA9EE,0xA1200000
  84.   .long  0xC0040000,0xA9A56078,0xCC3063DD,0x21FC0000
  85.   .long  0xC0040000,0xA35CE1A3,0xBB251DCB,0x21100000
  86.   .long  0xC0040000,0x9D1462CE,0xAA19D7B9,0xA1580000
  87.   .long  0xC0040000,0x96CBE3F9,0x990E91A8,0x21E00000
  88.   .long  0xC0040000,0x90836524,0x88034B96,0x20B00000
  89.   .long  0xC0040000,0x8A3AE64F,0x76F80584,0xA1880000
  90.   .long  0xC0040000,0x83F2677A,0x65ECBF73,0x21C40000
  91.   .long  0xC0030000,0xFB53D14A,0xA9C2F2C2,0x20000000
  92.   .long  0xC0030000,0xEEC2D3A0,0x87AC669F,0x21380000
  93.   .long  0xC0030000,0xE231D5F6,0x6595DA7B,0xA1300000
  94.   .long  0xC0030000,0xD5A0D84C,0x437F4E58,0x9FC00000
  95.   .long  0xC0030000,0xC90FDAA2,0x2168C235,0x21000000
  96.   .long  0xC0030000,0xBC7EDCF7,0xFF523611,0xA1680000
  97.   .long  0xC0030000,0xAFEDDF4D,0xDD3BA9EE,0xA0A00000
  98.   .long  0xC0030000,0xA35CE1A3,0xBB251DCB,0x20900000
  99.   .long  0xC0030000,0x96CBE3F9,0x990E91A8,0x21600000
  100.   .long  0xC0030000,0x8A3AE64F,0x76F80584,0xA1080000
  101.   .long  0xC0020000,0xFB53D14A,0xA9C2F2C2,0x1F800000
  102.   .long  0xC0020000,0xE231D5F6,0x6595DA7B,0xA0B00000
  103.   .long  0xC0020000,0xC90FDAA2,0x2168C235,0x20800000
  104.   .long  0xC0020000,0xAFEDDF4D,0xDD3BA9EE,0xA0200000
  105.   .long  0xC0020000,0x96CBE3F9,0x990E91A8,0x20E00000
  106.   .long  0xC0010000,0xFB53D14A,0xA9C2F2C2,0x1F000000
  107.   .long  0xC0010000,0xC90FDAA2,0x2168C235,0x20000000
  108.   .long  0xC0010000,0x96CBE3F9,0x990E91A8,0x20600000
  109.   .long  0xC0000000,0xC90FDAA2,0x2168C235,0x1F800000
  110.   .long  0xBFFF0000,0xC90FDAA2,0x2168C235,0x1F000000
  111.   .long  0x00000000,0x00000000,0x00000000,0x00000000
  112.   .long  0x3FFF0000,0xC90FDAA2,0x2168C235,0x9F000000
  113.   .long  0x40000000,0xC90FDAA2,0x2168C235,0x9F800000
  114.   .long  0x40010000,0x96CBE3F9,0x990E91A8,0xA0600000
  115.   .long  0x40010000,0xC90FDAA2,0x2168C235,0xA0000000
  116.   .long  0x40010000,0xFB53D14A,0xA9C2F2C2,0x9F000000
  117.   .long  0x40020000,0x96CBE3F9,0x990E91A8,0xA0E00000
  118.   .long  0x40020000,0xAFEDDF4D,0xDD3BA9EE,0x20200000
  119.   .long  0x40020000,0xC90FDAA2,0x2168C235,0xA0800000
  120.   .long  0x40020000,0xE231D5F6,0x6595DA7B,0x20B00000
  121.   .long  0x40020000,0xFB53D14A,0xA9C2F2C2,0x9F800000
  122.   .long  0x40030000,0x8A3AE64F,0x76F80584,0x21080000
  123.   .long  0x40030000,0x96CBE3F9,0x990E91A8,0xA1600000
  124.   .long  0x40030000,0xA35CE1A3,0xBB251DCB,0xA0900000
  125.   .long  0x40030000,0xAFEDDF4D,0xDD3BA9EE,0x20A00000
  126.   .long  0x40030000,0xBC7EDCF7,0xFF523611,0x21680000
  127.   .long  0x40030000,0xC90FDAA2,0x2168C235,0xA1000000
  128.   .long  0x40030000,0xD5A0D84C,0x437F4E58,0x1FC00000
  129.   .long  0x40030000,0xE231D5F6,0x6595DA7B,0x21300000
  130.   .long  0x40030000,0xEEC2D3A0,0x87AC669F,0xA1380000
  131.   .long  0x40030000,0xFB53D14A,0xA9C2F2C2,0xA0000000
  132.   .long  0x40040000,0x83F2677A,0x65ECBF73,0xA1C40000
  133.   .long  0x40040000,0x8A3AE64F,0x76F80584,0x21880000
  134.   .long  0x40040000,0x90836524,0x88034B96,0xA0B00000
  135.   .long  0x40040000,0x96CBE3F9,0x990E91A8,0xA1E00000
  136.   .long  0x40040000,0x9D1462CE,0xAA19D7B9,0x21580000
  137.   .long  0x40040000,0xA35CE1A3,0xBB251DCB,0xA1100000
  138.   .long  0x40040000,0xA9A56078,0xCC3063DD,0xA1FC0000
  139.   .long  0x40040000,0xAFEDDF4D,0xDD3BA9EE,0x21200000
  140.   .long  0x40040000,0xB6365E22,0xEE46F000,0xA1480000
  141.   .long  0x40040000,0xBC7EDCF7,0xFF523611,0x21E80000
  142.   .long  0x40040000,0xC2C75BCD,0x105D7C23,0x20D00000
  143.   .long  0x40040000,0xC90FDAA2,0x2168C235,0xA1800000
  144. #define INARG FP_SCR4
  145. #define TWOTO63      L_SCR1
  146. #define ENDFLAG L_SCR2
  147. #define N            L_SCR3
  148. | xref __x_t_frcinx
  149. | xref __x_t_extdnrm
  150. .text
  151. .globl __x_stand
  152. __x_stand:
  153. |--TAN(X) = X FOR DENORMALIZED X
  154. jra  __x_t_extdnrm
  155. .globl __x_stan
  156. __x_stan:
  157. fmovex a0@,fp0 |...lOAD INPUT
  158. movel A0@,d0
  159. movew A0@(4),d0
  160. andil #0x7FFFFFFF,d0
  161. cmpil #0x3FD78000,d0 |...|X| >= 2**(-40)?
  162. jge  TANOK1
  163. jra  TANSM
  164. TANOK1:
  165. cmpil #0x4004BC7E,d0 |...|X| < 15 PI?
  166. jlt  TANMAIN
  167. jra  REDUCEX
  168. TANMAIN:
  169. |--THIS IS THE USUAL CASE, |X| <= 15 PI.
  170. |--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
  171. fmovex fp0,fp1
  172. fmuld TWOBYPI,fp1 |...X*2/PI
  173. |--HIDE THE NEXT TWO INSTRUCTIONS
  174. lea __x_PITBL+0x200,a1 |...TABLE OF N*PI/2, N = -32,...,32
  175. |--FP1 IS NOW READY
  176. fmovel fp1,d0 |...CONVERT TO INTEGER
  177. asll #4,d0
  178. addal d0,a1 |...ADDRESS N*PIBY2 IN Y1, Y2
  179. fsubx a1@+,fp0 |...X-Y1
  180. |--HIDE THE NEXT ONE
  181. fsubs a1@,fp0 |...FP0 IS R = (X-Y1)-Y2
  182. rorl #5,d0
  183. andil #0x80000000,d0 |...D0 WAS ODD IFF d0 < 0
  184. TANCONT:
  185. cmpil #0,d0
  186. jlt  NODD
  187. fmovex fp0,fp1
  188. fmulx fp1,fp1   |...S = R*R
  189. fmoved TANQ4,fp3
  190. fmoved TANP3,fp2
  191. fmulx fp1,fp3   |...SQ4
  192. fmulx fp1,fp2   |...SP3
  193. faddd TANQ3,fp3 |...Q3+SQ4
  194. faddx TANP2,fp2 |...P2+SP3
  195. fmulx fp1,fp3   |...S(Q3+SQ4)
  196. fmulx fp1,fp2   |...S(P2+SP3)
  197. faddx TANQ2,fp3 |...Q2+S(Q3+SQ4)
  198. faddx TANP1,fp2 |...P1+S(P2+SP3)
  199. fmulx fp1,fp3   |...S(Q2+S(Q3+SQ4))
  200. fmulx fp1,fp2   |...S(P1+S(P2+SP3))
  201. faddx TANQ1,fp3 |...Q1+S(Q2+S(Q3+SQ4))
  202. fmulx fp0,fp2   |...RS(P1+S(P2+SP3))
  203. fmulx fp3,fp1   |...S(Q1+S(Q2+S(Q3+SQ4)))
  204. faddx fp2,fp0   |...R+RS(P1+S(P2+SP3))
  205. .long 0xf23c44a2,0x3f800000 /*  fadds  &0x3F800000,fp1 */
  206. fmovel d1,fpcr | restore users exceptions
  207. fdivx fp1,fp0 | last inst - possible exception set
  208. jra  __x_t_frcinx
  209. NODD:
  210. fmovex fp0,fp1
  211. fmulx fp0,fp0   |...S = R*R
  212. fmoved TANQ4,fp3
  213. fmoved TANP3,fp2
  214. fmulx fp0,fp3   |...SQ4
  215. fmulx fp0,fp2   |...SP3
  216. faddd TANQ3,fp3 |...Q3+SQ4
  217. faddx TANP2,fp2 |...P2+SP3
  218. fmulx fp0,fp3   |...S(Q3+SQ4)
  219. fmulx fp0,fp2   |...S(P2+SP3)
  220. faddx TANQ2,fp3 |...Q2+S(Q3+SQ4)
  221. faddx TANP1,fp2 |...P1+S(P2+SP3)
  222. fmulx fp0,fp3   |...S(Q2+S(Q3+SQ4))
  223. fmulx fp0,fp2   |...S(P1+S(P2+SP3))
  224. faddx TANQ1,fp3 |...Q1+S(Q2+S(Q3+SQ4))
  225. fmulx fp1,fp2   |...RS(P1+S(P2+SP3))
  226. fmulx fp3,fp0   |...S(Q1+S(Q2+S(Q3+SQ4)))
  227. faddx fp2,fp1   |...R+RS(P1+S(P2+SP3))
  228. .long 0xf23c4422,0x3f800000 /*  fadds  &0x3F800000,fp0 */
  229. fmovex fp1,a7@-
  230. eoril #0x80000000,a7@
  231. fmovel d1,fpcr   | restore users exceptions
  232. fdivx a7@+,fp0 | last inst - possible exception set
  233. jra  __x_t_frcinx
  234. TANBORS:
  235. |--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION.
  236. |--IF |X| < 2**(-40), RETURN X OR 1.
  237. cmpil #0x3FFF8000,d0
  238. jgt  REDUCEX
  239. TANSM:
  240. fmovex fp0,a7@-
  241. fmovel d1,fpcr  | restore users exceptions
  242. fmovex a7@+,fp0 | last inst - posibble exception set
  243. jra  __x_t_frcinx
  244. REDUCEX:
  245. |--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
  246. |--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
  247. |--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
  248. fmovemx fp2-fp5,A7@- |...save fp2 through fp5
  249. movel d2,A7@-
  250. .long 0xf23c4480,0x00000000 /*  fmoves  &0x00000000,fp1 */
  251. |--If compact form of abs(arg) in d0=0x7ffeffff, argument is so large that
  252. |--there is a danger of unwanted overflow in first LOOP iteration.  In this
  253. |--case, reduce argument by one remainder step to make subsequent reduction
  254. |--safe.
  255. cmpil #0x7ffeffff,d0 | is argument dangerously large?
  256. jne  LOOP
  257. movel #0x7ffe0000,a6@(FP_SCR2) | yes
  258. | | create 2**16383*PI/2
  259. movel #0xc90fdaa2,a6@(FP_SCR2+4)
  260. clrl a6@(FP_SCR2+8)
  261. ftstx fp0 | test sign of argument
  262. movel #0x7fdc0000,a6@(FP_SCR3) | create low half of 2**16383*
  263. | | PI/2 at FP_SCR3
  264. movel #0x85a308d3,a6@(FP_SCR3+4)
  265. clrl   a6@(FP_SCR3+8)
  266. fblt red_neg
  267. orw #0x8000,a6@(FP_SCR2) | positive arg
  268. orw #0x8000,a6@(FP_SCR3)
  269. red_neg:
  270. faddx  a6@(FP_SCR2),fp0 | high part of reduction is exact
  271. fmovex  fp0,fp1 | save high result in fp1
  272. faddx  a6@(FP_SCR3),fp0 | low part of reduction
  273. fsubx   fp0,fp1 | determine low component of result
  274. faddx  a6@(FP_SCR3),fp1 | fp0/fp1 are reduced argument.
  275. |--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
  276. |--integer quotient will be stored in N
  277. |--Intermeditate remainder is 66-bit long|  (R,r) in (FP0,FP1)
  278. LOOP:
  279. fmovex fp0,a6@(INARG) |...+-2**K * F, 1 <= F < 2
  280. movew a6@(INARG),d0
  281.         movel          d0,a1 |...save a copy of d0
  282. andil #0x00007FFF,d0
  283. subil #0x00003FFF,d0 |...D0 IS K
  284. cmpil #28,d0
  285. jle  LASTLOOP
  286. CONTLOOP:
  287. subil #27,d0  |...D0 IS L := K-27
  288. movel #0,a6@(ENDFLAG)
  289. jra  WORK
  290. LASTLOOP:
  291. clrl d0 |...D0 IS L := 0
  292. movel #1,a6@(ENDFLAG)
  293. WORK:
  294. |--FIND THE REMAINDER OF (R,r) W.R.T. 2**L * (PI/2). L IS SO CHOSEN
  295. |--THAT INT( X * (2/PI) / 2**(L) ) < 2**29.
  296. |--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
  297. |--2**L * (PIby2_1), 2**L * (PIby2_2)
  298. movel #0x00003FFE,d2 |...BIASED EXPO OF 2/PI
  299. subl d0,d2 |...BIASED EXPO OF 2**(-L)*(2/PI)
  300. movel #0xA2F9836E,a6@(FP_SCR1+4)
  301. movel #0x4E44152A,a6@(FP_SCR1+8)
  302. movew d2,a6@(FP_SCR1) |...FP_SCR1 is 2**(-L)*(2/PI)
  303. fmovex fp0,fp2
  304. fmulx a6@(FP_SCR1),fp2
  305. |--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
  306. /* |--FLOATING POINT FORMAT, THE TWO FMOVE'S FMOVE.l FP <--> N */
  307. |--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
  308. |--(SIGN(INARG)*2**63 + FP2) - SIGN(INARG)*2**63 WILL GIVE
  309. |--US THE DESIRED VALUE IN FLOATING POINT.
  310. |--HIDE SIX CYCLES OF INSTRUCTION
  311.         movel a1,d2
  312.         swap d2
  313. andil #0x80000000,d2
  314. oril #0x5F000000,d2 |...D2 IS SIGN(INARG)*2**63 IN SGL
  315. movel d2,a6@(TWOTO63)
  316. movel d0,d2
  317. addil #0x00003FFF,d2 |...BIASED EXPO OF 2**L * (PI/2)
  318. |--FP2 IS READY
  319. fadds a6@(TWOTO63),fp2 |...THE FRACTIONAL PART OF fp1 IS ROUNDED
  320. |--HIDE 4 CYCLES OF INSTRUCTION|  creating 2**(L)*Piby2_1  and  2**(L)*Piby2_2
  321.         movew d2,a6@(FP_SCR2)
  322. clrw           a6@(FP_SCR2+2)
  323. movel #0xC90FDAA2,a6@(FP_SCR2+4)
  324. clrl a6@(FP_SCR2+8) |...FP_SCR2 is  2**(L) * Piby2_1
  325. |--FP2 IS READY
  326. fsubs a6@(TWOTO63),fp2 |...FP2 is N
  327. addil #0x00003FDD,d0
  328.         movew d0,a6@(FP_SCR3)
  329. clrw           a6@(FP_SCR3+2)
  330. movel #0x85A308D3,a6@(FP_SCR3+4)
  331. clrl a6@(FP_SCR3+8) |...FP_SCR3 is 2**(L) * Piby2_2
  332. movel a6@(ENDFLAG),d0
  333. |--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
  334. |--P2 = 2**(L) * Piby2_2
  335. fmovex fp2,fp4
  336. fmulx a6@(FP_SCR2),fp4 |...w = N*P1
  337. fmovex fp2,fp5
  338. fmulx a6@(FP_SCR3),fp5 |...w = N*P2
  339. fmovex fp4,fp3
  340. |--we want P+p = W+w  but  |p| <= half ulp of P
  341. |--Then, we need to compute  A := R-P   and  a := r-p
  342. faddx fp5,fp3 |...FP3 is P
  343. fsubx fp3,fp4 |...w-P
  344. fsubx fp3,fp0 |...FP0 is A := R - P
  345.         faddx fp5,fp4 |...FP4 is p = (W-P)+w
  346. fmovex fp0,fp3 |...FP3 A
  347. fsubx fp4,fp1 |...FP1 is a := r - p
  348. |--Now we need to normalize (A,a) to  "new (R,r)" where R+r = A+a but
  349. |--|r| <= half ulp of R.
  350. faddx fp1,fp0 |...FP0 is R := A+a
  351. |--No need to calculate r if this is the last loop
  352. cmpil #0,d0
  353. jgt  RESTORE
  354. |--Need to calculate r
  355. fsubx fp0,fp3 |...A-R
  356. faddx fp3,fp1 |...FP1 is r := (A-R)+a
  357. jra  LOOP
  358. RESTORE:
  359.         fmovel fp2,a6@(N)
  360. movel A7@+,d2
  361. fmovemx A7@+,fp2-fp5
  362. movel a6@(N),d0
  363.         rorl #1,d0
  364. jra  TANCONT
  365. | end