//*************************************************************
	// Name: sincos.S
	// Author: Larry Pyeatt
	// Date: 2/22/2014		
	//*************************************************************
	/*
	This is a version of the sin/cos functions that uses symmetry
	to enhance precision.  The actual sin and cos routines convert
	the input to lie in the range 0 to pi/2, then pass it to the
	worker routine that computes the result.  The result is then
	converted back to correspond with the original input.
	
	We calculate sin(x) using the first seven terms of the Taylor
	Series:	 sin(x) = x - x^3/3! + x^5/5! - x^7/7! + x^9/9! - ...
	and we calculate cos(x) using the relationship:
	cos(x) = sin(pi/2-x)
	
	We start by defining a helper function, which we call sinq.
	The sinq function calculates sin(x) for 0<=x<=pi/2.  The
	input, x, must be an S(1,30) number. The factors of x that
	sinq will use are: x, x^3, x^5, x^7, x^9, x^11, and x^13.
	
	Dividing by (2n+1)! is changed to a multiply by a coefficient
	as we compute each term, we will add it to the sum, stored as
	an S(2,61). Therefore, we want the product of each power of x
	and its coefficient to be converted to an S(2,61) for the add.
	It turns out that this just requires a small shift.
	
	We build a table to decide how much to shift each product
	before adding it to the total.  x^2 will be stored as an
	S(2,29), and x is given as an S(1,30).  After multiplying x by
	x^2, we will shift left one bit, so the procedure is:
        x   will be an  S(1,30) - multiply by x^2 and shift left
	x^3 will be an  S(3,28) - multiply by x^2 and shift left
	x^5 will be an  S(5,26) - multiply by x^2 and shift left
	x^7 will be an  S(7,24) - multiply by x^2 and shift left
	x^9 will be an  S(9,22) - multiply by x^2 and shift left
	x^11 will be an S(11,20)- multiply by x^2 and shift left
	x^13 will be an S(13,18)- multiply by x^2 and shift left
	x^15 will be an S(15,16)- multiply by x^2 and shift left
	x^17 will be an S(17,14)
	*/
	/*
	The following table shows the constant coefficients
	needed for calculating each term.

	-1/3! =  AAAAAAAA as an S(-2,32)
	 1/5! =  44444445 as an S(-6,32)
	-1/7! =  97F97F97 as an S(-12,32)
	 1/9! =  5C778E96 as an S(-18,32)
	-1/11! = 9466EA60 as an S(-25,32)
	 1/13! = 5849184F as an S(-32,32)
	-1/15! = 94603063 as an S(-40,32)
	 1/17! = 654B1DC1 as an S(-48,32)
	
	Combining the two tables of power and coefficient formats, we
	can now determine how much shift we need after each step in
	order to do all sums in S(2,61) format:
	
	power powerfmt     coef  coeffmt     resultfmt  right shift
	 x     S(1,30)  *   1 (skip the multiply)        1 -> S(2,61)
	 x^3   S(3,28)  *  -1/3!  S(-2,32)  = S(2,61)    0 -> S(2,61)
	 x^5   S(5,26)  *   1/5!  S(-6,32)  = S(0,63)    2 -> S(2,61)
	 x^7   S(7,24)  *  -1/7!  S(-12,32) = S(-4,64)   6 -> S(2,61)
	 x^9   S(9,22)  *   1/9!  S(-18,32) = S(-8,64)   10-> S(2,61)
	 x^11  S(11,20) *  -1/11! S(-25,32) = S(-13,64)  15-> S(2,61)
	 x^13  S(13,18) *   1/13! S(-32,32) = S(-18,64)  20-> S(2,61)
	 x^15  S(15,16) *  -1/15! S(-40,32) = S(-24,64)  26-> S(2,61)
	 x^17  S(17,14) *   1/17! S(-48,32) = S(-30,64)  32-> S(2,61)
        Note: the last row has a shift of 32, which indicates that
	      it can contribute no more than 1/2 bit of precision
	      to the 32-bit result.
	*/

	.data
	.align  2
	// We will define a few constants that may be useful
	.global	pi
pi:	.word	0x3243F6A8	@ pi as an S(3,28)
	.global	pi_2
pi_2:	.word	0x1921FB54	@ pi/2 as an S(3,28)
	.global	pi_x2
pi_x2:	.word	0x6487ED51	@ 2*pi as an S(3,28)
	
sintab: // This is the table of coefficients and shifts for sinq to use
	.word 0xAAAAAAAA,  0	@ -1/3!  as an S(-2,32)  
	.word 0x44444445,  2	@  1/5!  as an S(-6,32)  
	.word 0x97F97F97,  6	@ -1/7!  as an S(-12,32) 
	.word 0x5C778E96,  10	@  1/9!  as an S(-18,32) 
	.word 0x9466EA60,  15	@ -1/11! as an S(-25,32) 
	.word 0x5849184F,  20	@  1/13! as an S(-32,32)
	.word 0x94603063,  26	@ -1/15! as an S(-40,32)
	.word 0x654B1DC1,  32	@  1/17! as an S(-48,32)
	.equ tablen,(.-sintab)	@ set tablen to size of table
	@@ The '.' refers to the current address counter value.
	@@ Subtracting the address of sintab from the current
	@@ address gives the size of the table. 
	
	.text
	//-------------------------------------------------------------
	// sinq(x)
	// input: x -> S(1,30) s.t. 0 <= x <= pi/2
	// returns sin(x) -> S(1,30)
sinq:	stmfd	sp!,{r4-r11,lr}
	@@ The first term in the Taylor series is simply x, so
	@@ convert x to an S(2,61) by doing an asr in 64 bits,
	@@ and use it to initialize the sum.
	mov	r10, r0,lsl #31	@ low 32 bits of sum
	mov	r11, r0,asr #1	@ high 32 bits of sum
	@@ r11:r10 now contains the sum (currently x) as an S(2,61)
	smull	r2,r4,r0,r0	@ r4 will hold x^2. 
	@@ We are going to convert x^2 to an S(2,28), and round it
	adds	r2,r2,#0x40000000 @ Round the result up by adding 1 to
				@ the first bit that will be lost.
	addcs	r4,r4,#1	@ Propagate the carry.
	lsl	r4,r4,#1	@ Make room for one bit at LSB position
	orr	r4,r4,r2,lsr#31 @ Copy top bit of r2
	@@ r4 now contains x^2 as an S(2,28)
	mov	r5,r0		@ r5 will keep x^(2n-1).
	@@ r5 now contains x as an S(1,30)
	ldr	r6, =sintab	@ get pointer to beginning of table
	add	r7, r6, #tablen	@ get pointer to end of table
	@@ We know that we will always execute the loop 6 times,
	@@ so we use a post-test loop.
sloop:	ldmia	r6!,{r8,r9}     @ Load two values from the table
	@@ r8 now has 1/(2n+1)!
	@@ r9 contains the correcting shift
	smull	r0,r5,r4,r5	@ r5:r0 <- x^(2n+1) as an S(4,59)
	lsl	r5,r5,#1	@ Shift and copy the MSB of the LSW to
	orr	r5,r5,r0,lsr#31 @ LSB of MSW to convert to an S(3,60).
	@@ r5 now contains x^(2n+1) as an S(3,60)
	smulls	r0,r1,r5,r8     @ multiply by reciprocal that we
				@ loaded earlier
	@@ if the result is negative, then add one
	bge	noadd
	adds	r0,r0,#1
	adc	r1,r1,#0
noadd:	@@ Apply correcting right shift to make an S(2,61)
	@@ r9 was loaded from table earlier
	lsr	r0,r0,r9	@ Make room in low word for some bits
	rsb	r2,r9,#32	@ calculate inverse shift amount
	lsl	r3,r1,r2	@ shift upper 32 bits to the left
	orr	r0,r0,r3	@ paste bits into low word
	asr	r1,r1,r9	@ shift upper word right	
	@@ accumulate result in r10:r11
	adds	r10,r10,r0
	adc	r11,r11,r1
	@@ check to see if there is another term to compute
	cmp	r6, r7
	blt	sloop		@ Repeat for every table entry	
	@@ shift result left 1 bit and move to r0
	lsl	r11,r11,#1
	orr	r0,r11,r10,lsr #31
	@@ return the result
	ldmfd	sp!,{r4-r11,pc}
	
	//-------------------------------------------------------------
	// cos(x)  NOTE: The cos(x) function does not return.
	//               It is an alternate entry point to sin(x).
	// input: x -> S(3,28)
	// returns cos(x) -> S(3,28)
	.global	cos
cos:	ldr	r1,=pi_x2	@ load pointer to 2*pi
	ldr	r1,[r1]		@ load 2*pi
	cmp	r0,#0		@ Add 2*pi to x if needed, to make 
	addle	r0,r0,r1	@ sure x does not become too negative.
cosgood:ldr	r1,=pi_2	@ load pointer to pi/2 
	ldr	r1,[r1]		@ load pi/2
	sub	r0,r1,r0	@ cos(x) = sin(pi/2-x)
	@@ now we just fall through into the sin function

	//-------------------------------------------------------------
	// sin(x)
	// input: x -> S(3,28)
	// returns sin(x) -> S(3,28)
	.global	sin
sin:	stmfd	sp!,{lr}
	ldr	r1,=pi_2	@ r1 has pointer to pi/2
	ldr	r2,=pi		@ r2 has pointer to pi	
	ldr	r3,=pi_x2	@ r3 has pointer to pi*2
	ldr	r1,[r1]         @ r1 has pi/2
	ldr	r2,[r2]		@ r2 has pi	
	ldr	r3,[r3]		@ r3 has pi*2

	@@ step 1: make sure x>=0.0 and x<=2pi
negl:	cmp	r0,#0 		@ while(x < 0)	   
	addlt	r0,r0,r3	@   x = x + 2 * pi 
	blt	negl            @ end while        
nonneg:	cmp	r0,r3           @ while(x > pi/2)	   
	subgt	r0,r0,r3	@   x = x - 2 * pi 
	bgt	nonneg		@ end while        

	@@ step 2: find the quadrant and call sinq appropriately
inrange:cmp	r0,r1
	bgt	chkq2
	@@ it is in the first quadrant... just shift and call sinq
	lsl	r0,r0,#2
	bl	sinq
	b	sin_done
chkq2:	cmp	r0,r2
	bgt	chkq3
	@@ it is in the second quadrant... mirror, shift, and call sinq
	sub	r0,r2,r0
	lsl	r0,r0,#2
	bl	sinq
	b	sin_done
chkq3:	add	r1,r1,r2	@ we won't need pi/2 again
	cmp	r0,r1		@ so use r1 to calculate 3pi/2
	bgt	chkq4
	@@ it is in the third quadrant... rotate, shift, call sinq,
	@@ then complement the result
	sub	r0,r0,r2
	lsl	r0,r0,#2
	bl	sinq
	rsb	r0,r0,#0
	b	sin_done
	@@ it is in the fourth quadrant... rotate, mirror, shift,
	@@ call sinq, then complement the result
chkq4:	sub	r0,r0,r2
	sub	r0,r2,r0
	lsl	r0,r0,#2
	bl	sinq
	rsb	r0,r0,#0
sin_done:
	@@ shift result right 2 bits 
	asr	r0,r0,#2
	@@ return the result
	ldmfd	sp!,{pc}
	//-------------------------------------------------------------