/*
Donut Bump Mapping Demo
This demo shows how to use a bump mapping technique using Glide(tm)
Copyright (C) 1999  3Dfx Interactive, Inc.

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/

#include "basics.h"
#include "tlib.h"
#include "torus_xforms.h"

//#define USE_ASM

// local variables

// Current transform matrix: passed to MatrixCopy by SetCurrMatrix and read
// (rows 0, 1 and 3) by XformVertex.
static Matrix gMat;
// Number of XformVertex calls since the last BeginXforms; reported by EndXforms.
static int gNumXformedVerts;
// Saved / modified x87 control words. Only referenced by the commented-out
// inline assembly in BeginXforms and EndXforms, so currently unused.
static unsigned short g_fpu_cw_old, g_fpu_cw_new;
// Constant 1.0f; loaded from memory by the USE_ASM version of XformVertex
// as the numerator of the 1/W divide.
const float ONE = 1.0f;


// Selects the matrix used by subsequent XformVertex calls.
// Hands the file-local gMat and the caller's matrix m to MatrixCopy.
// NOTE(review): MatrixCopy is declared elsewhere; presumably its first
// argument is the destination, i.e. m is copied into gMat -- confirm.
void SetCurrMatrix(Matrix m)
{
	MatrixCopy(gMat, m);
}

// Starts a batch of vertex transforms: resets the counter that XformVertex
// increments and EndXforms returns.
void BeginXforms()
{
	gNumXformedVerts = 0;

	// DISABLED: everything below is commented out, so the FPU control word is
	// currently left untouched. When enabled, the code saved the current
	// control word in g_fpu_cw_old, cleared bits 8 and 9 (mask 0xfcff) and
	// loaded the result from g_fpu_cw_new.
	//
	// change the FPU control word to 24-bit precision
	// bits 8 and 9 determine precision
	// 00 - 24-bits
	// 10 - 53-bits
	// 11 - 64-bits
//	__asm
//	{
//		fnstcw	word ptr [g_fpu_cw_old]
//		mov			ax, [g_fpu_cw_old]
//		and			ax, 0xfcff // 24-bit precision
//		mov			[g_fpu_cw_new], ax
//		fldcw		word ptr [g_fpu_cw_new]
//	}
}

// Ends a batch of vertex transforms.
// Returns the number of XformVertex calls made since the last BeginXforms.
int EndXforms()
{
	// DISABLED: restoring the FPU control word from g_fpu_cw_old is commented
	// out (matching the disabled save in BeginXforms), so nothing is restored.
//	__asm
//	{
//		fldcw		word ptr [g_fpu_cw_old]
//	}

	return gNumXformedVerts;
}

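// XformVertex transforms the vector v by the current matrix gMat
// (the file-level matrix handed to MatrixCopy in SetCurrMatrix).
// Only rows 0, 1 and 3 of the matrix are evaluated (x, y and w); z is not computed.
// It then computes 1/w and homogenizes (i.e. computes x/w and y/w);
// the vert's x, y, and oow fields get set accordingly.
// It assumes that v[3] is 1.0f pre-transform, so only v[0], v[1] and v[2] are read.
// Each call also increments gNumXformedVerts.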
#ifdef USE_ASM
// x87 inline-assembly implementation of XformVertex (compiled only when
// USE_ASM is defined).
//
// The function is __declspec(naked): no prologue/epilogue is generated, so the
// arguments are read straight off the stack:
//   [esp + 4] = vert  (destination GrVertex)
//   [esp + 8] = v     (source Vector)
// It returns with a plain `ret` and therefore does not pop its own arguments.
// NOTE(review): that matches a caller-cleans-stack (cdecl-style) convention --
// confirm against the calling convention the project is built with.
//
// Integer registers written: eax (vertex counter), ecx (address of gMat),
// edx (v during the multiply, then vert for the stores).
// FPU stack: every value pushed is popped again; the three results are
// removed by the final fstp instructions.
//
// gMat is addressed as consecutive 4-byte floats with 16 bytes per row
// (row 0 at +0, row 1 at +16, row 3 at +48). Row 2 (Z) is never read.
//
// Comment notation inside the block: after each FPU instruction the contents
// of the register stack are listed, st(0) first. The number in parentheses is
// presumably the number of cycles until that value is ready -- TODO confirm.
__declspec(naked) void XformVertex(GrVertex *vert, Vector *v)
{
	__asm // 72 cycles (original author's estimate)
	{
		// ******** MATRIX MULTIPLY ********
		//	res[0] = m[0][0]*v[0] + m[0][1]*v[1] + m[0][2]*v[2] + m[0][3];
		//	res[1] = m[1][0]*v[0] + m[1][1]*v[1] + m[1][2]*v[2] + m[1][3];
		//	res[3] = m[3][0]*v[0] + m[3][1]*v[1] + m[3][2]*v[2] + m[3][3];
		// (res[2] is not computed; the three sums are interleaved below,
		//  starting with W, then X, then Y)
		lea			ecx, [gMat] // ecx = gMat
		mov			edx, [esp + 8] // edx = v

		// eax = current vertex count; incremented and stored back further down
		mov			eax, [gNumXformedVerts]

		fld			dword ptr [ecx + 48]
		// m30
		fmul		dword ptr [edx]
		// m30*v0 (2)
		fld			dword ptr [ecx + 48 + 4]
		// m31
		// m30*v0 (1)
		fmul		dword ptr [edx + 4]
		// m31*v1 (2)
		// m30*v0 (0)
		fxch		st(1)
		// m30*v0 (0)
		// m31*v1 (2)
		fadd		dword ptr [ecx + 48 + 12]
		// m30*v0 + m33 (2)
		// m31*v1 (1)
		fld			dword ptr [ecx + 48 + 8]
		// m32
		// m30*v0 + m33 (1)
		// m31*v1 (0)
		fmul		dword ptr [edx + 8]
		// m32*v2 (2)
		// m30*v0 + m33 (0)
		// m31*v1 (0)
		fxch		st(2)
		// m31*v1 (0)
		// m30*v0 + m33 (0)
		// m32*v2 (2)
		faddp		st(1), st
		// m30*v0 + m31*v1 + m33 (2)
		// m32*v2 (1)
		fld			dword ptr [ecx]
		// m00
		// m30*v0 + m31*v1 + m33 (1)
		// m32*v2 (0)
		fmul		dword ptr [edx]
		// m00*v0 (2)
		// m30*v0 + m31*v1 + m33 (0)
		// m32*v2 (0)
		fxch		st(2)
		// m32*v2 (0)
		// m30*v0 + m31*v1 + m33 (0)
		// m00*v0 (2)
		faddp		st(1), st
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (2)
		// m00*v0 (1)
		fld			dword ptr [ecx + 4]
		// m01
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (1)
		// m00*v0 (0)
		fmul		dword ptr [edx + 4]
		// m01*v1 (2)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m00*v0 (0)
		fxch		st(2)
		// m00*v0 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m01*v1 (2)
		fadd		dword ptr [ecx + 12]
		// m00*v0 + m03 (2)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m01*v1 (1)
		fld			dword ptr [ecx + 8]
		// m02
		// m00*v0 + m03 (1)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m01*v1 (0)
		fmul		dword ptr [edx + 8]
		// m02*v2 (2)
		// m00*v0 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m01*v1 (0)
		fxch		st(3)
		// m01*v1 (0)
		// m00*v0 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m02*v2 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 + m03 (2)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m02*v2 (1)
		fld			dword ptr [ecx + 16]
		// m10
		// m00*v0 + m01*v1 + m03 (1)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m02*v2 (0)
		fmul		dword ptr [edx]
		// m10*v0 (2)
		// m00*v0 + m01*v1 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m02*v2 (0)
		fxch		st(3)
		// m02*v2 (0)
		// m00*v0 + m01*v1 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m10*v0 (2)
		faddp		st(1), st
		// X = m00*v0 + m01*v1 + m02*v2 + m03 (2)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m10*v0 (1)
		fld			dword ptr [ecx + 16 + 4]
		// m11
		// X = m00*v0 + m01*v1 + m02*v2 + m03 (1)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m10*v0 (0)
		fmul		dword ptr [edx + 4]
		// m11*v1 (2)
		// X = m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m10*v0 (0)
		fxch		st(3)
		// m10*v0 (0)
		// X = m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m11*v1 (2)
		fadd		dword ptr [ecx + 16 + 12]
		// m10*v0 + m13 (2)
		// X = m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m11*v1 (1)
		fld			dword ptr [ecx + 16 + 8]
		// m12
		// m10*v0 + m13 (1)
		// X = m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m11*v1 (0)
		fmul		dword ptr [edx + 8]
		// m12*v2 (2)
		// m10*v0 + m13 (0)
		// X = m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m11*v1 (0)
		fxch		st(4)
		// m11*v1 (0)
		// m10*v0 + m13 (0)
		// X = m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m12*v2 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 + m13 (2)
		// X = m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// W = m30*v0 + m31*v1 + m32*v2 + m33 (0)
		// m12*v2 (1)

		// v is no longer needed: reuse edx for the output vertex and bump the
		// vertex counter while the FPU is still working on the last adds
		// (presumably placed here to fill otherwise idle cycles)
		mov			edx, [esp + 4] // edx = vert
		inc			eax

		mov			[gNumXformedVerts], eax

		// fold the pending m12*v2 term into the partial sum to finish Y
		faddp		st(3), st
		// X (0)
		// W (0)
		// Y (2)
		fxch		st(1)
		// W (0)
		// X (0)
		// Y (2)

		// ******** DIVIDE ********
		fld			dword ptr [ONE]
		// 1
		// W (0)
		// X (0)
		// Y (1)
		// reverse divide: st(1) = st(0) / st(1) = 1/W, then pop
		fdivrp	st(1), st
		// 1/W (18)
		// X (0)
		// Y (0)

		// ******** stall 18 cycles ********
		fmul		st(1), st
		// 1/W (0)
		// X/W (2)
		// Y (0)
		fxch		st(2)
		// Y (0)
		// X/W (2)
		// 1/W (0)
		// ******** stall 1 cycle (consecutive fmuls) ********
		fmul		st, st(2)
		// Y/W (2)
		// X/W (1)
		// 1/W (0)
		fxch		st(2)
		// 1/W (0)
		// X/W (1)
		// Y/W (2)
		// store and pop the three results: oow = 1/W, x = X/W, y = Y/W
		// (the GR_VERTEX_*_OFFSET constants are scaled by 4 to form byte offsets)
		fstp		dword ptr [edx + 4*GR_VERTEX_OOW_OFFSET]
		fstp		dword ptr [edx + 4*GR_VERTEX_X_OFFSET]
		fstp		dword ptr [edx + 4*GR_VERTEX_Y_OFFSET]

		// naked function: return explicitly, arguments are left on the stack
		ret
	}
}
#else
// Portable C implementation of XformVertex (used when USE_ASM is not defined).
//
// Multiplies *v, treated as (v0, v1, v2, 1), by rows 0, 1 and 3 of gMat to get
// x, y and w, then writes the perspective-divided result into vert:
//   vert->oow = 1/w, vert->x = x/w, vert->y = y/w
// Row 2 (z) is not evaluated. Each call adds one to gNumXformedVerts.
void XformVertex(GrVertex *vert, Vector *v)
{
	// un-homogenized x, y and w: dot products of matrix rows 0, 1 and 3 with v
	const float rawX = gMat[0][0]*(*v)[0] + gMat[0][1]*(*v)[1] + gMat[0][2]*(*v)[2] + gMat[0][3];
	const float rawY = gMat[1][0]*(*v)[0] + gMat[1][1]*(*v)[1] + gMat[1][2]*(*v)[2] + gMat[1][3];
	const float rawW = gMat[3][0]*(*v)[0] + gMat[3][1]*(*v)[1] + gMat[3][2]*(*v)[2] + gMat[3][3];

	// one divide, then two multiplies, instead of two divides
	const float invW = 1.0f/rawW;

	vert->oow = invW;
	vert->x   = rawX*invW;
	vert->y   = rawY*invW;

	++gNumXformedVerts;
}
#endif // USE_ASM
