diff options
Diffstat (limited to 'src/asm')
-rw-r--r-- | src/asm/ftola.asm | 23 | ||||
-rw-r--r-- | src/asm/ftola.c | 13 | ||||
-rw-r--r-- | src/asm/snapvector.asm | 20 | ||||
-rw-r--r-- | src/asm/snapvector.c | 29 |
4 files changed, 30 insertions, 55 deletions
diff --git a/src/asm/ftola.asm b/src/asm/ftola.asm index 370c12d8..eea9d0e3 100644 --- a/src/asm/ftola.asm +++ b/src/asm/ftola.asm @@ -25,11 +25,11 @@ IFNDEF idx64 .model flat, c ENDIF -; .data +.data -; ifndef idx64 -; fpucw WORD 0F7Fh -; endif +ifndef idx64 + fpucw WORD 0F7Fh +endif .code @@ -51,22 +51,19 @@ ELSE ; qftol using FPU qftolx87m macro src -; not necessary, fpucw is set with _controlfp at startup -; sub esp, 2 -; fnstcw word ptr [esp] -; fldcw fpucw + sub esp, 2 + fnstcw word ptr [esp] + fldcw fpucw fld dword ptr src fistp dword ptr src -; fldcw [esp] + fldcw [esp] mov eax, src -; add esp, 2 + add esp, 2 ret endm qftolx87 PROC -; need this line when storing FPU control word on stack -; qftolx87m [esp + 6] - qftolx87m [esp + 4] + qftolx87m [esp + 6] qftolx87 ENDP qvmftolx87 PROC diff --git a/src/asm/ftola.c b/src/asm/ftola.c index ad197836..a2a4eaa0 100644 --- a/src/asm/ftola.c +++ b/src/asm/ftola.c @@ -22,6 +22,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include "qasm-inline.h" +static const unsigned short fpucw = 0x0C7F; + /* * GNU inline asm ftol conversion functions using SSE or FPU */ @@ -59,14 +61,18 @@ int qvmftolsse(void) long qftolx87(float f) { long retval; + unsigned short oldcw; __asm__ volatile ( + "fnstcw %2\n" + "fldcw %3\n" "flds %1\n" "fistpl %1\n" + "fldcw %2\n" "mov %1, %0\n" : "=r" (retval) - : "m" (f) + : "m" (f), "m" (oldcw), "m" (fpucw) ); return retval; @@ -75,13 +81,18 @@ long qftolx87(float f) int qvmftolx87(void) { int retval; + unsigned short oldcw; __asm__ volatile ( + "fnstcw %1\n" + "fldcw %2\n" "flds (" EDI ", " EBX ", 4)\n" "fistpl (" EDI ", " EBX ", 4)\n" + "fldcw %2\n" "mov (" EDI ", " EBX ", 4), %0\n" : "=r" (retval) + : "m" (oldcw), "m" (fpucw) ); return retval; diff --git a/src/asm/snapvector.asm b/src/asm/snapvector.asm index 000777b4..22f9b225 100644 --- a/src/asm/snapvector.asm +++ b/src/asm/snapvector.asm @@ -45,9 +45,6 @@ IFDEF idx64 qsnapvectorsse PROC sub rsp, 8 - stmxcsr [rsp] ; save SSE control word - ldmxcsr ssecw ; set to round nearest - movaps xmm1, ssemask ; initialize the mask register movups xmm0, [rcx] ; here is stored our vector. Read 4 values in one go movaps xmm2, xmm0 ; keep a copy of the original data @@ -57,20 +54,13 @@ IFDEF idx64 cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp orps xmm0, xmm1 ; combine all 4 values again movups [rcx], xmm0 ; write 3 rounded and 1 unchanged values back to memory - - ldmxcsr [rsp] ; restore sse control word to old value - add rsp, 8 ret qsnapvectorsse ENDP ELSE qsnapvectorsse PROC - sub esp, 8 - stmxcsr [esp] ; save SSE control word - ldmxcsr ssecw ; set to round nearest - - mov eax, dword ptr 12[esp] ; store address of vector in eax + mov eax, dword ptr 4[esp] ; store address of vector in eax movaps xmm1, ssemask ; initialize the mask register for maskmovdqu movups xmm0, [eax] ; here is stored our vector. Read 4 values in one go movaps xmm2, xmm0 ; keep a copy of the original data @@ -80,9 +70,6 @@ ELSE cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp orps xmm0, xmm1 ; combine all 4 values again movups [eax], xmm0 ; write 3 rounded and 1 unchanged values back to memory - - ldmxcsr [esp] ; restore sse control word to old value - add esp, 8 ret qsnapvectorsse ENDP @@ -95,14 +82,9 @@ ELSE qsnapvectorx87 PROC mov eax, dword ptr 4[esp] - sub esp, 2 - fnstcw word ptr [esp] - fldcw fpucw qroundx87 [eax] qroundx87 4[eax] qroundx87 8[eax] - fldcw [esp] - add esp, 2 ret qsnapvectorx87 ENDP diff --git a/src/asm/snapvector.c b/src/asm/snapvector.c index 121c110e..a70a1069 100644 --- a/src/asm/snapvector.c +++ b/src/asm/snapvector.c @@ -33,31 +33,21 @@ static unsigned char ssemask[16] __attribute__((aligned(16))) = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x00\x00\x00\x00" }; -static const unsigned int ssecw __attribute__((aligned(16))) = 0x00001F80; -static const unsigned short fpucw = 0x037F; - void qsnapvectorsse(vec3_t vec) { - uint32_t oldcw __attribute__((aligned(16))); - __asm__ volatile ( - "stmxcsr %3\n" - "ldmxcsr %1\n" - "movaps (%0), %%xmm1\n" - "movups (%2), %%xmm0\n" + "movups (%1), %%xmm0\n" "movaps %%xmm0, %%xmm2\n" "andps %%xmm1, %%xmm0\n" "andnps %%xmm2, %%xmm1\n" "cvtps2dq %%xmm0, %%xmm0\n" "cvtdq2ps %%xmm0, %%xmm0\n" "orps %%xmm1, %%xmm0\n" - "movups %%xmm0, (%2)\n" - - "ldmxcsr %3\n" + "movups %%xmm0, (%1)\n" : - : "r" (ssemask), "m" (ssecw), "r" (vec), "m" (oldcw) + : "r" (ssemask), "r" (vec) : "memory", "%xmm0", "%xmm1", "%xmm2" ); @@ -73,16 +63,11 @@ void qsnapvectorx87(vec3_t vec) { __asm__ volatile ( - "sub $2, " ESP "\n" - "fnstcw (" ESP ")\n" - "fldcw %0\n" - QROUNDX87("(%1)") - QROUNDX87("4(%1)") - QROUNDX87("8(%1)") - "fldcw (" ESP ")\n" - "add $2, " ESP "\n" + QROUNDX87("(%0)") + QROUNDX87("4(%0)") + QROUNDX87("8(%0)") : - : "m" (fpucw), "r" (vec) + : "r" (vec) : "memory" ); } |