diff options
Diffstat (limited to 'src/asm')
-rw-r--r-- | src/asm/ftola.c | 24 | ||||
-rw-r--r-- | src/asm/snapvector.asm | 10 | ||||
-rw-r--r-- | src/asm/snapvector.c | 43 |
3 files changed, 45 insertions, 32 deletions
diff --git a/src/asm/ftola.c b/src/asm/ftola.c index e0298e8e..ad197836 100644 --- a/src/asm/ftola.c +++ b/src/asm/ftola.c @@ -28,7 +28,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA long qftolsse(float f) { - register long retval; + long retval; __asm__ volatile ( @@ -40,21 +40,25 @@ long qftolsse(float f) return retval; } -void qvmftolsse(void) +int qvmftolsse(void) { + int retval; + __asm__ volatile ( "movss (" EDI ", " EBX ", 4), %%xmm0\n" - "cvttss2si %%xmm0, " EAX "\n" - : + "cvttss2si %%xmm0, %0\n" + : "=r" (retval) : : "%xmm0" ); + + return retval; } long qftolx87(float f) { - register long retval; + long retval; __asm__ volatile ( @@ -68,13 +72,17 @@ long qftolx87(float f) return retval; } -void qvmftolx87(void) +int qvmftolx87(void) { + int retval; + __asm__ volatile ( "flds (" EDI ", " EBX ", 4)\n" "fistpl (" EDI ", " EBX ", 4)\n" - "mov (" EDI ", " EBX ", 4), " EAX "\n" - : + "mov (" EDI ", " EBX ", 4), %0\n" + : "=r" (retval) ); + + return retval; } diff --git a/src/asm/snapvector.asm b/src/asm/snapvector.asm index 87c77372..eca40fe1 100644 --- a/src/asm/snapvector.asm +++ b/src/asm/snapvector.asm @@ -44,7 +44,7 @@ IFDEF idx64 ; qsnapvector using SSE qsnapvectorsse PROC - sub rsp, 4 + sub rsp, 8 stmxcsr [rsp] ; save SSE control word ldmxcsr ssecw ; set to round nearest @@ -58,19 +58,19 @@ IFDEF idx64 pop rdi ldmxcsr [rsp] ; restore sse control word to old value - add rsp, 4 + add rsp, 8 ret qsnapvectorsse ENDP ELSE qsnapvectorsse PROC - sub esp, 4 + sub esp, 8 stmxcsr [esp] ; save SSE control word ldmxcsr ssecw ; set to round nearest push edi - mov edi, dword ptr 12[esp] ; maskmovdqu uses edi as implicit memory operand + mov edi, dword ptr 16[esp] ; maskmovdqu uses edi as implicit memory operand movaps xmm1, ssemask ; initialize the mask register for maskmovdqu movups xmm0, [edi] ; here is stored our vector. Read 4 values in one go cvtps2dq xmm0, xmm0 ; convert 4 single fp to int @@ -79,7 +79,7 @@ ELSE pop edi ldmxcsr [esp] ; restore sse control word to old value - add esp, 4 + add esp, 8 ret qsnapvectorsse ENDP diff --git a/src/asm/snapvector.c b/src/asm/snapvector.c index 402b3925..8e9b2868 100644 --- a/src/asm/snapvector.c +++ b/src/asm/snapvector.c @@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA /* * GNU inline asm version of qsnapvector + * See MASM snapvector.asm for commentary */ static unsigned char ssemask[16] __attribute__((aligned(16))) = @@ -32,29 +33,33 @@ static unsigned char ssemask[16] __attribute__((aligned(16))) = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x00\x00\x00\x00" }; -static unsigned int ssecw __attribute__((aligned(16))) = 0x00001F80; -static unsigned short fpucw = 0x037F; +static const unsigned int ssecw __attribute__((aligned(16))) = 0x00001F80; +static const unsigned short fpucw = 0x037F; void qsnapvectorsse(vec3_t vec) { + uint32_t oldcw __attribute__((aligned(16))); + __asm__ volatile ( - "sub $4, " ESP "\n" - "stmxcsr (" ESP ")\n" + "stmxcsr %3\n" "ldmxcsr %1\n" "movaps (%0), %%xmm1\n" - "movups (" EDI "), %%xmm0\n" + "movups (%2), %%xmm0\n" "cvtps2dq %%xmm0, %%xmm0\n" "cvtdq2ps %%xmm0, %%xmm0\n" + // vec MUST reside in register rdi as maskmovdqu uses + // it as an implicit operand. The "D" constraint makes + // sure of that. "maskmovdqu %%xmm1, %%xmm0\n" - "ldmxcsr (" ESP ")\n" - "add $4, " ESP "\n" + "ldmxcsr %3\n" : - : "r" (ssemask), "m" (ssecw), "D" (vec) + : "r" (ssemask), "m" (ssecw), "D" (vec), "m" (oldcw) : "memory", "%xmm0", "%xmm1" ); + } #define QROUNDX87(src) \ @@ -67,16 +72,16 @@ void qsnapvectorx87(vec3_t vec) { __asm__ volatile ( - "sub $2, " ESP "\n" - "fnstcw (" ESP ")\n" - "fldcw %0\n" - QROUNDX87("(%1)") - QROUNDX87("4(%1)") - QROUNDX87("8(%1)") - "fldcw (" ESP ")\n" - "add $2, " ESP "\n" - : - : "m" (fpucw), "r" (vec) - : "memory" + "sub $2, " ESP "\n" + "fnstcw (" ESP ")\n" + "fldcw %0\n" + QROUNDX87("(%1)") + QROUNDX87("4(%1)") + QROUNDX87("8(%1)") + "fldcw (" ESP ")\n" + "add $2, " ESP "\n" + : + : "m" (fpucw), "r" (vec) + : "memory" ); } |