summaryrefslogtreecommitdiff
path: root/src/asm/snapvector.asm
diff options
context:
space:
mode:
Diffstat (limited to 'src/asm/snapvector.asm')
-rw-r--r-- src/asm/snapvector.asm 27
1 files changed, 15 insertions, 12 deletions
diff --git a/src/asm/snapvector.asm b/src/asm/snapvector.asm
index acfb7930..a953778a 100644
--- a/src/asm/snapvector.asm
+++ b/src/asm/snapvector.asm
@@ -48,14 +48,15 @@ IFDEF idx64
stmxcsr [rsp] ; save SSE control word
ldmxcsr ssecw ; set to round nearest
- push rdi
- mov rdi, rcx ; maskmovdqu uses rdi as implicit memory operand
- movaps xmm1, ssemask ; initialize the mask register for maskmovdqu
- movups xmm0, [rdi] ; here is stored our vector. Read 4 values in one go
+ movaps xmm1, ssemask ; initialize the mask register
+ movups xmm0, [rcx] ; here is stored our vector. Read 4 values in one go
+ movaps xmm2, xmm0 ; keep a copy of the original data
+ andps xmm0, xmm1 ; set the fourth value to zero in xmm0
+ andnps xmm1, xmm2 ; copy fourth value to xmm1 and set rest to zero
cvtps2dq xmm0, xmm0 ; convert 4 single fp to int
cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp
- maskmovdqu xmm0, xmm1 ; write 3 values back to memory
- pop rdi
+ orps xmm0, xmm1 ; combine all 4 values again
+ movups [rcx], xmm0 ; write 3 rounded and 1 unchanged values back to memory
ldmxcsr [rsp] ; restore sse control word to old value
add rsp, 8
@@ -69,14 +70,16 @@ ELSE
stmxcsr [esp] ; save SSE control word
ldmxcsr ssecw ; set to round nearest
- push edi
- mov edi, dword ptr 16[esp] ; maskmovdqu uses edi as implicit memory operand
- movaps xmm1, ssemask ; initialize the mask register for maskmovdqu
- movups xmm0, [edi] ; here is stored our vector. Read 4 values in one go
+ mov eax, dword ptr 16[esp] ; store address of vector in eax
+ movaps xmm1, ssemask ; initialize the mask register
+ movups xmm0, [eax] ; here is stored our vector. Read 4 values in one go
+ movaps xmm2, xmm0 ; keep a copy of the original data
+ andps xmm0, xmm1 ; set the fourth value to zero in xmm0
+ andnps xmm1, xmm2 ; copy fourth value to xmm1 and set rest to zero
cvtps2dq xmm0, xmm0 ; convert 4 single fp to int
cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp
- maskmovdqu xmm0, xmm1 ; write 3 values back to memory
- pop edi
+ orps xmm0, xmm1 ; combine all 4 values again
+ movups [eax], xmm0 ; write 3 rounded and 1 unchanged values back to memory
ldmxcsr [esp] ; restore sse control word to old value
add esp, 8