diff --git a/dlib/math/sse.d b/dlib/math/sse.d
index 9eb65dc1..0385a557 100644
--- a/dlib/math/sse.d
+++ b/dlib/math/sse.d
@@ -44,202 +44,205 @@ import dlib.math.matrix;
 version(GNU)
 {
     pragma(inline, true);
-
-    /// Vector addition
-    Vector4f sseAdd4(Vector4f a, Vector4f b)
-    {
-        asm {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "addps %%xmm1, %%xmm0 \n" ~ // Add xmm1 to xmm0
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1"; // Clobbered registers
-        }
-
-        return a;
-    }
-
-    /// Vector subtraction for GNU D Compiler (using AVX)
-    Vector4f sseSub4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "subps %%xmm1, %%xmm0 \n" ~ // Subtract xmm1 from xmm0
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1"; // Clobbered registers
-        }
-
-        return a;
-    }
-
-    /// Vector multiplication for GNU D Compiler (using AVX)
-    Vector4f sseMul4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1"; // Clobbered registers
-        }
-
-        return a;
-    }
-
-    /// Vector division for GNU D Compiler (using AVX)
-    Vector4f sseDiv4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "divps %%xmm1, %%xmm0 \n" ~ // Divide xmm0 by xmm1
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1"; // Clobbered registers
-        }
-
-        return a;
-    }
-
-    /// Vector dot product for GNU D Compiler (using SSE)
-    float sseDot4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
-
-            // Horizontal addition
-            "movhlps %%xmm0, %%xmm1 \n" ~// Copy the high 64 bits to the low 64 bits of xmm1
-            "addps %%xmm1, %%xmm0 \n" ~ // Add xmm1 to xmm0
-
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1"; // Clobbered registers
-        }
-
-        return a[0];
-    }
-
-    /// Vector cross product for GNU D Compiler (using SSE)
-    Vector4f sseCross3(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "movaps %%xmm0, %%xmm2 \n" ~ // Copy xmm0 to xmm2
-            "movaps %%xmm1, %%xmm3 \n" ~ // Copy xmm1 to xmm3
-
-            "shufps $0xC9, %%xmm0, %%xmm0 \n" ~ // Shuffle xmm0 according to 0xC9
-            "shufps $0xD2, %%xmm1, %%xmm1 \n" ~ // Shuffle xmm1 according to 0xD2
-            "shufps $0xD2, %%xmm2, %%xmm2 \n" ~ // Shuffle xmm2 according to 0xD2
-            "shufps $0xC9, %%xmm3, %%xmm3 \n" ~ // Shuffle xmm3 according to 0xC9
-
-            "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
-            "mulps %%xmm3, %%xmm2 \n" ~ // Multiply xmm2 by xmm3
-
-            "subps %%xmm2, %%xmm0 \n" ~ // Subtract xmm2 from xmm0
-
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1", "%xmm2", "%xmm3"; // Clobbered registers
-        }
-
-        return a;
-    }
-
-    /// Matrix multiplication for GNU D Compiler (using SSE)
-    Matrix4x4f sseMulMat4(Matrix4x4f a, Matrix4x4f b)
-    {
-        Matrix4x4f r;
-        Vector4f a_line, b_line, r_line;
-        float _b;
-        uint i, j;
-        Vector4f* _rp;
-
-        for (i = 0; i < 16; i += 4)
-        {
-            a_line = *cast(Vector4f*)(a.arrayof.ptr);
-            _b = *(b.arrayof.ptr + i);
-            asm {
-                "movups %[a_line], %%xmm0 \n" ~ // Load vector a_line into xmm0
-
-                "mov %[_b], %%eax \n" ~ // Move _b into the EAX register
-                "movd %%eax, %%xmm1 \n" ~ // Move EAX into xmm1
-
-                "shufps $0, %%xmm1, %%xmm1 \n" ~ // Shuffle xmm1 according to 0
-
-                "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
-                "movups %%xmm0, %[r_line]" // Store the result in r_line
-
-                : [r_line] "=m" (r_line) // Output operand r_line, constrained to memory
-                : [a_line] "m" (a_line), [_b] "r" (_b) // Input operands a_line and _b, constrained to memory and register
-                : "%xmm0", "%xmm1", "%eax"; // Clobbered registers
-            }
-
-            for (j = 1; j < 4; j++)
-            {
-                a_line = *cast(Vector4f*)(a.arrayof.ptr + j * 4);
-                _b = *(b.arrayof.ptr + i + j);
-
-                asm
-                {
-                    "movups %[a_line], %%xmm0 \n" ~ // Load vector a_line into xmm0
-
-                    "mov %[_b], %%eax \n" ~ // Move _b into the EAX register
-                    "movd %%eax, %%xmm1 \n" ~ // Move EAX into xmm1
-                    "shufps $0, %%xmm1, %%xmm1 \n" ~ // Shuffle xmm1 according to 0
-
-                    "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
-
-                    "movups %[r_line], %%xmm2 \n" ~ // Load r_line into xmm2
-                    "addps %%xmm2, %%xmm0 \n" ~ // Add xmm2 to xmm0
-
-                    "movups %%xmm0, %[r_line]" // Store the result back in r_line
-                    : [r_line] "=m" (r_line) // Output and input operands
-                    : [a_line] "m" (a_line), [_b] "r" (_b) // Input operand b, constrained to memory
-                    : "%xmm0", "%xmm1", "%xmm2", "%eax"; // Clobbered registers
-                }
-            }
-
-            _rp = cast(Vector4f*)(r.arrayof.ptr + i);
-
-            version(X86) asm
-            {
-                "mov %[_rp], %%eax \n" ~ // Move _rp into the EAX register
-                "movups %%xmm0,(%%eax)" // Move xmm0 to the memory location pointed by EAX
-                : [_rp] "+r" (_rp) // Output and input operands
-                : // No additional input operands
-                : "%eax", "%xmm0"; // Clobbered registers
-            }
-            version(X86_64) asm
-            {
-                "mov %[_rp], %%rax \n" ~ // Move _rp into the RAX register
-                "movups %%xmm0, (%%rax)" // Move xmm0 to the memory location pointed by RAX
-                : [_rp] "+r" (_rp) // Output and input operands
-                : // No additional input operands
-                : "%rax", "%xmm0"; // Clobbered registers
-            }
-        }
-
-        return r;
-    }
+
+    version(X86_Any) {
+
+        /// Vector addition
+        Vector4f sseAdd4(Vector4f a, Vector4f b)
+        {
+            asm {
+                "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+                "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+                "addps %%xmm1, %%xmm0 \n" ~ // Add xmm1 to xmm0
+                "movups %%xmm0, %[a] \n" // Store the result back in vector a
+                : [a] "+m" (a) // Output operand a, constrained to memory
+                : [b] "m" (b) // Input operand b, constrained to memory
+                : "%xmm0", "%xmm1"; // Clobbered registers
+            }
+
+            return a;
+        }
+
+        /// Vector subtraction for GNU D Compiler (using SSE)
+        Vector4f sseSub4(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+                "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+                "subps %%xmm1, %%xmm0 \n" ~ // Subtract xmm1 from xmm0
+                "movups %%xmm0, %[a] \n" // Store the result back in vector a
+                : [a] "+m" (a) // Output operand a, constrained to memory
+                : [b] "m" (b) // Input operand b, constrained to memory
+                : "%xmm0", "%xmm1"; // Clobbered registers
+            }
+
+            return a;
+        }
+
+        /// Vector multiplication for GNU D Compiler (using SSE)
+        Vector4f sseMul4(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+                "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+                "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
+                "movups %%xmm0, %[a] \n" // Store the result back in vector a
+                : [a] "+m" (a) // Output operand a, constrained to memory
+                : [b] "m" (b) // Input operand b, constrained to memory
+                : "%xmm0", "%xmm1"; // Clobbered registers
+            }
+
+            return a;
+        }
+
+        /// Vector division for GNU D Compiler (using SSE)
+        Vector4f sseDiv4(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+                "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+                "divps %%xmm1, %%xmm0 \n" ~ // Divide xmm0 by xmm1
+                "movups %%xmm0, %[a] \n" // Store the result back in vector a
+                : [a] "+m" (a) // Output operand a, constrained to memory
+                : [b] "m" (b) // Input operand b, constrained to memory
+                : "%xmm0", "%xmm1"; // Clobbered registers
+            }
+
+            return a;
+        }
+
+        /// Vector dot product for GNU D Compiler (using SSE)
+        float sseDot4(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+                "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+                "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
+
+                // Horizontal addition
+                "movhlps %%xmm0, %%xmm1 \n" ~ // Copy the high 64 bits of xmm0 to the low 64 bits of xmm1
+                "addps %%xmm1, %%xmm0 \n" ~ // xmm0[0] = p0 + p2, xmm0[1] = p1 + p3
+                "movaps %%xmm0, %%xmm1 \n" ~ // Copy xmm0 to xmm1
+                "shufps $0x55, %%xmm1, %%xmm1 \n" ~ // Broadcast element 1 (p1 + p3) across xmm1
+                "addps %%xmm1, %%xmm0 \n" ~ // Element 0 now holds p0 + p1 + p2 + p3
+
+                "movups %%xmm0, %[a] \n" // Store the result back in vector a
+                : [a] "+m" (a) // Output operand a, constrained to memory
+                : [b] "m" (b) // Input operand b, constrained to memory
+                : "%xmm0", "%xmm1"; // Clobbered registers
+            }
+
+            return a[0];
+        }
+
+        /// Vector cross product for GNU D Compiler (using SSE)
+        Vector4f sseCross3(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+                "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+                "movaps %%xmm0, %%xmm2 \n" ~ // Copy xmm0 to xmm2
+                "movaps %%xmm1, %%xmm3 \n" ~ // Copy xmm1 to xmm3
+
+                "shufps $0xC9, %%xmm0, %%xmm0 \n" ~ // Rotate a to (y, z, x, w) (mask 0xC9)
+                "shufps $0xD2, %%xmm1, %%xmm1 \n" ~ // Rotate b to (z, x, y, w) (mask 0xD2)
+                "shufps $0xD2, %%xmm2, %%xmm2 \n" ~ // Rotate the copy of a to (z, x, y, w)
+                "shufps $0xC9, %%xmm3, %%xmm3 \n" ~ // Rotate the copy of b to (y, z, x, w)
+
+                "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
+                "mulps %%xmm3, %%xmm2 \n" ~ // Multiply xmm2 by xmm3
+
+                "subps %%xmm2, %%xmm0 \n" ~ // Subtract xmm2 from xmm0
+
+                "movups %%xmm0, %[a] \n" // Store the result back in vector a
+                : [a] "+m" (a) // Output operand a, constrained to memory
+                : [b] "m" (b) // Input operand b, constrained to memory
+                : "%xmm0", "%xmm1", "%xmm2", "%xmm3"; // Clobbered registers
+            }
+
+            return a;
+        }
+
+        /// Matrix multiplication for GNU D Compiler (using SSE)
+        Matrix4x4f sseMulMat4(Matrix4x4f a, Matrix4x4f b)
+        {
+            Matrix4x4f r;
+            Vector4f a_line, b_line, r_line;
+            float _b;
+            uint i, j;
+            Vector4f* _rp;
+
+            for (i = 0; i < 16; i += 4)
+            {
+                a_line = *cast(Vector4f*)(a.arrayof.ptr);
+                _b = *(b.arrayof.ptr + i);
+
+                asm {
+                    "movups %[a_line], %%xmm0 \n" ~ // Load vector a_line into xmm0
+
+                    "mov %[_b], %%eax \n" ~ // Move _b into the EAX register
+                    "movd %%eax, %%xmm1 \n" ~ // Move EAX into xmm1
+
+                    "shufps $0, %%xmm1, %%xmm1 \n" ~ // Broadcast _b across xmm1
+
+                    "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
+                    "movups %%xmm0, %[r_line]" // Store the result in r_line
+
+                    : [r_line] "=m" (r_line) // Output operand r_line, constrained to memory
+                    : [a_line] "m" (a_line), [_b] "r" (_b) // Input operands a_line and _b, constrained to memory and register
+                    : "%xmm0", "%xmm1", "%eax"; // Clobbered registers
+                }
+
+                for (j = 1; j < 4; j++)
+                {
+                    a_line = *cast(Vector4f*)(a.arrayof.ptr + j * 4);
+                    _b = *(b.arrayof.ptr + i + j);
+
+                    asm
+                    {
+                        "movups %[a_line], %%xmm0 \n" ~ // Load vector a_line into xmm0
+
+                        "mov %[_b], %%eax \n" ~ // Move _b into the EAX register
+                        "movd %%eax, %%xmm1 \n" ~ // Move EAX into xmm1
+                        "shufps $0, %%xmm1, %%xmm1 \n" ~ // Broadcast _b across xmm1
+
+                        "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
+
+                        "movups %[r_line], %%xmm2 \n" ~ // Load r_line into xmm2
+                        "addps %%xmm2, %%xmm0 \n" ~ // Add xmm2 to xmm0
+
+                        "movups %%xmm0, %[r_line]" // Store the result back in r_line
+                        : [r_line] "+m" (r_line) // Output operand r_line, read and written, constrained to memory
+                        : [a_line] "m" (a_line), [_b] "r" (_b) // Input operands a_line and _b, constrained to memory and register
+                        : "%xmm0", "%xmm1", "%xmm2", "%eax"; // Clobbered registers
+                    }
+                }
+
+                _rp = cast(Vector4f*)(r.arrayof.ptr + i);
+
+                version(X86) asm
+                {
+                    "movups %[r_line], %%xmm0 \n" ~ // Reload r_line; xmm0 is not guaranteed to survive between asm blocks
+                    "mov %[_rp], %%eax \n" ~ // Move _rp into the EAX register
+                    "movups %%xmm0, (%%eax)" // Store xmm0 to the memory location pointed to by EAX
+                    : [_rp] "+r" (_rp) // Output and input operand
+                    : [r_line] "m" (r_line) // Input operand r_line, constrained to memory
+                    : "%eax", "%xmm0"; // Clobbered registers
+                }
+                version(X86_64) asm
+                {
+                    "movups %[r_line], %%xmm0 \n" ~ // Reload r_line; xmm0 is not guaranteed to survive between asm blocks
+                    "mov %[_rp], %%rax \n" ~ // Move _rp into the RAX register
+                    "movups %%xmm0, (%%rax)" // Store xmm0 to the memory location pointed to by RAX
+                    : [_rp] "+r" (_rp) // Output and input operand
+                    : [r_line] "m" (r_line) // Input operand r_line, constrained to memory
+                    : "%rax", "%xmm0"; // Clobbered registers
+                }
+            }
+
+            return r;
+        }
+    }
 }
@@ -247,169 +250,172 @@
 version(DMD)
 {
     pragma(inline, true):
-    /// Vector addition
-    Vector4f sseAdd4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            movups XMM0, a;
-            movups XMM1, b;
-            addps XMM0, XMM1;
-            movups a, XMM0;
-        }
-
-        return a;
-    }
-
-    /// Vector subtraction
-    Vector4f sseSub4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            movups XMM0, a;
-            movups XMM1, b;
-            subps XMM0, XMM1;
-            movups a, XMM0;
-        }
-
-        return a;
-    }
-
-    /// Vector multiplication
-    Vector4f sseMul4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            movups XMM0, a;
-            movups XMM1, b;
-            mulps XMM0, XMM1;
-            movups a, XMM0;
-        }
-
-        return a;
-    }
-
-    /// Vector division
-    Vector4f sseDiv4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            movups XMM0, a;
-            movups XMM1, b;
-            divps XMM0, XMM1;
-            movups a, XMM0;
-        }
-
-        return a;
-    }
-
-    /// Vector dot product
-    float sseDot4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            movups XMM0, a;
-            movups XMM1, b;
-            mulps XMM0, XMM1;
-
-            // Horizontal addition
-            movhlps XMM1, XMM0;
-            addps XMM0, XMM1;
-            movups XMM1, XMM0;
-            shufps XMM1, XMM1, 0x55;
-            addps XMM0, XMM1;
-
-            movups a, XMM0;
-        }
-
-        return a[0];
-    }
-
-    /// Vector cross product
-    Vector4f sseCross3(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            movups XMM0, a;
-            movups XMM1, b;
-            movaps XMM2, XMM0;
-            movaps XMM3, XMM1;
-
-            shufps XMM0, XMM0, 0xC9;
-            shufps XMM1, XMM1, 0xD2;
-            shufps XMM2, XMM2, 0xD2;
-            shufps XMM3, XMM3, 0xC9;
-
-            mulps XMM0, XMM1;
-            mulps XMM2, XMM3;
-
-            subps XMM0, XMM2;
-
-            movups a, XMM0;
-        }
-
-        return a;
-    }
-
-    /// Matrix multiplication
-    Matrix4x4f sseMulMat4(Matrix4x4f a, Matrix4x4f b)
-    {
-        Matrix4x4f r;
-        Vector4f a_line, b_line, r_line;
-        float _b;
-        uint i, j;
-        Vector4f* _rp;
-        for (i = 0; i < 16; i += 4)
-        {
-            a_line = *cast(Vector4f*)(a.arrayof.ptr);
-            _b = *(b.arrayof.ptr + i);
-            asm
-            {
-                movups XMM0, a_line;
-
-                mov EAX, _b;
-                movd XMM1, EAX;
-
-                shufps XMM1, XMM1, 0;
-
-                mulps XMM0, XMM1;
-                movups r_line, XMM0;
-            }
-
-            for (j = 1; j < 4; j++)
-            {
-                a_line = *cast(Vector4f*)(a.arrayof.ptr + j * 4);
-                _b = *(b.arrayof.ptr + i + j);
-                asm
-                {
-                    movups XMM0, a_line;
-
-                    mov EAX, _b;
-                    movd XMM1, EAX;
-                    shufps XMM1, XMM1, 0;
-
-                    mulps XMM0, XMM1;
-
-                    movups XMM2, r_line;
-                    addps XMM0, XMM2;
-
-                    movups r_line, XMM0;
-                }
-            }
-
-            _rp = cast(Vector4f*)(r.arrayof.ptr + i);
-            version(X86) asm
-            {
-                mov EAX, _rp;
-                movups [EAX], XMM0;
-            }
-            version(X86_64) asm
-            {
-                mov RAX, _rp;
-                movups [RAX], XMM0;
-            }
-        }
-
-        return r;
-    }
+    version(X86_Any) {
+
+        /// Vector addition
+        Vector4f sseAdd4(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                movups XMM0, a;
+                movups XMM1, b;
+                addps XMM0, XMM1;
+                movups a, XMM0;
+            }
+
+            return a;
+        }
+
+        /// Vector subtraction
+        Vector4f sseSub4(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                movups XMM0, a;
+                movups XMM1, b;
+                subps XMM0, XMM1;
+                movups a, XMM0;
+            }
+
+            return a;
+        }
+
+        /// Vector multiplication
+        Vector4f sseMul4(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                movups XMM0, a;
+                movups XMM1, b;
+                mulps XMM0, XMM1;
+                movups a, XMM0;
+            }
+
+            return a;
+        }
+
+        /// Vector division
+        Vector4f sseDiv4(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                movups XMM0, a;
+                movups XMM1, b;
+                divps XMM0, XMM1;
+                movups a, XMM0;
+            }
+
+            return a;
+        }
+
+        /// Vector dot product
+        float sseDot4(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                movups XMM0, a;
+                movups XMM1, b;
+                mulps XMM0, XMM1;
+
+                // Horizontal addition
+                movhlps XMM1, XMM0;      // XMM1 low = high half of XMM0
+                addps XMM0, XMM1;        // XMM0[0] = p0 + p2, XMM0[1] = p1 + p3
+                movups XMM1, XMM0;
+                shufps XMM1, XMM1, 0x55; // Broadcast element 1 (p1 + p3)
+                addps XMM0, XMM1;        // Element 0 now holds the full sum
+
+                movups a, XMM0;
+            }
+
+            return a[0];
+        }
+
+        /// Vector cross product
+        Vector4f sseCross3(Vector4f a, Vector4f b)
+        {
+            asm
+            {
+                movups XMM0, a;
+                movups XMM1, b;
+                movaps XMM2, XMM0;
+                movaps XMM3, XMM1;
+
+                shufps XMM0, XMM0, 0xC9; // Rotate a to (y, z, x, w)
+                shufps XMM1, XMM1, 0xD2; // Rotate b to (z, x, y, w)
+                shufps XMM2, XMM2, 0xD2; // Rotate the copy of a to (z, x, y, w)
+                shufps XMM3, XMM3, 0xC9; // Rotate the copy of b to (y, z, x, w)
+
+                mulps XMM0, XMM1;
+                mulps XMM2, XMM3;
+
+                subps XMM0, XMM2;
+
+                movups a, XMM0;
+            }
+
+            return a;
+        }
+
+        /// Matrix multiplication
+        Matrix4x4f sseMulMat4(Matrix4x4f a, Matrix4x4f b)
+        {
+            Matrix4x4f r;
+            Vector4f a_line, b_line, r_line;
+            float _b;
+            uint i, j;
+            Vector4f* _rp;
+            for (i = 0; i < 16; i += 4)
+            {
+                a_line = *cast(Vector4f*)(a.arrayof.ptr);
+                _b = *(b.arrayof.ptr + i);
+                asm
+                {
+                    movups XMM0, a_line;
+
+                    mov EAX, _b;
+                    movd XMM1, EAX;
+
+                    shufps XMM1, XMM1, 0; // Broadcast _b across XMM1
+
+                    mulps XMM0, XMM1;
+                    movups r_line, XMM0;
+                }
+
+                for (j = 1; j < 4; j++)
+                {
+                    a_line = *cast(Vector4f*)(a.arrayof.ptr + j * 4);
+                    _b = *(b.arrayof.ptr + i + j);
+                    asm
+                    {
+                        movups XMM0, a_line;
+
+                        mov EAX, _b;
+                        movd XMM1, EAX;
+                        shufps XMM1, XMM1, 0; // Broadcast _b across XMM1
+
+                        mulps XMM0, XMM1;
+
+                        movups XMM2, r_line;
+                        addps XMM0, XMM2;
+
+                        movups r_line, XMM0;
+                    }
+                }
+
+                _rp = cast(Vector4f*)(r.arrayof.ptr + i);
+                version(X86) asm
+                {
+                    movups XMM0, r_line; // Reload r_line; XMM0 is not guaranteed to survive between asm blocks
+                    mov EAX, _rp;
+                    movups [EAX], XMM0;
+                }
+                version(X86_64) asm
+                {
+                    movups XMM0, r_line; // Reload r_line; XMM0 is not guaranteed to survive between asm blocks
+                    mov RAX, _rp;
+                    movups [RAX], XMM0;
+                }
+            }
+
+            return r;
+        }
+    }
 }
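
For reference, a minimal unittest sketch of the expected behaviour of these wrappers. It assumes Vector4f's component constructor and element-wise equality from dlib.math.vector; it is illustrative only and not part of the patch:

unittest
{
    import dlib.math.vector;
    import dlib.math.sse;

    Vector4f a = Vector4f(1.0f, 2.0f, 3.0f, 0.0f);
    Vector4f b = Vector4f(4.0f, 5.0f, 6.0f, 0.0f);

    assert(sseAdd4(a, b) == Vector4f(5.0f, 7.0f, 9.0f, 0.0f));
    assert(sseSub4(a, b) == Vector4f(-3.0f, -3.0f, -3.0f, 0.0f));
    assert(sseDot4(a, b) == 32.0f); // 1*4 + 2*5 + 3*6
    // Cross product of the xyz parts: (2*6 - 3*5, 3*4 - 1*6, 1*5 - 2*4)
    assert(sseCross3(a, b) == Vector4f(-3.0f, 6.0f, -3.0f, 0.0f));
}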