From 455147ee95e8de7af237fc66aba9bbd52474ec3b Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 23 Aug 2015 21:46:36 -0300
Subject: Shader JIT: Fix CMP NaN behavior to match hardware

---
 src/video_core/shader/shader_jit_x64.cpp | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

(limited to 'src/video_core/shader/shader_jit_x64.cpp')

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 6865c64e3..2a1e51013 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -578,27 +578,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) {
 }
 
 void JitCompiler::Compile_CMP(Instruction instr) {
+    using Op = Instruction::Common::CompareOpType::Op;
+    Op op_x = instr.common.compare_op.x;
+    Op op_y = instr.common.compare_op.y;
+
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT };
+    // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
+    // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
+    // because they don't match when used with NaNs.
+    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE };
+
+    bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
+    Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1;
+    Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2;
 
-    if (instr.common.compare_op.x == instr.common.compare_op.y) {
+    if (op_x == op_y) {
         // Compare X-component and Y-component together
-        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]);
+        CMPPS(lhs_x, R(rhs_x), cmp[op_x]);
+        MOVQ_xmm(R(COND0), lhs_x);
 
-        MOVQ_xmm(R(COND0), SRC1);
         MOV(64, R(COND1), R(COND0));
     } else {
+        bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
+        Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1;
+        Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2;
+
         // Compare X-component
-        MOVAPS(SCRATCH, R(SRC1));
-        CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]);
+        MOVAPS(SCRATCH, R(lhs_x));
+        CMPSS(SCRATCH, R(rhs_x), cmp[op_x]);
 
         // Compare Y-component
-        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]);
+        CMPPS(lhs_y, R(rhs_y), cmp[op_y]);
 
         MOVQ_xmm(R(COND0), SCRATCH);
-        MOVQ_xmm(R(COND1), SRC1);
+        MOVQ_xmm(R(COND1), lhs_y);
     }
 
     SHR(32, R(COND0), Imm8(31));
-- 
cgit v1.2.3


From 76247170dfc43d1d01f71447893915ec02686990 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:46:10 -0300
Subject: Shader JIT: Add name to second scratch register (XMM4)

---
 src/video_core/shader/shader_jit_x64.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'src/video_core/shader/shader_jit_x64.cpp')

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 2a1e51013..e56bff2b8 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1;
 static const X64Reg SRC2 = XMM2;
 /// Loaded with the third swizzled source register, otherwise can be used as a scratch register
 static const X64Reg SRC3 = XMM3;
+/// Additional scratch register
+static const X64Reg SCRATCH2 = XMM4;
 /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
 static const X64Reg ONE = XMM14;
 /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
@@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
             u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
             BLENDPS(SCRATCH, R(src), mask);
         } else {
-            MOVAPS(XMM4, R(src));
-            UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination
+            MOVAPS(SCRATCH2, R(src));
+            UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination
             UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination
 
             // Compute selector to selectively copy source components to destination for SHUFPS instruction
@@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
                      ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
                      ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
                      ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
-            SHUFPS(SCRATCH, R(XMM4), sel);
+            SHUFPS(SCRATCH, R(SCRATCH2), sel);
         }
 
         // Store dest back to memory
-- 
cgit v1.2.3


From 082b74fa241ba71f27b2e3fdf756824a0467f7f4 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:46:58 -0300
Subject: Shaders: Explicitly conform to PICA semantics in MAX/MIN

---
 src/video_core/shader/shader_jit_x64.cpp | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src/video_core/shader/shader_jit_x64.cpp')

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index e56bff2b8..456c8567d 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -467,6 +467,7 @@ void JitCompiler::Compile_FLR(Instruction instr) {
 void JitCompiler::Compile_MAX(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
     MAXPS(SRC1, R(SRC2));
     Compile_DestEnable(instr, SRC1);
 }
@@ -474,6 +475,7 @@ void JitCompiler::Compile_MAX(Instruction instr) {
 void JitCompiler::Compile_MIN(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
     MINPS(SRC1, R(SRC2));
     Compile_DestEnable(instr, SRC1);
 }
-- 
cgit v1.2.3


From 630a850d4d5a0509b16e96aaccc81e9384e1fba8 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:48:15 -0300
Subject: Shaders: Fix multiplications between 0.0 and inf

The PICA200 semantics for multiplication are so that when multiplying
inf by exactly 0.0, the result is 0.0, instead of NaN, as defined by
IEEE. This is relied upon by games.

Fixes #1024 (missing OoT interface items)
---
 src/video_core/shader/shader_jit_x64.cpp | 78 ++++++++++++++++----------------
 1 file changed, 39 insertions(+), 39 deletions(-)

(limited to 'src/video_core/shader/shader_jit_x64.cpp')

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 456c8567d..ddae61cae 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -246,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
     }
 }
 
+void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
+    MOVAPS(scratch, R(src1));
+    CMPPS(scratch, R(src2), CMP_ORD);
+
+    MULPS(src1, R(src2));
+
+    MOVAPS(src2, R(src1));
+    CMPPS(src2, R(src2), CMP_UNORD);
+
+    XORPS(scratch, R(src2));
+    ANDPS(src1, R(scratch));
+}
+
 void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
     // Note: NXOR is used below to check for equality
     switch (instr.flow_control.op) {
@@ -309,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    if (Common::GetCPUCaps().sse4_1) {
-        DPPS(SRC1, R(SRC2), 0x7f);
-    } else {
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
 
-        MOVAPS(SRC3, R(SRC1));
-        SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
+    MOVAPS(SRC3, R(SRC1));
+    SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
 
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
-        ADDPS(SRC1, R(SRC2));
-        ADDPS(SRC1, R(SRC3));
-    }
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
+    ADDPS(SRC1, R(SRC2));
+    ADDPS(SRC1, R(SRC3));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -332,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    if (Common::GetCPUCaps().sse4_1) {
-        DPPS(SRC1, R(SRC2), 0xff);
-    } else {
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
-        ADDPS(SRC1, R(SRC2));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+    ADDPS(SRC1, R(SRC2));
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
-        ADDPS(SRC1, R(SRC2));
-    }
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+    ADDPS(SRC1, R(SRC2));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -361,23 +366,22 @@ void JitCompiler::Compile_DPH(Instruction instr) {
     if (Common::GetCPUCaps().sse4_1) {
         // Set 4th component to 1.0
         BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
-        DPPS(SRC1, R(SRC2), 0xff);
     } else {
         // Reverse to set the 4th component to 1.0
         SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
         MOVSS(SRC1, R(ONE));
         SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
+    }
 
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
-        ADDPS(SRC1, R(SRC2));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+    ADDPS(SRC1, R(SRC2));
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
-        ADDPS(SRC1, R(SRC2));
-    }
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+    ADDPS(SRC1, R(SRC2));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -417,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) {
 void JitCompiler::Compile_MUL(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
-    MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
     Compile_DestEnable(instr, SRC1);
 }
 
@@ -635,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
         Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
     }
 
-    if (Common::GetCPUCaps().fma) {
-        VFMADD213PS(SRC1, SRC2, R(SRC3));
-    } else {
-        MULPS(SRC1, R(SRC2));
-        ADDPS(SRC1, R(SRC3));
-    }
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+    ADDPS(SRC1, R(SRC3));
 
     Compile_DestEnable(instr, SRC1);
 }
-- 
cgit v1.2.3


From d8ef20c85695f6272dd71dbde1b8d04bac012dbe Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:15:39 -0300
Subject: Shader JIT: Tiny micro-optimization in DPH

---
 src/video_core/shader/shader_jit_x64.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/video_core/shader/shader_jit_x64.cpp')

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index ddae61cae..c8a669b51 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -367,10 +367,10 @@ void JitCompiler::Compile_DPH(Instruction instr) {
         // Set 4th component to 1.0
         BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
     } else {
-        // Reverse to set the 4th component to 1.0
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
-        MOVSS(SRC1, R(ONE));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
+        // Set 4th component to 1.0
+        MOVAPS(SCRATCH, R(SRC1));
+        UNPCKHPS(SCRATCH, R(ONE));  // XYZW, 1111 -> Z1__
+        UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1
     }
 
     Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
-- 
cgit v1.2.3