From 455147ee95e8de7af237fc66aba9bbd52474ec3b Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 23 Aug 2015 21:46:36 -0300
Subject: Shader JIT: Fix CMP NaN behavior to match hardware

---
 src/video_core/shader/shader_jit_x64.cpp | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 6865c64e3..2a1e51013 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -578,27 +578,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) {
 }
 
 void JitCompiler::Compile_CMP(Instruction instr) {
+    using Op = Instruction::Common::CompareOpType::Op;
+    Op op_x = instr.common.compare_op.x;
+    Op op_y = instr.common.compare_op.y;
+
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT };
+    // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
+    // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
+    // because they don't match when used with NaNs.
+    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE };
+
+    bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
+    Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1;
+    Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2;
 
-    if (instr.common.compare_op.x == instr.common.compare_op.y) {
+    if (op_x == op_y) {
         // Compare X-component and Y-component together
-        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]);
+        CMPPS(lhs_x, R(rhs_x), cmp[op_x]);
+        MOVQ_xmm(R(COND0), lhs_x);
 
-        MOVQ_xmm(R(COND0), SRC1);
         MOV(64, R(COND1), R(COND0));
     } else {
+        bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
+        Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1;
+        Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2;
+
         // Compare X-component
-        MOVAPS(SCRATCH, R(SRC1));
-        CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]);
+        MOVAPS(SCRATCH, R(lhs_x));
+        CMPSS(SCRATCH, R(rhs_x), cmp[op_x]);
 
         // Compare Y-component
-        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]);
+        CMPPS(lhs_y, R(rhs_y), cmp[op_y]);
 
         MOVQ_xmm(R(COND0), SCRATCH);
-        MOVQ_xmm(R(COND1), SRC1);
+        MOVQ_xmm(R(COND1), lhs_y);
     }
 
     SHR(32, R(COND0), Imm8(31));
-- 
cgit v1.2.3


From 76247170dfc43d1d01f71447893915ec02686990 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:46:10 -0300
Subject: Shader JIT: Add name to second scratch register (XMM4)

---
 src/video_core/shader/shader_jit_x64.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 2a1e51013..e56bff2b8 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1;
 static const X64Reg SRC2 = XMM2;
 /// Loaded with the third swizzled source register, otherwise can be used as a scratch register
 static const X64Reg SRC3 = XMM3;
+/// Additional scratch register
+static const X64Reg SCRATCH2 = XMM4;
 /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
 static const X64Reg ONE = XMM14;
 /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
@@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
             u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
             BLENDPS(SCRATCH, R(src), mask);
         } else {
-            MOVAPS(XMM4, R(src));
-            UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination
+            MOVAPS(SCRATCH2, R(src));
+            UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination
             UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination
 
             // Compute selector to selectively copy source components to destination for SHUFPS instruction
@@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
                      ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
                      ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
                      ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
-            SHUFPS(SCRATCH, R(XMM4), sel);
+            SHUFPS(SCRATCH, R(SCRATCH2), sel);
         }
 
         // Store dest back to memory
-- 
cgit v1.2.3


From 082b74fa241ba71f27b2e3fdf756824a0467f7f4 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:46:58 -0300
Subject: Shaders: Explicitly conform to PICA semantics in MAX/MIN

---
 src/video_core/shader/shader_interpreter.cpp | 10 ++++++++--
 src/video_core/shader/shader_jit_x64.cpp     |  2 ++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index ae5a30441..69e4efa68 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
 
-                    dest[i] = std::max(src1[i], src2[i]);
+                    // NOTE: Exact form required to match NaN semantics to hardware:
+                    //   max(0, NaN) -> NaN
+                    //   max(NaN, 0) -> 0
+                    dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i];
                 }
                 Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
                 break;
@@ -190,7 +193,10 @@ void RunInterpreter(UnitState<Debug>& state) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
 
-                    dest[i] = std::min(src1[i], src2[i]);
+                    // NOTE: Exact form required to match NaN semantics to hardware:
+                    //   min(0, NaN) -> NaN
+                    //   min(NaN, 0) -> 0
+                    dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i];
                 }
                 Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
                 break;
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index e56bff2b8..456c8567d 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -467,6 +467,7 @@ void JitCompiler::Compile_FLR(Instruction instr) {
 void JitCompiler::Compile_MAX(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
     MAXPS(SRC1, R(SRC2));
     Compile_DestEnable(instr, SRC1);
 }
@@ -474,6 +475,7 @@ void JitCompiler::Compile_MAX(Instruction instr) {
 void JitCompiler::Compile_MIN(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
     MINPS(SRC1, R(SRC2));
     Compile_DestEnable(instr, SRC1);
 }
-- 
cgit v1.2.3


From 630a850d4d5a0509b16e96aaccc81e9384e1fba8 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:48:15 -0300
Subject: Shaders: Fix multiplications between 0.0 and inf

The PICA200 semantics for multiplication are so that when multiplying
inf by exactly 0.0, the result is 0.0, instead of NaN, as defined by
IEEE. This is relied upon by games.

Fixes #1024 (missing OoT interface items)
---
 src/video_core/pica.h                    | 14 +++++-
 src/video_core/shader/shader_jit_x64.cpp | 78 ++++++++++++++++----------------
 src/video_core/shader/shader_jit_x64.h   |  6 +++
 3 files changed, 58 insertions(+), 40 deletions(-)

(limited to 'src')

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 58b924f9e..cf148de50 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -1021,12 +1021,20 @@ struct float24 {
         return ret;
     }
 
+    static float24 Zero() {
+        return FromFloat32(0.f);
+    }
+
     // Not recommended for anything but logging
     float ToFloat32() const {
         return value;
     }
 
     float24 operator * (const float24& flt) const {
+        if ((this->value == 0.f && flt.value == flt.value) ||
+            (flt.value == 0.f && this->value == this->value))
+            // PICA gives 0 instead of NaN when multiplying by inf
+            return Zero();
         return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
     }
 
@@ -1043,7 +1051,11 @@ struct float24 {
     }
 
     float24& operator *= (const float24& flt) {
-        value *= flt.ToFloat32();
+        if ((this->value == 0.f && flt.value == flt.value) ||
+            (flt.value == 0.f && this->value == this->value))
+            // PICA gives 0 instead of NaN when multiplying by inf
+            *this = Zero();
+        else value *= flt.ToFloat32();
         return *this;
     }
 
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 456c8567d..ddae61cae 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -246,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
     }
 }
 
+void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
+    MOVAPS(scratch, R(src1));
+    CMPPS(scratch, R(src2), CMP_ORD);
+
+    MULPS(src1, R(src2));
+
+    MOVAPS(src2, R(src1));
+    CMPPS(src2, R(src2), CMP_UNORD);
+
+    XORPS(scratch, R(src2));
+    ANDPS(src1, R(scratch));
+}
+
 void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
     // Note: NXOR is used below to check for equality
     switch (instr.flow_control.op) {
@@ -309,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    if (Common::GetCPUCaps().sse4_1) {
-        DPPS(SRC1, R(SRC2), 0x7f);
-    } else {
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
 
-        MOVAPS(SRC3, R(SRC1));
-        SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
+    MOVAPS(SRC3, R(SRC1));
+    SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
 
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
-        ADDPS(SRC1, R(SRC2));
-        ADDPS(SRC1, R(SRC3));
-    }
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
+    ADDPS(SRC1, R(SRC2));
+    ADDPS(SRC1, R(SRC3));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -332,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    if (Common::GetCPUCaps().sse4_1) {
-        DPPS(SRC1, R(SRC2), 0xff);
-    } else {
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
-        ADDPS(SRC1, R(SRC2));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+    ADDPS(SRC1, R(SRC2));
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
-        ADDPS(SRC1, R(SRC2));
-    }
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+    ADDPS(SRC1, R(SRC2));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -361,23 +366,22 @@ void JitCompiler::Compile_DPH(Instruction instr) {
     if (Common::GetCPUCaps().sse4_1) {
         // Set 4th component to 1.0
         BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
-        DPPS(SRC1, R(SRC2), 0xff);
     } else {
         // Reverse to set the 4th component to 1.0
         SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
         MOVSS(SRC1, R(ONE));
         SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
+    }
 
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
-        ADDPS(SRC1, R(SRC2));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+    ADDPS(SRC1, R(SRC2));
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
-        ADDPS(SRC1, R(SRC2));
-    }
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+    ADDPS(SRC1, R(SRC2));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -417,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) {
 void JitCompiler::Compile_MUL(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
-    MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
     Compile_DestEnable(instr, SRC1);
 }
 
@@ -635,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
         Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
     }
 
-    if (Common::GetCPUCaps().fma) {
-        VFMADD213PS(SRC1, SRC2, R(SRC3));
-    } else {
-        MULPS(SRC1, R(SRC2));
-        ADDPS(SRC1, R(SRC3));
-    }
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+    ADDPS(SRC1, R(SRC3));
 
     Compile_DestEnable(instr, SRC1);
 }
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index fbe19fe93..58828ecc8 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -68,6 +68,12 @@ private:
     void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
     void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
 
+    /**
+     * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
+     * zero by inf. Clobbers `src2` and `scratch`.
+     */
+    void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch);
+
     void Compile_EvaluateCondition(Instruction instr);
     void Compile_UniformCondition(Instruction instr);
 
-- 
cgit v1.2.3


From d8ef20c85695f6272dd71dbde1b8d04bac012dbe Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:15:39 -0300
Subject: Shader JIT: Tiny micro-optimization in DPH

---
 src/video_core/shader/shader_jit_x64.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index ddae61cae..c8a669b51 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -367,10 +367,10 @@ void JitCompiler::Compile_DPH(Instruction instr) {
         // Set 4th component to 1.0
         BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
     } else {
-        // Reverse to set the 4th component to 1.0
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
-        MOVSS(SRC1, R(ONE));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
+        // Set 4th component to 1.0
+        MOVAPS(SCRATCH, R(SRC1));
+        UNPCKHPS(SCRATCH, R(ONE));  // XYZW, 1111 -> Z1__
+        UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1
     }
 
     Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
-- 
cgit v1.2.3


From eff10959de41027da7a09298941fcb403f610192 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 02:10:11 -0300
Subject: fixup! Shaders: Fix multiplications between 0.0 and inf

---
 src/video_core/pica.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index cf148de50..bb689f2a9 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -1031,8 +1031,8 @@ struct float24 {
     }
 
     float24 operator * (const float24& flt) const {
-        if ((this->value == 0.f && flt.value == flt.value) ||
-            (flt.value == 0.f && this->value == this->value))
+        if ((this->value == 0.f && !std::isnan(flt.value)) ||
+            (flt.value == 0.f && !std::isnan(this->value)))
             // PICA gives 0 instead of NaN when multiplying by inf
             return Zero();
         return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
@@ -1051,8 +1051,8 @@ struct float24 {
     }
 
     float24& operator *= (const float24& flt) {
-        if ((this->value == 0.f && flt.value == flt.value) ||
-            (flt.value == 0.f && this->value == this->value))
+        if ((this->value == 0.f && !std::isnan(flt.value)) ||
+            (flt.value == 0.f && !std::isnan(this->value)))
             // PICA gives 0 instead of NaN when multiplying by inf
             *this = Zero();
         else value *= flt.ToFloat32();
-- 
cgit v1.2.3