Diffstat
 src/common/x64/emitter.h | 601
 1 file changed, 381 insertions(+), 220 deletions(-)
diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h
index 60a77dfe1..467f7812f 100644
--- a/src/common/x64/emitter.h
+++ b/src/common/x64/emitter.h
@@ -21,8 +21,8 @@
 #include "common/assert.h"
 #include "common/bit_set.h"
-#include "common/common_types.h"
 #include "common/code_block.h"
+#include "common/common_types.h"
 
 #if defined(ARCHITECTURE_x86_64) && !defined(_ARCH_64)
 #define _ARCH_64
 #endif
@@ -34,75 +34,145 @@
 #define PTRBITS 32
 #endif
 
-namespace Gen
-{
-
-enum X64Reg
-{
-    EAX = 0, EBX = 3, ECX = 1, EDX = 2,
-    ESI = 6, EDI = 7, EBP = 5, ESP = 4,
-
-    RAX = 0, RBX = 3, RCX = 1, RDX = 2,
-    RSI = 6, RDI = 7, RBP = 5, RSP = 4,
-    R8 = 8, R9 = 9, R10 = 10,R11 = 11,
-    R12 = 12,R13 = 13,R14 = 14,R15 = 15,
-
-    AL = 0, BL = 3, CL = 1, DL = 2,
-    SIL = 6, DIL = 7, BPL = 5, SPL = 4,
-    AH = 0x104, BH = 0x107, CH = 0x105, DH = 0x106,
-
-    AX = 0, BX = 3, CX = 1, DX = 2,
-    SI = 6, DI = 7, BP = 5, SP = 4,
-
-    XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
-    XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
-
-    YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
-    YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15,
+namespace Gen {
+
+enum X64Reg {
+    EAX = 0,
+    EBX = 3,
+    ECX = 1,
+    EDX = 2,
+    ESI = 6,
+    EDI = 7,
+    EBP = 5,
+    ESP = 4,
+
+    RAX = 0,
+    RBX = 3,
+    RCX = 1,
+    RDX = 2,
+    RSI = 6,
+    RDI = 7,
+    RBP = 5,
+    RSP = 4,
+    R8 = 8,
+    R9 = 9,
+    R10 = 10,
+    R11 = 11,
+    R12 = 12,
+    R13 = 13,
+    R14 = 14,
+    R15 = 15,
+
+    AL = 0,
+    BL = 3,
+    CL = 1,
+    DL = 2,
+    SIL = 6,
+    DIL = 7,
+    BPL = 5,
+    SPL = 4,
+    AH = 0x104,
+    BH = 0x107,
+    CH = 0x105,
+    DH = 0x106,
+
+    AX = 0,
+    BX = 3,
+    CX = 1,
+    DX = 2,
+    SI = 6,
+    DI = 7,
+    BP = 5,
+    SP = 4,
+
+    XMM0 = 0,
+    XMM1,
+    XMM2,
+    XMM3,
+    XMM4,
+    XMM5,
+    XMM6,
+    XMM7,
+    XMM8,
+    XMM9,
+    XMM10,
+    XMM11,
+    XMM12,
+    XMM13,
+    XMM14,
+    XMM15,
+
+    YMM0 = 0,
+    YMM1,
+    YMM2,
+    YMM3,
+    YMM4,
+    YMM5,
+    YMM6,
+    YMM7,
+    YMM8,
+    YMM9,
+    YMM10,
+    YMM11,
+    YMM12,
+    YMM13,
+    YMM14,
+    YMM15,
 
     INVALID_REG = 0xFFFFFFFF
 };
 
-enum CCFlags
-{
-    CC_O = 0,
-    CC_NO = 1,
-    CC_B = 2, CC_C = 2, CC_NAE = 2,
-    CC_NB = 3, CC_NC = 3, CC_AE = 3,
-    CC_Z = 4, CC_E = 4,
-    CC_NZ = 5, CC_NE = 5,
-    CC_BE = 6, CC_NA = 6,
-    CC_NBE = 7, CC_A = 7,
-    CC_S = 8,
-    CC_NS = 9,
-    CC_P = 0xA, CC_PE = 0xA,
-    CC_NP = 0xB, CC_PO = 0xB,
-    CC_L = 0xC, CC_NGE = 0xC,
-    CC_NL = 0xD, CC_GE = 0xD,
-    CC_LE = 0xE, CC_NG = 0xE,
-    CC_NLE = 0xF, CC_G = 0xF
+enum CCFlags {
+    CC_O = 0,
+    CC_NO = 1,
+    CC_B = 2,
+    CC_C = 2,
+    CC_NAE = 2,
+    CC_NB = 3,
+    CC_NC = 3,
+    CC_AE = 3,
+    CC_Z = 4,
+    CC_E = 4,
+    CC_NZ = 5,
+    CC_NE = 5,
+    CC_BE = 6,
+    CC_NA = 6,
+    CC_NBE = 7,
+    CC_A = 7,
+    CC_S = 8,
+    CC_NS = 9,
+    CC_P = 0xA,
+    CC_PE = 0xA,
+    CC_NP = 0xB,
+    CC_PO = 0xB,
+    CC_L = 0xC,
+    CC_NGE = 0xC,
+    CC_NL = 0xD,
+    CC_GE = 0xD,
+    CC_LE = 0xE,
+    CC_NG = 0xE,
+    CC_NLE = 0xF,
+    CC_G = 0xF
 };
 
-enum
-{
+enum {
     NUMGPRs = 16,
     NUMXMMs = 16,
 };
 
-enum
-{
+enum {
     SCALE_NONE = 0,
     SCALE_1 = 1,
     SCALE_2 = 2,
     SCALE_4 = 4,
     SCALE_8 = 8,
     SCALE_ATREG = 16,
-    //SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
+    // SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
    SCALE_NOBASE_2 = 34,
     SCALE_NOBASE_4 = 36,
     SCALE_NOBASE_8 = 40,
     SCALE_RIP = 0xFF,
-    SCALE_IMM8  = 0xF0,
+    SCALE_IMM8 = 0xF0,
     SCALE_IMM16 = 0xF1,
     SCALE_IMM32 = 0xF2,
     SCALE_IMM64 = 0xF3,
@@ -114,7 +184,7 @@ enum NormalOp {
     nrmSUB,
     nrmSBB,
     nrmAND,
-    nrmOR ,
+    nrmOR,
     nrmXOR,
     nrmMOV,
     nrmTEST,
@@ -157,68 +227,74 @@ enum FloatRound {
 class XEmitter;
 
 // RIP addressing does not benefit from micro op fusion on Core arch
-struct OpArg
-{
+struct OpArg {
     friend class XEmitter;
 
-    constexpr OpArg() = default;  // dummy op arg, used for storage
+    constexpr OpArg() = default; // dummy op arg, used for storage
     constexpr OpArg(u64 offset_, int scale_, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
-        : scale(static_cast<u8>(scale_))
-        , offsetOrBaseReg(static_cast<u16>(rmReg))
-        , indexReg(static_cast<u16>(scaledReg))
-        , offset(offset_)
-    {
+        : scale(static_cast<u8>(scale_)), offsetOrBaseReg(static_cast<u16>(rmReg)),
+          indexReg(static_cast<u16>(scaledReg)), offset(offset_) {
     }
 
-    constexpr bool operator==(const OpArg &b) const
-    {
-        return operandReg == b.operandReg &&
-               scale == b.scale &&
-               offsetOrBaseReg == b.offsetOrBaseReg &&
-               indexReg == b.indexReg &&
-               offset == b.offset;
+    constexpr bool operator==(const OpArg& b) const {
+        return operandReg == b.operandReg && scale == b.scale &&
+               offsetOrBaseReg == b.offsetOrBaseReg && indexReg == b.indexReg && offset == b.offset;
     }
 
-    void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
-    void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const;
-    void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const;
-    void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
-    void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const;
-
-    constexpr bool IsImm() const { return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64; }
-    constexpr bool IsSimpleReg() const { return scale == SCALE_NONE; }
-    constexpr bool IsSimpleReg(X64Reg reg) const
-    {
+    void WriteRex(XEmitter* emit, int opBits, int bits, int customOp = -1) const;
+    void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm,
+                  int W = 0) const;
+    void WriteRest(XEmitter* emit, int extraBytes = 0, X64Reg operandReg = INVALID_REG,
+                   bool warn_64bit_offset = true) const;
+    void WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg operandReg, int bits);
+    void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand,
+                       int bits) const;
+
+    constexpr bool IsImm() const {
+        return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 ||
+               scale == SCALE_IMM64;
+    }
+    constexpr bool IsSimpleReg() const {
+        return scale == SCALE_NONE;
+    }
+    constexpr bool IsSimpleReg(X64Reg reg) const {
         return IsSimpleReg() && GetSimpleReg() == reg;
     }
 
-    int GetImmBits() const
-    {
-        switch (scale)
-        {
-        case SCALE_IMM8: return 8;
-        case SCALE_IMM16: return 16;
-        case SCALE_IMM32: return 32;
-        case SCALE_IMM64: return 64;
-        default: return -1;
+    int GetImmBits() const {
+        switch (scale) {
+        case SCALE_IMM8:
+            return 8;
+        case SCALE_IMM16:
+            return 16;
+        case SCALE_IMM32:
+            return 32;
+        case SCALE_IMM64:
+            return 64;
+        default:
+            return -1;
         }
     }
 
     void SetImmBits(int bits) {
-        switch (bits)
-        {
-        case 8: scale = SCALE_IMM8; break;
-        case 16: scale = SCALE_IMM16; break;
-        case 32: scale = SCALE_IMM32; break;
-        case 64: scale = SCALE_IMM64; break;
+        switch (bits) {
+        case 8:
+            scale = SCALE_IMM8;
+            break;
+        case 16:
+            scale = SCALE_IMM16;
+            break;
+        case 32:
+            scale = SCALE_IMM32;
+            break;
+        case 64:
+            scale = SCALE_IMM64;
+            break;
         }
     }
 
-    constexpr X64Reg GetSimpleReg() const
-    {
-        return scale == SCALE_NONE
-            ? static_cast<X64Reg>(offsetOrBaseReg)
-            : INVALID_REG;
+    constexpr X64Reg GetSimpleReg() const {
+        return scale == SCALE_NONE ? static_cast<X64Reg>(offsetOrBaseReg) : INVALID_REG;
     }
 
     constexpr u32 GetImmValue() const {
@@ -234,41 +310,50 @@ private:
     u8 scale = 0;
     u16 offsetOrBaseReg = 0;
     u16 indexReg = 0;
-    u64 offset = 0;  // use RIP-relative as much as possible - 64-bit immediates are not available.
+    u64 offset = 0; // use RIP-relative as much as possible - 64-bit immediates are not available.
     u16 operandReg = 0;
 };
 
 template <typename T>
-inline OpArg M(const T *ptr) { return OpArg(reinterpret_cast<u64>(ptr), static_cast<int>(SCALE_RIP)); }
-constexpr OpArg R(X64Reg value) { return OpArg(0, SCALE_NONE, value); }
-constexpr OpArg MatR(X64Reg value) { return OpArg(0, SCALE_ATREG, value); }
+inline OpArg M(const T* ptr) {
+    return OpArg(reinterpret_cast<u64>(ptr), static_cast<int>(SCALE_RIP));
+}
+constexpr OpArg R(X64Reg value) {
+    return OpArg(0, SCALE_NONE, value);
+}
+constexpr OpArg MatR(X64Reg value) {
+    return OpArg(0, SCALE_ATREG, value);
+}
 
-constexpr OpArg MDisp(X64Reg value, int offset)
-{
+constexpr OpArg MDisp(X64Reg value, int offset) {
     return OpArg(static_cast<u32>(offset), SCALE_ATREG, value);
 }
 
-constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
-{
+constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) {
     return OpArg(offset, scale, base, scaled);
 }
 
-constexpr OpArg MScaled(X64Reg scaled, int scale, int offset)
-{
-    return scale == SCALE_1
-        ? OpArg(offset, SCALE_ATREG, scaled)
-        : OpArg(offset, scale | 0x20, RAX, scaled);
+constexpr OpArg MScaled(X64Reg scaled, int scale, int offset) {
+    return scale == SCALE_1 ? OpArg(offset, SCALE_ATREG, scaled)
+                            : OpArg(offset, scale | 0x20, RAX, scaled);
 }
 
-constexpr OpArg MRegSum(X64Reg base, X64Reg offset)
-{
+constexpr OpArg MRegSum(X64Reg base, X64Reg offset) {
     return MComplex(base, offset, 1, 0);
 }
 
-constexpr OpArg Imm8 (u8 imm) { return OpArg(imm, SCALE_IMM8); }
-constexpr OpArg Imm16(u16 imm) { return OpArg(imm, SCALE_IMM16); } //rarely used
-constexpr OpArg Imm32(u32 imm) { return OpArg(imm, SCALE_IMM32); }
-constexpr OpArg Imm64(u64 imm) { return OpArg(imm, SCALE_IMM64); }
+constexpr OpArg Imm8(u8 imm) {
+    return OpArg(imm, SCALE_IMM8);
+}
+constexpr OpArg Imm16(u16 imm) {
+    return OpArg(imm, SCALE_IMM16);
+} // rarely used
+constexpr OpArg Imm32(u32 imm) {
+    return OpArg(imm, SCALE_IMM32);
+}
+constexpr OpArg Imm64(u64 imm) {
+    return OpArg(imm, SCALE_IMM64);
+}
 
 constexpr OpArg UImmAuto(u32 imm) {
     return OpArg(imm, imm >= 128 ? SCALE_IMM32 : SCALE_IMM8);
 }
@@ -277,8 +362,7 @@ constexpr OpArg SImmAuto(s32 imm) {
 }
 
 template <typename T>
-OpArg ImmPtr(const T* imm)
-{
+OpArg ImmPtr(const T* imm) {
 #ifdef _ARCH_64
     return Imm64(reinterpret_cast<u64>(imm));
 #else
@@ -286,36 +370,31 @@ OpArg ImmPtr(const T* imm)
 #endif
 }
 
-inline u32 PtrOffset(const void* ptr, const void* base)
-{
+inline u32 PtrOffset(const void* ptr, const void* base) {
 #ifdef _ARCH_64
-    s64 distance = (s64)ptr-(s64)base;
-    if (distance >= 0x80000000LL ||
-        distance < -0x80000000LL)
-    {
+    s64 distance = (s64)ptr - (s64)base;
+    if (distance >= 0x80000000LL || distance < -0x80000000LL) {
         ASSERT_MSG(0, "pointer offset out of range");
         return 0;
     }
     return (u32)distance;
 #else
-    return (u32)ptr-(u32)base;
+    return (u32)ptr - (u32)base;
 #endif
 }
 
-//usage: int a[]; ARRAY_OFFSET(a,10)
-#define ARRAY_OFFSET(array,index) ((u32)((u64)&(array)[index]-(u64)&(array)[0]))
-//usage: struct {int e;} s; STRUCT_OFFSET(s,e)
-#define STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str)))
+// usage: int a[]; ARRAY_OFFSET(a,10)
+#define ARRAY_OFFSET(array, index) ((u32)((u64) & (array)[index] - (u64) & (array)[0]))
+// usage: struct {int e;} s; STRUCT_OFFSET(s,e)
+#define STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str)))
 
-struct FixupBranch
-{
-    u8 *ptr;
-    int type; //0 = 8bit 1 = 32bit
+struct FixupBranch {
+    u8* ptr;
+    int type; // 0 = 8bit 1 = 32bit
 };
 
-enum SSECompare
-{
+enum SSECompare {
     EQ = 0,
     LT,
     LE,
@@ -326,11 +405,10 @@ enum SSECompare
     ORD,
 };
 
-class XEmitter
-{
-    friend struct OpArg;  // for Write8 etc
+class XEmitter {
+    friend struct OpArg; // for Write8 etc
 private:
-    u8 *code;
+    u8* code;
     bool flags_locked;
 
     void CheckFlags();
@@ -347,14 +425,19 @@ private:
     void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
     void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
     void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
-    void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
-    void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
-    void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
-    void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
+    void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                    int extrabytes = 0);
+    void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                    int extrabytes = 0);
+    void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                     int extrabytes = 0);
+    void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                     int extrabytes = 0);
     void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg);
-    void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
+    void WriteNormalOp(XEmitter* emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
 
-    void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
+    void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size,
+                                size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
 
 protected:
     void Write8(u8 value);
@@ -363,26 +446,38 @@ protected:
     void Write64(u64 value);
 
 public:
-    XEmitter() { code = nullptr; flags_locked = false; }
-    XEmitter(u8 *code_ptr) { code = code_ptr; flags_locked = false; }
-    virtual ~XEmitter() {}
+    XEmitter() {
+        code = nullptr;
+        flags_locked = false;
+    }
+    XEmitter(u8* code_ptr) {
+        code = code_ptr;
+        flags_locked = false;
+    }
+    virtual ~XEmitter() {
+    }
 
     void WriteModRM(int mod, int rm, int reg);
     void WriteSIB(int scale, int index, int base);
 
-    void SetCodePtr(u8 *ptr);
+    void SetCodePtr(u8* ptr);
     void ReserveCodeSpace(int bytes);
-    const u8 *AlignCode4();
-    const u8 *AlignCode16();
-    const u8 *AlignCodePage();
-    const u8 *GetCodePtr() const;
-    u8 *GetWritableCodePtr();
-
-    void LockFlags() { flags_locked = true; }
-    void UnlockFlags() { flags_locked = false; }
+    const u8* AlignCode4();
+    const u8* AlignCode16();
+    const u8* AlignCodePage();
+    const u8* GetCodePtr() const;
+    u8* GetWritableCodePtr();
+
+    void LockFlags() {
+        flags_locked = true;
+    }
+    void UnlockFlags() {
+        flags_locked = false;
+    }
 
     // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
-    // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
+    // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other
+    // string instr.,
     // INC and DEC are slow on Intel Core, but not on AMD. They create a
     // false flag dependency because they only update a subset of the flags.
     // XCHG is SLOW and should be avoided.
@@ -401,11 +496,11 @@ public:
     void CLC();
     void CMC();
 
-    // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and AMD!
+    // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and
+    // AMD!
     void LAHF(); // 3 cycle vector path
     void SAHF(); // direct path fast
 
-
     // Stack control
     void PUSH(X64Reg reg);
     void POP(X64Reg reg);
@@ -422,7 +517,7 @@ public:
     void JMP(const u8* addr, bool force5Bytes = false);
     void JMPptr(const OpArg& arg);
-    void JMPself(); //infinite loop!
+    void JMPself(); // infinite loop!
 #ifdef CALL
 #undef CALL
 #endif
@@ -450,12 +545,11 @@ public:
     void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit
 
     // Cache control
-    enum PrefetchLevel
-    {
-        PF_NTA, //Non-temporal (data used once and only once)
-        PF_T0,  //All cache levels
-        PF_T1,  //Levels 2+ (aliased to T0 on AMD)
-        PF_T2,  //Levels 3+ (aliased to T0 on AMD)
+    enum PrefetchLevel {
+        PF_NTA, // Non-temporal (data used once and only once)
+        PF_T0,  // All cache levels
+        PF_T1,  // Levels 2+ (aliased to T0 on AMD)
+        PF_T2,  // Levels 3+ (aliased to T0 on AMD)
     };
     void PREFETCH(PrefetchLevel level, OpArg arg);
     void MOVNTI(int bits, const OpArg& dest, X64Reg src);
@@ -464,8 +558,8 @@ public:
     void MOVNTPD(const OpArg& arg, X64Reg regOp);
 
     // Multiplication / division
-    void MUL(int bits, const OpArg& src); //UNSIGNED
-    void IMUL(int bits, const OpArg& src); //SIGNED
+    void MUL(int bits, const OpArg& src);  // UNSIGNED
+    void IMUL(int bits, const OpArg& src); // SIGNED
     void IMUL(int bits, X64Reg regOp, const OpArg& src);
     void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm);
     void DIV(int bits, const OpArg& src);
@@ -492,11 +586,19 @@ public:
     // Extend EAX into EDX in various ways
     void CWD(int bits = 16);
-    void CDQ() {CWD(32);}
-    void CQO() {CWD(64);}
+    void CDQ() {
+        CWD(32);
+    }
+    void CQO() {
+        CWD(64);
+    }
     void CBW(int bits = 8);
-    void CWDE() {CBW(16);}
-    void CDQE() {CBW(32);}
+    void CWDE() {
+        CBW(16);
+    }
+    void CDQE() {
+        CBW(32);
+    }
 
     // Load effective address
     void LEA(int bits, X64Reg dest, OpArg src);
@@ -511,7 +613,7 @@ public:
     void CMP(int bits, const OpArg& a1, const OpArg& a2);
 
     // Bit operations
-    void NOT (int bits, const OpArg& src);
+    void NOT(int bits, const OpArg& src);
     void OR(int bits, const OpArg& a1, const OpArg& a2);
     void XOR(int bits, const OpArg& a1, const OpArg& a2);
     void MOV(int bits, const OpArg& a1, const OpArg& a2);
@@ -525,7 +627,8 @@ public:
     void BSWAP(int bits, X64Reg reg);
 
     // Sign/zero extension
-    void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
+    void MOVSX(int dbits, int sbits, X64Reg dest,
+               OpArg src); // automatically uses MOVSXD if necessary
     void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
 
     // Available only on Atom or >= Haswell so far. Test with GetCPUCaps().movbe.
@@ -593,13 +696,27 @@ public:
     void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare);
     void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare);
 
-    void CMPEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_EQ); }
-    void CMPLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LT); }
-    void CMPLESS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LE); }
-    void CMPUNORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_UNORD); }
-    void CMPNEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NEQ); }
-    void CMPNLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NLT); }
-    void CMPORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_ORD); }
+    void CMPEQSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_EQ);
+    }
+    void CMPLTSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_LT);
+    }
+    void CMPLESS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_LE);
+    }
+    void CMPUNORDSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_UNORD);
+    }
+    void CMPNEQSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_NEQ);
+    }
+    void CMPNLTSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_NLT);
+    }
+    void CMPORDSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_ORD);
+    }
 
     // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
     void ADDPS(X64Reg regOp, const OpArg& arg);
@@ -638,10 +755,12 @@ public:
     // SSE/SSE2: Useful alternative to shuffle in some cases.
     void MOVDDUP(X64Reg regOp, const OpArg& arg);
 
-    // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
+    // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily
+    // on Ivy.
     void HADDPS(X64Reg dest, const OpArg& src);
 
-    // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
+    // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg
+    // contains both a read mask and a write "mask".
     void DPPS(X64Reg dest, const OpArg& src, u8 arg);
 
     void UNPCKLPS(X64Reg dest, const OpArg& src);
@@ -694,11 +813,13 @@ public:
     void MOVD_xmm(const OpArg& arg, X64Reg src);
     void MOVQ_xmm(OpArg arg, X64Reg src);
 
-    // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
+    // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in
+    // question.
     void MOVMSKPS(X64Reg dest, const OpArg& arg);
     void MOVMSKPD(X64Reg dest, const OpArg& arg);
 
-    // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one.
+    // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a
+    // weird one.
     void MASKMOVDQU(X64Reg dest, X64Reg src);
     void LDDQU(X64Reg dest, const OpArg& src);
@@ -729,10 +850,10 @@ public:
     void PACKUSDW(X64Reg dest, const OpArg& arg);
     void PACKUSWB(X64Reg dest, const OpArg& arg);
 
-    void PUNPCKLBW(X64Reg dest, const OpArg &arg);
-    void PUNPCKLWD(X64Reg dest, const OpArg &arg);
-    void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
-    void PUNPCKLQDQ(X64Reg dest, const OpArg &arg);
+    void PUNPCKLBW(X64Reg dest, const OpArg& arg);
+    void PUNPCKLWD(X64Reg dest, const OpArg& arg);
+    void PUNPCKLDQ(X64Reg dest, const OpArg& arg);
+    void PUNPCKLQDQ(X64Reg dest, const OpArg& arg);
 
     void PTEST(X64Reg dest, const OpArg& arg);
     void PAND(X64Reg dest, const OpArg& arg);
@@ -839,25 +960,57 @@ public:
     void ROUNDPS(X64Reg dest, const OpArg& arg, u8 mode);
     void ROUNDPD(X64Reg dest, const OpArg& arg, u8 mode);
 
-    void ROUNDNEARSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_ZERO);
+    }
 
-    void ROUNDNEARSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_ZERO);
+    }
 
-    void ROUNDNEARPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_ZERO);
+    }
 
-    void ROUNDNEARPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_ZERO);
+    }
 
     // AVX
     void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
@@ -981,7 +1134,6 @@ public:
     void ABI_CallFunctionC16(const void* func, u16 param1);
     void ABI_CallFunctionCC16(const void* func, u32 param1, u16 param2);
 
-
     // These only support u32 parameters, but that's enough for a lot of uses.
     // These will destroy the 1 or 2 first "parameter regs".
     void ABI_CallFunctionC(const void* func, u32 param1);
@@ -1012,29 +1164,38 @@ public:
      *
     * @param mask Registers to push on the stack (high 16 bits are XMMs, low 16 bits are GPRs)
     * @param rsp_alignment Current alignment of the stack pointer, must be 0 or 8
-     * @param needed_frame_size Additional space needed, e.g., for function arguments passed on the stack
+     * @param needed_frame_size Additional space needed, e.g., for function arguments passed on the
+     * stack
     * @return Size of the shadow space, i.e., offset of the frame
     */
-    size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
+    size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+                                           size_t needed_frame_size = 0);
 
     /**
-     * Restores specified registers and adjusts the stack to its original alignment, i.e., the alignment before
+     * Restores specified registers and adjusts the stack to its original alignment, i.e., the
+     * alignment before
     * the matching PushRegistersAndAdjustStack.
     *
-     * @param mask Registers to restores from the stack (high 16 bits are XMMs, low 16 bits are GPRs)
-     * @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must be 0 or 8
+     * @param mask Registers to restores from the stack (high 16 bits are XMMs, low 16 bits are
+     * GPRs)
+     * @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must
+     * be 0 or 8
     * @param needed_frame_size Additional space that was needed
     * @warning Stack must be currently 16-byte aligned
     */
-    void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
-
-    #ifdef _M_IX86
-    static int ABI_GetNumXMMRegs() { return 8; }
-    #else
-    static int ABI_GetNumXMMRegs() { return 16; }
-    #endif
-};  // class XEmitter
+    void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+                                        size_t needed_frame_size = 0);
+#ifdef _M_IX86
+    static int ABI_GetNumXMMRegs() {
+        return 8;
+    }
+#else
+    static int ABI_GetNumXMMRegs() {
+        return 16;
+    }
+#endif
+}; // class XEmitter
 
 // Everything that needs to generate X86 code should inherit from this.
 // You get memory management for free, plus, you can use all the MOV etc functions without
@@ -1045,4 +1206,4 @@ public:
     void PoisonMemory() override;
 };
 
-}  // namespace
+} // namespace
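
Every hunk above is a pure clang-format pass, so the commit changes layout only; the emitter's behaviour and public API are untouched. For orientation, here is a minimal usage sketch of the Gen::XEmitter interface declared in this header. It is illustrative, not code from this commit: it assumes the ADD and RET members that fall outside the diff context, the System V x86-64 calling convention (first two integer arguments in EDI/ESI; Windows x64 uses ECX/EDX instead), and a buffer that has already been made writable and executable, which CodeBlock normally arranges in-tree.

```cpp
#include "common/x64/emitter.h"

using AddFn = int (*)(int, int);

// Emit `return a + b;` into `buffer` and hand back a callable pointer.
// `buffer` must point at writable+executable memory (mmap/VirtualAlloc);
// this free-standing variant is hypothetical.
AddFn EmitAdd(u8* buffer) {
    using namespace Gen;
    XEmitter emit(buffer);

    emit.MOV(32, R(EAX), R(EDI)); // eax = a   (SysV: 1st integer arg in edi)
    emit.ADD(32, R(EAX), R(ESI)); // eax += b  (SysV: 2nd integer arg in esi)
    emit.RET();                   // result is returned in eax

    return reinterpret_cast<AddFn>(buffer);
}
```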
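The OpArg helpers reformatted above (R, MatR, MDisp, MComplex, MScaled and the Imm* family) each encode one x86 addressing mode. A rough sketch of the correspondence, using the two-operand MOV declared in this header and the same include as the previous example; operand sizes are picked arbitrarily for illustration:

```cpp
// Mapping from OpArg helpers to x86 addressing modes (assembly in comments).
void AddressingExamples(Gen::XEmitter& emit) {
    using namespace Gen;
    emit.MOV(32, R(EAX), R(EBX));                         // mov eax, ebx
    emit.MOV(32, R(EAX), MatR(RBX));                      // mov eax, [rbx]
    emit.MOV(32, R(EAX), MDisp(RBX, 16));                 // mov eax, [rbx + 16]
    emit.MOV(32, R(EAX), MComplex(RBX, RCX, SCALE_4, 8)); // mov eax, [rbx + rcx*4 + 8]
    emit.MOV(32, R(EAX), MScaled(RCX, SCALE_8, 0));       // mov eax, [rcx*8]
    emit.MOV(32, MDisp(RSP, 8), Imm32(42));               // mov dword [rsp + 8], 42
}
```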
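Finally, the ABI_PushRegistersAndAdjustStack/ABI_PopRegistersAndAdjustStack pair documented at the end of the class is meant to bracket calls out of JIT code. The sketch below shows the intended shape of such a call site; the register mask construction and the plain ABI_CallFunction overload are drawn from the surrounding ABI code rather than from this diff, so treat both as assumptions:

```cpp
// Spill live caller-saved registers, call into C code, then restore.
// Per the doc comment, the mask packs GPRs in the low 16 bits and XMMs
// in the high 16 bits.
void CallOutOfJit(Gen::XEmitter& emit, const void* func) {
    BitSet32 saved((1u << Gen::RSI) | (1u << Gen::RDI)); // hypothetical live set
    emit.ABI_PushRegistersAndAdjustStack(saved, /*rsp_alignment=*/8);
    emit.ABI_CallFunction(func); // assumed member, cf. ABI_CallFunctionC above
    emit.ABI_PopRegistersAndAdjustStack(saved, /*rsp_alignment=*/8);
}
```

The rsp_alignment argument of 8 reflects the usual state just after a CALL pushed a return address; both calls must receive the same mask and alignment so the stack is restored exactly.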