15 files changed, 648 insertions, 896 deletions
diff --git a/appveyor.yml b/appveyor.yml
index 06c9a7909..c9edb9e19 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -2,28 +2,28 @@
 os: unstable
 
 # shallow clone
-clone_depth: 1
+clone_depth: 5
 
 environment:
-  QTDIR: C:\Qt\5.4\msvc2013_opengl
+  QTDIR: C:\Qt\5.4\msvc2013_64_opengl
   MEGA_EMAIL:
     secure: rEo9CGAYX87GKTqZCZ9vLCNCNqxO5JLgbERaHF3YJWg=
   MEGA_PASSWORD:
     secure: zE1zmgjS/6GfN/19ROl/O0fVR58svORQ5gdtsxI7J8k=
 
 platform:
-  - Win32
+  - x64
 
 configuration:
   - Release
 
 install:
-  - git submodule update --init --recursive
+  - git submodule update --init --recursive --depth 20
 
 before_build:
   - mkdir build
   - cd build
-  - cmake ..
+  - cmake -G "Visual Studio 12 Win64" ..
   - cd ..
 
 after_build:
@@ -32,16 +32,16 @@ after_build:
   - wget -q http://megatools.megous.com/builds/megatools-1.9.94-win64.zip
     # extract megatools silently. See http://stackoverflow.com/a/11629736/1748450
   - 7z x megatools-1.9.94-win64.zip | FIND /V "ing  "
-    # copy the qt dlls 
-  - copy C:\Qt\5.4\msvc2013_opengl\bin\icudt53.dll build\bin\release
-  - copy C:\Qt\5.4\msvc2013_opengl\bin\icuin53.dll build\bin\release
-  - copy C:\Qt\5.4\msvc2013_opengl\bin\icuuc53.dll build\bin\release
-  - copy C:\Qt\5.4\msvc2013_opengl\bin\Qt5Core.dll build\bin\release
-  - copy C:\Qt\5.4\msvc2013_opengl\bin\Qt5Gui.dll  build\bin\release
-  - copy C:\Qt\5.4\msvc2013_opengl\bin\Qt5OpenGL.dll build\bin\release
-  - copy C:\Qt\5.4\msvc2013_opengl\bin\Qt5Widgets.dll  build\bin\release
+    # copy the qt dlls
+  - copy C:\Qt\5.4\msvc2013_64_opengl\bin\icudt53.dll build\bin\release
+  - copy C:\Qt\5.4\msvc2013_64_opengl\bin\icuin53.dll build\bin\release
+  - copy C:\Qt\5.4\msvc2013_64_opengl\bin\icuuc53.dll build\bin\release
+  - copy C:\Qt\5.4\msvc2013_64_opengl\bin\Qt5Core.dll build\bin\release
+  - copy C:\Qt\5.4\msvc2013_64_opengl\bin\Qt5Gui.dll  build\bin\release
+  - copy C:\Qt\5.4\msvc2013_64_opengl\bin\Qt5OpenGL.dll build\bin\release
+  - copy C:\Qt\5.4\msvc2013_64_opengl\bin\Qt5Widgets.dll  build\bin\release
   - mkdir build\bin\release\platforms\
-  - copy C:\Qt\5.4\msvc2013_opengl\plugins\platforms\qwindows.dll build\bin\release\platforms
+  - copy C:\Qt\5.4\msvc2013_64_opengl\plugins\platforms\qwindows.dll build\bin\release\platforms
     # zip up the build folder -> build.7z
   - 7z a build .\build\bin\release\*
     # rename, upload to Mega
diff --git a/src/common/logging/log.h b/src/common/logging/log.h
index 3d94bf0d9..897ef36b8 100644
--- a/src/common/logging/log.h
+++ b/src/common/logging/log.h
@@ -32,7 +32,7 @@ typedef u8 ClassType;
 /**
  * Specifies the sub-system that generated the log message.
  *
- * @note If you add a new entry here, also add a corresponding one to `ALL_LOG_CLASSES` in log.cpp.
+ * @note If you add a new entry here, also add a corresponding one to `ALL_LOG_CLASSES` in backend.cpp.
  */
 enum class Class : ClassType {
     Log,                        ///< Messages about the log system itself
diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h
index e612f7439..ef37ee055 100644
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -86,6 +86,15 @@ public:
     virtual void AddTicks(u64 ticks) = 0;
 
     /**
+     * Initializes a CPU context for use on this CPU
+     * @param context Thread context to reset
+     * @param stack_top Pointer to the top of the stack
+     * @param entry_point Entry point for execution
+     * @param arg User argument for thread
+     */
+    virtual void ResetContext(Core::ThreadContext& context, u32 stack_top, u32 entry_point, u32 arg) = 0;
+
+    /**
      * Saves the current CPU context
      * @param ctx Thread context to save
      */
diff --git a/src/core/arm/dyncom/arm_dyncom.cpp b/src/core/arm/dyncom/arm_dyncom.cpp
index f6628ca33..68fddc94f 100644
--- a/src/core/arm/dyncom/arm_dyncom.cpp
+++ b/src/core/arm/dyncom/arm_dyncom.cpp
@@ -93,6 +93,16 @@ void ARM_DynCom::ExecuteInstructions(int num_instructions) {
     AddTicks(ticks_executed);
 }
 
+void ARM_DynCom::ResetContext(Core::ThreadContext& context, u32 stack_top, u32 entry_point, u32 arg) {
+    memset(&context, 0, sizeof(Core::ThreadContext));
+
+    context.cpu_registers[0] = arg;
+    context.pc = entry_point;
+    context.sp = stack_top;
+    context.cpsr = 0x1F; // Usermode
+    context.mode = 8;    // Instructs dyncom CPU core to start execution as if it's "resuming" a thread.
+}
+
 void ARM_DynCom::SaveContext(Core::ThreadContext& ctx) {
     memcpy(ctx.cpu_registers, state->Reg, sizeof(ctx.cpu_registers));
     memcpy(ctx.fpu_registers, state->ExtReg, sizeof(ctx.fpu_registers));
diff --git a/src/core/arm/dyncom/arm_dyncom.h b/src/core/arm/dyncom/arm_dyncom.h
index f16fb070c..9e2dda843 100644
--- a/src/core/arm/dyncom/arm_dyncom.h
+++ b/src/core/arm/dyncom/arm_dyncom.h
@@ -13,79 +13,24 @@
 
 class ARM_DynCom final : virtual public ARM_Interface {
 public:
-
     ARM_DynCom();
     ~ARM_DynCom();
 
-    /**
-     * Set the Program Counter to an address
-     * @param pc Address to set PC to
-     */
     void SetPC(u32 pc) override;
-
-    /*
-     * Get the current Program Counter
-     * @return Returns current PC
-     */
     u32 GetPC() const override;
-
-    /**
-     * Get an ARM register
-     * @param index Register index (0-15)
-     * @return Returns the value in the register
-     */
     u32 GetReg(int index) const override;
-
-    /**
-     * Set an ARM register
-     * @param index Register index (0-15)
-     * @param value Value to set register to
-     */
     void SetReg(int index, u32 value) override;
-
-    /**
-     * Get the current CPSR register
-     * @return Returns the value of the CPSR register
-     */
     u32 GetCPSR() const override;
-
-    /**
-     * Set the current CPSR register
-     * @param cpsr Value to set CPSR to
-     */
     void SetCPSR(u32 cpsr) override;
 
-    /**
-     * Returns the number of clock ticks since the last reset
-     * @return Returns number of clock ticks
-     */
     u64 GetTicks() const override;
-
-    /**
-    * Advance the CPU core by the specified number of ticks (e.g. to simulate CPU execution time)
-    * @param ticks Number of ticks to advance the CPU core
-    */
     void AddTicks(u64 ticks) override;
 
-    /**
-     * Saves the current CPU context
-     * @param ctx Thread context to save
-     */
+    void ResetContext(Core::ThreadContext& context, u32 stack_top, u32 entry_point, u32 arg) override;
     void SaveContext(Core::ThreadContext& ctx) override;
-
-    /**
-     * Loads a CPU context
-     * @param ctx Thread context to load
-     */
     void LoadContext(const Core::ThreadContext& ctx) override;
 
-    /// Prepare core for thread reschedule (if needed to correctly handle state)
     void PrepareReschedule() override;
-
-    /**
-     * Executes the given number of instructions
-     * @param num_instructions Number of instructions to executes
-     */
     void ExecuteInstructions(int num_instructions) override;
 
 private:
diff --git a/src/core/arm/skyeye_common/vfp/vfp.cpp b/src/core/arm/skyeye_common/vfp/vfp.cpp
index 888709124..1cf146c53 100644
--- a/src/core/arm/skyeye_common/vfp/vfp.cpp
+++ b/src/core/arm/skyeye_common/vfp/vfp.cpp
@@ -773,8 +773,8 @@ void vfp_raise_exceptions(ARMul_State* state, u32 exceptions, u32 inst, u32 fpsc
      * Comparison instructions always return at least one of
      * these flags set.
      */
-    if (exceptions & (FPSCR_N|FPSCR_Z|FPSCR_C|FPSCR_V))
-        fpscr &= ~(FPSCR_N|FPSCR_Z|FPSCR_C|FPSCR_V);
+    if (exceptions & (FPSCR_NFLAG|FPSCR_ZFLAG|FPSCR_CFLAG|FPSCR_VFLAG))
+        fpscr &= ~(FPSCR_NFLAG|FPSCR_ZFLAG|FPSCR_CFLAG|FPSCR_VFLAG);
 
     fpscr |= exceptions;
 
diff --git a/src/core/arm/skyeye_common/vfp/vfp_helper.h b/src/core/arm/skyeye_common/vfp/vfp_helper.h
index 581f0358f..b68090b80 100644
--- a/src/core/arm/skyeye_common/vfp/vfp_helper.h
+++ b/src/core/arm/skyeye_common/vfp/vfp_helper.h
@@ -45,444 +45,400 @@
 
 #define do_div(n, base) {n/=base;}
 
-/* From vfpinstr.h */
-
-#define INST_CPRTDO(inst)	(((inst) & 0x0f000000) == 0x0e000000)
-#define INST_CPRT(inst)		((inst) & (1 << 4))
-#define INST_CPRT_L(inst)	((inst) & (1 << 20))
-#define INST_CPRT_Rd(inst)	(((inst) & (15 << 12)) >> 12)
-#define INST_CPRT_OP(inst)	(((inst) >> 21) & 7)
-#define INST_CPNUM(inst)	((inst) & 0xf00)
-#define CPNUM(cp)		((cp) << 8)
-
-#define FOP_MASK	(0x00b00040)
-#define FOP_FMAC	(0x00000000)
-#define FOP_FNMAC	(0x00000040)
-#define FOP_FMSC	(0x00100000)
-#define FOP_FNMSC	(0x00100040)
-#define FOP_FMUL	(0x00200000)
-#define FOP_FNMUL	(0x00200040)
-#define FOP_FADD	(0x00300000)
-#define FOP_FSUB	(0x00300040)
-#define FOP_FDIV	(0x00800000)
-#define FOP_EXT		(0x00b00040)
-
-#define FOP_TO_IDX(inst)	((inst & 0x00b00000) >> 20 | (inst & (1 << 6)) >> 4)
-
-#define FEXT_MASK	(0x000f0080)
-#define FEXT_FCPY	(0x00000000)
-#define FEXT_FABS	(0x00000080)
-#define FEXT_FNEG	(0x00010000)
-#define FEXT_FSQRT	(0x00010080)
-#define FEXT_FCMP	(0x00040000)
-#define FEXT_FCMPE	(0x00040080)
-#define FEXT_FCMPZ	(0x00050000)
-#define FEXT_FCMPEZ	(0x00050080)
-#define FEXT_FCVT	(0x00070080)
-#define FEXT_FUITO	(0x00080000)
-#define FEXT_FSITO	(0x00080080)
-#define FEXT_FTOUI	(0x000c0000)
-#define FEXT_FTOUIZ	(0x000c0080)
-#define FEXT_FTOSI	(0x000d0000)
-#define FEXT_FTOSIZ	(0x000d0080)
-
-#define FEXT_TO_IDX(inst)	((inst & 0x000f0000) >> 15 | (inst & (1 << 7)) >> 7)
-
-#define vfp_get_sd(inst)	((inst & 0x0000f000) >> 11 | (inst & (1 << 22)) >> 22)
-#define vfp_get_dd(inst)	((inst & 0x0000f000) >> 12 | (inst & (1 << 22)) >> 18)
-#define vfp_get_sm(inst)	((inst & 0x0000000f) << 1 | (inst & (1 << 5)) >> 5)
-#define vfp_get_dm(inst)	((inst & 0x0000000f) | (inst & (1 << 5)) >> 1)
-#define vfp_get_sn(inst)	((inst & 0x000f0000) >> 15 | (inst & (1 << 7)) >> 7)
-#define vfp_get_dn(inst)	((inst & 0x000f0000) >> 16 | (inst & (1 << 7)) >> 3)
-
-#define vfp_single(inst)	(((inst) & 0x0000f00) == 0xa00)
-
-#define FPSCR_N	(1 << 31)
-#define FPSCR_Z	(1 << 30)
-#define FPSCR_C (1 << 29)
-#define FPSCR_V	(1 << 28)
+enum : u32 {
+    FOP_MASK  = 0x00b00040,
+    FOP_FMAC  = 0x00000000,
+    FOP_FNMAC = 0x00000040,
+    FOP_FMSC  = 0x00100000,
+    FOP_FNMSC = 0x00100040,
+    FOP_FMUL  = 0x00200000,
+    FOP_FNMUL = 0x00200040,
+    FOP_FADD  = 0x00300000,
+    FOP_FSUB  = 0x00300040,
+    FOP_FDIV  = 0x00800000,
+    FOP_EXT   = 0x00b00040
+};
+
+#define FOP_TO_IDX(inst) (((inst) & 0x00b00000) >> 20 | ((inst) & (1 << 6)) >> 4)
+
+enum : u32 {
+    FEXT_MASK   = 0x000f0080,
+    FEXT_FCPY   = 0x00000000,
+    FEXT_FABS   = 0x00000080,
+    FEXT_FNEG   = 0x00010000,
+    FEXT_FSQRT  = 0x00010080,
+    FEXT_FCMP   = 0x00040000,
+    FEXT_FCMPE  = 0x00040080,
+    FEXT_FCMPZ  = 0x00050000,
+    FEXT_FCMPEZ = 0x00050080,
+    FEXT_FCVT   = 0x00070080,
+    FEXT_FUITO  = 0x00080000,
+    FEXT_FSITO  = 0x00080080,
+    FEXT_FTOUI  = 0x000c0000,
+    FEXT_FTOUIZ = 0x000c0080,
+    FEXT_FTOSI  = 0x000d0000,
+    FEXT_FTOSIZ = 0x000d0080
+};
+
+#define FEXT_TO_IDX(inst) (((inst) & 0x000f0000) >> 15 | ((inst) & (1 << 7)) >> 7)
+
+#define vfp_get_sd(inst)  (((inst) & 0x0000f000) >> 11 | ((inst) & (1 << 22)) >> 22)
+#define vfp_get_dd(inst)  (((inst) & 0x0000f000) >> 12 | ((inst) & (1 << 22)) >> 18)
+#define vfp_get_sm(inst)  (((inst) & 0x0000000f) << 1 | ((inst) & (1 << 5)) >> 5)
+#define vfp_get_dm(inst)  (((inst) & 0x0000000f) | ((inst) & (1 << 5)) >> 1)
+#define vfp_get_sn(inst)  (((inst) & 0x000f0000) >> 15 | ((inst) & (1 << 7)) >> 7)
+#define vfp_get_dn(inst)  (((inst) & 0x000f0000) >> 16 | ((inst) & (1 << 7)) >> 3)
+
+#define vfp_single(inst)  (((inst) & 0x0000f00) == 0xa00)
 
 static inline u32 vfp_shiftright32jamming(u32 val, unsigned int shift)
 {
-	if (shift) {
-		if (shift < 32)
-			val = val >> shift | ((val << (32 - shift)) != 0);
-		else
-			val = val != 0;
-	}
-	return val;
+    if (shift) {
+        if (shift < 32)
+            val = val >> shift | ((val << (32 - shift)) != 0);
+        else
+            val = val != 0;
+    }
+    return val;
 }
 
 static inline u64 vfp_shiftright64jamming(u64 val, unsigned int shift)
 {
-	if (shift) {
-		if (shift < 64)
-			val = val >> shift | ((val << (64 - shift)) != 0);
-		else
-			val = val != 0;
-	}
-	return val;
+    if (shift) {
+        if (shift < 64)
+            val = val >> shift | ((val << (64 - shift)) != 0);
+        else
+            val = val != 0;
+    }
+    return val;
 }
 
 static inline u32 vfp_hi64to32jamming(u64 val)
 {
-	u32 v;
-	u32 highval = val >> 32;
-	u32 lowval = val & 0xffffffff;
+    u32 v;
+    u32 highval = val >> 32;
+    u32 lowval = val & 0xffffffff;
 
-	if (lowval >= 1)
-		v = highval | 1;
-	else
-		v = highval;
+    if (lowval >= 1)
+        v = highval | 1;
+    else
+        v = highval;
 
-	return v;
+    return v;
 }
 
-static inline void add128(u64 *resh, u64 *resl, u64 nh, u64 nl, u64 mh, u64 ml)
+static inline void add128(u64* resh, u64* resl, u64 nh, u64 nl, u64 mh, u64 ml)
 {
-	*resl = nl + ml;
-	*resh = nh + mh;
-	if (*resl < nl)
-		*resh += 1;
+    *resl = nl + ml;
+    *resh = nh + mh;
+    if (*resl < nl)
+        *resh += 1;
 }
 
-static inline void sub128(u64 *resh, u64 *resl, u64 nh, u64 nl, u64 mh, u64 ml)
+static inline void sub128(u64* resh, u64* resl, u64 nh, u64 nl, u64 mh, u64 ml)
 {
-	*resl = nl - ml;
-	*resh = nh - mh;
-	if (*resl > nl)
-		*resh -= 1;
+    *resl = nl - ml;
+    *resh = nh - mh;
+    if (*resl > nl)
+        *resh -= 1;
 }
 
-static inline void mul64to128(u64 *resh, u64 *resl, u64 n, u64 m)
+static inline void mul64to128(u64* resh, u64* resl, u64 n, u64 m)
 {
-	u32 nh, nl, mh, ml;
-	u64 rh, rma, rmb, rl;
+    u32 nh, nl, mh, ml;
+    u64 rh, rma, rmb, rl;
 
-	nl = n;
-	ml = m;
-	rl = (u64)nl * ml;
+    nl = n;
+    ml = m;
+    rl = (u64)nl * ml;
 
-	nh = n >> 32;
-	rma = (u64)nh * ml;
+    nh = n >> 32;
+    rma = (u64)nh * ml;
 
-	mh = m >> 32;
-	rmb = (u64)nl * mh;
-	rma += rmb;
+    mh = m >> 32;
+    rmb = (u64)nl * mh;
+    rma += rmb;
 
-	rh = (u64)nh * mh;
-	rh += ((u64)(rma < rmb) << 32) + (rma >> 32);
+    rh = (u64)nh * mh;
+    rh += ((u64)(rma < rmb) << 32) + (rma >> 32);
 
-	rma <<= 32;
-	rl += rma;
-	rh += (rl < rma);
+    rma <<= 32;
+    rl += rma;
+    rh += (rl < rma);
 
-	*resl = rl;
-	*resh = rh;
+    *resl = rl;
+    *resh = rh;
 }
 
-static inline void shift64left(u64 *resh, u64 *resl, u64 n)
+static inline void shift64left(u64* resh, u64* resl, u64 n)
 {
-	*resh = n >> 63;
-	*resl = n << 1;
+    *resh = n >> 63;
+    *resl = n << 1;
 }
 
 static inline u64 vfp_hi64multiply64(u64 n, u64 m)
 {
-	u64 rh, rl;
-	mul64to128(&rh, &rl, n, m);
-	return rh | (rl != 0);
+    u64 rh, rl;
+    mul64to128(&rh, &rl, n, m);
+    return rh | (rl != 0);
 }
 
 static inline u64 vfp_estimate_div128to64(u64 nh, u64 nl, u64 m)
 {
-	u64 mh, ml, remh, reml, termh, terml, z;
-
-	if (nh >= m)
-		return ~0ULL;
-	mh = m >> 32;
-	if (mh << 32 <= nh) {
-		z = 0xffffffff00000000ULL;
-	} else {
-		z = nh;
-		do_div(z, mh);
-		z <<= 32;
-	}
-	mul64to128(&termh, &terml, m, z);
-	sub128(&remh, &reml, nh, nl, termh, terml);
-	ml = m << 32;
-	while ((s64)remh < 0) {
-		z -= 0x100000000ULL;
-		add128(&remh, &reml, remh, reml, mh, ml);
-	}
-	remh = (remh << 32) | (reml >> 32);
-	if (mh << 32 <= remh) {
-		z |= 0xffffffff;
-	} else {
-		do_div(remh, mh);
-		z |= remh;
-	}
-	return z;
+    u64 mh, ml, remh, reml, termh, terml, z;
+
+    if (nh >= m)
+        return ~0ULL;
+    mh = m >> 32;
+    if (mh << 32 <= nh) {
+        z = 0xffffffff00000000ULL;
+    } else {
+        z = nh;
+        do_div(z, mh);
+        z <<= 32;
+    }
+    mul64to128(&termh, &terml, m, z);
+    sub128(&remh, &reml, nh, nl, termh, terml);
+    ml = m << 32;
+    while ((s64)remh < 0) {
+        z -= 0x100000000ULL;
+        add128(&remh, &reml, remh, reml, mh, ml);
+    }
+    remh = (remh << 32) | (reml >> 32);
+    if (mh << 32 <= remh) {
+        z |= 0xffffffff;
+    } else {
+        do_div(remh, mh);
+        z |= remh;
+    }
+    return z;
 }
 
-/*
- * Operations on unpacked elements
- */
-#define vfp_sign_negate(sign)	(sign ^ 0x8000)
+// Operations on unpacked elements
+#define vfp_sign_negate(sign) ((sign) ^ 0x8000)
 
-/*
- * Single-precision
- */
+// Single-precision
 struct vfp_single {
-	s16	exponent;
-	u16	sign;
-	u32	significand;
+    s16	exponent;
+    u16	sign;
+    u32	significand;
 };
 
-/*
- * VFP_SINGLE_MANTISSA_BITS - number of bits in the mantissa
- * VFP_SINGLE_EXPONENT_BITS - number of bits in the exponent
- * VFP_SINGLE_LOW_BITS - number of low bits in the unpacked significand
- *  which are not propagated to the float upon packing.
- */
-#define VFP_SINGLE_MANTISSA_BITS	(23)
-#define VFP_SINGLE_EXPONENT_BITS	(8)
-#define VFP_SINGLE_LOW_BITS		(32 - VFP_SINGLE_MANTISSA_BITS - 2)
-#define VFP_SINGLE_LOW_BITS_MASK	((1 << VFP_SINGLE_LOW_BITS) - 1)
+// VFP_SINGLE_MANTISSA_BITS - number of bits in the mantissa
+// VFP_SINGLE_EXPONENT_BITS - number of bits in the exponent
+// VFP_SINGLE_LOW_BITS - number of low bits in the unpacked significand
+// which are not propagated to the float upon packing.
+#define VFP_SINGLE_MANTISSA_BITS (23)
+#define VFP_SINGLE_EXPONENT_BITS (8)
+#define VFP_SINGLE_LOW_BITS      (32 - VFP_SINGLE_MANTISSA_BITS - 2)
+#define VFP_SINGLE_LOW_BITS_MASK ((1 << VFP_SINGLE_LOW_BITS) - 1)
 
-/*
- * The bit in an unpacked float which indicates that it is a quiet NaN
- */
+// The bit in an unpacked float which indicates that it is a quiet NaN
 #define VFP_SINGLE_SIGNIFICAND_QNAN	(1 << (VFP_SINGLE_MANTISSA_BITS - 1 + VFP_SINGLE_LOW_BITS))
 
-/*
- * Operations on packed single-precision numbers
- */
-#define vfp_single_packed_sign(v)	((v) & 0x80000000)
-#define vfp_single_packed_negate(v)	((v) ^ 0x80000000)
-#define vfp_single_packed_abs(v)	((v) & ~0x80000000)
-#define vfp_single_packed_exponent(v)	(((v) >> VFP_SINGLE_MANTISSA_BITS) & ((1 << VFP_SINGLE_EXPONENT_BITS) - 1))
-#define vfp_single_packed_mantissa(v)	((v) & ((1 << VFP_SINGLE_MANTISSA_BITS) - 1))
-
-/*
- * Unpack a single-precision float.  Note that this returns the magnitude
- * of the single-precision float mantissa with the 1. if necessary,
- * aligned to bit 30.
- */
-static inline void vfp_single_unpack(struct vfp_single *s, s32 val)
+// Operations on packed single-precision numbers
+#define vfp_single_packed_sign(v)     ((v) & 0x80000000)
+#define vfp_single_packed_negate(v)   ((v) ^ 0x80000000)
+#define vfp_single_packed_abs(v)      ((v) & ~0x80000000)
+#define vfp_single_packed_exponent(v) (((v) >> VFP_SINGLE_MANTISSA_BITS) & ((1 << VFP_SINGLE_EXPONENT_BITS) - 1))
+#define vfp_single_packed_mantissa(v) ((v) & ((1 << VFP_SINGLE_MANTISSA_BITS) - 1))
+
+// Unpack a single-precision float.  Note that this returns the magnitude
+// of the single-precision float mantissa, with the implicit leading 1 set
+// when the exponent is neither 0 nor 255, aligned to bit 30.
+static inline void vfp_single_unpack(vfp_single* s, s32 val)
 {
-	u32 significand;
+    u32 significand;
 
-	s->sign = vfp_single_packed_sign(val) >> 16,
-	s->exponent = vfp_single_packed_exponent(val);
+    s->sign = vfp_single_packed_sign(val) >> 16;
+    s->exponent = vfp_single_packed_exponent(val);
 
-	significand = (u32) val;
-	significand = (significand << (32 - VFP_SINGLE_MANTISSA_BITS)) >> 2;
-	if (s->exponent && s->exponent != 255)
-		significand |= 0x40000000;
-	s->significand = significand;
+    significand = (u32) val;
+    significand = (significand << (32 - VFP_SINGLE_MANTISSA_BITS)) >> 2;
+    if (s->exponent && s->exponent != 255)
+        significand |= 0x40000000;
+    s->significand = significand;
 }
 
-/*
- * Re-pack a single-precision float.  This assumes that the float is
- * already normalised such that the MSB is bit 30, _not_ bit 31.
- */
-static inline s32 vfp_single_pack(struct vfp_single *s)
+// Re-pack a single-precision float.  This assumes that the float is
+// already normalised such that the MSB is bit 30, _not_ bit 31.
+static inline s32 vfp_single_pack(vfp_single* s)
 {
-	u32 val;
-	val = (s->sign << 16) +
-	      (s->exponent << VFP_SINGLE_MANTISSA_BITS) +
-	      (s->significand >> VFP_SINGLE_LOW_BITS);
-	return (s32)val;
+    u32 val = (s->sign << 16) +
+              (s->exponent << VFP_SINGLE_MANTISSA_BITS) +
+              (s->significand >> VFP_SINGLE_LOW_BITS);
+    return (s32)val;
 }
 
-#define VFP_NUMBER		(1<<0)
-#define VFP_ZERO		(1<<1)
-#define VFP_DENORMAL		(1<<2)
-#define VFP_INFINITY		(1<<3)
-#define VFP_NAN			(1<<4)
-#define VFP_NAN_SIGNAL		(1<<5)
+enum : u32 {
+    VFP_NUMBER     = (1 << 0),
+    VFP_ZERO       = (1 << 1),
+    VFP_DENORMAL   = (1 << 2),
+    VFP_INFINITY   = (1 << 3),
+    VFP_NAN        = (1 << 4),
+    VFP_NAN_SIGNAL = (1 << 5),
 
-#define VFP_QNAN		(VFP_NAN)
-#define VFP_SNAN		(VFP_NAN|VFP_NAN_SIGNAL)
+    VFP_QNAN       = (VFP_NAN),
+    VFP_SNAN       = (VFP_NAN|VFP_NAN_SIGNAL)
+};
 
-static inline int vfp_single_type(struct vfp_single *s)
+static inline int vfp_single_type(vfp_single* s)
 {
-	int type = VFP_NUMBER;
-	if (s->exponent == 255) {
-		if (s->significand == 0)
-			type = VFP_INFINITY;
-		else if (s->significand & VFP_SINGLE_SIGNIFICAND_QNAN)
-			type = VFP_QNAN;
-		else
-			type = VFP_SNAN;
-	} else if (s->exponent == 0) {
-		if (s->significand == 0)
-			type |= VFP_ZERO;
-		else
-			type |= VFP_DENORMAL;
-	}
-	return type;
+    int type = VFP_NUMBER;
+    if (s->exponent == 255) {
+        if (s->significand == 0)
+            type = VFP_INFINITY;
+        else if (s->significand & VFP_SINGLE_SIGNIFICAND_QNAN)
+            type = VFP_QNAN;
+        else
+            type = VFP_SNAN;
+    } else if (s->exponent == 0) {
+        if (s->significand == 0)
+            type |= VFP_ZERO;
+        else
+            type |= VFP_DENORMAL;
+    }
+    return type;
 }
 
 
-u32 vfp_single_normaliseround(ARMul_State* state, int sd, struct vfp_single *vs, u32 fpscr, u32 exceptions, const char *func);
+u32 vfp_single_normaliseround(ARMul_State* state, int sd, vfp_single* vs, u32 fpscr, u32 exceptions, const char* func);
 
-/*
- * Double-precision
- */
+// Double-precision
 struct vfp_double {
-	s16	exponent;
-	u16	sign;
-	u64	significand;
+    s16	exponent;
+    u16	sign;
+    u64	significand;
 };
 
-/*
- * VFP_REG_ZERO is a special register number for vfp_get_double
- * which returns (double)0.0.  This is useful for the compare with
- * zero instructions.
- */
+// VFP_REG_ZERO is a special register number for vfp_get_double
+// which returns (double)0.0.  This is useful for the compare with
+// zero instructions.
 #ifdef CONFIG_VFPv3
-#define VFP_REG_ZERO	32
+#define VFP_REG_ZERO 32
 #else
-#define VFP_REG_ZERO	16
+#define VFP_REG_ZERO 16
 #endif
 
-#define VFP_DOUBLE_MANTISSA_BITS	(52)
-#define VFP_DOUBLE_EXPONENT_BITS	(11)
-#define VFP_DOUBLE_LOW_BITS		(64 - VFP_DOUBLE_MANTISSA_BITS - 2)
-#define VFP_DOUBLE_LOW_BITS_MASK	((1 << VFP_DOUBLE_LOW_BITS) - 1)
-
-/*
- * The bit in an unpacked double which indicates that it is a quiet NaN
- */
-#define VFP_DOUBLE_SIGNIFICAND_QNAN	(1ULL << (VFP_DOUBLE_MANTISSA_BITS - 1 + VFP_DOUBLE_LOW_BITS))
-
-/*
- * Operations on packed single-precision numbers
- */
-#define vfp_double_packed_sign(v)	((v) & (1ULL << 63))
-#define vfp_double_packed_negate(v)	((v) ^ (1ULL << 63))
-#define vfp_double_packed_abs(v)	((v) & ~(1ULL << 63))
-#define vfp_double_packed_exponent(v)	(((v) >> VFP_DOUBLE_MANTISSA_BITS) & ((1 << VFP_DOUBLE_EXPONENT_BITS) - 1))
-#define vfp_double_packed_mantissa(v)	((v) & ((1ULL << VFP_DOUBLE_MANTISSA_BITS) - 1))
-
-/*
- * Unpack a double-precision float.  Note that this returns the magnitude
- * of the double-precision float mantissa with the 1. if necessary,
- * aligned to bit 62.
- */
-static inline void vfp_double_unpack(struct vfp_double *s, s64 val)
+#define VFP_DOUBLE_MANTISSA_BITS (52)
+#define VFP_DOUBLE_EXPONENT_BITS (11)
+#define VFP_DOUBLE_LOW_BITS      (64 - VFP_DOUBLE_MANTISSA_BITS - 2)
+#define VFP_DOUBLE_LOW_BITS_MASK ((1 << VFP_DOUBLE_LOW_BITS) - 1)
+
+// The bit in an unpacked double which indicates that it is a quiet NaN
+#define VFP_DOUBLE_SIGNIFICAND_QNAN (1ULL << (VFP_DOUBLE_MANTISSA_BITS - 1 + VFP_DOUBLE_LOW_BITS))
+
+// Operations on packed double-precision numbers
+#define vfp_double_packed_sign(v)     ((v) & (1ULL << 63))
+#define vfp_double_packed_negate(v)   ((v) ^ (1ULL << 63))
+#define vfp_double_packed_abs(v)      ((v) & ~(1ULL << 63))
+#define vfp_double_packed_exponent(v) (((v) >> VFP_DOUBLE_MANTISSA_BITS) & ((1 << VFP_DOUBLE_EXPONENT_BITS) - 1))
+#define vfp_double_packed_mantissa(v) ((v) & ((1ULL << VFP_DOUBLE_MANTISSA_BITS) - 1))
+
+// Unpack a double-precision float.  Note that this returns the magnitude
+// of the double-precision float mantissa, with the implicit leading 1 set
+// when the exponent is neither 0 nor 2047, aligned to bit 62.
+static inline void vfp_double_unpack(vfp_double* s, s64 val)
 {
-	u64 significand;
+    u64 significand;
 
-	s->sign = vfp_double_packed_sign(val) >> 48;
-	s->exponent = vfp_double_packed_exponent(val);
+    s->sign = vfp_double_packed_sign(val) >> 48;
+    s->exponent = vfp_double_packed_exponent(val);
 
-	significand = (u64) val;
-	significand = (significand << (64 - VFP_DOUBLE_MANTISSA_BITS)) >> 2;
-	if (s->exponent && s->exponent != 2047)
-		significand |= (1ULL << 62);
-	s->significand = significand;
+    significand = (u64) val;
+    significand = (significand << (64 - VFP_DOUBLE_MANTISSA_BITS)) >> 2;
+    if (s->exponent && s->exponent != 2047)
+        significand |= (1ULL << 62);
+    s->significand = significand;
 }
 
-/*
- * Re-pack a double-precision float.  This assumes that the float is
- * already normalised such that the MSB is bit 30, _not_ bit 31.
- */
-static inline s64 vfp_double_pack(struct vfp_double *s)
+// Re-pack a double-precision float.  This assumes that the float is
+// already normalised such that the MSB is bit 62, _not_ bit 63.
+static inline s64 vfp_double_pack(vfp_double* s)
 {
-	u64 val;
-	val = ((u64)s->sign << 48) +
-	      ((u64)s->exponent << VFP_DOUBLE_MANTISSA_BITS) +
-	      (s->significand >> VFP_DOUBLE_LOW_BITS);
-	return (s64)val;
+    u64 val = ((u64)s->sign << 48) +
+              ((u64)s->exponent << VFP_DOUBLE_MANTISSA_BITS) +
+              (s->significand >> VFP_DOUBLE_LOW_BITS);
+    return (s64)val;
 }
 
-static inline int vfp_double_type(struct vfp_double *s)
+static inline int vfp_double_type(vfp_double* s)
 {
-	int type = VFP_NUMBER;
-	if (s->exponent == 2047) {
-		if (s->significand == 0)
-			type = VFP_INFINITY;
-		else if (s->significand & VFP_DOUBLE_SIGNIFICAND_QNAN)
-			type = VFP_QNAN;
-		else
-			type = VFP_SNAN;
-	} else if (s->exponent == 0) {
-		if (s->significand == 0)
-			type |= VFP_ZERO;
-		else
-			type |= VFP_DENORMAL;
-	}
-	return type;
+    int type = VFP_NUMBER;
+    if (s->exponent == 2047) {
+        if (s->significand == 0)
+            type = VFP_INFINITY;
+        else if (s->significand & VFP_DOUBLE_SIGNIFICAND_QNAN)
+            type = VFP_QNAN;
+        else
+            type = VFP_SNAN;
+    } else if (s->exponent == 0) {
+        if (s->significand == 0)
+            type |= VFP_ZERO;
+        else
+            type |= VFP_DENORMAL;
+    }
+    return type;
 }
 
-u32 vfp_double_normaliseround(ARMul_State* state, int dd, struct vfp_double *vd, u32 fpscr, u32 exceptions, const char *func);
-
 u32 vfp_estimate_sqrt_significand(u32 exponent, u32 significand);
 
-/*
- * A special flag to tell the normalisation code not to normalise.
- */
-#define VFP_NAN_FLAG	0x100
-
-/*
- * A bit pattern used to indicate the initial (unset) value of the
- * exception mask, in case nothing handles an instruction.  This
- * doesn't include the NAN flag, which get masked out before
- * we check for an error.
- */
-#define VFP_EXCEPTION_ERROR	((u32)-1 & ~VFP_NAN_FLAG)
-
-/*
- * A flag to tell vfp instruction type.
- *  OP_SCALAR - this operation always operates in scalar mode
- *  OP_SD - the instruction exceptionally writes to a single precision result.
- *  OP_DD - the instruction exceptionally writes to a double precision result.
- *  OP_SM - the instruction exceptionally reads from a single precision operand.
- */
-#define OP_SCALAR	(1 << 0)
-#define OP_SD		(1 << 1)
-#define OP_DD		(1 << 1)
-#define OP_SM		(1 << 2)
+// A special flag to tell the normalisation code not to normalise.
+#define VFP_NAN_FLAG 0x100
+
+// A bit pattern used to indicate the initial (unset) value of the
+// exception mask, in case nothing handles an instruction.  This
+// doesn't include the NAN flag, which gets masked out before
+// we check for an error.
+#define VFP_EXCEPTION_ERROR ((u32)-1 & ~VFP_NAN_FLAG)
+
+// A flag to tell vfp instruction type.
+//  OP_SCALAR - This operation always operates in scalar mode
+//  OP_SD     - The instruction exceptionally writes to a single precision result.
+//  OP_DD     - The instruction exceptionally writes to a double precision result.
+//  OP_SM     - The instruction exceptionally reads from a single precision operand.
+enum : u32 {
+    OP_SCALAR = (1 << 0),
+    OP_SD     = (1 << 1),
+    OP_DD     = (1 << 1),
+    OP_SM     = (1 << 2)
+};
 
 struct op {
-	u32 (* const fn)(ARMul_State* state, int dd, int dn, int dm, u32 fpscr);
-	u32 flags;
+    u32 (* const fn)(ARMul_State* state, int dd, int dn, int dm, u32 fpscr);
+    u32 flags;
 };
 
 static inline u32 fls(ARMword x)
 {
-	int r = 32;
-
-	if (!x)
-		return 0;
-	if (!(x & 0xffff0000u)) {
-		x <<= 16;
-		r -= 16;
-	}
-	if (!(x & 0xff000000u)) {
-		x <<= 8;
-		r -= 8;
-	}
-	if (!(x & 0xf0000000u)) {
-		x <<= 4;
-		r -= 4;
-	}
-	if (!(x & 0xc0000000u)) {
-		x <<= 2;
-		r -= 2;
-	}
-	if (!(x & 0x80000000u)) {
-		x <<= 1;
-		r -= 1;
-	}
-	return r;
+    int r = 32;
+
+    if (!x)
+        return 0;
+    if (!(x & 0xffff0000u)) {
+        x <<= 16;
+        r -= 16;
+    }
+    if (!(x & 0xff000000u)) {
+        x <<= 8;
+        r -= 8;
+    }
+    if (!(x & 0xf0000000u)) {
+        x <<= 4;
+        r -= 4;
+    }
+    if (!(x & 0xc0000000u)) {
+        x <<= 2;
+        r -= 2;
+    }
+    if (!(x & 0x80000000u)) {
+        x <<= 1;
+        r -= 1;
+    }
+    return r;
 
 }
 
-u32 vfp_double_normaliseroundintern(ARMul_State* state, struct vfp_double *vd, u32 fpscr, u32 exceptions, const char *func);
-u32 vfp_double_multiply(struct vfp_double *vdd, struct vfp_double *vdn, struct vfp_double *vdm, u32 fpscr);
-u32 vfp_double_add(struct vfp_double *vdd, struct vfp_double *vdn, struct vfp_double *vdm, u32 fpscr);
-u32 vfp_double_fcvtsinterncutting(ARMul_State* state, int sd, struct vfp_double* dm, u32 fpscr);
+u32 vfp_double_multiply(vfp_double* vdd, vfp_double* vdn, vfp_double* vdm, u32 fpscr);
+u32 vfp_double_add(vfp_double* vdd, vfp_double* vdn, vfp_double* vdm, u32 fpscr);
+u32 vfp_double_normaliseround(ARMul_State* state, int dd, vfp_double* vd, u32 fpscr, u32 exceptions, const char* func);
diff --git a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
index d35ca510a..2c15db12b 100644
--- a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
+++ b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
@@ -83,134 +83,6 @@ static void vfp_double_normalise_denormal(struct vfp_double *vd)
     vfp_double_dump("normalise_denormal: out", vd);
 }
 
-u32 vfp_double_normaliseroundintern(ARMul_State* state, struct vfp_double *vd, u32 fpscr, u32 exceptions, const char *func)
-{
-    u64 significand, incr;
-    int exponent, shift, underflow;
-    u32 rmode;
-
-    vfp_double_dump("pack: in", vd);
-
-    /*
-    * Infinities and NaNs are a special case.
-    */
-    if (vd->exponent == 2047 && (vd->significand == 0 || exceptions))
-        goto pack;
-
-    /*
-    * Special-case zero.
-    */
-    if (vd->significand == 0) {
-        vd->exponent = 0;
-        goto pack;
-    }
-
-    exponent = vd->exponent;
-    significand = vd->significand;
-
-    shift = 32 - fls((ARMword)(significand >> 32));
-    if (shift == 32)
-        shift = 64 - fls((ARMword)significand);
-    if (shift) {
-        exponent -= shift;
-        significand <<= shift;
-    }
-
-#if 1
-    vd->exponent = exponent;
-    vd->significand = significand;
-    vfp_double_dump("pack: normalised", vd);
-#endif
-
-    /*
-    * Tiny number?
-    */
-    underflow = exponent < 0;
-    if (underflow) {
-        significand = vfp_shiftright64jamming(significand, -exponent);
-        exponent = 0;
-#if 1
-        vd->exponent = exponent;
-        vd->significand = significand;
-        vfp_double_dump("pack: tiny number", vd);
-#endif
-        if (!(significand & ((1ULL << (VFP_DOUBLE_LOW_BITS + 1)) - 1)))
-            underflow = 0;
-    }
-
-    /*
-    * Select rounding increment.
-    */
-    incr = 0;
-    rmode = fpscr & FPSCR_RMODE_MASK;
-
-    if (rmode == FPSCR_ROUND_NEAREST) {
-        incr = 1ULL << VFP_DOUBLE_LOW_BITS;
-        if ((significand & (1ULL << (VFP_DOUBLE_LOW_BITS + 1))) == 0)
-            incr -= 1;
-    }
-    else if (rmode == FPSCR_ROUND_TOZERO) {
-        incr = 0;
-    }
-    else if ((rmode == FPSCR_ROUND_PLUSINF) ^ (vd->sign != 0))
-        incr = (1ULL << (VFP_DOUBLE_LOW_BITS + 1)) - 1;
-
-    LOG_TRACE(Core_ARM11, "VFP: rounding increment = 0x%08llx\n", incr);
-
-    /*
-    * Is our rounding going to overflow?
-    */
-    if ((significand + incr) < significand) {
-        exponent += 1;
-        significand = (significand >> 1) | (significand & 1);
-        incr >>= 1;
-#if 1
-        vd->exponent = exponent;
-        vd->significand = significand;
-        vfp_double_dump("pack: overflow", vd);
-#endif
-    }
-
-    /*
-    * If any of the low bits (which will be shifted out of the
-    * number) are non-zero, the result is inexact.
-    */
-    if (significand & ((1 << (VFP_DOUBLE_LOW_BITS + 1)) - 1))
-        exceptions |= FPSCR_IXC;
-
-    /*
-    * Do our rounding.
-    */
-    significand += incr;
-
-    /*
-    * Infinity?
-    */
-    if (exponent >= 2046) {
-        exceptions |= FPSCR_OFC | FPSCR_IXC;
-        if (incr == 0) {
-            vd->exponent = 2045;
-            vd->significand = 0x7fffffffffffffffULL;
-        }
-        else {
-            vd->exponent = 2047;		/* infinity */
-            vd->significand = 0;
-        }
-    }
-    else {
-        if (significand >> (VFP_DOUBLE_LOW_BITS + 1) == 0)
-            exponent = 0;
-        if (exponent || significand > 0x8000000000000000ULL)
-            underflow = 0;
-        if (underflow)
-            exceptions |= FPSCR_UFC;
-        vd->exponent = exponent;
-        vd->significand = significand >> 1;
-    }
- pack:
-    return 0;
-}
-
 u32 vfp_double_normaliseround(ARMul_State* state, int dd, struct vfp_double *vd, u32 fpscr, u32 exceptions, const char *func)
 {
     u64 significand, incr;
@@ -511,7 +383,7 @@ static u32 vfp_compare(ARMul_State* state, int dd, int signal_on_qnan, int dm, u
     LOG_TRACE(Core_ARM11, "In %s, state=0x%x, fpscr=0x%x\n", __FUNCTION__, state, fpscr);
     m = vfp_get_double(state, dm);
     if (vfp_double_packed_exponent(m) == 2047 && vfp_double_packed_mantissa(m)) {
-        ret |= FPSCR_C | FPSCR_V;
+        ret |= FPSCR_CFLAG | FPSCR_VFLAG;
         if (signal_on_qnan || !(vfp_double_packed_mantissa(m) & (1ULL << (VFP_DOUBLE_MANTISSA_BITS - 1))))
             /*
              * Signalling NaN, or signalling on quiet NaN
@@ -521,7 +393,7 @@ static u32 vfp_compare(ARMul_State* state, int dd, int signal_on_qnan, int dm, u
 
     d = vfp_get_double(state, dd);
     if (vfp_double_packed_exponent(d) == 2047 && vfp_double_packed_mantissa(d)) {
-        ret |= FPSCR_C | FPSCR_V;
+        ret |= FPSCR_CFLAG | FPSCR_VFLAG;
         if (signal_on_qnan || !(vfp_double_packed_mantissa(d) & (1ULL << (VFP_DOUBLE_MANTISSA_BITS - 1))))
             /*
              * Signalling NaN, or signalling on quiet NaN
@@ -535,7 +407,7 @@ static u32 vfp_compare(ARMul_State* state, int dd, int signal_on_qnan, int dm, u
             /*
              * equal
              */
-            ret |= FPSCR_Z | FPSCR_C;
+            ret |= FPSCR_ZFLAG | FPSCR_CFLAG;
             //printf("In %s,1 ret=0x%x\n", __FUNCTION__, ret);
         } else if (vfp_double_packed_sign(d ^ m)) {
             /*
@@ -545,22 +417,22 @@ static u32 vfp_compare(ARMul_State* state, int dd, int signal_on_qnan, int dm, u
                 /*
                  * d is negative, so d < m
                  */
-                ret |= FPSCR_N;
+                ret |= FPSCR_NFLAG;
             else
                 /*
                  * d is positive, so d > m
                  */
-                ret |= FPSCR_C;
+                ret |= FPSCR_CFLAG;
         } else if ((vfp_double_packed_sign(d) != 0) ^ (d < m)) {
             /*
              * d < m
              */
-            ret |= FPSCR_N;
+            ret |= FPSCR_NFLAG;
         } else if ((vfp_double_packed_sign(d) != 0) ^ (d > m)) {
             /*
              * d > m
              */
-            ret |= FPSCR_C;
+            ret |= FPSCR_CFLAG;
         }
     }
     LOG_TRACE(Core_ARM11, "In %s, state=0x%x, ret=0x%x\n", __FUNCTION__, state, ret);
@@ -592,49 +464,6 @@ static u32 vfp_double_fcmpez(ARMul_State* state, int dd, int unused, int dm, u32
     return vfp_compare(state, dd, 1, VFP_REG_ZERO, fpscr);
 }
 
-u32 vfp_double_fcvtsinterncutting(ARMul_State* state, int sd, struct vfp_double* dm, u32 fpscr) //ichfly for internal use only
-{
-    struct vfp_single vsd;
-    int tm;
-    u32 exceptions = 0;
-
-    LOG_TRACE(Core_ARM11, "In %s\n", __FUNCTION__);
-
-    tm = vfp_double_type(dm);
-
-    /*
-    * If we have a signalling NaN, signal invalid operation.
-    */
-    if (tm == VFP_SNAN)
-        exceptions = FPSCR_IOC;
-
-    if (tm & VFP_DENORMAL)
-        vfp_double_normalise_denormal(dm);
-
-    vsd.sign = dm->sign;
-    vsd.significand = vfp_hi64to32jamming(dm->significand);
-
-    /*
-    * If we have an infinity or a NaN, the exponent must be 255
-    */
-    if (tm & (VFP_INFINITY | VFP_NAN)) {
-        vsd.exponent = 255;
-        if (tm == VFP_QNAN)
-            vsd.significand |= VFP_SINGLE_SIGNIFICAND_QNAN;
-        goto pack_nan;
-    }
-    else if (tm & VFP_ZERO)
-        vsd.exponent = 0;
-    else
-        vsd.exponent = dm->exponent - (1023 - 127);
-
-    return vfp_single_normaliseround(state, sd, &vsd, fpscr, exceptions, "fcvts");
-
-pack_nan:
-    vfp_put_float(state, vfp_single_pack(&vsd), sd);
-    return exceptions;
-}
-
 static u32 vfp_double_fcvts(ARMul_State* state, int sd, int unused, int dm, u32 fpscr)
 {
     struct vfp_double vdm;
@@ -723,7 +552,7 @@ static u32 vfp_double_ftoui(ARMul_State* state, int sd, int unused, int dm, u32
         exceptions |= FPSCR_IDC;
 
     if (tm & VFP_NAN)
-        vdm.sign = 0;
+        vdm.sign = 1;
 
     if (vdm.exponent >= 1023 + 32) {
         d = vdm.sign ? 0 : 0xffffffff;
diff --git a/src/core/arm/skyeye_common/vfp/vfpsingle.cpp b/src/core/arm/skyeye_common/vfp/vfpsingle.cpp
index b7872bdc4..678b63f51 100644
--- a/src/core/arm/skyeye_common/vfp/vfpsingle.cpp
+++ b/src/core/arm/skyeye_common/vfp/vfpsingle.cpp
@@ -419,7 +419,7 @@ static u32 vfp_compare(ARMul_State* state, int sd, int signal_on_qnan, s32 m, u3
 
     d = vfp_get_float(state, sd);
     if (vfp_single_packed_exponent(m) == 255 && vfp_single_packed_mantissa(m)) {
-        ret |= FPSCR_C | FPSCR_V;
+        ret |= FPSCR_CFLAG | FPSCR_VFLAG;
         if (signal_on_qnan || !(vfp_single_packed_mantissa(m) & (1 << (VFP_SINGLE_MANTISSA_BITS - 1))))
             /*
              * Signalling NaN, or signalling on quiet NaN
@@ -428,7 +428,7 @@ static u32 vfp_compare(ARMul_State* state, int sd, int signal_on_qnan, s32 m, u3
     }
 
     if (vfp_single_packed_exponent(d) == 255 && vfp_single_packed_mantissa(d)) {
-        ret |= FPSCR_C | FPSCR_V;
+        ret |= FPSCR_CFLAG | FPSCR_VFLAG;
         if (signal_on_qnan || !(vfp_single_packed_mantissa(d) & (1 << (VFP_SINGLE_MANTISSA_BITS - 1))))
             /*
              * Signalling NaN, or signalling on quiet NaN
@@ -441,7 +441,7 @@ static u32 vfp_compare(ARMul_State* state, int sd, int signal_on_qnan, s32 m, u3
             /*
              * equal
              */
-            ret |= FPSCR_Z | FPSCR_C;
+            ret |= FPSCR_ZFLAG | FPSCR_CFLAG;
         } else if (vfp_single_packed_sign(d ^ m)) {
             /*
              * different signs
@@ -450,22 +450,22 @@ static u32 vfp_compare(ARMul_State* state, int sd, int signal_on_qnan, s32 m, u3
                 /*
                  * d is negative, so d < m
                  */
-                ret |= FPSCR_N;
+                ret |= FPSCR_NFLAG;
             else
                 /*
                  * d is positive, so d > m
                  */
-                ret |= FPSCR_C;
+                ret |= FPSCR_CFLAG;
         } else if ((vfp_single_packed_sign(d) != 0) ^ (d < m)) {
             /*
              * d < m
              */
-            ret |= FPSCR_N;
+            ret |= FPSCR_NFLAG;
         } else if ((vfp_single_packed_sign(d) != 0) ^ (d > m)) {
             /*
              * d > m
              */
-            ret |= FPSCR_C;
+            ret |= FPSCR_CFLAG;
         }
     }
     return ret;
@@ -491,46 +491,6 @@ static u32 vfp_single_fcmpez(ARMul_State* state, int sd, int unused, s32 m, u32
     return vfp_compare(state, sd, 1, 0, fpscr);
 }
 
-static s64 vfp_single_to_doubleintern(ARMul_State* state, s32 m, u32 fpscr) //ichfly for internal use only
-{
-    struct vfp_single vsm;
-    struct vfp_double vdd;
-    int tm;
-    u32 exceptions = 0;
-
-    vfp_single_unpack(&vsm, m);
-
-    tm = vfp_single_type(&vsm);
-
-    /*
-    * If we have a signalling NaN, signal invalid operation.
-    */
-    if (tm == VFP_SNAN)
-        exceptions = FPSCR_IOC;
-
-    if (tm & VFP_DENORMAL)
-        vfp_single_normalise_denormal(&vsm);
-
-    vdd.sign = vsm.sign;
-    vdd.significand = (u64)vsm.significand << 32;
-
-    /*
-    * If we have an infinity or NaN, the exponent must be 2047.
-    */
-    if (tm & (VFP_INFINITY | VFP_NAN)) {
-        vdd.exponent = 2047;
-        if (tm == VFP_QNAN)
-            vdd.significand |= VFP_DOUBLE_SIGNIFICAND_QNAN;
-        goto pack_nan;
-    } else if (tm & VFP_ZERO)
-        vdd.exponent = 0;
-    else
-        vdd.exponent = vsm.exponent + (1023 - 127);
-pack_nan:
-    vfp_double_normaliseroundintern(state, &vdd, fpscr, exceptions, "fcvtd");
-    return vfp_double_pack(&vdd);
-}
-
 static u32 vfp_single_fcvtd(ARMul_State* state, int dd, int unused, s32 m, u32 fpscr)
 {
     struct vfp_single vsm;
diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp
index 52dca4dd8..a2ffbcdb7 100644
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -153,12 +153,8 @@ void Shutdown() {
  * @return True on success, otherwise false
  */
 bool LoadExec(u32 entry_point) {
-    Core::g_app_core->SetPC(entry_point);
-
     // 0x30 is the typical main thread priority I've seen used so far
-    g_main_thread = Kernel::SetupMainThread(0x30, Kernel::DEFAULT_STACK_SIZE);
-    // Setup the idle thread
-    Kernel::SetupIdleThread();
+    g_main_thread = Kernel::SetupMainThread(Kernel::DEFAULT_STACK_SIZE, entry_point, 0x30);
 
     return true;
 }
diff --git a/src/core/hle/kernel/mutex.cpp b/src/core/hle/kernel/mutex.cpp
index 9f7166ca4..a811db392 100644
--- a/src/core/hle/kernel/mutex.cpp
+++ b/src/core/hle/kernel/mutex.cpp
@@ -21,7 +21,7 @@ namespace Kernel {
  */
 static void ResumeWaitingThread(Mutex* mutex) {
     // Reset mutex lock thread handle, nothing is waiting
-    mutex->locked = false;
+    mutex->lock_count = 0;
     mutex->holding_thread = nullptr;
 
     // Find the next waiting thread for the mutex...
@@ -44,8 +44,7 @@ Mutex::~Mutex() {}
 SharedPtr<Mutex> Mutex::Create(bool initial_locked, std::string name) {
     SharedPtr<Mutex> mutex(new Mutex);
 
-    mutex->initial_locked = initial_locked;
-    mutex->locked = false;
+    mutex->lock_count = 0;
     mutex->name = std::move(name);
     mutex->holding_thread = nullptr;
 
@@ -57,7 +56,7 @@ SharedPtr<Mutex> Mutex::Create(bool initial_locked, std::string name) {
 }
 
 bool Mutex::ShouldWait() {
-    return locked && holding_thread != GetCurrentThread();
+    return lock_count > 0 && holding_thread != GetCurrentThread();
 }
 
 void Mutex::Acquire() {
@@ -66,21 +65,27 @@ void Mutex::Acquire() {
 
 void Mutex::Acquire(SharedPtr<Thread> thread) {
     _assert_msg_(Kernel, !ShouldWait(), "object unavailable!");
-    if (locked)
-        return;
 
-    locked = true;
+    // Actually "acquire" the mutex only if we don't already have it...
+    if (lock_count == 0) {
+        thread->held_mutexes.insert(this);
+        holding_thread = std::move(thread);
+    }
 
-    thread->held_mutexes.insert(this);
-    holding_thread = std::move(thread);
+    lock_count++;
 }
 
 void Mutex::Release() {
-    if (!locked)
-        return;
-
-    holding_thread->held_mutexes.erase(this);
-    ResumeWaitingThread(this);
+    // Only release if the mutex is held...
+    if (lock_count > 0) {
+        lock_count--;
+
+        // Yield to the next thread only if we've fully released the mutex...
+        if (lock_count == 0) {
+            holding_thread->held_mutexes.erase(this);
+            ResumeWaitingThread(this);
+        }
+    }
 }
 
 } // namespace
diff --git a/src/core/hle/kernel/mutex.h b/src/core/hle/kernel/mutex.h
index 548403614..d6d5328be 100644
--- a/src/core/hle/kernel/mutex.h
+++ b/src/core/hle/kernel/mutex.h
@@ -30,8 +30,7 @@ public:
     static const HandleType HANDLE_TYPE = HandleType::Mutex;
     HandleType GetHandleType() const override { return HANDLE_TYPE; }
 
-    bool initial_locked;                        ///< Initial lock state when mutex was created
-    bool locked;                                ///< Current locked state
+    int lock_count;                             ///< Number of times the mutex has been acquired
     std::string name;                           ///< Name of mutex (optional)
     SharedPtr<Thread> holding_thread;           ///< Thread that has acquired the mutex
 
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index 3987f9608..7f629c20e 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -21,8 +21,11 @@
 
 namespace Kernel {
 
+/// Event type for the thread wake up event
+static int ThreadWakeupEventType = -1;
+
 bool Thread::ShouldWait() {
-    return status != THREADSTATUS_DORMANT;
+    return status != THREADSTATUS_DEAD;
 }
 
 void Thread::Acquire() {
@@ -33,12 +36,20 @@ void Thread::Acquire() {
 static std::vector<SharedPtr<Thread>> thread_list;
 
 // Lists only ready thread ids.
-static Common::ThreadQueueList<Thread*, THREADPRIO_LOWEST+1> thread_ready_queue;
+static Common::ThreadQueueList<Thread*, THREADPRIO_LOWEST+1> ready_queue;
 
 static Thread* current_thread;
 
-static const u32 INITIAL_THREAD_ID = 1; ///< The first available thread id at startup
-static u32 next_thread_id; ///< The next available thread id
+// The first available thread id at startup
+static u32 next_thread_id = 1;
+
+/**
+ * Allocates a new thread ID by post-incrementing the file-local next_thread_id counter
+ * @return The newly allocated thread ID (the counter's value before the increment)
+ */
+static inline u32 NewThreadId() {
+    return next_thread_id++;
+}
 
 Thread::Thread() {}
 Thread::~Thread() {}
@@ -47,86 +58,53 @@ Thread* GetCurrentThread() {
     return current_thread;
 }
 
-/// Resets a thread
-static void ResetThread(Thread* t, u32 arg, s32 lowest_priority) {
-    memset(&t->context, 0, sizeof(Core::ThreadContext));
-
-    t->context.cpu_registers[0] = arg;
-    t->context.pc = t->entry_point;
-    t->context.sp = t->stack_top;
-    t->context.cpsr = 0x1F; // Usermode
-
-    // TODO(bunnei): This instructs the CPU core to start the execution as if it is "resuming" a
-    // thread. This is somewhat Sky-Eye specific, and should be re-architected in the future to be
-    // agnostic of the CPU core.
-    t->context.mode = 8;
-
-    if (t->current_priority < lowest_priority) {
-        t->current_priority = t->initial_priority;
-    }
-
-    t->wait_objects.clear();
-    t->wait_address = 0;
-}
-
-/// Change a thread to "ready" state
-static void ChangeReadyState(Thread* t, bool ready) {
-    if (t->IsReady()) {
-        if (!ready) {
-            thread_ready_queue.remove(t->current_priority, t);
-        }
-    }  else if (ready) {
-        if (t->IsRunning()) {
-            thread_ready_queue.push_front(t->current_priority, t);
-        } else {
-            thread_ready_queue.push_back(t->current_priority, t);
-        }
-        t->status = THREADSTATUS_READY;
-    }
-}
-
-/// Check if a thread is waiting on a the specified wait object
+/**
+ * Check if a thread is waiting on the specified wait object
+ * @param thread The thread to test
+ * @param wait_object The object to test against
+ * @return True if the thread is waiting, false otherwise
+ */
 static bool CheckWait_WaitObject(const Thread* thread, WaitObject* wait_object) {
-    auto itr = std::find(thread->wait_objects.begin(), thread->wait_objects.end(), wait_object);
+    if (thread->status != THREADSTATUS_WAIT_SYNCH)
+        return false;
 
-    if (itr != thread->wait_objects.end())
-        return thread->IsWaiting();
-
-    return false;
+    auto itr = std::find(thread->wait_objects.begin(), thread->wait_objects.end(), wait_object);
+    return itr != thread->wait_objects.end();
 }
 
-/// Check if the specified thread is waiting on the specified address to be arbitrated
+/**
+ * Check if the specified thread is waiting on the specified address to be arbitrated
+ * @param thread The thread to test
+ * @param wait_address The address to test against
+ * @return True if the thread is waiting, false otherwise
+ */
 static bool CheckWait_AddressArbiter(const Thread* thread, VAddr wait_address) {
-    return thread->IsWaiting() && thread->wait_objects.empty() && wait_address == thread->wait_address;
+    return thread->status == THREADSTATUS_WAIT_ARB && wait_address == thread->wait_address;
 }
 
-/// Stops the current thread
-void Thread::Stop(const char* reason) {
+void Thread::Stop() {
     // Release all the mutexes that this thread holds
     ReleaseThreadMutexes(this);
 
-    ChangeReadyState(this, false);
-    status = THREADSTATUS_DORMANT;
+    // Cancel any outstanding wakeup events for this thread
+    CoreTiming::UnscheduleEvent(ThreadWakeupEventType, callback_handle);
+
+    // Clean up thread from ready queue
+    // This is only needed when the thread is terminated forcefully (SVC TerminateProcess)
+    if (status == THREADSTATUS_READY){
+        ready_queue.remove(current_priority, this);
+    }
+
+    status = THREADSTATUS_DEAD;
+    
     WakeupAllWaitingThreads();
 
-    // Stopped threads are never waiting.
+    // Clean up any dangling references in objects that this thread was waiting for
     for (auto& wait_object : wait_objects) {
         wait_object->RemoveWaitingThread(this);
     }
-    wait_objects.clear();
-    wait_address = 0;
-}
-
-/// Changes a threads state
-static void ChangeThreadState(Thread* t, ThreadStatus new_status) {
-    if (!t || t->status == new_status) {
-        return;
-    }
-    ChangeReadyState(t, (new_status & THREADSTATUS_READY) != 0);
-    t->status = new_status;
 }
 
-/// Arbitrate the highest priority thread that is waiting
 Thread* ArbitrateHighestPriorityThread(u32 address) {
     Thread* highest_priority_thread = nullptr;
     s32 priority = THREADPRIO_LOWEST;
@@ -153,108 +131,113 @@ Thread* ArbitrateHighestPriorityThread(u32 address) {
     return highest_priority_thread;
 }
 
-/// Arbitrate all threads currently waiting
 void ArbitrateAllThreads(u32 address) {
-
-    // Iterate through threads, find highest priority thread that is waiting to be arbitrated...
+    // Resume all threads found to be waiting on the address
     for (auto& thread : thread_list) {
         if (CheckWait_AddressArbiter(thread.get(), address))
             thread->ResumeFromWait();
     }
 }
 
-/// Calls a thread by marking it as "ready" (note: will not actually execute until current thread yields)
-static void CallThread(Thread* t) {
-    // Stop waiting
-    ChangeThreadState(t, THREADSTATUS_READY);
-}
+/**
+ * Switches the CPU's active thread context to that of the specified thread
+ * @param new_thread The thread to switch to, or nullptr to leave no thread current
+ */
+static void SwitchContext(Thread* new_thread) {
+    _dbg_assert_msg_(Kernel, !new_thread || new_thread->status == THREADSTATUS_READY, "Thread must be ready to become running.");
 
-/// Switches CPU context to that of the specified thread
-static void SwitchContext(Thread* t) {
-    Thread* cur = GetCurrentThread();
+    Thread* previous_thread = GetCurrentThread();
 
-    // Save context for current thread
-    if (cur) {
-        Core::g_app_core->SaveContext(cur->context);
+    // Save context for previous thread
+    if (previous_thread) {
+        Core::g_app_core->SaveContext(previous_thread->context);
 
-        if (cur->IsRunning()) {
-            ChangeReadyState(cur, true);
+        if (previous_thread->status == THREADSTATUS_RUNNING) {
+            // This is only the case when a reschedule is triggered without the current thread
+            // yielding execution (i.e. an event triggered, system core time-sliced, etc)
+            ready_queue.push_front(previous_thread->current_priority, previous_thread);
+            previous_thread->status = THREADSTATUS_READY;
         }
     }
+
     // Load context of new thread
-    if (t) {
-        current_thread = t;
-        ChangeReadyState(t, false);
-        t->status = (t->status | THREADSTATUS_RUNNING) & ~THREADSTATUS_READY;
-        Core::g_app_core->LoadContext(t->context);
+    if (new_thread) {
+        current_thread = new_thread;
+
+        ready_queue.remove(new_thread->current_priority, new_thread);
+        new_thread->status = THREADSTATUS_RUNNING;
+
+        Core::g_app_core->LoadContext(new_thread->context);
     } else {
         current_thread = nullptr;
     }
 }
 
-/// Gets the next thread that is ready to be run by priority
-static Thread* NextThread() {
+/**
+ * Pops and returns the next thread from the thread queue
+ * @return A pointer to the next ready thread
+ */
+static Thread* PopNextReadyThread() {
     Thread* next;
-    Thread* cur = GetCurrentThread();
+    Thread* thread = GetCurrentThread();
 
-    if (cur && cur->IsRunning()) {
-        next = thread_ready_queue.pop_first_better(cur->current_priority);
+    if (thread && thread->status == THREADSTATUS_RUNNING) {
+        // We have to do better than the current thread.
+        // This call returns null when that's not possible.
+        next = ready_queue.pop_first_better(thread->current_priority);
     } else  {
-        next = thread_ready_queue.pop_first();
-    }
-    if (next == 0) {
-        return nullptr;
+        next = ready_queue.pop_first();
     }
+
     return next;
 }
 
 void WaitCurrentThread_Sleep() {
     Thread* thread = GetCurrentThread();
-    ChangeThreadState(thread, ThreadStatus(THREADSTATUS_WAIT | (thread->status & THREADSTATUS_SUSPEND)));
+    thread->status = THREADSTATUS_WAIT_SLEEP;
 }
 
-void WaitCurrentThread_WaitSynchronization(SharedPtr<WaitObject> wait_object, bool wait_set_output, bool wait_all) {
+void WaitCurrentThread_WaitSynchronization(std::vector<SharedPtr<WaitObject>> wait_objects, bool wait_set_output, bool wait_all) {
     Thread* thread = GetCurrentThread();
     thread->wait_set_output = wait_set_output;
     thread->wait_all = wait_all;
-
-    // It's possible to call WaitSynchronizationN without any objects passed in...
-    if (wait_object != nullptr)
-        thread->wait_objects.push_back(wait_object);
-
-    ChangeThreadState(thread, ThreadStatus(THREADSTATUS_WAIT | (thread->status & THREADSTATUS_SUSPEND)));
+    thread->wait_objects = std::move(wait_objects);
+    thread->status = THREADSTATUS_WAIT_SYNCH;
 }
 
 void WaitCurrentThread_ArbitrateAddress(VAddr wait_address) {
     Thread* thread = GetCurrentThread();
     thread->wait_address = wait_address;
-    ChangeThreadState(thread, ThreadStatus(THREADSTATUS_WAIT | (thread->status & THREADSTATUS_SUSPEND)));
+    thread->status = THREADSTATUS_WAIT_ARB;
 }
 
-/// Event type for the thread wake up event
-static int ThreadWakeupEventType = -1;
 // TODO(yuriks): This can be removed if Thread objects are explicitly pooled in the future, allowing
 //               us to simply use a pool index or similar.
 static Kernel::HandleTable wakeup_callback_handle_table;
 
-/// Callback that will wake up the thread it was scheduled for
+/**
+ * Callback that will wake up the thread it was scheduled for
+ * @param thread_handle The handle of the thread that's been awoken
+ * @param cycles_late The number of CPU cycles that have passed since the desired wakeup time
+ */
 static void ThreadWakeupCallback(u64 thread_handle, int cycles_late) {
     SharedPtr<Thread> thread = wakeup_callback_handle_table.Get<Thread>((Handle)thread_handle);
     if (thread == nullptr) {
-        LOG_CRITICAL(Kernel, "Callback fired for invalid thread %08X", thread_handle);
+        LOG_CRITICAL(Kernel, "Callback fired for invalid thread %08X", (Handle)thread_handle);
         return;
     }
 
-    thread->SetWaitSynchronizationResult(ResultCode(ErrorDescription::Timeout, ErrorModule::OS,
-        ErrorSummary::StatusChanged, ErrorLevel::Info));
+    if (thread->status == THREADSTATUS_WAIT_SYNCH) {
+        thread->SetWaitSynchronizationResult(ResultCode(ErrorDescription::Timeout, ErrorModule::OS,
+                                                        ErrorSummary::StatusChanged, ErrorLevel::Info));
 
-    if (thread->wait_set_output)
-        thread->SetWaitSynchronizationOutput(-1);
+        if (thread->wait_set_output)
+            thread->SetWaitSynchronizationOutput(-1);
+    }
 
     thread->ResumeFromWait();
 }
 
-
 void Thread::WakeAfterDelay(s64 nanoseconds) {
     // Don't schedule a wakeup if the thread wants to wait forever
     if (nanoseconds == -1)
@@ -265,7 +248,7 @@ void Thread::WakeAfterDelay(s64 nanoseconds) {
 }
 
 void Thread::ReleaseWaitObject(WaitObject* wait_object) {
-    if (wait_objects.empty()) {
+    if (status != THREADSTATUS_WAIT_SYNCH || wait_objects.empty()) {
         LOG_CRITICAL(Kernel, "thread is not waiting on any objects!");
         return;
     }
@@ -307,34 +290,48 @@ void Thread::ReleaseWaitObject(WaitObject* wait_object) {
 }
 
 void Thread::ResumeFromWait() {
-    // Cancel any outstanding wakeup events
+    // Cancel any outstanding wakeup events for this thread
     CoreTiming::UnscheduleEvent(ThreadWakeupEventType, callback_handle);
 
-    status &= ~THREADSTATUS_WAIT;
-
-    // Remove this thread from all other WaitObjects
-    for (auto wait_object : wait_objects)
-        wait_object->RemoveWaitingThread(this);
-
-    wait_objects.clear();
-    wait_set_output = false;
-    wait_all = false;
-    wait_address = 0;
-
-    if (!(status & (THREADSTATUS_WAITSUSPEND | THREADSTATUS_DORMANT | THREADSTATUS_DEAD))) {
-        ChangeReadyState(this, true);
+    switch (status) {
+        case THREADSTATUS_WAIT_SYNCH:
+            // Remove this thread from all other WaitObjects
+            for (auto wait_object : wait_objects)
+                wait_object->RemoveWaitingThread(this);
+            break;
+        case THREADSTATUS_WAIT_ARB:
+        case THREADSTATUS_WAIT_SLEEP:
+            break;
+        case THREADSTATUS_RUNNING:
+        case THREADSTATUS_READY:
+            LOG_ERROR(Kernel, "Thread with object id %u has already resumed.", GetObjectId());
+            _dbg_assert_(Kernel, false);
+            return;
+        case THREADSTATUS_DEAD:
+            // This should never happen, as threads must complete before being stopped.
+            LOG_CRITICAL(Kernel, "Thread with object id %u cannot be resumed because it's DEAD.",
+                GetObjectId());
+            _dbg_assert_(Kernel, false);
+            return;
     }
+    
+    ready_queue.push_back(current_priority, this);
+    status = THREADSTATUS_READY;
 }
 
-/// Prints the thread queue for debugging purposes
+/**
+ * Prints the thread queue for debugging purposes
+ */
 static void DebugThreadQueue() {
     Thread* thread = GetCurrentThread();
     if (!thread) {
-        return;
+        LOG_DEBUG(Kernel, "Current: NO CURRENT THREAD");
+    } else {
+        LOG_DEBUG(Kernel, "0x%02X %u (current)", thread->current_priority, GetCurrentThread()->GetObjectId());
     }
-    LOG_DEBUG(Kernel, "0x%02X %u (current)", thread->current_priority, GetCurrentThread()->GetObjectId());
+
     for (auto& t : thread_list) {
-        s32 priority = thread_ready_queue.contains(t.get());
+        s32 priority = ready_queue.contains(t.get());
         if (priority != -1) {
             LOG_DEBUG(Kernel, "0x%02X %u", priority, t->GetObjectId());
         }
@@ -342,14 +339,7 @@ static void DebugThreadQueue() {
 }
 
 ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point, s32 priority,
-        u32 arg, s32 processor_id, VAddr stack_top, u32 stack_size) {
-    if (stack_size < 0x200) {
-        LOG_ERROR(Kernel, "(name=%s): invalid stack_size=0x%08X", name.c_str(), stack_size);
-        // TODO: Verify error
-        return ResultCode(ErrorDescription::InvalidSize, ErrorModule::Kernel,
-                ErrorSummary::InvalidArgument, ErrorLevel::Permanent);
-    }
-
+        u32 arg, s32 processor_id, VAddr stack_top) {
     if (priority < THREADPRIO_HIGHEST || priority > THREADPRIO_LOWEST) {
         s32 new_priority = CLAMP(priority, THREADPRIO_HIGHEST, THREADPRIO_LOWEST);
         LOG_WARNING(Kernel_SVC, "(name=%s): invalid priority=%d, clamping to %d",
@@ -369,13 +359,12 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
     SharedPtr<Thread> thread(new Thread);
 
     thread_list.push_back(thread);
-    thread_ready_queue.prepare(priority);
+    ready_queue.prepare(priority);
 
-    thread->thread_id = next_thread_id++;
+    thread->thread_id = NewThreadId();
     thread->status = THREADSTATUS_DORMANT;
     thread->entry_point = entry_point;
     thread->stack_top = stack_top;
-    thread->stack_size = stack_size;
     thread->initial_priority = thread->current_priority = priority;
     thread->processor_id = processor_id;
     thread->wait_set_output = false;
@@ -385,75 +374,74 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
     thread->name = std::move(name);
     thread->callback_handle = wakeup_callback_handle_table.Create(thread).MoveFrom();
 
-    ResetThread(thread.get(), arg, 0);
-    CallThread(thread.get());
+    // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
+    // to initialize the context
+    Core::g_app_core->ResetContext(thread->context, stack_top, entry_point, arg);
+
+    ready_queue.push_back(thread->current_priority, thread.get());
+    thread->status = THREADSTATUS_READY;
 
     return MakeResult<SharedPtr<Thread>>(std::move(thread));
 }
 
-/// Set the priority of the thread specified by handle
-void Thread::SetPriority(s32 priority) {
-    // If priority is invalid, clamp to valid range
-    if (priority < THREADPRIO_HIGHEST || priority > THREADPRIO_LOWEST) {
-        s32 new_priority = CLAMP(priority, THREADPRIO_HIGHEST, THREADPRIO_LOWEST);
-        LOG_WARNING(Kernel_SVC, "invalid priority=%d, clamping to %d", priority, new_priority);
+// TODO(peachum): Remove this. Range checking should be done, and an appropriate error should be returned.
+static void ClampPriority(const Thread* thread, s32* priority) {
+    if (*priority < THREADPRIO_HIGHEST || *priority > THREADPRIO_LOWEST) {
+        _dbg_assert_msg_(Kernel, false, "Application passed an out of range priority. An error should be returned.");
+
+        s32 new_priority = CLAMP(*priority, THREADPRIO_HIGHEST, THREADPRIO_LOWEST);
+        LOG_WARNING(Kernel_SVC, "(name=%s): invalid priority=%d, clamping to %d",
+                    thread->name.c_str(), *priority, new_priority);
         // TODO(bunnei): Clamping to a valid priority is not necessarily correct behavior... Confirm
         // validity of this
-        priority = new_priority;
+        *priority = new_priority;
     }
+}
 
-    // Change thread priority
-    s32 old = current_priority;
-    thread_ready_queue.remove(old, this);
-    current_priority = priority;
-    thread_ready_queue.prepare(current_priority);
+void Thread::SetPriority(s32 priority) {
+    ClampPriority(this, &priority);
 
-    // Change thread status to "ready" and push to ready queue
-    if (IsRunning()) {
-        status = (status & ~THREADSTATUS_RUNNING) | THREADSTATUS_READY;
+    if (current_priority == priority) {
+        return;
     }
-    if (IsReady()) {
-        thread_ready_queue.push_back(current_priority, this);
+
+    if (status == THREADSTATUS_READY) {
+        // If thread was ready, adjust queues
+        ready_queue.remove(current_priority, this);
+        ready_queue.prepare(priority);
+        ready_queue.push_back(priority, this);
     }
+    
+    current_priority = priority;
 }
 
 SharedPtr<Thread> SetupIdleThread() {
     // We need to pass a few valid values to get around parameter checking in Thread::Create.
     auto thread = Thread::Create("idle", Memory::KERNEL_MEMORY_VADDR, THREADPRIO_LOWEST, 0,
-            THREADPROCESSORID_0, 0, Kernel::DEFAULT_STACK_SIZE).MoveFrom();
+            THREADPROCESSORID_0, 0).MoveFrom();
 
     thread->idle = true;
-    CallThread(thread.get());
     return thread;
 }
 
-SharedPtr<Thread> SetupMainThread(s32 priority, u32 stack_size) {
+SharedPtr<Thread> SetupMainThread(u32 stack_size, u32 entry_point, s32 priority) {
+    _dbg_assert_(Kernel, !GetCurrentThread());
+
     // Initialize new "main" thread
-    auto thread_res = Thread::Create("main", Core::g_app_core->GetPC(), priority, 0,
-            THREADPROCESSORID_0, Memory::SCRATCHPAD_VADDR_END, stack_size);
-    // TODO(yuriks): Propagate error
-    _dbg_assert_(Kernel, thread_res.Succeeded());
-    SharedPtr<Thread> thread = std::move(*thread_res);
-
-    // If running another thread already, set it to "ready" state
-    Thread* cur = GetCurrentThread();
-    if (cur && cur->IsRunning()) {
-        ChangeReadyState(cur, true);
-    }
+    auto thread_res = Thread::Create("main", entry_point, priority, 0,
+            THREADPROCESSORID_0, Memory::SCRATCHPAD_VADDR_END);
+
+    SharedPtr<Thread> thread = thread_res.MoveFrom();
 
     // Run new "main" thread
-    current_thread = thread.get();
-    thread->status = THREADSTATUS_RUNNING;
-    Core::g_app_core->LoadContext(thread->context);
+    SwitchContext(thread.get());
 
     return thread;
 }
 
-
-/// Reschedules to the next available thread (call after current thread is suspended)
 void Reschedule() {
     Thread* prev = GetCurrentThread();
-    Thread* next = NextThread();
+    Thread* next = PopNextReadyThread();
     HLE::g_reschedule = false;
 
     if (next != nullptr) {
@@ -480,8 +468,10 @@ void Thread::SetWaitSynchronizationOutput(s32 output) {
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 void ThreadingInit() {
-    next_thread_id = INITIAL_THREAD_ID;
     ThreadWakeupEventType = CoreTiming::RegisterEvent("ThreadWakeupCallback", ThreadWakeupCallback);
+
+    // Setup the idle thread
+    SetupIdleThread();
 }
 
 void ThreadingShutdown() {
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index 633bb7c98..cfd073a70 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -31,13 +31,13 @@ enum ThreadProcessorId {
 };
 
 enum ThreadStatus {
-    THREADSTATUS_RUNNING        = 1,
-    THREADSTATUS_READY          = 2,
-    THREADSTATUS_WAIT           = 4,
-    THREADSTATUS_SUSPEND        = 8,
-    THREADSTATUS_DORMANT        = 16,
-    THREADSTATUS_DEAD           = 32,
-    THREADSTATUS_WAITSUSPEND    = THREADSTATUS_WAIT | THREADSTATUS_SUSPEND
+    THREADSTATUS_RUNNING,       ///< Currently running
+    THREADSTATUS_READY,         ///< Ready to run
+    THREADSTATUS_WAIT_ARB,      ///< Waiting on an address arbiter
+    THREADSTATUS_WAIT_SLEEP,    ///< Waiting due to a SleepThread SVC
+    THREADSTATUS_WAIT_SYNCH,    ///< Waiting due to a WaitSynchronization SVC
+    THREADSTATUS_DORMANT,       ///< Created but not yet made ready
+    THREADSTATUS_DEAD           ///< Run to completion, or forcefully terminated
 };
 
 namespace Kernel {
@@ -46,8 +46,19 @@ class Mutex;
 
 class Thread final : public WaitObject {
 public:
+    /**
+     * Creates and returns a new thread. The new thread is immediately scheduled
+     * @param name The friendly name desired for the thread
+     * @param entry_point The address at which the thread should start execution
+     * @param priority The thread's priority
+     * @param arg User data to pass to the thread
+     * @param processor_id The ID(s) of the processor(s) on which the thread should run
+     * @param stack_top The address of the thread's stack top
+     * @return A ResultVal wrapping a shared pointer to the newly created thread
+     *         (callers unwrap it, e.g. via MoveFrom() or CASCADE_RESULT)
+     */
     static ResultVal<SharedPtr<Thread>> Create(std::string name, VAddr entry_point, s32 priority,
-        u32 arg, s32 processor_id, VAddr stack_top, u32 stack_size);
+        u32 arg, s32 processor_id, VAddr stack_top);
 
     std::string GetName() const override { return name; }
     std::string GetTypeName() const override { return "Thread"; }
@@ -55,22 +66,32 @@ public:
     static const HandleType HANDLE_TYPE = HandleType::Thread;
     HandleType GetHandleType() const override { return HANDLE_TYPE; }
 
-    inline bool IsRunning() const { return (status & THREADSTATUS_RUNNING) != 0; }
-    inline bool IsStopped() const { return (status & THREADSTATUS_DORMANT) != 0; }
-    inline bool IsReady() const { return (status & THREADSTATUS_READY) != 0; }
-    inline bool IsWaiting() const { return (status & THREADSTATUS_WAIT) != 0; }
-    inline bool IsSuspended() const { return (status & THREADSTATUS_SUSPEND) != 0; }
-    inline bool IsIdle() const { return idle; }
-
     bool ShouldWait() override;
     void Acquire() override;
 
+    /**
+     * Checks if the thread is an idle (stub) thread
+     * @return True if the thread is an idle (stub) thread, false otherwise
+     */
+    inline bool IsIdle() const { return idle; }
+
+    /**
+     * Gets the thread's current priority
+     * @return The thread's current priority
+     */
     s32 GetPriority() const { return current_priority; }
+
+    /**
+     * Sets the thread's current priority
+     * @param priority The new priority
+     */
     void SetPriority(s32 priority);
 
+    /**
+     * Gets the thread's thread ID
+     * @return The thread's ID
+     */
     u32 GetThreadId() const { return thread_id; }
-
-    void Stop(const char* reason);
     
     /**
      * Release an acquired wait object
@@ -78,12 +99,14 @@ public:
      */
     void ReleaseWaitObject(WaitObject* wait_object);
 
-    /// Resumes a thread from waiting by marking it as "ready"
+    /**
+     * Resumes a thread from waiting
+     */
     void ResumeFromWait();
 
     /**
-    * Schedules an event to wake up the specified thread after the specified delay.
-    * @param nanoseconds The time this thread will be allowed to sleep for.
+    * Schedules an event to wake up the specified thread after the specified delay
+    * @param nanoseconds The time this thread will be allowed to sleep for
     */
     void WakeAfterDelay(s64 nanoseconds);
 
@@ -99,6 +122,11 @@ public:
      */
     void SetWaitSynchronizationOutput(s32 output);
 
+    /**
+     * Stops a thread, invalidating it from further use
+     */
+    void Stop();
+
     Core::ThreadContext context;
 
     u32 thread_id;
@@ -106,7 +134,6 @@ public:
     u32 status;
     u32 entry_point;
     u32 stack_top;
-    u32 stack_size;
 
     s32 initial_priority;
     s32 current_priority;
@@ -136,31 +163,49 @@ private:
 
 extern SharedPtr<Thread> g_main_thread;
 
-/// Sets up the primary application thread
-SharedPtr<Thread> SetupMainThread(s32 priority, u32 stack_size);
+/**
+ * Sets up the primary application thread
+ * @param stack_size The size of the thread's stack (currently unused by the implementation)
+ * @param entry_point The address at which the thread should start execution
+ * @param priority The priority to give the main thread
+ * @return A shared pointer to the main thread
+ */
+SharedPtr<Thread> SetupMainThread(u32 stack_size, u32 entry_point, s32 priority);
 
-/// Reschedules to the next available thread (call after current thread is suspended)
+/**
+ * Reschedules to the next available thread (call after current thread is suspended)
+ */
 void Reschedule();
 
-/// Arbitrate the highest priority thread that is waiting
+/**
+ * Arbitrate the highest priority thread that is waiting
+ * @param address The address for which waiting threads should be arbitrated
+ */
 Thread* ArbitrateHighestPriorityThread(u32 address);
 
-/// Arbitrate all threads currently waiting...
+/**
+ * Arbitrate all threads currently waiting.
+ * @param address The address for which waiting threads should be arbitrated
+ */
 void ArbitrateAllThreads(u32 address);
 
-/// Gets the current thread
+/**
+ * Gets the current thread
+ */
 Thread* GetCurrentThread();
 
-/// Waits the current thread on a sleep
+/**
+ * Waits the current thread on a sleep
+ */
 void WaitCurrentThread_Sleep();
 
 /**
  * Waits the current thread from a WaitSynchronization call
- * @param wait_object Kernel object that we are waiting on
+ * @param wait_objects Kernel objects that we are waiting on
  * @param wait_set_output If true, set the output parameter on thread wakeup (for WaitSynchronizationN only)
  * @param wait_all If true, wait on all objects before resuming (for WaitSynchronizationN only)
  */
-void WaitCurrentThread_WaitSynchronization(SharedPtr<WaitObject> wait_object, bool wait_set_output, bool wait_all);
+void WaitCurrentThread_WaitSynchronization(std::vector<SharedPtr<WaitObject>> wait_objects, bool wait_set_output, bool wait_all);
 
 /**
  * Waits the current thread from an ArbitrateAddress call
@@ -172,14 +217,18 @@ void WaitCurrentThread_ArbitrateAddress(VAddr wait_address);
  * Sets up the idle thread, this is a thread that is intended to never execute instructions,
  * only to advance the timing. It is scheduled when there are no other ready threads in the thread queue
  * and will try to yield on every call.
- * @returns The handle of the idle thread
+ * @return A shared pointer to the idle thread
  */
 SharedPtr<Thread> SetupIdleThread();
 
-/// Initialize threading
+/**
+ * Initialize threading
+ */
 void ThreadingInit();
 
-/// Shutdown threading
+/**
+ * Shutdown threading
+ */
 void ThreadingShutdown();
 
 } // namespace
diff --git a/src/core/hle/svc.cpp b/src/core/hle/svc.cpp
index 34a27917f..96da29923 100644
--- a/src/core/hle/svc.cpp
+++ b/src/core/hle/svc.cpp
@@ -144,17 +144,17 @@ static ResultCode WaitSynchronization1(Handle handle, s64 nano_seconds) {
     LOG_TRACE(Kernel_SVC, "called handle=0x%08X(%s:%s), nanoseconds=%lld", handle,
             object->GetTypeName().c_str(), object->GetName().c_str(), nano_seconds);
 
+    HLE::Reschedule(__func__);
+
     // Check for next thread to schedule
     if (object->ShouldWait()) {
 
         object->AddWaitingThread(Kernel::GetCurrentThread());
-        Kernel::WaitCurrentThread_WaitSynchronization(object, false, false);
+        Kernel::WaitCurrentThread_WaitSynchronization({ object }, false, false);
 
         // Create an event to wake the thread up after the specified nanosecond delay has passed
         Kernel::GetCurrentThread()->WakeAfterDelay(nano_seconds);
 
-        HLE::Reschedule(__func__);
-
         // NOTE: output of this SVC will be set later depending on how the thread resumes
         return RESULT_INVALID;
     }
@@ -212,25 +212,29 @@ static ResultCode WaitSynchronizationN(s32* out, Handle* handles, s32 handle_cou
         // NOTE: This should deadlock the current thread if no timeout was specified
         if (!wait_all) {
             wait_thread = true;
-            Kernel::WaitCurrentThread_WaitSynchronization(nullptr, true, wait_all);
         }
     }
 
+    HLE::Reschedule(__func__);
+
     // If thread should wait, then set its state to waiting and then reschedule...
     if (wait_thread) {
 
         // Actually wait the current thread on each object if we decided to wait...
+        std::vector<SharedPtr<Kernel::WaitObject>> wait_objects;
+        wait_objects.reserve(handle_count);
+        
         for (int i = 0; i < handle_count; ++i) {
             auto object = Kernel::g_handle_table.GetWaitObject(handles[i]);
             object->AddWaitingThread(Kernel::GetCurrentThread());
-            Kernel::WaitCurrentThread_WaitSynchronization(object, true, wait_all);
+            wait_objects.push_back(object);
         }
 
+        Kernel::WaitCurrentThread_WaitSynchronization(std::move(wait_objects), true, wait_all);
+
         // Create an event to wake the thread up after the specified nanosecond delay has passed
         Kernel::GetCurrentThread()->WakeAfterDelay(nano_seconds);
 
-        HLE::Reschedule(__func__);
-
         // NOTE: output of this SVC will be set later depending on how the thread resumes
         return RESULT_INVALID;
     }
@@ -319,7 +323,7 @@ static ResultCode CreateThread(u32* out_handle, u32 priority, u32 entry_point, u
     }
 
     CASCADE_RESULT(SharedPtr<Thread> thread, Kernel::Thread::Create(
-            name, entry_point, priority, arg, processor_id, stack_top, Kernel::DEFAULT_STACK_SIZE));
+            name, entry_point, priority, arg, processor_id, stack_top));
     CASCADE_RESULT(*out_handle, Kernel::g_handle_table.Create(std::move(thread)));
 
     LOG_TRACE(Kernel_SVC, "called entrypoint=0x%08X (%s), arg=0x%08X, stacktop=0x%08X, "
@@ -338,7 +342,7 @@ static ResultCode CreateThread(u32* out_handle, u32 priority, u32 entry_point, u
 static void ExitThread() {
     LOG_TRACE(Kernel_SVC, "called, pc=0x%08X", Core::g_app_core->GetPC());
 
-    Kernel::GetCurrentThread()->Stop(__func__);
+    Kernel::GetCurrentThread()->Stop();
     HLE::Reschedule(__func__);
 }