83 files changed, 2118 insertions, 1107 deletions
diff --git a/src/audio_core/codec.cpp b/src/audio_core/codec.cpp
index 7a3bd7eb3..6fba9fdae 100644
--- a/src/audio_core/codec.cpp
+++ b/src/audio_core/codec.cpp
@@ -117,7 +117,9 @@ StereoBuffer16 DecodePCM16(const unsigned num_channels, const u8* const data,
             ret[i].fill(sample);
         }
     } else {
-        std::memcpy(ret.data(), data, sample_count * 2 * sizeof(u16));
+        for (size_t i = 0; i < sample_count; ++i) {
+            std::memcpy(&ret[i], data + i * sizeof(s16) * 2, 2 * sizeof(s16));
+        }
     }
 
     return ret;
diff --git a/src/audio_core/codec.h b/src/audio_core/codec.h
index 2b0c395e6..877b2202d 100644
--- a/src/audio_core/codec.h
+++ b/src/audio_core/codec.h
@@ -5,13 +5,13 @@
 #pragma once
 
 #include <array>
-#include <vector>
+#include <deque>
 #include "common/common_types.h"
 
 namespace Codec {
 
 /// A variable length buffer of signed PCM16 stereo samples.
-using StereoBuffer16 = std::vector<std::array<s16, 2>>;
+using StereoBuffer16 = std::deque<std::array<s16, 2>>;
 
 /// See: Codec::DecodeADPCM
 struct ADPCMState {
diff --git a/src/audio_core/hle/source.cpp b/src/audio_core/hle/source.cpp
index de4e88cae..c12287700 100644
--- a/src/audio_core/hle/source.cpp
+++ b/src/audio_core/hle/source.cpp
@@ -264,7 +264,7 @@ void Source::GenerateFrame() {
             break;
         }
     }
-    state.next_sample_number += frame_position;
+    state.next_sample_number += static_cast<u32>(frame_position);
 
     state.filters.ProcessFrame(current_frame);
 }
diff --git a/src/audio_core/hle/source.h b/src/audio_core/hle/source.h
index ccb7f064f..c4d2debc2 100644
--- a/src/audio_core/hle/source.h
+++ b/src/audio_core/hle/source.h
@@ -108,7 +108,7 @@ private:
 
         u32 current_sample_number = 0;
         u32 next_sample_number = 0;
-        std::vector<std::array<s16, 2>> current_buffer;
+        AudioInterp::StereoBuffer16 current_buffer;
 
         // buffer_id state
 
diff --git a/src/audio_core/interpolate.cpp b/src/audio_core/interpolate.cpp
index 16e68bc5c..83573d772 100644
--- a/src/audio_core/interpolate.cpp
+++ b/src/audio_core/interpolate.cpp
@@ -47,7 +47,7 @@ static void StepOverSamples(State& state, StereoBuffer16& input, float rate,
     state.xn1 = input[inputi + 1];
     state.fposition = fposition - inputi * scale_factor;
 
-    input.erase(input.begin(), input.begin() + inputi + 2);
+    input.erase(input.begin(), std::next(input.begin(), inputi + 2));
 }
 
 void None(State& state, StereoBuffer16& input, float rate, DSP::HLE::StereoFrame16& output,
diff --git a/src/audio_core/interpolate.h b/src/audio_core/interpolate.h
index 59f59bc14..8dff6111a 100644
--- a/src/audio_core/interpolate.h
+++ b/src/audio_core/interpolate.h
@@ -5,14 +5,14 @@
 #pragma once
 
 #include <array>
-#include <vector>
+#include <deque>
 #include "audio_core/hle/common.h"
 #include "common/common_types.h"
 
 namespace AudioInterp {
 
 /// A variable length buffer of signed PCM16 stereo samples.
-using StereoBuffer16 = std::vector<std::array<s16, 2>>;
+using StereoBuffer16 = std::deque<std::array<s16, 2>>;
 
 struct State {
     /// Two historical samples.
diff --git a/src/citra_qt/configuration/configure_graphics.ui b/src/citra_qt/configuration/configure_graphics.ui
index b340149d5..5667b14b6 100644
--- a/src/citra_qt/configuration/configure_graphics.ui
+++ b/src/citra_qt/configuration/configure_graphics.ui
@@ -63,57 +63,57 @@
               <widget class="QComboBox" name="resolution_factor_combobox">
                 <item>
                   <property name="text">
-                    <string notr="true">Auto (Window Size)</string>
+                    <string>Auto (Window Size)</string>
                   </property>
                 </item>
                 <item>
                   <property name="text">
-                    <string notr="true">Native (400x240)</string>
+                    <string>Native (400x240)</string>
                   </property>
                 </item>
                 <item>
                   <property name="text">
-                    <string notr="true">2x Native (800x480)</string>
+                    <string>2x Native (800x480)</string>
                   </property>
                 </item>
                 <item>
                   <property name="text">
-                    <string notr="true">3x Native (1200x720)</string>
+                    <string>3x Native (1200x720)</string>
                   </property>
                 </item>
                 <item>
                   <property name="text">
-                    <string notr="true">4x Native (1600x960)</string>
+                    <string>4x Native (1600x960)</string>
                   </property>
                 </item>
                 <item>
                   <property name="text">
-                    <string notr="true">5x Native (2000x1200)</string>
+                    <string>5x Native (2000x1200)</string>
                   </property>
                 </item>
                 <item>
                   <property name="text">
-                    <string notr="true">6x Native (2400x1440)</string>
+                    <string>6x Native (2400x1440)</string>
                   </property>
                 </item>
                 <item>
                   <property name="text">
-                    <string notr="true">7x Native (2800x1680)</string>
+                    <string>7x Native (2800x1680)</string>
                   </property>
                 </item>
                 <item>
                   <property name="text">
-                    <string notr="true">8x Native (3200x1920)</string>
+                    <string>8x Native (3200x1920)</string>
                   </property>
                 </item>
                 <item>
                   <property name="text">
-                    <string notr="true">9x Native (3600x2160)</string>
+                    <string>9x Native (3600x2160)</string>
                   </property>
                 </item>
                 <item>
                   <property name="text">
-                    <string notr="true">10x Native (4000x2400)</string>
+                    <string>10x Native (4000x2400)</string>
                   </property>
                 </item>
               </widget>
diff --git a/src/citra_qt/configuration/configure_system.cpp b/src/citra_qt/configuration/configure_system.cpp
index 9b1e6711d..88a067c12 100644
--- a/src/citra_qt/configuration/configure_system.cpp
+++ b/src/citra_qt/configuration/configure_system.cpp
@@ -78,7 +78,8 @@ void ConfigureSystem::ReadSystemSettings() {
 
     // set the console id
     u64 console_id = Service::CFG::GetConsoleUniqueId();
-    ui->label_console_id->setText("Console ID: 0x" + QString::number(console_id, 16).toUpper());
+    ui->label_console_id->setText(
+        tr("Console ID: 0x%1").arg(QString::number(console_id, 16).toUpper()));
 }
 
 void ConfigureSystem::applyConfiguration() {
diff --git a/src/citra_qt/configuration/configure_web.cpp b/src/citra_qt/configuration/configure_web.cpp
index 38ce19c0f..bf8c21ac7 100644
--- a/src/citra_qt/configuration/configure_web.cpp
+++ b/src/citra_qt/configuration/configure_web.cpp
@@ -24,15 +24,15 @@ ConfigureWeb::~ConfigureWeb() {}
 void ConfigureWeb::setConfiguration() {
     ui->web_credentials_disclaimer->setWordWrap(true);
     ui->telemetry_learn_more->setOpenExternalLinks(true);
-    ui->telemetry_learn_more->setText("<a "
-                                      "href='https://citra-emu.org/entry/"
-                                      "telemetry-and-why-thats-a-good-thing/'>Learn more</a>");
+    ui->telemetry_learn_more->setText(tr("<a "
+                                         "href='https://citra-emu.org/entry/"
+                                         "telemetry-and-why-thats-a-good-thing/'>Learn more</a>"));
 
     ui->web_signup_link->setOpenExternalLinks(true);
-    ui->web_signup_link->setText("<a href='https://services.citra-emu.org/'>Sign up</a>");
+    ui->web_signup_link->setText(tr("<a href='https://services.citra-emu.org/'>Sign up</a>"));
     ui->web_token_info_link->setOpenExternalLinks(true);
     ui->web_token_info_link->setText(
-        "<a href='https://citra-emu.org/wiki/citra-web-service/'>What is my token?</a>");
+        tr("<a href='https://citra-emu.org/wiki/citra-web-service/'>What is my token?</a>"));
 
     ui->toggle_telemetry->setChecked(Settings::values.enable_telemetry);
     ui->edit_username->setText(QString::fromStdString(Settings::values.citra_username));
@@ -40,8 +40,8 @@ void ConfigureWeb::setConfiguration() {
     // Connect after setting the values, to avoid calling OnLoginChanged now
     connect(ui->edit_token, &QLineEdit::textChanged, this, &ConfigureWeb::OnLoginChanged);
     connect(ui->edit_username, &QLineEdit::textChanged, this, &ConfigureWeb::OnLoginChanged);
-    ui->label_telemetry_id->setText("Telemetry ID: 0x" +
-                                    QString::number(Core::GetTelemetryId(), 16).toUpper());
+    ui->label_telemetry_id->setText(
+        tr("Telemetry ID: 0x%1").arg(QString::number(Core::GetTelemetryId(), 16).toUpper()));
     user_verified = true;
 }
 
@@ -60,8 +60,8 @@ void ConfigureWeb::applyConfiguration() {
 
 void ConfigureWeb::RefreshTelemetryID() {
     const u64 new_telemetry_id{Core::RegenerateTelemetryId()};
-    ui->label_telemetry_id->setText("Telemetry ID: 0x" +
-                                    QString::number(new_telemetry_id, 16).toUpper());
+    ui->label_telemetry_id->setText(
+        tr("Telemetry ID: 0x%1").arg(QString::number(new_telemetry_id, 16).toUpper()));
 }
 
 void ConfigureWeb::OnLoginChanged() {
diff --git a/src/citra_qt/debugger/graphics/graphics_cmdlists.cpp b/src/citra_qt/debugger/graphics/graphics_cmdlists.cpp
index 7d06ec28a..ce2b9fa50 100644
--- a/src/citra_qt/debugger/graphics/graphics_cmdlists.cpp
+++ b/src/citra_qt/debugger/graphics/graphics_cmdlists.cpp
@@ -26,8 +26,8 @@
 namespace {
 QImage LoadTexture(const u8* src, const Pica::Texture::TextureInfo& info) {
     QImage decoded_image(info.width, info.height, QImage::Format_ARGB32);
-    for (int y = 0; y < info.height; ++y) {
-        for (int x = 0; x < info.width; ++x) {
+    for (u32 y = 0; y < info.height; ++y) {
+        for (u32 x = 0; x < info.width; ++x) {
             Math::Vec4<u8> color = Pica::Texture::LookupTexture(src, x, y, info, true);
             decoded_image.setPixel(x, y, qRgba(color.r(), color.g(), color.b(), color.a()));
         }
diff --git a/src/citra_qt/debugger/graphics/graphics_surface.cpp b/src/citra_qt/debugger/graphics/graphics_surface.cpp
index 47d9924e1..c974545ef 100644
--- a/src/citra_qt/debugger/graphics/graphics_surface.cpp
+++ b/src/citra_qt/debugger/graphics/graphics_surface.cpp
@@ -273,7 +273,8 @@ void GraphicsSurfaceWidget::Pick(int x, int y) {
     surface_picker_x_control->setValue(x);
     surface_picker_y_control->setValue(y);
 
-    if (x < 0 || x >= surface_width || y < 0 || y >= surface_height) {
+    if (x < 0 || x >= static_cast<int>(surface_width) || y < 0 ||
+        y >= static_cast<int>(surface_height)) {
         surface_info_label->setText(tr("Pixel out of bounds"));
         surface_info_label->setAlignment(Qt::AlignLeft | Qt::AlignVCenter);
         return;
diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp
index bad311793..6959915fa 100644
--- a/src/common/string_util.cpp
+++ b/src/common/string_util.cpp
@@ -117,7 +117,7 @@ std::string StringFromFormat(const char* format, ...) {
 }
 
 // For Debugging. Read out an u8 array.
-std::string ArrayToString(const u8* data, u32 size, int line_len, bool spaces) {
+std::string ArrayToString(const u8* data, size_t size, int line_len, bool spaces) {
     std::ostringstream oss;
     oss << std::setfill('0') << std::hex;
 
diff --git a/src/common/string_util.h b/src/common/string_util.h
index 075bf4ecb..259360aec 100644
--- a/src/common/string_util.h
+++ b/src/common/string_util.h
@@ -33,7 +33,7 @@ inline void CharArrayFromFormat(char (&out)[Count], const char* format, ...) {
 }
 
 // Good
-std::string ArrayToString(const u8* data, u32 size, int line_len = 20, bool spaces = true);
+std::string ArrayToString(const u8* data, size_t size, int line_len = 20, bool spaces = true);
 
 std::string StripSpaces(const std::string& s);
 std::string StripQuotes(const std::string& s);
diff --git a/src/common/vector_math.h b/src/common/vector_math.h
index 6e2a5ad60..3f0057d9e 100644
--- a/src/common/vector_math.h
+++ b/src/common/vector_math.h
@@ -31,6 +31,7 @@
 #pragma once
 
 #include <cmath>
+#include <type_traits>
 
 namespace Math {
 
@@ -90,7 +91,8 @@ public:
         y -= other.y;
     }
 
-    Vec2<decltype(-T{})> operator-() const {
+    template <typename U = T>
+    Vec2<std::enable_if_t<std::is_signed<U>::value, U>> operator-() const {
         return MakeVec(-x, -y);
     }
     Vec2<decltype(T{} * T{})> operator*(const Vec2& other) const {
@@ -102,8 +104,7 @@ public:
     }
     template <typename V>
     void operator*=(const V& f) {
-        x *= f;
-        y *= f;
+        *this = *this * f;
     }
     template <typename V>
     Vec2<decltype(T{} / V{})> operator/(const V& f) const {
@@ -247,7 +248,8 @@ public:
         z -= other.z;
     }
 
-    Vec3<decltype(-T{})> operator-() const {
+    template <typename U = T>
+    Vec3<std::enable_if_t<std::is_signed<U>::value, U>> operator-() const {
         return MakeVec(-x, -y, -z);
     }
     Vec3<decltype(T{} * T{})> operator*(const Vec3& other) const {
@@ -259,9 +261,7 @@ public:
     }
     template <typename V>
     void operator*=(const V& f) {
-        x *= f;
-        y *= f;
-        z *= f;
+        *this = *this * f;
     }
     template <typename V>
     Vec3<decltype(T{} / V{})> operator/(const V& f) const {
@@ -462,7 +462,8 @@ public:
         w -= other.w;
     }
 
-    Vec4<decltype(-T{})> operator-() const {
+    template <typename U = T>
+    Vec4<std::enable_if_t<std::is_signed<U>::value, U>> operator-() const {
         return MakeVec(-x, -y, -z, -w);
     }
     Vec4<decltype(T{} * T{})> operator*(const Vec4& other) const {
@@ -474,10 +475,7 @@ public:
     }
     template <typename V>
     void operator*=(const V& f) {
-        x *= f;
-        y *= f;
-        z *= f;
-        w *= f;
+        *this = *this * f;
     }
     template <typename V>
     Vec4<decltype(T{} / V{})> operator/(const V& f) const {
@@ -720,4 +718,4 @@ static inline Vec4<T> MakeVec(const T& x, const Vec3<T>& yzw) {
     return MakeVec(x, yzw[0], yzw[1], yzw[2]);
 }
 
-} // namespace
+} // namespace Math
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index cd1a8de2d..3ed619991 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -26,6 +26,7 @@ set(SRCS
             file_sys/archive_systemsavedata.cpp
             file_sys/disk_archive.cpp
             file_sys/ivfc_archive.cpp
+            file_sys/ncch_container.cpp
             file_sys/path_parser.cpp
             file_sys/savedata_archive.cpp
             frontend/camera/blank_camera.cpp
diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h
index ccd43f431..ba528403c 100644
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -41,6 +41,9 @@ public:
     /// Clear all instruction cache
     virtual void ClearInstructionCache() = 0;
 
+    /// Notify CPU emulation that page tables have changed
+    virtual void PageTableChanged() = 0;
+
     /**
      * Set the Program Counter to an address
      * @param addr Address to set PC to
@@ -122,12 +125,6 @@ public:
     virtual void SetCP15Register(CP15Register reg, u32 value) = 0;
 
     /**
-     * Advance the CPU core by the specified number of ticks (e.g. to simulate CPU execution time)
-     * @param ticks Number of ticks to advance the CPU core
-     */
-    virtual void AddTicks(u64 ticks) = 0;
-
-    /**
      * Saves the current CPU context
      * @param ctx Thread context to save
      */
@@ -147,9 +144,6 @@ public:
         return num_instructions;
     }
 
-    s64 down_count = 0; ///< A decreasing counter of remaining cycles before the next event,
-                        /// decreased by the cpu run loop
-
 protected:
     /**
      * Executes the given number of instructions
diff --git a/src/core/arm/dynarmic/arm_dynarmic.cpp b/src/core/arm/dynarmic/arm_dynarmic.cpp
index 34c5aa381..2cb56d12f 100644
--- a/src/core/arm/dynarmic/arm_dynarmic.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic.cpp
@@ -41,7 +41,7 @@ static bool IsReadOnlyMemory(u32 vaddr) {
 }
 
 static Dynarmic::UserCallbacks GetUserCallbacks(
-    const std::shared_ptr<ARMul_State>& interpeter_state) {
+    const std::shared_ptr<ARMul_State>& interpeter_state, Memory::PageTable* current_page_table) {
     Dynarmic::UserCallbacks user_callbacks{};
     user_callbacks.InterpreterFallback = &InterpreterFallback;
     user_callbacks.user_arg = static_cast<void*>(interpeter_state.get());
@@ -56,16 +56,14 @@ static Dynarmic::UserCallbacks GetUserCallbacks(
     user_callbacks.memory.Write16 = &Memory::Write16;
     user_callbacks.memory.Write32 = &Memory::Write32;
     user_callbacks.memory.Write64 = &Memory::Write64;
-    // TODO(Subv): Re-add the page table pointers once dynarmic supports switching page tables at
-    // runtime.
-    user_callbacks.page_table = nullptr;
+    user_callbacks.page_table = &current_page_table->pointers;
     user_callbacks.coprocessors[15] = std::make_shared<DynarmicCP15>(interpeter_state);
     return user_callbacks;
 }
 
 ARM_Dynarmic::ARM_Dynarmic(PrivilegeMode initial_mode) {
     interpreter_state = std::make_shared<ARMul_State>(initial_mode);
-    jit = std::make_unique<Dynarmic::Jit>(GetUserCallbacks(interpreter_state));
+    PageTableChanged();
 }
 
 void ARM_Dynarmic::SetPC(u32 pc) {
@@ -126,21 +124,15 @@ void ARM_Dynarmic::SetCP15Register(CP15Register reg, u32 value) {
     interpreter_state->CP15[reg] = value;
 }
 
-void ARM_Dynarmic::AddTicks(u64 ticks) {
-    down_count -= ticks;
-    if (down_count < 0) {
-        CoreTiming::Advance();
-    }
-}
-
 MICROPROFILE_DEFINE(ARM_Jit, "ARM JIT", "ARM JIT", MP_RGB(255, 64, 64));
 
 void ARM_Dynarmic::ExecuteInstructions(int num_instructions) {
+    ASSERT(Memory::GetCurrentPageTable() == current_page_table);
     MICROPROFILE_SCOPE(ARM_Jit);
 
     std::size_t ticks_executed = jit->Run(static_cast<unsigned>(num_instructions));
 
-    AddTicks(ticks_executed);
+    CoreTiming::AddTicks(ticks_executed);
 }
 
 void ARM_Dynarmic::SaveContext(ARM_Interface::ThreadContext& ctx) {
@@ -178,3 +170,16 @@ void ARM_Dynarmic::PrepareReschedule() {
 void ARM_Dynarmic::ClearInstructionCache() {
     jit->ClearCache();
 }
+
+void ARM_Dynarmic::PageTableChanged() {
+    current_page_table = Memory::GetCurrentPageTable();
+
+    // One JIT per page table: reuse the cached instance, or build it on first use. Owning it
+    // through the map slot before publishing `jit` keeps the raw pointer from ever dangling.
+    auto& slot = jits[current_page_table];
+    if (!slot) {
+        slot = std::make_unique<Dynarmic::Jit>(
+            GetUserCallbacks(interpreter_state, current_page_table));
+    }
+    jit = slot.get();
+}
diff --git a/src/core/arm/dynarmic/arm_dynarmic.h b/src/core/arm/dynarmic/arm_dynarmic.h
index 834dc989e..0b00158a5 100644
--- a/src/core/arm/dynarmic/arm_dynarmic.h
+++ b/src/core/arm/dynarmic/arm_dynarmic.h
@@ -4,12 +4,17 @@
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <dynarmic/dynarmic.h>
 #include "common/common_types.h"
 #include "core/arm/arm_interface.h"
 #include "core/arm/skyeye_common/armstate.h"
 
+namespace Memory {
+struct PageTable;
+} // namespace Memory
+
 class ARM_Dynarmic final : public ARM_Interface {
 public:
     ARM_Dynarmic(PrivilegeMode initial_mode);
@@ -27,8 +32,6 @@ public:
     u32 GetCP15Register(CP15Register reg) override;
     void SetCP15Register(CP15Register reg, u32 value) override;
 
-    void AddTicks(u64 ticks) override;
-
     void SaveContext(ThreadContext& ctx) override;
     void LoadContext(const ThreadContext& ctx) override;
 
@@ -36,8 +39,11 @@ public:
     void ExecuteInstructions(int num_instructions) override;
 
     void ClearInstructionCache() override;
+    void PageTableChanged() override;
 
 private:
-    std::unique_ptr<Dynarmic::Jit> jit;
+    Dynarmic::Jit* jit = nullptr;
+    Memory::PageTable* current_page_table = nullptr;
+    std::map<Memory::PageTable*, std::unique_ptr<Dynarmic::Jit>> jits;
     std::shared_ptr<ARMul_State> interpreter_state;
 };
diff --git a/src/core/arm/dyncom/arm_dyncom.cpp b/src/core/arm/dyncom/arm_dyncom.cpp
index 81f9bf99e..4d72aef77 100644
--- a/src/core/arm/dyncom/arm_dyncom.cpp
+++ b/src/core/arm/dyncom/arm_dyncom.cpp
@@ -25,6 +25,10 @@ void ARM_DynCom::ClearInstructionCache() {
     trans_cache_buf_top = 0;
 }
 
+void ARM_DynCom::PageTableChanged() {
+    ClearInstructionCache();
+}
+
 void ARM_DynCom::SetPC(u32 pc) {
     state->Reg[15] = pc;
 }
@@ -73,12 +77,6 @@ void ARM_DynCom::SetCP15Register(CP15Register reg, u32 value) {
     state->CP15[reg] = value;
 }
 
-void ARM_DynCom::AddTicks(u64 ticks) {
-    down_count -= ticks;
-    if (down_count < 0)
-        CoreTiming::Advance();
-}
-
 void ARM_DynCom::ExecuteInstructions(int num_instructions) {
     state->NumInstrsToExecute = num_instructions;
 
@@ -86,7 +84,7 @@ void ARM_DynCom::ExecuteInstructions(int num_instructions) {
     // executing one instruction at a time. Otherwise, if a block is being executed, more
     // instructions may actually be executed than specified.
     unsigned ticks_executed = InterpreterMainLoop(state.get());
-    AddTicks(ticks_executed);
+    CoreTiming::AddTicks(ticks_executed);
 }
 
 void ARM_DynCom::SaveContext(ThreadContext& ctx) {
diff --git a/src/core/arm/dyncom/arm_dyncom.h b/src/core/arm/dyncom/arm_dyncom.h
index 62c174f3c..fc1ffed6a 100644
--- a/src/core/arm/dyncom/arm_dyncom.h
+++ b/src/core/arm/dyncom/arm_dyncom.h
@@ -16,6 +16,7 @@ public:
     ~ARM_DynCom();
 
     void ClearInstructionCache() override;
+    void PageTableChanged() override;
 
     void SetPC(u32 pc) override;
     u32 GetPC() const override;
@@ -30,8 +31,6 @@ public:
     u32 GetCP15Register(CP15Register reg) override;
     void SetCP15Register(CP15Register reg, u32 value) override;
 
-    void AddTicks(u64 ticks) override;
-
     void SaveContext(ThreadContext& ctx) override;
     void LoadContext(const ThreadContext& ctx) override;
 
diff --git a/src/core/core.cpp b/src/core/core.cpp
index 59b8768e7..0c7a72987 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -13,6 +13,7 @@
 #include "core/core_timing.h"
 #include "core/gdbstub/gdbstub.h"
 #include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/process.h"
 #include "core/hle/kernel/thread.h"
 #include "core/hle/service/service.h"
 #include "core/hw/hw.h"
@@ -100,7 +101,7 @@ System::ResultStatus System::Load(EmuWindow* emu_window, const std::string& file
         return init_result;
     }
 
-    const Loader::ResultStatus load_result{app_loader->Load()};
+    const Loader::ResultStatus load_result{app_loader->Load(Kernel::g_current_process)};
     if (Loader::ResultStatus::Success != load_result) {
         LOG_CRITICAL(Core, "Failed to load ROM (Error %i)!", load_result);
         System::Shutdown();
@@ -114,6 +115,7 @@ System::ResultStatus System::Load(EmuWindow* emu_window, const std::string& file
             return ResultStatus::ErrorLoader;
         }
     }
+    Memory::SetCurrentPageTable(&Kernel::g_current_process->vm_manager.page_table);
     status = ResultStatus::Success;
     return status;
 }
@@ -196,4 +198,4 @@ void System::Shutdown() {
     LOG_DEBUG(Core, "Shutdown OK");
 }
 
-} // namespace
+} // namespace Core
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index 276ecfdf6..5e2a5d00f 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -57,6 +57,9 @@ static s64 idled_cycles;
 static s64 last_global_time_ticks;
 static s64 last_global_time_us;
 
+static s64 down_count = 0; ///< A decreasing counter of remaining cycles before the next event,
+                           /// decreased by the cpu run loop
+
 static std::recursive_mutex external_event_section;
 
 // Warning: not included in save state.
@@ -146,7 +149,7 @@ void UnregisterAllEvents() {
 }
 
 void Init() {
-    Core::CPU().down_count = INITIAL_SLICE_LENGTH;
+    down_count = INITIAL_SLICE_LENGTH;
     g_slice_length = INITIAL_SLICE_LENGTH;
     global_timer = 0;
     idled_cycles = 0;
@@ -185,8 +188,15 @@ void Shutdown() {
     }
 }
 
+void AddTicks(u64 ticks) {
+    // Charge the executed ticks to the current slice; call Advance() once it goes negative.
+    down_count -= ticks;
+    if (down_count < 0)
+        Advance();
+}
+
 u64 GetTicks() {
-    return (u64)global_timer + g_slice_length - Core::CPU().down_count;
+    return (u64)global_timer + g_slice_length - down_count;
 }
 
 u64 GetIdleTicks() {
@@ -460,18 +470,18 @@ void MoveEvents() {
 }
 
 void ForceCheck() {
-    s64 cycles_executed = g_slice_length - Core::CPU().down_count;
+    s64 cycles_executed = g_slice_length - down_count;
     global_timer += cycles_executed;
     // This will cause us to check for new events immediately.
-    Core::CPU().down_count = 0;
+    down_count = 0;
     // But let's not eat a bunch more time in Advance() because of this.
     g_slice_length = 0;
 }
 
 void Advance() {
-    s64 cycles_executed = g_slice_length - Core::CPU().down_count;
+    s64 cycles_executed = g_slice_length - down_count;
     global_timer += cycles_executed;
-    Core::CPU().down_count = g_slice_length;
+    down_count = g_slice_length;
 
     if (has_ts_events)
         MoveEvents();
@@ -480,7 +490,7 @@ void Advance() {
     if (!first) {
         if (g_slice_length < 10000) {
             g_slice_length += 10000;
-            Core::CPU().down_count += g_slice_length;
+            down_count += g_slice_length;
         }
     } else {
         // Note that events can eat cycles as well.
@@ -490,7 +500,7 @@ void Advance() {
 
         const int diff = target - g_slice_length;
         g_slice_length += diff;
-        Core::CPU().down_count += diff;
+        down_count += diff;
     }
     if (advance_callback)
         advance_callback(static_cast<int>(cycles_executed));
@@ -506,12 +516,12 @@ void LogPendingEvents() {
 }
 
 void Idle(int max_idle) {
-    s64 cycles_down = Core::CPU().down_count;
+    s64 cycles_down = down_count;
     if (max_idle != 0 && cycles_down > max_idle)
         cycles_down = max_idle;
 
     if (first && cycles_down > 0) {
-        s64 cycles_executed = g_slice_length - Core::CPU().down_count;
+        s64 cycles_executed = g_slice_length - down_count;
         s64 cycles_next_event = first->time - global_timer;
 
         if (cycles_next_event < cycles_executed + cycles_down) {
@@ -526,9 +536,9 @@ void Idle(int max_idle) {
               cycles_down / (float)(g_clock_rate_arm11 * 0.001f));
 
     idled_cycles += cycles_down;
-    Core::CPU().down_count -= cycles_down;
-    if (Core::CPU().down_count == 0)
-        Core::CPU().down_count = -1;
+    down_count -= cycles_down;
+    if (down_count == 0)
+        down_count = -1;
 }
 
 std::string GetScheduledEventsSummary() {
diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index d2f85cd4d..897350801 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -67,6 +67,12 @@ void Shutdown();
 typedef void (*MHzChangeCallback)();
 typedef std::function<void(u64 userdata, int cycles_late)> TimedCallback;
 
+/**
+ * Advance the CPU core by the specified number of ticks (e.g. to simulate CPU execution time)
+ * @param ticks Number of ticks to advance the CPU core
+ */
+void AddTicks(u64 ticks);
+
 u64 GetTicks();
 u64 GetIdleTicks();
 u64 GetGlobalTimeUs();
diff --git a/src/core/file_sys/archive_selfncch.cpp b/src/core/file_sys/archive_selfncch.cpp
index 298a37a44..a16941c70 100644
--- a/src/core/file_sys/archive_selfncch.cpp
+++ b/src/core/file_sys/archive_selfncch.cpp
@@ -3,12 +3,14 @@
 // Refer to the license.txt file included.
 
 #include <array>
+#include <cinttypes>
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "common/swap.h"
 #include "core/file_sys/archive_selfncch.h"
 #include "core/file_sys/errors.h"
 #include "core/file_sys/ivfc_archive.h"
+#include "core/hle/kernel/process.h"
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // FileSys namespace
@@ -102,8 +104,7 @@ public:
 
         switch (static_cast<SelfNCCHFilePathType>(file_path.type)) {
         case SelfNCCHFilePathType::UpdateRomFS:
-            LOG_WARNING(Service_FS, "(STUBBED) open update RomFS");
-            return OpenRomFS();
+            return OpenUpdateRomFS();
 
         case SelfNCCHFilePathType::RomFS:
             return OpenRomFS();
@@ -179,6 +180,17 @@ private:
         }
     }
 
+    /// Wraps the update RomFS in an IVFCFile; yields ERROR_ROMFS_NOT_FOUND when none is present.
+    ResultVal<std::unique_ptr<FileBackend>> OpenUpdateRomFS() const {
+        if (!ncch_data.update_romfs_file) {
+            LOG_INFO(Service_FS, "Unable to read update RomFS");
+            return ERROR_ROMFS_NOT_FOUND;
+        }
+        return MakeResult<std::unique_ptr<FileBackend>>(std::make_unique<IVFCFile>(
+            ncch_data.update_romfs_file, ncch_data.update_romfs_offset,
+            ncch_data.update_romfs_size));
+    }
+
     ResultVal<std::unique_ptr<FileBackend>> OpenExeFS(const std::string& filename) const {
         if (filename == "icon") {
             if (ncch_data.icon) {
@@ -217,30 +229,57 @@ private:
     NCCHData ncch_data;
 };
 
-ArchiveFactory_SelfNCCH::ArchiveFactory_SelfNCCH(Loader::AppLoader& app_loader) {
+void ArchiveFactory_SelfNCCH::Register(Loader::AppLoader& app_loader) {
+    u64 program_id = 0;
+    if (app_loader.ReadProgramId(program_id) != Loader::ResultStatus::Success) {
+        LOG_WARNING(
+            Service_FS,
+            "Could not read program id when registering with SelfNCCH, this might be a 3dsx file");
+    }
+
+    LOG_DEBUG(Service_FS, "Registering program %016" PRIX64 " with the SelfNCCH archive factory",
+              program_id);
+
+    if (ncch_data.find(program_id) != ncch_data.end()) {
+        LOG_WARNING(Service_FS, "Registering program %016" PRIX64
+                                " with SelfNCCH will override existing mapping",
+                    program_id);
+    }
+
+    NCCHData& data = ncch_data[program_id];
+
     std::shared_ptr<FileUtil::IOFile> romfs_file_;
     if (Loader::ResultStatus::Success ==
-        app_loader.ReadRomFS(romfs_file_, ncch_data.romfs_offset, ncch_data.romfs_size)) {
+        app_loader.ReadRomFS(romfs_file_, data.romfs_offset, data.romfs_size)) {
+
+        data.romfs_file = std::move(romfs_file_);
+    }
+
+    std::shared_ptr<FileUtil::IOFile> update_romfs_file;
+    if (Loader::ResultStatus::Success ==
+        app_loader.ReadUpdateRomFS(update_romfs_file, data.update_romfs_offset,
+                                   data.update_romfs_size)) {
 
-        ncch_data.romfs_file = std::move(romfs_file_);
+        data.update_romfs_file = std::move(update_romfs_file);
     }
 
     std::vector<u8> buffer;
 
     if (Loader::ResultStatus::Success == app_loader.ReadIcon(buffer))
-        ncch_data.icon = std::make_shared<std::vector<u8>>(std::move(buffer));
+        data.icon = std::make_shared<std::vector<u8>>(std::move(buffer));
 
     buffer.clear();
     if (Loader::ResultStatus::Success == app_loader.ReadLogo(buffer))
-        ncch_data.logo = std::make_shared<std::vector<u8>>(std::move(buffer));
+        data.logo = std::make_shared<std::vector<u8>>(std::move(buffer));
 
     buffer.clear();
     if (Loader::ResultStatus::Success == app_loader.ReadBanner(buffer))
-        ncch_data.banner = std::make_shared<std::vector<u8>>(std::move(buffer));
+        data.banner = std::make_shared<std::vector<u8>>(std::move(buffer));
 }
 
 ResultVal<std::unique_ptr<ArchiveBackend>> ArchiveFactory_SelfNCCH::Open(const Path& path) {
-    auto archive = std::make_unique<SelfNCCHArchive>(ncch_data);
+    auto archive = std::make_unique<SelfNCCHArchive>(
+        ncch_data[Kernel::g_current_process->codeset->program_id]);
     return MakeResult<std::unique_ptr<ArchiveBackend>>(std::move(archive));
 }
 
diff --git a/src/core/file_sys/archive_selfncch.h b/src/core/file_sys/archive_selfncch.h
index f1b971296..0d6d6766e 100644
--- a/src/core/file_sys/archive_selfncch.h
+++ b/src/core/file_sys/archive_selfncch.h
@@ -6,6 +6,7 @@
 
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "common/common_types.h"
 #include "core/file_sys/archive_backend.h"
@@ -24,12 +25,19 @@ struct NCCHData {
     std::shared_ptr<FileUtil::IOFile> romfs_file;
     u64 romfs_offset = 0;
     u64 romfs_size = 0;
+
+    std::shared_ptr<FileUtil::IOFile> update_romfs_file;
+    u64 update_romfs_offset = 0;
+    u64 update_romfs_size = 0;
 };
 
 /// File system interface to the SelfNCCH archive
 class ArchiveFactory_SelfNCCH final : public ArchiveFactory {
 public:
-    explicit ArchiveFactory_SelfNCCH(Loader::AppLoader& app_loader);
+    ArchiveFactory_SelfNCCH() = default;
+
+    /// Registers a loaded application so that we can open its SelfNCCH archive when requested.
+    void Register(Loader::AppLoader& app_loader);
 
     std::string GetName() const override {
         return "SelfNCCH";
@@ -39,7 +47,8 @@ public:
     ResultVal<ArchiveFormatInfo> GetFormatInfo(const Path& path) const override;
 
 private:
-    NCCHData ncch_data;
+    /// Mapping of ProgramId -> NCCHData
+    std::unordered_map<u64, NCCHData> ncch_data;
 };
 
 } // namespace FileSys
diff --git a/src/core/file_sys/ncch_container.cpp b/src/core/file_sys/ncch_container.cpp
new file mode 100644
index 000000000..59c72f3e9
--- /dev/null
+++ b/src/core/file_sys/ncch_container.cpp
@@ -0,0 +1,316 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cinttypes>
+#include <cstring>
+#include <memory>
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "core/file_sys/ncch_container.h"
+#include "core/loader/loader.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// FileSys namespace
+
+namespace FileSys {
+
+static const int kMaxSections = 8;   ///< Maximum number of sections (files) in an ExeFS
+static const int kBlockSize = 0x200; ///< Size of ExeFS blocks (in bytes)
+
+/**
+ * Get the decompressed size of an LZSS compressed ExeFS file
+ * @param buffer Buffer of compressed file; its final 4 bytes are read as a u32 size delta
+ * @param size Size of compressed buffer in bytes (the 4 bytes before buffer + size are read)
+ * @return Size of decompressed buffer: the u32 read from the end of the buffer plus size
+ */
+static u32 LZSS_GetDecompressedSize(const u8* buffer, u32 size) {
+    u32 offset_size = *(u32*)(buffer + size - 4);
+    return offset_size + size;
+}
+
+/**
+ * Decompress ExeFS file (compressed with LZSS)
+ * @param compressed Compressed buffer
+ * @param compressed_size Size of compressed buffer
+ * @param decompressed Decompressed buffer
+ * @param decompressed_size Size of decompressed buffer
+ * @return True on success, false if the input is malformed or would be accessed out of bounds
+ */
+static bool LZSS_Decompress(const u8* compressed, u32 compressed_size, u8* decompressed,
+                            u32 decompressed_size) {
+    // The 8-byte footer must exist, and the input is copied verbatim into the output first.
+    if (compressed_size < 8 || decompressed_size < compressed_size)
+        return false;
+    const u8* footer = compressed + compressed_size - 8;
+    u32 buffer_top_and_bottom = *reinterpret_cast<const u32*>(footer);
+    u32 out = decompressed_size;
+    u32 index = compressed_size - ((buffer_top_and_bottom >> 24) & 0xFF);
+    u32 stop_index = compressed_size - (buffer_top_and_bottom & 0xFFFFFF);
+    if (index > compressed_size || stop_index > compressed_size)
+        return false; // A footer length exceeds the input, so the subtraction wrapped around
+
+    memset(decompressed, 0, decompressed_size);
+    memcpy(decompressed, compressed, compressed_size);
+
+    while (index > stop_index) {
+        u8 control = compressed[--index];
+
+        for (unsigned i = 0; i < 8; i++) {
+            if (index <= stop_index || out == 0)
+                break;
+
+            if (control & 0x80) {
+                // Check if compression is out of bounds
+                if (index < 2)
+                    return false;
+                index -= 2;
+
+                u32 segment_offset = compressed[index] | (compressed[index + 1] << 8);
+                u32 segment_size = ((segment_offset >> 12) & 15) + 3;
+                segment_offset &= 0x0FFF;
+                segment_offset += 2;
+
+                // Check if compression is out of bounds
+                if (out < segment_size)
+                    return false;
+
+                for (unsigned j = 0; j < segment_size; j++) {
+                    // Check if compression is out of bounds
+                    if (out + segment_offset >= decompressed_size)
+                        return false;
+                    u8 data = decompressed[out + segment_offset];
+                    decompressed[--out] = data;
+                }
+            } else {
+                // Check if compression is out of bounds
+                if (out < 1)
+                    return false;
+                decompressed[--out] = compressed[--index];
+            }
+            control <<= 1;
+        }
+    }
+    return true;
+}
+
+NCCHContainer::NCCHContainer(const std::string& filepath) : filepath(filepath) {
+    file = FileUtil::IOFile(filepath, "rb");
+}
+
+Loader::ResultStatus NCCHContainer::OpenFile(const std::string& filepath) {
+    this->filepath = filepath;
+    file = FileUtil::IOFile(filepath, "rb");
+
+    if (!file.IsOpen()) {
+        LOG_WARNING(Service_FS, "Failed to open %s", filepath.c_str());
+        return Loader::ResultStatus::Error;
+    }
+
+    LOG_DEBUG(Service_FS, "Opened %s", filepath.c_str());
+    return Loader::ResultStatus::Success;
+}
+
+Loader::ResultStatus NCCHContainer::Load() {
+    if (is_loaded)
+        return Loader::ResultStatus::Success;
+
+    // Reset read pointer in case this file has been read before.
+    file.Seek(0, SEEK_SET);
+    if (file.ReadBytes(&ncch_header, sizeof(NCCH_Header)) != sizeof(NCCH_Header))
+        return Loader::ResultStatus::Error;
+
+    // Skip NCSD header and load first NCCH (NCSD is just a container of NCCH files)...
+    if (Loader::MakeMagic('N', 'C', 'S', 'D') == ncch_header.magic) {
+        LOG_DEBUG(Service_FS, "Only loading the first (bootable) NCCH within the NCSD file!");
+        ncch_offset = 0x4000;
+        file.Seek(ncch_offset, SEEK_SET);
+        if (file.ReadBytes(&ncch_header, sizeof(NCCH_Header)) != sizeof(NCCH_Header))
+            return Loader::ResultStatus::Error; // Truncated image: no complete NCCH header
+    }
+
+    // Verify we are loading the correct file type...
+    if (Loader::MakeMagic('N', 'C', 'C', 'H') != ncch_header.magic)
+        return Loader::ResultStatus::ErrorInvalidFormat;
+
+    // System archives and DLC don't have an extended header but have RomFS
+    if (ncch_header.extended_header_size) {
+        if (file.ReadBytes(&exheader_header, sizeof(ExHeader_Header)) != sizeof(ExHeader_Header))
+            return Loader::ResultStatus::Error;
+
+        is_compressed = (exheader_header.codeset_info.flags.flag & 1) == 1;
+        u32 entry_point = exheader_header.codeset_info.text.address;
+        u32 code_size = exheader_header.codeset_info.text.code_size;
+        u32 stack_size = exheader_header.codeset_info.stack_size;
+        u32 bss_size = exheader_header.codeset_info.bss_size;
+        u32 core_version = exheader_header.arm11_system_local_caps.core_version;
+        u8 priority = exheader_header.arm11_system_local_caps.priority;
+        u8 resource_limit_category =
+            exheader_header.arm11_system_local_caps.resource_limit_category;
+
+        LOG_DEBUG(Service_FS, "Name:                        %s", exheader_header.codeset_info.name);
+        LOG_DEBUG(Service_FS, "Program ID:                  %016" PRIX64, ncch_header.program_id);
+        LOG_DEBUG(Service_FS, "Code compressed:             %s", is_compressed ? "yes" : "no");
+        LOG_DEBUG(Service_FS, "Entry point:                 0x%08X", entry_point);
+        LOG_DEBUG(Service_FS, "Code size:                   0x%08X", code_size);
+        LOG_DEBUG(Service_FS, "Stack size:                  0x%08X", stack_size);
+        LOG_DEBUG(Service_FS, "Bss size:                    0x%08X", bss_size);
+        LOG_DEBUG(Service_FS, "Core version:                %d", core_version);
+        LOG_DEBUG(Service_FS, "Thread priority:             0x%X", priority);
+        LOG_DEBUG(Service_FS, "Resource limit category:     %d", resource_limit_category);
+        LOG_DEBUG(Service_FS, "System Mode:                 %d",
+                  static_cast<int>(exheader_header.arm11_system_local_caps.system_mode));
+
+        if (exheader_header.system_info.jump_id != ncch_header.program_id) {
+            LOG_ERROR(Service_FS, "ExHeader Program ID mismatch: the ROM is probably encrypted.");
+            return Loader::ResultStatus::ErrorEncrypted;
+        }
+
+        has_exheader = true;
+    }
+
+    // DLC can have an ExeFS and a RomFS but no extended header
+    if (ncch_header.exefs_size) {
+        exefs_offset = ncch_header.exefs_offset * kBlockSize;
+        u32 exefs_size = ncch_header.exefs_size * kBlockSize;
+
+        LOG_DEBUG(Service_FS, "ExeFS offset:                0x%08X", exefs_offset);
+        LOG_DEBUG(Service_FS, "ExeFS size:                  0x%08X", exefs_size);
+
+        file.Seek(exefs_offset + ncch_offset, SEEK_SET);
+        if (file.ReadBytes(&exefs_header, sizeof(ExeFs_Header)) != sizeof(ExeFs_Header))
+            return Loader::ResultStatus::Error;
+
+        has_exefs = true;
+    }
+
+    if (ncch_header.romfs_offset != 0 && ncch_header.romfs_size != 0)
+        has_romfs = true;
+
+    is_loaded = true;
+    return Loader::ResultStatus::Success;
+}
+
+Loader::ResultStatus NCCHContainer::LoadSectionExeFS(const char* name, std::vector<u8>& buffer) {
+    if (!file.IsOpen())
+        return Loader::ResultStatus::Error;
+
+    Loader::ResultStatus result = Load();
+    if (result != Loader::ResultStatus::Success)
+        return result;
+
+    if (!has_exefs)
+        return Loader::ResultStatus::ErrorNotUsed;
+
+    LOG_DEBUG(Service_FS, "%d sections:", kMaxSections);
+    // Iterate through the ExeFs archive until we find a section with the specified name...
+    for (int section_number = 0; section_number < kMaxSections; section_number++) {
+        const auto& section = exefs_header.section[section_number];
+        // section.name is a fixed 8-byte field with no terminator when all 8 bytes are used.
+        if (strncmp(section.name, name, sizeof(section.name)) == 0) {
+            LOG_DEBUG(Service_FS, "%d - offset: 0x%08X, size: 0x%08X, name: %s", section_number,
+                      section.offset, section.size, name);
+
+            s64 section_offset =
+                (section.offset + exefs_offset + sizeof(ExeFs_Header) + ncch_offset);
+            file.Seek(section_offset, SEEK_SET);
+
+            if (strcmp(name, ".code") == 0 && is_compressed) {
+                // The LZSS footer occupies the last 8 bytes of a compressed section.
+                if (section.size < 8)
+                    return Loader::ResultStatus::ErrorInvalidFormat;
+                std::unique_ptr<u8[]> temp_buffer;
+                try {
+                    temp_buffer.reset(new u8[section.size]);
+                } catch (std::bad_alloc&) {
+                    return Loader::ResultStatus::ErrorMemoryAllocationFailed;
+                }
+                if (file.ReadBytes(&temp_buffer[0], section.size) != section.size)
+                    return Loader::ResultStatus::Error;
+
+                // Decompress .code section...
+                u32 decompressed_size = LZSS_GetDecompressedSize(&temp_buffer[0], section.size);
+                buffer.resize(decompressed_size);
+                if (!LZSS_Decompress(&temp_buffer[0], section.size, &buffer[0], decompressed_size))
+                    return Loader::ResultStatus::ErrorInvalidFormat;
+            } else {
+                // Section is uncompressed; data() stays valid even when the section is empty.
+                buffer.resize(section.size);
+                if (file.ReadBytes(buffer.data(), section.size) != section.size)
+                    return Loader::ResultStatus::Error;
+            }
+            return Loader::ResultStatus::Success;
+        }
+    }
+    return Loader::ResultStatus::ErrorNotUsed;
+}
+
+Loader::ResultStatus NCCHContainer::ReadRomFS(std::shared_ptr<FileUtil::IOFile>& romfs_file,
+                                              u64& offset, u64& size) {
+    if (!file.IsOpen())
+        return Loader::ResultStatus::Error;
+
+    Loader::ResultStatus result = Load();
+    if (result != Loader::ResultStatus::Success)
+        return result;
+
+    if (!has_romfs) {
+        LOG_DEBUG(Service_FS, "RomFS requested from NCCH which has no RomFS");
+        return Loader::ResultStatus::ErrorNotUsed;
+    }
+
+    // 64-bit math: u32 wraps on large images. The region's first 0x1000 bytes are skipped.
+    const u64 region_size = static_cast<u64>(ncch_header.romfs_size) * kBlockSize;
+    const u64 romfs_offset =
+        ncch_offset + static_cast<u64>(ncch_header.romfs_offset) * kBlockSize + 0x1000;
+    const u64 romfs_size = region_size - 0x1000;
+    LOG_DEBUG(Service_FS, "RomFS offset:           0x%016" PRIX64, romfs_offset);
+    LOG_DEBUG(Service_FS, "RomFS size:             0x%016" PRIX64, romfs_size);
+
+    if (region_size < 0x1000 || file.GetSize() < romfs_offset + romfs_size)
+        return Loader::ResultStatus::Error;
+
+    // Reopen the file so the returned handle's read position is independent of `file`.
+    romfs_file = std::make_shared<FileUtil::IOFile>(filepath, "rb");
+    if (!romfs_file->IsOpen())
+        return Loader::ResultStatus::Error;
+    offset = romfs_offset;
+    size = romfs_size;
+    return Loader::ResultStatus::Success;
+}
+
+Loader::ResultStatus NCCHContainer::ReadProgramId(u64_le& program_id) {
+    Loader::ResultStatus result = Load();
+    if (result != Loader::ResultStatus::Success)
+        return result;
+
+    program_id = ncch_header.program_id;
+    return Loader::ResultStatus::Success;
+}
+
+bool NCCHContainer::HasExeFS() {
+    Loader::ResultStatus result = Load();
+    if (result != Loader::ResultStatus::Success)
+        return false;
+
+    return has_exefs;
+}
+
+bool NCCHContainer::HasRomFS() {
+    Loader::ResultStatus result = Load();
+    if (result != Loader::ResultStatus::Success)
+        return false;
+
+    return has_romfs;
+}
+
+bool NCCHContainer::HasExHeader() {
+    Loader::ResultStatus result = Load();
+    if (result != Loader::ResultStatus::Success)
+        return false;
+
+    return has_exheader;
+}
+
+} // namespace FileSys
diff --git a/src/core/file_sys/ncch_container.h b/src/core/file_sys/ncch_container.h
new file mode 100644
index 000000000..8af9032b4
--- /dev/null
+++ b/src/core/file_sys/ncch_container.h
@@ -0,0 +1,244 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "common/file_util.h"
+#include "common/swap.h"
+#include "core/core.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// NCCH header (Note: "NCCH" appears to be a publicly unknown acronym)
+
+struct NCCH_Header {
+    u8 signature[0x100];
+    u32_le magic;
+    u32_le content_size;
+    u8 partition_id[8];
+    u16_le maker_code;
+    u16_le version;
+    u8 reserved_0[4];
+    u64_le program_id;
+    u8 reserved_1[0x10];
+    u8 logo_region_hash[0x20];
+    u8 product_code[0x10];
+    u8 extended_header_hash[0x20];
+    u32_le extended_header_size;
+    u8 reserved_2[4];
+    u8 flags[8];
+    u32_le plain_region_offset;
+    u32_le plain_region_size;
+    u32_le logo_region_offset;
+    u32_le logo_region_size;
+    u32_le exefs_offset;
+    u32_le exefs_size;
+    u32_le exefs_hash_region_size;
+    u8 reserved_3[4];
+    u32_le romfs_offset;
+    u32_le romfs_size;
+    u32_le romfs_hash_region_size;
+    u8 reserved_4[4];
+    u8 exefs_super_block_hash[0x20];
+    u8 romfs_super_block_hash[0x20];
+};
+
+static_assert(sizeof(NCCH_Header) == 0x200, "NCCH header structure size is wrong");
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// ExeFS (executable file system) headers
+
+struct ExeFs_SectionHeader {
+    char name[8];
+    u32 offset;
+    u32 size;
+};
+
+struct ExeFs_Header {
+    ExeFs_SectionHeader section[8];
+    u8 reserved[0x80];
+    u8 hashes[8][0x20];
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// ExHeader (NCCH extended header) structures
+
+struct ExHeader_SystemInfoFlags {
+    u8 reserved[5];
+    u8 flag;
+    u8 remaster_version[2];
+};
+
+struct ExHeader_CodeSegmentInfo {
+    u32 address;
+    u32 num_max_pages;
+    u32 code_size;
+};
+
+struct ExHeader_CodeSetInfo {
+    u8 name[8];
+    ExHeader_SystemInfoFlags flags;
+    ExHeader_CodeSegmentInfo text;
+    u32 stack_size;
+    ExHeader_CodeSegmentInfo ro;
+    u8 reserved[4];
+    ExHeader_CodeSegmentInfo data;
+    u32 bss_size;
+};
+
+struct ExHeader_DependencyList {
+    u8 program_id[0x30][8];
+};
+
+struct ExHeader_SystemInfo {
+    u64 save_data_size;
+    u64_le jump_id;
+    u8 reserved_2[0x30];
+};
+
+struct ExHeader_StorageInfo {
+    u8 ext_save_data_id[8];
+    u8 system_save_data_id[8];
+    u8 reserved[8];
+    u8 access_info[7];
+    u8 other_attributes;
+};
+
+struct ExHeader_ARM11_SystemLocalCaps {
+    u64_le program_id;
+    u32_le core_version;
+    u8 reserved_flags[2];
+    union {
+        u8 flags0;
+        BitField<0, 2, u8> ideal_processor;
+        BitField<2, 2, u8> affinity_mask;
+        BitField<4, 4, u8> system_mode;
+    };
+    u8 priority;
+    u8 resource_limit_descriptor[0x10][2];
+    ExHeader_StorageInfo storage_info;
+    u8 service_access_control[0x20][8];
+    u8 ex_service_access_control[0x2][8];
+    u8 reserved[0xf];
+    u8 resource_limit_category;
+};
+
+struct ExHeader_ARM11_KernelCaps {
+    u32_le descriptors[28];
+    u8 reserved[0x10];
+};
+
+struct ExHeader_ARM9_AccessControl {
+    u8 descriptors[15];
+    u8 descversion;
+};
+
+struct ExHeader_Header {
+    ExHeader_CodeSetInfo codeset_info;
+    ExHeader_DependencyList dependency_list;
+    ExHeader_SystemInfo system_info;
+    ExHeader_ARM11_SystemLocalCaps arm11_system_local_caps;
+    ExHeader_ARM11_KernelCaps arm11_kernel_caps;
+    ExHeader_ARM9_AccessControl arm9_access_control;
+    struct {
+        u8 signature[0x100];
+        u8 ncch_public_key_modulus[0x100];
+        ExHeader_ARM11_SystemLocalCaps arm11_system_local_caps;
+        ExHeader_ARM11_KernelCaps arm11_kernel_caps;
+        ExHeader_ARM9_AccessControl arm9_access_control;
+    } access_desc;
+};
+
+static_assert(sizeof(ExHeader_Header) == 0x800, "ExHeader structure size is wrong");
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// FileSys namespace
+
+namespace FileSys {
+
+/**
+ * Helper which implements an interface to deal with NCCH containers which can
+ * contain ExeFS archives or RomFS archives for games or other applications.
+ */
+class NCCHContainer {
+public:
+    NCCHContainer(const std::string& filepath);
+    NCCHContainer() {}
+
+    Loader::ResultStatus OpenFile(const std::string& filepath);
+
+    /**
+     * Ensure ExeFS and exheader are loaded and ready for reading sections
+     * @return ResultStatus result of function
+     */
+    Loader::ResultStatus Load();
+
+    /**
+     * Reads an application ExeFS section of an NCCH file (e.g. .code, .logo, etc.)
+     * @param name Name of section to read out of NCCH file
+     * @param buffer Vector to read data into
+     * @return ResultStatus result of function
+     */
+    Loader::ResultStatus LoadSectionExeFS(const char* name, std::vector<u8>& buffer);
+
+    /**
+     * Get the RomFS of the NCCH container
+     * Since the RomFS can be huge, we return a file reference instead of copying to a buffer
+     * @param romfs_file The file containing the RomFS
+     * @param offset The offset the romfs begins on
+     * @param size The size of the romfs
+     * @return ResultStatus result of function
+     */
+    Loader::ResultStatus ReadRomFS(std::shared_ptr<FileUtil::IOFile>& romfs_file, u64& offset,
+                                   u64& size);
+
+    /**
+     * Get the Program ID of the NCCH container (written to program_id on success)
+     * @return ResultStatus result of function
+     */
+    Loader::ResultStatus ReadProgramId(u64_le& program_id);
+
+    /**
+     * Checks whether the NCCH container contains an ExeFS
+     * @return bool check result
+     */
+    bool HasExeFS();
+
+    /**
+     * Checks whether the NCCH container contains a RomFS
+     * @return bool check result
+     */
+    bool HasRomFS();
+
+    /**
+     * Checks whether the NCCH container contains an ExHeader
+     * @return bool check result
+     */
+    bool HasExHeader();
+
+    NCCH_Header ncch_header;
+    ExeFs_Header exefs_header;
+    ExHeader_Header exheader_header;
+
+private:
+    bool has_exheader = false;
+    bool has_exefs = false;
+    bool has_romfs = false;
+
+    bool is_loaded = false;
+    bool is_compressed = false;
+
+    u32 ncch_offset = 0; // Offset to NCCH header, can be 0 or after NCSD header
+    u32 exefs_offset = 0;
+
+    std::string filepath;
+    FileUtil::IOFile file;
+};
+
+} // namespace FileSys
diff --git a/src/core/gdbstub/gdbstub.cpp b/src/core/gdbstub/gdbstub.cpp
index 123fe7cd4..be2b2e25f 100644
--- a/src/core/gdbstub/gdbstub.cpp
+++ b/src/core/gdbstub/gdbstub.cpp
@@ -946,7 +946,7 @@ static void Init(u16 port) {
     WSAStartup(MAKEWORD(2, 2), &InitData);
 #endif
 
-    int tmpsock = socket(PF_INET, SOCK_STREAM, 0);
+    int tmpsock = static_cast<int>(socket(PF_INET, SOCK_STREAM, 0));
     if (tmpsock == -1) {
         LOG_ERROR(Debug_GDBStub, "Failed to create gdb socket");
     }
@@ -973,7 +973,7 @@ static void Init(u16 port) {
     sockaddr_in saddr_client;
     sockaddr* client_addr = reinterpret_cast<sockaddr*>(&saddr_client);
     socklen_t client_addrlen = sizeof(saddr_client);
-    gdbserver_socket = accept(tmpsock, client_addr, &client_addrlen);
+    gdbserver_socket = static_cast<int>(accept(tmpsock, client_addr, &client_addrlen));
     if (gdbserver_socket < 0) {
         // In the case that we couldn't start the server for whatever reason, just start CPU
         // execution like normal.
diff --git a/src/core/hle/ipc.h b/src/core/hle/ipc.h
index f7f96125a..87ed85df6 100644
--- a/src/core/hle/ipc.h
+++ b/src/core/hle/ipc.h
@@ -122,11 +122,11 @@ union StaticBufferDescInfo {
     BitField<14, 18, u32> size;
 };
 
-inline u32 StaticBufferDesc(u32 size, u8 buffer_id) {
+inline u32 StaticBufferDesc(size_t size, u8 buffer_id) {
     StaticBufferDescInfo info{};
     info.descriptor_type.Assign(StaticBuffer);
     info.buffer_id.Assign(buffer_id);
-    info.size.Assign(size);
+    info.size.Assign(static_cast<u32>(size));
     return info.raw;
 }
 
@@ -160,11 +160,11 @@ union MappedBufferDescInfo {
     BitField<4, 28, u32> size;
 };
 
-inline u32 MappedBufferDesc(u32 size, MappedBufferPermissions perms) {
+inline u32 MappedBufferDesc(size_t size, MappedBufferPermissions perms) {
     MappedBufferDescInfo info{};
     info.flags.Assign(MappedBuffer);
     info.perms.Assign(perms);
-    info.size.Assign(size);
+    info.size.Assign(static_cast<u32>(size));
     return info.raw;
 }
 
diff --git a/src/core/hle/ipc_helpers.h b/src/core/hle/ipc_helpers.h
index f0d89cffe..7cb95cbac 100644
--- a/src/core/hle/ipc_helpers.h
+++ b/src/core/hle/ipc_helpers.h
@@ -117,9 +117,9 @@ public:
 
     void PushCurrentPIDHandle();
 
-    void PushStaticBuffer(VAddr buffer_vaddr, u32 size, u8 buffer_id);
+    void PushStaticBuffer(VAddr buffer_vaddr, size_t size, u8 buffer_id);
 
-    void PushMappedBuffer(VAddr buffer_vaddr, u32 size, MappedBufferPermissions perms);
+    void PushMappedBuffer(VAddr buffer_vaddr, size_t size, MappedBufferPermissions perms);
 };
 
 /// Push ///
@@ -190,12 +190,12 @@ inline void RequestBuilder::PushCurrentPIDHandle() {
     Push(u32(0));
 }
 
-inline void RequestBuilder::PushStaticBuffer(VAddr buffer_vaddr, u32 size, u8 buffer_id) {
+inline void RequestBuilder::PushStaticBuffer(VAddr buffer_vaddr, size_t size, u8 buffer_id) {
     Push(StaticBufferDesc(size, buffer_id));
     Push(buffer_vaddr);
 }
 
-inline void RequestBuilder::PushMappedBuffer(VAddr buffer_vaddr, u32 size,
+inline void RequestBuilder::PushMappedBuffer(VAddr buffer_vaddr, size_t size,
                                              MappedBufferPermissions perms) {
     Push(MappedBufferDesc(size, perms));
     Push(buffer_vaddr);
@@ -227,8 +227,8 @@ public:
                                bool validateHeader = true) {
         if (validateHeader)
             ValidateHeader();
-        Header builderHeader{
-            MakeHeader(header.command_id, normal_params_size, translate_params_size)};
+        Header builderHeader{MakeHeader(static_cast<u16>(header.command_id), normal_params_size,
+                                        translate_params_size)};
         if (context != nullptr)
             return {*context, builderHeader};
         else
diff --git a/src/core/hle/kernel/hle_ipc.cpp b/src/core/hle/kernel/hle_ipc.cpp
index 5ebe2eca4..6020e9764 100644
--- a/src/core/hle/kernel/hle_ipc.cpp
+++ b/src/core/hle/kernel/hle_ipc.cpp
@@ -37,7 +37,7 @@ SharedPtr<Object> HLERequestContext::GetIncomingHandle(u32 id_from_cmdbuf) const
 
 u32 HLERequestContext::AddOutgoingHandle(SharedPtr<Object> object) {
     request_handles.push_back(std::move(object));
-    return request_handles.size() - 1;
+    return static_cast<u32>(request_handles.size() - 1);
 }
 
 void HLERequestContext::ClearIncomingObjects() {
diff --git a/src/core/hle/kernel/mutex.cpp b/src/core/hle/kernel/mutex.cpp
index cef961289..2cbca5e5b 100644
--- a/src/core/hle/kernel/mutex.cpp
+++ b/src/core/hle/kernel/mutex.cpp
@@ -90,7 +90,7 @@ void Mutex::UpdatePriority() {
     if (!holding_thread)
         return;
 
-    s32 best_priority = THREADPRIO_LOWEST;
+    u32 best_priority = THREADPRIO_LOWEST;
     for (auto& waiter : GetWaitingThreads()) {
         if (waiter->current_priority < best_priority)
             best_priority = waiter->current_priority;
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index 522ad2333..cf3163e0f 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -147,7 +147,7 @@ void Process::Run(s32 main_thread_priority, u32 stack_size) {
     }
 
     vm_manager.LogLayout(Log::Level::Debug);
-    Kernel::SetupMainThread(codeset->entrypoint, main_thread_priority);
+    Kernel::SetupMainThread(codeset->entrypoint, main_thread_priority, this);
 }
 
 VAddr Process::GetLinearHeapAreaAddress() const {
diff --git a/src/core/hle/kernel/resource_limit.cpp b/src/core/hle/kernel/resource_limit.cpp
index a8f10a3ee..517dc47a8 100644
--- a/src/core/hle/kernel/resource_limit.cpp
+++ b/src/core/hle/kernel/resource_limit.cpp
@@ -61,7 +61,7 @@ s32 ResourceLimit::GetCurrentResourceValue(u32 resource) const {
     }
 }
 
-s32 ResourceLimit::GetMaxResourceValue(u32 resource) const {
+u32 ResourceLimit::GetMaxResourceValue(u32 resource) const {
     switch (resource) {
     case PRIORITY:
         return max_priority;
diff --git a/src/core/hle/kernel/resource_limit.h b/src/core/hle/kernel/resource_limit.h
index 6cdfbcf8d..42874eb8d 100644
--- a/src/core/hle/kernel/resource_limit.h
+++ b/src/core/hle/kernel/resource_limit.h
@@ -67,7 +67,7 @@ public:
      * @param resource Requested resource type
      * @returns The max value of the resource type
      */
-    s32 GetMaxResourceValue(u32 resource) const;
+    u32 GetMaxResourceValue(u32 resource) const;
 
     /// Name of resource limit object.
     std::string name;
diff --git a/src/core/hle/kernel/shared_memory.cpp b/src/core/hle/kernel/shared_memory.cpp
index a7b66142f..d45daca35 100644
--- a/src/core/hle/kernel/shared_memory.cpp
+++ b/src/core/hle/kernel/shared_memory.cpp
@@ -42,7 +42,8 @@ SharedPtr<SharedMemory> SharedMemory::Create(SharedPtr<Process> owner_process, u
         memory_region->used += size;
 
         shared_memory->linear_heap_phys_address =
-            Memory::FCRAM_PADDR + memory_region->base + shared_memory->backing_block_offset;
+            Memory::FCRAM_PADDR + memory_region->base +
+            static_cast<PAddr>(shared_memory->backing_block_offset);
 
         // Increase the amount of used linear heap memory for the owner process.
         if (shared_memory->owner_process != nullptr) {
@@ -54,22 +55,19 @@ SharedPtr<SharedMemory> SharedMemory::Create(SharedPtr<Process> owner_process, u
             Kernel::g_current_process->vm_manager.RefreshMemoryBlockMappings(linheap_memory.get());
         }
     } else {
-        // TODO(Subv): What happens if an application tries to create multiple memory blocks
-        // pointing to the same address?
         auto& vm_manager = shared_memory->owner_process->vm_manager;
         // The memory is already available and mapped in the owner process.
-        auto vma = vm_manager.FindVMA(address)->second;
-        // Copy it over to our own storage
-        shared_memory->backing_block = std::make_shared<std::vector<u8>>(
-            vma.backing_block->data() + vma.offset, vma.backing_block->data() + vma.offset + size);
-        shared_memory->backing_block_offset = 0;
-        // Unmap the existing pages
-        vm_manager.UnmapRange(address, size);
-        // Map our own block into the address space
-        vm_manager.MapMemoryBlock(address, shared_memory->backing_block, 0, size,
-                                  MemoryState::Shared);
-        // Reprotect the block with the new permissions
-        vm_manager.ReprotectRange(address, size, ConvertPermissions(permissions));
+        auto vma = vm_manager.FindVMA(address);
+        ASSERT_MSG(vma != vm_manager.vma_map.end(), "Invalid memory address");
+        ASSERT_MSG(vma->second.backing_block, "Backing block doesn't exist for address");
+
+        // The returned VMA might be a bigger one encompassing the desired address.
+        auto vma_offset = address - vma->first;
+        ASSERT_MSG(vma_offset + size <= vma->second.size,
+                   "Shared memory exceeds bounds of mapped block");
+
+        shared_memory->backing_block = vma->second.backing_block;
+        shared_memory->backing_block_offset = vma->second.offset + vma_offset;
     }
 
     shared_memory->base_address = address;
@@ -183,4 +181,4 @@ u8* SharedMemory::GetPointer(u32 offset) {
     return backing_block->data() + backing_block_offset + offset;
 }
 
-} // namespace
+} // namespace Kernel
diff --git a/src/core/hle/kernel/shared_memory.h b/src/core/hle/kernel/shared_memory.h
index 94b335ed1..93a6f2182 100644
--- a/src/core/hle/kernel/shared_memory.h
+++ b/src/core/hle/kernel/shared_memory.h
@@ -114,7 +114,7 @@ public:
     /// Backing memory for this shared memory block.
     std::shared_ptr<std::vector<u8>> backing_block;
     /// Offset into the backing block for this shared memory.
-    u32 backing_block_offset;
+    size_t backing_block_offset;
     /// Size of the memory block. Page-aligned.
     u32 size;
     /// Permission restrictions applied to the process which created the block.
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index 324415a36..0f7970ebe 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -111,7 +111,7 @@ void Thread::Stop() {
 
 Thread* ArbitrateHighestPriorityThread(u32 address) {
     Thread* highest_priority_thread = nullptr;
-    s32 priority = THREADPRIO_LOWEST;
+    u32 priority = THREADPRIO_LOWEST;
 
     // Iterate through threads, find highest priority thread that is waiting to be arbitrated...
     for (auto& thread : thread_list) {
@@ -178,16 +178,13 @@ static void SwitchContext(Thread* new_thread) {
         ready_queue.remove(new_thread->current_priority, new_thread);
         new_thread->status = THREADSTATUS_RUNNING;
 
-        Core::CPU().LoadContext(new_thread->context);
-        Core::CPU().SetCP15Register(CP15_THREAD_URO, new_thread->GetTLSAddress());
-
         if (previous_process != current_thread->owner_process) {
             Kernel::g_current_process = current_thread->owner_process;
-            Memory::current_page_table = &Kernel::g_current_process->vm_manager.page_table;
-            // We have switched processes and thus, page tables, clear the instruction cache so we
-            // don't keep stale data from the previous process.
-            Core::CPU().ClearInstructionCache();
+            SetCurrentPageTable(&Kernel::g_current_process->vm_manager.page_table);
         }
+
+        Core::CPU().LoadContext(new_thread->context);
+        Core::CPU().SetCP15Register(CP15_THREAD_URO, new_thread->GetTLSAddress());
     } else {
         current_thread = nullptr;
         // Note: We do not reset the current process and current page table when idling because
@@ -250,12 +247,15 @@ static void ThreadWakeupCallback(u64 thread_handle, int cycles_late) {
 
     if (thread->status == THREADSTATUS_WAIT_SYNCH_ANY ||
         thread->status == THREADSTATUS_WAIT_SYNCH_ALL || thread->status == THREADSTATUS_WAIT_ARB) {
-        thread->wait_set_output = false;
+
+        // Invoke the wakeup callback before clearing the wait objects
+        if (thread->wakeup_callback)
+            thread->wakeup_callback(ThreadWakeupReason::Timeout, thread, nullptr);
+
         // Remove the thread from each of its waiting objects' waitlists
         for (auto& object : thread->wait_objects)
             object->RemoveWaitingThread(thread.get());
         thread->wait_objects.clear();
-        thread->SetWaitSynchronizationResult(RESULT_TIMEOUT);
     }
 
     thread->ResumeFromWait();
@@ -281,6 +281,9 @@ void Thread::ResumeFromWait() {
         break;
 
     case THREADSTATUS_READY:
+        // The thread's wakeup callback must have already been cleared when the thread was first
+        // awoken.
+        ASSERT(wakeup_callback == nullptr);
         // If the thread is waiting on multiple wait objects, it might be awoken more than once
         // before actually resuming. We can ignore subsequent wakeups if the thread status has
         // already been set to THREADSTATUS_READY.
@@ -296,6 +299,8 @@ void Thread::ResumeFromWait() {
         return;
     }
 
+    wakeup_callback = nullptr;
+
     ready_queue.push_back(current_priority, this);
     status = THREADSTATUS_READY;
     Core::System::GetInstance().PrepareReschedule();
@@ -314,7 +319,7 @@ static void DebugThreadQueue() {
     }
 
     for (auto& t : thread_list) {
-        s32 priority = ready_queue.contains(t.get());
+        u32 priority = ready_queue.contains(t.get());
         if (priority != -1) {
             LOG_DEBUG(Kernel, "0x%02X %u", priority, t->GetObjectId());
         }
@@ -364,7 +369,8 @@ static void ResetThreadContext(ARM_Interface::ThreadContext& context, u32 stack_
 }
 
 ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point, u32 priority,
-                                            u32 arg, s32 processor_id, VAddr stack_top) {
+                                            u32 arg, s32 processor_id, VAddr stack_top,
+                                            SharedPtr<Process> owner_process) {
     // Check if priority is in ranged. Lowest priority -> highest priority id.
     if (priority > THREADPRIO_LOWEST) {
         LOG_ERROR(Kernel_SVC, "Invalid thread priority: %d", priority);
@@ -378,7 +384,7 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
 
     // TODO(yuriks): Other checks, returning 0xD9001BEA
 
-    if (!Memory::IsValidVirtualAddress(entry_point)) {
+    if (!Memory::IsValidVirtualAddress(*owner_process, entry_point)) {
         LOG_ERROR(Kernel_SVC, "(name=%s): invalid entry %08x", name.c_str(), entry_point);
         // TODO: Verify error
         return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel,
@@ -397,15 +403,14 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
     thread->nominal_priority = thread->current_priority = priority;
     thread->last_running_ticks = CoreTiming::GetTicks();
     thread->processor_id = processor_id;
-    thread->wait_set_output = false;
     thread->wait_objects.clear();
     thread->wait_address = 0;
     thread->name = std::move(name);
     thread->callback_handle = wakeup_callback_handle_table.Create(thread).Unwrap();
-    thread->owner_process = g_current_process;
+    thread->owner_process = owner_process;
 
     // Find the next available TLS index, and mark it as used
-    auto& tls_slots = Kernel::g_current_process->tls_slots;
+    auto& tls_slots = owner_process->tls_slots;
     bool needs_allocation = true;
     u32 available_page; // Which allocated page has free space
     u32 available_slot; // Which slot within the page is free
@@ -424,18 +429,18 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
             return ERR_OUT_OF_MEMORY;
         }
 
-        u32 offset = linheap_memory->size();
+        size_t offset = linheap_memory->size();
 
         // Allocate some memory from the end of the linear heap for this region.
         linheap_memory->insert(linheap_memory->end(), Memory::PAGE_SIZE, 0);
         memory_region->used += Memory::PAGE_SIZE;
-        Kernel::g_current_process->linear_heap_used += Memory::PAGE_SIZE;
+        owner_process->linear_heap_used += Memory::PAGE_SIZE;
 
         tls_slots.emplace_back(0); // The page is completely available at the start
-        available_page = tls_slots.size() - 1;
+        available_page = static_cast<u32>(tls_slots.size() - 1);
         available_slot = 0; // Use the first slot in the new page
 
-        auto& vm_manager = Kernel::g_current_process->vm_manager;
+        auto& vm_manager = owner_process->vm_manager;
         vm_manager.RefreshMemoryBlockMappings(linheap_memory.get());
 
         // Map the page to the current process' address space.
@@ -459,7 +464,7 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
     return MakeResult<SharedPtr<Thread>>(std::move(thread));
 }
 
-void Thread::SetPriority(s32 priority) {
+void Thread::SetPriority(u32 priority) {
     ASSERT_MSG(priority <= THREADPRIO_LOWEST && priority >= THREADPRIO_HIGHEST,
                "Invalid priority value.");
     // If thread was ready, adjust queues
@@ -472,7 +477,7 @@ void Thread::SetPriority(s32 priority) {
 }
 
 void Thread::UpdatePriority() {
-    s32 best_priority = nominal_priority;
+    u32 best_priority = nominal_priority;
     for (auto& mutex : held_mutexes) {
         if (mutex->priority < best_priority)
             best_priority = mutex->priority;
@@ -480,7 +485,7 @@ void Thread::UpdatePriority() {
     BoostPriority(best_priority);
 }
 
-void Thread::BoostPriority(s32 priority) {
+void Thread::BoostPriority(u32 priority) {
     // If thread was ready, adjust queues
     if (status == THREADSTATUS_READY)
         ready_queue.move(this, current_priority, priority);
@@ -489,10 +494,10 @@ void Thread::BoostPriority(s32 priority) {
     current_priority = priority;
 }
 
-SharedPtr<Thread> SetupMainThread(u32 entry_point, s32 priority) {
+SharedPtr<Thread> SetupMainThread(u32 entry_point, u32 priority, SharedPtr<Process> owner_process) {
     // Initialize new "main" thread
     auto thread_res = Thread::Create("main", entry_point, priority, 0, THREADPROCESSORID_0,
-                                     Memory::HEAP_VADDR_END);
+                                     Memory::HEAP_VADDR_END, owner_process);
 
     SharedPtr<Thread> thread = std::move(thread_res).Unwrap();
 
@@ -533,7 +538,13 @@ void Thread::SetWaitSynchronizationOutput(s32 output) {
 s32 Thread::GetWaitObjectIndex(WaitObject* object) const {
     ASSERT_MSG(!wait_objects.empty(), "Thread is not waiting for anything");
     auto match = std::find(wait_objects.rbegin(), wait_objects.rend(), object);
-    return std::distance(match, wait_objects.rend()) - 1;
+    return static_cast<s32>(std::distance(match, wait_objects.rend()) - 1);
+}
+
+VAddr Thread::GetCommandBufferAddress() const {
+    // Offset from the start of TLS at which the IPC command buffer begins.
+    static constexpr int CommandHeaderOffset = 0x80;
+    return GetTLSAddress() + CommandHeaderOffset;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index 6a3566f15..314fba81f 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -15,7 +15,7 @@
 #include "core/hle/kernel/wait_object.h"
 #include "core/hle/result.h"
 
-enum ThreadPriority : s32 {
+enum ThreadPriority : u32 {
     THREADPRIO_HIGHEST = 0,       ///< Highest thread priority
     THREADPRIO_USERLAND_MAX = 24, ///< Highest thread priority for userland apps
     THREADPRIO_DEFAULT = 48,      ///< Default thread priority for userland apps
@@ -41,6 +41,11 @@ enum ThreadStatus {
     THREADSTATUS_DEAD            ///< Run to completion, or forcefully terminated
 };
 
+enum class ThreadWakeupReason {
+    Signal, // The thread was woken up by WakeupAllWaitingThreads due to an object signal.
+    Timeout // The thread was woken up due to a wait timeout.
+};
+
 namespace Kernel {
 
 class Mutex;
@@ -56,10 +61,12 @@ public:
      * @param arg User data to pass to the thread
      * @param processor_id The ID(s) of the processors on which the thread is desired to be run
      * @param stack_top The address of the thread's stack top
+     * @param owner_process The parent process for the thread
      * @return A shared pointer to the newly created thread
      */
     static ResultVal<SharedPtr<Thread>> Create(std::string name, VAddr entry_point, u32 priority,
-                                               u32 arg, s32 processor_id, VAddr stack_top);
+                                               u32 arg, s32 processor_id, VAddr stack_top,
+                                               SharedPtr<Process> owner_process);
 
     std::string GetName() const override {
         return name;
@@ -80,7 +87,7 @@ public:
      * Gets the thread's current priority
      * @return The current thread's priority
      */
-    s32 GetPriority() const {
+    u32 GetPriority() const {
         return current_priority;
     }
 
@@ -88,7 +95,7 @@ public:
      * Sets the thread's current priority
      * @param priority The new priority
      */
-    void SetPriority(s32 priority);
+    void SetPriority(u32 priority);
 
     /**
      * Boost's a thread's priority to the best priority among the thread's held mutexes.
@@ -100,7 +107,7 @@ public:
      * Temporarily boosts the thread's priority until the next time it is scheduled
      * @param priority The new priority
      */
-    void BoostPriority(s32 priority);
+    void BoostPriority(u32 priority);
 
     /**
      * Gets the thread's thread ID
@@ -116,9 +123,9 @@ public:
     void ResumeFromWait();
 
     /**
-    * Schedules an event to wake up the specified thread after the specified delay
-    * @param nanoseconds The time this thread will be allowed to sleep for
-    */
+     * Schedules an event to wake up the specified thread after the specified delay
+     * @param nanoseconds The time this thread will be allowed to sleep for
+     */
     void WakeAfterDelay(s64 nanoseconds);
 
     /**
@@ -157,6 +164,12 @@ public:
         return tls_address;
     }
 
+    /**
+     * Returns the address of this thread's command buffer, located in the TLS.
+     * @return VAddr of the thread's command buffer.
+     */
+    VAddr GetCommandBufferAddress() const;
+
     /**
      * Returns whether this thread is waiting for all the objects in
      * its wait list to become ready, as a result of a WaitSynchronizationN call
@@ -174,8 +187,8 @@ public:
     u32 entry_point;
     u32 stack_top;
 
-    s32 nominal_priority; ///< Nominal thread priority, as set by the emulated application
-    s32 current_priority; ///< Current thread priority, can be temporarily changed
+    u32 nominal_priority; ///< Nominal thread priority, as set by the emulated application
+    u32 current_priority; ///< Current thread priority, can be temporarily changed
 
     u64 last_running_ticks; ///< CPU tick when thread was last running
 
@@ -197,14 +210,18 @@ public:
 
     VAddr wait_address; ///< If waiting on an AddressArbiter, this is the arbitration address
 
-    /// True if the WaitSynchronizationN output parameter should be set on thread wakeup.
-    bool wait_set_output;
-
     std::string name;
 
     /// Handle used as userdata to reference this object when inserting into the CoreTiming queue.
     Handle callback_handle;
 
+    using WakeupCallback = void(ThreadWakeupReason reason, SharedPtr<Thread> thread,
+                                SharedPtr<WaitObject> object);
+    // Callback that will be invoked when the thread is resumed from a waiting state. If the thread
+    // was waiting via WaitSynchronizationN then the object will be the last object that became
+    // available. In case of a timeout, the object will be nullptr.
+    std::function<WakeupCallback> wakeup_callback;
+
 private:
     Thread();
     ~Thread() override;
@@ -214,9 +231,10 @@ private:
  * Sets up the primary application thread
  * @param entry_point The address at which the thread should start execution
  * @param priority The priority to give the main thread
+ * @param owner_process The parent process for the main thread
  * @return A shared pointer to the main thread
  */
-SharedPtr<Thread> SetupMainThread(u32 entry_point, s32 priority);
+SharedPtr<Thread> SetupMainThread(u32 entry_point, u32 priority, SharedPtr<Process> owner_process);
 
 /**
  * Returns whether there are any threads that are ready to run.
@@ -276,4 +294,4 @@ void ThreadingShutdown();
  */
 const std::vector<SharedPtr<Thread>>& GetThreadList();
 
-} // namespace
+} // namespace Kernel
diff --git a/src/core/hle/kernel/wait_object.cpp b/src/core/hle/kernel/wait_object.cpp
index f245eda6c..469554908 100644
--- a/src/core/hle/kernel/wait_object.cpp
+++ b/src/core/hle/kernel/wait_object.cpp
@@ -34,7 +34,7 @@ void WaitObject::RemoveWaitingThread(Thread* thread) {
 
 SharedPtr<Thread> WaitObject::GetHighestPriorityReadyThread() {
     Thread* candidate = nullptr;
-    s32 candidate_priority = THREADPRIO_LOWEST + 1;
+    u32 candidate_priority = THREADPRIO_LOWEST + 1;
 
     for (const auto& thread : waiting_threads) {
         // The list of waiting threads must not contain threads that are not waiting to be awakened.
@@ -71,23 +71,20 @@ void WaitObject::WakeupAllWaitingThreads() {
     while (auto thread = GetHighestPriorityReadyThread()) {
         if (!thread->IsSleepingOnWaitAll()) {
             Acquire(thread.get());
-            // Set the output index of the WaitSynchronizationN call to the index of this object.
-            if (thread->wait_set_output) {
-                thread->SetWaitSynchronizationOutput(thread->GetWaitObjectIndex(this));
-                thread->wait_set_output = false;
-            }
         } else {
             for (auto& object : thread->wait_objects) {
                 object->Acquire(thread.get());
             }
-            // Note: This case doesn't update the output index of WaitSynchronizationN.
         }
 
+        // Invoke the wakeup callback before clearing the wait objects
+        if (thread->wakeup_callback)
+            thread->wakeup_callback(ThreadWakeupReason::Signal, thread, this);
+
         for (auto& object : thread->wait_objects)
             object->RemoveWaitingThread(thread.get());
         thread->wait_objects.clear();
 
-        thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
         thread->ResumeFromWait();
     }
 }
diff --git a/src/core/hle/service/apt/apt.cpp b/src/core/hle/service/apt/apt.cpp
index c36775473..59ea9823d 100644
--- a/src/core/hle/service/apt/apt.cpp
+++ b/src/core/hle/service/apt/apt.cpp
@@ -65,6 +65,7 @@ union AppletAttributes {
     u32 raw;
 
     BitField<0, 3, u32> applet_pos;
+    BitField<29, 1, u32> is_home_menu;
 
     AppletAttributes() : raw(0) {}
     AppletAttributes(u32 attributes) : raw(attributes) {}
@@ -158,6 +159,11 @@ static AppletSlotData* GetAppletSlotData(AppletAttributes attributes) {
     if (slot == AppletSlot::Error)
         return nullptr;
 
+    // The Home Menu is a system applet, however, it has its own applet slot so that it can run
+    // concurrently with other system applets.
+    if (slot == AppletSlot::SystemApplet && attributes.is_home_menu)
+        return &applet_slots[static_cast<size_t>(AppletSlot::HomeMenu)];
+
     return &applet_slots[static_cast<size_t>(slot)];
 }
 
@@ -201,6 +207,19 @@ void Initialize(Service::Interface* self) {
     rb.Push(RESULT_SUCCESS);
     rb.PushCopyHandles(Kernel::g_handle_table.Create(slot_data->notification_event).Unwrap(),
                        Kernel::g_handle_table.Create(slot_data->parameter_event).Unwrap());
+
+    if (slot_data->applet_id == AppletId::Application ||
+        slot_data->applet_id == AppletId::HomeMenu) {
+        // Initialize the APT parameter to wake up the application.
+        next_parameter.emplace();
+        next_parameter->signal = static_cast<u32>(SignalType::Wakeup);
+        next_parameter->sender_id = static_cast<u32>(AppletId::None);
+        next_parameter->destination_id = app_id;
+        // Not signaling the parameter event will cause the application (or Home Menu) to hang
+        // during startup. In the real console, it is usually the Kernel and Home Menu who cause NS
+        // to signal the HomeMenu and Application parameter events, respectively.
+        slot_data->parameter_event->Signal();
+    }
 }
 
 static u32 DecompressLZ11(const u8* in, u8* out) {
@@ -563,7 +582,7 @@ void ReceiveParameter(Service::Interface* self) {
                            ? Kernel::g_handle_table.Create(next_parameter->object).Unwrap()
                            : 0);
 
-    rb.PushStaticBuffer(buffer, static_cast<u32>(next_parameter->buffer.size()), 0);
+    rb.PushStaticBuffer(buffer, next_parameter->buffer.size(), 0);
 
     Memory::WriteBlock(buffer, next_parameter->buffer.data(), next_parameter->buffer.size());
 
@@ -611,7 +630,7 @@ void GlanceParameter(Service::Interface* self) {
                            ? Kernel::g_handle_table.Create(next_parameter->object).Unwrap()
                            : 0);
 
-    rb.PushStaticBuffer(buffer, static_cast<u32>(next_parameter->buffer.size()), 0);
+    rb.PushStaticBuffer(buffer, next_parameter->buffer.size(), 0);
 
     Memory::WriteBlock(buffer, next_parameter->buffer.data(), next_parameter->buffer.size());
 
@@ -763,6 +782,20 @@ void PrepareToStartLibraryApplet(Service::Interface* self) {
     }
 }
 
+void PrepareToStartNewestHomeMenu(Service::Interface* self) {
+    IPC::RequestParser rp(Kernel::GetCommandBuffer(), 0x1A, 0, 0); // 0x1A0000
+    IPC::RequestBuilder rb = rp.MakeBuilder(1, 0);
+
+    // TODO(Subv): This command can only be called by a System Applet (return 0xC8A0CC04 otherwise).
+
+    // This command must return an error when called, otherwise the Home Menu will try to reboot the
+    // system.
+    rb.Push(ResultCode(ErrorDescription::AlreadyExists, ErrorModule::Applet,
+                       ErrorSummary::InvalidState, ErrorLevel::Status));
+
+    LOG_DEBUG(Service_APT, "called");
+}
+
 void PreloadLibraryApplet(Service::Interface* self) {
     IPC::RequestParser rp(Kernel::GetCommandBuffer(), 0x16, 1, 0); // 0x160040
     AppletId applet_id = static_cast<AppletId>(rp.Pop<u32>());
@@ -1058,12 +1091,6 @@ void Init() {
         slot_data.parameter_event =
             Kernel::Event::Create(Kernel::ResetType::OneShot, "APT:Parameter");
     }
-
-    // Initialize the parameter to wake up the application.
-    next_parameter.emplace();
-    next_parameter->signal = static_cast<u32>(SignalType::Wakeup);
-    next_parameter->destination_id = static_cast<u32>(AppletId::Application);
-    applet_slots[static_cast<size_t>(AppletSlot::Application)].parameter_event->Signal();
 }
 
 void Shutdown() {
diff --git a/src/core/hle/service/apt/apt.h b/src/core/hle/service/apt/apt.h
index 96b28b438..7b79e1f3e 100644
--- a/src/core/hle/service/apt/apt.h
+++ b/src/core/hle/service/apt/apt.h
@@ -420,6 +420,16 @@ void GetAppCpuTimeLimit(Service::Interface* self);
 void PrepareToStartLibraryApplet(Service::Interface* self);
 
 /**
+ * APT::PrepareToStartNewestHomeMenu service function
+ *  Inputs:
+ *      0 : Command header [0x001A0000]
+ *  Outputs:
+ *      0 : Return header
+ *      1 : Result of function
+ */
+void PrepareToStartNewestHomeMenu(Service::Interface* self);
+
+/**
  * APT::PreloadLibraryApplet service function
  *  Inputs:
  *      0 : Command header [0x00160040]
diff --git a/src/core/hle/service/apt/apt_s.cpp b/src/core/hle/service/apt/apt_s.cpp
index cf74c2a36..bb78ee7d7 100644
--- a/src/core/hle/service/apt/apt_s.cpp
+++ b/src/core/hle/service/apt/apt_s.cpp
@@ -17,7 +17,7 @@ const Interface::FunctionInfo FunctionTable[] = {
     {0x00060040, GetAppletInfo, "GetAppletInfo"},
     {0x00070000, nullptr, "GetLastSignaledAppletId"},
     {0x00080000, nullptr, "CountRegisteredApplet"},
-    {0x00090040, nullptr, "IsRegistered"},
+    {0x00090040, IsRegistered, "IsRegistered"},
     {0x000A0040, nullptr, "GetAttribute"},
     {0x000B0040, InquireNotification, "InquireNotification"},
     {0x000C0104, SendParameter, "SendParameter"},
@@ -34,7 +34,7 @@ const Interface::FunctionInfo FunctionTable[] = {
     {0x00170040, nullptr, "FinishPreloadingLibraryApplet"},
     {0x00180040, PrepareToStartLibraryApplet, "PrepareToStartLibraryApplet"},
     {0x00190040, nullptr, "PrepareToStartSystemApplet"},
-    {0x001A0000, nullptr, "PrepareToStartNewestHomeMenu"},
+    {0x001A0000, PrepareToStartNewestHomeMenu, "PrepareToStartNewestHomeMenu"},
     {0x001B00C4, nullptr, "StartApplication"},
     {0x001C0000, nullptr, "WakeupApplication"},
     {0x001D0000, nullptr, "CancelApplication"},
diff --git a/src/core/hle/service/cam/cam.cpp b/src/core/hle/service/cam/cam.cpp
index c9f9e9d95..8172edae8 100644
--- a/src/core/hle/service/cam/cam.cpp
+++ b/src/core/hle/service/cam/cam.cpp
@@ -177,7 +177,7 @@ void CompletionEventCallBack(u64 port_id, int) {
             LOG_ERROR(Service_CAM, "The destination size (%u) doesn't match the source (%zu)!",
                       port.dest_size, buffer_size);
         }
-        Memory::WriteBlock(port.dest, buffer.data(), std::min<u32>(port.dest_size, buffer_size));
+        Memory::WriteBlock(port.dest, buffer.data(), std::min<size_t>(port.dest_size, buffer_size));
     }
 
     port.is_receiving = false;
diff --git a/src/core/hle/service/cfg/cfg.cpp b/src/core/hle/service/cfg/cfg.cpp
index f26a1f65f..f78c25fb2 100644
--- a/src/core/hle/service/cfg/cfg.cpp
+++ b/src/core/hle/service/cfg/cfg.cpp
@@ -141,7 +141,7 @@ void GetCountryCodeString(Service::Interface* self) {
 
 void GetCountryCodeID(Service::Interface* self) {
     u32* cmd_buff = Kernel::GetCommandBuffer();
-    u16 country_code = cmd_buff[1];
+    u16 country_code = static_cast<u16>(cmd_buff[1]);
     u16 country_code_id = 0;
 
     // The following algorithm will fail if the first country code isn't 0.
diff --git a/src/core/hle/service/fs/archive.cpp b/src/core/hle/service/fs/archive.cpp
index 033fbc9aa..4ee7df73c 100644
--- a/src/core/hle/service/fs/archive.cpp
+++ b/src/core/hle/service/fs/archive.cpp
@@ -20,6 +20,7 @@
 #include "core/file_sys/archive_savedata.h"
 #include "core/file_sys/archive_sdmc.h"
 #include "core/file_sys/archive_sdmcwriteonly.h"
+#include "core/file_sys/archive_selfncch.h"
 #include "core/file_sys/archive_systemsavedata.h"
 #include "core/file_sys/directory_backend.h"
 #include "core/file_sys/errors.h"
@@ -48,7 +49,7 @@ struct hash<Service::FS::ArchiveIdCode> {
         return std::hash<Type>()(static_cast<Type>(id_code));
     }
 };
-}
+} // namespace std
 
 static constexpr Kernel::Handle INVALID_HANDLE{};
 
@@ -216,7 +217,7 @@ void Directory::HandleSyncRequest(Kernel::SharedPtr<Kernel::ServerSession> serve
         LOG_TRACE(Service_FS, "Read %s: count=%d", GetName().c_str(), count);
 
         // Number of entries actually read
-        u32 read = backend->Read(entries.size(), entries.data());
+        u32 read = backend->Read(static_cast<u32>(entries.size()), entries.data());
         cmd_buff[2] = read;
         Memory::WriteBlock(address, entries.data(), read * sizeof(FileSys::Entry));
         break;
@@ -564,6 +565,21 @@ void RegisterArchiveTypes() {
     auto systemsavedata_factory =
         std::make_unique<FileSys::ArchiveFactory_SystemSaveData>(nand_directory);
     RegisterArchiveType(std::move(systemsavedata_factory), ArchiveIdCode::SystemSaveData);
+
+    auto selfncch_factory = std::make_unique<FileSys::ArchiveFactory_SelfNCCH>();
+    RegisterArchiveType(std::move(selfncch_factory), ArchiveIdCode::SelfNCCH);
+}
+
+void RegisterSelfNCCH(Loader::AppLoader& app_loader) {
+    auto itr = id_code_map.find(ArchiveIdCode::SelfNCCH);
+    if (itr == id_code_map.end()) {
+        LOG_ERROR(Service_FS,
+                  "Could not register a new NCCH because the SelfNCCH archive hasn't been created");
+        return;
+    }
+
+    auto* factory = static_cast<FileSys::ArchiveFactory_SelfNCCH*>(itr->second.get());
+    factory->Register(app_loader);
 }
 
 void UnregisterArchiveTypes() {
diff --git a/src/core/hle/service/fs/archive.h b/src/core/hle/service/fs/archive.h
index 3a3371c88..e3c8fc2ef 100644
--- a/src/core/hle/service/fs/archive.h
+++ b/src/core/hle/service/fs/archive.h
@@ -21,6 +21,10 @@ static constexpr char SYSTEM_ID[]{"00000000000000000000000000000000"};
 /// The scrambled SD card CID, also known as ID1
 static constexpr char SDCARD_ID[]{"00000000000000000000000000000000"};
 
+namespace Loader {
+class AppLoader;
+}
+
 namespace Service {
 namespace FS {
 
@@ -259,6 +263,9 @@ void ArchiveInit();
 /// Shutdown archives
 void ArchiveShutdown();
 
+/// Registers a new NCCH file with the SelfNCCH archive factory
+void RegisterSelfNCCH(Loader::AppLoader& app_loader);
+
 /// Register all archive types
 void RegisterArchiveTypes();
 
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index aa5d821f9..379fbd71c 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -251,7 +251,7 @@ static void UpdateGyroscopeCallback(u64 userdata, int cycles_late) {
     Math::Vec3<float> gyro;
     std::tie(std::ignore, gyro) = motion_device->GetStatus();
     double stretch = Core::System::GetInstance().perf_stats.GetLastFrameTimeScale();
-    gyro *= gyroscope_coef * stretch;
+    gyro *= gyroscope_coef * static_cast<float>(stretch);
     gyroscope_entry.x = static_cast<s16>(gyro.x);
     gyroscope_entry.y = static_cast<s16>(gyro.y);
     gyroscope_entry.z = static_cast<s16>(gyro.z);
diff --git a/src/core/hle/service/ldr_ro/cro_helper.h b/src/core/hle/service/ldr_ro/cro_helper.h
index 3bc10dbdc..57b4fb6df 100644
--- a/src/core/hle/service/ldr_ro/cro_helper.h
+++ b/src/core/hle/service/ldr_ro/cro_helper.h
@@ -413,7 +413,8 @@ private:
      */
     template <typename T>
     void GetEntry(std::size_t index, T& data) const {
-        Memory::ReadBlock(GetField(T::TABLE_OFFSET_FIELD) + index * sizeof(T), &data, sizeof(T));
+        Memory::ReadBlock(GetField(T::TABLE_OFFSET_FIELD) + static_cast<u32>(index * sizeof(T)),
+                          &data, sizeof(T));
     }
 
     /**
@@ -425,7 +426,8 @@ private:
      */
     template <typename T>
     void SetEntry(std::size_t index, const T& data) {
-        Memory::WriteBlock(GetField(T::TABLE_OFFSET_FIELD) + index * sizeof(T), &data, sizeof(T));
+        Memory::WriteBlock(GetField(T::TABLE_OFFSET_FIELD) + static_cast<u32>(index * sizeof(T)),
+                           &data, sizeof(T));
     }
 
     /**
diff --git a/src/core/hle/service/nim/nim.cpp b/src/core/hle/service/nim/nim.cpp
index d5624fe54..b10d5852b 100644
--- a/src/core/hle/service/nim/nim.cpp
+++ b/src/core/hle/service/nim/nim.cpp
@@ -5,6 +5,8 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "core/hle/ipc.h"
+#include "core/hle/ipc_helpers.h"
+#include "core/hle/kernel/event.h"
 #include "core/hle/service/nim/nim.h"
 #include "core/hle/service/nim/nim_aoc.h"
 #include "core/hle/service/nim/nim_s.h"
@@ -14,6 +16,16 @@
 namespace Service {
 namespace NIM {
 
+static Kernel::SharedPtr<Kernel::Event> nim_system_update_event;
+
+void CheckForSysUpdateEvent(Service::Interface* self) {
+    IPC::RequestParser rp(Kernel::GetCommandBuffer(), 0x5, 0, 0); // 0x50000
+    IPC::RequestBuilder rb = rp.MakeBuilder(1, 2);
+    rb.Push(RESULT_SUCCESS);
+    rb.PushCopyHandles(Kernel::g_handle_table.Create(nim_system_update_event).Unwrap());
+    LOG_TRACE(Service_NIM, "called");
+}
+
 void CheckSysUpdateAvailable(Service::Interface* self) {
     u32* cmd_buff = Kernel::GetCommandBuffer();
 
@@ -29,9 +41,13 @@ void Init() {
     AddService(new NIM_AOC_Interface);
     AddService(new NIM_S_Interface);
     AddService(new NIM_U_Interface);
+
+    nim_system_update_event = Kernel::Event::Create(ResetType::OneShot, "NIM System Update Event");
 }
 
-void Shutdown() {}
+void Shutdown() {
+    nim_system_update_event = nullptr;
+}
 
 } // namespace NIM
 
diff --git a/src/core/hle/service/nim/nim.h b/src/core/hle/service/nim/nim.h
index c3106f18b..dbf605e5a 100644
--- a/src/core/hle/service/nim/nim.h
+++ b/src/core/hle/service/nim/nim.h
@@ -11,6 +11,17 @@ class Interface;
 namespace NIM {
 
 /**
+ * NIM::CheckForSysUpdateEvent service function
+ *  Inputs:
+ *      1 : None
+ *  Outputs:
+ *      1 : Result of function, 0 on success, otherwise error code
+ *      2 : Copy handle descriptor
+ *      3 : System Update event handle
+ */
+void CheckForSysUpdateEvent(Service::Interface* self);
+
+/**
  * NIM::CheckSysUpdateAvailable service function
  *  Inputs:
  *      1 : None
diff --git a/src/core/hle/service/nim/nim_u.cpp b/src/core/hle/service/nim/nim_u.cpp
index 7664bad60..569660278 100644
--- a/src/core/hle/service/nim/nim_u.cpp
+++ b/src/core/hle/service/nim/nim_u.cpp
@@ -12,7 +12,7 @@ const Interface::FunctionInfo FunctionTable[] = {
     {0x00010000, nullptr, "StartSysUpdate"},
     {0x00020000, nullptr, "GetUpdateDownloadProgress"},
     {0x00040000, nullptr, "FinishTitlesInstall"},
-    {0x00050000, nullptr, "CheckForSysUpdateEvent"},
+    {0x00050000, CheckForSysUpdateEvent, "CheckForSysUpdateEvent"},
     {0x00090000, CheckSysUpdateAvailable, "CheckSysUpdateAvailable"},
     {0x000A0000, nullptr, "GetState"},
     {0x000B0000, nullptr, "GetSystemTitleHash"},
diff --git a/src/core/hle/service/nwm/nwm_uds.cpp b/src/core/hle/service/nwm/nwm_uds.cpp
index 893bbb1e7..0aa63cc1e 100644
--- a/src/core/hle/service/nwm/nwm_uds.cpp
+++ b/src/core/hle/service/nwm/nwm_uds.cpp
@@ -2,8 +2,10 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <algorithm>
 #include <array>
 #include <cstring>
+#include <list>
 #include <mutex>
 #include <unordered_map>
 #include <vector>
@@ -13,6 +15,7 @@
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/kernel/event.h"
 #include "core/hle/kernel/shared_memory.h"
+#include "core/hle/lock.h"
 #include "core/hle/result.h"
 #include "core/hle/service/nwm/nwm_uds.h"
 #include "core/hle/service/nwm/uds_beacon.h"
@@ -37,9 +40,12 @@ static ConnectionStatus connection_status{};
 /* Node information about the current network.
  * The amount of elements in this vector is always the maximum number
  * of nodes specified in the network configuration.
- * The first node is always the host, so this always contains at least 1 entry.
+ * The first node is always the host.
  */
-static NodeList node_info(1);
+static NodeList node_info;
+
+// Node information about our own system.
+static NodeInfo current_node;
 
 // Mapping of bind node ids to their respective events.
 static std::unordered_map<u32, Kernel::SharedPtr<Kernel::Event>> bind_node_events;
@@ -54,6 +60,10 @@ static NetworkInfo network_info;
 // Event that will generate and send the 802.11 beacon frames.
 static int beacon_broadcast_event;
 
+// Mutex to synchronize access to the connection status between the emulation thread and the
+// network thread.
+static std::mutex connection_status_mutex;
+
 // Mutex to synchronize access to the list of received beacons between the emulation thread and the
 // network thread.
 static std::mutex beacon_mutex;
@@ -63,14 +73,26 @@ static std::mutex beacon_mutex;
 constexpr size_t MaxBeaconFrames = 15;
 
 // List of the last <MaxBeaconFrames> beacons received from the network.
-static std::deque<Network::WifiPacket> received_beacons;
+static std::list<Network::WifiPacket> received_beacons;
 
 /**
  * Returns a list of received 802.11 beacon frames from the specified sender since the last call.
  */
-std::deque<Network::WifiPacket> GetReceivedBeacons(const MacAddress& sender) {
+std::list<Network::WifiPacket> GetReceivedBeacons(const MacAddress& sender) {
     std::lock_guard<std::mutex> lock(beacon_mutex);
-    // TODO(Subv): Filter by sender.
+    if (sender != Network::BroadcastMac) {
+        std::list<Network::WifiPacket> filtered_list;
+        const auto beacon = std::find_if(received_beacons.begin(), received_beacons.end(),
+                                         [&sender](const Network::WifiPacket& packet) {
+                                             return packet.transmitter_address == sender;
+                                         });
+        if (beacon != received_beacons.end()) {
+            filtered_list.push_back(*beacon);
+            // TODO(B3N30): Check if the complete list is cleared or just the fetched entries
+            received_beacons.erase(beacon);
+        }
+        return filtered_list;
+    }
     return std::move(received_beacons);
 }
 
@@ -79,10 +101,33 @@ void SendPacket(Network::WifiPacket& packet) {
     // TODO(Subv): Implement.
 }
 
+/*
+ * Returns the lowest index whose bit is still clear in the node bitmask of the
+ * currently-hosted UDS network. It reads connection_status, so callers are expected
+ * to hold connection_status_mutex.
+ */
+static u16 GetNextAvailableNodeId() {
+    for (u16 index = 0; index < connection_status.max_nodes; ++index) {
+        if ((connection_status.node_bitmask & (1 << index)) == 0)
+            return index;
+    }
+    // Any connection attempts to an already full network should have been refused.
+    ASSERT_MSG(false, "No available connection slots in the network");
+    return 0; // Only reached if the assertion does not abort; never fall off a u16 function.
+}
+
 // Inserts the received beacon frame in the beacon queue and removes any older beacons if the size
 // limit is exceeded.
 void HandleBeaconFrame(const Network::WifiPacket& packet) {
     std::lock_guard<std::mutex> lock(beacon_mutex);
+    const auto unique_beacon =
+        std::find_if(received_beacons.begin(), received_beacons.end(),
+                     [&packet](const Network::WifiPacket& new_packet) {
+                         return new_packet.transmitter_address == packet.transmitter_address;
+                     });
+    if (unique_beacon != received_beacons.end()) {
+        // We already have a beacon from the same MAC in the list; remove the old one.
+        received_beacons.erase(unique_beacon);
+    }
 
     received_beacons.emplace_back(packet);
 
@@ -91,21 +136,110 @@ void HandleBeaconFrame(const Network::WifiPacket& packet) {
         received_beacons.pop_front();
 }
 
-/*
- * Returns an available index in the nodes array for the
- * currently-hosted UDS network.
- */
-static u16 GetNextAvailableNodeId() {
-    ASSERT_MSG(connection_status.status == static_cast<u32>(NetworkStatus::ConnectedAsHost),
-               "Can not accept clients if we're not hosting a network");
+void HandleAssociationResponseFrame(const Network::WifiPacket& packet) {
+    auto assoc_result = GetAssociationResult(packet.data);
 
-    for (u16 index = 0; index < connection_status.max_nodes; ++index) {
-        if ((connection_status.node_bitmask & (1 << index)) == 0)
-            return index;
+    ASSERT_MSG(std::get<AssocStatus>(assoc_result) == AssocStatus::Successful,
+               "Could not join network");
+    {
+        std::lock_guard<std::mutex> lock(connection_status_mutex);
+        ASSERT(connection_status.status == static_cast<u32>(NetworkStatus::Connecting));
     }
 
-    // Any connection attempts to an already full network should have been refused.
-    ASSERT_MSG(false, "No available connection slots in the network");
+    // Send the EAPoL-Start packet to the server.
+    using Network::WifiPacket;
+    WifiPacket eapol_start;
+    eapol_start.channel = network_channel;
+    eapol_start.data = GenerateEAPoLStartFrame(std::get<u16>(assoc_result), current_node);
+    // TODO(B3N30): Encrypt the packet.
+    eapol_start.destination_address = packet.transmitter_address;
+    eapol_start.type = WifiPacket::PacketType::Data;
+
+    SendPacket(eapol_start);
+}
+
+/// Host: admits the client from an EAPoL-Start frame. Otherwise: applies the host's EAPoL-Logoff.
+static void HandleEAPoLPacket(const Network::WifiPacket& packet) {
+    std::lock_guard<std::mutex> lock(connection_status_mutex);
+
+    if (GetEAPoLFrameType(packet.data) == EAPoLStartMagic) {
+        if (connection_status.status != static_cast<u32>(NetworkStatus::ConnectedAsHost)) {
+            LOG_DEBUG(Service_NWM, "Connection sequence aborted, because connection status is %u",
+                      connection_status.status);
+            return;
+        }
+
+        auto node = DeserializeNodeInfoFromFrame(packet.data);
+
+        if (connection_status.max_nodes == connection_status.total_nodes) {
+            // Reject connection attempt
+            LOG_ERROR(Service_NWM, "Reached maximum nodes, but reject packet wasn't sent.");
+            // TODO(B3N30): Figure out what packet is sent here
+            return;
+        }
+
+        // Get an unused network node id
+        u16 node_id = GetNextAvailableNodeId();
+        node.network_node_id = node_id + 1;
+
+        connection_status.node_bitmask |= 1 << node_id;
+        connection_status.changed_nodes |= 1 << node_id;
+        connection_status.nodes[node_id] = node.network_node_id;
+        connection_status.total_nodes++;
+
+        u8 current_nodes = network_info.total_nodes;
+        node_info[current_nodes] = node;
+        network_info.total_nodes++;
+
+        // Send the EAPoL-Logoff packet.
+        using Network::WifiPacket;
+        WifiPacket eapol_logoff;
+        eapol_logoff.channel = network_channel;
+        eapol_logoff.data =
+            GenerateEAPoLLogoffFrame(packet.transmitter_address, node.network_node_id, node_info,
+                                     network_info.max_nodes, network_info.total_nodes);
+        // TODO(Subv): Encrypt the packet.
+        eapol_logoff.destination_address = packet.transmitter_address;
+        eapol_logoff.type = WifiPacket::PacketType::Data;
+
+        SendPacket(eapol_logoff);
+        // TODO(B3N30): Broadcast updated node list
+        // The 3ds does this presumably to support spectators.
+        std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
+        connection_status_event->Signal();
+    } else {
+        if (connection_status.status != static_cast<u32>(NetworkStatus::NotConnected)) {
+            LOG_DEBUG(Service_NWM, "Connection sequence aborted, because connection status is %u",
+                      connection_status.status);
+            return;
+        }
+        auto logoff = ParseEAPoLLogoffFrame(packet.data);
+
+        network_info.total_nodes = logoff.connected_nodes;
+        network_info.max_nodes = logoff.max_nodes;
+
+        connection_status.network_node_id = logoff.assigned_node_id;
+        connection_status.total_nodes = logoff.connected_nodes;
+        connection_status.max_nodes = logoff.max_nodes;
+
+        node_info.clear();
+        node_info.reserve(network_info.max_nodes);
+        // connected_nodes is read from the received frame; never index past the node array.
+        const size_t node_count = std::min<size_t>(logoff.connected_nodes, logoff.nodes.size());
+        for (size_t index = 0; index < node_count; ++index) {
+            connection_status.node_bitmask |= 1 << index;
+            connection_status.changed_nodes |= 1 << index;
+            connection_status.nodes[index] = logoff.nodes[index].network_node_id;
+            node_info.emplace_back(DeserializeNodeInfo(logoff.nodes[index]));
+        }
+
+        // We're now connected, signal the application
+        connection_status.status = static_cast<u32>(NetworkStatus::ConnectedAsClient);
+        // Some games require ConnectToNetwork to block, for now it doesn't. If blocking is
+        // implemented this lock needs to be changed, otherwise it might cause deadlocks.
+        std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
+        connection_status_event->Signal();
+    }
 }
 
 /*
@@ -113,35 +247,46 @@ static u16 GetNextAvailableNodeId() {
  * authentication frame with SEQ1.
  */
 void StartConnectionSequence(const MacAddress& server) {
-    ASSERT(connection_status.status == static_cast<u32>(NetworkStatus::NotConnected));
-
-    // TODO(Subv): Handle timeout.
-
-    // Send an authentication frame with SEQ1
     using Network::WifiPacket;
     WifiPacket auth_request;
-    auth_request.channel = network_channel;
-    auth_request.data = GenerateAuthenticationFrame(AuthenticationSeq::SEQ1);
-    auth_request.destination_address = server;
-    auth_request.type = WifiPacket::PacketType::Authentication;
+    {
+        std::lock_guard<std::mutex> lock(connection_status_mutex);
+        ASSERT(connection_status.status == static_cast<u32>(NetworkStatus::NotConnected));
+
+        // TODO(Subv): Handle timeout.
+
+        // Send an authentication frame with SEQ1
+        auth_request.channel = network_channel;
+        auth_request.data = GenerateAuthenticationFrame(AuthenticationSeq::SEQ1);
+        auth_request.destination_address = server;
+        auth_request.type = WifiPacket::PacketType::Authentication;
+    }
 
     SendPacket(auth_request);
 }
 
 /// Sends an Association Response frame to the specified mac address
 void SendAssociationResponseFrame(const MacAddress& address) {
-    ASSERT_MSG(connection_status.status == static_cast<u32>(NetworkStatus::ConnectedAsHost));
-
     using Network::WifiPacket;
     WifiPacket assoc_response;
-    assoc_response.channel = network_channel;
-    // TODO(Subv): This will cause multiple clients to end up with the same association id, but
-    // we're not using that for anything.
-    u16 association_id = 1;
-    assoc_response.data = GenerateAssocResponseFrame(AssocStatus::Successful, association_id,
-                                                     network_info.network_id);
-    assoc_response.destination_address = address;
-    assoc_response.type = WifiPacket::PacketType::AssociationResponse;
+
+    {
+        std::lock_guard<std::mutex> lock(connection_status_mutex);
+        if (connection_status.status != static_cast<u32>(NetworkStatus::ConnectedAsHost)) {
+            LOG_ERROR(Service_NWM, "Connection sequence aborted, because connection status is %u",
+                      connection_status.status);
+            return;
+        }
+
+        assoc_response.channel = network_channel;
+        // TODO(Subv): This will cause multiple clients to end up with the same association id, but
+        // we're not using that for anything.
+        u16 association_id = 1;
+        assoc_response.data = GenerateAssocResponseFrame(AssocStatus::Successful, association_id,
+                                                         network_info.network_id);
+        assoc_response.destination_address = address;
+        assoc_response.type = WifiPacket::PacketType::AssociationResponse;
+    }
 
     SendPacket(assoc_response);
 }
@@ -155,22 +300,40 @@ void SendAssociationResponseFrame(const MacAddress& address) {
 void HandleAuthenticationFrame(const Network::WifiPacket& packet) {
     // Only the SEQ1 auth frame is handled here, the SEQ2 frame doesn't need any special behavior
     if (GetAuthenticationSeqNumber(packet.data) == AuthenticationSeq::SEQ1) {
-        ASSERT_MSG(connection_status.status == static_cast<u32>(NetworkStatus::ConnectedAsHost));
-
-        // Respond with an authentication response frame with SEQ2
         using Network::WifiPacket;
         WifiPacket auth_request;
-        auth_request.channel = network_channel;
-        auth_request.data = GenerateAuthenticationFrame(AuthenticationSeq::SEQ2);
-        auth_request.destination_address = packet.transmitter_address;
-        auth_request.type = WifiPacket::PacketType::Authentication;
-
+        {
+            std::lock_guard<std::mutex> lock(connection_status_mutex);
+            if (connection_status.status != static_cast<u32>(NetworkStatus::ConnectedAsHost)) {
+                LOG_ERROR(Service_NWM,
+                          "Connection sequence aborted, because connection status is %u",
+                          connection_status.status);
+                return;
+            }
+
+            // Respond with an authentication response frame with SEQ2
+            auth_request.channel = network_channel;
+            auth_request.data = GenerateAuthenticationFrame(AuthenticationSeq::SEQ2);
+            auth_request.destination_address = packet.transmitter_address;
+            auth_request.type = WifiPacket::PacketType::Authentication;
+        }
         SendPacket(auth_request);
 
         SendAssociationResponseFrame(packet.transmitter_address);
     }
 }
 
+/// Routes a received data frame to the handler for the EtherType found in its payload.
+static void HandleDataFrame(const Network::WifiPacket& packet) {
+    const EtherType ether_type = GetFrameEtherType(packet.data);
+
+    if (ether_type == EtherType::EAPoL) {
+        HandleEAPoLPacket(packet);
+        return;
+    }
+    // TODO(B3N30): Handle SecureData packets
+}
+
 /// Callback to parse and handle a received wifi packet.
 void OnWifiPacketReceived(const Network::WifiPacket& packet) {
     switch (packet.type) {
@@ -180,6 +343,12 @@ void OnWifiPacketReceived(const Network::WifiPacket& packet) {
     case Network::WifiPacket::PacketType::Authentication:
         HandleAuthenticationFrame(packet);
         break;
+    case Network::WifiPacket::PacketType::AssociationResponse:
+        HandleAssociationResponseFrame(packet);
+        break;
+    case Network::WifiPacket::PacketType::Data:
+        HandleDataFrame(packet);
+        break;
     }
 }
 
@@ -246,7 +415,7 @@ static void RecvBeaconBroadcastData(Interface* self) {
     auto beacons = GetReceivedBeacons(mac_address);
 
     BeaconDataReplyHeader data_reply_header{};
-    data_reply_header.total_entries = beacons.size();
+    data_reply_header.total_entries = static_cast<u32>(beacons.size());
     data_reply_header.max_output_size = out_buffer_size;
 
     Memory::WriteBlock(current_buffer_pos, &data_reply_header, sizeof(BeaconDataReplyHeader));
@@ -256,8 +425,8 @@ static void RecvBeaconBroadcastData(Interface* self) {
     for (const auto& beacon : beacons) {
         BeaconEntryHeader entry{};
         // TODO(Subv): Figure out what this size is used for.
-        entry.unk_size = sizeof(BeaconEntryHeader) + beacon.data.size();
-        entry.total_size = sizeof(BeaconEntryHeader) + beacon.data.size();
+        entry.unk_size = static_cast<u32>(sizeof(BeaconEntryHeader) + beacon.data.size());
+        entry.total_size = static_cast<u32>(sizeof(BeaconEntryHeader) + beacon.data.size());
         entry.wifi_channel = beacon.channel;
         entry.header_size = sizeof(BeaconEntryHeader);
         entry.mac_address = beacon.transmitter_address;
@@ -268,9 +437,9 @@ static void RecvBeaconBroadcastData(Interface* self) {
         current_buffer_pos += sizeof(BeaconEntryHeader);
 
         Memory::WriteBlock(current_buffer_pos, beacon.data.data(), beacon.data.size());
-        current_buffer_pos += beacon.data.size();
+        current_buffer_pos += static_cast<VAddr>(beacon.data.size());
 
-        total_size += sizeof(BeaconEntryHeader) + beacon.data.size();
+        total_size += static_cast<u32>(sizeof(BeaconEntryHeader) + beacon.data.size());
     }
 
     // Update the total size in the structure and write it to the buffer again.
@@ -305,7 +474,7 @@ static void InitializeWithVersion(Interface* self) {
     u32 sharedmem_size = rp.Pop<u32>();
 
     // Update the node information with the data the game gave us.
-    rp.PopRaw(node_info[0]);
+    rp.PopRaw(current_node);
 
     u16 version = rp.Pop<u16>();
 
@@ -315,10 +484,14 @@ static void InitializeWithVersion(Interface* self) {
 
     ASSERT_MSG(recv_buffer_memory->size == sharedmem_size, "Invalid shared memory size.");
 
-    // Reset the connection status, it contains all zeros after initialization,
-    // except for the actual status value.
-    connection_status = {};
-    connection_status.status = static_cast<u32>(NetworkStatus::NotConnected);
+    {
+        std::lock_guard<std::mutex> lock(connection_status_mutex);
+
+        // Reset the connection status, it contains all zeros after initialization,
+        // except for the actual status value.
+        connection_status = {};
+        connection_status.status = static_cast<u32>(NetworkStatus::NotConnected);
+    }
 
     IPC::RequestBuilder rb = rp.MakeBuilder(1, 2);
     rb.Push(RESULT_SUCCESS);
@@ -348,12 +521,16 @@ static void GetConnectionStatus(Interface* self) {
     IPC::RequestBuilder rb = rp.MakeBuilder(13, 0);
 
     rb.Push(RESULT_SUCCESS);
-    rb.PushRaw(connection_status);
-
-    // Reset the bitmask of changed nodes after each call to this
-    // function to prevent falsely informing games of outstanding
-    // changes in subsequent calls.
-    connection_status.changed_nodes = 0;
+    {
+        std::lock_guard<std::mutex> lock(connection_status_mutex);
+        rb.PushRaw(connection_status);
+
+        // Reset the bitmask of changed nodes after each call to this
+        // function to prevent falsely informing games of outstanding
+        // changes in subsequent calls.
+        // TODO(Subv): Find exactly where the NWM module resets this value.
+        connection_status.changed_nodes = 0;
+    }
 
     LOG_DEBUG(Service_NWM, "called");
 }
@@ -434,31 +611,36 @@ static void BeginHostingNetwork(Interface* self) {
     // The real UDS module throws a fatal error if this assert fails.
     ASSERT_MSG(network_info.max_nodes > 1, "Trying to host a network of only one member.");
 
-    connection_status.status = static_cast<u32>(NetworkStatus::ConnectedAsHost);
-
-    // Ensure the application data size is less than the maximum value.
-    ASSERT_MSG(network_info.application_data_size <= ApplicationDataSize, "Data size is too big.");
-
-    // Set up basic information for this network.
-    network_info.oui_value = NintendoOUI;
-    network_info.oui_type = static_cast<u8>(NintendoTagId::NetworkInfo);
-
-    connection_status.max_nodes = network_info.max_nodes;
-
-    // Resize the nodes list to hold max_nodes.
-    node_info.resize(network_info.max_nodes);
-
-    // There's currently only one node in the network (the host).
-    connection_status.total_nodes = 1;
-    network_info.total_nodes = 1;
-    // The host is always the first node
-    connection_status.network_node_id = 1;
-    node_info[0].network_node_id = 1;
-    connection_status.nodes[0] = connection_status.network_node_id;
-    // Set the bit 0 in the nodes bitmask to indicate that node 1 is already taken.
-    connection_status.node_bitmask |= 1;
-    // Notify the application that the first node was set.
-    connection_status.changed_nodes |= 1;
+    {
+        std::lock_guard<std::mutex> lock(connection_status_mutex);
+        connection_status.status = static_cast<u32>(NetworkStatus::ConnectedAsHost);
+
+        // Ensure the application data size is less than the maximum value.
+        ASSERT_MSG(network_info.application_data_size <= ApplicationDataSize,
+                   "Data size is too big.");
+
+        // Set up basic information for this network.
+        network_info.oui_value = NintendoOUI;
+        network_info.oui_type = static_cast<u8>(NintendoTagId::NetworkInfo);
+
+        connection_status.max_nodes = network_info.max_nodes;
+
+        // Resize the nodes list to hold max_nodes.
+        node_info.resize(network_info.max_nodes);
+
+        // There's currently only one node in the network (the host).
+        connection_status.total_nodes = 1;
+        network_info.total_nodes = 1;
+        // The host is always the first node
+        connection_status.network_node_id = 1;
+        current_node.network_node_id = 1;
+        connection_status.nodes[0] = connection_status.network_node_id;
+        // Set the bit 0 in the nodes bitmask to indicate that node 1 is already taken.
+        connection_status.node_bitmask |= 1;
+        // Notify the application that the first node was set.
+        connection_status.changed_nodes |= 1;
+        node_info[0] = current_node;
+    }
 
     // If the game has a preferred channel, use that instead.
     if (network_info.channel != 0)
@@ -495,9 +677,13 @@ static void DestroyNetwork(Interface* self) {
     // Unschedule the beacon broadcast event.
     CoreTiming::UnscheduleEvent(beacon_broadcast_event, 0);
 
-    // TODO(Subv): Check if connection_status is indeed reset after this call.
-    connection_status = {};
-    connection_status.status = static_cast<u8>(NetworkStatus::NotConnected);
+    {
+        std::lock_guard<std::mutex> lock(connection_status_mutex);
+
+        // TODO(Subv): Check if connection_status is indeed reset after this call.
+        connection_status = {};
+        connection_status.status = static_cast<u8>(NetworkStatus::NotConnected);
+    }
     connection_status_event->Signal();
 
     IPC::RequestBuilder rb = rp.MakeBuilder(1, 0);
@@ -540,17 +726,24 @@ static void SendTo(Interface* self) {
 
     IPC::RequestBuilder rb = rp.MakeBuilder(1, 0);
 
-    if (connection_status.status != static_cast<u32>(NetworkStatus::ConnectedAsClient) &&
-        connection_status.status != static_cast<u32>(NetworkStatus::ConnectedAsHost)) {
-        rb.Push(ResultCode(ErrorDescription::NotAuthorized, ErrorModule::UDS,
-                           ErrorSummary::InvalidState, ErrorLevel::Status));
-        return;
-    }
+    u16 network_node_id;
 
-    if (dest_node_id == connection_status.network_node_id) {
-        rb.Push(ResultCode(ErrorDescription::NotFound, ErrorModule::UDS,
-                           ErrorSummary::WrongArgument, ErrorLevel::Status));
-        return;
+    {
+        std::lock_guard<std::mutex> lock(connection_status_mutex);
+        if (connection_status.status != static_cast<u32>(NetworkStatus::ConnectedAsClient) &&
+            connection_status.status != static_cast<u32>(NetworkStatus::ConnectedAsHost)) {
+            rb.Push(ResultCode(ErrorDescription::NotAuthorized, ErrorModule::UDS,
+                               ErrorSummary::InvalidState, ErrorLevel::Status));
+            return;
+        }
+
+        if (dest_node_id == connection_status.network_node_id) {
+            rb.Push(ResultCode(ErrorDescription::NotFound, ErrorModule::UDS,
+                               ErrorSummary::WrongArgument, ErrorLevel::Status));
+            return;
+        }
+
+        network_node_id = connection_status.network_node_id;
     }
 
     // TODO(Subv): Do something with the flags.
@@ -567,8 +760,8 @@ static void SendTo(Interface* self) {
 
     // TODO(Subv): Increment the sequence number after each sent packet.
     u16 sequence_number = 0;
-    std::vector<u8> data_payload = GenerateDataPayload(
-        data, data_channel, dest_node_id, connection_status.network_node_id, sequence_number);
+    std::vector<u8> data_payload =
+        GenerateDataPayload(data, data_channel, dest_node_id, network_node_id, sequence_number);
 
     // TODO(Subv): Retrieve the MAC address of the dest_node_id and our own to encrypt
     // and encapsulate the payload.
@@ -595,6 +788,7 @@ static void GetChannel(Interface* self) {
     IPC::RequestParser rp(Kernel::GetCommandBuffer(), 0x1A, 0, 0);
     IPC::RequestBuilder rb = rp.MakeBuilder(2, 0);
 
+    std::lock_guard<std::mutex> lock(connection_status_mutex);
     bool is_connected = connection_status.status != static_cast<u32>(NetworkStatus::NotConnected);
 
     u8 channel = is_connected ? network_channel : 0;
@@ -766,6 +960,7 @@ static void BeaconBroadcastCallback(u64 userdata, int cycles_late) {
  * @param network_node_id Network Node Id of the connecting client.
  */
 void OnClientConnected(u16 network_node_id) {
+    std::lock_guard<std::mutex> lock(connection_status_mutex);
     ASSERT_MSG(connection_status.status == static_cast<u32>(NetworkStatus::ConnectedAsHost),
                "Can not accept clients if we're not hosting a network");
     ASSERT_MSG(connection_status.total_nodes < connection_status.max_nodes,
@@ -827,8 +1022,11 @@ NWM_UDS::~NWM_UDS() {
     connection_status_event = nullptr;
     recv_buffer_memory = nullptr;
 
-    connection_status = {};
-    connection_status.status = static_cast<u32>(NetworkStatus::NotConnected);
+    {
+        std::lock_guard<std::mutex> lock(connection_status_mutex);
+        connection_status = {};
+        connection_status.status = static_cast<u32>(NetworkStatus::NotConnected);
+    }
 
     CoreTiming::UnscheduleEvent(beacon_broadcast_event, 0);
 }
diff --git a/src/core/hle/service/nwm/uds_beacon.cpp b/src/core/hle/service/nwm/uds_beacon.cpp
index 552eaf65e..73a80d940 100644
--- a/src/core/hle/service/nwm/uds_beacon.cpp
+++ b/src/core/hle/service/nwm/uds_beacon.cpp
@@ -243,7 +243,7 @@ std::vector<u8> GenerateNintendoFirstEncryptedDataTag(const NetworkInfo& network
 
     EncryptedDataTag tag{};
     tag.header.tag_id = static_cast<u8>(TagId::VendorSpecific);
-    tag.header.length = sizeof(tag) - sizeof(TagHeader) + payload_size;
+    tag.header.length = static_cast<u8>(sizeof(tag) - sizeof(TagHeader) + payload_size);
     tag.oui_type = static_cast<u8>(NintendoTagId::EncryptedData0);
     tag.oui = NintendoOUI;
 
@@ -279,7 +279,7 @@ std::vector<u8> GenerateNintendoSecondEncryptedDataTag(const NetworkInfo& networ
 
     EncryptedDataTag tag{};
     tag.header.tag_id = static_cast<u8>(TagId::VendorSpecific);
-    tag.header.length = tag_length;
+    tag.header.length = static_cast<u8>(tag_length);
     tag.oui_type = static_cast<u8>(NintendoTagId::EncryptedData1);
     tag.oui = NintendoOUI;
 
diff --git a/src/core/hle/service/nwm/uds_connection.cpp b/src/core/hle/service/nwm/uds_connection.cpp
index c8a76ec2a..c74f51253 100644
--- a/src/core/hle/service/nwm/uds_connection.cpp
+++ b/src/core/hle/service/nwm/uds_connection.cpp
@@ -75,5 +75,14 @@ std::vector<u8> GenerateAssocResponseFrame(AssocStatus status, u16 association_i
     return data;
 }
 
+std::tuple<AssocStatus, u16> GetAssociationResult(const std::vector<u8>& body) {
+    // NOTE(review): body is assumed to hold a full AssociationResponseFrame; size is not checked.
+    constexpr u16 AssociationIdMask = 0x3FFF; // keeps the low 14 bits of assoc_id
+    AssociationResponseFrame frame;
+    memcpy(&frame, body.data(), sizeof(frame));
+    const auto status = static_cast<AssocStatus>(frame.status_code);
+    return std::make_tuple(status, frame.assoc_id & AssociationIdMask);
+}
+
 } // namespace NWM
 } // namespace Service
diff --git a/src/core/hle/service/nwm/uds_connection.h b/src/core/hle/service/nwm/uds_connection.h
index 73f55a4fd..a664f8471 100644
--- a/src/core/hle/service/nwm/uds_connection.h
+++ b/src/core/hle/service/nwm/uds_connection.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <tuple>
 #include <vector>
 #include "common/common_types.h"
 #include "common/swap.h"
@@ -47,5 +48,9 @@ AuthenticationSeq GetAuthenticationSeqNumber(const std::vector<u8>& body);
 /// network id, starting at the frame body.
 std::vector<u8> GenerateAssocResponseFrame(AssocStatus status, u16 association_id, u32 network_id);
 
+/// Returns a tuple of (association status, association id) from the body of an AssociationResponse
+/// frame.
+std::tuple<AssocStatus, u16> GetAssociationResult(const std::vector<u8>& body);
+
 } // namespace NWM
 } // namespace Service
diff --git a/src/core/hle/service/nwm/uds_data.cpp b/src/core/hle/service/nwm/uds_data.cpp
index 8c6742dba..4b389710f 100644
--- a/src/core/hle/service/nwm/uds_data.cpp
+++ b/src/core/hle/service/nwm/uds_data.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <algorithm>
 #include <cstring>
 #include <cryptopp/aes.h>
 #include <cryptopp/ccm.h>
@@ -197,7 +198,7 @@ static std::vector<u8> DecryptDataFrame(const std::vector<u8>& encrypted_payload
         df.ChannelMessageEnd(CryptoPP::DEFAULT_CHANNEL);
         df.SetRetrievalChannel(CryptoPP::DEFAULT_CHANNEL);
 
-        int size = df.MaxRetrievable();
+        size_t size = df.MaxRetrievable();
 
         std::vector<u8> pdata(size);
         df.Get(pdata.data(), size);
@@ -251,7 +252,7 @@ static std::vector<u8> EncryptDataFrame(const std::vector<u8>& payload,
 
         df.SetRetrievalChannel(CryptoPP::DEFAULT_CHANNEL);
 
-        int size = df.MaxRetrievable();
+        size_t size = df.MaxRetrievable();
 
         std::vector<u8> cipher(size);
         df.Get(cipher.data(), size);
@@ -266,13 +267,107 @@ static std::vector<u8> EncryptDataFrame(const std::vector<u8>& payload,
 std::vector<u8> GenerateDataPayload(const std::vector<u8>& data, u8 channel, u16 dest_node,
                                     u16 src_node, u16 sequence_number) {
     std::vector<u8> buffer = GenerateLLCHeader(EtherType::SecureData);
-    std::vector<u8> securedata_header =
-        GenerateSecureDataHeader(data.size(), channel, dest_node, src_node, sequence_number);
+    std::vector<u8> securedata_header = GenerateSecureDataHeader(
+        static_cast<u16>(data.size()), channel, dest_node, src_node, sequence_number);
 
     buffer.insert(buffer.end(), securedata_header.begin(), securedata_header.end());
     buffer.insert(buffer.end(), data.begin(), data.end());
     return buffer;
 }
 
+std::vector<u8> GenerateEAPoLStartFrame(u16 association_id, const NodeInfo& node_info) {
+    // Note: The network_node_id and unknown bytes seem to be uninitialized in the NWM module.
+    // TODO(B3N30): The last 8 bytes seem to have a fixed value of 07 88 15 00 04 e9 13 00 in
+    // EAPoL-Start packets from different 3DSs to the same host during a Super Smash Bros. 4 game.
+    // Find out what that means.
+    EAPoLStartPacket packet{};
+    packet.association_id = association_id;
+
+    auto& packet_node = packet.node;
+    packet_node.friend_code_seed = node_info.friend_code_seed;
+    std::copy(node_info.username.begin(), node_info.username.end(), packet_node.username.begin());
+
+    std::vector<u8> packet_bytes(sizeof(packet));
+    std::memcpy(packet_bytes.data(), &packet, packet_bytes.size());
+
+    // The frame body is the LLC header (EtherType EAPoL) followed by the packet bytes.
+    std::vector<u8> frame = GenerateLLCHeader(EtherType::EAPoL);
+    frame.insert(frame.end(), packet_bytes.begin(), packet_bytes.end());
+    return frame;
+}
+
+EtherType GetFrameEtherType(const std::vector<u8>& frame) {
+    LLCHeader header{}; // Frames shorter than an LLC header are not read at all
+    if (frame.size() >= sizeof(header))
+        std::memcpy(&header, frame.data(), sizeof(header));
+    u16 ethertype = header.protocol;
+    return static_cast<EtherType>(ethertype);
+}
+
+u16 GetEAPoLFrameType(const std::vector<u8>& frame) {
+    u16_be eapol_type{}; // Left at its initial value when the frame ends before this field
+    if (frame.size() >= sizeof(LLCHeader) + sizeof(eapol_type))
+        std::memcpy(&eapol_type, frame.data() + sizeof(LLCHeader), sizeof(eapol_type));
+    return eapol_type;
+}
+
+NodeInfo DeserializeNodeInfoFromFrame(const std::vector<u8>& frame) {
+    EAPoLStartPacket eapol_start{};
+
+    // Skip the LLC header and copy no more payload than the frame actually holds.
+    if (frame.size() > sizeof(LLCHeader))
+        std::memcpy(&eapol_start, frame.data() + sizeof(LLCHeader),
+                    std::min(frame.size() - sizeof(LLCHeader), sizeof(eapol_start)));
+
+    NodeInfo node{};
+    node.friend_code_seed = eapol_start.node.friend_code_seed;
+    std::copy(eapol_start.node.username.begin(), eapol_start.node.username.end(),
+              node.username.begin());
+    return node;
+}
+
+NodeInfo DeserializeNodeInfo(const EAPoLNodeInfo& node) {
+    // Field-by-field copy from the EAPoL packet layout into a zero-initialised NodeInfo.
+    NodeInfo result{};
+    std::copy(node.username.begin(), node.username.end(), result.username.begin());
+    result.network_node_id = node.network_node_id;
+    result.friend_code_seed = node.friend_code_seed;
+
+    return result;
+}
+
+/// Builds an EAPoL-Logoff frame body: the LLC header followed by the raw EAPoLLogoffPacket.
+std::vector<u8> GenerateEAPoLLogoffFrame(const MacAddress& mac_address, u16 network_node_id,
+                                         const NodeList& nodes, u8 max_nodes, u8 total_nodes) {
+    EAPoLLogoffPacket eapol_logoff{};
+    eapol_logoff.assigned_node_id = network_node_id;
+    eapol_logoff.client_mac_address = mac_address; // was accepted but never stored before
+    eapol_logoff.connected_nodes = total_nodes;
+    eapol_logoff.max_nodes = max_nodes;
+
+    for (size_t index = 0; index < total_nodes; ++index) {
+        const auto& node_info = nodes[index];
+        auto& node = eapol_logoff.nodes[index];
+        node.friend_code_seed = node_info.friend_code_seed;
+        node.network_node_id = node_info.network_node_id;
+        std::copy(node_info.username.begin(), node_info.username.end(), node.username.begin());
+    }
+
+    std::vector<u8> eapol_buffer(sizeof(EAPoLLogoffPacket));
+    std::memcpy(eapol_buffer.data(), &eapol_logoff, sizeof(eapol_logoff));
+
+    std::vector<u8> buffer = GenerateLLCHeader(EtherType::EAPoL);
+    buffer.insert(buffer.end(), eapol_buffer.begin(), eapol_buffer.end());
+    return buffer;
+}
+
+EAPoLLogoffPacket ParseEAPoLLogoffFrame(const std::vector<u8>& frame) {
+    EAPoLLogoffPacket eapol_logoff{}; // Bytes the frame does not provide keep their initial value
+    if (frame.size() > sizeof(LLCHeader))
+        std::memcpy(&eapol_logoff, frame.data() + sizeof(LLCHeader),
+                    std::min(frame.size() - sizeof(LLCHeader), sizeof(eapol_logoff)));
+    return eapol_logoff;
+}
+
 } // namespace NWM
 } // namespace Service
diff --git a/src/core/hle/service/nwm/uds_data.h b/src/core/hle/service/nwm/uds_data.h
index a23520a41..76bccb1bf 100644
--- a/src/core/hle/service/nwm/uds_data.h
+++ b/src/core/hle/service/nwm/uds_data.h
@@ -8,6 +8,7 @@
 #include <vector>
 #include "common/common_types.h"
 #include "common/swap.h"
+#include "core/hle/service/nwm/uds_beacon.h"
 #include "core/hle/service/service.h"
 
 namespace Service {
@@ -67,6 +68,49 @@ struct DataFrameCryptoCTR {
 
 static_assert(sizeof(DataFrameCryptoCTR) == 16, "DataFrameCryptoCTR has the wrong size");
 
+struct EAPoLNodeInfo { // Per-node record embedded in the EAPoL Start and Logoff packets
+    u64_be friend_code_seed; // offset 0x00
+    std::array<u16_be, 10> username; // offset 0x08, ten 16-bit units
+    INSERT_PADDING_BYTES(4); // offset 0x1C
+    u16_be network_node_id; // offset 0x20
+    INSERT_PADDING_BYTES(6); // offset 0x22, brings the struct to the asserted 0x28 bytes
+};
+
+static_assert(sizeof(EAPoLNodeInfo) == 0x28, "EAPoLNodeInfo has the wrong size");
+
+constexpr u16 EAPoLStartMagic = 0x201;
+
+/*
+ * Nintendo EAPoLStartPacket, used to initialize a connection between client and host.
+ */
+struct EAPoLStartPacket {
+    u16_be magic = EAPoLStartMagic; // offset 0x00
+    u16_be association_id; // offset 0x02
+    // This value is hardcoded to 1 in the NWM module.
+    u16_be unknown = 1; // offset 0x04
+    INSERT_PADDING_BYTES(2); // offset 0x06
+    EAPoLNodeInfo node; // offset 0x08, fills the rest of the asserted 0x30 bytes
+};
+
+static_assert(sizeof(EAPoLStartPacket) == 0x30, "EAPoLStartPacket has the wrong size");
+
+constexpr u16 EAPoLLogoffMagic = 0x202;
+
+struct EAPoLLogoffPacket { // Packet body written by GenerateEAPoLLogoffFrame
+    u16_be magic = EAPoLLogoffMagic;
+    INSERT_PADDING_BYTES(2);
+    u16_be assigned_node_id; // GenerateEAPoLLogoffFrame stores its network_node_id argument here
+    MacAddress client_mac_address;
+    INSERT_PADDING_BYTES(6);
+    u8 connected_nodes; // GenerateEAPoLLogoffFrame stores its total_nodes argument here
+    u8 max_nodes;
+    INSERT_PADDING_BYTES(4);
+    // Fixed-size table of UDSMaxNodes entries, regardless of how many nodes are connected.
+    std::array<EAPoLNodeInfo, UDSMaxNodes> nodes;
+};
+
+static_assert(sizeof(EAPoLLogoffPacket) == 0x298, "EAPoLLogoffPacket has the wrong size");
+
 /**
  * Generates an unencrypted 802.11 data payload.
  * @returns The generated frame payload.
@@ -74,5 +118,47 @@ static_assert(sizeof(DataFrameCryptoCTR) == 16, "DataFrameCryptoCTR has the wron
 std::vector<u8> GenerateDataPayload(const std::vector<u8>& data, u8 channel, u16 dest_node,
                                     u16 src_node, u16 sequence_number);
 
+/*
+ * Generates an unencrypted 802.11 data frame body with the EAPoL-Start format for UDS
+ * communication.
+ * @returns The generated frame body.
+ */
+std::vector<u8> GenerateEAPoLStartFrame(u16 association_id, const NodeInfo& node_info);
+
+/*
+ * Returns the EtherType of the specified 802.11 frame.
+ */
+EtherType GetFrameEtherType(const std::vector<u8>& frame);
+
+/*
+ * Returns the EAPoL type (Start / Logoff) of the specified 802.11 frame.
+ * Note: The frame *must* be an EAPoL frame.
+ */
+u16 GetEAPoLFrameType(const std::vector<u8>& frame);
+
+/*
+ * Returns a deserialized NodeInfo structure from the information inside an EAPoL-Start packet
+ * encapsulated in an 802.11 data frame.
+ */
+NodeInfo DeserializeNodeInfoFromFrame(const std::vector<u8>& frame);
+
+/*
+ * Returns a NodeInfo constructed from the data in the specified EAPoLNodeInfo.
+ */
+NodeInfo DeserializeNodeInfo(const EAPoLNodeInfo& node);
+
+/*
+ * Generates an unencrypted 802.11 data frame body with the EAPoL-Logoff format for UDS
+ * communication.
+ * @returns The generated frame body.
+ */
+std::vector<u8> GenerateEAPoLLogoffFrame(const MacAddress& mac_address, u16 network_node_id,
+                                         const NodeList& nodes, u8 max_nodes, u8 total_nodes);
+
+/*
+ * Returns an EAPoLLogoffPacket representing the specified 802.11-encapsulated data frame.
+ */
+EAPoLLogoffPacket ParseEAPoLLogoffFrame(const std::vector<u8>& frame);
+
 } // namespace NWM
 } // namespace Service
diff --git a/src/core/hle/service/sm/sm.cpp b/src/core/hle/service/sm/sm.cpp
index 5e7fc68f9..854ab9a05 100644
--- a/src/core/hle/service/sm/sm.cpp
+++ b/src/core/hle/service/sm/sm.cpp
@@ -36,6 +36,10 @@ ResultVal<Kernel::SharedPtr<Kernel::ServerPort>> ServiceManager::RegisterService
     std::string name, unsigned int max_sessions) {
 
     CASCADE_CODE(ValidateServiceName(name));
+
+    if (registered_services.find(name) != registered_services.end())
+        return ERR_ALREADY_REGISTERED;
+
     Kernel::SharedPtr<Kernel::ServerPort> server_port;
     Kernel::SharedPtr<Kernel::ClientPort> client_port;
     std::tie(server_port, client_port) = Kernel::ServerPort::CreatePortPair(max_sessions, name);
diff --git a/src/core/hle/service/sm/sm.h b/src/core/hle/service/sm/sm.h
index 8f0dbf2db..9f60a7965 100644
--- a/src/core/hle/service/sm/sm.h
+++ b/src/core/hle/service/sm/sm.h
@@ -32,6 +32,9 @@ constexpr ResultCode ERR_ACCESS_DENIED(6, ErrorModule::SRV, ErrorSummary::Invali
                                        ErrorLevel::Permanent); // 0xD8E06406
 constexpr ResultCode ERR_NAME_CONTAINS_NUL(7, ErrorModule::SRV, ErrorSummary::WrongArgument,
                                            ErrorLevel::Permanent); // 0xD9006407
+constexpr ResultCode ERR_ALREADY_REGISTERED(ErrorDescription::AlreadyExists, ErrorModule::OS,
+                                            ErrorSummary::WrongArgument,
+                                            ErrorLevel::Permanent); // 0xD9001BFC
 
 class ServiceManager {
 public:
diff --git a/src/core/hle/service/sm/srv.cpp b/src/core/hle/service/sm/srv.cpp
index 352941e69..5c955cf54 100644
--- a/src/core/hle/service/sm/srv.cpp
+++ b/src/core/hle/service/sm/srv.cpp
@@ -13,6 +13,7 @@
 #include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/hle_ipc.h"
 #include "core/hle/kernel/semaphore.h"
+#include "core/hle/kernel/server_port.h"
 #include "core/hle/kernel/server_session.h"
 #include "core/hle/service/sm/sm.h"
 #include "core/hle/service/sm/srv.h"
@@ -184,12 +185,35 @@ void SRV::PublishToSubscriber(Kernel::HLERequestContext& ctx) {
                 flags);
 }
 
+void SRV::RegisterService(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp(ctx, 0x3, 4, 0);
+    // Parameters are popped in order: 8 raw name bytes, the name length, the session limit.
+    const auto raw_name = rp.PopRaw<std::array<char, 8>>();
+    const size_t requested_len = rp.Pop<u32>();
+    const u32 max_sessions = rp.Pop<u32>();
+
+    // Cap the requested length at the buffer size so the name never extends past raw_name.
+    const size_t name_len = std::min(requested_len, raw_name.size());
+    const std::string name(raw_name.data(), name_len);
+
+    auto port = service_manager->RegisterService(name, max_sessions);
+    if (!port.Failed()) {
+        IPC::RequestBuilder rb = rp.MakeBuilder(1, 2);
+        rb.Push(RESULT_SUCCESS);
+        rb.PushObjects(port.Unwrap());
+        return;
+    }
+    IPC::RequestBuilder rb = rp.MakeBuilder(1, 0);
+    rb.Push(port.Code());
+    LOG_ERROR(Service_SRV, "called service=%s -> error 0x%08X", name.c_str(), port.Code().raw);
+}
+
 SRV::SRV(std::shared_ptr<ServiceManager> service_manager)
     : ServiceFramework("srv:", 4), service_manager(std::move(service_manager)) {
     static const FunctionInfo functions[] = {
         {0x00010002, &SRV::RegisterClient, "RegisterClient"},
         {0x00020000, &SRV::EnableNotification, "EnableNotification"},
-        {0x00030100, nullptr, "RegisterService"},
+        {0x00030100, &SRV::RegisterService, "RegisterService"},
         {0x000400C0, nullptr, "UnregisterService"},
         {0x00050100, &SRV::GetServiceHandle, "GetServiceHandle"},
         {0x000600C2, nullptr, "RegisterPort"},
diff --git a/src/core/hle/service/sm/srv.h b/src/core/hle/service/sm/srv.h
index 75cca5184..aad839563 100644
--- a/src/core/hle/service/sm/srv.h
+++ b/src/core/hle/service/sm/srv.h
@@ -28,6 +28,7 @@ private:
     void Subscribe(Kernel::HLERequestContext& ctx);
     void Unsubscribe(Kernel::HLERequestContext& ctx);
     void PublishToSubscriber(Kernel::HLERequestContext& ctx);
+    void RegisterService(Kernel::HLERequestContext& ctx);
 
     std::shared_ptr<ServiceManager> service_manager;
     Kernel::SharedPtr<Kernel::Semaphore> notification_semaphore;
diff --git a/src/core/hle/svc.cpp b/src/core/hle/svc.cpp
index dfc36748c..6be5db13f 100644
--- a/src/core/hle/svc.cpp
+++ b/src/core/hle/svc.cpp
@@ -271,6 +271,24 @@ static ResultCode WaitSynchronization1(Kernel::Handle handle, s64 nano_seconds)
         // Create an event to wake the thread up after the specified nanosecond delay has passed
         thread->WakeAfterDelay(nano_seconds);
 
+        thread->wakeup_callback = [](ThreadWakeupReason reason,
+                                     Kernel::SharedPtr<Kernel::Thread> thread,
+                                     Kernel::SharedPtr<Kernel::WaitObject> object) {
+
+            ASSERT(thread->status == THREADSTATUS_WAIT_SYNCH_ANY);
+
+            if (reason == ThreadWakeupReason::Timeout) {
+                thread->SetWaitSynchronizationResult(Kernel::RESULT_TIMEOUT);
+                return;
+            }
+
+            ASSERT(reason == ThreadWakeupReason::Signal);
+            thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
+
+            // WaitSynchronization1 doesn't have an output index like WaitSynchronizationN, so we
+            // don't have to do anything else here.
+        };
+
         Core::System::GetInstance().PrepareReschedule();
 
         // Note: The output of this SVC will be set to RESULT_SUCCESS if the thread
@@ -344,6 +362,23 @@ static ResultCode WaitSynchronizationN(s32* out, Kernel::Handle* handles, s32 ha
         // Create an event to wake the thread up after the specified nanosecond delay has passed
         thread->WakeAfterDelay(nano_seconds);
 
+        thread->wakeup_callback = [](ThreadWakeupReason reason,
+                                     Kernel::SharedPtr<Kernel::Thread> thread,
+                                     Kernel::SharedPtr<Kernel::WaitObject> object) {
+
+            ASSERT(thread->status == THREADSTATUS_WAIT_SYNCH_ALL);
+
+            if (reason == ThreadWakeupReason::Timeout) {
+                thread->SetWaitSynchronizationResult(Kernel::RESULT_TIMEOUT);
+                return;
+            }
+
+            ASSERT(reason == ThreadWakeupReason::Signal);
+
+            thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
+            // The wait_all case does not update the output index.
+        };
+
         Core::System::GetInstance().PrepareReschedule();
 
         // This value gets set to -1 by default in this case, it is not modified after this.
@@ -361,7 +396,7 @@ static ResultCode WaitSynchronizationN(s32* out, Kernel::Handle* handles, s32 ha
             // We found a ready object, acquire it and set the result value
             Kernel::WaitObject* object = itr->get();
             object->Acquire(thread);
-            *out = std::distance(objects.begin(), itr);
+            *out = static_cast<s32>(std::distance(objects.begin(), itr));
             return RESULT_SUCCESS;
         }
 
@@ -389,12 +424,28 @@ static ResultCode WaitSynchronizationN(s32* out, Kernel::Handle* handles, s32 ha
         // Create an event to wake the thread up after the specified nanosecond delay has passed
         thread->WakeAfterDelay(nano_seconds);
 
+        thread->wakeup_callback = [](ThreadWakeupReason reason,
+                                     Kernel::SharedPtr<Kernel::Thread> thread,
+                                     Kernel::SharedPtr<Kernel::WaitObject> object) {
+
+            ASSERT(thread->status == THREADSTATUS_WAIT_SYNCH_ANY);
+
+            if (reason == ThreadWakeupReason::Timeout) {
+                thread->SetWaitSynchronizationResult(Kernel::RESULT_TIMEOUT);
+                return;
+            }
+
+            ASSERT(reason == ThreadWakeupReason::Signal);
+
+            thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
+            thread->SetWaitSynchronizationOutput(thread->GetWaitObjectIndex(object.get()));
+        };
+
         Core::System::GetInstance().PrepareReschedule();
 
         // Note: The output of this SVC will be set to RESULT_SUCCESS if the thread resumes due to a
         // signal in one of its wait objects.
         // Otherwise we retain the default value of timeout, and -1 in the out parameter
-        thread->wait_set_output = true;
         *out = -1;
         return Kernel::RESULT_TIMEOUT;
     }
@@ -469,7 +520,7 @@ static ResultCode ReplyAndReceive(s32* index, Kernel::Handle* handles, s32 handl
         // We found a ready object, acquire it and set the result value
         Kernel::WaitObject* object = itr->get();
         object->Acquire(thread);
-        *index = std::distance(objects.begin(), itr);
+        *index = static_cast<s32>(std::distance(objects.begin(), itr));
 
         if (object->GetHandleType() == Kernel::HandleType::ServerSession) {
             auto server_session = static_cast<Kernel::ServerSession*>(object);
@@ -483,8 +534,6 @@ static ResultCode ReplyAndReceive(s32* index, Kernel::Handle* handles, s32 handl
 
     // No objects were ready to be acquired, prepare to suspend the thread.
 
-    // TODO(Subv): Perform IPC translation upon wakeup.
-
     // Put the thread to sleep
     thread->status = THREADSTATUS_WAIT_SYNCH_ANY;
 
@@ -496,12 +545,24 @@ static ResultCode ReplyAndReceive(s32* index, Kernel::Handle* handles, s32 handl
 
     thread->wait_objects = std::move(objects);
 
+    thread->wakeup_callback = [](ThreadWakeupReason reason,
+                                 Kernel::SharedPtr<Kernel::Thread> thread,
+                                 Kernel::SharedPtr<Kernel::WaitObject> object) {
+
+        ASSERT(thread->status == THREADSTATUS_WAIT_SYNCH_ANY);
+        ASSERT(reason == ThreadWakeupReason::Signal);
+
+        thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
+        thread->SetWaitSynchronizationOutput(thread->GetWaitObjectIndex(object.get()));
+
+        // TODO(Subv): Perform IPC translation upon wakeup.
+    };
+
     Core::System::GetInstance().PrepareReschedule();
 
     // Note: The output of this SVC will be set to RESULT_SUCCESS if the thread resumes due to a
     // signal in one of its wait objects, or to 0xC8A01836 if there was a translation error.
     // By default the index is set to -1.
-    thread->wait_set_output = true;
     *index = -1;
     return RESULT_SUCCESS;
 }
@@ -656,8 +717,9 @@ static ResultCode CreateThread(Kernel::Handle* out_handle, u32 priority, u32 ent
                   "Newly created thread must run in the SysCore (Core1), unimplemented.");
     }
 
-    CASCADE_RESULT(SharedPtr<Thread> thread, Kernel::Thread::Create(name, entry_point, priority,
-                                                                    arg, processor_id, stack_top));
+    CASCADE_RESULT(SharedPtr<Thread> thread,
+                   Kernel::Thread::Create(name, entry_point, priority, arg, processor_id, stack_top,
+                                          Kernel::g_current_process));
 
     thread->context.fpscr =
         FPSCR_DEFAULT_NAN | FPSCR_FLUSH_TO_ZERO | FPSCR_ROUND_TOZERO; // 0x03C00000
@@ -682,7 +744,7 @@ static void ExitThread() {
 }
 
 /// Gets the priority for the specified thread
-static ResultCode GetThreadPriority(s32* priority, Kernel::Handle handle) {
+static ResultCode GetThreadPriority(u32* priority, Kernel::Handle handle) {
     const SharedPtr<Kernel::Thread> thread = Kernel::g_handle_table.Get<Kernel::Thread>(handle);
     if (thread == nullptr)
         return ERR_INVALID_HANDLE;
@@ -692,7 +754,7 @@ static ResultCode GetThreadPriority(s32* priority, Kernel::Handle handle) {
 }
 
 /// Sets the priority for the specified thread
-static ResultCode SetThreadPriority(Kernel::Handle handle, s32 priority) {
+static ResultCode SetThreadPriority(Kernel::Handle handle, u32 priority) {
     if (priority > THREADPRIO_LOWEST) {
         return Kernel::ERR_OUT_OF_RANGE;
     }
@@ -977,7 +1039,7 @@ static void SleepThread(s64 nanoseconds) {
 static s64 GetSystemTick() {
     s64 result = CoreTiming::GetTicks();
     // Advance time to defeat dumb games (like Cubic Ninja) that busy-wait for the frame to end.
-    Core::CPU().AddTicks(150); // Measured time between two calls on a 9.2 o3DS with Ninjhax 1.1b
+    CoreTiming::AddTicks(150); // Measured time between two calls on a 9.2 o3DS with Ninjhax 1.1b
     return result;
 }
 
diff --git a/src/core/loader/3dsx.cpp b/src/core/loader/3dsx.cpp
index 69cdc0867..918038f1e 100644
--- a/src/core/loader/3dsx.cpp
+++ b/src/core/loader/3dsx.cpp
@@ -91,8 +91,8 @@ static u32 TranslateAddr(u32 addr, const THREEloadinfo* loadinfo, u32* offsets)
     return loadinfo->seg_addrs[2] + addr - offsets[1];
 }
 
-using Kernel::SharedPtr;
 using Kernel::CodeSet;
+using Kernel::SharedPtr;
 
 static THREEDSX_Error Load3DSXFile(FileUtil::IOFile& file, u32 base_addr,
                                    SharedPtr<CodeSet>* out_codeset) {
@@ -255,7 +255,7 @@ FileType AppLoader_THREEDSX::IdentifyType(FileUtil::IOFile& file) {
     return FileType::Error;
 }
 
-ResultStatus AppLoader_THREEDSX::Load() {
+ResultStatus AppLoader_THREEDSX::Load(Kernel::SharedPtr<Kernel::Process>& process) {
     if (is_loaded)
         return ResultStatus::ErrorAlreadyLoaded;
 
@@ -267,19 +267,17 @@ ResultStatus AppLoader_THREEDSX::Load() {
         return ResultStatus::Error;
     codeset->name = filename;
 
-    Kernel::g_current_process = Kernel::Process::Create(std::move(codeset));
-    Kernel::g_current_process->svc_access_mask.set();
-    Kernel::g_current_process->address_mappings = default_address_mappings;
-    Memory::current_page_table = &Kernel::g_current_process->vm_manager.page_table;
+    process = Kernel::Process::Create(std::move(codeset));
+    process->svc_access_mask.set();
+    process->address_mappings = default_address_mappings;
 
     // Attach the default resource limit (APPLICATION) to the process
-    Kernel::g_current_process->resource_limit =
+    process->resource_limit =
         Kernel::ResourceLimit::GetForCategory(Kernel::ResourceLimitCategory::APPLICATION);
 
-    Kernel::g_current_process->Run(48, Kernel::DEFAULT_STACK_SIZE);
+    process->Run(48, Kernel::DEFAULT_STACK_SIZE);
 
-    Service::FS::RegisterArchiveType(std::make_unique<FileSys::ArchiveFactory_SelfNCCH>(*this),
-                                     Service::FS::ArchiveIdCode::SelfNCCH);
+    Service::FS::RegisterSelfNCCH(*this);
 
     is_loaded = true;
     return ResultStatus::Success;
diff --git a/src/core/loader/3dsx.h b/src/core/loader/3dsx.h
index 3f376778a..1e59bbb9d 100644
--- a/src/core/loader/3dsx.h
+++ b/src/core/loader/3dsx.h
@@ -31,7 +31,7 @@ public:
         return IdentifyType(file);
     }
 
-    ResultStatus Load() override;
+    ResultStatus Load(Kernel::SharedPtr<Kernel::Process>& process) override;
 
     ResultStatus ReadIcon(std::vector<u8>& buffer) override;
 
diff --git a/src/core/loader/elf.cpp b/src/core/loader/elf.cpp
index 2f27606a1..e36e42120 100644
--- a/src/core/loader/elf.cpp
+++ b/src/core/loader/elf.cpp
@@ -13,8 +13,8 @@
 #include "core/loader/elf.h"
 #include "core/memory.h"
 
-using Kernel::SharedPtr;
 using Kernel::CodeSet;
+using Kernel::SharedPtr;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // ELF Header Constants
@@ -375,7 +375,7 @@ FileType AppLoader_ELF::IdentifyType(FileUtil::IOFile& file) {
     return FileType::Error;
 }
 
-ResultStatus AppLoader_ELF::Load() {
+ResultStatus AppLoader_ELF::Load(Kernel::SharedPtr<Kernel::Process>& process) {
     if (is_loaded)
         return ResultStatus::ErrorAlreadyLoaded;
 
@@ -394,16 +394,15 @@ ResultStatus AppLoader_ELF::Load() {
     SharedPtr<CodeSet> codeset = elf_reader.LoadInto(Memory::PROCESS_IMAGE_VADDR);
     codeset->name = filename;
 
-    Kernel::g_current_process = Kernel::Process::Create(std::move(codeset));
-    Kernel::g_current_process->svc_access_mask.set();
-    Kernel::g_current_process->address_mappings = default_address_mappings;
-    Memory::current_page_table = &Kernel::g_current_process->vm_manager.page_table;
+    process = Kernel::Process::Create(std::move(codeset));
+    process->svc_access_mask.set();
+    process->address_mappings = default_address_mappings;
 
     // Attach the default resource limit (APPLICATION) to the process
-    Kernel::g_current_process->resource_limit =
+    process->resource_limit =
         Kernel::ResourceLimit::GetForCategory(Kernel::ResourceLimitCategory::APPLICATION);
 
-    Kernel::g_current_process->Run(48, Kernel::DEFAULT_STACK_SIZE);
+    process->Run(48, Kernel::DEFAULT_STACK_SIZE);
 
     is_loaded = true;
     return ResultStatus::Success;
diff --git a/src/core/loader/elf.h b/src/core/loader/elf.h
index 862aa90d8..113da5917 100644
--- a/src/core/loader/elf.h
+++ b/src/core/loader/elf.h
@@ -30,7 +30,7 @@ public:
         return IdentifyType(file);
     }
 
-    ResultStatus Load() override;
+    ResultStatus Load(Kernel::SharedPtr<Kernel::Process>& process) override;
 
 private:
     std::string filename;
diff --git a/src/core/loader/loader.h b/src/core/loader/loader.h
index e731888a2..82b2be6a3 100644
--- a/src/core/loader/loader.h
+++ b/src/core/loader/loader.h
@@ -13,10 +13,12 @@
 #include <boost/optional.hpp>
 #include "common/common_types.h"
 #include "common/file_util.h"
+#include "core/hle/kernel/kernel.h"
 
 namespace Kernel {
 struct AddressMapping;
-}
+class Process;
+} // namespace Kernel
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Loader namespace
@@ -92,10 +94,11 @@ public:
     virtual FileType GetFileType() = 0;
 
     /**
-     * Load the application
-     * @return ResultStatus result of function
+     * Load the application and return the created Process instance
+     * @param process The newly created process.
+     * @return The status result of the operation.
      */
-    virtual ResultStatus Load() = 0;
+    virtual ResultStatus Load(Kernel::SharedPtr<Kernel::Process>& process) = 0;
 
     /**
      * Loads the system mode that this application needs.
@@ -167,6 +170,19 @@ public:
     }
 
     /**
+     * Get the update RomFS of the application
+     * Since the RomFS can be huge, we return a file reference instead of copying to a buffer
+     * @param romfs_file The file containing the RomFS
+     * @param offset The offset at which the RomFS begins
+     * @param size The size of the RomFS
+     * @return ResultStatus result of function (ErrorNotImplemented in this base implementation)
+     */
+    virtual ResultStatus ReadUpdateRomFS(std::shared_ptr<FileUtil::IOFile>& romfs_file, u64& offset,
+                                         u64& size) {
+        return ResultStatus::ErrorNotImplemented;
+    }
+
+    /**
      * Get the title of the application
      * @param title Reference to store the application title into
      * @return ResultStatus result of function
@@ -193,4 +209,4 @@ extern const std::initializer_list<Kernel::AddressMapping> default_address_mappi
  */
 std::unique_ptr<AppLoader> GetLoader(const std::string& filename);
 
-} // namespace
+} // namespace Loader
diff --git a/src/core/loader/ncch.cpp b/src/core/loader/ncch.cpp
index 79ea50147..66bc5823d 100644
--- a/src/core/loader/ncch.cpp
+++ b/src/core/loader/ncch.cpp
@@ -13,6 +13,7 @@
 #include "common/swap.h"
 #include "core/core.h"
 #include "core/file_sys/archive_selfncch.h"
+#include "core/file_sys/ncch_container.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/resource_limit.h"
 #include "core/hle/service/cfg/cfg.h"
@@ -27,87 +28,7 @@
 
 namespace Loader {
 
-static const int kMaxSections = 8;   ///< Maximum number of sections (files) in an ExeFs
-static const int kBlockSize = 0x200; ///< Size of ExeFS blocks (in bytes)
-
-/**
- * Get the decompressed size of an LZSS compressed ExeFS file
- * @param buffer Buffer of compressed file
- * @param size Size of compressed buffer
- * @return Size of decompressed buffer
- */
-static u32 LZSS_GetDecompressedSize(const u8* buffer, u32 size) {
-    u32 offset_size = *(u32*)(buffer + size - 4);
-    return offset_size + size;
-}
-
-/**
- * Decompress ExeFS file (compressed with LZSS)
- * @param compressed Compressed buffer
- * @param compressed_size Size of compressed buffer
- * @param decompressed Decompressed buffer
- * @param decompressed_size Size of decompressed buffer
- * @return True on success, otherwise false
- */
-static bool LZSS_Decompress(const u8* compressed, u32 compressed_size, u8* decompressed,
-                            u32 decompressed_size) {
-    const u8* footer = compressed + compressed_size - 8;
-    u32 buffer_top_and_bottom = *reinterpret_cast<const u32*>(footer);
-    u32 out = decompressed_size;
-    u32 index = compressed_size - ((buffer_top_and_bottom >> 24) & 0xFF);
-    u32 stop_index = compressed_size - (buffer_top_and_bottom & 0xFFFFFF);
-
-    memset(decompressed, 0, decompressed_size);
-    memcpy(decompressed, compressed, compressed_size);
-
-    while (index > stop_index) {
-        u8 control = compressed[--index];
-
-        for (unsigned i = 0; i < 8; i++) {
-            if (index <= stop_index)
-                break;
-            if (index <= 0)
-                break;
-            if (out <= 0)
-                break;
-
-            if (control & 0x80) {
-                // Check if compression is out of bounds
-                if (index < 2)
-                    return false;
-                index -= 2;
-
-                u32 segment_offset = compressed[index] | (compressed[index + 1] << 8);
-                u32 segment_size = ((segment_offset >> 12) & 15) + 3;
-                segment_offset &= 0x0FFF;
-                segment_offset += 2;
-
-                // Check if compression is out of bounds
-                if (out < segment_size)
-                    return false;
-
-                for (unsigned j = 0; j < segment_size; j++) {
-                    // Check if compression is out of bounds
-                    if (out + segment_offset >= decompressed_size)
-                        return false;
-
-                    u8 data = decompressed[out + segment_offset];
-                    decompressed[--out] = data;
-                }
-            } else {
-                // Check if compression is out of bounds
-                if (out < 1)
-                    return false;
-                decompressed[--out] = compressed[--index];
-            }
-            control <<= 1;
-        }
-    }
-    return true;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// AppLoader_NCCH class
+static const u64 UPDATE_MASK = 0x0000000e00000000;
 
 FileType AppLoader_NCCH::IdentifyType(FileUtil::IOFile& file) {
     u32 magic;
@@ -124,203 +45,94 @@ FileType AppLoader_NCCH::IdentifyType(FileUtil::IOFile& file) {
     return FileType::Error;
 }
 
+/// Builds the SDMC path to 00000000.app for `program_id` with UPDATE_MASK OR'd into it.
+static std::string GetUpdateNCCHPath(u64_le program_id) {
+    const u64 update_id = program_id | UPDATE_MASK;
+    return Common::StringFromFormat("%sNintendo 3DS/%s/%s/title/%08x/%08x/content/00000000.app",
+                                    FileUtil::GetUserPath(D_SDMC_IDX).c_str(), SYSTEM_ID, SDCARD_ID,
+                                    static_cast<u32>(update_id >> 32),
+                                    static_cast<u32>(update_id & 0xFFFFFFFF));
+}
+
 std::pair<boost::optional<u32>, ResultStatus> AppLoader_NCCH::LoadKernelSystemMode() {
     if (!is_loaded) {
-        ResultStatus res = LoadExeFS();
+        ResultStatus res = base_ncch.Load();
         if (res != ResultStatus::Success) {
             return std::make_pair(boost::none, res);
         }
     }
+
     // Set the system mode as the one from the exheader.
-    return std::make_pair(exheader_header.arm11_system_local_caps.system_mode.Value(),
+    return std::make_pair(overlay_ncch->exheader_header.arm11_system_local_caps.system_mode.Value(),
                           ResultStatus::Success);
 }
 
-ResultStatus AppLoader_NCCH::LoadExec() {
-    using Kernel::SharedPtr;
+ResultStatus AppLoader_NCCH::LoadExec(Kernel::SharedPtr<Kernel::Process>& process) {
     using Kernel::CodeSet;
+    using Kernel::SharedPtr;
 
     if (!is_loaded)
         return ResultStatus::ErrorNotLoaded;
 
     std::vector<u8> code;
-    if (ResultStatus::Success == ReadCode(code)) {
+    u64_le program_id;
+    if (ResultStatus::Success == ReadCode(code) &&
+        ResultStatus::Success == ReadProgramId(program_id)) {
         std::string process_name = Common::StringFromFixedZeroTerminatedBuffer(
-            (const char*)exheader_header.codeset_info.name, 8);
+            (const char*)overlay_ncch->exheader_header.codeset_info.name, 8);
 
-        SharedPtr<CodeSet> codeset = CodeSet::Create(process_name, ncch_header.program_id);
+        SharedPtr<CodeSet> codeset = CodeSet::Create(process_name, program_id);
 
         codeset->code.offset = 0;
-        codeset->code.addr = exheader_header.codeset_info.text.address;
-        codeset->code.size = exheader_header.codeset_info.text.num_max_pages * Memory::PAGE_SIZE;
+        codeset->code.addr = overlay_ncch->exheader_header.codeset_info.text.address;
+        codeset->code.size =
+            overlay_ncch->exheader_header.codeset_info.text.num_max_pages * Memory::PAGE_SIZE;
 
         codeset->rodata.offset = codeset->code.offset + codeset->code.size;
-        codeset->rodata.addr = exheader_header.codeset_info.ro.address;
-        codeset->rodata.size = exheader_header.codeset_info.ro.num_max_pages * Memory::PAGE_SIZE;
+        codeset->rodata.addr = overlay_ncch->exheader_header.codeset_info.ro.address;
+        codeset->rodata.size =
+            overlay_ncch->exheader_header.codeset_info.ro.num_max_pages * Memory::PAGE_SIZE;
 
         // TODO(yuriks): Not sure if the bss size is added to the page-aligned .data size or just
         //               to the regular size. Playing it safe for now.
-        u32 bss_page_size = (exheader_header.codeset_info.bss_size + 0xFFF) & ~0xFFF;
+        u32 bss_page_size = (overlay_ncch->exheader_header.codeset_info.bss_size + 0xFFF) & ~0xFFF;
         code.resize(code.size() + bss_page_size, 0);
 
         codeset->data.offset = codeset->rodata.offset + codeset->rodata.size;
-        codeset->data.addr = exheader_header.codeset_info.data.address;
+        codeset->data.addr = overlay_ncch->exheader_header.codeset_info.data.address;
         codeset->data.size =
-            exheader_header.codeset_info.data.num_max_pages * Memory::PAGE_SIZE + bss_page_size;
+            overlay_ncch->exheader_header.codeset_info.data.num_max_pages * Memory::PAGE_SIZE +
+            bss_page_size;
 
         codeset->entrypoint = codeset->code.addr;
         codeset->memory = std::make_shared<std::vector<u8>>(std::move(code));
 
-        Kernel::g_current_process = Kernel::Process::Create(std::move(codeset));
-        Memory::current_page_table = &Kernel::g_current_process->vm_manager.page_table;
+        process = Kernel::Process::Create(std::move(codeset));
 
         // Attach a resource limit to the process based on the resource limit category
-        Kernel::g_current_process->resource_limit =
+        process->resource_limit =
             Kernel::ResourceLimit::GetForCategory(static_cast<Kernel::ResourceLimitCategory>(
-                exheader_header.arm11_system_local_caps.resource_limit_category));
+                overlay_ncch->exheader_header.arm11_system_local_caps.resource_limit_category));
 
         // Set the default CPU core for this process
-        Kernel::g_current_process->ideal_processor =
-            exheader_header.arm11_system_local_caps.ideal_processor;
+        process->ideal_processor =
+            overlay_ncch->exheader_header.arm11_system_local_caps.ideal_processor;
 
         // Copy data while converting endianness
-        std::array<u32, ARRAY_SIZE(exheader_header.arm11_kernel_caps.descriptors)> kernel_caps;
-        std::copy_n(exheader_header.arm11_kernel_caps.descriptors, kernel_caps.size(),
+        std::array<u32, ARRAY_SIZE(overlay_ncch->exheader_header.arm11_kernel_caps.descriptors)>
+            kernel_caps;
+        std::copy_n(overlay_ncch->exheader_header.arm11_kernel_caps.descriptors, kernel_caps.size(),
                     begin(kernel_caps));
-        Kernel::g_current_process->ParseKernelCaps(kernel_caps.data(), kernel_caps.size());
+        process->ParseKernelCaps(kernel_caps.data(), kernel_caps.size());
 
-        s32 priority = exheader_header.arm11_system_local_caps.priority;
-        u32 stack_size = exheader_header.codeset_info.stack_size;
-        Kernel::g_current_process->Run(priority, stack_size);
+        s32 priority = overlay_ncch->exheader_header.arm11_system_local_caps.priority;
+        u32 stack_size = overlay_ncch->exheader_header.codeset_info.stack_size;
+        process->Run(priority, stack_size);
         return ResultStatus::Success;
     }
     return ResultStatus::Error;
 }
 
-ResultStatus AppLoader_NCCH::LoadSectionExeFS(const char* name, std::vector<u8>& buffer) {
-    if (!file.IsOpen())
-        return ResultStatus::Error;
-
-    ResultStatus result = LoadExeFS();
-    if (result != ResultStatus::Success)
-        return result;
-
-    LOG_DEBUG(Loader, "%d sections:", kMaxSections);
-    // Iterate through the ExeFs archive until we find a section with the specified name...
-    for (unsigned section_number = 0; section_number < kMaxSections; section_number++) {
-        const auto& section = exefs_header.section[section_number];
-
-        // Load the specified section...
-        if (strcmp(section.name, name) == 0) {
-            LOG_DEBUG(Loader, "%d - offset: 0x%08X, size: 0x%08X, name: %s", section_number,
-                      section.offset, section.size, section.name);
-
-            s64 section_offset =
-                (section.offset + exefs_offset + sizeof(ExeFs_Header) + ncch_offset);
-            file.Seek(section_offset, SEEK_SET);
-
-            if (strcmp(section.name, ".code") == 0 && is_compressed) {
-                // Section is compressed, read compressed .code section...
-                std::unique_ptr<u8[]> temp_buffer;
-                try {
-                    temp_buffer.reset(new u8[section.size]);
-                } catch (std::bad_alloc&) {
-                    return ResultStatus::ErrorMemoryAllocationFailed;
-                }
-
-                if (file.ReadBytes(&temp_buffer[0], section.size) != section.size)
-                    return ResultStatus::Error;
-
-                // Decompress .code section...
-                u32 decompressed_size = LZSS_GetDecompressedSize(&temp_buffer[0], section.size);
-                buffer.resize(decompressed_size);
-                if (!LZSS_Decompress(&temp_buffer[0], section.size, &buffer[0], decompressed_size))
-                    return ResultStatus::ErrorInvalidFormat;
-            } else {
-                // Section is uncompressed...
-                buffer.resize(section.size);
-                if (file.ReadBytes(&buffer[0], section.size) != section.size)
-                    return ResultStatus::Error;
-            }
-            return ResultStatus::Success;
-        }
-    }
-    return ResultStatus::ErrorNotUsed;
-}
-
-ResultStatus AppLoader_NCCH::LoadExeFS() {
-    if (is_exefs_loaded)
-        return ResultStatus::Success;
-
-    if (!file.IsOpen())
-        return ResultStatus::Error;
-
-    // Reset read pointer in case this file has been read before.
-    file.Seek(0, SEEK_SET);
-
-    if (file.ReadBytes(&ncch_header, sizeof(NCCH_Header)) != sizeof(NCCH_Header))
-        return ResultStatus::Error;
-
-    // Skip NCSD header and load first NCCH (NCSD is just a container of NCCH files)...
-    if (MakeMagic('N', 'C', 'S', 'D') == ncch_header.magic) {
-        LOG_DEBUG(Loader, "Only loading the first (bootable) NCCH within the NCSD file!");
-        ncch_offset = 0x4000;
-        file.Seek(ncch_offset, SEEK_SET);
-        file.ReadBytes(&ncch_header, sizeof(NCCH_Header));
-    }
-
-    // Verify we are loading the correct file type...
-    if (MakeMagic('N', 'C', 'C', 'H') != ncch_header.magic)
-        return ResultStatus::ErrorInvalidFormat;
-
-    // Read ExHeader...
-
-    if (file.ReadBytes(&exheader_header, sizeof(ExHeader_Header)) != sizeof(ExHeader_Header))
-        return ResultStatus::Error;
-
-    is_compressed = (exheader_header.codeset_info.flags.flag & 1) == 1;
-    entry_point = exheader_header.codeset_info.text.address;
-    code_size = exheader_header.codeset_info.text.code_size;
-    stack_size = exheader_header.codeset_info.stack_size;
-    bss_size = exheader_header.codeset_info.bss_size;
-    core_version = exheader_header.arm11_system_local_caps.core_version;
-    priority = exheader_header.arm11_system_local_caps.priority;
-    resource_limit_category = exheader_header.arm11_system_local_caps.resource_limit_category;
-
-    LOG_DEBUG(Loader, "Name:                        %s", exheader_header.codeset_info.name);
-    LOG_DEBUG(Loader, "Program ID:                  %016" PRIX64, ncch_header.program_id);
-    LOG_DEBUG(Loader, "Code compressed:             %s", is_compressed ? "yes" : "no");
-    LOG_DEBUG(Loader, "Entry point:                 0x%08X", entry_point);
-    LOG_DEBUG(Loader, "Code size:                   0x%08X", code_size);
-    LOG_DEBUG(Loader, "Stack size:                  0x%08X", stack_size);
-    LOG_DEBUG(Loader, "Bss size:                    0x%08X", bss_size);
-    LOG_DEBUG(Loader, "Core version:                %d", core_version);
-    LOG_DEBUG(Loader, "Thread priority:             0x%X", priority);
-    LOG_DEBUG(Loader, "Resource limit category:     %d", resource_limit_category);
-    LOG_DEBUG(Loader, "System Mode:                 %d",
-              static_cast<int>(exheader_header.arm11_system_local_caps.system_mode));
-
-    if (exheader_header.arm11_system_local_caps.program_id != ncch_header.program_id) {
-        LOG_ERROR(Loader, "ExHeader Program ID mismatch: the ROM is probably encrypted.");
-        return ResultStatus::ErrorEncrypted;
-    }
-
-    // Read ExeFS...
-
-    exefs_offset = ncch_header.exefs_offset * kBlockSize;
-    u32 exefs_size = ncch_header.exefs_size * kBlockSize;
-
-    LOG_DEBUG(Loader, "ExeFS offset:                0x%08X", exefs_offset);
-    LOG_DEBUG(Loader, "ExeFS size:                  0x%08X", exefs_size);
-
-    file.Seek(exefs_offset + ncch_offset, SEEK_SET);
-    if (file.ReadBytes(&exefs_header, sizeof(ExeFs_Header)) != sizeof(ExeFs_Header))
-        return ResultStatus::Error;
-
-    is_exefs_loaded = true;
-    return ResultStatus::Success;
-}
-
 void AppLoader_NCCH::ParseRegionLockoutInfo() {
     std::vector<u8> smdh_buffer;
     if (ReadIcon(smdh_buffer) == ResultStatus::Success && smdh_buffer.size() >= sizeof(SMDH)) {
@@ -338,35 +150,43 @@ void AppLoader_NCCH::ParseRegionLockoutInfo() {
     }
 }
 
-ResultStatus AppLoader_NCCH::Load() {
+ResultStatus AppLoader_NCCH::Load(Kernel::SharedPtr<Kernel::Process>& process) {
+    u64_le ncch_program_id;
+
     if (is_loaded)
         return ResultStatus::ErrorAlreadyLoaded;
 
-    ResultStatus result = LoadExeFS();
+    ResultStatus result = base_ncch.Load();
     if (result != ResultStatus::Success)
         return result;
 
-    std::string program_id{Common::StringFromFormat("%016" PRIX64, ncch_header.program_id)};
+    ReadProgramId(ncch_program_id);
+    std::string program_id{Common::StringFromFormat("%016" PRIX64, ncch_program_id)};
 
     LOG_INFO(Loader, "Program ID: %s", program_id.c_str());
 
+    update_ncch.OpenFile(GetUpdateNCCHPath(ncch_program_id));
+    result = update_ncch.Load();
+    if (result == ResultStatus::Success) {
+        overlay_ncch = &update_ncch;
+    }
+
     Core::Telemetry().AddField(Telemetry::FieldType::Session, "ProgramId", program_id);
 
     if (auto room_member = Network::GetRoomMember().lock()) {
         Network::GameInfo game_info;
         ReadTitle(game_info.name);
-        game_info.id = ncch_header.program_id;
+        game_info.id = ncch_program_id;
         room_member->SendGameInfo(game_info);
     }
 
     is_loaded = true; // Set state to loaded
 
-    result = LoadExec(); // Load the executable into memory for booting
+    result = LoadExec(process); // Load the executable into memory for booting
     if (ResultStatus::Success != result)
         return result;
 
-    Service::FS::RegisterArchiveType(std::make_unique<FileSys::ArchiveFactory_SelfNCCH>(*this),
-                                     Service::FS::ArchiveIdCode::SelfNCCH);
+    Service::FS::RegisterSelfNCCH(*this);
 
     ParseRegionLockoutInfo();
 
@@ -374,61 +194,40 @@ ResultStatus AppLoader_NCCH::Load() {
 }
 
 ResultStatus AppLoader_NCCH::ReadCode(std::vector<u8>& buffer) {
-    return LoadSectionExeFS(".code", buffer);
+    return overlay_ncch->LoadSectionExeFS(".code", buffer);
 }
 
 ResultStatus AppLoader_NCCH::ReadIcon(std::vector<u8>& buffer) {
-    return LoadSectionExeFS("icon", buffer);
+    return overlay_ncch->LoadSectionExeFS("icon", buffer);
 }
 
 ResultStatus AppLoader_NCCH::ReadBanner(std::vector<u8>& buffer) {
-    return LoadSectionExeFS("banner", buffer);
+    return overlay_ncch->LoadSectionExeFS("banner", buffer);
 }
 
 ResultStatus AppLoader_NCCH::ReadLogo(std::vector<u8>& buffer) {
-    return LoadSectionExeFS("logo", buffer);
+    return overlay_ncch->LoadSectionExeFS("logo", buffer);
 }
 
 ResultStatus AppLoader_NCCH::ReadProgramId(u64& out_program_id) {
-    if (!file.IsOpen())
-        return ResultStatus::Error;
-
-    ResultStatus result = LoadExeFS();
+    ResultStatus result = base_ncch.ReadProgramId(out_program_id);
     if (result != ResultStatus::Success)
         return result;
 
-    out_program_id = ncch_header.program_id;
     return ResultStatus::Success;
 }
 
 ResultStatus AppLoader_NCCH::ReadRomFS(std::shared_ptr<FileUtil::IOFile>& romfs_file, u64& offset,
                                        u64& size) {
-    if (!file.IsOpen())
-        return ResultStatus::Error;
-
-    // Check if the NCCH has a RomFS...
-    if (ncch_header.romfs_offset != 0 && ncch_header.romfs_size != 0) {
-        u32 romfs_offset = ncch_offset + (ncch_header.romfs_offset * kBlockSize) + 0x1000;
-        u32 romfs_size = (ncch_header.romfs_size * kBlockSize) - 0x1000;
-
-        LOG_DEBUG(Loader, "RomFS offset:           0x%08X", romfs_offset);
-        LOG_DEBUG(Loader, "RomFS size:             0x%08X", romfs_size);
-
-        if (file.GetSize() < romfs_offset + romfs_size)
-            return ResultStatus::Error;
-
-        // We reopen the file, to allow its position to be independent from file's
-        romfs_file = std::make_shared<FileUtil::IOFile>(filepath, "rb");
-        if (!romfs_file->IsOpen())
-            return ResultStatus::Error;
+    return base_ncch.ReadRomFS(romfs_file, offset, size);
+}
 
-        offset = romfs_offset;
-        size = romfs_size;
+ResultStatus AppLoader_NCCH::ReadUpdateRomFS(std::shared_ptr<FileUtil::IOFile>& romfs_file,
+                                             u64& offset, u64& size) {
+    ResultStatus result = update_ncch.ReadRomFS(romfs_file, offset, size);
 
-        return ResultStatus::Success;
-    }
-    LOG_DEBUG(Loader, "NCCH has no RomFS");
-    return ResultStatus::ErrorNotUsed;
+    // Fall back to the base NCCH's RomFS when the update does not provide one.
+    return result == ResultStatus::Success ? result : base_ncch.ReadRomFS(romfs_file, offset, size);
 }
 
 ResultStatus AppLoader_NCCH::ReadTitle(std::string& title) {
diff --git a/src/core/loader/ncch.h b/src/core/loader/ncch.h
index e40cef764..09230ae33 100644
--- a/src/core/loader/ncch.h
+++ b/src/core/loader/ncch.h
@@ -5,155 +5,12 @@
 #pragma once
 
 #include <memory>
-#include "common/bit_field.h"
 #include "common/common_types.h"
 #include "common/swap.h"
+#include "core/file_sys/ncch_container.h"
 #include "core/loader/loader.h"
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-/// NCCH header (Note: "NCCH" appears to be a publicly unknown acronym)
-
-struct NCCH_Header {
-    u8 signature[0x100];
-    u32_le magic;
-    u32_le content_size;
-    u8 partition_id[8];
-    u16_le maker_code;
-    u16_le version;
-    u8 reserved_0[4];
-    u64_le program_id;
-    u8 reserved_1[0x10];
-    u8 logo_region_hash[0x20];
-    u8 product_code[0x10];
-    u8 extended_header_hash[0x20];
-    u32_le extended_header_size;
-    u8 reserved_2[4];
-    u8 flags[8];
-    u32_le plain_region_offset;
-    u32_le plain_region_size;
-    u32_le logo_region_offset;
-    u32_le logo_region_size;
-    u32_le exefs_offset;
-    u32_le exefs_size;
-    u32_le exefs_hash_region_size;
-    u8 reserved_3[4];
-    u32_le romfs_offset;
-    u32_le romfs_size;
-    u32_le romfs_hash_region_size;
-    u8 reserved_4[4];
-    u8 exefs_super_block_hash[0x20];
-    u8 romfs_super_block_hash[0x20];
-};
-
-static_assert(sizeof(NCCH_Header) == 0x200, "NCCH header structure size is wrong");
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// ExeFS (executable file system) headers
-
-struct ExeFs_SectionHeader {
-    char name[8];
-    u32 offset;
-    u32 size;
-};
-
-struct ExeFs_Header {
-    ExeFs_SectionHeader section[8];
-    u8 reserved[0x80];
-    u8 hashes[8][0x20];
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// ExHeader (executable file system header) headers
-
-struct ExHeader_SystemInfoFlags {
-    u8 reserved[5];
-    u8 flag;
-    u8 remaster_version[2];
-};
-
-struct ExHeader_CodeSegmentInfo {
-    u32 address;
-    u32 num_max_pages;
-    u32 code_size;
-};
-
-struct ExHeader_CodeSetInfo {
-    u8 name[8];
-    ExHeader_SystemInfoFlags flags;
-    ExHeader_CodeSegmentInfo text;
-    u32 stack_size;
-    ExHeader_CodeSegmentInfo ro;
-    u8 reserved[4];
-    ExHeader_CodeSegmentInfo data;
-    u32 bss_size;
-};
-
-struct ExHeader_DependencyList {
-    u8 program_id[0x30][8];
-};
-
-struct ExHeader_SystemInfo {
-    u64 save_data_size;
-    u8 jump_id[8];
-    u8 reserved_2[0x30];
-};
-
-struct ExHeader_StorageInfo {
-    u8 ext_save_data_id[8];
-    u8 system_save_data_id[8];
-    u8 reserved[8];
-    u8 access_info[7];
-    u8 other_attributes;
-};
-
-struct ExHeader_ARM11_SystemLocalCaps {
-    u64_le program_id;
-    u32_le core_version;
-    u8 reserved_flags[2];
-    union {
-        u8 flags0;
-        BitField<0, 2, u8> ideal_processor;
-        BitField<2, 2, u8> affinity_mask;
-        BitField<4, 4, u8> system_mode;
-    };
-    u8 priority;
-    u8 resource_limit_descriptor[0x10][2];
-    ExHeader_StorageInfo storage_info;
-    u8 service_access_control[0x20][8];
-    u8 ex_service_access_control[0x2][8];
-    u8 reserved[0xf];
-    u8 resource_limit_category;
-};
-
-struct ExHeader_ARM11_KernelCaps {
-    u32_le descriptors[28];
-    u8 reserved[0x10];
-};
-
-struct ExHeader_ARM9_AccessControl {
-    u8 descriptors[15];
-    u8 descversion;
-};
-
-struct ExHeader_Header {
-    ExHeader_CodeSetInfo codeset_info;
-    ExHeader_DependencyList dependency_list;
-    ExHeader_SystemInfo system_info;
-    ExHeader_ARM11_SystemLocalCaps arm11_system_local_caps;
-    ExHeader_ARM11_KernelCaps arm11_kernel_caps;
-    ExHeader_ARM9_AccessControl arm9_access_control;
-    struct {
-        u8 signature[0x100];
-        u8 ncch_public_key_modulus[0x100];
-        ExHeader_ARM11_SystemLocalCaps arm11_system_local_caps;
-        ExHeader_ARM11_KernelCaps arm11_kernel_caps;
-        ExHeader_ARM9_AccessControl arm9_access_control;
-    } access_desc;
-};
-
-static_assert(sizeof(ExHeader_Header) == 0x800, "ExHeader structure size is wrong");
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
 // Loader namespace
 
 namespace Loader {
@@ -162,7 +19,8 @@ namespace Loader {
 class AppLoader_NCCH final : public AppLoader {
 public:
     AppLoader_NCCH(FileUtil::IOFile&& file, const std::string& filepath)
-        : AppLoader(std::move(file)), filepath(filepath) {}
+        : AppLoader(std::move(file)), base_ncch(filepath), overlay_ncch(&base_ncch),
+          filepath(filepath) {} // initializers listed in member declaration order
 
     /**
      * Returns the type of the file
@@ -175,7 +33,7 @@ public:
         return IdentifyType(file);
     }
 
-    ResultStatus Load() override;
+    ResultStatus Load(Kernel::SharedPtr<Kernel::Process>& process) override;
 
     /**
      * Loads the Exheader and returns the system mode for this application.
@@ -196,48 +54,25 @@ public:
     ResultStatus ReadRomFS(std::shared_ptr<FileUtil::IOFile>& romfs_file, u64& offset,
                            u64& size) override;
 
+    ResultStatus ReadUpdateRomFS(std::shared_ptr<FileUtil::IOFile>& romfs_file, u64& offset,
+                                 u64& size) override;
+
     ResultStatus ReadTitle(std::string& title) override;
 
 private:
     /**
-     * Reads an application ExeFS section of an NCCH file into AppLoader (e.g. .code, .logo, etc.)
-     * @param name Name of section to read out of NCCH file
-     * @param buffer Vector to read data into
-     * @return ResultStatus result of function
-     */
-    ResultStatus LoadSectionExeFS(const char* name, std::vector<u8>& buffer);
-
-    /**
      * Loads .code section into memory for booting
+     * @param process The newly created process
      * @return ResultStatus result of function
      */
-    ResultStatus LoadExec();
-
-    /**
-     * Ensure ExeFS is loaded and ready for reading sections
-     * @return ResultStatus result of function
-     */
-    ResultStatus LoadExeFS();
+    ResultStatus LoadExec(Kernel::SharedPtr<Kernel::Process>& process);
 
     /// Reads the region lockout info in the SMDH and send it to CFG service
     void ParseRegionLockoutInfo();
 
-    bool is_exefs_loaded = false;
-    bool is_compressed = false;
-
-    u32 entry_point = 0;
-    u32 code_size = 0;
-    u32 stack_size = 0;
-    u32 bss_size = 0;
-    u32 core_version = 0;
-    u8 priority = 0;
-    u8 resource_limit_category = 0;
-    u32 ncch_offset = 0; // Offset to NCCH header, can be 0 or after NCSD header
-    u32 exefs_offset = 0;
-
-    NCCH_Header ncch_header;
-    ExeFs_Header exefs_header;
-    ExHeader_Header exheader_header;
+    FileSys::NCCHContainer base_ncch;
+    FileSys::NCCHContainer update_ncch;
+    FileSys::NCCHContainer* overlay_ncch;
 
     std::string filepath;
 };
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 68a6b1ac2..7f58be6de 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -9,6 +9,8 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "common/swap.h"
+#include "core/arm/arm_interface.h"
+#include "core/core.h"
 #include "core/hle/kernel/memory.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/lock.h"
@@ -22,10 +24,17 @@ namespace Memory {
 static std::array<u8, Memory::VRAM_SIZE> vram;
 static std::array<u8, Memory::N3DS_EXTRA_RAM_SIZE> n3ds_extra_ram;
 
-PageTable* current_page_table = nullptr;
+static PageTable* current_page_table = nullptr;
 
-std::array<u8*, PAGE_TABLE_NUM_ENTRIES>* GetCurrentPageTablePointers() {
-    return &current_page_table->pointers;
+void SetCurrentPageTable(PageTable* page_table) {
+    current_page_table = page_table;
+    if (Core::System::GetInstance().IsPoweredOn()) {
+        Core::CPU().PageTableChanged();
+    }
+}
+
+PageTable* GetCurrentPageTable() {
+    return current_page_table;
 }
 
 static void MapPages(PageTable& page_table, u32 base, u32 size, u8* memory, PageType type) {
@@ -73,10 +82,10 @@ void UnmapRegion(PageTable& page_table, VAddr base, u32 size) {
  * Gets a pointer to the exact memory at the virtual address (i.e. not page aligned)
  * using a VMA from the current process
  */
-static u8* GetPointerFromVMA(VAddr vaddr) {
+static u8* GetPointerFromVMA(const Kernel::Process& process, VAddr vaddr) {
     u8* direct_pointer = nullptr;
 
-    auto& vm_manager = Kernel::g_current_process->vm_manager;
+    auto& vm_manager = process.vm_manager;
 
     auto it = vm_manager.FindVMA(vaddr);
     ASSERT(it != vm_manager.vma_map.end());
@@ -99,10 +108,18 @@ static u8* GetPointerFromVMA(VAddr vaddr) {
 }
 
 /**
+ * Gets a pointer to the exact memory at the virtual address (i.e. not page aligned)
+ * using a VMA from the current process.
+ */
+static u8* GetPointerFromVMA(VAddr vaddr) {
+    return GetPointerFromVMA(*Kernel::g_current_process, vaddr);
+}
+
+/**
  * This function should only be called for virtual addreses with attribute `PageType::Special`.
  */
-static MMIORegionPointer GetMMIOHandler(VAddr vaddr) {
-    for (const auto& region : current_page_table->special_regions) {
+static MMIORegionPointer GetMMIOHandler(const PageTable& page_table, VAddr vaddr) {
+    for (const auto& region : page_table.special_regions) {
         if (vaddr >= region.base && vaddr < (region.base + region.size)) {
             return region.handler;
         }
@@ -111,6 +128,11 @@ static MMIORegionPointer GetMMIOHandler(VAddr vaddr) {
     return nullptr; // Should never happen
 }
 
+/// Looks up the MMIO handler for `vaddr` in the page table of Kernel::g_current_process.
+static MMIORegionPointer GetMMIOHandler(VAddr vaddr) {
+    return GetMMIOHandler(Kernel::g_current_process->vm_manager.page_table, vaddr);
+}
+
 template <typename T>
 T ReadMMIO(MMIORegionPointer mmio_handler, VAddr addr);
 
@@ -195,18 +217,20 @@ void Write(const VAddr vaddr, const T data) {
     }
 }
 
-bool IsValidVirtualAddress(const VAddr vaddr) {
-    const u8* page_pointer = current_page_table->pointers[vaddr >> PAGE_BITS];
+bool IsValidVirtualAddress(const Kernel::Process& process, const VAddr vaddr) {
+    auto& page_table = process.vm_manager.page_table;
+
+    const u8* page_pointer = page_table.pointers[vaddr >> PAGE_BITS];
     if (page_pointer)
         return true;
 
-    if (current_page_table->attributes[vaddr >> PAGE_BITS] == PageType::RasterizerCachedMemory)
+    if (page_table.attributes[vaddr >> PAGE_BITS] == PageType::RasterizerCachedMemory)
         return true;
 
-    if (current_page_table->attributes[vaddr >> PAGE_BITS] != PageType::Special)
+    if (page_table.attributes[vaddr >> PAGE_BITS] != PageType::Special)
         return false;
 
-    MMIORegionPointer mmio_region = GetMMIOHandler(vaddr);
+    MMIORegionPointer mmio_region = GetMMIOHandler(page_table, vaddr);
     if (mmio_region) {
         return mmio_region->IsValidAddress(vaddr);
     }
@@ -214,6 +238,10 @@ bool IsValidVirtualAddress(const VAddr vaddr) {
     return false;
 }
 
+bool IsValidVirtualAddress(const VAddr vaddr) {
+    return IsValidVirtualAddress(*Kernel::g_current_process, vaddr);
+}
+
 bool IsValidPhysicalAddress(const PAddr paddr) {
     return GetPhysicalPointer(paddr) != nullptr;
 }
@@ -316,8 +344,15 @@ void RasterizerMarkRegionCached(PAddr start, u32 size, int count_delta) {
 
     for (unsigned i = 0; i < num_pages; ++i, paddr += PAGE_SIZE) {
         boost::optional<VAddr> maybe_vaddr = PhysicalToVirtualAddress(paddr);
-        if (!maybe_vaddr)
+        // While the physical <-> virtual mapping is 1:1 for the regions supported by the cache,
+        // some games (like Pokemon Super Mystery Dungeon) will try to use textures that go beyond
+        // the end address of VRAM, causing the Physical->Virtual translation to fail when flushing
+        // parts of the texture.
+        if (!maybe_vaddr) {
+            LOG_ERROR(HW_Memory,
+                      "Trying to flush a cached region to an invalid physical address %08X", paddr);
             continue;
+        }
         VAddr vaddr = *maybe_vaddr;
 
         u8& res_count = current_page_table->cached_res_count[vaddr >> PAGE_BITS];
@@ -329,6 +364,10 @@ void RasterizerMarkRegionCached(PAddr start, u32 size, int count_delta) {
         if (res_count == 0) {
             PageType& page_type = current_page_table->attributes[vaddr >> PAGE_BITS];
             switch (page_type) {
+            case PageType::Unmapped:
+                // It is not necessary for a process to have this region mapped into its address
+                // space, for example, a system module need not have a VRAM mapping.
+                break;
             case PageType::Memory:
                 page_type = PageType::RasterizerCachedMemory;
                 current_page_table->pointers[vaddr >> PAGE_BITS] = nullptr;
@@ -347,6 +386,10 @@ void RasterizerMarkRegionCached(PAddr start, u32 size, int count_delta) {
         if (res_count == 0) {
             PageType& page_type = current_page_table->attributes[vaddr >> PAGE_BITS];
             switch (page_type) {
+            case PageType::Unmapped:
+                // It is not necessary for a process to have this region mapped into its address
+                // space, for example, a system module need not have a VRAM mapping.
+                break;
             case PageType::RasterizerCachedMemory: {
                 u8* pointer = GetPointerFromVMA(vaddr & ~PAGE_MASK);
                 if (pointer == nullptr) {
@@ -435,16 +478,19 @@ u64 Read64(const VAddr addr) {
     return Read<u64_le>(addr);
 }
 
-void ReadBlock(const VAddr src_addr, void* dest_buffer, const size_t size) {
+void ReadBlock(const Kernel::Process& process, const VAddr src_addr, void* dest_buffer,
+               const size_t size) {
+    auto& page_table = process.vm_manager.page_table;
+
     size_t remaining_size = size;
     size_t page_index = src_addr >> PAGE_BITS;
     size_t page_offset = src_addr & PAGE_MASK;
 
     while (remaining_size > 0) {
         const size_t copy_amount = std::min(PAGE_SIZE - page_offset, remaining_size);
-        const VAddr current_vaddr = (page_index << PAGE_BITS) + page_offset;
+        const VAddr current_vaddr = static_cast<VAddr>((page_index << PAGE_BITS) + page_offset);
 
-        switch (current_page_table->attributes[page_index]) {
+        switch (page_table.attributes[page_index]) {
         case PageType::Unmapped: {
             LOG_ERROR(HW_Memory, "unmapped ReadBlock @ 0x%08X (start address = 0x%08X, size = %zu)",
                       current_vaddr, src_addr, size);
@@ -452,27 +498,30 @@ void ReadBlock(const VAddr src_addr, void* dest_buffer, const size_t size) {
             break;
         }
         case PageType::Memory: {
-            DEBUG_ASSERT(current_page_table->pointers[page_index]);
+            DEBUG_ASSERT(page_table.pointers[page_index]);
 
-            const u8* src_ptr = current_page_table->pointers[page_index] + page_offset;
+            const u8* src_ptr = page_table.pointers[page_index] + page_offset;
             std::memcpy(dest_buffer, src_ptr, copy_amount);
             break;
         }
         case PageType::Special: {
-            DEBUG_ASSERT(GetMMIOHandler(current_vaddr));
-
-            GetMMIOHandler(current_vaddr)->ReadBlock(current_vaddr, dest_buffer, copy_amount);
+            MMIORegionPointer handler = GetMMIOHandler(page_table, current_vaddr);
+            DEBUG_ASSERT(handler);
+            handler->ReadBlock(current_vaddr, dest_buffer, copy_amount);
             break;
         }
         case PageType::RasterizerCachedMemory: {
-            RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::Flush);
-            std::memcpy(dest_buffer, GetPointerFromVMA(current_vaddr), copy_amount);
+            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
+                                         FlushMode::Flush);
+            std::memcpy(dest_buffer, GetPointerFromVMA(process, current_vaddr), copy_amount);
             break;
         }
         case PageType::RasterizerCachedSpecial: {
-            DEBUG_ASSERT(GetMMIOHandler(current_vaddr));
-            RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::Flush);
-            GetMMIOHandler(current_vaddr)->ReadBlock(current_vaddr, dest_buffer, copy_amount);
+            MMIORegionPointer handler = GetMMIOHandler(page_table, current_vaddr);
+            DEBUG_ASSERT(handler);
+            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
+                                         FlushMode::Flush);
+            handler->ReadBlock(current_vaddr, dest_buffer, copy_amount);
             break;
         }
         default:
@@ -486,6 +535,10 @@ void ReadBlock(const VAddr src_addr, void* dest_buffer, const size_t size) {
     }
 }
 
+void ReadBlock(const VAddr src_addr, void* dest_buffer, const size_t size) {
+    ReadBlock(*Kernel::g_current_process, src_addr, dest_buffer, size);
+}
+
 void Write8(const VAddr addr, const u8 data) {
     Write<u8>(addr, data);
 }
@@ -502,16 +555,18 @@ void Write64(const VAddr addr, const u64 data) {
     Write<u64_le>(addr, data);
 }
 
-void WriteBlock(const VAddr dest_addr, const void* src_buffer, const size_t size) {
+void WriteBlock(const Kernel::Process& process, const VAddr dest_addr, const void* src_buffer,
+                const size_t size) {
+    auto& page_table = process.vm_manager.page_table;
     size_t remaining_size = size;
     size_t page_index = dest_addr >> PAGE_BITS;
     size_t page_offset = dest_addr & PAGE_MASK;
 
     while (remaining_size > 0) {
         const size_t copy_amount = std::min(PAGE_SIZE - page_offset, remaining_size);
-        const VAddr current_vaddr = (page_index << PAGE_BITS) + page_offset;
+        const VAddr current_vaddr = static_cast<VAddr>((page_index << PAGE_BITS) + page_offset);
 
-        switch (current_page_table->attributes[page_index]) {
+        switch (page_table.attributes[page_index]) {
         case PageType::Unmapped: {
             LOG_ERROR(HW_Memory,
                       "unmapped WriteBlock @ 0x%08X (start address = 0x%08X, size = %zu)",
@@ -519,27 +574,30 @@ void WriteBlock(const VAddr dest_addr, const void* src_buffer, const size_t size
             break;
         }
         case PageType::Memory: {
-            DEBUG_ASSERT(current_page_table->pointers[page_index]);
+            DEBUG_ASSERT(page_table.pointers[page_index]);
 
-            u8* dest_ptr = current_page_table->pointers[page_index] + page_offset;
+            u8* dest_ptr = page_table.pointers[page_index] + page_offset;
             std::memcpy(dest_ptr, src_buffer, copy_amount);
             break;
         }
         case PageType::Special: {
-            DEBUG_ASSERT(GetMMIOHandler(current_vaddr));
-
-            GetMMIOHandler(current_vaddr)->WriteBlock(current_vaddr, src_buffer, copy_amount);
+            MMIORegionPointer handler = GetMMIOHandler(page_table, current_vaddr);
+            DEBUG_ASSERT(handler);
+            handler->WriteBlock(current_vaddr, src_buffer, copy_amount);
             break;
         }
         case PageType::RasterizerCachedMemory: {
-            RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::FlushAndInvalidate);
-            std::memcpy(GetPointerFromVMA(current_vaddr), src_buffer, copy_amount);
+            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
+                                         FlushMode::FlushAndInvalidate);
+            std::memcpy(GetPointerFromVMA(process, current_vaddr), src_buffer, copy_amount);
             break;
         }
         case PageType::RasterizerCachedSpecial: {
-            DEBUG_ASSERT(GetMMIOHandler(current_vaddr));
-            RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::FlushAndInvalidate);
-            GetMMIOHandler(current_vaddr)->WriteBlock(current_vaddr, src_buffer, copy_amount);
+            MMIORegionPointer handler = GetMMIOHandler(page_table, current_vaddr);
+            DEBUG_ASSERT(handler);
+            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
+                                         FlushMode::FlushAndInvalidate);
+            handler->WriteBlock(current_vaddr, src_buffer, copy_amount);
             break;
         }
         default:
@@ -553,6 +611,10 @@ void WriteBlock(const VAddr dest_addr, const void* src_buffer, const size_t size
     }
 }
 
+void WriteBlock(const VAddr dest_addr, const void* src_buffer, const size_t size) {
+    WriteBlock(*Kernel::g_current_process, dest_addr, src_buffer, size);
+}
+
 void ZeroBlock(const VAddr dest_addr, const size_t size) {
     size_t remaining_size = size;
     size_t page_index = dest_addr >> PAGE_BITS;
@@ -562,7 +624,7 @@ void ZeroBlock(const VAddr dest_addr, const size_t size) {
 
     while (remaining_size > 0) {
         const size_t copy_amount = std::min(PAGE_SIZE - page_offset, remaining_size);
-        const VAddr current_vaddr = (page_index << PAGE_BITS) + page_offset;
+        const VAddr current_vaddr = static_cast<VAddr>((page_index << PAGE_BITS) + page_offset);
 
         switch (current_page_table->attributes[page_index]) {
         case PageType::Unmapped: {
@@ -584,13 +646,15 @@ void ZeroBlock(const VAddr dest_addr, const size_t size) {
             break;
         }
         case PageType::RasterizerCachedMemory: {
-            RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::FlushAndInvalidate);
+            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
+                                         FlushMode::FlushAndInvalidate);
             std::memset(GetPointerFromVMA(current_vaddr), 0, copy_amount);
             break;
         }
         case PageType::RasterizerCachedSpecial: {
             DEBUG_ASSERT(GetMMIOHandler(current_vaddr));
-            RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::FlushAndInvalidate);
+            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
+                                         FlushMode::FlushAndInvalidate);
             GetMMIOHandler(current_vaddr)->WriteBlock(current_vaddr, zeros.data(), copy_amount);
             break;
         }
@@ -611,7 +675,7 @@ void CopyBlock(VAddr dest_addr, VAddr src_addr, const size_t size) {
 
     while (remaining_size > 0) {
         const size_t copy_amount = std::min(PAGE_SIZE - page_offset, remaining_size);
-        const VAddr current_vaddr = (page_index << PAGE_BITS) + page_offset;
+        const VAddr current_vaddr = static_cast<VAddr>((page_index << PAGE_BITS) + page_offset);
 
         switch (current_page_table->attributes[page_index]) {
         case PageType::Unmapped: {
@@ -635,13 +699,15 @@ void CopyBlock(VAddr dest_addr, VAddr src_addr, const size_t size) {
             break;
         }
         case PageType::RasterizerCachedMemory: {
-            RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::Flush);
+            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
+                                         FlushMode::Flush);
             WriteBlock(dest_addr, GetPointerFromVMA(current_vaddr), copy_amount);
             break;
         }
         case PageType::RasterizerCachedSpecial: {
             DEBUG_ASSERT(GetMMIOHandler(current_vaddr));
-            RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::Flush);
+            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
+                                         FlushMode::Flush);
 
             std::vector<u8> buffer(copy_amount);
             GetMMIOHandler(current_vaddr)->ReadBlock(current_vaddr, buffer.data(), buffer.size());
@@ -654,8 +720,8 @@ void CopyBlock(VAddr dest_addr, VAddr src_addr, const size_t size) {
 
         page_index++;
         page_offset = 0;
-        dest_addr += copy_amount;
-        src_addr += copy_amount;
+        dest_addr += static_cast<VAddr>(copy_amount);
+        src_addr += static_cast<VAddr>(copy_amount);
         remaining_size -= copy_amount;
     }
 }
diff --git a/src/core/memory.h b/src/core/memory.h
index b228a48c2..dd599f73e 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -12,6 +12,10 @@
 #include "common/common_types.h"
 #include "core/mmio.h"
 
+namespace Kernel {
+class Process;
+}
+
 namespace Memory {
 
 /**
@@ -182,9 +186,13 @@ enum : VAddr {
 };
 
 /// Currently active page table
-extern PageTable* current_page_table;
+void SetCurrentPageTable(PageTable* page_table);
+PageTable* GetCurrentPageTable();
 
+/// Determines if the given VAddr is valid for the specified process.
+bool IsValidVirtualAddress(const Kernel::Process& process, const VAddr vaddr);
 bool IsValidVirtualAddress(const VAddr addr);
+
 bool IsValidPhysicalAddress(const PAddr addr);
 
 u8 Read8(VAddr addr);
@@ -197,7 +205,11 @@ void Write16(VAddr addr, u16 data);
 void Write32(VAddr addr, u32 data);
 void Write64(VAddr addr, u64 data);
 
+void ReadBlock(const Kernel::Process& process, const VAddr src_addr, void* dest_buffer,
+               size_t size);
 void ReadBlock(const VAddr src_addr, void* dest_buffer, size_t size);
+void WriteBlock(const Kernel::Process& process, const VAddr dest_addr, const void* src_buffer,
+                size_t size);
 void WriteBlock(const VAddr dest_addr, const void* src_buffer, size_t size);
 void ZeroBlock(const VAddr dest_addr, const size_t size);
 void CopyBlock(VAddr dest_addr, VAddr src_addr, size_t size);
@@ -259,10 +271,4 @@ enum class FlushMode {
  */
 void RasterizerFlushVirtualRegion(VAddr start, u32 size, FlushMode mode);
 
-/**
- * Dynarmic has an optimization to memory accesses when the pointer to the page exists that
- * can be used by setting up the current page table as a callback. This function is used to
- * retrieve the current page table for that purpose.
- */
-std::array<u8*, PAGE_TABLE_NUM_ENTRIES>* GetCurrentPageTablePointers();
 } // namespace Memory
diff --git a/src/network/packet.cpp b/src/network/packet.cpp
index cc60f2fbc..7e1a812f3 100644
--- a/src/network/packet.cpp
+++ b/src/network/packet.cpp
@@ -233,7 +233,7 @@ Packet& Packet::operator<<(double in_data) {
 
 Packet& Packet::operator<<(const char* in_data) {
     // First insert string length
-    u32 length = std::strlen(in_data);
+    u32 length = static_cast<u32>(std::strlen(in_data));
     *this << length;
 
     // Then insert characters
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 5e9c4c2bf..1aac0daa2 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SRCS
             core/arm/dyncom/arm_dyncom_vfp_tests.cpp
             core/file_sys/path_parser.cpp
             core/hle/kernel/hle_ipc.cpp
+            core/memory/memory.cpp
             glad.cpp
             tests.cpp
             )
diff --git a/src/tests/core/arm/arm_test_common.cpp b/src/tests/core/arm/arm_test_common.cpp
index 8384ce744..484713a92 100644
--- a/src/tests/core/arm/arm_test_common.cpp
+++ b/src/tests/core/arm/arm_test_common.cpp
@@ -3,30 +3,34 @@
 // Refer to the license.txt file included.
 
 #include "core/core.h"
+#include "core/hle/kernel/process.h"
 #include "core/memory.h"
 #include "core/memory_setup.h"
 #include "tests/core/arm/arm_test_common.h"
 
 namespace ArmTests {
 
-static Memory::PageTable page_table;
+static Memory::PageTable* page_table = nullptr;
 
 TestEnvironment::TestEnvironment(bool mutable_memory_)
     : mutable_memory(mutable_memory_), test_memory(std::make_shared<TestMemory>(this)) {
 
-    page_table.pointers.fill(nullptr);
-    page_table.attributes.fill(Memory::PageType::Unmapped);
-    page_table.cached_res_count.fill(0);
+    Kernel::g_current_process = Kernel::Process::Create(Kernel::CodeSet::Create("", 0));
+    page_table = &Kernel::g_current_process->vm_manager.page_table;
 
-    Memory::MapIoRegion(page_table, 0x00000000, 0x80000000, test_memory);
-    Memory::MapIoRegion(page_table, 0x80000000, 0x80000000, test_memory);
+    page_table->pointers.fill(nullptr);
+    page_table->attributes.fill(Memory::PageType::Unmapped);
+    page_table->cached_res_count.fill(0);
 
-    Memory::current_page_table = &page_table;
+    Memory::MapIoRegion(*page_table, 0x00000000, 0x80000000, test_memory);
+    Memory::MapIoRegion(*page_table, 0x80000000, 0x80000000, test_memory);
+
+    Memory::SetCurrentPageTable(page_table);
 }
 
 TestEnvironment::~TestEnvironment() {
-    Memory::UnmapRegion(page_table, 0x80000000, 0x80000000);
-    Memory::UnmapRegion(page_table, 0x00000000, 0x80000000);
+    Memory::UnmapRegion(*page_table, 0x80000000, 0x80000000);
+    Memory::UnmapRegion(*page_table, 0x00000000, 0x80000000);
 }
 
 void TestEnvironment::SetMemory64(VAddr vaddr, u64 value) {
diff --git a/src/tests/core/arm/dyncom/arm_dyncom_vfp_tests.cpp b/src/tests/core/arm/dyncom/arm_dyncom_vfp_tests.cpp
index 86de41773..83719a58e 100644
--- a/src/tests/core/arm/dyncom/arm_dyncom_vfp_tests.cpp
+++ b/src/tests/core/arm/dyncom/arm_dyncom_vfp_tests.cpp
@@ -5,6 +5,7 @@
 #include <catch.hpp>
 
 #include "core/arm/dyncom/arm_dyncom.h"
+#include "core/core_timing.h"
 #include "tests/core/arm/arm_test_common.h"
 
 namespace ArmTests {
@@ -29,7 +30,6 @@ TEST_CASE("ARM_DynCom (vfp): vadd", "[arm_dyncom]") {
     }};
 
     for (const auto& test_case : test_cases) {
-        dyncom.down_count = 1000; // Ensure that CoreTimeing will not be called.
         dyncom.SetPC(0);
         dyncom.SetVFPSystemReg(VFP_FPSCR, test_case.initial_fpscr);
         dyncom.SetVFPReg(4, test_case.a);
diff --git a/src/tests/core/memory/memory.cpp b/src/tests/core/memory/memory.cpp
new file mode 100644
index 000000000..a01b896f7
--- /dev/null
+++ b/src/tests/core/memory/memory.cpp
@@ -0,0 +1,56 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <catch.hpp>
+#include "core/hle/kernel/memory.h"
+#include "core/hle/kernel/process.h"
+#include "core/memory.h"
+
+// Exercises Memory::IsValidVirtualAddress on explicit processes. Catch re-runs the test case body
+// for every leaf SECTION, so each section below operates on its own freshly created process.
+TEST_CASE("Memory::IsValidVirtualAddress", "[core][memory]") {
+    auto process = Kernel::Process::Create(Kernel::CodeSet::Create("", 0));
+
+    SECTION("these regions should not be mapped on an empty process") {
+        CHECK_FALSE(Memory::IsValidVirtualAddress(*process, Memory::PROCESS_IMAGE_VADDR));
+        CHECK_FALSE(Memory::IsValidVirtualAddress(*process, Memory::HEAP_VADDR));
+        CHECK_FALSE(Memory::IsValidVirtualAddress(*process, Memory::LINEAR_HEAP_VADDR));
+        CHECK_FALSE(Memory::IsValidVirtualAddress(*process, Memory::VRAM_VADDR));
+        CHECK_FALSE(Memory::IsValidVirtualAddress(*process, Memory::CONFIG_MEMORY_VADDR));
+        CHECK_FALSE(Memory::IsValidVirtualAddress(*process, Memory::SHARED_PAGE_VADDR));
+        CHECK_FALSE(Memory::IsValidVirtualAddress(*process, Memory::TLS_AREA_VADDR));
+    }
+
+    SECTION("CONFIG_MEMORY_VADDR and SHARED_PAGE_VADDR should be valid after mapping them") {
+        Kernel::MapSharedPages(process->vm_manager);
+        CHECK(Memory::IsValidVirtualAddress(*process, Memory::CONFIG_MEMORY_VADDR));
+        CHECK(Memory::IsValidVirtualAddress(*process, Memory::SHARED_PAGE_VADDR));
+    }
+
+    SECTION("special regions should be valid after mapping them") {
+        SECTION("VRAM") {
+            Kernel::HandleSpecialMapping(process->vm_manager,
+                                         {Memory::VRAM_VADDR, Memory::VRAM_SIZE, false, false});
+            CHECK(Memory::IsValidVirtualAddress(*process, Memory::VRAM_VADDR));
+        }
+
+        SECTION("IO (Not yet implemented)") {
+            Kernel::HandleSpecialMapping(
+                process->vm_manager, {Memory::IO_AREA_VADDR, Memory::IO_AREA_SIZE, false, false});
+            CHECK_FALSE(Memory::IsValidVirtualAddress(*process, Memory::IO_AREA_VADDR));
+        }
+
+        SECTION("DSP") {
+            Kernel::HandleSpecialMapping(
+                process->vm_manager, {Memory::DSP_RAM_VADDR, Memory::DSP_RAM_SIZE, false, false});
+            CHECK(Memory::IsValidVirtualAddress(*process, Memory::DSP_RAM_VADDR));
+        }
+    }
+
+    SECTION("Unmapping a VAddr should make it invalid") {
+        Kernel::MapSharedPages(process->vm_manager);
+        process->vm_manager.UnmapRange(Memory::CONFIG_MEMORY_VADDR, Memory::CONFIG_MEMORY_SIZE);
+        CHECK_FALSE(Memory::IsValidVirtualAddress(*process, Memory::CONFIG_MEMORY_VADDR));
+    }
+}
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index fb65a3a0a..caf9f7a06 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -119,6 +119,224 @@ static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup,
     }
 }
 
+/// Consumes one word written to a vs_default_attributes_setup.set_value register. Every third
+/// word completes a packed attribute, which is decoded and stored either as a default attribute
+/// (index < 15) or as the next attribute of the immediate-mode vertex (index 15).
+static void LoadDefaultVertexAttributes(u32 register_value) {
+    // TODO: Does actual hardware indeed keep an intermediate buffer or does
+    //       it directly write the values?
+    default_attr_write_buffer[default_attr_counter++] = register_value;
+
+    // Four float24 values are packed into three 32-bit words, so nothing is written to internal
+    // memory until the third word of a vector has arrived.
+    if (default_attr_counter < 3)
+        return;
+
+    default_attr_counter = 0;
+
+    auto& regs = g_state.regs;
+    auto& setup = regs.pipeline.vs_default_attributes_setup;
+    if (setup.index >= 16) {
+        LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
+        return;
+    }
+
+    // NOTE: The destination component order indeed is "backwards"
+    const auto& words = default_attr_write_buffer;
+    Math::Vec4<float24> attribute;
+    attribute.w = float24::FromRaw(words[0] >> 8);
+    attribute.z = float24::FromRaw(((words[0] & 0xFF) << 16) | ((words[1] >> 16) & 0xFFFF));
+    attribute.y = float24::FromRaw(((words[1] & 0xFFFF) << 8) | ((words[2] >> 24) & 0xFF));
+    attribute.x = float24::FromRaw(words[2] & 0xFFFFFF);
+
+    LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
+              attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
+              attribute.w.ToFloat32());
+
+    // TODO: Verify that this actually modifies the register!
+    if (setup.index < 15) {
+        g_state.input_default_attributes.attr[setup.index] = attribute;
+        setup.index++;
+        return;
+    }
+
+    // Index 15: put each attribute into an immediate input buffer.  When all specified immediate
+    // attributes are present, the Vertex Shader is invoked and everything is sent to
+    // the primitive assembler.
+    auto& input_vertex = g_state.immediate.input_vertex;
+    auto& attribute_slot = g_state.immediate.current_attribute;
+    input_vertex.attr[attribute_slot] = attribute;
+
+    if (attribute_slot < regs.pipeline.max_input_attrib_index) {
+        attribute_slot += 1;
+        return;
+    }
+
+    // The vertex is complete: run it through the shader and the geometry pipeline.
+    MICROPROFILE_SCOPE(GPU_Drawing);
+    attribute_slot = 0;
+
+    auto* shader_engine = Shader::GetEngine();
+    shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
+
+    // Send to vertex shader
+    if (g_debug_context)
+        g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
+                                 static_cast<void*>(&input_vertex));
+    Shader::UnitState shader_unit;
+    Shader::AttributeBuffer output{};
+
+    shader_unit.LoadInput(regs.vs, input_vertex);
+    shader_engine->Run(g_state.vs, shader_unit);
+    shader_unit.WriteOutput(regs.vs, output);
+
+    // Send to geometry pipeline
+    if (g_state.immediate.reset_geometry_pipeline) {
+        g_state.geometry_pipeline.Reconfigure();
+        g_state.immediate.reset_geometry_pipeline = false;
+    }
+    ASSERT(!g_state.geometry_pipeline.NeedIndexInput());
+    g_state.geometry_pipeline.Setup(shader_engine);
+    g_state.geometry_pipeline.SubmitVertex(output);
+
+    // TODO: If drawing after every immediate mode triangle kills performance,
+    // change it to flush triangles whenever a drawing config register changes
+    // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550
+    VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+    if (g_debug_context) {
+        g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+    }
+}
+
+/// Processes one batch of vertices: each is loaded, run through the vertex shader and submitted to
+/// the geometry pipeline; then the rasterizer draws. command_id selects indexed vs. non-indexed.
+static void Draw(u32 command_id) {
+    MICROPROFILE_SCOPE(GPU_Drawing);
+    auto& regs = g_state.regs;
+
+#if PICA_LOG_TEV
+    DebugUtils::DumpTevStageConfig(regs.GetTevStages());
+#endif
+    if (g_debug_context)
+        g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
+
+    // Processes information about internal vertex attributes to figure out how a vertex is
+    // loaded. Later, these can be compiled and cached.
+    const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
+    VertexLoader loader(regs.pipeline);
+
+    // Load vertices
+    bool is_indexed = (command_id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
+
+    const auto& index_info = regs.pipeline.index_array;
+    const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
+    const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
+    bool index_u16 = index_info.format != 0;
+
+    if (g_debug_context && g_debug_context->recorder) {
+        for (int i = 0; i < 3; ++i) {
+            const auto texture = regs.texturing.GetTextures()[i];
+            if (!texture.enabled)
+                continue;
+
+            u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
+            g_debug_context->recorder->MemoryAccessed(
+                texture_data, Pica::TexturingRegs::NibblesPerPixel(texture.format) *
+                                  texture.config.width / 2 * texture.config.height,
+                texture.config.GetPhysicalAddress());
+        }
+    }
+
+    DebugUtils::MemoryAccessTracker memory_accesses;
+
+    // Simple circular-replacement vertex cache
+    // The size has been tuned for optimal balance between hit-rate and the cost of lookup
+    const size_t VERTEX_CACHE_SIZE = 32;
+    std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
+    std::array<Shader::AttributeBuffer, VERTEX_CACHE_SIZE> vertex_cache;
+    Shader::AttributeBuffer vs_output;
+
+    unsigned int vertex_cache_pos = 0;
+    vertex_cache_ids.fill(-1);
+
+    auto* shader_engine = Shader::GetEngine();
+    Shader::UnitState shader_unit;
+
+    shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
+
+    g_state.geometry_pipeline.Reconfigure();
+    g_state.geometry_pipeline.Setup(shader_engine);
+    if (g_state.geometry_pipeline.NeedIndexInput())
+        ASSERT(is_indexed);
+
+    for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
+        // Indexed rendering doesn't use the start offset
+        unsigned int vertex = is_indexed
+                                  ? (index_u16 ? index_address_16[index] : index_address_8[index])
+                                  : (index + regs.pipeline.vertex_offset);
+
+        // -1 is a common special value used for primitive restart. Since it's unknown if
+        // the PICA supports it, and it would mess up the caching, guard against it here.
+        ASSERT(vertex != -1);
+
+        bool vertex_cache_hit = false;
+
+        if (is_indexed) {
+            if (g_state.geometry_pipeline.NeedIndexInput()) {
+                g_state.geometry_pipeline.SubmitIndex(vertex);
+                continue;
+            }
+
+            if (g_debug_context && g_debug_context->recorder) {
+                int size = index_u16 ? 2 : 1;
+                memory_accesses.AddAccess(base_address + index_info.offset + size * index, size);
+            }
+
+            for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
+                if (vertex == vertex_cache_ids[i]) {
+                    vs_output = vertex_cache[i];
+                    vertex_cache_hit = true;
+                    break;
+                }
+            }
+        }
+
+        if (!vertex_cache_hit) {
+            // Initialize data for the current vertex
+            Shader::AttributeBuffer input;
+            loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
+
+            // Send to vertex shader
+            if (g_debug_context)
+                g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
+                                         static_cast<void*>(&input));
+            shader_unit.LoadInput(regs.vs, input);
+            shader_engine->Run(g_state.vs, shader_unit);
+            shader_unit.WriteOutput(regs.vs, vs_output);
+
+            if (is_indexed) {
+                vertex_cache[vertex_cache_pos] = vs_output;
+                vertex_cache_ids[vertex_cache_pos] = vertex;
+                vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
+            }
+        }
+
+        // Send to geometry pipeline
+        g_state.geometry_pipeline.SubmitVertex(vs_output);
+    }
+
+    if (g_debug_context && g_debug_context->recorder) {
+        for (const auto& range : memory_accesses.ranges) {
+            g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
+                                                      range.second, range.first);
+        }
+    }
+
+    VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+    if (g_debug_context)
+        g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+}
+
 static void WritePicaReg(u32 id, u32 value, u32 mask) {
     auto& regs = g_state.regs;
 
@@ -168,98 +386,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     // Load default vertex input attributes
     case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[0], 0x233):
     case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[1], 0x234):
-    case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[2], 0x235): {
-        // TODO: Does actual hardware indeed keep an intermediate buffer or does
-        //       it directly write the values?
-        default_attr_write_buffer[default_attr_counter++] = value;
-
-        // Default attributes are written in a packed format such that four float24 values are
-        // encoded in
-        // three 32-bit numbers. We write to internal memory once a full such vector is
-        // written.
-        if (default_attr_counter >= 3) {
-            default_attr_counter = 0;
-
-            auto& setup = regs.pipeline.vs_default_attributes_setup;
-
-            if (setup.index >= 16) {
-                LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
-                break;
-            }
-
-            Math::Vec4<float24> attribute;
-
-            // NOTE: The destination component order indeed is "backwards"
-            attribute.w = float24::FromRaw(default_attr_write_buffer[0] >> 8);
-            attribute.z = float24::FromRaw(((default_attr_write_buffer[0] & 0xFF) << 16) |
-                                           ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
-            attribute.y = float24::FromRaw(((default_attr_write_buffer[1] & 0xFFFF) << 8) |
-                                           ((default_attr_write_buffer[2] >> 24) & 0xFF));
-            attribute.x = float24::FromRaw(default_attr_write_buffer[2] & 0xFFFFFF);
-
-            LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
-                      attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
-                      attribute.w.ToFloat32());
-
-            // TODO: Verify that this actually modifies the register!
-            if (setup.index < 15) {
-                g_state.input_default_attributes.attr[setup.index] = attribute;
-                setup.index++;
-            } else {
-                // Put each attribute into an immediate input buffer.  When all specified immediate
-                // attributes are present, the Vertex Shader is invoked and everything is sent to
-                // the primitive assembler.
-
-                auto& immediate_input = g_state.immediate.input_vertex;
-                auto& immediate_attribute_id = g_state.immediate.current_attribute;
-
-                immediate_input.attr[immediate_attribute_id] = attribute;
-
-                if (immediate_attribute_id < regs.pipeline.max_input_attrib_index) {
-                    immediate_attribute_id += 1;
-                } else {
-                    MICROPROFILE_SCOPE(GPU_Drawing);
-                    immediate_attribute_id = 0;
-
-                    auto* shader_engine = Shader::GetEngine();
-                    shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
-
-                    // Send to vertex shader
-                    if (g_debug_context)
-                        g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
-                                                 static_cast<void*>(&immediate_input));
-                    Shader::UnitState shader_unit;
-                    Shader::AttributeBuffer output{};
-
-                    shader_unit.LoadInput(regs.vs, immediate_input);
-                    shader_engine->Run(g_state.vs, shader_unit);
-                    shader_unit.WriteOutput(regs.vs, output);
-
-                    // Send to geometry pipeline
-                    if (g_state.immediate.reset_geometry_pipeline) {
-                        g_state.geometry_pipeline.Reconfigure();
-                        g_state.immediate.reset_geometry_pipeline = false;
-                    }
-                    ASSERT(!g_state.geometry_pipeline.NeedIndexInput());
-                    g_state.geometry_pipeline.Setup(shader_engine);
-                    g_state.geometry_pipeline.SubmitVertex(output);
-                }
-            }
-        }
+    case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[2], 0x235):
+        LoadDefaultVertexAttributes(value);
         break;
-    }
 
     case PICA_REG_INDEX(pipeline.gpu_mode):
-        if (regs.pipeline.gpu_mode == PipelineRegs::GPUMode::Configuring) {
-            MICROPROFILE_SCOPE(GPU_Drawing);
-
-            // Draw immediate mode triangles when GPU Mode is set to GPUMode::Configuring
-            VideoCore::g_renderer->Rasterizer()->DrawTriangles();
-
-            if (g_debug_context) {
-                g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
-            }
-        }
+        // This register likely just enables vertex processing and doesn't need any special handling
         break;
 
     case PICA_REG_INDEX_WORKAROUND(pipeline.command_buffer.trigger[0], 0x23c):
@@ -275,131 +407,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
     // It seems like these trigger vertex rendering
     case PICA_REG_INDEX(pipeline.trigger_draw):
-    case PICA_REG_INDEX(pipeline.trigger_draw_indexed): {
-        MICROPROFILE_SCOPE(GPU_Drawing);
-
-#if PICA_LOG_TEV
-        DebugUtils::DumpTevStageConfig(regs.GetTevStages());
-#endif
-        if (g_debug_context)
-            g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
-
-        // Processes information about internal vertex attributes to figure out how a vertex is
-        // loaded.
-        // Later, these can be compiled and cached.
-        const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
-        VertexLoader loader(regs.pipeline);
-
-        // Load vertices
-        bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
-
-        const auto& index_info = regs.pipeline.index_array;
-        const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
-        const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
-        bool index_u16 = index_info.format != 0;
-
-        PrimitiveAssembler<Shader::OutputVertex>& primitive_assembler = g_state.primitive_assembler;
-
-        if (g_debug_context && g_debug_context->recorder) {
-            for (int i = 0; i < 3; ++i) {
-                const auto texture = regs.texturing.GetTextures()[i];
-                if (!texture.enabled)
-                    continue;
-
-                u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
-                g_debug_context->recorder->MemoryAccessed(
-                    texture_data, Pica::TexturingRegs::NibblesPerPixel(texture.format) *
-                                      texture.config.width / 2 * texture.config.height,
-                    texture.config.GetPhysicalAddress());
-            }
-        }
-
-        DebugUtils::MemoryAccessTracker memory_accesses;
-
-        // Simple circular-replacement vertex cache
-        // The size has been tuned for optimal balance between hit-rate and the cost of lookup
-        const size_t VERTEX_CACHE_SIZE = 32;
-        std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
-        std::array<Shader::AttributeBuffer, VERTEX_CACHE_SIZE> vertex_cache;
-        Shader::AttributeBuffer vs_output;
-
-        unsigned int vertex_cache_pos = 0;
-        vertex_cache_ids.fill(-1);
-
-        auto* shader_engine = Shader::GetEngine();
-        Shader::UnitState shader_unit;
-
-        shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
-
-        g_state.geometry_pipeline.Reconfigure();
-        g_state.geometry_pipeline.Setup(shader_engine);
-        if (g_state.geometry_pipeline.NeedIndexInput())
-            ASSERT(is_indexed);
-
-        for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
-            // Indexed rendering doesn't use the start offset
-            unsigned int vertex =
-                is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index])
-                           : (index + regs.pipeline.vertex_offset);
-
-            // -1 is a common special value used for primitive restart. Since it's unknown if
-            // the PICA supports it, and it would mess up the caching, guard against it here.
-            ASSERT(vertex != -1);
-
-            bool vertex_cache_hit = false;
-
-            if (is_indexed) {
-                if (g_state.geometry_pipeline.NeedIndexInput()) {
-                    g_state.geometry_pipeline.SubmitIndex(vertex);
-                    continue;
-                }
-
-                if (g_debug_context && Pica::g_debug_context->recorder) {
-                    int size = index_u16 ? 2 : 1;
-                    memory_accesses.AddAccess(base_address + index_info.offset + size * index,
-                                              size);
-                }
-
-                for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
-                    if (vertex == vertex_cache_ids[i]) {
-                        vs_output = vertex_cache[i];
-                        vertex_cache_hit = true;
-                        break;
-                    }
-                }
-            }
-
-            if (!vertex_cache_hit) {
-                // Initialize data for the current vertex
-                Shader::AttributeBuffer input;
-                loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
-
-                // Send to vertex shader
-                if (g_debug_context)
-                    g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
-                                             (void*)&input);
-                shader_unit.LoadInput(regs.vs, input);
-                shader_engine->Run(g_state.vs, shader_unit);
-                shader_unit.WriteOutput(regs.vs, vs_output);
-
-                if (is_indexed) {
-                    vertex_cache[vertex_cache_pos] = vs_output;
-                    vertex_cache_ids[vertex_cache_pos] = vertex;
-                    vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
-                }
-            }
-
-            // Send to geometry pipeline
-            g_state.geometry_pipeline.SubmitVertex(vs_output);
-        }
-
-        for (auto& range : memory_accesses.ranges) {
-            g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
-                                                      range.second, range.first);
-        }
-
+    case PICA_REG_INDEX(pipeline.trigger_draw_indexed):
+        Draw(id);
         break;
-    }
 
     case PICA_REG_INDEX(gs.bool_uniforms):
         WriteUniformBoolReg(g_state.gs, g_state.regs.gs.bool_uniforms.Value());
@@ -632,6 +642,6 @@ void ProcessCommandList(const u32* list, u32 size) {
     }
 }
 
-} // namespace
+} // namespace CommandProcessor
 
-} // namespace
+} // namespace Pica
diff --git a/src/video_core/geometry_pipeline.cpp b/src/video_core/geometry_pipeline.cpp
index b146e2ecb..98ff2ccd3 100644
--- a/src/video_core/geometry_pipeline.cpp
+++ b/src/video_core/geometry_pipeline.cpp
@@ -105,7 +105,7 @@ public:
         DEBUG_ASSERT(need_index);
 
         // The number of vertex input is put to the uniform register
-        float24 vertex_num = float24::FromFloat32(val);
+        float24 vertex_num = float24::FromFloat32(static_cast<float>(val));
         setup.uniforms.f[0] = Math::MakeVec(vertex_num, vertex_num, vertex_num, vertex_num);
 
         // The second uniform register and so on are used for receiving input vertices
diff --git a/src/video_core/pica_types.h b/src/video_core/pica_types.h
index 5d7e10066..2eafa7e9e 100644
--- a/src/video_core/pica_types.h
+++ b/src/video_core/pica_types.h
@@ -58,11 +58,12 @@ public:
     }
 
     Float<M, E> operator*(const Float<M, E>& flt) const {
-        if ((this->value == 0.f && !std::isnan(flt.value)) ||
-            (flt.value == 0.f && !std::isnan(this->value)))
-            // PICA gives 0 instead of NaN when multiplying by inf
-            return Zero();
-        return Float<M, E>::FromFloat32(ToFloat32() * flt.ToFloat32());
+        float result = value * flt.ToFloat32();
+        // PICA gives 0 instead of NaN when multiplying by inf
+        if (!std::isnan(value) && !std::isnan(flt.ToFloat32()))
+            if (std::isnan(result))
+                result = 0.f;
+        return Float<M, E>::FromFloat32(result);
     }
 
     Float<M, E> operator/(const Float<M, E>& flt) const {
@@ -78,12 +79,7 @@ public:
     }
 
     Float<M, E>& operator*=(const Float<M, E>& flt) {
-        if ((this->value == 0.f && !std::isnan(flt.value)) ||
-            (flt.value == 0.f && !std::isnan(this->value)))
-            // PICA gives 0 instead of NaN when multiplying by inf
-            *this = Zero();
-        else
-            value *= flt.ToFloat32();
+        value = operator*(flt).value;
         return *this;
     }
 
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 06a905766..5770ae08f 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -267,9 +267,9 @@ void OpenGLState::Apply() const {
     for (size_t i = 0; i < clip_distance.size(); ++i) {
         if (clip_distance[i] != cur_state.clip_distance[i]) {
             if (clip_distance[i]) {
-                glEnable(GL_CLIP_DISTANCE0 + i);
+                glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
             } else {
-                glDisable(GL_CLIP_DISTANCE0 + i);
+                glDisable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
             }
         }
     }
diff --git a/src/video_core/swrasterizer/clipper.cpp b/src/video_core/swrasterizer/clipper.cpp
index a52129eb7..c1ed48398 100644
--- a/src/video_core/swrasterizer/clipper.cpp
+++ b/src/video_core/swrasterizer/clipper.cpp
@@ -98,7 +98,7 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
 
     auto FlipQuaternionIfOpposite = [](auto& a, const auto& b) {
         if (Math::Dot(a, b) < float24::Zero())
-            a = -a;
+            a = a * float24::FromFloat32(-1.0f);
     };
 
     // Flip the quaternions if they are opposite to prevent interpolating them over the wrong
diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index 7ce83a055..d8567f314 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -8,17 +8,11 @@
 
 namespace VideoCore {
 
-/**
- * Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
- * arranged in a Z-order curve. More details on the bit manipulation at:
- * https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
- */
+// 8x8 Z-Order coordinate from 2D coordinates
 static inline u32 MortonInterleave(u32 x, u32 y) {
-    u32 i = (x & 7) | ((y & 7) << 8); // ---- -210
-    i = (i ^ (i << 2)) & 0x1313;      // ---2 --10
-    i = (i ^ (i << 1)) & 0x1515;      // ---2 -1-0
-    i = (i | (i >> 7)) & 0x3F;
-    return i;
+    static const u32 xlut[] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15};
+    static const u32 ylut[] = {0x00, 0x02, 0x08, 0x0a, 0x20, 0x22, 0x28, 0x2a};
+    return xlut[x % 8] + ylut[y % 8];
 }
 
 /**