65 files changed, 1614 insertions, 582 deletions
diff --git a/src/audio_core/hle/source.cpp b/src/audio_core/hle/source.cpp
index 92484c526..de4e88cae 100644
--- a/src/audio_core/hle/source.cpp
+++ b/src/audio_core/hle/source.cpp
@@ -244,17 +244,27 @@ void Source::GenerateFrame() {
             break;
         }
 
-        const size_t size_to_copy =
-            std::min(state.current_buffer.size(), current_frame.size() - frame_position);
-
-        std::copy(state.current_buffer.begin(), state.current_buffer.begin() + size_to_copy,
-                  current_frame.begin() + frame_position);
-        state.current_buffer.erase(state.current_buffer.begin(),
-                                   state.current_buffer.begin() + size_to_copy);
-
-        frame_position += size_to_copy;
-        state.next_sample_number += static_cast<u32>(size_to_copy);
+        switch (state.interpolation_mode) {
+        case InterpolationMode::None:
+            AudioInterp::None(state.interp_state, state.current_buffer, state.rate_multiplier,
+                              current_frame, frame_position);
+            break;
+        case InterpolationMode::Linear:
+            AudioInterp::Linear(state.interp_state, state.current_buffer, state.rate_multiplier,
+                                current_frame, frame_position);
+            break;
+        case InterpolationMode::Polyphase:
+            // TODO(merry): Implement polyphase interpolation
+            LOG_DEBUG(Audio_DSP, "Polyphase interpolation unimplemented; falling back to linear");
+            AudioInterp::Linear(state.interp_state, state.current_buffer, state.rate_multiplier,
+                                current_frame, frame_position);
+            break;
+        default:
+            UNIMPLEMENTED();
+            break;
+        }
     }
+    state.next_sample_number += frame_position;
 
     state.filters.ProcessFrame(current_frame);
 }
@@ -305,25 +315,6 @@ bool Source::DequeueBuffer() {
         return true;
     }
 
-    switch (state.interpolation_mode) {
-    case InterpolationMode::None:
-        state.current_buffer =
-            AudioInterp::None(state.interp_state, state.current_buffer, state.rate_multiplier);
-        break;
-    case InterpolationMode::Linear:
-        state.current_buffer =
-            AudioInterp::Linear(state.interp_state, state.current_buffer, state.rate_multiplier);
-        break;
-    case InterpolationMode::Polyphase:
-        // TODO(merry): Implement polyphase interpolation
-        state.current_buffer =
-            AudioInterp::Linear(state.interp_state, state.current_buffer, state.rate_multiplier);
-        break;
-    default:
-        UNIMPLEMENTED();
-        break;
-    }
-
     // the first playthrough starts at play_position, loops start at the beginning of the buffer
     state.current_sample_number = (!buf.has_played) ? buf.play_position : 0;
     state.next_sample_number = state.current_sample_number;
diff --git a/src/audio_core/interpolate.cpp b/src/audio_core/interpolate.cpp
index 8a5d4181a..16e68bc5c 100644
--- a/src/audio_core/interpolate.cpp
+++ b/src/audio_core/interpolate.cpp
@@ -13,74 +13,64 @@ namespace AudioInterp {
 constexpr u64 scale_factor = 1 << 24;
 constexpr u64 scale_mask = scale_factor - 1;
 
-/// Here we step over the input in steps of rate_multiplier, until we consume all of the input.
+/// Here we step over the input in steps of rate, until the output is full or the input runs out.
 /// Three adjacent samples are passed to fn each step.
 template <typename Function>
-static StereoBuffer16 StepOverSamples(State& state, const StereoBuffer16& input,
-                                      float rate_multiplier, Function fn) {
-    ASSERT(rate_multiplier > 0);
+static void StepOverSamples(State& state, StereoBuffer16& input, float rate,
+                            DSP::HLE::StereoFrame16& output, size_t& outputi, Function fn) {
+    ASSERT(rate > 0);
 
-    if (input.size() < 2)
-        return {};
+    if (input.empty()) // nothing to resample; state and output are left untouched
+        return;
 
-    StereoBuffer16 output;
-    output.reserve(static_cast<size_t>(input.size() / rate_multiplier));
+    input.insert(input.begin(), {state.xn2, state.xn1}); // prepend the two history samples
 
-    u64 step_size = static_cast<u64>(rate_multiplier * scale_factor);
+    const u64 step_size = static_cast<u64>(rate * scale_factor); // fixed-point step per output
+    u64 fposition = state.fposition; // resume from the position stored by the previous call
+    size_t inputi = 0;
 
-    u64 fposition = 0;
-    const u64 max_fposition = input.size() * scale_factor;
+    while (outputi < output.size()) {
+        inputi = static_cast<size_t>(fposition / scale_factor); // integer part of the position
 
-    while (fposition < 1 * scale_factor) {
-        u64 fraction = fposition & scale_mask;
-
-        output.push_back(fn(fraction, state.xn2, state.xn1, input[0]));
-
-        fposition += step_size;
-    }
-
-    while (fposition < 2 * scale_factor) {
-        u64 fraction = fposition & scale_mask;
-
-        output.push_back(fn(fraction, state.xn1, input[0], input[1]));
-
-        fposition += step_size;
-    }
+        if (inputi + 2 >= input.size()) { // fn needs three samples starting at inputi
+            inputi = input.size() - 2; // the last two samples become the new history
+            break;
+        }
 
-    while (fposition < max_fposition) {
         u64 fraction = fposition & scale_mask;
-
-        size_t index = static_cast<size_t>(fposition / scale_factor);
-        output.push_back(fn(fraction, input[index - 2], input[index - 1], input[index]));
+        output[outputi++] = fn(fraction, input[inputi], input[inputi + 1], input[inputi + 2]);
 
         fposition += step_size;
     }
 
-    state.xn2 = input[input.size() - 2];
-    state.xn1 = input[input.size() - 1];
+    state.xn2 = input[inputi]; // saved so the next call can prepend it again
+    state.xn1 = input[inputi + 1];
+    state.fposition = fposition - inputi * scale_factor; // rebase onto the saved pair
 
-    return output;
+    input.erase(input.begin(), input.begin() + inputi + 2); // drop saved pair and all before it
 }
 
-StereoBuffer16 None(State& state, const StereoBuffer16& input, float rate_multiplier) {
-    return StepOverSamples(
-        state, input, rate_multiplier,
+void None(State& state, StereoBuffer16& input, float rate, DSP::HLE::StereoFrame16& output,
+          size_t& outputi) {
+    StepOverSamples( // the lambda returns x0 unchanged; fraction, x1 and x2 are ignored
+        state, input, rate, output, outputi,
         [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) { return x0; });
 }
 
-StereoBuffer16 Linear(State& state, const StereoBuffer16& input, float rate_multiplier) {
+void Linear(State& state, StereoBuffer16& input, float rate, DSP::HLE::StereoFrame16& output,
+            size_t& outputi) {
     // Note on accuracy: Some values that this produces are +/- 1 from the actual firmware.
-    return StepOverSamples(state, input, rate_multiplier,
-                           [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) {
-                               // This is a saturated subtraction. (Verified by black-box fuzzing.)
-                               s64 delta0 = MathUtil::Clamp<s64>(x1[0] - x0[0], -32768, 32767);
-                               s64 delta1 = MathUtil::Clamp<s64>(x1[1] - x0[1], -32768, 32767);
-
-                               return std::array<s16, 2>{
-                                   static_cast<s16>(x0[0] + fraction * delta0 / scale_factor),
-                                   static_cast<s16>(x0[1] + fraction * delta1 / scale_factor),
-                               };
-                           });
+    StepOverSamples(state, input, rate, output, outputi,
+                    [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) {
+                        // This is a saturated subtraction. (Verified by black-box fuzzing.)
+                        s64 delta0 = MathUtil::Clamp<s64>(x1[0] - x0[0], -32768, 32767);
+                        s64 delta1 = MathUtil::Clamp<s64>(x1[1] - x0[1], -32768, 32767);
+
+                        return std::array<s16, 2>{ // x0 + fraction * delta per channel; x2 unused
+                            static_cast<s16>(x0[0] + fraction * delta0 / scale_factor),
+                            static_cast<s16>(x0[1] + fraction * delta1 / scale_factor),
+                        };
+                    });
 }
 
 } // namespace AudioInterp
diff --git a/src/audio_core/interpolate.h b/src/audio_core/interpolate.h
index 19a7b66cb..59f59bc14 100644
--- a/src/audio_core/interpolate.h
+++ b/src/audio_core/interpolate.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <vector>
+#include "audio_core/hle/common.h"
 #include "common/common_types.h"
 
 namespace AudioInterp {
@@ -14,31 +15,35 @@ namespace AudioInterp {
 using StereoBuffer16 = std::vector<std::array<s16, 2>>;
 
 struct State {
-    // Two historical samples.
+    /// Two historical samples.
     std::array<s16, 2> xn1 = {}; ///< x[n-1]
     std::array<s16, 2> xn2 = {}; ///< x[n-2]
+    /// Read position carried between calls, in fixed point (1 << 24 per input sample).
+    u64 fposition = 0;
 };
 
 /**
  * No interpolation. This is equivalent to a zero-order hold. There is a two-sample predelay.
  * @param state Interpolation state.
  * @param input Input buffer.
- * @param rate_multiplier Stretch factor. Must be a positive non-zero value.
- *                        rate_multiplier > 1.0 performs decimation and rate_multipler < 1.0
- *                        performs upsampling.
- * @return The resampled audio buffer.
+ * @param rate Stretch factor. Must be a positive non-zero value.
+ *             rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
+ * @param output The resampled audio buffer.
+ * @param outputi The index of output to start writing to.
  */
-StereoBuffer16 None(State& state, const StereoBuffer16& input, float rate_multiplier);
+void None(State& state, StereoBuffer16& input, float rate, DSP::HLE::StereoFrame16& output,
+          size_t& outputi);
 
 /**
  * Linear interpolation. This is equivalent to a first-order hold. There is a two-sample predelay.
  * @param state Interpolation state.
  * @param input Input buffer.
- * @param rate_multiplier Stretch factor. Must be a positive non-zero value.
- *                        rate_multiplier > 1.0 performs decimation and rate_multipler < 1.0
- *                        performs upsampling.
- * @return The resampled audio buffer.
+ * @param rate Stretch factor. Must be a positive non-zero value.
+ *             rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
+ * @param output The resampled audio buffer.
+ * @param outputi The index of output to start writing to.
  */
-StereoBuffer16 Linear(State& state, const StereoBuffer16& input, float rate_multiplier);
+void Linear(State& state, StereoBuffer16& input, float rate, DSP::HLE::StereoFrame16& output,
+            size_t& outputi);
 
 } // namespace AudioInterp
diff --git a/src/citra/citra.rc b/src/citra/citra.rc
index fea603004..c490ef302 100644
--- a/src/citra/citra.rc
+++ b/src/citra/citra.rc
@@ -1,3 +1,4 @@
+#include "winresrc.h"
 /////////////////////////////////////////////////////////////////////////////
 //
 // Icon
@@ -7,3 +8,10 @@
 // remains consistent on all systems.
 CITRA_ICON              ICON                    "../../dist/citra.ico"
 
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// RT_MANIFEST
+//
+
+1                       RT_MANIFEST             "../../dist/citra.manifest"
diff --git a/src/citra/config.cpp b/src/citra/config.cpp
index 3869b6b5d..a48ef08c7 100644
--- a/src/citra/config.cpp
+++ b/src/citra/config.cpp
@@ -78,6 +78,8 @@ void Config::ReadValues() {
 
     Settings::values.motion_device = sdl2_config->Get(
         "Controls", "motion_device", "engine:motion_emu,update_period:100,sensitivity:0.01");
+    Settings::values.touch_device =
+        sdl2_config->Get("Controls", "touch_device", "engine:emu_window");
 
     // Core
     Settings::values.use_cpu_jit = sdl2_config->GetBoolean("Core", "use_cpu_jit", true);
diff --git a/src/citra/default_ini.h b/src/citra/default_ini.h
index ea02a788d..4b13a2e1b 100644
--- a/src/citra/default_ini.h
+++ b/src/citra/default_ini.h
@@ -62,6 +62,10 @@ c_stick=
 #      - "sensitivity": the coefficient converting mouse movement to tilting angle (default to 0.01)
 motion_device=
 
+# for touch input, the following devices are available:
+#  - "emu_window" (default) for emulating touch input from mouse input to the emulation window. No parameters required
+touch_device=
+
 [Core]
 # Whether to use the Just-In-Time (JIT) compiler for CPU emulation
 # 0: Interpreter (slow), 1 (default): JIT (fast)
diff --git a/src/citra_qt/citra-qt.rc b/src/citra_qt/citra-qt.rc
index fea603004..a48a9440d 100644
--- a/src/citra_qt/citra-qt.rc
+++ b/src/citra_qt/citra-qt.rc
@@ -1,3 +1,4 @@
+#include "winresrc.h"
 /////////////////////////////////////////////////////////////////////////////
 //
 // Icon
@@ -5,5 +6,14 @@
 
 // Icon with lowest ID value placed first to ensure application icon
 // remains consistent on all systems.
-CITRA_ICON              ICON                    "../../dist/citra.ico"
+// Qt requires that the default application icon is named IDI_ICON1
 
+IDI_ICON1               ICON                    "../../dist/citra.ico"
+
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// RT_MANIFEST
+//
+
+1                       RT_MANIFEST             "../../dist/citra.manifest"
diff --git a/src/citra_qt/configuration/config.cpp b/src/citra_qt/configuration/config.cpp
index e2dceaa4c..ef114aad3 100644
--- a/src/citra_qt/configuration/config.cpp
+++ b/src/citra_qt/configuration/config.cpp
@@ -61,6 +61,8 @@ void Config::ReadValues() {
         qt_config->value("motion_device", "engine:motion_emu,update_period:100,sensitivity:0.01")
             .toString()
             .toStdString();
+    Settings::values.touch_device =
+        qt_config->value("touch_device", "engine:emu_window").toString().toStdString();
 
     qt_config->endGroup();
 
@@ -213,6 +215,7 @@ void Config::SaveValues() {
                             QString::fromStdString(Settings::values.analogs[i]));
     }
     qt_config->setValue("motion_device", QString::fromStdString(Settings::values.motion_device));
+    qt_config->setValue("touch_device", QString::fromStdString(Settings::values.touch_device));
     qt_config->endGroup();
 
     qt_config->beginGroup("Core");
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 89578024f..cd1a8de2d 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -146,6 +146,7 @@ set(SRCS
             hle/service/nwm/nwm_tst.cpp
             hle/service/nwm/nwm_uds.cpp
             hle/service/nwm/uds_beacon.cpp
+            hle/service/nwm/uds_connection.cpp
             hle/service/nwm/uds_data.cpp
             hle/service/pm_app.cpp
             hle/service/ptm/ptm.cpp
@@ -346,6 +347,7 @@ set(HEADERS
             hle/service/nwm/nwm_tst.h
             hle/service/nwm/nwm_uds.h
             hle/service/nwm/uds_beacon.h
+            hle/service/nwm/uds_connection.h
             hle/service/nwm/uds_data.h
             hle/service/pm_app.h
             hle/service/ptm/ptm.h
diff --git a/src/core/arm/dynarmic/arm_dynarmic.cpp b/src/core/arm/dynarmic/arm_dynarmic.cpp
index 0a0b91590..34c5aa381 100644
--- a/src/core/arm/dynarmic/arm_dynarmic.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic.cpp
@@ -56,7 +56,9 @@ static Dynarmic::UserCallbacks GetUserCallbacks(
     user_callbacks.memory.Write16 = &Memory::Write16;
     user_callbacks.memory.Write32 = &Memory::Write32;
     user_callbacks.memory.Write64 = &Memory::Write64;
-    user_callbacks.page_table = Memory::GetCurrentPageTablePointers();
+    // TODO(Subv): Re-add the page table pointers once dynarmic supports switching page tables at
+    // runtime.
+    user_callbacks.page_table = nullptr;
     user_callbacks.coprocessors[15] = std::make_shared<DynarmicCP15>(interpeter_state);
     return user_callbacks;
 }
diff --git a/src/core/core.cpp b/src/core/core.cpp
index 5332318cf..59b8768e7 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -137,7 +137,6 @@ void System::Reschedule() {
 }
 
 System::ResultStatus System::Init(EmuWindow* emu_window, u32 system_mode) {
-    Memory::InitMemoryMap();
     LOG_DEBUG(HW_Memory, "initialized OK");
 
     if (Settings::values.use_cpu_jit) {
diff --git a/src/core/frontend/emu_window.cpp b/src/core/frontend/emu_window.cpp
index 54fa5c7fa..e67394177 100644
--- a/src/core/frontend/emu_window.cpp
+++ b/src/core/frontend/emu_window.cpp
@@ -2,14 +2,55 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <algorithm>
 #include <cmath>
-#include "common/assert.h"
-#include "core/3ds.h"
-#include "core/core.h"
+#include <mutex>
 #include "core/frontend/emu_window.h"
+#include "core/frontend/input.h"
 #include "core/settings.h"
 
+class EmuWindow::TouchState : public Input::Factory<Input::TouchDevice>,
+                              public std::enable_shared_from_this<TouchState> {
+public: // Holds the window's touch state and is the factory for devices that read it
+    std::unique_ptr<Input::TouchDevice> Create(const Common::ParamPackage&) override {
+        return std::make_unique<Device>(shared_from_this()); // Device keeps it weakly
+    }
+
+    std::mutex mutex; ///< Held by Device::GetStatus while it reads the fields below
+
+    bool touch_pressed = false; ///< True if touchpad area is currently pressed, otherwise false
+
+    float touch_x = 0.0f; ///< Touchpad X-position, as a fraction of the bottom screen width
+    float touch_y = 0.0f; ///< Touchpad Y-position, as a fraction of the bottom screen height
+
+private:
+    class Device : public Input::TouchDevice {
+    public:
+        explicit Device(std::weak_ptr<TouchState>&& state) : touch_state(std::move(state)) {}
+        std::tuple<float, float, bool> GetStatus() const override {
+            if (auto state = touch_state.lock()) { // fails once the TouchState is destroyed
+                std::lock_guard<std::mutex> guard(state->mutex);
+                return std::make_tuple(state->touch_x, state->touch_y, state->touch_pressed);
+            }
+            return std::make_tuple(0.0f, 0.0f, false); // state is gone: report "not pressed"
+        }
+
+    private:
+        std::weak_ptr<TouchState> touch_state; ///< Non-owning; does not keep the state alive
+    };
+};
+
+EmuWindow::EmuWindow() {
+    // TODO: Find a better place to set this.
+    config.min_client_area_size = std::make_pair(400u, 480u);
+    active_config = config;
+    touch_state = std::make_shared<TouchState>(); // shared so devices can observe it weakly
+    Input::RegisterFactory<Input::TouchDevice>("emu_window", touch_state); // undone in dtor
+}
+
+EmuWindow::~EmuWindow() {
+    Input::UnregisterFactory<Input::TouchDevice>("emu_window"); // registered in constructor
+}
+
 /**
  * Check if the given x/y coordinates are within the touchpad specified by the framebuffer layout
  * @param layout FramebufferLayout object describing the framebuffer size and screen positions
@@ -38,22 +79,26 @@ void EmuWindow::TouchPressed(unsigned framebuffer_x, unsigned framebuffer_y) {
     if (!IsWithinTouchscreen(framebuffer_layout, framebuffer_x, framebuffer_y))
         return;
 
-    touch_x = Core::kScreenBottomWidth * (framebuffer_x - framebuffer_layout.bottom_screen.left) /
-              (framebuffer_layout.bottom_screen.right - framebuffer_layout.bottom_screen.left);
-    touch_y = Core::kScreenBottomHeight * (framebuffer_y - framebuffer_layout.bottom_screen.top) /
-              (framebuffer_layout.bottom_screen.bottom - framebuffer_layout.bottom_screen.top);
+    std::lock_guard<std::mutex> guard(touch_state->mutex);
+    touch_state->touch_x =
+        static_cast<float>(framebuffer_x - framebuffer_layout.bottom_screen.left) /
+        (framebuffer_layout.bottom_screen.right - framebuffer_layout.bottom_screen.left);
+    touch_state->touch_y =
+        static_cast<float>(framebuffer_y - framebuffer_layout.bottom_screen.top) /
+        (framebuffer_layout.bottom_screen.bottom - framebuffer_layout.bottom_screen.top);
 
-    touch_pressed = true;
+    touch_state->touch_pressed = true;
 }
 
 void EmuWindow::TouchReleased() {
-    touch_pressed = false;
-    touch_x = 0;
-    touch_y = 0;
+    std::lock_guard<std::mutex> guard(touch_state->mutex);
+    touch_state->touch_pressed = false;
+    touch_state->touch_x = 0;
+    touch_state->touch_y = 0;
 }
 
 void EmuWindow::TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y) {
-    if (!touch_pressed)
+    if (!touch_state->touch_pressed)
         return;
 
     if (!IsWithinTouchscreen(framebuffer_layout, framebuffer_x, framebuffer_y))
diff --git a/src/core/frontend/emu_window.h b/src/core/frontend/emu_window.h
index 7bdee251c..c10dee51b 100644
--- a/src/core/frontend/emu_window.h
+++ b/src/core/frontend/emu_window.h
@@ -4,11 +4,10 @@
 
 #pragma once
 
-#include <mutex>
+#include <memory>
 #include <tuple>
 #include <utility>
 #include "common/common_types.h"
-#include "common/math_util.h"
 #include "core/frontend/framebuffer_layout.h"
 
 /**
@@ -69,17 +68,6 @@ public:
     void TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y);
 
     /**
-     * Gets the current touch screen state (touch X/Y coordinates and whether or not it is pressed).
-     * @note This should be called by the core emu thread to get a state set by the window thread.
-     * @todo Fix this function to be thread-safe.
-     * @return std::tuple of (x, y, pressed) where `x` and `y` are the touch coordinates and
-     *         `pressed` is true if the touch screen is currently being pressed
-     */
-    std::tuple<u16, u16, bool> GetTouchState() const {
-        return std::make_tuple(touch_x, touch_y, touch_pressed);
-    }
-
-    /**
      * Returns currently active configuration.
      * @note Accesses to the returned object need not be consistent because it may be modified in
      * another thread
@@ -113,15 +101,8 @@ public:
     void UpdateCurrentFramebufferLayout(unsigned width, unsigned height);
 
 protected:
-    EmuWindow() {
-        // TODO: Find a better place to set this.
-        config.min_client_area_size = std::make_pair(400u, 480u);
-        active_config = config;
-        touch_x = 0;
-        touch_y = 0;
-        touch_pressed = false;
-    }
-    virtual ~EmuWindow() {}
+    EmuWindow();
+    virtual ~EmuWindow();
 
     /**
      * Processes any pending configuration changes from the last SetConfig call.
@@ -177,10 +158,8 @@ private:
                                 /// ProcessConfigurationChanges)
     WindowConfig active_config; ///< Internal active configuration
 
-    bool touch_pressed; ///< True if touchpad area is currently pressed, otherwise false
-
-    u16 touch_x; ///< Touchpad X-position in native 3DS pixel coordinates (0-320)
-    u16 touch_y; ///< Touchpad Y-position in native 3DS pixel coordinates (0-240)
+    class TouchState;
+    std::shared_ptr<TouchState> touch_state;
 
     /**
      * Clip the provided coordinates to be inside the touchscreen area.
diff --git a/src/core/frontend/input.h b/src/core/frontend/input.h
index 5916a901d..8c256beb5 100644
--- a/src/core/frontend/input.h
+++ b/src/core/frontend/input.h
@@ -126,4 +126,10 @@ using AnalogDevice = InputDevice<std::tuple<float, float>>;
  */
 using MotionDevice = InputDevice<std::tuple<Math::Vec3<float>, Math::Vec3<float>>>;
 
+/**
+ * A touch device is an input device that returns a tuple of two floats and a bool. The floats are
+ * x and y coordinates in the range 0.0 - 1.0, and the bool indicates whether it is pressed.
+ */
+using TouchDevice = InputDevice<std::tuple<float, float, bool>>;
+
 } // namespace Input
diff --git a/src/core/hle/applets/mii_selector.cpp b/src/core/hle/applets/mii_selector.cpp
index 705859f1e..f225c23a5 100644
--- a/src/core/hle/applets/mii_selector.cpp
+++ b/src/core/hle/applets/mii_selector.cpp
@@ -66,7 +66,7 @@ ResultCode MiiSelector::StartImpl(const Service::APT::AppletStartupParameter& pa
     // continue.
     MiiResult result;
     memset(&result, 0, sizeof(result));
-    result.result_code = 0;
+    result.return_code = 0;
 
     // Let the application know that we're closing
     Service::APT::MessageParameter message;
@@ -82,5 +82,5 @@ ResultCode MiiSelector::StartImpl(const Service::APT::AppletStartupParameter& pa
 }
 
 void MiiSelector::Update() {}
-}
-} // namespace
+} // namespace Applets
+} // namespace HLE
diff --git a/src/core/hle/applets/mii_selector.h b/src/core/hle/applets/mii_selector.h
index ec00e29d2..136ce8948 100644
--- a/src/core/hle/applets/mii_selector.h
+++ b/src/core/hle/applets/mii_selector.h
@@ -16,51 +16,46 @@ namespace HLE {
 namespace Applets {
 
 struct MiiConfig {
-    u8 unk_000;
-    u8 unk_001;
-    u8 unk_002;
-    u8 unk_003;
-    u8 unk_004;
+    u8 enable_cancel_button; ///< offset 0x00
+    u8 enable_guest_mii;     ///< offset 0x01
+    u8 show_on_top_screen;   ///< offset 0x02
+    INSERT_PADDING_BYTES(5);
+    u16 title[0x40]; ///< offset 0x08, 0x40 16-bit units
+    INSERT_PADDING_BYTES(4);
+    u8 show_guest_miis; ///< offset 0x8C
     INSERT_PADDING_BYTES(3);
-    u16 unk_008;
-    INSERT_PADDING_BYTES(0x82);
-    u8 unk_08C;
-    INSERT_PADDING_BYTES(3);
-    u16 unk_090;
+    u32 initially_selected_mii_index; ///< offset 0x90
+    u8 guest_mii_whitelist[6];        ///< offset 0x94
+    u8 user_mii_whitelist[0x64];      ///< offset 0x9A
     INSERT_PADDING_BYTES(2);
-    u32 unk_094;
-    u16 unk_098;
-    u8 unk_09A[0x64];
-    u8 unk_0FE;
-    u8 unk_0FF;
-    u32 unk_100;
+    u32 magic_value; ///< offset 0x100; last field of the 0x104-byte structure
 };
-
 static_assert(sizeof(MiiConfig) == 0x104, "MiiConfig structure has incorrect size");
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
     static_assert(offsetof(MiiConfig, field_name) == position,                                     \
                   "Field " #field_name " has invalid position")
-ASSERT_REG_POSITION(unk_008, 0x08);
-ASSERT_REG_POSITION(unk_08C, 0x8C);
-ASSERT_REG_POSITION(unk_090, 0x90);
-ASSERT_REG_POSITION(unk_094, 0x94);
-ASSERT_REG_POSITION(unk_0FE, 0xFE);
+ASSERT_REG_POSITION(title, 0x08);
+ASSERT_REG_POSITION(show_guest_miis, 0x8C);
+ASSERT_REG_POSITION(initially_selected_mii_index, 0x90);
+ASSERT_REG_POSITION(guest_mii_whitelist, 0x94);
 #undef ASSERT_REG_POSITION
 
 struct MiiResult {
-    u32 result_code;
-    u8 unk_04;
-    INSERT_PADDING_BYTES(7);
-    u8 unk_0C[0x60];
-    u8 unk_6C[0x16];
+    u32 return_code;              ///< offset 0x00
+    u32 is_guest_mii_selected;    ///< offset 0x04
+    u32 selected_guest_mii_index; ///< offset 0x08
+    // TODO(mailwl): expand to Mii Format structure: https://www.3dbrew.org/wiki/Mii
+    u8 selected_mii_data[0x5C]; ///< offset 0x0C
     INSERT_PADDING_BYTES(2);
+    u16 mii_data_checksum;   ///< offset 0x6A
+    u16 guest_mii_name[0xC]; ///< offset 0x6C; last field of the 0x84-byte structure
 };
 static_assert(sizeof(MiiResult) == 0x84, "MiiResult structure has incorrect size");
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
     static_assert(offsetof(MiiResult, field_name) == position,                                     \
                   "Field " #field_name " has invalid position")
-ASSERT_REG_POSITION(unk_0C, 0x0C);
-ASSERT_REG_POSITION(unk_6C, 0x6C);
+ASSERT_REG_POSITION(selected_mii_data, 0x0C);
+ASSERT_REG_POSITION(guest_mii_name, 0x6C);
 #undef ASSERT_REG_POSITION
 
 class MiiSelector final : public Applet {
@@ -79,5 +74,5 @@ private:
 
     MiiConfig config;
 };
-}
-} // namespace
+} // namespace Applets
+} // namespace HLE
diff --git a/src/core/hle/kernel/memory.cpp b/src/core/hle/kernel/memory.cpp
index 496d07cb5..7f27e9655 100644
--- a/src/core/hle/kernel/memory.cpp
+++ b/src/core/hle/kernel/memory.cpp
@@ -8,7 +8,6 @@
 #include <memory>
 #include <utility>
 #include <vector>
-#include "audio_core/audio_core.h"
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
@@ -24,7 +23,7 @@
 
 namespace Kernel {
 
-static MemoryRegionInfo memory_regions[3];
+MemoryRegionInfo memory_regions[3];
 
 /// Size of the APPLICATION, SYSTEM and BASE memory regions (respectively) for each system
 /// memory configuration type.
@@ -96,9 +95,6 @@ MemoryRegionInfo* GetMemoryRegion(MemoryRegion region) {
     }
 }
 
-std::array<u8, Memory::VRAM_SIZE> vram;
-std::array<u8, Memory::N3DS_EXTRA_RAM_SIZE> n3ds_extra_ram;
-
 void HandleSpecialMapping(VMManager& address_space, const AddressMapping& mapping) {
     using namespace Memory;
 
@@ -143,30 +139,14 @@ void HandleSpecialMapping(VMManager& address_space, const AddressMapping& mappin
         return;
     }
 
-    // TODO(yuriks): Use GetPhysicalPointer when that becomes independent of the virtual
-    // mappings.
-    u8* target_pointer = nullptr;
-    switch (area->paddr_base) {
-    case VRAM_PADDR:
-        target_pointer = vram.data();
-        break;
-    case DSP_RAM_PADDR:
-        target_pointer = AudioCore::GetDspMemory().data();
-        break;
-    case N3DS_EXTRA_RAM_PADDR:
-        target_pointer = n3ds_extra_ram.data();
-        break;
-    default:
-        UNREACHABLE();
-    }
+    u8* target_pointer = Memory::GetPhysicalPointer(area->paddr_base + offset_into_region);
 
     // TODO(yuriks): This flag seems to have some other effect, but it's unknown what
     MemoryState memory_state = mapping.unk_flag ? MemoryState::Static : MemoryState::IO;
 
-    auto vma = address_space
-                   .MapBackingMemory(mapping.address, target_pointer + offset_into_region,
-                                     mapping.size, memory_state)
-                   .Unwrap();
+    auto vma =
+        address_space.MapBackingMemory(mapping.address, target_pointer, mapping.size, memory_state)
+            .Unwrap();
     address_space.Reprotect(vma,
                             mapping.read_only ? VMAPermission::Read : VMAPermission::ReadWrite);
 }
diff --git a/src/core/hle/kernel/memory.h b/src/core/hle/kernel/memory.h
index 08c1a9989..da6bb3563 100644
--- a/src/core/hle/kernel/memory.h
+++ b/src/core/hle/kernel/memory.h
@@ -26,4 +26,6 @@ MemoryRegionInfo* GetMemoryRegion(MemoryRegion region);
 
 void HandleSpecialMapping(VMManager& address_space, const AddressMapping& mapping);
 void MapSharedPages(VMManager& address_space);
+
+extern MemoryRegionInfo memory_regions[3];
 } // namespace Kernel
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index b957c45dd..324415a36 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -171,6 +171,8 @@ static void SwitchContext(Thread* new_thread) {
         // Cancel any outstanding wakeup events for this thread
         CoreTiming::UnscheduleEvent(ThreadWakeupEventType, new_thread->callback_handle);
 
+        auto previous_process = Kernel::g_current_process;
+
         current_thread = new_thread;
 
         ready_queue.remove(new_thread->current_priority, new_thread);
@@ -178,8 +180,18 @@ static void SwitchContext(Thread* new_thread) {
 
         Core::CPU().LoadContext(new_thread->context);
         Core::CPU().SetCP15Register(CP15_THREAD_URO, new_thread->GetTLSAddress());
+
+        if (previous_process != current_thread->owner_process) {
+            Kernel::g_current_process = current_thread->owner_process;
+            Memory::current_page_table = &Kernel::g_current_process->vm_manager.page_table;
+            // We have switched processes and thus, page tables, clear the instruction cache so we
+            // don't keep stale data from the previous process.
+            Core::CPU().ClearInstructionCache();
+        }
     } else {
         current_thread = nullptr;
+        // Note: We do not reset the current process and current page table when idling because
+        // technically we haven't changed processes, our threads are just paused.
     }
 }
 
diff --git a/src/core/hle/kernel/vm_manager.cpp b/src/core/hle/kernel/vm_manager.cpp
index cef1f7fa8..7a007c065 100644
--- a/src/core/hle/kernel/vm_manager.cpp
+++ b/src/core/hle/kernel/vm_manager.cpp
@@ -56,6 +56,10 @@ void VMManager::Reset() {
     initial_vma.size = MAX_ADDRESS;
     vma_map.emplace(initial_vma.base, initial_vma);
 
+    page_table.pointers.fill(nullptr);
+    page_table.attributes.fill(Memory::PageType::Unmapped);
+    page_table.cached_res_count.fill(0);
+
     UpdatePageTableForVMA(initial_vma);
 }
 
@@ -328,16 +332,17 @@ VMManager::VMAIter VMManager::MergeAdjacent(VMAIter iter) {
 void VMManager::UpdatePageTableForVMA(const VirtualMemoryArea& vma) {
     switch (vma.type) {
     case VMAType::Free:
-        Memory::UnmapRegion(vma.base, vma.size);
+        Memory::UnmapRegion(page_table, vma.base, vma.size);
         break;
     case VMAType::AllocatedMemoryBlock:
-        Memory::MapMemoryRegion(vma.base, vma.size, vma.backing_block->data() + vma.offset);
+        Memory::MapMemoryRegion(page_table, vma.base, vma.size,
+                                vma.backing_block->data() + vma.offset);
         break;
     case VMAType::BackingMemory:
-        Memory::MapMemoryRegion(vma.base, vma.size, vma.backing_memory);
+        Memory::MapMemoryRegion(page_table, vma.base, vma.size, vma.backing_memory);
         break;
     case VMAType::MMIO:
-        Memory::MapIoRegion(vma.base, vma.size, vma.mmio_handler);
+        Memory::MapIoRegion(page_table, vma.base, vma.size, vma.mmio_handler);
         break;
     }
 }
diff --git a/src/core/hle/kernel/vm_manager.h b/src/core/hle/kernel/vm_manager.h
index 38e0d74d0..1302527bb 100644
--- a/src/core/hle/kernel/vm_manager.h
+++ b/src/core/hle/kernel/vm_manager.h
@@ -9,6 +9,7 @@
 #include <vector>
 #include "common/common_types.h"
 #include "core/hle/result.h"
+#include "core/memory.h"
 #include "core/mmio.h"
 
 namespace Kernel {
@@ -102,7 +103,6 @@ struct VirtualMemoryArea {
  *  - http://duartes.org/gustavo/blog/post/page-cache-the-affair-between-memory-and-files/
  */
 class VMManager final {
-    // TODO(yuriks): Make page tables switchable to support multiple VMManagers
 public:
     /**
      * The maximum amount of address space managed by the kernel. Addresses above this are never
@@ -184,6 +184,10 @@ public:
     /// Dumps the address space layout to the log, for debugging
     void LogLayout(Log::Level log_level) const;
 
+    /// Each VMManager has its own page table, which is set as the main one when the owning process
+    /// is scheduled.
+    Memory::PageTable page_table;
+
 private:
     using VMAIter = decltype(vma_map)::iterator;
 
diff --git a/src/core/hle/lock.cpp b/src/core/hle/lock.cpp
index 082f689c8..1c24c7ce9 100644
--- a/src/core/hle/lock.cpp
+++ b/src/core/hle/lock.cpp
@@ -7,5 +7,5 @@
 #include <core/hle/lock.h>
 
 namespace HLE {
-std::mutex g_hle_lock;
+std::recursive_mutex g_hle_lock;
 }
diff --git a/src/core/hle/lock.h b/src/core/hle/lock.h
index 8265621e1..5c99fe996 100644
--- a/src/core/hle/lock.h
+++ b/src/core/hle/lock.h
@@ -14,5 +14,5 @@ namespace HLE {
  * to the emulated memory is not protected by this mutex, and should be avoided in any threads other
  * than the CPU thread.
  */
-extern std::mutex g_hle_lock;
+extern std::recursive_mutex g_hle_lock;
 } // namespace HLE
diff --git a/src/core/hle/service/apt/apt.cpp b/src/core/hle/service/apt/apt.cpp
index 58d94768c..8c0ba73f2 100644
--- a/src/core/hle/service/apt/apt.cpp
+++ b/src/core/hle/service/apt/apt.cpp
@@ -19,6 +19,7 @@
 #include "core/hle/service/apt/apt_s.h"
 #include "core/hle/service/apt/apt_u.h"
 #include "core/hle/service/apt/bcfnt/bcfnt.h"
+#include "core/hle/service/cfg/cfg.h"
 #include "core/hle/service/fs/archive.h"
 #include "core/hle/service/ptm/ptm.h"
 #include "core/hle/service/service.h"
@@ -198,6 +199,143 @@ void Initialize(Service::Interface* self) {
                        Kernel::g_handle_table.Create(slot_data->parameter_event).Unwrap());
 }
 
+static u32 DecompressLZ11(const u8* in, u8* out) { // returns the size stored in the header
+    u32_le decompressed_size;
+    memcpy(&decompressed_size, in, sizeof(u32)); // 4-byte header: type byte + 24-bit size
+    in += 4;
+
+    u8 type = decompressed_size & 0xFF; // low byte identifies the compression type
+    ASSERT(type == 0x11);
+    decompressed_size >>= 8; // remaining upper 24 bits are the decompressed size
+
+    u32 current_out_size = 0;
+    u8 flags = 0, mask = 1; // mask == 1 forces a flag byte to be read on the first iteration
+    while (current_out_size < decompressed_size) {
+        if (mask == 1) {
+            flags = *(in++); // fetch the next flag byte; its bits are consumed MSB first
+            mask = 0x80;
+        } else {
+            mask >>= 1;
+        }
+
+        if (flags & mask) { // set bit: back-reference into already written output
+            u8 byte1 = *(in++);
+            u32 length = byte1 >> 4; // high nibble selects the encoding of length/offset
+            u32 offset;
+            if (length == 0) { // 3-byte form: 8-bit length (+0x11), 12-bit offset (+1)
+                u8 byte2 = *(in++);
+                u8 byte3 = *(in++);
+                length = (((byte1 & 0x0F) << 4) | (byte2 >> 4)) + 0x11;
+                offset = (((byte2 & 0x0F) << 8) | byte3) + 0x1;
+            } else if (length == 1) { // 4-byte form: 16-bit length (+0x111), 12-bit offset (+1)
+                u8 byte2 = *(in++);
+                u8 byte3 = *(in++);
+                u8 byte4 = *(in++);
+                length = (((byte1 & 0x0F) << 12) | (byte2 << 4) | (byte3 >> 4)) + 0x111;
+                offset = (((byte3 & 0x0F) << 8) | byte4) + 0x1;
+            } else { // 2-byte form: 4-bit length (+1), 12-bit offset (+1)
+                u8 byte2 = *(in++);
+                length = (byte1 >> 4) + 0x1;
+                offset = (((byte1 & 0x0F) << 8) | byte2) + 0x1;
+            }
+
+            for (u32 i = 0; i < length; i++) { // byte-wise copy; source may overlap destination
+                *out = *(out - offset);
+                ++out;
+            }
+
+            current_out_size += length;
+        } else { // clear bit: copy one literal byte
+            *(out++) = *(in++);
+            current_out_size++;
+        }
+    }
+    return decompressed_size;
+}
+
+static bool LoadSharedFont() {
+    u8 font_region_code;
+    switch (CFG::GetRegionValue()) {
+    case 4: // CHN
+        font_region_code = 2;
+        break;
+    case 5: // KOR
+        font_region_code = 3;
+        break;
+    case 6: // TWN
+        font_region_code = 4;
+        break;
+    default: // JPN/EUR/USA
+        font_region_code = 1;
+        break;
+    }
+
+    const u64_le shared_font_archive_id_low = 0x0004009b00014002 | ((font_region_code - 1) << 8);
+    const u64_le shared_font_archive_id_high = 0x00000001ffffff00;
+    std::vector<u8> shared_font_archive_id(16);
+    std::memcpy(&shared_font_archive_id[0], &shared_font_archive_id_low, sizeof(u64));
+    std::memcpy(&shared_font_archive_id[8], &shared_font_archive_id_high, sizeof(u64));
+    FileSys::Path archive_path(shared_font_archive_id);
+    auto archive_result = Service::FS::OpenArchive(Service::FS::ArchiveIdCode::NCCH, archive_path);
+    if (archive_result.Failed())
+        return false;
+
+    std::vector<u8> romfs_path(20, 0); // 20-byte all zero path for opening RomFS
+    FileSys::Path file_path(romfs_path);
+    FileSys::Mode open_mode = {};
+    open_mode.read_flag.Assign(1);
+    auto file_result = Service::FS::OpenFileFromArchive(*archive_result, file_path, open_mode);
+    if (file_result.Failed())
+        return false;
+
+    auto romfs = std::move(file_result).Unwrap();
+    std::vector<u8> romfs_buffer(romfs->backend->GetSize());
+    romfs->backend->Read(0, romfs_buffer.size(), romfs_buffer.data());
+    romfs->backend->Close();
+
+    const char16_t* file_name[4] = {u"cbf_std.bcfnt.lz", u"cbf_zh-Hans-CN.bcfnt.lz",
+                                    u"cbf_ko-Hang-KR.bcfnt.lz", u"cbf_zh-Hant-TW.bcfnt.lz"};
+    const u8* font_file =
+        RomFS::GetFilePointer(romfs_buffer.data(), {file_name[font_region_code - 1]});
+    if (font_file == nullptr)
+        return false;
+
+    struct {
+        u32_le status;
+        u32_le region;
+        u32_le decompressed_size;
+        INSERT_PADDING_WORDS(0x1D);
+    } shared_font_header{};
+    static_assert(sizeof(shared_font_header) == 0x80, "shared_font_header has incorrect size");
+
+    shared_font_header.status = 2; // successfully loaded
+    shared_font_header.region = font_region_code;
+    shared_font_header.decompressed_size =
+        DecompressLZ11(font_file, shared_font_mem->GetPointer(0x80));
+    std::memcpy(shared_font_mem->GetPointer(), &shared_font_header, sizeof(shared_font_header));
+    *shared_font_mem->GetPointer(0x83) = 'U'; // Change the magic from "CFNT" to "CFNU"
+
+    return true;
+}
+
+static bool LoadLegacySharedFont() {
+    // This is the legacy method to load shared font.
+    // The expected format is a decrypted, uncompressed BCFNT file with the 0x80 byte header
+    // generated by the APT:U service. The best way to get is by dumping it from RAM. We've provided
+    // a homebrew app to do this: https://github.com/citra-emu/3dsutils. Put the resulting file
+    // "shared_font.bin" in the Citra "sysdata" directory.
+    std::string filepath = FileUtil::GetUserPath(D_SYSDATA_IDX) + SHARED_FONT;
+
+    FileUtil::CreateFullPath(filepath); // Create path if not already created
+    FileUtil::IOFile file(filepath, "rb");
+    if (file.IsOpen()) {
+        file.ReadBytes(shared_font_mem->GetPointer(), file.GetSize());
+        return true;
+    }
+
+    return false;
+}
+
 void GetSharedFont(Service::Interface* self) {
     IPC::RequestParser rp(Kernel::GetCommandBuffer(), 0x44, 0, 0); // 0x00440000
     IPC::RequestBuilder rb = rp.MakeBuilder(2, 2);
@@ -206,11 +344,20 @@ void GetSharedFont(Service::Interface* self) {
     Core::Telemetry().AddField(Telemetry::FieldType::Session, "RequiresSharedFont", true);
 
     if (!shared_font_loaded) {
-        LOG_ERROR(Service_APT, "shared font file missing - go dump it from your 3ds");
-        rb.Push<u32>(-1); // TODO: Find the right error code
-        rb.Skip(1 + 2, true);
-        Core::System::GetInstance().SetStatus(Core::System::ResultStatus::ErrorSharedFont);
-        return;
+        // On a real 3DS, font loading happens at boot. However, we load it on demand to coordinate
+        // with CFG region auto configuration, which happens later than APT initialization.
+        if (LoadSharedFont()) {
+            shared_font_loaded = true;
+        } else if (LoadLegacySharedFont()) {
+            LOG_WARNING(Service_APT, "Loaded shared font by legacy method");
+            shared_font_loaded = true;
+        } else {
+            LOG_ERROR(Service_APT, "shared font file missing - go dump it from your 3ds");
+            rb.Push<u32>(-1); // TODO: Find the right error code
+            rb.Skip(1 + 2, true);
+            Core::System::GetInstance().SetStatus(Core::System::ResultStatus::ErrorSharedFont);
+            return;
+        }
     }
 
     // The shared font has to be relocated to the new address before being passed to the
@@ -863,125 +1010,6 @@ void CheckNew3DS(Service::Interface* self) {
     LOG_WARNING(Service_APT, "(STUBBED) called");
 }
 
-static u32 DecompressLZ11(const u8* in, u8* out) {
-    u32_le decompressed_size;
-    memcpy(&decompressed_size, in, sizeof(u32));
-    in += 4;
-
-    u8 type = decompressed_size & 0xFF;
-    ASSERT(type == 0x11);
-    decompressed_size >>= 8;
-
-    u32 current_out_size = 0;
-    u8 flags = 0, mask = 1;
-    while (current_out_size < decompressed_size) {
-        if (mask == 1) {
-            flags = *(in++);
-            mask = 0x80;
-        } else {
-            mask >>= 1;
-        }
-
-        if (flags & mask) {
-            u8 byte1 = *(in++);
-            u32 length = byte1 >> 4;
-            u32 offset;
-            if (length == 0) {
-                u8 byte2 = *(in++);
-                u8 byte3 = *(in++);
-                length = (((byte1 & 0x0F) << 4) | (byte2 >> 4)) + 0x11;
-                offset = (((byte2 & 0x0F) << 8) | byte3) + 0x1;
-            } else if (length == 1) {
-                u8 byte2 = *(in++);
-                u8 byte3 = *(in++);
-                u8 byte4 = *(in++);
-                length = (((byte1 & 0x0F) << 12) | (byte2 << 4) | (byte3 >> 4)) + 0x111;
-                offset = (((byte3 & 0x0F) << 8) | byte4) + 0x1;
-            } else {
-                u8 byte2 = *(in++);
-                length = (byte1 >> 4) + 0x1;
-                offset = (((byte1 & 0x0F) << 8) | byte2) + 0x1;
-            }
-
-            for (u32 i = 0; i < length; i++) {
-                *out = *(out - offset);
-                ++out;
-            }
-
-            current_out_size += length;
-        } else {
-            *(out++) = *(in++);
-            current_out_size++;
-        }
-    }
-    return decompressed_size;
-}
-
-static bool LoadSharedFont() {
-    // TODO (wwylele): load different font archive for region CHN/KOR/TWN
-    const u64_le shared_font_archive_id_low = 0x0004009b00014002;
-    const u64_le shared_font_archive_id_high = 0x00000001ffffff00;
-    std::vector<u8> shared_font_archive_id(16);
-    std::memcpy(&shared_font_archive_id[0], &shared_font_archive_id_low, sizeof(u64));
-    std::memcpy(&shared_font_archive_id[8], &shared_font_archive_id_high, sizeof(u64));
-    FileSys::Path archive_path(shared_font_archive_id);
-    auto archive_result = Service::FS::OpenArchive(Service::FS::ArchiveIdCode::NCCH, archive_path);
-    if (archive_result.Failed())
-        return false;
-
-    std::vector<u8> romfs_path(20, 0); // 20-byte all zero path for opening RomFS
-    FileSys::Path file_path(romfs_path);
-    FileSys::Mode open_mode = {};
-    open_mode.read_flag.Assign(1);
-    auto file_result = Service::FS::OpenFileFromArchive(*archive_result, file_path, open_mode);
-    if (file_result.Failed())
-        return false;
-
-    auto romfs = std::move(file_result).Unwrap();
-    std::vector<u8> romfs_buffer(romfs->backend->GetSize());
-    romfs->backend->Read(0, romfs_buffer.size(), romfs_buffer.data());
-    romfs->backend->Close();
-
-    const u8* font_file = RomFS::GetFilePointer(romfs_buffer.data(), {u"cbf_std.bcfnt.lz"});
-    if (font_file == nullptr)
-        return false;
-
-    struct {
-        u32_le status;
-        u32_le region;
-        u32_le decompressed_size;
-        INSERT_PADDING_WORDS(0x1D);
-    } shared_font_header{};
-    static_assert(sizeof(shared_font_header) == 0x80, "shared_font_header has incorrect size");
-
-    shared_font_header.status = 2; // successfully loaded
-    shared_font_header.region = 1; // region JPN/EUR/USA
-    shared_font_header.decompressed_size =
-        DecompressLZ11(font_file, shared_font_mem->GetPointer(0x80));
-    std::memcpy(shared_font_mem->GetPointer(), &shared_font_header, sizeof(shared_font_header));
-    *shared_font_mem->GetPointer(0x83) = 'U'; // Change the magic from "CFNT" to "CFNU"
-
-    return true;
-}
-
-static bool LoadLegacySharedFont() {
-    // This is the legacy method to load shared font.
-    // The expected format is a decrypted, uncompressed BCFNT file with the 0x80 byte header
-    // generated by the APT:U service. The best way to get is by dumping it from RAM. We've provided
-    // a homebrew app to do this: https://github.com/citra-emu/3dsutils. Put the resulting file
-    // "shared_font.bin" in the Citra "sysdata" directory.
-    std::string filepath = FileUtil::GetUserPath(D_SYSDATA_IDX) + SHARED_FONT;
-
-    FileUtil::CreateFullPath(filepath); // Create path if not already created
-    FileUtil::IOFile file(filepath, "rb");
-    if (file.IsOpen()) {
-        file.ReadBytes(shared_font_mem->GetPointer(), file.GetSize());
-        return true;
-    }
-
-    return false;
-}
-
 void Init() {
     AddService(new APT_A_Interface);
     AddService(new APT_S_Interface);
@@ -995,16 +1023,6 @@ void Init() {
                                      MemoryPermission::ReadWrite, MemoryPermission::Read, 0,
                                      Kernel::MemoryRegion::SYSTEM, "APT:SharedFont");
 
-    if (LoadSharedFont()) {
-        shared_font_loaded = true;
-    } else if (LoadLegacySharedFont()) {
-        LOG_WARNING(Service_APT, "Loaded shared font by legacy method");
-        shared_font_loaded = true;
-    } else {
-        LOG_WARNING(Service_APT, "Unable to load shared font");
-        shared_font_loaded = false;
-    }
-
     lock = Kernel::Mutex::Create(false, "APT_U:Lock");
 
     cpu_percent = 0;
diff --git a/src/core/hle/service/cfg/cfg.cpp b/src/core/hle/service/cfg/cfg.cpp
index 3dbeb27cc..f26a1f65f 100644
--- a/src/core/hle/service/cfg/cfg.cpp
+++ b/src/core/hle/service/cfg/cfg.cpp
@@ -168,7 +168,7 @@ void GetCountryCodeID(Service::Interface* self) {
     cmd_buff[2] = country_code_id;
 }
 
-static u32 GetRegionValue() {
+u32 GetRegionValue() {
     if (Settings::values.region_value == Settings::REGION_VALUE_AUTO_SELECT)
         return preferred_region_code;
 
diff --git a/src/core/hle/service/cfg/cfg.h b/src/core/hle/service/cfg/cfg.h
index 1659ebf32..282b6936b 100644
--- a/src/core/hle/service/cfg/cfg.h
+++ b/src/core/hle/service/cfg/cfg.h
@@ -101,6 +101,8 @@ void GetCountryCodeString(Service::Interface* self);
  */
 void GetCountryCodeID(Service::Interface* self);
 
+u32 GetRegionValue();
+
 /**
  * CFG::SecureInfoGetRegion service function
  *  Inputs:
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index 31f34a7ae..aa5d821f9 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -7,9 +7,9 @@
 #include <cmath>
 #include <memory>
 #include "common/logging/log.h"
+#include "core/3ds.h"
 #include "core/core.h"
 #include "core/core_timing.h"
-#include "core/frontend/emu_window.h"
 #include "core/frontend/input.h"
 #include "core/hle/ipc.h"
 #include "core/hle/kernel/event.h"
@@ -19,7 +19,6 @@
 #include "core/hle/service/hid/hid_spvr.h"
 #include "core/hle/service/hid/hid_user.h"
 #include "core/hle/service/service.h"
-#include "video_core/video_core.h"
 
 namespace Service {
 namespace HID {
@@ -59,6 +58,7 @@ static std::array<std::unique_ptr<Input::ButtonDevice>, Settings::NativeButton::
     buttons;
 static std::unique_ptr<Input::AnalogDevice> circle_pad;
 static std::unique_ptr<Input::MotionDevice> motion_device;
+static std::unique_ptr<Input::TouchDevice> touch_device;
 
 DirectionState GetStickDirectionState(s16 circle_pad_x, s16 circle_pad_y) {
     // 30 degree and 60 degree are angular thresholds for directions
@@ -96,6 +96,7 @@ static void LoadInputDevices() {
     circle_pad = Input::CreateDevice<Input::AnalogDevice>(
         Settings::values.analogs[Settings::NativeAnalog::CirclePad]);
     motion_device = Input::CreateDevice<Input::MotionDevice>(Settings::values.motion_device);
+    touch_device = Input::CreateDevice<Input::TouchDevice>(Settings::values.touch_device);
 }
 
 static void UnloadInputDevices() {
@@ -104,6 +105,7 @@ static void UnloadInputDevices() {
     }
     circle_pad.reset();
     motion_device.reset();
+    touch_device.reset();
 }
 
 static void UpdatePadCallback(u64 userdata, int cycles_late) {
@@ -172,8 +174,10 @@ static void UpdatePadCallback(u64 userdata, int cycles_late) {
     // Get the current touch entry
     TouchDataEntry& touch_entry = mem->touch.entries[mem->touch.index];
     bool pressed = false;
-
-    std::tie(touch_entry.x, touch_entry.y, pressed) = VideoCore::g_emu_window->GetTouchState();
+    float x, y;
+    std::tie(x, y, pressed) = touch_device->GetStatus();
+    touch_entry.x = static_cast<u16>(x * Core::kScreenBottomWidth);
+    touch_entry.y = static_cast<u16>(y * Core::kScreenBottomHeight);
     touch_entry.valid.Assign(pressed ? 1 : 0);
 
     // TODO(bunnei): We're not doing anything with offset 0xA8 + 0x18 of HID SharedMemory, which
diff --git a/src/core/hle/service/nwm/nwm_uds.cpp b/src/core/hle/service/nwm/nwm_uds.cpp
index 6dbdff044..893bbb1e7 100644
--- a/src/core/hle/service/nwm/nwm_uds.cpp
+++ b/src/core/hle/service/nwm/nwm_uds.cpp
@@ -4,6 +4,7 @@
 
 #include <array>
 #include <cstring>
+#include <mutex>
 #include <unordered_map>
 #include <vector>
 #include "common/common_types.h"
@@ -15,8 +16,10 @@
 #include "core/hle/result.h"
 #include "core/hle/service/nwm/nwm_uds.h"
 #include "core/hle/service/nwm/uds_beacon.h"
+#include "core/hle/service/nwm/uds_connection.h"
 #include "core/hle/service/nwm/uds_data.h"
 #include "core/memory.h"
+#include "network/network.h"
 
 namespace Service {
 namespace NWM {
@@ -51,6 +54,135 @@ static NetworkInfo network_info;
 // Event that will generate and send the 802.11 beacon frames.
 static int beacon_broadcast_event;
 
+// Mutex to synchronize access to the list of received beacons between the emulation thread and the
+// network thread.
+static std::mutex beacon_mutex;
+
+// Number of beacons to store before we start dropping the old ones.
+// TODO(Subv): Find a more accurate value for this limit.
+constexpr size_t MaxBeaconFrames = 15;
+
+// List of the last <MaxBeaconFrames> beacons received from the network.
+static std::deque<Network::WifiPacket> received_beacons;
+
+/// Returns and clears the list of beacon frames received so far. TODO(Subv): Filter by sender.
+std::deque<Network::WifiPacket> GetReceivedBeacons(const MacAddress& sender) {
+    std::lock_guard<std::mutex> lock(beacon_mutex);
+    // Swap rather than move from the global: a moved-from deque is not guaranteed to be empty.
+    std::deque<Network::WifiPacket> beacons;
+    beacons.swap(received_beacons);
+    return beacons;
+}
+
+/// Sends a WifiPacket to the room we're currently connected to.
+void SendPacket(Network::WifiPacket& packet) {
+    // TODO(Subv): Implement.
+}
+
+// Inserts the received beacon frame in the beacon queue and removes any older beacons if the size
+// limit is exceeded.
+void HandleBeaconFrame(const Network::WifiPacket& packet) {
+    std::lock_guard<std::mutex> lock(beacon_mutex);
+
+    received_beacons.emplace_back(packet);
+
+    // Discard old beacons if the buffer is full.
+    if (received_beacons.size() > MaxBeaconFrames)
+        received_beacons.pop_front();
+}
+
+/*
+ * Returns an available index in the nodes array for the
+ * currently-hosted UDS network.
+ */
+static u16 GetNextAvailableNodeId() {
+    ASSERT_MSG(connection_status.status == static_cast<u32>(NetworkStatus::ConnectedAsHost),
+               "Can not accept clients if we're not hosting a network");
+
+    for (u16 index = 0; index < connection_status.max_nodes; ++index) {
+        if ((connection_status.node_bitmask & (1 << index)) == 0)
+            return index;
+    }
+
+    // Any connection attempts to an already full network should have been refused.
+    ASSERT_MSG(false, "No available connection slots in the network");
+}
+
+/*
+ * Start a connection sequence with an UDS server. The sequence starts by sending an 802.11
+ * authentication frame with SEQ1.
+ */
+void StartConnectionSequence(const MacAddress& server) {
+    ASSERT(connection_status.status == static_cast<u32>(NetworkStatus::NotConnected));
+
+    // TODO(Subv): Handle timeout.
+
+    // Send an authentication frame with SEQ1
+    using Network::WifiPacket;
+    WifiPacket auth_request;
+    auth_request.channel = network_channel;
+    auth_request.data = GenerateAuthenticationFrame(AuthenticationSeq::SEQ1);
+    auth_request.destination_address = server;
+    auth_request.type = WifiPacket::PacketType::Authentication;
+
+    SendPacket(auth_request);
+}
+
+/// Sends a successful Association Response frame to the given MAC address; host-only.
+void SendAssociationResponseFrame(const MacAddress& address) {
+    ASSERT(connection_status.status == static_cast<u32>(NetworkStatus::ConnectedAsHost));
+
+    using Network::WifiPacket;
+    WifiPacket assoc_response;
+    assoc_response.channel = network_channel;
+    // TODO(Subv): This will cause multiple clients to end up with the same association id, but
+    // we're not using that for anything.
+    u16 association_id = 1;
+    assoc_response.data = GenerateAssocResponseFrame(AssocStatus::Successful, association_id,
+                                                     network_info.network_id);
+    assoc_response.destination_address = address;
+    assoc_response.type = WifiPacket::PacketType::AssociationResponse;
+
+    SendPacket(assoc_response);
+}
+
+/*
+ * Handles the authentication request frame and sends the authentication response and association
+ * response frames. Once an Authentication frame with SEQ1 is received by the server, it responds
+ * with an Authentication frame containing SEQ2, and immediately sends an Association response frame
+ * containing the details of the access point and the assigned association id for the new client.
+ */
+void HandleAuthenticationFrame(const Network::WifiPacket& packet) {
+    // Only the SEQ1 auth frame is handled here, the SEQ2 frame doesn't need any special behavior
+    if (GetAuthenticationSeqNumber(packet.data) == AuthenticationSeq::SEQ1) {
+        ASSERT(connection_status.status == static_cast<u32>(NetworkStatus::ConnectedAsHost));
+
+        // Respond with an authentication response frame with SEQ2
+        using Network::WifiPacket;
+        WifiPacket auth_response;
+        auth_response.channel = network_channel;
+        auth_response.data = GenerateAuthenticationFrame(AuthenticationSeq::SEQ2);
+        auth_response.destination_address = packet.transmitter_address;
+        auth_response.type = WifiPacket::PacketType::Authentication;
+
+        SendPacket(auth_response);
+        // Immediately follow up with the association response for the same client.
+        SendAssociationResponseFrame(packet.transmitter_address);
+    }
+}
+
+/// Callback for received wifi packets; only Beacon and Authentication frames are handled.
+void OnWifiPacketReceived(const Network::WifiPacket& packet) {
+    switch (packet.type) {
+    case Network::WifiPacket::PacketType::Beacon:
+        HandleBeaconFrame(packet);
+        break;
+    case Network::WifiPacket::PacketType::Authentication:
+        HandleAuthenticationFrame(packet);
+        break;
+    }
+}
+
 /**
  * NWM_UDS::Shutdown service function
  *  Inputs:
@@ -111,8 +243,7 @@ static void RecvBeaconBroadcastData(Interface* self) {
     u32 total_size = sizeof(BeaconDataReplyHeader);
 
     // Retrieve all beacon frames that were received from the desired mac address.
-    std::deque<WifiPacket> beacons =
-        GetReceivedPackets(WifiPacket::PacketType::Beacon, mac_address);
+    auto beacons = GetReceivedBeacons(mac_address);
 
     BeaconDataReplyHeader data_reply_header{};
     data_reply_header.total_entries = beacons.size();
@@ -193,6 +324,9 @@ static void InitializeWithVersion(Interface* self) {
     rb.Push(RESULT_SUCCESS);
     rb.PushCopyHandles(Kernel::g_handle_table.Create(connection_status_event).Unwrap());
 
+    // TODO(Subv): Connect the OnWifiPacketReceived function to the wifi packet received callback of
+    // the room we're currently in.
+
     LOG_DEBUG(Service_NWM, "called sharedmem_size=0x%08X, version=0x%08X, sharedmem_handle=0x%08X",
               sharedmem_size, version, sharedmem_handle);
 }
@@ -610,32 +744,23 @@ static void BeaconBroadcastCallback(u64 userdata, int cycles_late) {
     if (connection_status.status != static_cast<u32>(NetworkStatus::ConnectedAsHost))
         return;
 
-    // TODO(Subv): Actually send the beacon.
     std::vector<u8> frame = GenerateBeaconFrame(network_info, node_info);
 
+    using Network::WifiPacket;
+    WifiPacket packet;
+    packet.type = WifiPacket::PacketType::Beacon;
+    packet.data = std::move(frame);
+    packet.destination_address = Network::BroadcastMac;
+    packet.channel = network_channel;
+
+    SendPacket(packet);
+
     // Start broadcasting the network, send a beacon frame every 102.4ms.
     CoreTiming::ScheduleEvent(msToCycles(DefaultBeaconInterval * MillisecondsPerTU) - cycles_late,
                               beacon_broadcast_event, 0);
 }
 
 /*
- * Returns an available index in the nodes array for the
- * currently-hosted UDS network.
- */
-static u32 GetNextAvailableNodeId() {
-    ASSERT_MSG(connection_status.status == static_cast<u32>(NetworkStatus::ConnectedAsHost),
-               "Can not accept clients if we're not hosting a network");
-
-    for (unsigned index = 0; index < connection_status.max_nodes; ++index) {
-        if ((connection_status.node_bitmask & (1 << index)) == 0)
-            return index;
-    }
-
-    // Any connection attempts to an already full network should have been refused.
-    ASSERT_MSG(false, "No available connection slots in the network");
-}
-
-/*
  * Called when a client connects to an UDS network we're hosting,
  * updates the connection status and signals the update event.
  * @param network_node_id Network Node Id of the connecting client.
diff --git a/src/core/hle/service/nwm/nwm_uds.h b/src/core/hle/service/nwm/nwm_uds.h
index 141f49f9c..f1caaf974 100644
--- a/src/core/hle/service/nwm/nwm_uds.h
+++ b/src/core/hle/service/nwm/nwm_uds.h
@@ -42,6 +42,7 @@ using NodeList = std::vector<NodeInfo>;
 enum class NetworkStatus {
     NotConnected = 3,
     ConnectedAsHost = 6,
+    Connecting = 7,
     ConnectedAsClient = 9,
     ConnectedAsSpectator = 10,
 };
@@ -85,6 +86,17 @@ static_assert(offsetof(NetworkInfo, oui_value) == 0xC, "oui_value is at the wron
 static_assert(offsetof(NetworkInfo, wlan_comm_id) == 0x10, "wlancommid is at the wrong offset.");
 static_assert(sizeof(NetworkInfo) == 0x108, "NetworkInfo has incorrect size.");
 
+/// Additional block tag ids in the Beacon and Association Response frames
+enum class TagId : u8 {
+    SSID = 0,
+    SupportedRates = 1,
+    DSParameterSet = 2,
+    TrafficIndicationMap = 5,
+    CountryInformation = 7,
+    ERPInformation = 42,
+    VendorSpecific = 221
+};
+
 class NWM_UDS final : public Interface {
 public:
     NWM_UDS();
diff --git a/src/core/hle/service/nwm/uds_beacon.cpp b/src/core/hle/service/nwm/uds_beacon.cpp
index 6332b404c..552eaf65e 100644
--- a/src/core/hle/service/nwm/uds_beacon.cpp
+++ b/src/core/hle/service/nwm/uds_beacon.cpp
@@ -325,8 +325,5 @@ std::vector<u8> GenerateBeaconFrame(const NetworkInfo& network_info, const NodeL
     return buffer;
 }
 
-std::deque<WifiPacket> GetReceivedPackets(WifiPacket::PacketType type, const MacAddress& sender) {
-    return {};
-}
 } // namespace NWM
 } // namespace Service
diff --git a/src/core/hle/service/nwm/uds_beacon.h b/src/core/hle/service/nwm/uds_beacon.h
index caacf4c6f..50cc76da2 100644
--- a/src/core/hle/service/nwm/uds_beacon.h
+++ b/src/core/hle/service/nwm/uds_beacon.h
@@ -17,17 +17,6 @@ namespace NWM {
 using MacAddress = std::array<u8, 6>;
 constexpr std::array<u8, 3> NintendoOUI = {0x00, 0x1F, 0x32};
 
-/// Additional block tag ids in the Beacon frames
-enum class TagId : u8 {
-    SSID = 0,
-    SupportedRates = 1,
-    DSParameterSet = 2,
-    TrafficIndicationMap = 5,
-    CountryInformation = 7,
-    ERPInformation = 42,
-    VendorSpecific = 221
-};
-
 /**
  * Internal vendor-specific tag ids as stored inside
  * VendorSpecific blocks in the Beacon frames.
@@ -135,20 +124,6 @@ struct BeaconData {
 
 static_assert(sizeof(BeaconData) == 0x12, "BeaconData has incorrect size.");
 
-/// Information about a received WiFi packet.
-/// Acts as our own 802.11 header.
-struct WifiPacket {
-    enum class PacketType { Beacon, Data };
-
-    PacketType type; ///< The type of 802.11 frame, Beacon / Data.
-
-    /// Raw 802.11 frame data, starting at the management frame header for management frames.
-    std::vector<u8> data;
-    MacAddress transmitter_address; ///< Mac address of the transmitter.
-    MacAddress destination_address; ///< Mac address of the receiver.
-    u8 channel;                     ///< WiFi channel where this frame was transmitted.
-};
-
 /**
  * Decrypts the beacon data buffer for the network described by `network_info`.
  */
@@ -161,10 +136,5 @@ void DecryptBeaconData(const NetworkInfo& network_info, std::vector<u8>& buffer)
  */
 std::vector<u8> GenerateBeaconFrame(const NetworkInfo& network_info, const NodeList& nodes);
 
-/**
- * Returns a list of received 802.11 frames from the specified sender
- * matching the type since the last call.
- */
-std::deque<WifiPacket> GetReceivedPackets(WifiPacket::PacketType type, const MacAddress& sender);
 } // namespace NWM
 } // namespace Service
diff --git a/src/core/hle/service/nwm/uds_connection.cpp b/src/core/hle/service/nwm/uds_connection.cpp
new file mode 100644
index 000000000..c8a76ec2a
--- /dev/null
+++ b/src/core/hle/service/nwm/uds_connection.cpp
@@ -0,0 +1,79 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/hle/service/nwm/nwm_uds.h"
+#include "core/hle/service/nwm/uds_connection.h"
+#include "fmt/format.h"
+
+namespace Service {
+namespace NWM {
+
+// Note: These values were taken from a packet capture of an o3DS XL
+// broadcasting a Super Smash Bros. 4 lobby.
+constexpr u16 DefaultExtraCapabilities = 0x0431;
+
+std::vector<u8> GenerateAuthenticationFrame(AuthenticationSeq seq) {
+    // Algorithm and status code keep their in-struct defaults; only the sequence number is set.
+    AuthenticationFrame auth_frame{};
+    auth_frame.auth_seq = static_cast<u16>(seq);
+
+    std::vector<u8> serialized(sizeof(auth_frame));
+    std::memcpy(serialized.data(), &auth_frame, serialized.size());
+    return serialized;
+}
+
+AuthenticationSeq GetAuthenticationSeqNumber(const std::vector<u8>& body) {
+    AuthenticationFrame frame{}; // auth_seq stays 0 (not a valid sequence) for a truncated body
+    if (body.size() >= sizeof(frame))
+        std::memcpy(&frame, body.data(), sizeof(frame));
+    return static_cast<AuthenticationSeq>(frame.auth_seq);
+}
+
+/**
+ * Builds an 802.11 SSID tag: a two-byte header (tag id, value length) followed by the network
+ * id rendered as eight zero-padded, upper-case hexadecimal characters.
+ * @param network_id The network id to encode as the SSID value.
+ * @returns A buffer holding the complete SSID tag.
+ */
+static std::vector<u8> GenerateSSIDTag(u32 network_id) {
+    constexpr u8 SSIDSize = 8;
+
+    // Two single-byte fields, so the header can be copied into the buffer verbatim.
+    struct {
+        u8 id = static_cast<u8>(TagId::SSID);
+        u8 size = SSIDSize;
+    } tag_header;
+
+    // A u32 formatted with {0:08X} is always exactly SSIDSize characters long.
+    const std::string network_name = fmt::format("{0:08X}", network_id);
+
+    std::vector<u8> buffer(sizeof(tag_header) + SSIDSize);
+    u8* const header_dst = buffer.data();
+    std::memcpy(header_dst, &tag_header, sizeof(tag_header));
+    std::memcpy(header_dst + sizeof(tag_header), network_name.data(), SSIDSize);
+    return buffer;
+}
+
+std::vector<u8> GenerateAssocResponseFrame(AssocStatus status, u16 association_id, u32 network_id) {
+    // The association id is ORed with this magic value (0xC000)
+    constexpr u16 AssociationIdMagic = 0xC000;
+
+    AssociationResponseFrame response{};
+    response.capabilities = DefaultExtraCapabilities;
+    response.status_code = static_cast<u16>(status);
+    response.assoc_id = association_id | AssociationIdMagic;
+
+    std::vector<u8> data(sizeof(response));
+    std::memcpy(data.data(), &response, data.size());
+
+    const std::vector<u8> ssid_tag = GenerateSSIDTag(network_id);
+    data.insert(data.end(), ssid_tag.begin(), ssid_tag.end());
+    // TODO(Subv): Add the SupportedRates tag.
+    // TODO(Subv): Add the DSParameterSet tag.
+    // TODO(Subv): Add the ERPInformation tag.
+    return data;
+}
+
+} // namespace NWM
+} // namespace Service
diff --git a/src/core/hle/service/nwm/uds_connection.h b/src/core/hle/service/nwm/uds_connection.h
new file mode 100644
index 000000000..73f55a4fd
--- /dev/null
+++ b/src/core/hle/service/nwm/uds_connection.h
@@ -0,0 +1,51 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+#include "common/common_types.h"
+#include "common/swap.h"
+#include "core/hle/service/service.h"
+
+namespace Service {
+namespace NWM {
+
+/// Sequence number of the 802.11 authentication frames.
+enum class AuthenticationSeq : u16 { SEQ1 = 1, SEQ2 = 2 };
+/// Authentication algorithm id, stored in AuthenticationFrame::auth_algorithm.
+enum class AuthAlgorithm : u16 { OpenSystem = 0 };
+/// Authentication result, stored in AuthenticationFrame::status_code.
+enum class AuthStatus : u16 { Successful = 0 };
+/// Association result, stored in AssociationResponseFrame::status_code.
+enum class AssocStatus : u16 { Successful = 0 };
+
+struct AuthenticationFrame {
+    u16_le auth_algorithm = static_cast<u16>(AuthAlgorithm::OpenSystem); ///< See AuthAlgorithm
+    u16_le auth_seq;                                                     ///< See AuthenticationSeq
+    u16_le status_code = static_cast<u16>(AuthStatus::Successful);       ///< See AuthStatus
+};
+
+static_assert(sizeof(AuthenticationFrame) == 6, "AuthenticationFrame has wrong size");
+
+struct AssociationResponseFrame {
+    u16_le capabilities; ///< Capabilities field of the response
+    u16_le status_code;  ///< See AssocStatus
+    u16_le assoc_id;     ///< Association id; GenerateAssocResponseFrame ORs it with 0xC000
+};
+
+static_assert(sizeof(AssociationResponseFrame) == 6, "AssociationResponseFrame has wrong size");
+
+/// Generates an 802.11 authentication frame, starting at the frame body.
+std::vector<u8> GenerateAuthenticationFrame(AuthenticationSeq seq);
+
+/// Returns the sequence number from the body of an Authentication frame.
+AuthenticationSeq GetAuthenticationSeqNumber(const std::vector<u8>& body);
+
+/// Generates an 802.11 association response frame with the specified status, association id and
+/// network id, starting at the frame body.
+std::vector<u8> GenerateAssocResponseFrame(AssocStatus status, u16 association_id, u32 network_id);
+
+} // namespace NWM
+} // namespace Service
diff --git a/src/core/hle/svc.cpp b/src/core/hle/svc.cpp
index b98938cb4..dfc36748c 100644
--- a/src/core/hle/svc.cpp
+++ b/src/core/hle/svc.cpp
@@ -1334,7 +1334,7 @@ void CallSVC(u32 immediate) {
     MICROPROFILE_SCOPE(Kernel_SVC);
 
     // Lock the global kernel mutex when we enter the kernel HLE.
-    std::lock_guard<std::mutex> lock(HLE::g_hle_lock);
+    std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
 
     const FunctionDef* info = GetSVCInfo(immediate);
     if (info) {
diff --git a/src/core/loader/3dsx.cpp b/src/core/loader/3dsx.cpp
index 74e336487..69cdc0867 100644
--- a/src/core/loader/3dsx.cpp
+++ b/src/core/loader/3dsx.cpp
@@ -270,6 +270,7 @@ ResultStatus AppLoader_THREEDSX::Load() {
     Kernel::g_current_process = Kernel::Process::Create(std::move(codeset));
     Kernel::g_current_process->svc_access_mask.set();
     Kernel::g_current_process->address_mappings = default_address_mappings;
+    Memory::current_page_table = &Kernel::g_current_process->vm_manager.page_table;
 
     // Attach the default resource limit (APPLICATION) to the process
     Kernel::g_current_process->resource_limit =
diff --git a/src/core/loader/elf.cpp b/src/core/loader/elf.cpp
index cfcde9167..2f27606a1 100644
--- a/src/core/loader/elf.cpp
+++ b/src/core/loader/elf.cpp
@@ -397,6 +397,7 @@ ResultStatus AppLoader_ELF::Load() {
     Kernel::g_current_process = Kernel::Process::Create(std::move(codeset));
     Kernel::g_current_process->svc_access_mask.set();
     Kernel::g_current_process->address_mappings = default_address_mappings;
+    Memory::current_page_table = &Kernel::g_current_process->vm_manager.page_table;
 
     // Attach the default resource limit (APPLICATION) to the process
     Kernel::g_current_process->resource_limit =
diff --git a/src/core/loader/ncch.cpp b/src/core/loader/ncch.cpp
index 7aff7f29b..79ea50147 100644
--- a/src/core/loader/ncch.cpp
+++ b/src/core/loader/ncch.cpp
@@ -172,6 +172,7 @@ ResultStatus AppLoader_NCCH::LoadExec() {
         codeset->memory = std::make_shared<std::vector<u8>>(std::move(code));
 
         Kernel::g_current_process = Kernel::Process::Create(std::move(codeset));
+        Memory::current_page_table = &Kernel::g_current_process->vm_manager.page_table;
 
         // Attach a resource limit to the process based on the resource limit category
         Kernel::g_current_process->resource_limit =
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index a3c5f4a9d..68a6b1ac2 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -4,83 +4,31 @@
 
 #include <array>
 #include <cstring>
+#include "audio_core/audio_core.h"
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "common/swap.h"
+#include "core/hle/kernel/memory.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/lock.h"
 #include "core/memory.h"
 #include "core/memory_setup.h"
-#include "core/mmio.h"
 #include "video_core/renderer_base.h"
 #include "video_core/video_core.h"
 
 namespace Memory {
 
-enum class PageType {
-    /// Page is unmapped and should cause an access error.
-    Unmapped,
-    /// Page is mapped to regular memory. This is the only type you can get pointers to.
-    Memory,
-    /// Page is mapped to regular memory, but also needs to check for rasterizer cache flushing and
-    /// invalidation
-    RasterizerCachedMemory,
-    /// Page is mapped to a I/O region. Writing and reading to this page is handled by functions.
-    Special,
-    /// Page is mapped to a I/O region, but also needs to check for rasterizer cache flushing and
-    /// invalidation
-    RasterizerCachedSpecial,
-};
-
-struct SpecialRegion {
-    VAddr base;
-    u32 size;
-    MMIORegionPointer handler;
-};
+static std::array<u8, Memory::VRAM_SIZE> vram;
+static std::array<u8, Memory::N3DS_EXTRA_RAM_SIZE> n3ds_extra_ram;
 
-/**
- * A (reasonably) fast way of allowing switchable and remappable process address spaces. It loosely
- * mimics the way a real CPU page table works, but instead is optimized for minimal decoding and
- * fetching requirements when accessing. In the usual case of an access to regular memory, it only
- * requires an indexed fetch and a check for NULL.
- */
-struct PageTable {
-    /**
-     * Array of memory pointers backing each page. An entry can only be non-null if the
-     * corresponding entry in the `attributes` array is of type `Memory`.
-     */
-    std::array<u8*, PAGE_TABLE_NUM_ENTRIES> pointers;
-
-    /**
-     * Contains MMIO handlers that back memory regions whose entries in the `attribute` array is of
-     * type `Special`.
-     */
-    std::vector<SpecialRegion> special_regions;
-
-    /**
-     * Array of fine grained page attributes. If it is set to any value other than `Memory`, then
-     * the corresponding entry in `pointers` MUST be set to null.
-     */
-    std::array<PageType, PAGE_TABLE_NUM_ENTRIES> attributes;
-
-    /**
-     * Indicates the number of externally cached resources touching a page that should be
-     * flushed before the memory is accessed
-     */
-    std::array<u8, PAGE_TABLE_NUM_ENTRIES> cached_res_count;
-};
-
-/// Singular page table used for the singleton process
-static PageTable main_page_table;
-/// Currently active page table
-static PageTable* current_page_table = &main_page_table;
+PageTable* current_page_table = nullptr;
 
 std::array<u8*, PAGE_TABLE_NUM_ENTRIES>* GetCurrentPageTablePointers() {
     return &current_page_table->pointers;
 }
 
-static void MapPages(u32 base, u32 size, u8* memory, PageType type) {
+static void MapPages(PageTable& page_table, u32 base, u32 size, u8* memory, PageType type) {
     LOG_DEBUG(HW_Memory, "Mapping %p onto %08X-%08X", memory, base * PAGE_SIZE,
               (base + size) * PAGE_SIZE);
 
@@ -91,9 +39,9 @@ static void MapPages(u32 base, u32 size, u8* memory, PageType type) {
     while (base != end) {
         ASSERT_MSG(base < PAGE_TABLE_NUM_ENTRIES, "out of range mapping at %08X", base);
 
-        current_page_table->attributes[base] = type;
-        current_page_table->pointers[base] = memory;
-        current_page_table->cached_res_count[base] = 0;
+        page_table.attributes[base] = type;
+        page_table.pointers[base] = memory;
+        page_table.cached_res_count[base] = 0;
 
         base += 1;
         if (memory != nullptr)
@@ -101,30 +49,24 @@ static void MapPages(u32 base, u32 size, u8* memory, PageType type) {
     }
 }
 
-void InitMemoryMap() {
-    main_page_table.pointers.fill(nullptr);
-    main_page_table.attributes.fill(PageType::Unmapped);
-    main_page_table.cached_res_count.fill(0);
-}
-
-void MapMemoryRegion(VAddr base, u32 size, u8* target) {
+void MapMemoryRegion(PageTable& page_table, VAddr base, u32 size, u8* target) {
     ASSERT_MSG((size & PAGE_MASK) == 0, "non-page aligned size: %08X", size);
     ASSERT_MSG((base & PAGE_MASK) == 0, "non-page aligned base: %08X", base);
-    MapPages(base / PAGE_SIZE, size / PAGE_SIZE, target, PageType::Memory);
+    MapPages(page_table, base / PAGE_SIZE, size / PAGE_SIZE, target, PageType::Memory);
 }
 
-void MapIoRegion(VAddr base, u32 size, MMIORegionPointer mmio_handler) {
+void MapIoRegion(PageTable& page_table, VAddr base, u32 size, MMIORegionPointer mmio_handler) {
     ASSERT_MSG((size & PAGE_MASK) == 0, "non-page aligned size: %08X", size);
     ASSERT_MSG((base & PAGE_MASK) == 0, "non-page aligned base: %08X", base);
-    MapPages(base / PAGE_SIZE, size / PAGE_SIZE, nullptr, PageType::Special);
+    MapPages(page_table, base / PAGE_SIZE, size / PAGE_SIZE, nullptr, PageType::Special);
 
-    current_page_table->special_regions.emplace_back(SpecialRegion{base, size, mmio_handler});
+    page_table.special_regions.emplace_back(SpecialRegion{base, size, mmio_handler});
 }
 
-void UnmapRegion(VAddr base, u32 size) {
+void UnmapRegion(PageTable& page_table, VAddr base, u32 size) {
     ASSERT_MSG((size & PAGE_MASK) == 0, "non-page aligned size: %08X", size);
     ASSERT_MSG((base & PAGE_MASK) == 0, "non-page aligned base: %08X", base);
-    MapPages(base / PAGE_SIZE, size / PAGE_SIZE, nullptr, PageType::Unmapped);
+    MapPages(page_table, base / PAGE_SIZE, size / PAGE_SIZE, nullptr, PageType::Unmapped);
 }
 
 /**
@@ -183,7 +125,7 @@ T Read(const VAddr vaddr) {
     }
 
     // The memory access might do an MMIO or cached access, so we have to lock the HLE kernel state
-    std::lock_guard<std::mutex> lock(HLE::g_hle_lock);
+    std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
 
     PageType type = current_page_table->attributes[vaddr >> PAGE_BITS];
     switch (type) {
@@ -224,7 +166,7 @@ void Write(const VAddr vaddr, const T data) {
     }
 
     // The memory access might do an MMIO or cached access, so we have to lock the HLE kernel state
-    std::lock_guard<std::mutex> lock(HLE::g_hle_lock);
+    std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
 
     PageType type = current_page_table->attributes[vaddr >> PAGE_BITS];
     switch (type) {
@@ -273,8 +215,7 @@ bool IsValidVirtualAddress(const VAddr vaddr) {
 }
 
 bool IsValidPhysicalAddress(const PAddr paddr) {
-    boost::optional<VAddr> vaddr = PhysicalToVirtualAddress(paddr);
-    return vaddr && IsValidVirtualAddress(*vaddr);
+    return GetPhysicalPointer(paddr) != nullptr;
 }
 
 u8* GetPointer(const VAddr vaddr) {
@@ -306,9 +247,63 @@ std::string ReadCString(VAddr vaddr, std::size_t max_length) {
 }
 
 u8* GetPhysicalPointer(PAddr address) {
-    // TODO(Subv): This call should not go through the application's memory mapping.
-    boost::optional<VAddr> vaddr = PhysicalToVirtualAddress(address);
-    return vaddr ? GetPointer(*vaddr) : nullptr;
+    struct MemoryArea {
+        PAddr paddr_base;
+        u32 size;
+    };
+
+    // Physical ranges recognised here; the IO area is listed only so it can be rejected below.
+    static constexpr MemoryArea memory_areas[] = {
+        {VRAM_PADDR, VRAM_SIZE},
+        {IO_AREA_PADDR, IO_AREA_SIZE},
+        {DSP_RAM_PADDR, DSP_RAM_SIZE},
+        {FCRAM_PADDR, FCRAM_N3DS_SIZE},
+        {N3DS_EXTRA_RAM_PADDR, N3DS_EXTRA_RAM_SIZE},
+    };
+    const auto contains_address = [address](const MemoryArea& candidate) {
+        return address >= candidate.paddr_base && address < candidate.paddr_base + candidate.size;
+    };
+    const auto area =
+        std::find_if(std::begin(memory_areas), std::end(memory_areas), contains_address);
+
+    if (area == std::end(memory_areas)) {
+        LOG_ERROR(HW_Memory, "unknown GetPhysicalPointer @ 0x%08X", address);
+        return nullptr;
+    }
+    if (area->paddr_base == IO_AREA_PADDR) {
+        LOG_ERROR(HW_Memory, "MMIO mappings are not supported yet. phys_addr=0x%08X", address);
+        return nullptr;
+    }
+
+    const u32 area_offset = address - area->paddr_base;
+    u8* host_pointer = nullptr;
+    switch (area->paddr_base) {
+    case VRAM_PADDR:
+        host_pointer = vram.data() + area_offset;
+        break;
+    case DSP_RAM_PADDR:
+        host_pointer = AudioCore::GetDspMemory().data() + area_offset;
+        break;
+    case N3DS_EXTRA_RAM_PADDR:
+        host_pointer = n3ds_extra_ram.data() + area_offset;
+        break;
+    case FCRAM_PADDR:
+        // Find the kernel memory region whose [base, base + size) range holds this FCRAM offset.
+        for (const auto& region : Kernel::memory_regions) {
+            if (area_offset >= region.base &&
+                area_offset < region.base + region.size) {
+                host_pointer =
+                    region.linear_heap_memory->data() + area_offset - region.base;
+                break;
+            }
+        }
+        ASSERT_MSG(host_pointer != nullptr, "Invalid FCRAM address");
+        break;
+    default:
+        UNREACHABLE();
+    }
+
+    return host_pointer;
 }
 
 void RasterizerMarkRegionCached(PAddr start, u32 size, int count_delta) {
diff --git a/src/core/memory.h b/src/core/memory.h
index c8c56babd..b228a48c2 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -7,8 +7,10 @@
 #include <array>
 #include <cstddef>
 #include <string>
+#include <vector>
 #include <boost/optional.hpp>
 #include "common/common_types.h"
+#include "core/mmio.h"
 
 namespace Memory {
 
@@ -21,6 +23,59 @@ const u32 PAGE_MASK = PAGE_SIZE - 1;
 const int PAGE_BITS = 12;
 const size_t PAGE_TABLE_NUM_ENTRIES = 1 << (32 - PAGE_BITS);
 
+enum class PageType {
+    /// Page is unmapped and should cause an access error.
+    Unmapped,
+    /// Page is mapped to regular memory. This is the only type you can get pointers to.
+    Memory,
+    /// Page is mapped to regular memory, but also needs to check for rasterizer cache flushing and
+    /// invalidation
+    RasterizerCachedMemory,
+    /// Page is mapped to an I/O region. Reads and writes to this page are handled by functions.
+    Special,
+    /// Page is mapped to an I/O region, but also needs to check for rasterizer cache flushing and
+    /// invalidation
+    RasterizerCachedSpecial,
+};
+
+struct SpecialRegion {
+    VAddr base;                ///< Address the mapped region starts at
+    u32 size;                  ///< Size of the mapped region in bytes
+    MMIORegionPointer handler; ///< MMIO handler that backs the region
+};
+
+/**
+ * A (reasonably) fast way of allowing switchable and remappable process address spaces. It loosely
+ * mimics the way a real CPU page table works, but instead is optimized for minimal decoding and
+ * fetching requirements when accessing. In the usual case of an access to regular memory, it only
+ * requires an indexed fetch and a check for NULL.
+ */
+struct PageTable {
+    /**
+     * Array of memory pointers backing each page. An entry can only be non-null if the
+     * corresponding entry in the `attributes` array is of type `Memory`.
+     */
+    std::array<u8*, PAGE_TABLE_NUM_ENTRIES> pointers;
+
+    /**
+     * Contains MMIO handlers that back memory regions whose entries in the `attributes` array are
+     * of type `Special`.
+     */
+    std::vector<SpecialRegion> special_regions;
+
+    /**
+     * Array of fine grained page attributes. If it is set to any value other than `Memory`, then
+     * the corresponding entry in `pointers` MUST be set to null.
+     */
+    std::array<PageType, PAGE_TABLE_NUM_ENTRIES> attributes;
+
+    /**
+     * Indicates the number of externally cached resources touching a page that should be
+     * flushed before the memory is accessed
+     */
+    std::array<u8, PAGE_TABLE_NUM_ENTRIES> cached_res_count;
+};
+
 /// Physical memory regions as seen from the ARM11
 enum : PAddr {
     /// IO register area
@@ -126,6 +181,9 @@ enum : VAddr {
     NEW_LINEAR_HEAP_VADDR_END = NEW_LINEAR_HEAP_VADDR + NEW_LINEAR_HEAP_SIZE,
 };
 
+/// Currently active page table
+extern PageTable* current_page_table;
+
 bool IsValidVirtualAddress(const VAddr addr);
 bool IsValidPhysicalAddress(const PAddr addr);
 
@@ -169,8 +227,6 @@ boost::optional<VAddr> PhysicalToVirtualAddress(PAddr addr);
 
 /**
  * Gets a pointer to the memory region beginning at the specified physical address.
- *
- * @note This is currently implemented using PhysicalToVirtualAddress().
  */
 u8* GetPhysicalPointer(PAddr address);
 
@@ -209,4 +265,4 @@ void RasterizerFlushVirtualRegion(VAddr start, u32 size, FlushMode mode);
  * retrieve the current page table for that purpose.
  */
 std::array<u8*, PAGE_TABLE_NUM_ENTRIES>* GetCurrentPageTablePointers();
-}
+} // namespace Memory
diff --git a/src/core/memory_setup.h b/src/core/memory_setup.h
index 3fdf3a87d..c58baa50b 100644
--- a/src/core/memory_setup.h
+++ b/src/core/memory_setup.h
@@ -9,24 +9,24 @@
 
 namespace Memory {
 
-void InitMemoryMap();
-
 /**
  * Maps an allocated buffer onto a region of the emulated process address space.
  *
+ * @param page_table The page table of the emulated process.
  * @param base The address to start mapping at. Must be page-aligned.
  * @param size The amount of bytes to map. Must be page-aligned.
  * @param target Buffer with the memory backing the mapping. Must be of length at least `size`.
  */
-void MapMemoryRegion(VAddr base, u32 size, u8* target);
+void MapMemoryRegion(PageTable& page_table, VAddr base, u32 size, u8* target);
 
 /**
  * Maps a region of the emulated process address space as a IO region.
+ * @param page_table The page table of the emulated process.
  * @param base The address to start mapping at. Must be page-aligned.
  * @param size The amount of bytes to map. Must be page-aligned.
  * @param mmio_handler The handler that backs the mapping.
  */
-void MapIoRegion(VAddr base, u32 size, MMIORegionPointer mmio_handler);
+void MapIoRegion(PageTable& page_table, VAddr base, u32 size, MMIORegionPointer mmio_handler);
 
-void UnmapRegion(VAddr base, u32 size);
+void UnmapRegion(PageTable& page_table, VAddr base, u32 size);
 }
diff --git a/src/core/settings.h b/src/core/settings.h
index bf8014c5a..024f14666 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -81,6 +81,7 @@ struct Values {
     std::array<std::string, NativeButton::NumButtons> buttons;
     std::array<std::string, NativeAnalog::NumAnalogs> analogs;
     std::string motion_device;
+    std::string touch_device;
 
     // Core
     bool use_cpu_jit;
diff --git a/src/tests/core/arm/arm_test_common.cpp b/src/tests/core/arm/arm_test_common.cpp
index 1df6c5677..8384ce744 100644
--- a/src/tests/core/arm/arm_test_common.cpp
+++ b/src/tests/core/arm/arm_test_common.cpp
@@ -3,20 +3,30 @@
 // Refer to the license.txt file included.
 
 #include "core/core.h"
+#include "core/memory.h"
 #include "core/memory_setup.h"
 #include "tests/core/arm/arm_test_common.h"
 
 namespace ArmTests {
 
+static Memory::PageTable page_table;
+
 TestEnvironment::TestEnvironment(bool mutable_memory_)
     : mutable_memory(mutable_memory_), test_memory(std::make_shared<TestMemory>(this)) {
-    Memory::MapIoRegion(0x00000000, 0x80000000, test_memory);
-    Memory::MapIoRegion(0x80000000, 0x80000000, test_memory);
+    // Full reset: the table is static and UnmapRegion leaves special_regions entries behind.
+    page_table.pointers.fill(nullptr);
+    page_table.attributes.fill(Memory::PageType::Unmapped);
+    page_table.cached_res_count.fill(0);
+    page_table.special_regions.clear();
+
+    Memory::MapIoRegion(page_table, 0x00000000, 0x80000000, test_memory);
+    Memory::MapIoRegion(page_table, 0x80000000, 0x80000000, test_memory);
+    Memory::current_page_table = &page_table;
 }
 
 TestEnvironment::~TestEnvironment() {
-    Memory::UnmapRegion(0x80000000, 0x80000000);
-    Memory::UnmapRegion(0x00000000, 0x80000000);
+    Memory::UnmapRegion(page_table, 0x80000000, 0x80000000);
+    Memory::UnmapRegion(page_table, 0x00000000, 0x80000000);
 }
 
 void TestEnvironment::SetMemory64(VAddr vaddr, u64 value) {
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index cffa4c952..82f47d8a9 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(SRCS
             command_processor.cpp
             debug_utils/debug_utils.cpp
+            geometry_pipeline.cpp
             pica.cpp
             primitive_assembly.cpp
             regs.cpp
@@ -29,6 +30,7 @@ set(SRCS
 set(HEADERS
             command_processor.h
             debug_utils/debug_utils.h
+            geometry_pipeline.h
             gpu_debugger.h
             pica.h
             pica_state.h
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index f98ca3302..fb65a3a0a 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -161,6 +161,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
     case PICA_REG_INDEX(pipeline.vs_default_attributes_setup.index):
         g_state.immediate.current_attribute = 0;
+        g_state.immediate.reset_geometry_pipeline = true;
         default_attr_counter = 0;
         break;
 
@@ -234,16 +235,14 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                     shader_engine->Run(g_state.vs, shader_unit);
                     shader_unit.WriteOutput(regs.vs, output);
 
-                    // Send to renderer
-                    using Pica::Shader::OutputVertex;
-                    auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
-                                          const OutputVertex& v2) {
-                        VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
-                    };
-
-                    g_state.primitive_assembler.SubmitVertex(
-                        Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output),
-                        AddTriangle);
+                    // Send to geometry pipeline
+                    if (g_state.immediate.reset_geometry_pipeline) {
+                        g_state.geometry_pipeline.Reconfigure();
+                        g_state.immediate.reset_geometry_pipeline = false;
+                    }
+                    ASSERT(!g_state.geometry_pipeline.NeedIndexInput());
+                    g_state.geometry_pipeline.Setup(shader_engine);
+                    g_state.geometry_pipeline.SubmitVertex(output);
                 }
             }
         }
@@ -321,8 +320,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         // The size has been tuned for optimal balance between hit-rate and the cost of lookup
         const size_t VERTEX_CACHE_SIZE = 32;
         std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
-        std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache;
-        Shader::OutputVertex output_vertex;
+        std::array<Shader::AttributeBuffer, VERTEX_CACHE_SIZE> vertex_cache;
+        Shader::AttributeBuffer vs_output;
 
         unsigned int vertex_cache_pos = 0;
         vertex_cache_ids.fill(-1);
@@ -332,6 +331,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
         shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
 
+        g_state.geometry_pipeline.Reconfigure();
+        g_state.geometry_pipeline.Setup(shader_engine);
+        if (g_state.geometry_pipeline.NeedIndexInput())
+            ASSERT(is_indexed);
+
         for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
             // Indexed rendering doesn't use the start offset
             unsigned int vertex =
@@ -345,6 +349,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
             bool vertex_cache_hit = false;
 
             if (is_indexed) {
+                if (g_state.geometry_pipeline.NeedIndexInput()) {
+                    g_state.geometry_pipeline.SubmitIndex(vertex);
+                    continue;
+                }
+
                 if (g_debug_context && Pica::g_debug_context->recorder) {
                     int size = index_u16 ? 2 : 1;
                     memory_accesses.AddAccess(base_address + index_info.offset + size * index,
@@ -353,7 +362,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                 for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
                     if (vertex == vertex_cache_ids[i]) {
-                        output_vertex = vertex_cache[i];
+                        vs_output = vertex_cache[i];
                         vertex_cache_hit = true;
                         break;
                     }
@@ -362,7 +371,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             if (!vertex_cache_hit) {
                 // Initialize data for the current vertex
-                Shader::AttributeBuffer input, output{};
+                Shader::AttributeBuffer input;
                 loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
 
                 // Send to vertex shader
@@ -371,26 +380,17 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                                              (void*)&input);
                 shader_unit.LoadInput(regs.vs, input);
                 shader_engine->Run(g_state.vs, shader_unit);
-                shader_unit.WriteOutput(regs.vs, output);
-
-                // Retrieve vertex from register data
-                output_vertex = Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output);
+                shader_unit.WriteOutput(regs.vs, vs_output);
 
                 if (is_indexed) {
-                    vertex_cache[vertex_cache_pos] = output_vertex;
+                    vertex_cache[vertex_cache_pos] = vs_output;
                     vertex_cache_ids[vertex_cache_pos] = vertex;
                     vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
                 }
             }
 
-            // Send to renderer
-            using Pica::Shader::OutputVertex;
-            auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
-                                  const OutputVertex& v2) {
-                VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
-            };
-
-            primitive_assembler.SubmitVertex(output_vertex, AddTriangle);
+            // Send to geometry pipeline
+            g_state.geometry_pipeline.SubmitVertex(vs_output);
         }
 
         for (auto& range : memory_accesses.ranges) {
diff --git a/src/video_core/geometry_pipeline.cpp b/src/video_core/geometry_pipeline.cpp
new file mode 100644
index 000000000..b146e2ecb
--- /dev/null
+++ b/src/video_core/geometry_pipeline.cpp
@@ -0,0 +1,274 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/geometry_pipeline.h"
+#include "video_core/pica_state.h"
+#include "video_core/regs.h"
+#include "video_core/renderer_base.h"
+#include "video_core/video_core.h"
+
+namespace Pica {
+
+/// An attribute buffering interface for different pipeline modes
+class GeometryPipelineBackend {
+public:
+    virtual ~GeometryPipelineBackend() = default;
+
+    /// Checks if there is no incomplete data transfer
+    virtual bool IsEmpty() const = 0;
+
+    /// Checks if the pipeline needs a direct input from index buffer
+    virtual bool NeedIndexInput() const = 0;
+
+    /// Submits an index from index buffer
+    virtual void SubmitIndex(unsigned int val) = 0;
+
+    /**
+     * Submits vertex attributes
+     * @param input attributes of a vertex output from vertex shader
+     * @return if the buffer is full and the geometry shader should be invoked
+     */
+    virtual bool SubmitVertex(const Shader::AttributeBuffer& input) = 0;
+};
+
+// In Point mode, vertex attributes are sent to the input registers of the geometry shader unit.
+// The sizes of the vertex shader outputs and the geometry shader inputs are constant. The geometry
+// shader is invoked once its input buffer has been filled by vertex shader outputs. For example,
+// if the geometry shader takes 6 inputs and the vertex shader outputs 2 attributes, it takes 3
+// vertices for one geometry shader invocation.
+// TODO: what happens when the input size is not divisible by the output size?
+class GeometryPipeline_Point : public GeometryPipelineBackend {
+public:
+    GeometryPipeline_Point(const Regs& regs, Shader::GSUnitState& unit) : regs(regs), unit(unit) {
+        ASSERT(regs.pipeline.variable_primitive == 0);
+        ASSERT(regs.gs.input_to_uniform == 0);
+        vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
+        size_t gs_input_num = regs.gs.max_input_attribute_index + 1;
+        ASSERT(gs_input_num % vs_output_num == 0);
+        buffer_cur = attribute_buffer.attr;
+        buffer_end = attribute_buffer.attr + gs_input_num;
+    }
+
+    bool IsEmpty() const override {
+        return buffer_cur == attribute_buffer.attr;
+    }
+
+    bool NeedIndexInput() const override {
+        return false;
+    }
+
+    void SubmitIndex(unsigned int val) override {
+        UNREACHABLE();
+    }
+
+    bool SubmitVertex(const Shader::AttributeBuffer& input) override {
+        buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
+        if (buffer_cur == buffer_end) {
+            buffer_cur = attribute_buffer.attr;
+            unit.LoadInput(regs.gs, attribute_buffer);
+            return true;
+        }
+        return false;
+    }
+
+private:
+    const Regs& regs;
+    Shader::GSUnitState& unit;
+    Shader::AttributeBuffer attribute_buffer;
+    Math::Vec4<float24>* buffer_cur;
+    Math::Vec4<float24>* buffer_end;
+    unsigned int vs_output_num;
+};
+
+// In VariablePrimitive mode, vertex attributes are buffered into the uniform registers of the
+// geometry shader unit. The number of vertices is variable and is specified by the first index
+// value in the batch. This mode is usually used for subdivision.
+class GeometryPipeline_VariablePrimitive : public GeometryPipelineBackend {
+public:
+    GeometryPipeline_VariablePrimitive(const Regs& regs, Shader::ShaderSetup& setup)
+        : regs(regs), setup(setup) {
+        ASSERT(regs.pipeline.variable_primitive == 1);
+        ASSERT(regs.gs.input_to_uniform == 1);
+        vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
+    }
+
+    bool IsEmpty() const override {
+        return need_index;
+    }
+
+    bool NeedIndexInput() const override {
+        return need_index;
+    }
+
+    void SubmitIndex(unsigned int val) override {
+        DEBUG_ASSERT(need_index);
+
+        // The number of input vertices is written to the first uniform register
+        float24 vertex_num = float24::FromFloat32(val);
+        setup.uniforms.f[0] = Math::MakeVec(vertex_num, vertex_num, vertex_num, vertex_num);
+
+        // The second uniform register and so on are used for receiving input vertices
+        buffer_cur = setup.uniforms.f + 1;
+
+        main_vertex_num = regs.pipeline.variable_vertex_main_num_minus_1 + 1;
+        total_vertex_num = val;
+        need_index = false;
+    }
+
+    bool SubmitVertex(const Shader::AttributeBuffer& input) override {
+        DEBUG_ASSERT(!need_index);
+        if (main_vertex_num != 0) {
+            // For main vertices, receive all attributes
+            buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
+            --main_vertex_num;
+        } else {
+            // For other vertices, only receive the first attribute (usually the position)
+            *(buffer_cur++) = input.attr[0];
+        }
+        --total_vertex_num;
+
+        if (total_vertex_num == 0) {
+            need_index = true;
+            return true;
+        }
+
+        return false;
+    }
+
+private:
+    bool need_index = true;
+    const Regs& regs;
+    Shader::ShaderSetup& setup;
+    unsigned int main_vertex_num;
+    unsigned int total_vertex_num;
+    Math::Vec4<float24>* buffer_cur;
+    unsigned int vs_output_num;
+};
+
+// In FixedPrimitive mode, vertex attributes are buffered into the uniform registers of the geometry
+// shader unit. The number of vertices per shader invocation is constant. This is usually used for
+// particle systems.
+class GeometryPipeline_FixedPrimitive : public GeometryPipelineBackend {
+public:
+    GeometryPipeline_FixedPrimitive(const Regs& regs, Shader::ShaderSetup& setup)
+        : regs(regs), setup(setup) {
+        ASSERT(regs.pipeline.variable_primitive == 0);
+        ASSERT(regs.gs.input_to_uniform == 1);
+        vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
+        ASSERT(vs_output_num == regs.pipeline.gs_config.stride_minus_1 + 1);
+        size_t vertex_num = regs.pipeline.gs_config.fixed_vertex_num_minus_1 + 1;
+        buffer_cur = buffer_begin = setup.uniforms.f + regs.pipeline.gs_config.start_index;
+        buffer_end = buffer_begin + vs_output_num * vertex_num;
+    }
+
+    bool IsEmpty() const override {
+        return buffer_cur == buffer_begin;
+    }
+
+    bool NeedIndexInput() const override {
+        return false;
+    }
+
+    void SubmitIndex(unsigned int val) override {
+        UNREACHABLE();
+    }
+
+    bool SubmitVertex(const Shader::AttributeBuffer& input) override {
+        buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
+        if (buffer_cur == buffer_end) {
+            buffer_cur = buffer_begin;
+            return true;
+        }
+        return false;
+    }
+
+private:
+    const Regs& regs;
+    Shader::ShaderSetup& setup;
+    Math::Vec4<float24>* buffer_begin;
+    Math::Vec4<float24>* buffer_cur;
+    Math::Vec4<float24>* buffer_end;
+    unsigned int vs_output_num;
+};
+
+GeometryPipeline::GeometryPipeline(State& state) : state(state) {}
+
+GeometryPipeline::~GeometryPipeline() = default;
+
+void GeometryPipeline::SetVertexHandler(Shader::VertexHandler vertex_handler) {
+    this->vertex_handler = vertex_handler;
+}
+
+void GeometryPipeline::Setup(Shader::ShaderEngine* shader_engine) {
+    if (!backend)
+        return;
+
+    this->shader_engine = shader_engine;
+    shader_engine->SetupBatch(state.gs, state.regs.gs.main_offset);
+}
+
+void GeometryPipeline::Reconfigure() {
+    ASSERT(!backend || backend->IsEmpty());
+
+    if (state.regs.pipeline.use_gs == PipelineRegs::UseGS::No) {
+        backend = nullptr;
+        return;
+    }
+
+    ASSERT(state.regs.pipeline.use_gs == PipelineRegs::UseGS::Yes);
+
+    // The following assumes that when geometry shader is in use, the shader unit 3 is configured as
+    // a geometry shader unit.
+    // TODO: what happens if this is not true?
+    ASSERT(state.regs.pipeline.gs_unit_exclusive_configuration == 1);
+    ASSERT(state.regs.gs.shader_mode == ShaderRegs::ShaderMode::GS);
+
+    state.gs_unit.ConfigOutput(state.regs.gs);
+
+    ASSERT(state.regs.pipeline.vs_outmap_total_minus_1_a ==
+           state.regs.pipeline.vs_outmap_total_minus_1_b);
+
+    switch (state.regs.pipeline.gs_config.mode) {
+    case PipelineRegs::GSMode::Point:
+        backend = std::make_unique<GeometryPipeline_Point>(state.regs, state.gs_unit);
+        break;
+    case PipelineRegs::GSMode::VariablePrimitive:
+        backend = std::make_unique<GeometryPipeline_VariablePrimitive>(state.regs, state.gs);
+        break;
+    case PipelineRegs::GSMode::FixedPrimitive:
+        backend = std::make_unique<GeometryPipeline_FixedPrimitive>(state.regs, state.gs);
+        break;
+    default:
+        UNREACHABLE();
+    }
+}
+
+bool GeometryPipeline::NeedIndexInput() const {
+    if (!backend)
+        return false;
+    return backend->NeedIndexInput();
+}
+
+void GeometryPipeline::SubmitIndex(unsigned int val) {
+    backend->SubmitIndex(val);
+}
+
+void GeometryPipeline::SubmitVertex(const Shader::AttributeBuffer& input) {
+    if (!backend) {
+        // No backend means the geometry shader is disabled, so we send the vertex shader output
+        // directly to the primitive assembler.
+        vertex_handler(input);
+    } else {
+        if (backend->SubmitVertex(input)) {
+            shader_engine->Run(state.gs, state.gs_unit);
+
+            // The uniform b15 is set to true after every geometry shader invocation. This is useful
+            // for the shader to know if this is the first invocation in a batch, if the program set
+            // b15 to false first.
+            state.gs.uniforms.b[15] = true;
+        }
+    }
+}
+
+} // namespace Pica
diff --git a/src/video_core/geometry_pipeline.h b/src/video_core/geometry_pipeline.h
new file mode 100644
index 000000000..91fdd3192
--- /dev/null
+++ b/src/video_core/geometry_pipeline.h
@@ -0,0 +1,49 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include "video_core/shader/shader.h"
+
+namespace Pica {
+
+struct State;
+
+class GeometryPipelineBackend;
+
+/// A pipeline receiving from vertex shader and sending to geometry shader and primitive assembler
+class GeometryPipeline {
+public:
+    explicit GeometryPipeline(State& state);
+    ~GeometryPipeline();
+
+    /// Sets the handler for receiving vertex outputs from vertex shader
+    void SetVertexHandler(Shader::VertexHandler vertex_handler);
+
+    /**
+     * Sets up the geometry shader unit if it is in use
+     * @param shader_engine the shader engine for the geometry shader to run
+     */
+    void Setup(Shader::ShaderEngine* shader_engine);
+
+    /// Reconfigures the pipeline according to current register settings
+    void Reconfigure();
+
+    /// Checks if the pipeline needs a direct input from index buffer
+    bool NeedIndexInput() const;
+
+    /// Submits an index from index buffer. Call this only when NeedIndexInput returns true
+    void SubmitIndex(unsigned int val);
+
+    /// Submits vertex attributes output from vertex shader
+    void SubmitVertex(const Shader::AttributeBuffer& input);
+
+private:
+    Shader::VertexHandler vertex_handler;
+    Shader::ShaderEngine* shader_engine = nullptr; // Only assigned by Setup() when a backend exists
+    std::unique_ptr<GeometryPipelineBackend> backend;
+    State& state;
+};
+} // namespace Pica
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index b95148a6a..218e06883 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -3,9 +3,11 @@
 // Refer to the license.txt file included.
 
 #include <cstring>
+#include "video_core/geometry_pipeline.h"
 #include "video_core/pica.h"
 #include "video_core/pica_state.h"
-#include "video_core/regs_pipeline.h"
+#include "video_core/renderer_base.h"
+#include "video_core/video_core.h"
 
 namespace Pica {
 
@@ -24,6 +26,23 @@ void Zero(T& o) {
     memset(&o, 0, sizeof(o));
 }
 
+State::State() : geometry_pipeline(*this) {
+    auto SubmitVertex = [this](const Shader::AttributeBuffer& vertex) {
+        using Pica::Shader::OutputVertex;
+        auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
+                              const OutputVertex& v2) {
+            VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
+        };
+        primitive_assembler.SubmitVertex(
+            Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, vertex), AddTriangle);
+    };
+    // Lets the geometry shader emitter request inverted vertex order for the next triangle
+    auto SetWinding = [this]() { primitive_assembler.SetWinding(); };
+    // Install on this instance's own units so the captured `this` always matches the handler owner
+    gs_unit.SetVertexHandler(SubmitVertex, SetWinding);
+    geometry_pipeline.SetVertexHandler(SubmitVertex);
+}
+
 void State::Reset() {
     Zero(regs);
     Zero(vs);
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index 864a2c9e6..c6634a0bc 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -8,6 +8,7 @@
 #include "common/bit_field.h"
 #include "common/common_types.h"
 #include "common/vector_math.h"
+#include "video_core/geometry_pipeline.h"
 #include "video_core/primitive_assembly.h"
 #include "video_core/regs.h"
 #include "video_core/shader/shader.h"
@@ -16,6 +17,7 @@ namespace Pica {
 
 /// Struct used to describe current Pica state
 struct State {
+    State();
     void Reset();
 
     /// Pica registers
@@ -137,8 +139,17 @@ struct State {
         Shader::AttributeBuffer input_vertex;
         // Index of the next attribute to be loaded into `input_vertex`.
         u32 current_attribute = 0;
+        // Indicates the immediate mode just started and the geometry pipeline needs to reconfigure
+        bool reset_geometry_pipeline = true;
     } immediate;
 
+    // The geometry shader unit needs to be kept in the global state because some shaders rely on
+    // preserved register values across shader invocations.
+    // TODO: also bring the three vertex shader units here and implement the shader scheduler.
+    Shader::GSUnitState gs_unit;
+
+    GeometryPipeline geometry_pipeline;
+
     // This is constructed with a dummy triangle topology
     PrimitiveAssembler<Shader::OutputVertex> primitive_assembler;
 };
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index acd2ac5e2..9c3dd4cab 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -17,15 +17,18 @@ template <typename VertexType>
 void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
                                                   TriangleHandler triangle_handler) {
     switch (topology) {
-    // TODO: Figure out what's different with TriangleTopology::Shader.
     case PipelineRegs::TriangleTopology::List:
     case PipelineRegs::TriangleTopology::Shader:
         if (buffer_index < 2) {
             buffer[buffer_index++] = vtx;
         } else {
             buffer_index = 0;
-
-            triangle_handler(buffer[0], buffer[1], vtx);
+            if (topology == PipelineRegs::TriangleTopology::Shader && winding) {
+                triangle_handler(buffer[1], buffer[0], vtx);
+                winding = false;
+            } else {
+                triangle_handler(buffer[0], buffer[1], vtx);
+            }
         }
         break;
 
@@ -51,9 +54,15 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
 }
 
 template <typename VertexType>
+void PrimitiveAssembler<VertexType>::SetWinding() {
+    winding = true;
+}
+
+template <typename VertexType>
 void PrimitiveAssembler<VertexType>::Reset() {
     buffer_index = 0;
     strip_ready = false;
+    winding = false;
 }
 
 template <typename VertexType>
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
index e8eccdf27..12de8e3b9 100644
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@@ -30,6 +30,12 @@ struct PrimitiveAssembler {
     void SubmitVertex(const VertexType& vtx, TriangleHandler triangle_handler);
 
     /**
+     * Invert the vertex order of the next triangle. Called by geometry shader emitter.
+     * This only takes effect for TriangleTopology::Shader.
+     */
+    void SetWinding();
+
+    /**
      * Resets the internal state of the PrimitiveAssembler.
      */
     void Reset();
@@ -45,6 +51,7 @@ private:
     int buffer_index;
     VertexType buffer[2];
     bool strip_ready = false;
+    bool winding = false;
 };
 
 } // namespace
diff --git a/src/video_core/regs_pipeline.h b/src/video_core/regs_pipeline.h
index 8b6369297..e78c3e331 100644
--- a/src/video_core/regs_pipeline.h
+++ b/src/video_core/regs_pipeline.h
@@ -147,7 +147,15 @@ struct PipelineRegs {
     // Number of vertices to render
     u32 num_vertices;
 
-    INSERT_PADDING_WORDS(0x1);
+    enum class UseGS : u32 {
+        No = 0,
+        Yes = 2,
+    };
+
+    union {
+        BitField<0, 2, UseGS> use_gs;
+        BitField<31, 1, u32> variable_primitive;
+    };
 
     // The index of the first vertex to render
     u32 vertex_offset;
@@ -218,7 +226,29 @@ struct PipelineRegs {
 
     GPUMode gpu_mode;
 
-    INSERT_PADDING_WORDS(0x18);
+    INSERT_PADDING_WORDS(0x4);
+    BitField<0, 4, u32> vs_outmap_total_minus_1_a;
+    INSERT_PADDING_WORDS(0x6);
+    BitField<0, 4, u32> vs_outmap_total_minus_1_b;
+
+    enum class GSMode : u32 {
+        Point = 0,
+        VariablePrimitive = 1,
+        FixedPrimitive = 2,
+    };
+
+    union {
+        BitField<0, 8, GSMode> mode;
+        BitField<8, 4, u32> fixed_vertex_num_minus_1;
+        BitField<12, 4, u32> stride_minus_1;
+        BitField<16, 4, u32> start_index;
+    } gs_config;
+
+    INSERT_PADDING_WORDS(0x1);
+
+    u32 variable_vertex_main_num_minus_1;
+
+    INSERT_PADDING_WORDS(0x9);
 
     enum class TriangleTopology : u32 {
         List = 0,
diff --git a/src/video_core/regs_rasterizer.h b/src/video_core/regs_rasterizer.h
index 2874fd127..4fef00d76 100644
--- a/src/video_core/regs_rasterizer.h
+++ b/src/video_core/regs_rasterizer.h
@@ -5,10 +5,10 @@
 #pragma once
 
 #include <array>
-
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "video_core/pica_types.h"
 
 namespace Pica {
 
@@ -31,7 +31,17 @@ struct RasterizerRegs {
 
     BitField<0, 24, u32> viewport_size_y;
 
-    INSERT_PADDING_WORDS(0x9);
+    INSERT_PADDING_WORDS(0x3);
+
+    BitField<0, 1, u32> clip_enable;
+    BitField<0, 24, u32> clip_coef[4]; // float24
+
+    Math::Vec4<float24> GetClipCoef() const {
+        return {float24::FromRaw(clip_coef[0]), float24::FromRaw(clip_coef[1]),
+                float24::FromRaw(clip_coef[2]), float24::FromRaw(clip_coef[3])};
+    }
+
+    INSERT_PADDING_WORDS(0x1);
 
     BitField<0, 24, u32> viewport_depth_range;      // float24
     BitField<0, 24, u32> viewport_depth_near_plane; // float24
diff --git a/src/video_core/regs_shader.h b/src/video_core/regs_shader.h
index ddb1ee451..c15d4d162 100644
--- a/src/video_core/regs_shader.h
+++ b/src/video_core/regs_shader.h
@@ -24,9 +24,16 @@ struct ShaderRegs {
 
     INSERT_PADDING_WORDS(0x4);
 
+    enum ShaderMode {
+        GS = 0x08,
+        VS = 0xA0,
+    };
+
     union {
         // Number of input attributes to shader unit - 1
         BitField<0, 4, u32> max_input_attribute_index;
+        BitField<8, 8, u32> input_to_uniform;
+        BitField<24, 8, ShaderMode> shader_mode;
     };
 
     // Offset to shader program entry point (in words)
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index aa95ef21d..7b0cd1b66 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -169,6 +169,8 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
     glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, proctex_diff_lut_buffer.handle);
 
     // Sync fixed function OpenGL state
+    SyncClipEnabled();
+    SyncClipCoef();
     SyncCullMode();
     SyncBlendEnabled();
     SyncBlendFuncs();
@@ -401,6 +403,18 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
         SyncCullMode();
         break;
 
+    // Clipping plane
+    case PICA_REG_INDEX(rasterizer.clip_enable):
+        SyncClipEnabled();
+        break;
+
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[0], 0x48):
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[1], 0x49):
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[2], 0x4a):
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[3], 0x4b):
+        SyncClipCoef();
+        break;
+
     // Depth modifiers
     case PICA_REG_INDEX(rasterizer.viewport_depth_range):
         SyncDepthScale();
@@ -1280,6 +1294,20 @@ void RasterizerOpenGL::SetShader() {
     }
 }
 
+void RasterizerOpenGL::SyncClipEnabled() {
+    state.clip_distance[1] = Pica::g_state.regs.rasterizer.clip_enable != 0;
+}
+
+void RasterizerOpenGL::SyncClipCoef() {
+    const auto raw_clip_coef = Pica::g_state.regs.rasterizer.GetClipCoef();
+    const GLvec4 new_clip_coef = {raw_clip_coef.x.ToFloat32(), raw_clip_coef.y.ToFloat32(),
+                                  raw_clip_coef.z.ToFloat32(), raw_clip_coef.w.ToFloat32()};
+    if (new_clip_coef != uniform_block_data.data.clip_coef) {
+        uniform_block_data.data.clip_coef = new_clip_coef;
+        uniform_block_data.dirty = true;
+    }
+}
+
 void RasterizerOpenGL::SyncCullMode() {
     const auto& regs = Pica::g_state.regs;
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 78e218efe..46c62961c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -151,14 +151,21 @@ private:
         LightSrc light_src[8];
         alignas(16) GLvec4 const_color[6]; // A vec4 color for each of the six tev stages
         alignas(16) GLvec4 tev_combiner_buffer_color;
+        alignas(16) GLvec4 clip_coef;
     };
 
     static_assert(
-        sizeof(UniformData) == 0x460,
+        sizeof(UniformData) == 0x470,
         "The size of the UniformData structure has changed, update the structure in the shader");
     static_assert(sizeof(UniformData) < 16384,
                   "UniformData structure must be less than 16kb as per the OpenGL spec");
 
+    /// Syncs the clip enabled status to match the PICA register
+    void SyncClipEnabled();
+
+    /// Syncs the clip coefficients to match the PICA register
+    void SyncClipCoef();
+
     /// Sets the OpenGL shader in accordance with the current PICA register state
     void SetShader();
 
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 015e69da9..9fe183944 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -8,6 +8,7 @@
 #include "common/assert.h"
 #include "common/bit_field.h"
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "video_core/regs_framebuffer.h"
 #include "video_core/regs_lighting.h"
 #include "video_core/regs_rasterizer.h"
@@ -24,6 +25,42 @@ using TevStageConfig = TexturingRegs::TevStageConfig;
 
 namespace GLShader {
 
+static const std::string UniformBlockDef = R"(
+#define NUM_TEV_STAGES 6
+#define NUM_LIGHTS 8
+
+struct LightSrc {
+    vec3 specular_0;
+    vec3 specular_1;
+    vec3 diffuse;
+    vec3 ambient;
+    vec3 position;
+    vec3 spot_direction;
+    float dist_atten_bias;
+    float dist_atten_scale;
+};
+
+layout (std140) uniform shader_data {
+    vec2 framebuffer_scale;
+    int alphatest_ref;
+    float depth_scale;
+    float depth_offset;
+    int scissor_x1;
+    int scissor_y1;
+    int scissor_x2;
+    int scissor_y2;
+    vec3 fog_color;
+    vec2 proctex_noise_f;
+    vec2 proctex_noise_a;
+    vec2 proctex_noise_p;
+    vec3 lighting_global_ambient;
+    LightSrc light_src[NUM_LIGHTS];
+    vec4 const_color[NUM_TEV_STAGES];
+    vec4 tev_combiner_buffer_color;
+    vec4 clip_coef;
+};
+)";
+
 PicaShaderConfig PicaShaderConfig::BuildFromRegs(const Pica::Regs& regs) {
     PicaShaderConfig res;
 
@@ -594,8 +631,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
                 // Note: even if the normal vector is modified by normal map, which is not the
                 // normal of the tangent plane anymore, the half angle vector is still projected
                 // using the modified normal vector.
-                std::string half_angle_proj = "normalize(half_vector) - normal / dot(normal, "
-                                              "normal) * dot(normal, normalize(half_vector))";
+                std::string half_angle_proj =
+                    "normalize(half_vector) - normal * dot(normal, normalize(half_vector))";
                 // Note: the half angle vector projection is confirmed not normalized before the dot
                 // product. The result is in fact not cos(phi) as the name suggested.
                 index = "dot(" + half_angle_proj + ", tangent)";
@@ -750,7 +787,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
         }
 
         // Fresnel
-        if (lighting.lut_fr.enable &&
+        // Note: only the last entry in the light slots applies the Fresnel factor
+        if (light_index == lighting.src_num - 1 && lighting.lut_fr.enable &&
             LightingRegs::IsLightingSamplerSupported(lighting.config,
                                                      LightingRegs::LightingSampler::Fresnel)) {
             // Lookup fresnel LUT value
@@ -759,17 +797,17 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
                             lighting.lut_fr.type, lighting.lut_fr.abs_input);
             value = "(" + std::to_string(lighting.lut_fr.scale) + " * " + value + ")";
 
-            // Enabled for difffuse lighting alpha component
+            // Enabled for diffuse lighting alpha component
             if (lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
                 lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                out += "diffuse_sum.a  *= " + value + ";\n";
+                out += "diffuse_sum.a = " + value + ";\n";
             }
 
             // Enabled for the specular lighting alpha component
             if (lighting.fresnel_selector ==
                     LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
                 lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                out += "specular_sum.a *= " + value + ";\n";
+                out += "specular_sum.a = " + value + ";\n";
             }
         }
 
@@ -1008,8 +1046,6 @@ std::string GenerateFragmentShader(const PicaShaderConfig& config) {
 
     std::string out = R"(
 #version 330 core
-#define NUM_TEV_STAGES 6
-#define NUM_LIGHTS 8
 
 in vec4 primary_color;
 in vec2 texcoord[3];
@@ -1021,36 +1057,6 @@ in vec4 gl_FragCoord;
 
 out vec4 color;
 
-struct LightSrc {
-    vec3 specular_0;
-    vec3 specular_1;
-    vec3 diffuse;
-    vec3 ambient;
-    vec3 position;
-    vec3 spot_direction;
-    float dist_atten_bias;
-    float dist_atten_scale;
-};
-
-layout (std140) uniform shader_data {
-    vec2 framebuffer_scale;
-    int alphatest_ref;
-    float depth_scale;
-    float depth_offset;
-    int scissor_x1;
-    int scissor_y1;
-    int scissor_x2;
-    int scissor_y2;
-    vec3 fog_color;
-    vec2 proctex_noise_f;
-    vec2 proctex_noise_a;
-    vec2 proctex_noise_p;
-    vec3 lighting_global_ambient;
-    LightSrc light_src[NUM_LIGHTS];
-    vec4 const_color[NUM_TEV_STAGES];
-    vec4 tev_combiner_buffer_color;
-};
-
 uniform sampler2D tex[3];
 uniform samplerBuffer lighting_lut;
 uniform samplerBuffer fog_lut;
@@ -1059,7 +1065,11 @@ uniform samplerBuffer proctex_color_map;
 uniform samplerBuffer proctex_alpha_map;
 uniform samplerBuffer proctex_lut;
 uniform samplerBuffer proctex_diff_lut;
+)";
+
+    out += UniformBlockDef;
 
+    out += R"(
 // Rotate the vector v by the quaternion q
 vec3 quaternion_rotate(vec4 q, vec3 v) {
     return v + 2.0 * cross(q.xyz, cross(q.xyz, v) + q.w * v);
@@ -1155,6 +1165,11 @@ vec4 secondary_fragment_color = vec4(0.0);
 
         // Blend the fog
         out += "last_tex_env_out.rgb = mix(fog_color.rgb, last_tex_env_out.rgb, fog_factor);\n";
+    } else if (state.fog_mode == TexturingRegs::FogMode::Gas) {
+        Core::Telemetry().AddField(Telemetry::FieldType::Session, "VideoCore_Pica_UseGasMode",
+                                   true);
+        LOG_CRITICAL(Render_OpenGL, "Unimplemented gas mode");
+        UNIMPLEMENTED();
     }
 
     out += "gl_FragDepth = depth;\n";
@@ -1190,6 +1205,12 @@ out float texcoord0_w;
 out vec4 normquat;
 out vec3 view;
 
+)";
+
+    out += UniformBlockDef;
+
+    out += R"(
+
 void main() {
     primary_color = vert_color;
     texcoord[0] = vert_texcoord0;
@@ -1200,7 +1221,7 @@ void main() {
     view = vert_view;
     gl_Position = vert_position;
     gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0
-    // TODO (wwylele): calculate gl_ClipDistance[1] from user-defined clipping plane
+    gl_ClipDistance[1] = dot(clip_coef, vert_position);
 }
 )";
 
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 67ed19ba8..e9063e616 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -21,7 +21,8 @@ namespace Pica {
 
 namespace Shader {
 
-OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, AttributeBuffer& input) {
+OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
+                                               const AttributeBuffer& input) {
     // Setup output data
     union {
         OutputVertex ret{};
@@ -82,6 +83,44 @@ void UnitState::WriteOutput(const ShaderRegs& config, AttributeBuffer& output) {
     }
 }
 
+UnitState::UnitState(GSEmitter* emitter) : emitter_ptr(emitter) {}
+
+GSEmitter::GSEmitter() {
+    handlers = new Handlers;
+}
+
+GSEmitter::~GSEmitter() {
+    delete handlers;
+}
+
+void GSEmitter::Emit(Math::Vec4<float24> (&vertex)[16]) {
+    ASSERT(vertex_id < 3);
+    std::copy(std::begin(vertex), std::end(vertex), buffer[vertex_id].begin());
+    if (prim_emit) {
+        if (winding)
+            handlers->winding_setter();
+        for (size_t i = 0; i < buffer.size(); ++i) {
+            AttributeBuffer output;
+            unsigned int output_i = 0;
+            for (unsigned int reg : Common::BitSet<u32>(output_mask)) {
+                output.attr[output_i++] = buffer[i][reg];
+            }
+            handlers->vertex_handler(output);
+        }
+    }
+}
+
+GSUnitState::GSUnitState() : UnitState(&emitter) {}
+
+void GSUnitState::SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter) {
+    emitter.handlers->vertex_handler = std::move(vertex_handler);
+    emitter.handlers->winding_setter = std::move(winding_setter);
+}
+
+void GSUnitState::ConfigOutput(const ShaderRegs& config) {
+    emitter.output_mask = config.output_mask;
+}
+
 MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
 
 #ifdef ARCHITECTURE_x86_64
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index e156f6aef..a3789da01 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <cstddef>
+#include <functional>
 #include <type_traits>
 #include <nihstro/shader_bytecode.h>
 #include "common/assert.h"
@@ -31,6 +32,12 @@ struct AttributeBuffer {
     alignas(16) Math::Vec4<float24> attr[16];
 };
 
+/// Handler type for receiving vertex outputs from a vertex shader or a geometry shader
+using VertexHandler = std::function<void(const AttributeBuffer&)>;
+
+/// Handler type for signaling that the vertex order of the next triangle should be inverted
+using WindingSetter = std::function<void()>;
+
 struct OutputVertex {
     Math::Vec4<float24> pos;
     Math::Vec4<float24> quat;
@@ -43,7 +50,8 @@ struct OutputVertex {
     INSERT_PADDING_WORDS(1);
     Math::Vec2<float24> tc2;
 
-    static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, AttributeBuffer& output);
+    static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
+                                            const AttributeBuffer& output);
 };
 #define ASSERT_POS(var, pos)                                                                       \
     static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong "       \
@@ -61,12 +69,36 @@ static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size");
 
 /**
+ * This structure contains state information for primitive emitting in geometry shader.
+ */
+struct GSEmitter {
+    std::array<std::array<Math::Vec4<float24>, 16>, 3> buffer; ///< Three slots, indexed by vertex_id
+    u8 vertex_id = 0;       ///< Slot written by the next Emit(); must be < 3
+    bool prim_emit = false; ///< When set, Emit() passes every buffered vertex to vertex_handler
+    bool winding = false;   ///< With prim_emit set, Emit() calls winding_setter first
+    u32 output_mask = 0;    ///< One bit per register that Emit() forwards to vertex_handler
+
+    // Function objects are hidden behind a raw pointer to make the structure standard layout type,
+    // for JIT to use offsetof to access other members.
+    struct Handlers {
+        VertexHandler vertex_handler;
+        WindingSetter winding_setter;
+    } * handlers;
+
+    GSEmitter();  // allocates handlers
+    ~GSEmitter(); // frees handlers; NOTE(review): an implicit copy would delete it twice
+    void Emit(Math::Vec4<float24> (&vertex)[16]);
+};
+static_assert(std::is_standard_layout<GSEmitter>::value, "GSEmitter is not standard layout type");
+
+/**
  * This structure contains the state information that needs to be unique for a shader unit. The 3DS
  * has four shader units that process shaders in parallel. At the present, Citra only implements a
  * single shader unit that processes all shaders serially. Putting the state information in a struct
  * here will make it easier for us to parallelize the shader processing later.
  */
 struct UnitState {
+    explicit UnitState(GSEmitter* emitter = nullptr);
     struct Registers {
         // The registers are accessed by the shader JIT using SSE instructions, and are therefore
         // required to be 16-byte aligned.
@@ -82,6 +114,8 @@ struct UnitState {
     // TODO: How many bits do these actually have?
     s32 address_registers[3];
 
+    GSEmitter* emitter_ptr;
+
     static size_t InputOffset(const SourceRegister& reg) {
         switch (reg.GetRegisterType()) {
         case RegisterType::Input:
@@ -125,6 +159,19 @@ struct UnitState {
     void WriteOutput(const ShaderRegs& config, AttributeBuffer& output);
 };
 
+/**
+ * Extended shader unit state for the special unit that can run both vertex and geometry shaders.
+ * It owns the primitive emitter that the base class's emitter_ptr points at, plus geometry shader
+ * utilities. NOTE(review): a copied GSUnitState would still point at the original's emitter.
+ */
+struct GSUnitState : public UnitState {
+    GSUnitState();
+    void SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter);
+    void ConfigOutput(const ShaderRegs& config);
+
+    GSEmitter emitter;
+};
+
 struct ShaderSetup {
     struct {
         // The float uniforms are accessed by the shader JIT using SSE instructions, and are
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index 206c0978a..9d4da4904 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -636,6 +636,22 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                 break;
             }
 
+            case OpCode::Id::EMIT: {
+                GSEmitter* emitter = state.emitter_ptr;
+                ASSERT_MSG(emitter, "Execute EMIT on VS");
+                emitter->Emit(state.registers.output);
+                break;
+            }
+
+            case OpCode::Id::SETEMIT: {
+                GSEmitter* emitter = state.emitter_ptr;
+                ASSERT_MSG(emitter, "Execute SETEMIT on VS");
+                emitter->vertex_id = instr.setemit.vertex_id;
+                emitter->prim_emit = instr.setemit.prim_emit != 0;
+                emitter->winding = instr.setemit.winding != 0;
+                break;
+            }
+
             default:
                 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
                           (int)instr.opcode.Value().EffectiveOpCode(),
diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp
index 42a57aab1..1b31623bd 100644
--- a/src/video_core/shader/shader_jit_x64_compiler.cpp
+++ b/src/video_core/shader/shader_jit_x64_compiler.cpp
@@ -75,8 +75,8 @@ const JitFunction instr_table[64] = {
     &JitShader::Compile_IF,    // ifu
     &JitShader::Compile_IF,    // ifc
     &JitShader::Compile_LOOP,  // loop
-    nullptr,                   // emit
-    nullptr,                   // sete
+    &JitShader::Compile_EMIT,  // emit
+    &JitShader::Compile_SETE,  // sete
     &JitShader::Compile_JMP,   // jmpc
     &JitShader::Compile_JMP,   // jmpu
     &JitShader::Compile_CMP,   // cmp
@@ -772,6 +772,51 @@ void JitShader::Compile_JMP(Instruction instr) {
     }
 }
 
+static void Emit(GSEmitter* emitter, Math::Vec4<float24> (*output)[16]) {
+    emitter->Emit(*output); // free-function wrapper so JIT-generated code can call the member
+}
+
+void JitShader::Compile_EMIT(Instruction instr) {
+    Label have_emitter, end;
+    mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]); // rax = state's emitter_ptr
+    test(rax, rax);
+    jnz(have_emitter); // non-null: go call the emitter
+
+    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute EMIT on VS")); // message for LogCritical
+    CallFarFunction(*this, LogCritical);
+    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    jmp(end); // null emitter: nothing is emitted
+
+    L(have_emitter);
+    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(ABI_PARAM1, rax);   // arg 1: the emitter
+    mov(ABI_PARAM2, STATE); // arg 2: address of the output registers (offset added below)
+    add(ABI_PARAM2, static_cast<Xbyak::uint32>(offsetof(UnitState, registers.output)));
+    CallFarFunction(*this, Emit);
+    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    L(end);
+}
+
+void JitShader::Compile_SETE(Instruction instr) {
+    Label have_emitter, end;
+    mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]); // rax = state's emitter_ptr
+    test(rax, rax);
+    jnz(have_emitter); // non-null: go update the emitter
+
+    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute SETEMIT on VS")); // message for LogCritical
+    CallFarFunction(*this, LogCritical);
+    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    jmp(end); // null emitter: nothing is stored
+
+    L(have_emitter); // store the instruction's SETEMIT fields into the emitter, one byte each
+    mov(byte[rax + offsetof(GSEmitter, vertex_id)], instr.setemit.vertex_id);
+    mov(byte[rax + offsetof(GSEmitter, prim_emit)], instr.setemit.prim_emit);
+    mov(byte[rax + offsetof(GSEmitter, winding)], instr.setemit.winding);
+    L(end);
+}
+
 void JitShader::Compile_Block(unsigned end) {
     while (program_counter < end) {
         Compile_NextInstr();
diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h
index 31af0ca48..4aee56b1d 100644
--- a/src/video_core/shader/shader_jit_x64_compiler.h
+++ b/src/video_core/shader/shader_jit_x64_compiler.h
@@ -66,6 +66,8 @@ public:
     void Compile_JMP(Instruction instr);
     void Compile_CMP(Instruction instr);
     void Compile_MAD(Instruction instr);
+    void Compile_EMIT(Instruction instr);
+    void Compile_SETE(Instruction instr);
 
 private:
     void Compile_Block(unsigned end);
diff --git a/src/video_core/swrasterizer/clipper.cpp b/src/video_core/swrasterizer/clipper.cpp
index cdbc71502..a52129eb7 100644
--- a/src/video_core/swrasterizer/clipper.cpp
+++ b/src/video_core/swrasterizer/clipper.cpp
@@ -31,7 +31,7 @@ public:
         : coeffs(coeffs), bias(bias) {}
 
     bool IsInside(const Vertex& vertex) const {
-        return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0);
+        return Math::Dot(vertex.pos + bias, coeffs) >= float24::FromFloat32(0);
     }
 
     bool IsOutSide(const Vertex& vertex) const {
@@ -116,19 +116,18 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
     static const float24 f0 = float24::FromFloat32(0.0);
     static const float24 f1 = float24::FromFloat32(1.0);
     static const std::array<ClippingEdge, 7> clipping_edges = {{
-        {Math::MakeVec(f1, f0, f0, -f1)},                                           // x = +w
-        {Math::MakeVec(-f1, f0, f0, -f1)},                                          // x = -w
-        {Math::MakeVec(f0, f1, f0, -f1)},                                           // y = +w
-        {Math::MakeVec(f0, -f1, f0, -f1)},                                          // y = -w
-        {Math::MakeVec(f0, f0, f1, f0)},                                            // z =  0
-        {Math::MakeVec(f0, f0, -f1, -f1)},                                          // z = -w
-        {Math::MakeVec(f0, f0, f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON
+        {Math::MakeVec(-f1, f0, f0, f1)},                                          // x = +w
+        {Math::MakeVec(f1, f0, f0, f1)},                                           // x = -w
+        {Math::MakeVec(f0, -f1, f0, f1)},                                          // y = +w
+        {Math::MakeVec(f0, f1, f0, f1)},                                           // y = -w
+        {Math::MakeVec(f0, f0, -f1, f0)},                                          // z =  0
+        {Math::MakeVec(f0, f0, f1, f1)},                                           // z = -w
+        {Math::MakeVec(f0, f0, f0, f1), Math::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON
     }};
 
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
-    for (auto edge : clipping_edges) {
-
+    auto Clip = [&](const ClippingEdge& edge) {
         std::swap(input_list, output_list);
         output_list->clear();
 
@@ -147,12 +146,24 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
             }
             reference_vertex = &vertex;
         }
+    };
+
+    for (auto edge : clipping_edges) {
+        Clip(edge);
 
         // Need to have at least a full triangle to continue...
         if (output_list->size() < 3)
             return;
     }
 
+    if (g_state.regs.rasterizer.clip_enable) {
+        ClippingEdge custom_edge{g_state.regs.rasterizer.GetClipCoef()};
+        Clip(custom_edge);
+
+        if (output_list->size() < 3)
+            return;
+    }
+
     InitScreenCoordinates((*output_list)[0]);
     InitScreenCoordinates((*output_list)[1]);
 
diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
index 39a3e396d..5fa748611 100644
--- a/src/video_core/swrasterizer/lighting.cpp
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -22,18 +22,37 @@ static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut
 
 std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
     const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
-    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view,
+    const Math::Vec4<u8> (&texture_color)[4]) {
 
-    // TODO(Subv): Bump mapping
-    Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
+    Math::Vec3<float> surface_normal;
+    Math::Vec3<float> surface_tangent;
 
     if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) {
-        LOG_CRITICAL(HW_GPU, "unimplemented bump mapping");
-        UNIMPLEMENTED();
+        Math::Vec3<float> perturbation =
+            texture_color[lighting.config0.bump_selector].xyz().Cast<float>() / 127.5f -
+            Math::MakeVec(1.0f, 1.0f, 1.0f);
+        if (lighting.config0.bump_mode == LightingRegs::LightingBumpMode::NormalMap) {
+            if (!lighting.config0.disable_bump_renorm) {
+                const float z_square = 1 - perturbation.xy().Length2();
+                perturbation.z = std::sqrt(std::max(z_square, 0.0f));
+            }
+            surface_normal = perturbation;
+            surface_tangent = Math::MakeVec(1.0f, 0.0f, 0.0f);
+        } else if (lighting.config0.bump_mode == LightingRegs::LightingBumpMode::TangentMap) {
+            surface_normal = Math::MakeVec(0.0f, 0.0f, 1.0f);
+            surface_tangent = perturbation;
+        } else {
+            LOG_ERROR(HW_GPU, "Unknown bump mode %u", lighting.config0.bump_mode.Value());
+        }
+    } else {
+        surface_normal = Math::MakeVec(0.0f, 0.0f, 1.0f);
+        surface_tangent = Math::MakeVec(1.0f, 0.0f, 0.0f);
     }
 
     // Use the normalized the quaternion when performing the rotation
     auto normal = Math::QuaternionRotate(normquat, surface_normal);
+    auto tangent = Math::QuaternionRotate(normquat, surface_tangent);
 
     Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
     Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
@@ -102,6 +121,16 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
                 result = Math::Dot(light_vector, spot_dir.Cast<float>() / 2047.0f);
                 break;
             }
+            case LightingRegs::LightingLutInput::CP:
+                if (lighting.config0.config == LightingRegs::LightingConfig::Config7) {
+                    const Math::Vec3<float> norm_half_vector = half_vector.Normalized();
+                    const Math::Vec3<float> half_vector_proj =
+                        norm_half_vector - normal * Math::Dot(normal, norm_half_vector);
+                    result = Math::Dot(half_vector_proj, tangent);
+                } else {
+                    result = 0.0f;
+                }
+                break;
             default:
                 LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
                 UNIMPLEMENTED();
@@ -201,7 +230,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
 
         // Fresnel
-        if (lighting.config1.disable_lut_fr == 0 &&
+        // Note: only the last entry in the light slots applies the Fresnel factor
+        if (light_index == lighting.max_light_index && lighting.config1.disable_lut_fr == 0 &&
             LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
                                                      LightingRegs::LightingSampler::Fresnel)) {
 
@@ -213,14 +243,14 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             if (lighting.config0.fresnel_selector ==
                     LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
                 lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                diffuse_sum.a() *= lut_value;
+                diffuse_sum.a() = lut_value;
             }
 
             // Enabled for the specular lighting alpha component
             if (lighting.config0.fresnel_selector ==
                     LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
                 lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                specular_sum.a() *= lut_value;
+                specular_sum.a() = lut_value;
             }
         }
 
diff --git a/src/video_core/swrasterizer/lighting.h b/src/video_core/swrasterizer/lighting.h
index 438dca926..d807a3d94 100644
--- a/src/video_core/swrasterizer/lighting.h
+++ b/src/video_core/swrasterizer/lighting.h
@@ -13,6 +13,7 @@ namespace Pica {
 
 std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
     const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
-    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view);
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view,
+    const Math::Vec4<u8> (&texture_color)[4]);
 
 } // namespace Pica
diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index fdc1df199..862135614 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -437,8 +437,8 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                     GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
                     GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
                 };
-                std::tie(primary_fragment_color, secondary_fragment_color) =
-                    ComputeFragmentsColors(g_state.regs.lighting, g_state.lighting, normquat, view);
+                std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(
+                    g_state.regs.lighting, g_state.lighting, normquat, view, texture_color);
             }
 
             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();