12 files changed, 439 insertions, 79 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 6036d6ed3..60529323e 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -111,7 +111,9 @@ if (ENABLE_VULKAN)
         renderer_vulkan/vk_resource_manager.cpp
         renderer_vulkan/vk_resource_manager.h
         renderer_vulkan/vk_scheduler.cpp
-        renderer_vulkan/vk_scheduler.h)
+        renderer_vulkan/vk_scheduler.h
+        renderer_vulkan/vk_stream_buffer.cpp
+        renderer_vulkan/vk_stream_buffer.h)
 
     target_include_directories(video_core PRIVATE ../../externals/Vulkan-Headers/include)
     target_compile_definitions(video_core PRIVATE HAS_VULKAN)
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 2d2136067..144e7fa82 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -107,21 +107,23 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
 void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
     auto debug_context = system.GetGPUDebugContext();
 
+    const u32 method = method_call.method;
+
     // It is an error to write to a register other than the current macro's ARG register before it
     // has finished execution.
     if (executing_macro != 0) {
-        ASSERT(method_call.method == executing_macro + 1);
+        ASSERT(method == executing_macro + 1);
     }
 
     // Methods after 0xE00 are special, they're actually triggers for some microcode that was
     // uploaded to the GPU during initialization.
-    if (method_call.method >= MacroRegistersStart) {
+    if (method >= MacroRegistersStart) {
         // We're trying to execute a macro
         if (executing_macro == 0) {
             // A macro call must begin by writing the macro method's register, not its argument.
-            ASSERT_MSG((method_call.method % 2) == 0,
+            ASSERT_MSG((method % 2) == 0,
                        "Can't start macro execution by writing to the ARGS register");
-            executing_macro = method_call.method;
+            executing_macro = method;
         }
 
         macro_params.push_back(method_call.argument);
@@ -133,66 +135,62 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
         return;
     }
 
-    ASSERT_MSG(method_call.method < Regs::NUM_REGS,
+    ASSERT_MSG(method < Regs::NUM_REGS,
                "Invalid Maxwell3D register, increase the size of the Regs structure");
 
     if (debug_context) {
         debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandLoaded, nullptr);
     }
 
-    if (regs.reg_array[method_call.method] != method_call.argument) {
-        regs.reg_array[method_call.method] = method_call.argument;
+    if (regs.reg_array[method] != method_call.argument) {
+        regs.reg_array[method] = method_call.argument;
         // Color buffers
         constexpr u32 first_rt_reg = MAXWELL3D_REG_INDEX(rt);
         constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
-        if (method_call.method >= first_rt_reg &&
-            method_call.method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) {
-            const std::size_t rt_index = (method_call.method - first_rt_reg) / registers_per_rt;
-            dirty_flags.color_buffer |= 1u << static_cast<u32>(rt_index);
+        if (method >= first_rt_reg &&
+            method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) {
+            const std::size_t rt_index = (method - first_rt_reg) / registers_per_rt;
+            dirty_flags.color_buffer.set(rt_index);
         }
 
         // Zeta buffer
         constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
-        if (method_call.method == MAXWELL3D_REG_INDEX(zeta_enable) ||
-            method_call.method == MAXWELL3D_REG_INDEX(zeta_width) ||
-            method_call.method == MAXWELL3D_REG_INDEX(zeta_height) ||
-            (method_call.method >= MAXWELL3D_REG_INDEX(zeta) &&
-             method_call.method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) {
+        if (method == MAXWELL3D_REG_INDEX(zeta_enable) ||
+            method == MAXWELL3D_REG_INDEX(zeta_width) ||
+            method == MAXWELL3D_REG_INDEX(zeta_height) ||
+            (method >= MAXWELL3D_REG_INDEX(zeta) &&
+             method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) {
             dirty_flags.zeta_buffer = true;
         }
 
         // Shader
         constexpr u32 shader_registers_count =
             sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
-        if (method_call.method >= MAXWELL3D_REG_INDEX(shader_config[0]) &&
-            method_call.method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) {
+        if (method >= MAXWELL3D_REG_INDEX(shader_config[0]) &&
+            method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) {
             dirty_flags.shaders = true;
         }
 
         // Vertex format
-        if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) &&
-            method_call.method <
-                MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) {
+        if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) &&
+            method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) {
             dirty_flags.vertex_attrib_format = true;
         }
 
         // Vertex buffer
-        if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_array) &&
-            method_call.method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * 32) {
-            dirty_flags.vertex_array |=
-                1u << ((method_call.method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
-        } else if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
-                   method_call.method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * 32) {
-            dirty_flags.vertex_array |=
-                1u << ((method_call.method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
-        } else if (method_call.method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
-                   method_call.method < MAXWELL3D_REG_INDEX(instanced_arrays) + 32) {
-            dirty_flags.vertex_array |=
-                1u << (method_call.method - MAXWELL3D_REG_INDEX(instanced_arrays));
+        if (method >= MAXWELL3D_REG_INDEX(vertex_array) &&
+            method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * 32) {
+            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
+        } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
+                   method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * 32) {
+            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
+        } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
+                   method < MAXWELL3D_REG_INDEX(instanced_arrays) + 32) {
+            dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays));
         }
     }
 
-    switch (method_call.method) {
+    switch (method) {
     case MAXWELL3D_REG_INDEX(macros.data): {
         ProcessMacroUpload(method_call.argument);
         break;
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 584f51c48..7fbf1026e 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -5,8 +5,10 @@
 #pragma once
 
 #include <array>
+#include <bitset>
 #include <unordered_map>
 #include <vector>
+
 #include "common/assert.h"
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
@@ -1094,19 +1096,18 @@ public:
     MemoryManager& memory_manager;
 
     struct DirtyFlags {
-        u8 color_buffer = 0xFF;
-        bool zeta_buffer = true;
-
-        bool shaders = true;
+        std::bitset<8> color_buffer{0xFF};
+        std::bitset<32> vertex_array{0xFFFFFFFF};
 
         bool vertex_attrib_format = true;
-        u32 vertex_array = 0xFFFFFFFF;
+        bool zeta_buffer = true;
+        bool shaders = true;
 
         void OnMemoryWrite() {
-            color_buffer = 0xFF;
             zeta_buffer = true;
             shaders = true;
-            vertex_array = 0xFFFFFFFF;
+            color_buffer.set();
+            vertex_array.set();
         }
     };
 
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
index bcf0c15a4..a7bcf26fb 100644
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -129,6 +129,15 @@ protected:
         return ++modified_ticks;
     }
 
+    /// Flushes the specified object, updating appropriate cache state as needed
+    void FlushObject(const T& object) {
+        if (!object->IsDirty()) {
+            return;
+        }
+        object->Flush();
+        object->MarkAsModified(false, *this);
+    }
+
 private:
     /// Returns a list of cached objects from the specified memory region, ordered by access time
     std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) {
@@ -154,15 +163,6 @@ private:
         return objects;
     }
 
-    /// Flushes the specified object, updating appropriate cache state as needed
-    void FlushObject(const T& object) {
-        if (!object->IsDirty()) {
-            return;
-        }
-        object->Flush();
-        object->MarkAsModified(false, *this);
-    }
-
     using ObjectSet = std::set<T>;
     using ObjectCache = std::unordered_map<VAddr, T>;
     using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 09fa01d25..c8c1d6911 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -102,8 +102,8 @@ struct FramebufferCacheKey {
 
 RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, Core::System& system,
                                    ScreenInfo& info)
-    : res_cache{*this}, shader_cache{*this, system}, emu_window{window}, screen_info{info},
-      buffer_cache(*this, STREAM_BUFFER_SIZE), global_cache{*this} {
+    : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, emu_window{window},
+      screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) {
     // Create sampler objects
     for (std::size_t i = 0; i < texture_samplers.size(); ++i) {
         texture_samplers[i].Create();
@@ -200,7 +200,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
     }
 
     // Rebinding the VAO invalidates the vertex buffer bindings.
-    gpu.dirty_flags.vertex_array = 0xFFFFFFFF;
+    gpu.dirty_flags.vertex_array.set();
 
     state.draw.vertex_array = vao_entry.handle;
     return vao_entry.handle;
@@ -210,14 +210,14 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
     auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
-    if (!gpu.dirty_flags.vertex_array)
+    if (gpu.dirty_flags.vertex_array.none())
         return;
 
     MICROPROFILE_SCOPE(OpenGL_VB);
 
     // Upload all guest vertex arrays sequentially to our buffer
     for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
-        if (~gpu.dirty_flags.vertex_array & (1u << index))
+        if (!gpu.dirty_flags.vertex_array[index])
             continue;
 
         const auto& vertex_array = regs.vertex_array[index];
@@ -244,7 +244,7 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
         }
     }
 
-    gpu.dirty_flags.vertex_array = 0;
+    gpu.dirty_flags.vertex_array.reset();
 }
 
 DrawParameters RasterizerOpenGL::SetupDraw() {
@@ -488,13 +488,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
     OpenGLState& current_state, bool using_color_fb, bool using_depth_fb, bool preserve_contents,
     std::optional<std::size_t> single_color_target) {
     MICROPROFILE_SCOPE(OpenGL_Framebuffer);
-    const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
     const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents,
                                                  single_color_target};
-    if (fb_config_state == current_framebuffer_config_state && gpu.dirty_flags.color_buffer == 0 &&
-        !gpu.dirty_flags.zeta_buffer) {
+    if (fb_config_state == current_framebuffer_config_state &&
+        gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) {
         // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or
         // single color targets). This is done because the guest registers may not change but the
         // host framebuffer may contain different attachments
@@ -721,10 +721,10 @@ void RasterizerOpenGL::DrawArrays() {
     // Add space for at least 18 constant buffers
     buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment);
 
-    bool invalidate = buffer_cache.Map(buffer_size);
+    const bool invalidate = buffer_cache.Map(buffer_size);
     if (invalidate) {
         // As all cached buffers are invalidated, we need to recheck their state.
-        gpu.dirty_flags.vertex_array = 0xFFFFFFFF;
+        gpu.dirty_flags.vertex_array.set();
     }
 
     const GLuint vao = SetupVertexFormat();
@@ -738,9 +738,13 @@ void RasterizerOpenGL::DrawArrays() {
     shader_program_manager->ApplyTo(state);
     state.Apply();
 
+    res_cache.SignalPreDrawCall();
+
     // Execute draw call
     params.DispatchDraw();
 
+    res_cache.SignalPostDrawCall();
+
     // Disable scissor test
     state.viewports[0].scissor.enabled = false;
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 49c79811d..5fdf1164d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include <algorithm>
+#include <optional>
 #include <glad/glad.h>
 
 #include "common/alignment.h"
@@ -549,6 +550,8 @@ CachedSurface::CachedSurface(const SurfaceParams& params)
     // alternatives. This signals a bug on those functions.
     const auto width = static_cast<GLsizei>(params.MipWidth(0));
     const auto height = static_cast<GLsizei>(params.MipHeight(0));
+    memory_size = params.MemorySize();
+    reinterpreted = false;
 
     const auto& format_tuple = GetFormatTuple(params.pixel_format, params.component_type);
     gl_internal_format = format_tuple.internal_format;
@@ -962,30 +965,31 @@ Surface RasterizerCacheOpenGL::GetColorBufferSurface(std::size_t index, bool pre
     auto& gpu{Core::System::GetInstance().GPU().Maxwell3D()};
     const auto& regs{gpu.regs};
 
-    if ((gpu.dirty_flags.color_buffer & (1u << static_cast<u32>(index))) == 0) {
+    if (!gpu.dirty_flags.color_buffer[index]) {
         return last_color_buffers[index];
     }
-    gpu.dirty_flags.color_buffer &= ~(1u << static_cast<u32>(index));
+    gpu.dirty_flags.color_buffer.reset(index);
 
     ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
 
     if (index >= regs.rt_control.count) {
-        return last_color_buffers[index] = {};
+        return current_color_buffers[index] = {};
     }
 
     if (regs.rt[index].Address() == 0 || regs.rt[index].format == Tegra::RenderTargetFormat::NONE) {
-        return last_color_buffers[index] = {};
+        return current_color_buffers[index] = {};
     }
 
     const SurfaceParams color_params{SurfaceParams::CreateForFramebuffer(index)};
 
-    return last_color_buffers[index] = GetSurface(color_params, preserve_contents);
+    return current_color_buffers[index] = GetSurface(color_params, preserve_contents);
 }
 
 void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) {
     surface->LoadGLBuffer();
     surface->UploadGLTexture(read_framebuffer.handle, draw_framebuffer.handle);
     surface->MarkAsModified(false, *this);
+    surface->MarkForReload(false);
 }
 
 Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool preserve_contents) {
@@ -997,18 +1001,23 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool pres
     Surface surface{TryGet(params.addr)};
     if (surface) {
         if (surface->GetSurfaceParams().IsCompatibleSurface(params)) {
-            // Use the cached surface as-is
+            // Use the cached surface as-is unless it's not synced with memory
+            if (surface->MustReload())
+                LoadSurface(surface);
             return surface;
         } else if (preserve_contents) {
             // If surface parameters changed and we care about keeping the previous data, recreate
             // the surface from the old one
             Surface new_surface{RecreateSurface(surface, params)};
-            Unregister(surface);
+            UnregisterSurface(surface);
             Register(new_surface);
+            if (new_surface->IsUploaded()) {
+                RegisterReinterpretSurface(new_surface);
+            }
             return new_surface;
         } else {
             // Delete the old surface before creating a new one to prevent collisions.
-            Unregister(surface);
+            UnregisterSurface(surface);
         }
     }
 
@@ -1290,4 +1299,107 @@ Surface RasterizerCacheOpenGL::TryGetReservedSurface(const SurfaceParams& params
     return {};
 }
 
+static std::optional<u32> TryFindBestMipMap(std::size_t memory, const SurfaceParams params,
+                                            u32 height) {
+    for (u32 i = 0; i < params.max_mip_level; i++) {
+        if (memory == params.GetMipmapSingleSize(i) && params.MipHeight(i) == height) {
+            return {i};
+        }
+    }
+    return {};
+}
+
+static std::optional<u32> TryFindBestLayer(VAddr addr, const SurfaceParams params, u32 mipmap) {
+    const std::size_t size = params.LayerMemorySize();
+    VAddr start = params.addr + params.GetMipmapLevelOffset(mipmap);
+    for (u32 i = 0; i < params.depth; i++) {
+        if (start == addr) {
+            return {i};
+        }
+        start += size;
+    }
+    return {};
+}
+
+static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surface render_surface,
+                                       const Surface blitted_surface) {
+    const auto& dst_params = blitted_surface->GetSurfaceParams();
+    const auto& src_params = render_surface->GetSurfaceParams();
+    const std::size_t src_memory_size = src_params.size_in_bytes;
+    const std::optional<u32> level =
+        TryFindBestMipMap(src_memory_size, dst_params, src_params.height);
+    if (level.has_value()) {
+        if (src_params.width == dst_params.MipWidthGobAligned(*level) &&
+            src_params.height == dst_params.MipHeight(*level) &&
+            src_params.block_height >= dst_params.MipBlockHeight(*level)) {
+            const std::optional<u32> slot =
+                TryFindBestLayer(render_surface->GetAddr(), dst_params, *level);
+            if (slot.has_value()) {
+                glCopyImageSubData(render_surface->Texture().handle,
+                                   SurfaceTargetToGL(src_params.target), 0, 0, 0, 0,
+                                   blitted_surface->Texture().handle,
+                                   SurfaceTargetToGL(dst_params.target), *level, 0, 0, *slot,
+                                   dst_params.MipWidth(*level), dst_params.MipHeight(*level), 1);
+                blitted_surface->MarkAsModified(true, cache);
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+static bool IsReinterpretInvalid(const Surface render_surface, const Surface blitted_surface) {
+    const VAddr bound1 = blitted_surface->GetAddr() + blitted_surface->GetMemorySize();
+    const VAddr bound2 = render_surface->GetAddr() + render_surface->GetMemorySize();
+    if (bound2 > bound1)
+        return true;
+    const auto& dst_params = blitted_surface->GetSurfaceParams();
+    const auto& src_params = render_surface->GetSurfaceParams();
+    return (dst_params.component_type != src_params.component_type);
+}
+
+static bool IsReinterpretInvalidSecond(const Surface render_surface,
+                                       const Surface blitted_surface) {
+    const auto& dst_params = blitted_surface->GetSurfaceParams();
+    const auto& src_params = render_surface->GetSurfaceParams();
+    return (dst_params.height > src_params.height && dst_params.width > src_params.width);
+}
+
+bool RasterizerCacheOpenGL::PartialReinterpretSurface(Surface triggering_surface,
+                                                      Surface intersect) {
+    if (IsReinterpretInvalid(triggering_surface, intersect)) {
+        UnregisterSurface(intersect);
+        return false;
+    }
+    if (!LayerFitReinterpretSurface(*this, triggering_surface, intersect)) {
+        if (IsReinterpretInvalidSecond(triggering_surface, intersect)) {
+            UnregisterSurface(intersect);
+            return false;
+        }
+        FlushObject(intersect);
+        FlushObject(triggering_surface);
+        intersect->MarkForReload(true);
+    }
+    return true;
+}
+
+void RasterizerCacheOpenGL::SignalPreDrawCall() {
+    if (texception && GLAD_GL_ARB_texture_barrier) {
+        glTextureBarrier();
+    }
+    texception = false;
+}
+
+void RasterizerCacheOpenGL::SignalPostDrawCall() {
+    for (u32 i = 0; i < Maxwell::NumRenderTargets; i++) {
+        if (current_color_buffers[i] != nullptr) {
+            Surface intersect = CollideOnReinterpretedSurface(current_color_buffers[i]->GetAddr());
+            if (intersect != nullptr) {
+                PartialReinterpretSurface(current_color_buffers[i], intersect);
+                texception = true;
+            }
+        }
+    }
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 838554c35..797bbdc9c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -34,6 +34,7 @@ using SurfaceTarget = VideoCore::Surface::SurfaceTarget;
 using SurfaceType = VideoCore::Surface::SurfaceType;
 using PixelFormat = VideoCore::Surface::PixelFormat;
 using ComponentType = VideoCore::Surface::ComponentType;
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 struct SurfaceParams {
     enum class SurfaceClass {
@@ -140,10 +141,18 @@ struct SurfaceParams {
         return offset;
     }
 
+    std::size_t GetMipmapSingleSize(u32 mip_level) const {
+        return InnerMipmapMemorySize(mip_level, false, is_layered);
+    }
+
     u32 MipWidth(u32 mip_level) const {
         return std::max(1U, width >> mip_level);
     }
 
+    u32 MipWidthGobAligned(u32 mip_level) const {
+        return Common::AlignUp(std::max(1U, width >> mip_level), 64U * 8U / GetFormatBpp());
+    }
+
     u32 MipHeight(u32 mip_level) const {
         return std::max(1U, height >> mip_level);
     }
@@ -346,6 +355,10 @@ public:
         return cached_size_in_bytes;
     }
 
+    std::size_t GetMemorySize() const {
+        return memory_size;
+    }
+
     void Flush() override {
         FlushGLBuffer();
     }
@@ -395,6 +408,26 @@ public:
                        Tegra::Texture::SwizzleSource swizzle_z,
                        Tegra::Texture::SwizzleSource swizzle_w);
 
+    void MarkReinterpreted() {
+        reinterpreted = true;
+    }
+
+    bool IsReinterpreted() const {
+        return reinterpreted;
+    }
+
+    void MarkForReload(bool reload) {
+        must_reload = reload;
+    }
+
+    bool MustReload() const {
+        return must_reload;
+    }
+
+    bool IsUploaded() const {
+        return params.identity == SurfaceParams::SurfaceClass::Uploaded;
+    }
+
 private:
     void UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle, GLuint draw_fb_handle);
 
@@ -408,6 +441,9 @@ private:
     GLenum gl_internal_format{};
     std::size_t cached_size_in_bytes{};
     std::array<GLenum, 4> swizzle{GL_RED, GL_GREEN, GL_BLUE, GL_ALPHA};
+    std::size_t memory_size;
+    bool reinterpreted = false;
+    bool must_reload = false;
 };
 
 class RasterizerCacheOpenGL final : public RasterizerCache<Surface> {
@@ -433,6 +469,9 @@ public:
                           const Common::Rectangle<u32>& src_rect,
                           const Common::Rectangle<u32>& dst_rect);
 
+    void SignalPreDrawCall();
+    void SignalPostDrawCall();
+
 private:
     void LoadSurface(const Surface& surface);
     Surface GetSurface(const SurfaceParams& params, bool preserve_contents = true);
@@ -449,6 +488,10 @@ private:
     /// Tries to get a reserved surface for the specified parameters
     Surface TryGetReservedSurface(const SurfaceParams& params);
 
+    // Partialy reinterpret a surface based on a triggering_surface that collides with it.
+    // returns true if the reinterpret was successful, false in case it was not.
+    bool PartialReinterpretSurface(Surface triggering_surface, Surface intersect);
+
     /// Performs a slow but accurate surface copy, flushing to RAM and reinterpreting the data
     void AccurateCopySurface(const Surface& src_surface, const Surface& dst_surface);
     void FastLayeredCopySurface(const Surface& src_surface, const Surface& dst_surface);
@@ -465,12 +508,50 @@ private:
     OGLFramebuffer read_framebuffer;
     OGLFramebuffer draw_framebuffer;
 
+    bool texception = false;
+
     /// Use a Pixel Buffer Object to download the previous texture and then upload it to the new one
     /// using the new format.
     OGLBuffer copy_pbo;
 
-    std::array<Surface, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> last_color_buffers;
+    std::array<Surface, Maxwell::NumRenderTargets> last_color_buffers;
+    std::array<Surface, Maxwell::NumRenderTargets> current_color_buffers;
     Surface last_depth_buffer;
+
+    using SurfaceIntervalCache = boost::icl::interval_map<VAddr, Surface>;
+    using SurfaceInterval = typename SurfaceIntervalCache::interval_type;
+
+    static auto GetReinterpretInterval(const Surface& object) {
+        return SurfaceInterval::right_open(object->GetAddr() + 1,
+                                           object->GetAddr() + object->GetMemorySize() - 1);
+    }
+
+    // Reinterpreted surfaces are very fragil as the game may keep rendering into them.
+    SurfaceIntervalCache reinterpreted_surfaces;
+
+    void RegisterReinterpretSurface(Surface reinterpret_surface) {
+        auto interval = GetReinterpretInterval(reinterpret_surface);
+        reinterpreted_surfaces.insert({interval, reinterpret_surface});
+        reinterpret_surface->MarkReinterpreted();
+    }
+
+    Surface CollideOnReinterpretedSurface(VAddr addr) const {
+        const SurfaceInterval interval{addr};
+        for (auto& pair :
+             boost::make_iterator_range(reinterpreted_surfaces.equal_range(interval))) {
+            return pair.second;
+        }
+        return nullptr;
+    }
+
+    /// Unregisters an object from the cache
+    void UnregisterSurface(const Surface& object) {
+        if (object->IsReinterpreted()) {
+            auto interval = GetReinterpretInterval(object);
+            reinterpreted_surfaces.erase(interval);
+        }
+        Unregister(object);
+    }
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_vulkan/vk_memory_manager.cpp b/src/video_core/renderer_vulkan/vk_memory_manager.cpp
index 17ee93b91..0451babbf 100644
--- a/src/video_core/renderer_vulkan/vk_memory_manager.cpp
+++ b/src/video_core/renderer_vulkan/vk_memory_manager.cpp
@@ -238,7 +238,7 @@ bool VKMemoryManager::AllocMemory(vk::MemoryPropertyFlags wanted_properties, u32
 
 VKMemoryCommitImpl::VKMemoryCommitImpl(VKMemoryAllocation* allocation, vk::DeviceMemory memory,
                                        u8* data, u64 begin, u64 end)
-    : allocation{allocation}, memory{memory}, data{data}, interval(std::make_pair(begin, end)) {}
+    : interval(std::make_pair(begin, end)), memory{memory}, allocation{allocation}, data{data} {}
 
 VKMemoryCommitImpl::~VKMemoryCommitImpl() {
     allocation->Free(this);
diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.cpp b/src/video_core/renderer_vulkan/vk_resource_manager.cpp
index 1678463c7..a1e117443 100644
--- a/src/video_core/renderer_vulkan/vk_resource_manager.cpp
+++ b/src/video_core/renderer_vulkan/vk_resource_manager.cpp
@@ -125,11 +125,12 @@ void VKFence::Protect(VKResource* resource) {
     protected_resources.push_back(resource);
 }
 
-void VKFence::Unprotect(const VKResource* resource) {
+void VKFence::Unprotect(VKResource* resource) {
     const auto it = std::find(protected_resources.begin(), protected_resources.end(), resource);
-    if (it != protected_resources.end()) {
-        protected_resources.erase(it);
-    }
+    ASSERT(it != protected_resources.end());
+
+    resource->OnFenceRemoval(this);
+    protected_resources.erase(it);
 }
 
 VKFenceWatch::VKFenceWatch() = default;
@@ -141,12 +142,11 @@ VKFenceWatch::~VKFenceWatch() {
 }
 
 void VKFenceWatch::Wait() {
-    if (!fence) {
+    if (fence == nullptr) {
         return;
     }
     fence->Wait();
     fence->Unprotect(this);
-    fence = nullptr;
 }
 
 void VKFenceWatch::Watch(VKFence& new_fence) {
diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.h b/src/video_core/renderer_vulkan/vk_resource_manager.h
index 5018dfa44..5bfe4cead 100644
--- a/src/video_core/renderer_vulkan/vk_resource_manager.h
+++ b/src/video_core/renderer_vulkan/vk_resource_manager.h
@@ -63,7 +63,7 @@ public:
     void Protect(VKResource* resource);
 
     /// Removes protection for a resource.
-    void Unprotect(const VKResource* resource);
+    void Unprotect(VKResource* resource);
 
     /// Retreives the fence.
     operator vk::Fence() const {
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
new file mode 100644
index 000000000..58ffa42f2
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
@@ -0,0 +1,90 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <memory>
+#include <optional>
+#include <vector>
+
+#include "common/assert.h"
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/renderer_vulkan/vk_device.h"
+#include "video_core/renderer_vulkan/vk_memory_manager.h"
+#include "video_core/renderer_vulkan/vk_resource_manager.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/renderer_vulkan/vk_stream_buffer.h"
+
+namespace Vulkan {
+
+constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
+constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
+
+VKStreamBuffer::VKStreamBuffer(const VKDevice& device, VKMemoryManager& memory_manager,
+                               VKScheduler& scheduler, u64 size, vk::BufferUsageFlags usage,
+                               vk::AccessFlags access, vk::PipelineStageFlags pipeline_stage)
+    : device{device}, scheduler{scheduler}, buffer_size{size}, access{access}, pipeline_stage{
+                                                                                   pipeline_stage} {
+    CreateBuffers(memory_manager, usage);
+    ReserveWatches(WATCHES_INITIAL_RESERVE);
+}
+
+VKStreamBuffer::~VKStreamBuffer() = default;
+
+std::tuple<u8*, u64, bool> VKStreamBuffer::Reserve(u64 size) {
+    ASSERT(size <= buffer_size);
+    mapped_size = size;
+
+    if (offset + size > buffer_size) {
+        // The buffer would overflow, save the amount of used buffers, signal an invalidation and
+        // reset the state.
+        invalidation_mark = used_watches;
+        used_watches = 0;
+        offset = 0;
+    }
+
+    return {mapped_pointer + offset, offset, invalidation_mark.has_value()};
+}
+
+VKExecutionContext VKStreamBuffer::Send(VKExecutionContext exctx, u64 size) {
+    ASSERT_MSG(size <= mapped_size, "Reserved size is too small");
+
+    if (invalidation_mark) {
+        // TODO(Rodrigo): Find a better way to invalidate than waiting for all watches to finish.
+        exctx = scheduler.Flush();
+        std::for_each(watches.begin(), watches.begin() + *invalidation_mark,
+                      [&](auto& resource) { resource->Wait(); });
+        invalidation_mark = std::nullopt;
+    }
+
+    if (used_watches + 1 >= watches.size()) {
+        // Ensure that there are enough watches.
+        ReserveWatches(WATCHES_RESERVE_CHUNK);
+    }
+    // Add a watch for this allocation.
+    watches[used_watches++]->Watch(exctx.GetFence());
+
+    offset += size;
+
+    return exctx;
+}
+
+void VKStreamBuffer::CreateBuffers(VKMemoryManager& memory_manager, vk::BufferUsageFlags usage) {
+    const vk::BufferCreateInfo buffer_ci({}, buffer_size, usage, vk::SharingMode::eExclusive, 0,
+                                         nullptr);
+
+    const auto dev = device.GetLogical();
+    const auto& dld = device.GetDispatchLoader();
+    buffer = dev.createBufferUnique(buffer_ci, nullptr, dld);
+    commit = memory_manager.Commit(*buffer, true);
+    mapped_pointer = commit->GetData();
+}
+
+void VKStreamBuffer::ReserveWatches(std::size_t grow_size) {
+    const std::size_t previous_size = watches.size();
+    watches.resize(previous_size + grow_size);
+    std::generate(watches.begin() + previous_size, watches.end(),
+                  []() { return std::make_unique<VKFenceWatch>(); });
+}
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
new file mode 100644
index 000000000..69d036ccd
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -0,0 +1,72 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <tuple>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/renderer_vulkan/vk_memory_manager.h"
+
+namespace Vulkan {
+
+class VKDevice;
+class VKFence;
+class VKFenceWatch;
+class VKResourceManager;
+class VKScheduler;
+
+class VKStreamBuffer {
+public:
+    explicit VKStreamBuffer(const VKDevice& device, VKMemoryManager& memory_manager,
+                            VKScheduler& scheduler, u64 size, vk::BufferUsageFlags usage,
+                            vk::AccessFlags access, vk::PipelineStageFlags pipeline_stage);
+    ~VKStreamBuffer();
+
+    /**
+     * Reserves a region of memory from the stream buffer.
+     * @param size Size to reserve.
+     * @returns A tuple in the following order: Raw memory pointer (with offset added), buffer
+     * offset and a boolean that's true when buffer has been invalidated.
+     */
+    std::tuple<u8*, u64, bool> Reserve(u64 size);
+
+    /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
+    [[nodiscard]] VKExecutionContext Send(VKExecutionContext exctx, u64 size);
+
+    vk::Buffer GetBuffer() const {
+        return *buffer;
+    }
+
+private:
+    /// Creates Vulkan buffer handles committing the required the required memory.
+    void CreateBuffers(VKMemoryManager& memory_manager, vk::BufferUsageFlags usage);
+
+    /// Increases the amount of watches available.
+    void ReserveWatches(std::size_t grow_size);
+
+    const VKDevice& device;                      ///< Vulkan device manager.
+    VKScheduler& scheduler;                      ///< Command scheduler.
+    const u64 buffer_size;                       ///< Total size of the stream buffer.
+    const vk::AccessFlags access;                ///< Access usage of this stream buffer.
+    const vk::PipelineStageFlags pipeline_stage; ///< Pipeline usage of this stream buffer.
+
+    UniqueBuffer buffer;   ///< Mapped buffer.
+    VKMemoryCommit commit; ///< Memory commit.
+    u8* mapped_pointer{};  ///< Pointer to the host visible commit
+
+    u64 offset{};      ///< Buffer iterator.
+    u64 mapped_size{}; ///< Size reserved for the current copy.
+
+    std::vector<std::unique_ptr<VKFenceWatch>> watches; ///< Total watches
+    std::size_t used_watches{}; ///< Count of watches, reset on invalidation.
+    std::optional<std::size_t>
+        invalidation_mark{}; ///< Number of watches used in the current invalidation.
+};
+
+} // namespace Vulkan