diff options
Diffstat (limited to 'src/video_core')
28 files changed, 1214 insertions, 277 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 162108301..183709d8b 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -11,8 +11,9 @@ set(SRCS pica.cpp primitive_assembly.cpp rasterizer.cpp + shader/shader.cpp + shader/shader_interpreter.cpp utils.cpp - vertex_shader.cpp video_core.cpp ) @@ -35,11 +36,20 @@ set(HEADERS primitive_assembly.h rasterizer.h renderer_base.h + shader/shader.h + shader/shader_interpreter.h utils.h - vertex_shader.h video_core.h ) +if(ARCHITECTURE_x86_64) + set(SRCS ${SRCS} + shader/shader_jit_x64.cpp) + + set(HEADERS ${HEADERS} + shader/shader_jit_x64.h) +endif() + create_directory_groups(${SRCS} ${HEADERS}) add_library(video_core STATIC ${SRCS} ${HEADERS}) diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 558b49d60..bb6048cc0 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -7,7 +7,7 @@ #include "clipper.h" #include "pica.h" #include "rasterizer.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" namespace Pica { diff --git a/src/video_core/clipper.h b/src/video_core/clipper.h index 19ce8e140..6ed01e877 100644 --- a/src/video_core/clipper.h +++ b/src/video_core/clipper.h @@ -6,13 +6,13 @@ namespace Pica { -namespace VertexShader { +namespace Shader { struct OutputVertex; } namespace Clipper { -using VertexShader::OutputVertex; +using Shader::OutputVertex; void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2); diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 243abe842..374c4748d 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -18,7 +18,7 @@ #include "pica.h" #include "primitive_assembly.h" #include "renderer_base.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" #include "video_core.h" namespace Pica { @@ -165,7 +165,7 @@ static inline void WritePicaReg(u32 id, u32 value, 
u32 mask) { DebugUtils::GeometryDumper geometry_dumper; PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value()); #endif - PrimitiveAssembler<VertexShader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); + PrimitiveAssembler<Shader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); if (g_debug_context) { for (int i = 0; i < 3; ++i) { @@ -210,11 +210,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { // The size has been tuned for optimal balance between hit-rate and the cost of lookup const size_t VERTEX_CACHE_SIZE = 32; std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids; - std::array<VertexShader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache; + std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache; unsigned int vertex_cache_pos = 0; vertex_cache_ids.fill(-1); + Shader::UnitState shader_unit; + Shader::Setup(shader_unit); + for (unsigned int index = 0; index < regs.num_vertices; ++index) { unsigned int vertex = is_indexed ? (index_u16 ? 
index_address_16[index] : index_address_8[index]) : index; @@ -224,7 +227,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { ASSERT(vertex != -1); bool vertex_cache_hit = false; - VertexShader::OutputVertex output; + Shader::OutputVertex output; if (is_indexed) { if (g_debug_context && Pica::g_debug_context->recorder) { @@ -243,7 +246,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { if (!vertex_cache_hit) { // Initialize data for the current vertex - VertexShader::InputVertex input; + Shader::InputVertex input; for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { if (vertex_attribute_elements[i] != 0) { @@ -306,9 +309,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { std::bind(&DebugUtils::GeometryDumper::AddTriangle, &geometry_dumper, _1, _2, _3)); #endif - // Send to vertex shader - output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs); + output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes()); if (is_indexed) { vertex_cache[vertex_cache_pos] = output; @@ -319,9 +321,9 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { if (Settings::values.use_hw_renderer) { // Send to hardware renderer - static auto AddHWTriangle = [](const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) { + static auto AddHWTriangle = [](const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) { VideoCore::g_renderer->hw_rasterizer->AddTriangle(v0, v1, v2); }; diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp index e9a858411..572b4fd62 100644 --- a/src/video_core/debug_utils/debug_utils.cpp +++ b/src/video_core/debug_utils/debug_utils.cpp @@ -18,6 +18,7 @@ #include "common/assert.h" #include "common/color.h" +#include 
"common/common_types.h" #include "common/file_util.h" #include "common/math_util.h" #include "common/vector_math.h" @@ -233,7 +234,7 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data dvle.main_offset_words = main_offset; dvle.output_register_table_offset = write_offset - dvlb.dvle_offset; - dvle.output_register_table_size = static_cast<uint32_t>(output_info_table.size()); + dvle.output_register_table_size = static_cast<u32>(output_info_table.size()); QueueForWriting((u8*)output_info_table.data(), static_cast<u32>(output_info_table.size() * sizeof(OutputRegisterInfo))); // TODO: Create a label table for "main" diff --git a/src/video_core/hwrasterizer_base.h b/src/video_core/hwrasterizer_base.h index c8746c608..54b8892fb 100644 --- a/src/video_core/hwrasterizer_base.h +++ b/src/video_core/hwrasterizer_base.h @@ -7,7 +7,7 @@ #include "common/common_types.h" namespace Pica { -namespace VertexShader { +namespace Shader { struct OutputVertex; } } @@ -24,9 +24,9 @@ public: virtual void Reset() = 0; /// Queues the primitive formed by the given vertices for rendering - virtual void AddTriangle(const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) = 0; + virtual void AddTriangle(const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) = 0; /// Draw the current batch of triangles virtual void DrawTriangles() = 0; diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp index 17cb66780..c73a8178e 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp @@ -6,6 +6,7 @@ #include <unordered_map> #include "pica.h" +#include "shader/shader.h" namespace Pica { @@ -84,6 +85,8 @@ void Init() { } void Shutdown() { + Shader::Shutdown(); + memset(&g_state, 0, sizeof(State)); } diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 34b02b2f8..6ce90f95a 100644 --- a/src/video_core/pica.h 
+++ b/src/video_core/pica.h @@ -1083,6 +1083,7 @@ private: // TODO: Perform proper arithmetic on this! float value; }; +static_assert(sizeof(float24) == sizeof(float), "Shader JIT assumes float24 is implemented as a 32-bit float"); /// Struct used to describe current Pica state struct State { @@ -1092,7 +1093,10 @@ struct State { /// Vertex shader memory struct ShaderSetup { struct { - Math::Vec4<float24> f[96]; + // The float uniforms are accessed by the shader JIT using SSE instructions, and are + // therefore required to be 16-byte aligned. + Math::Vec4<float24> MEMORY_ALIGNED16(f[96]); + std::array<bool, 16> b; std::array<Math::Vec4<u8>, 4> i; } uniforms; diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp index 2f22bdcce..e2b1df44c 100644 --- a/src/video_core/primitive_assembly.cpp +++ b/src/video_core/primitive_assembly.cpp @@ -4,7 +4,7 @@ #include "pica.h" #include "primitive_assembly.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" #include "common/logging/log.h" #include "video_core/debug_utils/debug_utils.h" @@ -56,7 +56,7 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx, TriangleHandl // explicitly instantiate use cases template -struct PrimitiveAssembler<VertexShader::OutputVertex>; +struct PrimitiveAssembler<Shader::OutputVertex>; template struct PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex>; diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h index 52ff4cd89..80432d68a 100644 --- a/src/video_core/primitive_assembly.h +++ b/src/video_core/primitive_assembly.h @@ -8,7 +8,7 @@ #include "video_core/pica.h" -#include "video_core/vertex_shader.h" +#include "video_core/shader/shader_interpreter.h" namespace Pica { diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index 68b7cc05d..b83798b0f 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -16,7 +16,7 @@ #include "math.h" #include 
"pica.h" #include "rasterizer.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" #include "video_core/utils.h" namespace Pica { @@ -272,9 +272,9 @@ static Common::Profiling::TimingCategory rasterization_category("Rasterization") * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing * culling via recursion. */ -static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, - const VertexShader::OutputVertex& v1, - const VertexShader::OutputVertex& v2, +static void ProcessTriangleInternal(const Shader::OutputVertex& v0, + const Shader::OutputVertex& v1, + const Shader::OutputVertex& v2, bool reversed = false) { const auto& regs = g_state.regs; @@ -1107,9 +1107,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, } } -void ProcessTriangle(const VertexShader::OutputVertex& v0, - const VertexShader::OutputVertex& v1, - const VertexShader::OutputVertex& v2) { +void ProcessTriangle(const Shader::OutputVertex& v0, + const Shader::OutputVertex& v1, + const Shader::OutputVertex& v2) { ProcessTriangleInternal(v0, v1, v2); } diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h index 42148f8b1..a6a9634b4 100644 --- a/src/video_core/rasterizer.h +++ b/src/video_core/rasterizer.h @@ -6,15 +6,15 @@ namespace Pica { -namespace VertexShader { +namespace Shader { struct OutputVertex; } namespace Rasterizer { -void ProcessTriangle(const VertexShader::OutputVertex& v0, - const VertexShader::OutputVertex& v1, - const VertexShader::OutputVertex& v2); +void ProcessTriangle(const Shader::OutputVertex& v0, + const Shader::OutputVertex& v1, + const Shader::OutputVertex& v2); } // namespace Rasterizer diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 1fc4e56b1..9f1552adf 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -99,7 +99,6 @@ void 
RasterizerOpenGL::InitObjects() { fb_color_texture.texture.Create(); ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 1, 1); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.Apply(); @@ -115,7 +114,6 @@ void RasterizerOpenGL::InitObjects() { fb_depth_texture.texture.Create(); ReconfigureDepthTexture(fb_depth_texture, Pica::Regs::DepthFormat::D16, 1, 1); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_depth_texture.texture.handle; state.Apply(); @@ -204,9 +202,9 @@ void RasterizerOpenGL::Reset() { res_cache.FullFlush(); } -void RasterizerOpenGL::AddTriangle(const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) { +void RasterizerOpenGL::AddTriangle(const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) { vertex_batch.push_back(HardwareVertex(v0)); vertex_batch.push_back(HardwareVertex(v1)); vertex_batch.push_back(HardwareVertex(v2)); @@ -493,7 +491,6 @@ void RasterizerOpenGL::ReconfigureColorTexture(TextureInfo& texture, Pica::Regs: break; } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.texture.handle; state.Apply(); @@ -537,7 +534,6 @@ void RasterizerOpenGL::ReconfigureDepthTexture(DepthTextureInfo& texture, Pica:: break; } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.texture.handle; state.Apply(); @@ -766,10 +762,9 @@ void RasterizerOpenGL::SyncDrawState() { const auto& texture = pica_textures[texture_index]; if (texture.enabled) { - state.texture_units[texture_index].enabled_2d = true; res_cache.LoadAndBindTexture(state, texture_index, texture); } else { - state.texture_units[texture_index].enabled_2d = false; + state.texture_units[texture_index].texture_2d = 0; } } @@ -804,7 +799,6 @@ void 
RasterizerOpenGL::ReloadColorBuffer() { } } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.Apply(); @@ -862,7 +856,6 @@ void RasterizerOpenGL::ReloadDepthBuffer() { } } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_depth_texture.texture.handle; state.Apply(); @@ -887,7 +880,6 @@ void RasterizerOpenGL::CommitColorBuffer() { std::unique_ptr<u8[]> temp_gl_color_buffer(new u8[fb_color_texture.width * fb_color_texture.height * bytes_per_pixel]); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.Apply(); @@ -927,7 +919,6 @@ void RasterizerOpenGL::CommitDepthBuffer() { std::unique_ptr<u8[]> temp_gl_depth_buffer(new u8[fb_depth_texture.width * fb_depth_texture.height * gl_bpp]); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_depth_texture.texture.handle; state.Apply(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index ae7b26fc6..a02d5c856 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -9,7 +9,7 @@ #include "common/common_types.h" #include "video_core/hwrasterizer_base.h" -#include "video_core/vertex_shader.h" +#include "video_core/shader/shader_interpreter.h" #include "gl_state.h" #include "gl_rasterizer_cache.h" @@ -27,9 +27,9 @@ public: void Reset() override; /// Queues the primitive formed by the given vertices for rendering - void AddTriangle(const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) override; + void AddTriangle(const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) override; /// Draw the current batch of triangles void DrawTriangles() override; @@ -82,7 +82,7 @@ private: 
/// Structure that the hardware rendered vertices are composed of struct HardwareVertex { - HardwareVertex(const Pica::VertexShader::OutputVertex& v) { + HardwareVertex(const Pica::Shader::OutputVertex& v) { position[0] = v.pos.x.ToFloat32(); position[1] = v.pos.y.ToFloat32(); position[2] = v.pos.z.ToFloat32(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index dc3ffdf22..70f0ba5f1 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -30,6 +30,7 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text new_texture->texture.Create(); state.texture_units[texture_unit].texture_2d = new_texture->texture.handle; state.Apply(); + glActiveTexture(GL_TEXTURE0 + texture_unit); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, PicaToGL::TextureFilterMode(config.config.mag_filter)); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, PicaToGL::TextureFilterMode(config.config.min_filter)); diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index 6f9dc012d..82173d59a 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -10,6 +10,7 @@ #include "video_core/renderer_opengl/generated/gl_3_2_core.h" #include "video_core/renderer_opengl/gl_shader_util.h" +#include "video_core/renderer_opengl/gl_state.h" class OGLTexture : private NonCopyable { public: @@ -28,6 +29,7 @@ public: void Release() { if (handle == 0) return; glDeleteTextures(1, &handle); + OpenGLState::ResetTexture(handle); handle = 0; } @@ -51,6 +53,7 @@ public: void Release() { if (handle == 0) return; glDeleteProgram(handle); + OpenGLState::ResetProgram(handle); handle = 0; } @@ -74,6 +77,7 @@ public: void Release() { if (handle == 0) return; glDeleteBuffers(1, &handle); + 
OpenGLState::ResetBuffer(handle); handle = 0; } @@ -97,6 +101,7 @@ public: void Release() { if (handle == 0) return; glDeleteVertexArrays(1, &handle); + OpenGLState::ResetVertexArray(handle); handle = 0; } @@ -120,6 +125,7 @@ public: void Release() { if (handle == 0) return; glDeleteFramebuffers(1, &handle); + OpenGLState::ResetFramebuffer(handle); handle = 0; } diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 9efc15337..871324014 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -40,7 +40,6 @@ OpenGLState::OpenGLState() { logic_op = GL_COPY; for (auto& texture_unit : texture_units) { - texture_unit.enabled_2d = false; texture_unit.texture_2d = 0; } @@ -147,16 +146,9 @@ void OpenGLState::Apply() { // Textures for (unsigned texture_index = 0; texture_index < ARRAY_SIZE(texture_units); ++texture_index) { - if (texture_units[texture_index].enabled_2d != cur_state.texture_units[texture_index].enabled_2d || - texture_units[texture_index].texture_2d != cur_state.texture_units[texture_index].texture_2d) { - + if (texture_units[texture_index].texture_2d != cur_state.texture_units[texture_index].texture_2d) { glActiveTexture(GL_TEXTURE0 + texture_index); - - if (texture_units[texture_index].enabled_2d) { - glBindTexture(GL_TEXTURE_2D, texture_units[texture_index].texture_2d); - } else { - glBindTexture(GL_TEXTURE_2D, 0); - } + glBindTexture(GL_TEXTURE_2D, texture_units[texture_index].texture_2d); } } @@ -182,3 +174,35 @@ void OpenGLState::Apply() { cur_state = *this; } + +void OpenGLState::ResetTexture(GLuint id) { + for (auto& unit : cur_state.texture_units) { + if (unit.texture_2d == id) { + unit.texture_2d = 0; + } + } +} + +void OpenGLState::ResetProgram(GLuint id) { + if (cur_state.draw.shader_program == id) { + cur_state.draw.shader_program = 0; + } +} + +void OpenGLState::ResetBuffer(GLuint id) { + if (cur_state.draw.vertex_buffer == id) { + 
cur_state.draw.vertex_buffer = 0; + } +} + +void OpenGLState::ResetVertexArray(GLuint id) { + if (cur_state.draw.vertex_array == id) { + cur_state.draw.vertex_array = 0; + } +} + +void OpenGLState::ResetFramebuffer(GLuint id) { + if (cur_state.draw.framebuffer == id) { + cur_state.draw.framebuffer = 0; + } +} diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 26b916360..3e2379021 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -53,7 +53,6 @@ public: // 3 texture units - one for each that is used in PICA fragment shader emulation struct { - bool enabled_2d; // GL_TEXTURE_2D GLuint texture_2d; // GL_TEXTURE_BINDING_2D } texture_units[3]; @@ -74,6 +73,12 @@ public: /// Apply this state as the current OpenGL state void Apply(); + static void ResetTexture(GLuint id); + static void ResetProgram(GLuint id); + static void ResetBuffer(GLuint id); + static void ResetVertexArray(GLuint id); + static void ResetFramebuffer(GLuint id); + private: static OpenGLState cur_state; }; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 96e12839a..79a940ff6 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -163,7 +163,6 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig& // only allows rows to have a memory alignement of 4. 
ASSERT(pixel_stride % 4 == 0); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -191,7 +190,6 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig& */ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, const TextureInfo& texture) { - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -239,7 +237,6 @@ void RendererOpenGL::InitOpenGLObjects() { // Allocation of storage is deferred until the first frame, when we // know the framebuffer size. - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -305,7 +302,6 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, UNIMPLEMENTED(); } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -325,7 +321,6 @@ void RendererOpenGL::DrawSingleScreenRotated(const TextureInfo& texture, float x ScreenRectVertex(x+w, y+h, 0.f, 1.f), }; - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp new file mode 100644 index 000000000..6a27a8015 --- /dev/null +++ b/src/video_core/shader/shader.cpp @@ -0,0 +1,145 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <memory> +#include <unordered_map> + +#include "common/hash.h" +#include "common/make_unique.h" +#include "common/profiler.h" + +#include "video_core/debug_utils/debug_utils.h" +#include "video_core/pica.h" +#include "video_core/video_core.h" + +#include "shader.h" +#include "shader_interpreter.h" + +#ifdef ARCHITECTURE_x86_64 +#include "shader_jit_x64.h" +#endif // ARCHITECTURE_x86_64 + +namespace Pica { + +namespace Shader { + +#ifdef ARCHITECTURE_x86_64 +static std::unordered_map<u64, CompiledShader*> shader_map; +static JitCompiler jit; +static CompiledShader* jit_shader; +#endif // ARCHITECTURE_x86_64 + +void Setup(UnitState& state) { +#ifdef ARCHITECTURE_x86_64 + if (VideoCore::g_shader_jit_enabled) { + u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ + Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)) ^ + g_state.regs.vs.main_offset); + + auto iter = shader_map.find(cache_key); + if (iter != shader_map.end()) { + jit_shader = iter->second; + } else { + jit_shader = jit.Compile(); + shader_map.emplace(cache_key, jit_shader); + } + } +#endif // ARCHITECTURE_x86_64 +} + +void Shutdown() { + shader_map.clear(); +} + +static Common::Profiling::TimingCategory shader_category("Vertex Shader"); + +OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) { + auto& config = g_state.regs.vs; + auto& setup = g_state.vs; + + Common::Profiling::ScopeTimer timer(shader_category); + + state.program_counter = config.main_offset; + state.debug.max_offset = 0; + state.debug.max_opdesc_id = 0; + + // Setup input register table + const auto& attribute_register_map = config.input_register_map; + + if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0]; + if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1]; + if (num_attributes > 2) 
state.registers.input[attribute_register_map.attribute2_register] = input.attr[2]; + if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = input.attr[3]; + if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = input.attr[4]; + if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = input.attr[5]; + if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = input.attr[6]; + if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = input.attr[7]; + if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = input.attr[8]; + if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = input.attr[9]; + if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = input.attr[10]; + if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = input.attr[11]; + if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = input.attr[12]; + if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = input.attr[13]; + if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = input.attr[14]; + if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = input.attr[15]; + + state.conditional_code[0] = false; + state.conditional_code[1] = false; + +#ifdef ARCHITECTURE_x86_64 + if (VideoCore::g_shader_jit_enabled) + jit_shader(&state.registers); + else + RunInterpreter(state); +#else + RunInterpreter(state); +#endif // ARCHITECTURE_x86_64 + +#if PICA_DUMP_SHADERS + DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(), + state.debug.max_opdesc_id, config.main_offset, + 
g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here +#endif + + // Setup output data + OutputVertex ret; + // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to + // figure out what those circumstances are and enable the remaining outputs then. + for (int i = 0; i < 7; ++i) { + const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here + + u32 semantics[4] = { + output_register_map.map_x, output_register_map.map_y, + output_register_map.map_z, output_register_map.map_w + }; + + for (int comp = 0; comp < 4; ++comp) { + float24* out = ((float24*)&ret) + semantics[comp]; + if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { + *out = state.registers.output[i][comp]; + } else { + // Zero output so that attributes which aren't output won't have denormals in them, + // which would slow us down later. + memset(out, 0, sizeof(*out)); + } + } + } + + // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation + for (int i = 0; i < 4; ++i) { + ret.color[i] = float24::FromFloat32( + std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); + } + + LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", + ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), + ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), + ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); + + return ret; +} + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h new file mode 100644 index 000000000..2007a2844 --- /dev/null +++ b/src/video_core/shader/shader.h @@ -0,0 +1,169 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <boost/container/static_vector.hpp> +#include <nihstro/shader_binary.h> + +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "common/vector_math.h" + +#include "video_core/pica.h" + +using nihstro::RegisterType; +using nihstro::SourceRegister; +using nihstro::DestRegister; + +namespace Pica { + +namespace Shader { + +struct InputVertex { + Math::Vec4<float24> attr[16]; +}; + +struct OutputVertex { + OutputVertex() = default; + + // VS output attributes + Math::Vec4<float24> pos; + Math::Vec4<float24> dummy; // quaternions (not implemented, yet) + Math::Vec4<float24> color; + Math::Vec2<float24> tc0; + Math::Vec2<float24> tc1; + float24 pad[6]; + Math::Vec2<float24> tc2; + + // Padding for optimal alignment + float24 pad2[4]; + + // Attributes used to store intermediate results + + // position after perspective divide + Math::Vec3<float24> screenpos; + float24 pad3; + + // Linear interpolation + // factor: 0=this, 1=vtx + void Lerp(float24 factor, const OutputVertex& vtx) { + pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); + + // TODO: Should perform perspective correct interpolation here... 
+ tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); + tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); + tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); + + screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); + + color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); + } + + // Linear interpolation + // factor: 0=v0, 1=v1 + static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { + OutputVertex ret = v0; + ret.Lerp(factor, v1); + return ret; + } +}; +static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); +static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); + +/** + * This structure contains the state information that needs to be unique for a shader unit. The 3DS + * has four shader units that process shaders in parallel. At the present, Citra only implements a + * single shader unit that processes all shaders serially. Putting the state information in a struct + * here will make it easier for us to parallelize the shader processing later. + */ +struct UnitState { + struct Registers { + // The registers are accessed by the shader JIT using SSE instructions, and are therefore + // required to be 16-byte aligned. + Math::Vec4<float24> MEMORY_ALIGNED16(input[16]); + Math::Vec4<float24> MEMORY_ALIGNED16(output[16]); + Math::Vec4<float24> MEMORY_ALIGNED16(temporary[16]); + } registers; + static_assert(std::is_pod<Registers>::value, "Structure is not POD"); + + u32 program_counter; + bool conditional_code[2]; + + // Two Address registers and one loop counter + // TODO: How many bits do these actually have? 
+ s32 address_registers[3]; + + enum { + INVALID_ADDRESS = 0xFFFFFFFF + }; + + struct CallStackElement { + u32 final_address; // Address upon which we jump to return_address + u32 return_address; // Where to jump when leaving scope + u8 repeat_counter; // How often to repeat until this call stack element is removed + u8 loop_increment; // Which value to add to the loop counter after an iteration + // TODO: Should this be a signed value? Does it even matter? + u32 loop_address; // The address where we'll return to after each loop iteration + }; + + // TODO: Is there a maximal size for this? + boost::container::static_vector<CallStackElement, 16> call_stack; + + struct { + u32 max_offset; // maximum program counter ever reached + u32 max_opdesc_id; // maximum swizzle pattern index ever used + } debug; + + static int InputOffset(const SourceRegister& reg) { + switch (reg.GetRegisterType()) { + case RegisterType::Input: + return (int)offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + case RegisterType::Temporary: + return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + default: + UNREACHABLE(); + return 0; + } + } + + static int OutputOffset(const DestRegister& reg) { + switch (reg.GetRegisterType()) { + case RegisterType::Output: + return (int)offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + case RegisterType::Temporary: + return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + default: + UNREACHABLE(); + return 0; + } + } +}; + +/** + * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per + * vertex, which would happen within the `Run` function). 
+ * @param state Shader unit state, must be setup per shader and per shader unit + */ +void Setup(UnitState& state); + +/// Performs any cleanup when the emulator is shutdown +void Shutdown(); + +/** + * Runs the currently setup shader + * @param state Shader unit state, must be setup per shader and per shader unit + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @return The output vertex, after having been processed by the vertex shader + */ +OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes); + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/shader/shader_interpreter.cpp index 5f66f3455..c8489f920 100644 --- a/src/video_core/vertex_shader.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -2,18 +2,14 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <boost/container/static_vector.hpp> -#include <boost/range/algorithm.hpp> - #include <common/file_util.h> #include <nihstro/shader_bytecode.h> -#include "common/profiler.h" +#include "video_core/pica.h" -#include "pica.h" -#include "vertex_shader.h" -#include "debug_utils/debug_utils.h" +#include "shader.h" +#include "shader_interpreter.h" using nihstro::OpCode; using nihstro::Instruction; @@ -23,44 +19,9 @@ using nihstro::SwizzlePattern; namespace Pica { -namespace VertexShader { - -struct VertexShaderState { - u32 program_counter; - - const float24* input_register_table[16]; - Math::Vec4<float24> output_registers[16]; - - Math::Vec4<float24> temporary_registers[16]; - bool conditional_code[2]; - - // Two Address registers and one loop counter - // TODO: How many bits do these actually have? 
- s32 address_registers[3]; - - enum { - INVALID_ADDRESS = 0xFFFFFFFF - }; +namespace Shader { - struct CallStackElement { - u32 final_address; // Address upon which we jump to return_address - u32 return_address; // Where to jump when leaving scope - u8 repeat_counter; // How often to repeat until this call stack element is removed - u8 loop_increment; // Which value to add to the loop counter after an iteration - // TODO: Should this be a signed value? Does it even matter? - u32 loop_address; // The address where we'll return to after each loop iteration - }; - - // TODO: Is there a maximal size for this? - boost::container::static_vector<CallStackElement, 16> call_stack; - - struct { - u32 max_offset; // maximum program counter ever reached - u32 max_opdesc_id; // maximum swizzle pattern index ever used - } debug; -}; - -static void ProcessShaderCode(VertexShaderState& state) { +void RunInterpreter(UnitState& state) { const auto& uniforms = g_state.vs.uniforms; const auto& swizzle_data = g_state.vs.swizzle_data; const auto& program_code = g_state.vs.program_code; @@ -90,7 +51,7 @@ static void ProcessShaderCode(VertexShaderState& state) { const Instruction instr = { program_code[state.program_counter] }; const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; - static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions, + static auto call = [](UnitState& state, u32 offset, u32 num_instructions, u32 return_offset, u8 repeat_count, u8 loop_increment) { state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset ASSERT(state.call_stack.size() < state.call_stack.capacity()); @@ -101,10 +62,10 @@ static void ProcessShaderCode(VertexShaderState& state) { auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { switch (source_reg.GetRegisterType()) { case RegisterType::Input: - return state.input_register_table[source_reg.GetIndex()]; + return 
&state.registers.input[source_reg.GetIndex()].x; case RegisterType::Temporary: - return &state.temporary_registers[source_reg.GetIndex()].x; + return &state.registers.temporary[source_reg.GetIndex()].x; case RegisterType::FloatUniform: return &uniforms.f[source_reg.GetIndex()].x; @@ -153,8 +114,8 @@ static void ProcessShaderCode(VertexShaderState& state) { src2[3] = src2[3] * float24::FromFloat32(-1); } - float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers[instr.common.dest.Value().GetIndex()][0] - : (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0] + float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] + : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] : dummy_vec4_float24; state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id); @@ -394,8 +355,8 @@ static void ProcessShaderCode(VertexShaderState& state) { src3[3] = src3[3] * float24::FromFloat32(-1); } - float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers[instr.mad.dest.Value().GetIndex()][0] - : (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0] + float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] + : (instr.mad.dest.Value() < 0x20) ? 
&state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] : dummy_vec4_float24; for (int i = 0; i < 4; ++i) { @@ -413,7 +374,7 @@ static void ProcessShaderCode(VertexShaderState& state) { default: { - static auto evaluate_condition = [](const VertexShaderState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { + static auto evaluate_condition = [](const UnitState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { bool results[2] = { refx == state.conditional_code[0], refy == state.conditional_code[1] }; @@ -542,88 +503,6 @@ static void ProcessShaderCode(VertexShaderState& state) { } } -static Common::Profiling::TimingCategory shader_category("Vertex Shader"); - -OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) { - Common::Profiling::ScopeTimer timer(shader_category); - - VertexShaderState state; - - state.program_counter = config.main_offset; - state.debug.max_offset = 0; - state.debug.max_opdesc_id = 0; - - // Setup input register table - const auto& attribute_register_map = config.input_register_map; - float24 dummy_register; - boost::fill(state.input_register_table, &dummy_register); - - if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x; - if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x; - if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x; - if (num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x; - if (num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x; - if (num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x; - if (num_attributes > 6) 
state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x; - if (num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x; - if (num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x; - if (num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x; - if (num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x; - if (num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x; - if (num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x; - if (num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x; - if (num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; - if (num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; - - state.conditional_code[0] = false; - state.conditional_code[1] = false; - - ProcessShaderCode(state); -#if PICA_DUMP_SHADERS - DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(), - state.debug.max_opdesc_id, config.main_offset, - g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here -#endif - - // Setup output data - OutputVertex ret; - // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to - // figure out what those circumstances are and enable the remaining outputs then. 
- for (int i = 0; i < 7; ++i) { - const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here - - u32 semantics[4] = { - output_register_map.map_x, output_register_map.map_y, - output_register_map.map_z, output_register_map.map_w - }; - - for (int comp = 0; comp < 4; ++comp) { - float24* out = ((float24*)&ret) + semantics[comp]; - if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { - *out = state.output_registers[i][comp]; - } else { - // Zero output so that attributes which aren't output won't have denormals in them, - // which would slow us down later. - memset(out, 0, sizeof(*out)); - } - } - } - - // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation - for (int i = 0; i < 4; ++i) { - ret.color[i] = float24::FromFloat32( - std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); - } - - LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", - ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), - ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), - ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); - - return ret; -} - - } // namespace } // namespace diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h new file mode 100644 index 000000000..ad6e58e39 --- /dev/null +++ b/src/video_core/shader/shader_interpreter.h @@ -0,0 +1,19 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include "video_core/pica.h" + +#include "shader.h" + +namespace Pica { + +namespace Shader { + +void RunInterpreter(UnitState& state); + +} // namespace + +} // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp new file mode 100644 index 000000000..ce47774d5 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -0,0 +1,675 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <smmintrin.h> + +#include "common/x64/abi.h" +#include "common/x64/cpu_detect.h" +#include "common/x64/emitter.h" + +#include "shader.h" +#include "shader_jit_x64.h" + +namespace Pica { + +namespace Shader { + +using namespace Gen; + +typedef void (JitCompiler::*JitFunction)(Instruction instr); + +const JitFunction instr_table[64] = { + &JitCompiler::Compile_ADD, // add + &JitCompiler::Compile_DP3, // dp3 + &JitCompiler::Compile_DP4, // dp4 + nullptr, // dph + nullptr, // unknown + nullptr, // ex2 + nullptr, // lg2 + nullptr, // unknown + &JitCompiler::Compile_MUL, // mul + nullptr, // lge + nullptr, // slt + &JitCompiler::Compile_FLR, // flr + &JitCompiler::Compile_MAX, // max + &JitCompiler::Compile_MIN, // min + &JitCompiler::Compile_RCP, // rcp + &JitCompiler::Compile_RSQ, // rsq + nullptr, // unknown + nullptr, // unknown + &JitCompiler::Compile_MOVA, // mova + &JitCompiler::Compile_MOV, // mov + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // dphi + nullptr, // unknown + nullptr, // sgei + &JitCompiler::Compile_SLTI, // slti + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + &JitCompiler::Compile_NOP, // nop + &JitCompiler::Compile_END, // end + nullptr, // break + &JitCompiler::Compile_CALL, // call + &JitCompiler::Compile_CALLC, // callc + &JitCompiler::Compile_CALLU, // callu + 
&JitCompiler::Compile_IF, // ifu + &JitCompiler::Compile_IF, // ifc + &JitCompiler::Compile_LOOP, // loop + nullptr, // emit + nullptr, // sete + &JitCompiler::Compile_JMP, // jmpc + &JitCompiler::Compile_JMP, // jmpu + &JitCompiler::Compile_CMP, // cmp + &JitCompiler::Compile_CMP, // cmp + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad +}; + +// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can +// be used as scratch registers within a compiler function. 
The other registers have designated +// purposes, as documented below: + +/// Pointer to the uniform memory +static const X64Reg UNIFORMS = R9; +/// The two 32-bit VS address offset registers set by the MOVA instruction +static const X64Reg ADDROFFS_REG_0 = R10; +static const X64Reg ADDROFFS_REG_1 = R11; +/// VS loop count register +static const X64Reg LOOPCOUNT_REG = R12; +/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker) +static const X64Reg LOOPCOUNT = RSI; +/// Number to increment LOOPCOUNT_REG by on each loop iteration +static const X64Reg LOOPINC = RDI; +/// Result of the previous CMP instruction for the X-component comparison +static const X64Reg COND0 = R13; +/// Result of the previous CMP instruction for the Y-component comparison +static const X64Reg COND1 = R14; +/// Pointer to the UnitState instance for the current VS unit +static const X64Reg REGISTERS = R15; +/// SIMD scratch register +static const X64Reg SCRATCH = XMM0; +/// Loaded with the first swizzled source register, otherwise can be used as a scratch register +static const X64Reg SRC1 = XMM1; +/// Loaded with the second swizzled source register, otherwise can be used as a scratch register +static const X64Reg SRC2 = XMM2; +/// Loaded with the third swizzled source register, otherwise can be used as a scratch register +static const X64Reg SRC3 = XMM3; +/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one +static const X64Reg ONE = XMM14; +/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR +static const X64Reg NEGBIT = XMM15; + +/// Raw constant for the source register selector that indicates no swizzling is performed +static const u8 NO_SRC_REG_SWIZZLE = 0x1b; +/// Raw constant for the destination register enable mask that indicates all components are enabled +static const u8 NO_DEST_REG_MASK = 0xf; + +/** + * Loads and swizzles a source register into the specified XMM
register. + * @param instr VS instruction, used for determining how to load the source register + * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) + * @param src_reg SourceRegister object corresponding to the source register to load + * @param dest Destination XMM register to store the loaded, swizzled source register + */ +void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) { + X64Reg src_ptr; + int src_offset; + + if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { + src_ptr = UNIFORMS; + src_offset = src_reg.GetIndex() * sizeof(float24) * 4; + } else { + src_ptr = REGISTERS; + src_offset = UnitState::InputOffset(src_reg); + } + + unsigned operand_desc_id; + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || + instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + // The MAD and MADI instructions do not use the address offset registers, so loading the + // source is a bit simpler here + + operand_desc_id = instr.mad.operand_desc_id; + + // Load the source + MOVAPS(dest, MDisp(src_ptr, src_offset)); + } else { + operand_desc_id = instr.common.operand_desc_id; + + const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); + unsigned offset_src = is_inverted ? 
2 : 1; + + if (src_num == offset_src && instr.common.address_register_index != 0) { + switch (instr.common.address_register_index) { + case 1: // address offset 1 + MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_0, 1, src_offset)); + break; + case 2: // address offset 2 + MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_1, 1, src_offset)); + break; + case 3: // address offset 3 (loop counter register) + MOVAPS(dest, MComplex(src_ptr, LOOPCOUNT_REG, 1, src_offset)); + break; + default: + UNREACHABLE(); + break; + } + } else { + // Load the source + MOVAPS(dest, MDisp(src_ptr, src_offset)); + } + } + + SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; + + // Generate instructions for source register swizzling as needed + u8 sel = swiz.GetRawSelector(src_num); + if (sel != NO_SRC_REG_SWIZZLE) { + // Selector component order needs to be reversed for the SHUFPS instruction + sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); + + // Shuffle inputs for swizzle + SHUFPS(dest, R(dest), sel); + } + + // If the source register should be negated, flip the negative bit using XOR + const bool negate[] = { swiz.negate_src1, swiz.negate_src2, swiz.negate_src3 }; + if (negate[src_num - 1]) { + XORPS(dest, R(NEGBIT)); + } +} + +void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { + DestRegister dest; + unsigned operand_desc_id; + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || + instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + operand_desc_id = instr.mad.operand_desc_id; + dest = instr.mad.dest.Value(); + } else { + operand_desc_id = instr.common.operand_desc_id; + dest = instr.common.dest.Value(); + } + + SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; + + // If all components are enabled, write the result to the destination register + if (swiz.dest_mask == NO_DEST_REG_MASK) { + // Store dest back to memory + MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), src); + + } else { + // Not
all components are enabled, so mask the result when storing to the destination register... + MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState::OutputOffset(dest))); + + if (Common::GetCPUCaps().sse4_1) { + u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); + BLENDPS(SCRATCH, R(src), mask); + } else { + MOVAPS(XMM4, R(src)); + UNPCKHPS(XMM4, R(SCRATCH)); // Unpack Z/W components of source and destination + UNPCKLPS(SCRATCH, R(src)); // Unpack X/Y components of source and destination + + // Compute selector to selectively copy source components to destination for SHUFPS instruction + u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | + ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | + ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | + ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); + SHUFPS(SCRATCH, R(XMM4), sel); + } + + // Store dest back to memory + MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), SCRATCH); + } +} + +void JitCompiler::Compile_EvaluateCondition(Instruction instr) { + // Note: NXOR is used below to check for equality + switch (instr.flow_control.op) { + case Instruction::FlowControlType::Or: + MOV(32, R(RAX), R(COND0)); + MOV(32, R(RBX), R(COND1)); + XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); + XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); + OR(32, R(RAX), R(RBX)); + break; + + case Instruction::FlowControlType::And: + MOV(32, R(RAX), R(COND0)); + MOV(32, R(RBX), R(COND1)); + XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); + XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); + AND(32, R(RAX), R(RBX)); + break; + + case Instruction::FlowControlType::JustX: + MOV(32, R(RAX), R(COND0)); + XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); + break; + + case Instruction::FlowControlType::JustY: + MOV(32, R(RAX), R(COND1)); + XOR(32, R(RAX), Imm32(instr.flow_control.refy.Value() ^ 1)); + break; + } +} + +void
JitCompiler::Compile_UniformCondition(Instruction instr) { + int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool)); + CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); +} + +void JitCompiler::Compile_ADD(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + ADDPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_DP3(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + if (Common::GetCPUCaps().sse4_1) { + DPPS(SRC1, R(SRC2), 0x7f); + } else { + MULPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); + + MOVAPS(SRC3, R(SRC1)); + SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); + + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); + ADDPS(SRC1, R(SRC2)); + ADDPS(SRC1, R(SRC3)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_DP4(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + if (Common::GetCPUCaps().sse4_1) { + DPPS(SRC1, R(SRC2), 0xff); + } else { + MULPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + ADDPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + ADDPS(SRC1, R(SRC2)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MUL(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + MULPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_FLR(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + if (Common::GetCPUCaps().sse4_1) { + ROUNDFLOORPS(SRC1, R(SRC1)); + } 
else { + CVTPS2DQ(SRC1, R(SRC1)); + CVTDQ2PS(SRC1, R(SRC1)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MAX(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + MAXPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MIN(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + MINPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MOVA(Instruction instr) { + SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] }; + + if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { + return; // NoOp + } + + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // Convert floats to integers (only care about X and Y components) + CVTPS2DQ(SRC1, R(SRC1)); + + // Get result + MOVQ_xmm(R(RAX), SRC1); + + // Handle destination enable + if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { + // Move and sign-extend low 32 bits + MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); + + // Move and sign-extend high 32 bits + SHR(64, R(RAX), Imm8(32)); + MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); + + // Multiply by 16 to be used as an offset later + SHL(64, R(ADDROFFS_REG_0), Imm8(4)); + SHL(64, R(ADDROFFS_REG_1), Imm8(4)); + } else { + if (swiz.DestComponentEnabled(0)) { + // Move and sign-extend low 32 bits + MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); + + // Multiply by 16 to be used as an offset later + SHL(64, R(ADDROFFS_REG_0), Imm8(4)); + } else if (swiz.DestComponentEnabled(1)) { + // Move and sign-extend high 32 bits + SHR(64, R(RAX), Imm8(32)); + MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); + + // Multiply by 16 to be used as an offset later + SHL(64, R(ADDROFFS_REG_1), Imm8(4)); + } + } +} + +void JitCompiler::Compile_MOV(Instruction instr) { + Compile_SwizzleSrc(instr, 1, 
instr.common.src1, SRC1); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_SLTI(Instruction instr) { + // Each source must be loaded with its own source number so that it gets its own swizzle + // selector, negate flag and (for the offset-capable operand) address register offset + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + + // SLTI is evaluated per component, so compare all four lanes (CMPSS only compares the X lane) + CMPPS(SRC1, R(SRC2), CMP_LT); + // Turn the all-ones/all-zeros compare mask into 1.0f/0.0f + ANDPS(SRC1, R(ONE)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_RCP(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica + // performs this operation more accurately. This should be checked on hardware. + RCPPS(SRC1, R(SRC1)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_RSQ(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica + // performs this operation more accurately. This should be checked on hardware. + RSQRTPS(SRC1, R(SRC1)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_NOP(Instruction instr) { +} + +void JitCompiler::Compile_END(Instruction instr) { + ABI_PopAllCalleeSavedRegsAndAdjustStack(); + RET(); +} + +void JitCompiler::Compile_CALL(Instruction instr) { + unsigned offset = instr.flow_control.dest_offset; + while (offset < (instr.flow_control.dest_offset + instr.flow_control.num_instructions)) { + Compile_NextInstr(&offset); + } +} + +void JitCompiler::Compile_CALLC(Instruction instr) { + Compile_EvaluateCondition(instr); + FixupBranch b = J_CC(CC_Z, true); + Compile_CALL(instr); + SetJumpTarget(b); +} + +void JitCompiler::Compile_CALLU(Instruction instr) { + Compile_UniformCondition(instr); + FixupBranch b = J_CC(CC_Z, true); + Compile_CALL(instr); + SetJumpTarget(b); +} + +void JitCompiler::Compile_CMP(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + static const u8 cmp[] = { CMP_EQ,
CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; + + if (instr.common.compare_op.x == instr.common.compare_op.y) { + // Compare X-component and Y-component together + CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); + + MOVQ_xmm(R(COND0), SRC1); + MOV(64, R(COND1), R(COND0)); + } else { + // Compare X-component + MOVAPS(SCRATCH, R(SRC1)); + CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); + + // Compare Y-component + CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); + + MOVQ_xmm(R(COND0), SCRATCH); + MOVQ_xmm(R(COND1), SRC1); + } + + SHR(32, R(COND0), Imm8(31)); + SHR(64, R(COND1), Imm8(63)); +} + +void JitCompiler::Compile_MAD(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); + + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); + Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); + } else { + Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); + Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); + } + + if (Common::GetCPUCaps().fma) { + VFMADD213PS(SRC1, SRC2, R(SRC3)); + } else { + MULPS(SRC1, R(SRC2)); + ADDPS(SRC1, R(SRC3)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_IF(Instruction instr) { + ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements not supported"); + + // Evaluate the "IF" condition + if (instr.opcode.Value() == OpCode::Id::IFU) { + Compile_UniformCondition(instr); + } else if (instr.opcode.Value() == OpCode::Id::IFC) { + Compile_EvaluateCondition(instr); + } + FixupBranch b = J_CC(CC_Z, true); + + // Compile the code that corresponds to the condition evaluating as true + Compile_Block(instr.flow_control.dest_offset - 1); + + // If there isn't an "ELSE" condition, we are done here + if (instr.flow_control.num_instructions == 0) { + SetJumpTarget(b); + return; + } + + FixupBranch b2 = J(true); + + SetJumpTarget(b); + + // This code corresponds to the "ELSE" condition + // Comple 
the code that corresponds to the condition evaluating as false + Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions - 1); + + SetJumpTarget(b2); +} + +void JitCompiler::Compile_LOOP(Instruction instr) { + ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops not supported"); + ASSERT_MSG(!looping, "Nested loops not supported"); + + looping = true; + + int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>)); + MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset)); + MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT)); + SHR(32, R(LOOPCOUNT_REG), Imm8(4)); + AND(32, R(LOOPCOUNT_REG), Imm32(0xff0)); // Y-component is the start, multiplied by 16 to be used as a byte offset + MOV(32, R(LOOPINC), R(LOOPCOUNT)); + SHR(32, R(LOOPINC), Imm8(12)); + AND(32, R(LOOPINC), Imm32(0xff0)); // Z-component is the incrementer, multiplied by 16 to match LOOPCOUNT_REG + MOVZX(32, 8, LOOPCOUNT, R(LOOPCOUNT)); // X-component is iteration count + ADD(32, R(LOOPCOUNT), Imm8(1)); // Iteration count is X-component + 1 + + auto loop_start = GetCodePtr(); + + Compile_Block(instr.flow_control.dest_offset); + + ADD(32, R(LOOPCOUNT_REG), R(LOOPINC)); // Increment LOOPCOUNT_REG by Z-component + SUB(32, R(LOOPCOUNT), Imm8(1)); // Decrement the remaining iteration count by 1 + J_CC(CC_NZ, loop_start); // Loop again while iterations remain + + looping = false; +} + +void JitCompiler::Compile_JMP(Instruction instr) { + ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps not supported"); + + if (instr.opcode.Value() == OpCode::Id::JMPC) + Compile_EvaluateCondition(instr); + else if (instr.opcode.Value() == OpCode::Id::JMPU) + Compile_UniformCondition(instr); + else + UNREACHABLE(); + + FixupBranch b = J_CC(CC_NZ, true); + + // Compile_Block's stop offset is inclusive, so stop just before the jump target: the + // instruction at dest_offset must still be executed when the jump is taken + Compile_Block(instr.flow_control.dest_offset - 1); + + SetJumpTarget(b); +} + +void JitCompiler::Compile_Block(unsigned stop) { + // Save current offset pointer + unsigned* prev_offset_ptr = offset_ptr; + unsigned offset = *prev_offset_ptr; + + while (offset <= stop) +
Compile_NextInstr(&offset); + + // Restore current offset pointer + offset_ptr = prev_offset_ptr; + *offset_ptr = offset; +} + +void JitCompiler::Compile_NextInstr(unsigned* offset) { + offset_ptr = offset; + + Instruction instr = *(Instruction*)&g_state.vs.program_code[(*offset_ptr)++]; + OpCode::Id opcode = instr.opcode.Value(); + auto instr_func = instr_table[static_cast<unsigned>(opcode)]; + + if (instr_func) { + // JIT the instruction! + ((*this).*instr_func)(instr); + } else { + // Unhandled instruction + LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", instr.opcode.Value(), instr.hex); + } +} + +CompiledShader* JitCompiler::Compile() { + const u8* start = GetCodePtr(); + const auto& code = g_state.vs.program_code; + unsigned offset = g_state.regs.vs.main_offset; + + ABI_PushAllCalleeSavedRegsAndAdjustStack(); + + MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); + MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); + + // Zero address/loop registers + XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0)); + XOR(64, R(ADDROFFS_REG_1), R(ADDROFFS_REG_1)); + XOR(64, R(LOOPCOUNT_REG), R(LOOPCOUNT_REG)); + + // Used to set a register to one + static const __m128 one = { 1.f, 1.f, 1.f, 1.f }; + MOV(PTRBITS, R(RAX), ImmPtr(&one)); + MOVAPS(ONE, MDisp(RAX, 0)); + + // Used to negate registers + static const __m128 neg = { -0.f, -0.f, -0.f, -0.f }; + MOV(PTRBITS, R(RAX), ImmPtr(&neg)); + MOVAPS(NEGBIT, MDisp(RAX, 0)); + + looping = false; + + while (offset < g_state.vs.program_code.size()) { + Compile_NextInstr(&offset); + } + + return (CompiledShader*)start; +} + +JitCompiler::JitCompiler() { + AllocCodeSpace(1024 * 1024 * 4); +} + +void JitCompiler::Clear() { + ClearCodeSpace(); +} + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h new file mode 100644 index 000000000..b88f2a0d2 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64.h @@ -0,0 +1,79 @@ +// Copyright 2015 
Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <nihstro/shader_bytecode.h> + +#include "common/x64/emitter.h" + +#include "video_core/pica.h" + +#include "shader.h" + +using nihstro::Instruction; +using nihstro::OpCode; +using nihstro::SwizzlePattern; + +namespace Pica { + +namespace Shader { + +using CompiledShader = void(void* registers); + +/** + * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 + * code that can be executed on the host machine directly. + */ +class JitCompiler : public Gen::XCodeBlock { +public: + JitCompiler(); + + CompiledShader* Compile(); + + void Clear(); + + void Compile_ADD(Instruction instr); + void Compile_DP3(Instruction instr); + void Compile_DP4(Instruction instr); + void Compile_MUL(Instruction instr); + void Compile_FLR(Instruction instr); + void Compile_MAX(Instruction instr); + void Compile_MIN(Instruction instr); + void Compile_RCP(Instruction instr); + void Compile_RSQ(Instruction instr); + void Compile_MOVA(Instruction instr); + void Compile_MOV(Instruction instr); + void Compile_SLTI(Instruction instr); + void Compile_NOP(Instruction instr); + void Compile_END(Instruction instr); + void Compile_CALL(Instruction instr); + void Compile_CALLC(Instruction instr); + void Compile_CALLU(Instruction instr); + void Compile_IF(Instruction instr); + void Compile_LOOP(Instruction instr); + void Compile_JMP(Instruction instr); + void Compile_CMP(Instruction instr); + void Compile_MAD(Instruction instr); + +private: + void Compile_Block(unsigned stop); + void Compile_NextInstr(unsigned* offset); + + void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); + void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); + + void Compile_EvaluateCondition(Instruction instr); + void Compile_UniformCondition(Instruction instr); + + /// Pointer to the variable 
that stores the current Pica code offset. Used to handle nested code blocks. + unsigned* offset_ptr = nullptr; + + /// Set to true if currently in a loop, used to check for the existence of nested loops + bool looping = false; +}; + +} // Shader + +} // Pica diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h deleted file mode 100644 index 97f9250dd..000000000 --- a/src/video_core/vertex_shader.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <type_traits> - -#include "common/vector_math.h" - -#include "pica.h" - -namespace Pica { - -namespace VertexShader { - -struct InputVertex { - Math::Vec4<float24> attr[16]; -}; - -struct OutputVertex { - OutputVertex() = default; - - // VS output attributes - Math::Vec4<float24> pos; - Math::Vec4<float24> dummy; // quaternions (not implemented, yet) - Math::Vec4<float24> color; - Math::Vec2<float24> tc0; - Math::Vec2<float24> tc1; - float24 pad[6]; - Math::Vec2<float24> tc2; - - // Padding for optimal alignment - float24 pad2[4]; - - // Attributes used to store intermediate results - - // position after perspective divide - Math::Vec3<float24> screenpos; - float24 pad3; - - // Linear interpolation - // factor: 0=this, 1=vtx - void Lerp(float24 factor, const OutputVertex& vtx) { - pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); - - // TODO: Should perform perspective correct interpolation here... 
- tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); - tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); - tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); - - screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); - - color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); - } - - // Linear interpolation - // factor: 0=v0, 1=v1 - static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { - OutputVertex ret = v0; - ret.Lerp(factor, v1); - return ret; - } -}; -static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); -static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); - -OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup); - -} // namespace - -} // namespace - diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index 3becc4261..943fde5ee 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -23,6 +23,7 @@ EmuWindow* g_emu_window = nullptr; ///< Frontend emulator window RendererBase* g_renderer = nullptr; ///< Renderer plugin std::atomic<bool> g_hw_renderer_enabled; +std::atomic<bool> g_shader_jit_enabled; /// Initialize the video core void Init(EmuWindow* emu_window) { diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h index 14b33c9dd..2867bf03e 100644 --- a/src/video_core/video_core.h +++ b/src/video_core/video_core.h @@ -32,8 +32,9 @@ static const int kScreenBottomHeight = 240; ///< 3DS bottom screen height extern RendererBase* g_renderer; ///< Renderer plugin extern EmuWindow* g_emu_window; ///< Emu window -// TODO: Wrap this in a user settings struct along with any other graphics settings (often set from qt ui) +// TODO: Wrap these in a user settings struct along with any other graphics settings (often set from qt ui) extern 
std::atomic<bool> g_hw_renderer_enabled; +extern std::atomic<bool> g_shader_jit_enabled; /// Start the video core void Start(); |