diff options
Diffstat (limited to '')
-rw-r--r-- | src/audio_core/CMakeLists.txt | 2 | ||||
-rw-r--r-- | src/audio_core/hle/dsp.cpp | 16 | ||||
-rw-r--r-- | src/audio_core/time_stretch.cpp | 144 | ||||
-rw-r--r-- | src/audio_core/time_stretch.h | 57 | ||||
-rw-r--r-- | src/core/hle/kernel/memory.cpp | 3 | ||||
-rw-r--r-- | src/core/hle/service/dsp_dsp.cpp | 4 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 26 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.h | 11 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_state.cpp | 7 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_state.h | 2 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/pica_to_gl.h | 20 | ||||
-rw-r--r-- | src/video_core/shader/shader.cpp | 9 | ||||
-rw-r--r-- | src/video_core/shader/shader.h | 25 | ||||
-rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 8 | ||||
-rw-r--r-- | src/video_core/shader/shader_interpreter.h | 2 | ||||
-rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 32 | ||||
-rw-r--r-- | src/video_core/shader/shader_jit_x64.h | 6 |
17 files changed, 329 insertions, 45 deletions
diff --git a/src/audio_core/CMakeLists.txt b/src/audio_core/CMakeLists.txt index 13b5e400e..eba0a5697 100644 --- a/src/audio_core/CMakeLists.txt +++ b/src/audio_core/CMakeLists.txt @@ -7,6 +7,7 @@ set(SRCS hle/source.cpp interpolate.cpp sink_details.cpp + time_stretch.cpp ) set(HEADERS @@ -21,6 +22,7 @@ set(HEADERS null_sink.h sink.h sink_details.h + time_stretch.h ) include_directories(../../externals/soundtouch/include) diff --git a/src/audio_core/hle/dsp.cpp b/src/audio_core/hle/dsp.cpp index 0cdbdb06a..5113ad8ca 100644 --- a/src/audio_core/hle/dsp.cpp +++ b/src/audio_core/hle/dsp.cpp @@ -9,6 +9,7 @@ #include "audio_core/hle/pipe.h" #include "audio_core/hle/source.h" #include "audio_core/sink.h" +#include "audio_core/time_stretch.h" namespace DSP { namespace HLE { @@ -48,15 +49,29 @@ static std::array<Source, num_sources> sources = { }; static std::unique_ptr<AudioCore::Sink> sink; +static AudioCore::TimeStretcher time_stretcher; void Init() { DSP::HLE::ResetPipes(); + for (auto& source : sources) { source.Reset(); } + + time_stretcher.Reset(); + if (sink) { + time_stretcher.SetOutputSampleRate(sink->GetNativeSampleRate()); + } } void Shutdown() { + time_stretcher.Flush(); + while (true) { + std::vector<s16> residual_audio = time_stretcher.Process(sink->SamplesInQueue()); + if (residual_audio.empty()) + break; + sink->EnqueueSamples(residual_audio); + } } bool Tick() { @@ -77,6 +92,7 @@ bool Tick() { void SetSink(std::unique_ptr<AudioCore::Sink> sink_) { sink = std::move(sink_); + time_stretcher.SetOutputSampleRate(sink->GetNativeSampleRate()); } } // namespace HLE diff --git a/src/audio_core/time_stretch.cpp b/src/audio_core/time_stretch.cpp new file mode 100644 index 000000000..ea38f40d0 --- /dev/null +++ b/src/audio_core/time_stretch.cpp @@ -0,0 +1,144 @@ +// Copyright 2016 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <chrono> +#include <cmath> +#include <vector> + +#include <SoundTouch.h> + +#include "audio_core/audio_core.h" +#include "audio_core/time_stretch.h" + +#include "common/common_types.h" +#include "common/logging/log.h" +#include "common/math_util.h" + +using steady_clock = std::chrono::steady_clock; + +namespace AudioCore { + +constexpr double MIN_RATIO = 0.1; +constexpr double MAX_RATIO = 100.0; + +static double ClampRatio(double ratio) { + return MathUtil::Clamp(ratio, MIN_RATIO, MAX_RATIO); +} + +constexpr double MIN_DELAY_TIME = 0.05; // Units: seconds +constexpr double MAX_DELAY_TIME = 0.25; // Units: seconds +constexpr size_t DROP_FRAMES_SAMPLE_DELAY = 16000; // Units: samples + +constexpr double SMOOTHING_FACTOR = 0.007; + +struct TimeStretcher::Impl { + soundtouch::SoundTouch soundtouch; + + steady_clock::time_point frame_timer = steady_clock::now(); + size_t samples_queued = 0; + + double smoothed_ratio = 1.0; + + double sample_rate = static_cast<double>(native_sample_rate); +}; + +std::vector<s16> TimeStretcher::Process(size_t samples_in_queue) { + // This is a very simple algorithm without any fancy control theory. It works and is stable. + + double ratio = CalculateCurrentRatio(); + ratio = CorrectForUnderAndOverflow(ratio, samples_in_queue); + impl->smoothed_ratio = (1.0 - SMOOTHING_FACTOR) * impl->smoothed_ratio + SMOOTHING_FACTOR * ratio; + impl->smoothed_ratio = ClampRatio(impl->smoothed_ratio); + + // SoundTouch's tempo definition is the inverse of our ratio definition. 
+ impl->soundtouch.setTempo(1.0 / impl->smoothed_ratio); + + std::vector<s16> samples = GetSamples(); + if (samples_in_queue >= DROP_FRAMES_SAMPLE_DELAY) { + samples.clear(); + LOG_DEBUG(Audio, "Dropping frames!"); + } + return samples; +} + +TimeStretcher::TimeStretcher() : impl(std::make_unique<Impl>()) { + impl->soundtouch.setPitch(1.0); + impl->soundtouch.setChannels(2); + impl->soundtouch.setSampleRate(native_sample_rate); + Reset(); +} + +TimeStretcher::~TimeStretcher() { + impl->soundtouch.clear(); +} + +void TimeStretcher::SetOutputSampleRate(unsigned int sample_rate) { + impl->sample_rate = static_cast<double>(sample_rate); + impl->soundtouch.setRate(static_cast<double>(native_sample_rate) / impl->sample_rate); +} + +void TimeStretcher::AddSamples(const s16* buffer, size_t num_samples) { + impl->soundtouch.putSamples(buffer, static_cast<uint>(num_samples)); + impl->samples_queued += num_samples; +} + +void TimeStretcher::Flush() { + impl->soundtouch.flush(); +} + +void TimeStretcher::Reset() { + impl->soundtouch.setTempo(1.0); + impl->soundtouch.clear(); + impl->smoothed_ratio = 1.0; + impl->frame_timer = steady_clock::now(); + impl->samples_queued = 0; + SetOutputSampleRate(native_sample_rate); +} + +double TimeStretcher::CalculateCurrentRatio() { + const steady_clock::time_point now = steady_clock::now(); + const std::chrono::duration<double> duration = now - impl->frame_timer; + + const double expected_time = static_cast<double>(impl->samples_queued) / static_cast<double>(native_sample_rate); + const double actual_time = duration.count(); + + double ratio; + if (expected_time != 0) { + ratio = ClampRatio(actual_time / expected_time); + } else { + ratio = impl->smoothed_ratio; + } + + impl->frame_timer = now; + impl->samples_queued = 0; + + return ratio; +} + +double TimeStretcher::CorrectForUnderAndOverflow(double ratio, size_t sample_delay) const { + const size_t min_sample_delay = static_cast<size_t>(MIN_DELAY_TIME * impl->sample_rate); + const size_t 
max_sample_delay = static_cast<size_t>(MAX_DELAY_TIME * impl->sample_rate); + + if (sample_delay < min_sample_delay) { + // Make the ratio bigger. + ratio = ratio > 1.0 ? ratio * ratio : sqrt(ratio); + } else if (sample_delay > max_sample_delay) { + // Make the ratio smaller. + ratio = ratio > 1.0 ? sqrt(ratio) : ratio * ratio; + } + + return ClampRatio(ratio); +} + +std::vector<s16> TimeStretcher::GetSamples() { + uint available = impl->soundtouch.numSamples(); + + std::vector<s16> output(static_cast<size_t>(available) * 2); + + impl->soundtouch.receiveSamples(output.data(), available); + + return output; +} + +} // namespace AudioCore diff --git a/src/audio_core/time_stretch.h b/src/audio_core/time_stretch.h new file mode 100644 index 000000000..1fde3f72a --- /dev/null +++ b/src/audio_core/time_stretch.h @@ -0,0 +1,57 @@ +// Copyright 2016 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstddef> +#include <memory> +#include <vector> + +#include "common/common_types.h" + +namespace AudioCore { + +class TimeStretcher final { +public: + TimeStretcher(); + ~TimeStretcher(); + + /** + * Set sample rate for the samples that Process returns. + * @param sample_rate The sample rate. + */ + void SetOutputSampleRate(unsigned int sample_rate); + + /** + * Add samples to be processed. + * @param sample_buffer Buffer of samples in interleaved stereo PCM16 format. + * @param num_samples Number of samples. + */ + void AddSamples(const s16* sample_buffer, size_t num_samples); + + /// Flush audio remaining in internal buffers. + void Flush(); + + /// Resets internal state and clears buffers. + void Reset(); + + /** + * Does audio stretching and produces the time-stretched samples. + * Timer calculations use sample_delay to determine how much of a margin we have. + * @param sample_delay How many samples are buffered downstream of this module and haven't been played yet. 
+ * @return Samples to play in interleaved stereo PCM16 format. + */ + std::vector<s16> Process(size_t sample_delay); + +private: + struct Impl; + std::unique_ptr<Impl> impl; + + /// INTERNAL: ratio = wallclock time / emulated time + double CalculateCurrentRatio(); + /// INTERNAL: If we have too many or too few samples downstream, nudge ratio in the appropriate direction. + double CorrectForUnderAndOverflow(double ratio, size_t sample_delay) const; + /// INTERNAL: Gets the time-stretched samples from SoundTouch. + std::vector<s16> GetSamples(); +}; + +} // namespace AudioCore diff --git a/src/core/hle/kernel/memory.cpp b/src/core/hle/kernel/memory.cpp index 4be20db22..17ae87aef 100644 --- a/src/core/hle/kernel/memory.cpp +++ b/src/core/hle/kernel/memory.cpp @@ -55,6 +55,9 @@ void MemoryInit(u32 mem_type) { memory_regions[i].size = memory_region_sizes[mem_type][i]; memory_regions[i].used = 0; memory_regions[i].linear_heap_memory = std::make_shared<std::vector<u8>>(); + // Reserve enough space for this region of FCRAM. + // We do not want this block of memory to be relocated when allocating from it. + memory_regions[i].linear_heap_memory->reserve(memory_regions[i].size); base += memory_regions[i].size; } diff --git a/src/core/hle/service/dsp_dsp.cpp b/src/core/hle/service/dsp_dsp.cpp index 274fc751a..10730d7ac 100644 --- a/src/core/hle/service/dsp_dsp.cpp +++ b/src/core/hle/service/dsp_dsp.cpp @@ -440,9 +440,9 @@ static void GetHeadphoneStatus(Service::Interface* self) { cmd_buff[0] = IPC::MakeHeader(0x1F, 2, 0); cmd_buff[1] = RESULT_SUCCESS.raw; // No error - cmd_buff[2] = 0; // Not using headphones? 
+ cmd_buff[2] = 0; // Not using headphones - LOG_WARNING(Service_DSP, "(STUBBED) called"); + LOG_DEBUG(Service_DSP, "called"); } /** diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index ed2e2f3ae..bcd1ae78d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -104,7 +104,6 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) { // Sync fixed function OpenGL state SyncCullMode(); - SyncDepthModifiers(); SyncBlendEnabled(); SyncBlendFuncs(); SyncBlendColor(); @@ -259,8 +258,10 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) { // Depth modifiers case PICA_REG_INDEX(viewport_depth_range): + SyncDepthScale(); + break; case PICA_REG_INDEX(viewport_depth_near_plane): - SyncDepthModifiers(); + SyncDepthOffset(); break; // Depth buffering @@ -880,6 +881,8 @@ void RasterizerOpenGL::SetShader() { glUniformBlockBinding(current_shader->shader.handle, block_index, 0); // Update uniforms + SyncDepthScale(); + SyncDepthOffset(); SyncAlphaTest(); SyncCombinerColor(); auto& tev_stages = Pica::g_state.regs.GetTevStages(); @@ -922,13 +925,20 @@ void RasterizerOpenGL::SyncCullMode() { } } -void RasterizerOpenGL::SyncDepthModifiers() { +void RasterizerOpenGL::SyncDepthScale() { float depth_scale = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32(); - float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_near_plane).ToFloat32(); + if (depth_scale != uniform_block_data.data.depth_scale) { + uniform_block_data.data.depth_scale = depth_scale; + uniform_block_data.dirty = true; + } +} - uniform_block_data.data.depth_scale = depth_scale; - uniform_block_data.data.depth_offset = depth_offset; - uniform_block_data.dirty = true; +void RasterizerOpenGL::SyncDepthOffset() { + float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_near_plane).ToFloat32(); + if (depth_offset != 
uniform_block_data.data.depth_offset) { + uniform_block_data.data.depth_offset = depth_offset; + uniform_block_data.dirty = true; + } } void RasterizerOpenGL::SyncBlendEnabled() { @@ -937,6 +947,8 @@ void RasterizerOpenGL::SyncBlendEnabled() { void RasterizerOpenGL::SyncBlendFuncs() { const auto& regs = Pica::g_state.regs; + state.blend.rgb_equation = PicaToGL::BlendEquation(regs.output_merger.alpha_blending.blend_equation_rgb); + state.blend.a_equation = PicaToGL::BlendEquation(regs.output_merger.alpha_blending.blend_equation_a); state.blend.src_rgb_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_source_rgb); state.blend.dst_rgb_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_dest_rgb); state.blend.src_a_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_source_a); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index eed00011a..d70369400 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -339,8 +339,11 @@ private: /// Syncs the cull mode to match the PICA register void SyncCullMode(); - /// Syncs the depth scale and offset to match the PICA registers - void SyncDepthModifiers(); + /// Syncs the depth scale to match the PICA register + void SyncDepthScale(); + + /// Syncs the depth offset to match the PICA register + void SyncDepthOffset(); /// Syncs the blend enabled status to match the PICA register void SyncBlendEnabled(); @@ -413,7 +416,7 @@ private: UniformData data; bool lut_dirty[6]; bool dirty; - } uniform_block_data; + } uniform_block_data = {}; std::array<SamplerInfo, 3> texture_samplers; OGLVertexArray vertex_array; @@ -422,5 +425,5 @@ private: OGLFramebuffer framebuffer; std::array<OGLTexture, 6> lighting_luts; - std::array<std::array<GLvec4, 256>, 6> lighting_lut_data; + std::array<std::array<GLvec4, 256>, 6> lighting_lut_data{}; }; diff --git 
a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 02cd9f417..fa141fc9a 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -36,6 +36,8 @@ OpenGLState::OpenGLState() { stencil.action_stencil_fail = GL_KEEP; blend.enabled = false; + blend.rgb_equation = GL_FUNC_ADD; + blend.a_equation = GL_FUNC_ADD; blend.src_rgb_func = GL_ONE; blend.dst_rgb_func = GL_ZERO; blend.src_a_func = GL_ONE; @@ -165,6 +167,11 @@ void OpenGLState::Apply() const { blend.src_a_func, blend.dst_a_func); } + if (blend.rgb_equation != cur_state.blend.rgb_equation || + blend.a_equation != cur_state.blend.a_equation) { + glBlendEquationSeparate(blend.rgb_equation, blend.a_equation); + } + if (logic_op != cur_state.logic_op) { glLogicOp(logic_op); } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 24f20e47c..228727054 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -40,6 +40,8 @@ public: struct { bool enabled; // GL_BLEND + GLenum rgb_equation; // GL_BLEND_EQUATION_RGB + GLenum a_equation; // GL_BLEND_EQUATION_ALPHA GLenum src_rgb_func; // GL_BLEND_SRC_RGB GLenum dst_rgb_func; // GL_BLEND_DST_RGB GLenum src_a_func; // GL_BLEND_SRC_ALPHA diff --git a/src/video_core/renderer_opengl/pica_to_gl.h b/src/video_core/renderer_opengl/pica_to_gl.h index 976d1f364..6dc2758c5 100644 --- a/src/video_core/renderer_opengl/pica_to_gl.h +++ b/src/video_core/renderer_opengl/pica_to_gl.h @@ -78,6 +78,26 @@ inline GLenum WrapMode(Pica::Regs::TextureConfig::WrapMode mode) { return gl_mode; } +inline GLenum BlendEquation(Pica::Regs::BlendEquation equation) { + static const GLenum blend_equation_table[] = { + GL_FUNC_ADD, // BlendEquation::Add + GL_FUNC_SUBTRACT, // BlendEquation::Subtract + GL_FUNC_REVERSE_SUBTRACT, // BlendEquation::ReverseSubtract + GL_MIN, // BlendEquation::Min + GL_MAX, // 
BlendEquation::Max + }; + + // Range check table for input + if (static_cast<size_t>(equation) >= ARRAY_SIZE(blend_equation_table)) { + LOG_CRITICAL(Render_OpenGL, "Unknown blend equation %d", equation); + UNREACHABLE(); + + return GL_FUNC_ADD; + } + + return blend_equation_table[(unsigned)equation]; +} + inline GLenum BlendFunc(Pica::Regs::BlendFactor factor) { static const GLenum blend_func_table[] = { GL_ZERO, // BlendFactor::Zero diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index e93a9d92a..161097610 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -64,6 +64,7 @@ MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, int num_attributes) { auto& config = g_state.regs.vs; + auto& setup = g_state.vs; MICROPROFILE_SCOPE(GPU_Shader); @@ -81,11 +82,11 @@ OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, #ifdef ARCHITECTURE_x86_64 if (VideoCore::g_shader_jit_enabled) - jit_shader->Run(&state.registers, g_state.regs.vs.main_offset); + jit_shader->Run(setup, state, config.main_offset); else - RunInterpreter(state); + RunInterpreter(setup, state, config.main_offset); #else - RunInterpreter(state); + RunInterpreter(setup, state, config.main_offset); #endif // ARCHITECTURE_x86_64 // Setup output data @@ -156,7 +157,7 @@ DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_ state.conditional_code[0] = false; state.conditional_code[1] = false; - RunInterpreter(state); + RunInterpreter(setup, state, config.main_offset); return state.debug; } diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 983e4a967..84898f21c 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -283,10 +283,10 @@ struct UnitState { static size_t InputOffset(const SourceRegister& reg) { switch (reg.GetRegisterType()) { 
case RegisterType::Input: - return offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + return offsetof(UnitState, registers.input) + reg.GetIndex()*sizeof(Math::Vec4<float24>); case RegisterType::Temporary: - return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); default: UNREACHABLE(); @@ -297,10 +297,10 @@ struct UnitState { static size_t OutputOffset(const DestRegister& reg) { switch (reg.GetRegisterType()) { case RegisterType::Output: - return offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + return offsetof(UnitState, registers.output) + reg.GetIndex()*sizeof(Math::Vec4<float24>); case RegisterType::Temporary: - return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); default: UNREACHABLE(); @@ -323,6 +323,23 @@ struct ShaderSetup { std::array<Math::Vec4<u8>, 4> i; } uniforms; + static size_t UniformOffset(RegisterType type, unsigned index) { + switch (type) { + case RegisterType::FloatUniform: + return offsetof(ShaderSetup, uniforms.f) + index*sizeof(Math::Vec4<float24>); + + case RegisterType::BoolUniform: + return offsetof(ShaderSetup, uniforms.b) + index*sizeof(bool); + + case RegisterType::IntUniform: + return offsetof(ShaderSetup, uniforms.i) + index*sizeof(Math::Vec4<u8>); + + default: + UNREACHABLE(); + return 0; + } + } + std::array<u32, 1024> program_code; std::array<u32, 1024> swizzle_data; diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 3a827d11f..714e8bfd5 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -41,11 +41,11 @@ struct CallStackElement { }; template<bool Debug> -void 
RunInterpreter(UnitState<Debug>& state) { +void RunInterpreter(const ShaderSetup& setup, UnitState<Debug>& state, unsigned offset) { // TODO: Is there a maximal size for this? boost::container::static_vector<CallStackElement, 16> call_stack; - u32 program_counter = g_state.regs.vs.main_offset; + u32 program_counter = offset; const auto& uniforms = g_state.vs.uniforms; const auto& swizzle_data = g_state.vs.swizzle_data; @@ -647,8 +647,8 @@ void RunInterpreter(UnitState<Debug>& state) { } // Explicit instantiation -template void RunInterpreter(UnitState<false>& state); -template void RunInterpreter(UnitState<true>& state); +template void RunInterpreter(const ShaderSetup& setup, UnitState<false>& state, unsigned offset); +template void RunInterpreter(const ShaderSetup& setup, UnitState<true>& state, unsigned offset); } // namespace diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index 6048cdf3a..bb3ce1c6e 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h @@ -11,7 +11,7 @@ namespace Shader { template <bool Debug> struct UnitState; template<bool Debug> -void RunInterpreter(UnitState<Debug>& state); +void RunInterpreter(const ShaderSetup& setup, UnitState<Debug>& state, unsigned offset); } // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 99f6c51eb..43e7e6b4c 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -102,7 +102,7 @@ const JitFunction instr_table[64] = { // purposes, as documented below: /// Pointer to the uniform memory -static const X64Reg UNIFORMS = R9; +static const X64Reg SETUP = R9; /// The two 32-bit VS address offset registers set by the MOVA instruction static const X64Reg ADDROFFS_REG_0 = R10; static const X64Reg ADDROFFS_REG_1 = R11; @@ -117,7 +117,7 @@ static const X64Reg COND0 = R13; /// Result of the previous CMP instruction 
for the Y-component comparison static const X64Reg COND1 = R14; /// Pointer to the UnitState instance for the current VS unit -static const X64Reg REGISTERS = R15; +static const X64Reg STATE = R15; /// SIMD scratch register static const X64Reg SCRATCH = XMM0; /// Loaded with the first swizzled source register, otherwise can be used as a scratch register @@ -136,7 +136,7 @@ static const X64Reg NEGBIT = XMM15; // State registers that must not be modified by external functions calls // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed static const BitSet32 persistent_regs = { - UNIFORMS, REGISTERS, // Pointers to register blocks + SETUP, STATE, // Pointers to register blocks ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers ONE+16, NEGBIT+16, // Constants }; @@ -177,10 +177,10 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe size_t src_offset; if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { - src_ptr = UNIFORMS; - src_offset = src_reg.GetIndex() * sizeof(float24) * 4; + src_ptr = SETUP; + src_offset = ShaderSetup::UniformOffset(RegisterType::FloatUniform, src_reg.GetIndex()); } else { - src_ptr = REGISTERS; + src_ptr = STATE; src_offset = UnitState<false>::InputOffset(src_reg); } @@ -264,11 +264,11 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { // If all components are enabled, write the result to the destination register if (swiz.dest_mask == NO_DEST_REG_MASK) { // Store dest back to memory - MOVAPS(MDisp(REGISTERS, dest_offset_disp), src); + MOVAPS(MDisp(STATE, dest_offset_disp), src); } else { // Not all components are enabled, so mask the result when storing to the destination register... 
- MOVAPS(SCRATCH, MDisp(REGISTERS, dest_offset_disp)); + MOVAPS(SCRATCH, MDisp(STATE, dest_offset_disp)); if (Common::GetCPUCaps().sse4_1) { u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); @@ -287,7 +287,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { } // Store dest back to memory - MOVAPS(MDisp(REGISTERS, dest_offset_disp), SCRATCH); + MOVAPS(MDisp(STATE, dest_offset_disp), SCRATCH); } } @@ -336,8 +336,8 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) { } void JitShader::Compile_UniformCondition(Instruction instr) { - int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool)); - CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); + int offset = ShaderSetup::UniformOffset(RegisterType::BoolUniform, instr.flow_control.bool_uniform_id); + CMP(sizeof(bool) * 8, MDisp(SETUP, offset), Imm8(0)); } BitSet32 JitShader::PersistentCallerSavedRegs() { @@ -714,8 +714,8 @@ void JitShader::Compile_LOOP(Instruction instr) { looping = true; - int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>)); - MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset)); + int offset = ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id); + MOV(32, R(LOOPCOUNT), MDisp(SETUP, offset)); MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT)); SHR(32, R(LOOPCOUNT_REG), Imm8(8)); AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start @@ -826,8 +826,8 @@ void JitShader::Compile() { // The stack pointer is 8 modulo 16 at the entry of a procedure ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); - MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); - MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); + MOV(PTRBITS, R(SETUP), R(ABI_PARAM1)); + MOV(PTRBITS, R(STATE), R(ABI_PARAM2)); // Zero address/loop registers XOR(64, R(ADDROFFS_REG_0), 
R(ADDROFFS_REG_0)); @@ -845,7 +845,7 @@ void JitShader::Compile() { MOVAPS(NEGBIT, MatR(RAX)); // Jump to start of the shader program - JMPptr(R(ABI_PARAM2)); + JMPptr(R(ABI_PARAM3)); // Compile entire program Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size())); diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index 30aa7ff30..5468459d4 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -36,8 +36,8 @@ class JitShader : public Gen::XCodeBlock { public: JitShader(); - void Run(void* registers, unsigned offset) const { - program(registers, code_ptr[offset]); + void Run(const ShaderSetup& setup, UnitState<false>& state, unsigned offset) const { + program(&setup, &state, code_ptr[offset]); } void Compile(); @@ -117,7 +117,7 @@ private: /// Branches that need to be fixed up once the entire shader program is compiled std::vector<std::pair<Gen::FixupBranch, unsigned>> fixup_branches; - using CompiledShader = void(void* registers, const u8* start_addr); + using CompiledShader = void(const void* setup, void* state, const u8* start_addr); CompiledShader* program = nullptr; }; |