From 3e6663da433d98a0bf4db1256ea3ccdefd404a0c Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sun, 7 Jun 2015 22:24:03 -0300 Subject: Y2R: Rework conversion process, enabling support for all formats --- src/core/CMakeLists.txt | 2 + src/core/hle/service/y2r_u.cpp | 376 +++++++++++++++++++++++------------------ src/core/hle/service/y2r_u.h | 96 +++++++++++ src/core/hw/y2r.cpp | 369 ++++++++++++++++++++++++++++++++++++++++ src/core/hw/y2r.h | 15 ++ 5 files changed, 695 insertions(+), 163 deletions(-) create mode 100644 src/core/hw/y2r.cpp create mode 100644 src/core/hw/y2r.h diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index bbc285168..ea5533dcf 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -96,6 +96,7 @@ set(SRCS hw/gpu.cpp hw/hw.cpp hw/lcd.cpp + hw/y2r.cpp loader/3dsx.cpp loader/elf.cpp loader/loader.cpp @@ -209,6 +210,7 @@ set(HEADERS hw/gpu.h hw/hw.h hw/lcd.h + hw/y2r.h loader/3dsx.h loader/elf.h loader/loader.h diff --git a/src/core/hle/service/y2r_u.cpp b/src/core/hle/service/y2r_u.cpp index 73a0899dd..17cb4f0f0 100644 --- a/src/core/hle/service/y2r_u.cpp +++ b/src/core/hle/service/y2r_u.cpp @@ -9,8 +9,8 @@ #include "core/hle/hle.h" #include "core/hle/kernel/event.h" #include "core/hle/service/y2r_u.h" +#include "core/hw/y2r.h" #include "core/mem_map.h" -#include "core/memory.h" #include "video_core/utils.h" #include "video_core/video_core.h" @@ -20,47 +20,6 @@ namespace Y2R_U { -enum class InputFormat : u8 { - /// 8-bit input, with YUV components in separate planes and using 4:2:2 subsampling. - YUV422_Indiv8 = 0, - /// 8-bit input, with YUV components in separate planes and using 4:2:0 subsampling. 
- YUV420_Indiv8 = 1, - - YUV422_INDIV_16 = 2, - YUV420_INDIV_16 = 3, - YUV422_BATCH = 4, -}; - -enum class OutputFormat : u8 { - Rgb32 = 0, - Rgb24 = 1, - Rgb16_555 = 2, - Rgb16_565 = 3, -}; - -enum class Rotation : u8 { - None = 0, - Clockwise_90 = 1, - Clockwise_180 = 2, - Clockwise_270 = 3, -}; - -enum class BlockAlignment : u8 { - /// Image is output in linear format suitable for use as a framebuffer. - Linear = 0, - /// Image is output in tiled PICA format, suitable for use as a texture. - Block8x8 = 1, -}; - -enum class StandardCoefficient : u8 { - ITU_Rec601 = 0, - ITU_Rec709 = 1, - ITU_Rec601_Scaling = 2, - ITU_Rec709_Scaling = 3, -}; - -static Kernel::SharedPtr completion_event; - struct ConversionParameters { InputFormat input_format; OutputFormat output_format; @@ -74,28 +33,60 @@ struct ConversionParameters { }; static_assert(sizeof(ConversionParameters) == 12, "ConversionParameters struct has incorrect size"); -struct ConversionBuffer { - VAddr address; - u32 image_size; - u16 transfer_unit; - u16 stride; -}; +static Kernel::SharedPtr completion_event; +static ConversionConfiguration conversion; -struct ConversionData { - ConversionParameters params; - /// Input parameters for the Y (luma) plane - ConversionBuffer src_Y; - /// Output parameters for the conversion results - ConversionBuffer dst; +static const CoefficientSet standard_coefficients[4] = { + {{ 0x100, 0x166, 0xB6, 0x58, 0x1C5, -0x166F, 0x10EE, -0x1C5B }}, // ITU_Rec601 + {{ 0x100, 0x193, 0x77, 0x2F, 0x1DB, -0x1933, 0xA7C, -0x1D51 }}, // ITU_Rec709 + {{ 0x12A, 0x198, 0xD0, 0x64, 0x204, -0x1BDE, 0x10F2, -0x229B }}, // ITU_Rec601_Scaling + {{ 0x12A, 0x1CA, 0x88, 0x36, 0x21C, -0x1F04, 0x99C, -0x2421 }}, // ITU_Rec709_Scaling }; -static ConversionData conversion; +ResultCode ConversionConfiguration::SetInputLineWidth(u16 width) { + if (width == 0 || width > 1024 || width % 8 != 0) { + return ResultCode(ErrorDescription::OutOfRange, ErrorModule::CAM, + ErrorSummary::InvalidArgument, 
ErrorLevel::Usage); // 0xE0E053FD + } + + // Note: The hardware uses the register value 0 to represent a width of 1024, so for a width of + // 1024 the `camera` module would set the value 0 here, but we don't need to emulate this + // internal detail. + this->input_line_width = width; + return RESULT_SUCCESS; +} + +ResultCode ConversionConfiguration::SetInputLines(u16 lines) { + if (lines == 0 || lines > 1024) { + return ResultCode(ErrorDescription::OutOfRange, ErrorModule::CAM, + ErrorSummary::InvalidArgument, ErrorLevel::Usage); // 0xE0E053FD + } + + // Note: In what appears to be a bug, the `camera` module does not set the hardware register at + // all if `lines` is 1024, so the conversion uses the last value that was set. The intention + // was probably to set it to 0 like in SetInputLineWidth. + if (lines != 1024) { + this->input_lines = lines; + } + return RESULT_SUCCESS; +} + +ResultCode ConversionConfiguration::SetStandardCoefficient(StandardCoefficient standard_coefficient) { + size_t index = static_cast(standard_coefficient); + if (index >= 4) { + return ResultCode(ErrorDescription::InvalidEnumValue, ErrorModule::CAM, + ErrorSummary::InvalidArgument, ErrorLevel::Usage); // 0xE0E053ED + } + + std::memcpy(coefficients.data(), standard_coefficients[index].data(), sizeof(coefficients)); + return RESULT_SUCCESS; +} static void SetInputFormat(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - conversion.params.input_format = static_cast(cmd_buff[1]); - LOG_DEBUG(Service_Y2R, "called input_format=%u", conversion.params.input_format); + conversion.input_format = static_cast(cmd_buff[1]); + LOG_DEBUG(Service_Y2R, "called input_format=%hhu", conversion.input_format); cmd_buff[1] = RESULT_SUCCESS.raw; } @@ -103,8 +94,8 @@ static void SetInputFormat(Service::Interface* self) { static void SetOutputFormat(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - conversion.params.output_format = static_cast(cmd_buff[1]); - 
LOG_DEBUG(Service_Y2R, "called output_format=%u", conversion.params.output_format); + conversion.output_format = static_cast(cmd_buff[1]); + LOG_DEBUG(Service_Y2R, "called output_format=%hhu", conversion.output_format); cmd_buff[1] = RESULT_SUCCESS.raw; } @@ -112,8 +103,8 @@ static void SetOutputFormat(Service::Interface* self) { static void SetRotation(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - conversion.params.rotation = static_cast(cmd_buff[1]); - LOG_DEBUG(Service_Y2R, "called rotation=%u", conversion.params.rotation); + conversion.rotation = static_cast(cmd_buff[1]); + LOG_DEBUG(Service_Y2R, "called rotation=%hhu", conversion.rotation); cmd_buff[1] = RESULT_SUCCESS.raw; } @@ -121,10 +112,18 @@ static void SetRotation(Service::Interface* self) { static void SetBlockAlignment(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - conversion.params.block_alignment = static_cast(cmd_buff[1]); - LOG_DEBUG(Service_Y2R, "called alignment=%u", conversion.params.block_alignment); + conversion.block_alignment = static_cast(cmd_buff[1]); + LOG_DEBUG(Service_Y2R, "called alignment=%hhu", conversion.block_alignment); + + cmd_buff[1] = RESULT_SUCCESS.raw; +} + +static void SetTransferEndInterrupt(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + cmd_buff[0] = 0x000D0040; cmd_buff[1] = RESULT_SUCCESS.raw; + LOG_DEBUG(Service_Y2R, "(STUBBED) called"); } /** @@ -147,11 +146,56 @@ static void SetSendingY(Service::Interface* self) { conversion.src_Y.address = cmd_buff[1]; conversion.src_Y.image_size = cmd_buff[2]; conversion.src_Y.transfer_unit = cmd_buff[3]; - conversion.src_Y.stride = cmd_buff[4]; + conversion.src_Y.gap = cmd_buff[4]; u32 src_process_handle = cmd_buff[6]; LOG_DEBUG(Service_Y2R, "called image_size=0x%08X, transfer_unit=%hu, transfer_stride=%hu, " "src_process_handle=0x%08X", conversion.src_Y.image_size, - conversion.src_Y.transfer_unit, conversion.src_Y.stride, src_process_handle); + 
conversion.src_Y.transfer_unit, conversion.src_Y.gap, src_process_handle); + + cmd_buff[1] = RESULT_SUCCESS.raw; +} + +static void SetSendingU(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + conversion.src_U.address = cmd_buff[1]; + conversion.src_U.image_size = cmd_buff[2]; + conversion.src_U.transfer_unit = cmd_buff[3]; + conversion.src_U.gap = cmd_buff[4]; + u32 src_process_handle = cmd_buff[6]; + LOG_DEBUG(Service_Y2R, "called image_size=0x%08X, transfer_unit=%hu, transfer_stride=%hu, " + "src_process_handle=0x%08X", conversion.src_U.image_size, + conversion.src_U.transfer_unit, conversion.src_U.gap, src_process_handle); + + cmd_buff[1] = RESULT_SUCCESS.raw; +} + +static void SetSendingV(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + conversion.src_V.address = cmd_buff[1]; + conversion.src_V.image_size = cmd_buff[2]; + conversion.src_V.transfer_unit = cmd_buff[3]; + conversion.src_V.gap = cmd_buff[4]; + u32 src_process_handle = cmd_buff[6]; + LOG_DEBUG(Service_Y2R, "called image_size=0x%08X, transfer_unit=%hu, transfer_stride=%hu, " + "src_process_handle=0x%08X", conversion.src_V.image_size, + conversion.src_V.transfer_unit, conversion.src_V.gap, src_process_handle); + + cmd_buff[1] = RESULT_SUCCESS.raw; +} + +static void SetSendingYUYV(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + conversion.src_YUYV.address = cmd_buff[1]; + conversion.src_YUYV.image_size = cmd_buff[2]; + conversion.src_YUYV.transfer_unit = cmd_buff[3]; + conversion.src_YUYV.gap = cmd_buff[4]; + u32 src_process_handle = cmd_buff[6]; + LOG_DEBUG(Service_Y2R, "called image_size=0x%08X, transfer_unit=%hu, transfer_stride=%hu, " + "src_process_handle=0x%08X", conversion.src_YUYV.image_size, + conversion.src_YUYV.transfer_unit, conversion.src_YUYV.gap, src_process_handle); cmd_buff[1] = RESULT_SUCCESS.raw; } @@ -162,11 +206,11 @@ static void SetReceiving(Service::Interface* self) { conversion.dst.address = 
cmd_buff[1]; conversion.dst.image_size = cmd_buff[2]; conversion.dst.transfer_unit = cmd_buff[3]; - conversion.dst.stride = cmd_buff[4]; + conversion.dst.gap = cmd_buff[4]; u32 dst_process_handle = cmd_buff[6]; LOG_DEBUG(Service_Y2R, "called image_size=0x%08X, transfer_unit=%hu, transfer_stride=%hu, " "dst_process_handle=0x%08X", conversion.dst.image_size, - conversion.dst.transfer_unit, conversion.dst.stride, + conversion.dst.transfer_unit, conversion.dst.gap, dst_process_handle); cmd_buff[1] = RESULT_SUCCESS.raw; @@ -175,107 +219,54 @@ static void SetReceiving(Service::Interface* self) { static void SetInputLineWidth(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - conversion.params.input_line_width = cmd_buff[1]; - LOG_DEBUG(Service_Y2R, "input_line_width=%u", conversion.params.input_line_width); - - cmd_buff[1] = RESULT_SUCCESS.raw; + LOG_DEBUG(Service_Y2R, "called input_line_width=%u", cmd_buff[1]); + cmd_buff[1] = conversion.SetInputLineWidth(cmd_buff[1]).raw; } static void SetInputLines(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - conversion.params.input_lines = cmd_buff[1]; - LOG_DEBUG(Service_Y2R, "input_line_number=%u", conversion.params.input_lines); - - cmd_buff[1] = RESULT_SUCCESS.raw; + LOG_DEBUG(Service_Y2R, "called input_line_number=%u", cmd_buff[1]); + cmd_buff[1] = conversion.SetInputLines(cmd_buff[1]).raw; } -static void StartConversion(Service::Interface* self) { +static void SetCoefficient(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - const ConversionParameters& params = conversion.params; - - const u8* srcY_buffer = Memory::GetPointer(conversion.src_Y.address); - u8* dst_buffer = Memory::GetPointer(conversion.dst.address); - - // TODO: support color and other kinds of conversions - ASSERT(params.input_format == InputFormat::YUV422_Indiv8 - || params.input_format == InputFormat::YUV420_Indiv8); - ASSERT(params.output_format == OutputFormat::Rgb24); - 
ASSERT(params.rotation == Rotation::None); - const int bpp = 3; - - switch (params.block_alignment) { - case BlockAlignment::Linear: - { - const size_t input_lines = params.input_lines; - const size_t input_line_width = params.input_line_width; - const size_t srcY_stride = conversion.src_Y.stride; - const size_t dst_stride = conversion.dst.stride; - - size_t srcY_offset = 0; - size_t dst_offset = 0; - - for (size_t line = 0; line < input_lines; ++line) { - for (size_t i = 0; i < input_line_width; ++i) { - u8 Y = srcY_buffer[srcY_offset]; - dst_buffer[dst_offset + 0] = Y; - dst_buffer[dst_offset + 1] = Y; - dst_buffer[dst_offset + 2] = Y; - - srcY_offset += 1; - dst_offset += bpp; - } - srcY_offset += srcY_stride; - dst_offset += dst_stride; - } - break; - } - case BlockAlignment::Block8x8: - { - const size_t input_lines = params.input_lines; - const size_t input_line_width = params.input_line_width; - const size_t srcY_stride = conversion.src_Y.stride; - const size_t dst_transfer_unit = conversion.dst.transfer_unit; - const size_t dst_stride = conversion.dst.stride; - - size_t srcY_offset = 0; - size_t dst_tile_line_offs = 0; + const u16* coefficients = reinterpret_cast(&cmd_buff[1]); + std::memcpy(conversion.coefficients.data(), coefficients, sizeof(CoefficientSet)); + LOG_DEBUG(Service_Y2R, "called coefficients=[%hX, %hX, %hX, %hX, %hX, %hX, %hX, %hX]", + coefficients[0], coefficients[1], coefficients[2], coefficients[3], + coefficients[4], coefficients[5], coefficients[6], coefficients[7]); - const size_t tile_size = 8 * 8 * bpp; + cmd_buff[1] = RESULT_SUCCESS.raw; +} - for (size_t line = 0; line < input_lines;) { - size_t max_line = line + 8; +static void SetStandardCoefficient(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); - for (; line < max_line; ++line) { - for (size_t x = 0; x < input_line_width; ++x) { - size_t tile_x = x / 8; + LOG_DEBUG(Service_Y2R, "called standard_coefficient=%u", cmd_buff[1]); - size_t dst_tile_offs = 
dst_tile_line_offs + tile_x * tile_size; - size_t tile_i = VideoCore::MortonInterleave((u32)x, (u32)line); + cmd_buff[1] = conversion.SetStandardCoefficient((StandardCoefficient)cmd_buff[1]).raw; +} - size_t dst_offset = dst_tile_offs + tile_i * bpp; +static void SetAlpha(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); - u8 Y = srcY_buffer[srcY_offset]; - dst_buffer[dst_offset + 0] = Y; - dst_buffer[dst_offset + 1] = Y; - dst_buffer[dst_offset + 2] = Y; + conversion.alpha = cmd_buff[1]; + LOG_DEBUG(Service_Y2R, "called alpha=%hu", conversion.alpha); - srcY_offset += 1; - } + cmd_buff[1] = RESULT_SUCCESS.raw; +} - srcY_offset += srcY_stride; - } +static void StartConversion(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); - dst_tile_line_offs += dst_transfer_unit + dst_stride; - } - break; - } - } + HW::Y2R::PerformConversion(conversion); - // dst_image_size would seem to be perfect for this, but it doesn't include the stride :( - u32 total_output_size = params.input_lines * - (conversion.dst.transfer_unit + conversion.dst.stride); + // dst_image_size would seem to be perfect for this, but it doesn't include the gap :( + u32 total_output_size = conversion.input_lines * + (conversion.dst.transfer_unit + conversion.dst.gap); VideoCore::g_renderer->hw_rasterizer->NotifyFlush( Memory::VirtualToPhysicalAddress(conversion.dst.address), total_output_size); @@ -285,6 +276,14 @@ static void StartConversion(Service::Interface* self) { cmd_buff[1] = RESULT_SUCCESS.raw; } +static void StopConversion(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + cmd_buff[0] = 0x00270040; + cmd_buff[1] = RESULT_SUCCESS.raw; + LOG_DEBUG(Service_Y2R, "called"); +} + /** * Y2R_U::IsBusyConversion service function * Outputs: @@ -306,15 +305,31 @@ static void SetConversionParams(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); auto params = reinterpret_cast(&cmd_buff[1]); - conversion.params = 
*params; - - cmd_buff[0] = 0x00290000; // TODO verify - cmd_buff[1] = RESULT_SUCCESS.raw; LOG_DEBUG(Service_Y2R, "called input_format=%hhu output_format=%hhu rotation=%hhu block_alignment=%hhu " - "input_line_width=%hX input_lines=%hu standard_coefficient=%hhu reserved=%hhu alpha=%hX", + "input_line_width=%hu input_lines=%hu standard_coefficient=%hhu " + "reserved=%hhu alpha=%hX", params->input_format, params->output_format, params->rotation, params->block_alignment, - params->input_line_width, params->input_lines, params->standard_coefficient); + params->input_line_width, params->input_lines, params->standard_coefficient, + params->reserved, params->alpha); + + ResultCode result = RESULT_SUCCESS; + + conversion.input_format = params->input_format; + conversion.output_format = params->output_format; + conversion.rotation = params->rotation; + conversion.block_alignment = params->block_alignment; + result = conversion.SetInputLineWidth(params->input_line_width); + if (result.IsError()) goto cleanup; + result = conversion.SetInputLines(params->input_lines); + if (result.IsError()) goto cleanup; + result = conversion.SetStandardCoefficient(params->standard_coefficient); + if (result.IsError()) goto cleanup; + conversion.alpha = params->alpha; + +cleanup: + cmd_buff[0] = 0x00290040; // TODO verify + cmd_buff[1] = result.raw; } static void PingProcess(Service::Interface* self) { @@ -325,28 +340,63 @@ static void PingProcess(Service::Interface* self) { LOG_WARNING(Service_Y2R, "(STUBBED) called"); } +static void DriverInitialize(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + conversion.input_format = InputFormat::YUV422_Indiv8; + conversion.output_format = OutputFormat::RGBA8; + conversion.rotation = Rotation::None; + conversion.block_alignment = BlockAlignment::Linear; + conversion.coefficients.fill(0); + conversion.SetInputLineWidth(1024); + conversion.SetInputLines(1024); + conversion.alpha = 0; + + ConversionBuffer zero_buffer = {}; + 
conversion.src_Y = zero_buffer; + conversion.src_U = zero_buffer; + conversion.src_V = zero_buffer; + conversion.dst = zero_buffer; + + completion_event->Clear(); + + cmd_buff[0] = 0x002B0040; + cmd_buff[1] = RESULT_SUCCESS.raw; + LOG_DEBUG(Service_Y2R, "called"); +} + +static void DriverFinalize(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + cmd_buff[0] = 0x002C0040; + cmd_buff[1] = RESULT_SUCCESS.raw; + LOG_DEBUG(Service_Y2R, "called"); +} + const Interface::FunctionInfo FunctionTable[] = { {0x00010040, SetInputFormat, "SetInputFormat"}, {0x00030040, SetOutputFormat, "SetOutputFormat"}, {0x00050040, SetRotation, "SetRotation"}, {0x00070040, SetBlockAlignment, "SetBlockAlignment"}, - {0x000D0040, nullptr, "SetTransferEndInterrupt"}, + {0x000D0040, SetTransferEndInterrupt, "SetTransferEndInterrupt"}, {0x000F0000, GetTransferEndEvent, "GetTransferEndEvent"}, {0x00100102, SetSendingY, "SetSendingY"}, - {0x00110102, nullptr, "SetSendingU"}, - {0x00120102, nullptr, "SetSendingV"}, + {0x00110102, SetSendingU, "SetSendingU"}, + {0x00120102, SetSendingV, "SetSendingV"}, + {0x00130102, SetSendingYUYV, "SetSendingYUYV"}, {0x00180102, SetReceiving, "SetReceiving"}, {0x001A0040, SetInputLineWidth, "SetInputLineWidth"}, {0x001C0040, SetInputLines, "SetInputLines"}, - {0x00200040, nullptr, "SetStandardCoefficient"}, - {0x00220040, nullptr, "SetAlpha"}, + {0x001E0100, SetCoefficient, "SetCoefficient"}, + {0x00200040, SetStandardCoefficient, "SetStandardCoefficient"}, + {0x00220040, SetAlpha, "SetAlpha"}, {0x00260000, StartConversion, "StartConversion"}, - {0x00270000, nullptr, "StopConversion"}, + {0x00270000, StopConversion, "StopConversion"}, {0x00280000, IsBusyConversion, "IsBusyConversion"}, {0x002901C0, SetConversionParams, "SetConversionParams"}, {0x002A0000, PingProcess, "PingProcess"}, - {0x002B0000, nullptr, "DriverInitialize"}, - {0x002C0000, nullptr, "DriverFinalize"}, + {0x002B0000, DriverInitialize, "DriverInitialize"}, + {0x002C0000, 
DriverFinalize, "DriverFinalize"}, }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/core/hle/service/y2r_u.h b/src/core/hle/service/y2r_u.h index 171aecfd1..7df47fcb9 100644 --- a/src/core/hle/service/y2r_u.h +++ b/src/core/hle/service/y2r_u.h @@ -4,6 +4,10 @@ #pragma once +#include + +#include "common/common_types.h" + #include "core/hle/service/service.h" //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -11,6 +15,98 @@ namespace Y2R_U { +enum class InputFormat : u8 { + /// 8-bit input, with YUV components in separate planes and 4:2:2 subsampling. + YUV422_Indiv8 = 0, + /// 8-bit input, with YUV components in separate planes and 4:2:0 subsampling. + YUV420_Indiv8 = 1, + + /// 16-bit input (only LSB used), with YUV components in separate planes and 4:2:2 subsampling. + YUV422_Indiv16 = 2, + /// 16-bit input (only LSB used), with YUV components in separate planes and 4:2:0 subsampling. + YUV420_Indiv16 = 3, + + /// 8-bit input, with a single interleaved stream in YUYV format and 4:2:2 subsampling. + YUYV422_Interleaved = 4, +}; + +enum class OutputFormat : u8 { + RGBA8 = 0, + RGB8 = 1, + RGB5A1 = 2, + RGB565 = 3, +}; + +enum class Rotation : u8 { + None = 0, + Clockwise_90 = 1, + Clockwise_180 = 2, + Clockwise_270 = 3, +}; + +enum class BlockAlignment : u8 { + /// Image is output in linear format suitable for use as a framebuffer. + Linear = 0, + /// Image is output in tiled PICA format, suitable for use as a texture. + Block8x8 = 1, +}; + +enum class StandardCoefficient : u8 { + /// ITU Rec. BT.601 primaries, with PC ranges. + ITU_Rec601 = 0, + /// ITU Rec. BT.709 primaries, with PC ranges. + ITU_Rec709 = 1, + /// ITU Rec. BT.601 primaries, with TV ranges. + ITU_Rec601_Scaling = 2, + /// ITU Rec. BT.709 primaries, with TV ranges. + ITU_Rec709_Scaling = 3, +}; + +/** + * A set of coefficients configuring the YUV to RGB conversion.
Coefficients 0-4 are unsigned 2.8 + * fixed point numbers representing entries on the conversion matrix, while coefficients 5-7 are + * signed 11.5 fixed point numbers added as offsets to the RGB result. + * + * The overall conversion process formula is: + * ``` + * R = trunc((c_0 * Y + c_1 * V) + c_5 + 0.75) + * G = trunc((c_0 * Y - c_3 * U - c_2 * V) + c_6 + 0.75) + * B = trunc((c_0 * Y + c_4 * U ) + c_7 + 0.75) + * ``` + */ +using CoefficientSet = std::array; + +struct ConversionBuffer { + /// Current reading/writing address of this buffer. + VAddr address; + /// Remaining amount of bytes to be DMAed, does not include the inter-transfer gap. + u32 image_size; + /// Size of a single DMA transfer. + u16 transfer_unit; + /// Amount of bytes to be skipped between copying each `transfer_unit` bytes. + u16 gap; +}; + +struct ConversionConfiguration { + InputFormat input_format; + OutputFormat output_format; + Rotation rotation; + BlockAlignment block_alignment; + u16 input_line_width; + u16 input_lines; + CoefficientSet coefficients; + u16 alpha; + + /// Input parameters for the Y, U and V planes and the interleaved YUYV stream + ConversionBuffer src_Y, src_U, src_V, src_YUYV; + /// Output parameters for the conversion results + ConversionBuffer dst; + + ResultCode SetInputLineWidth(u16 width); + ResultCode SetInputLines(u16 lines); + ResultCode SetStandardCoefficient(StandardCoefficient standard_coefficient); +}; + class Interface : public Service::Interface { public: Interface(); diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp new file mode 100644 index 000000000..5b7fb39e1 --- /dev/null +++ b/src/core/hw/y2r.cpp @@ -0,0 +1,369 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
+ +#include +#include + +#include "common/assert.h" +#include "common/color.h" +#include "common/common_types.h" +#include "common/math_util.h" +#include "common/vector_math.h" + +#include "core/hle/service/y2r_u.h" +#include "core/memory.h" + +namespace HW { +namespace Y2R { + +using namespace Y2R_U; + +static const size_t MAX_TILES = 1024 / 8; +static const size_t TILE_SIZE = 8 * 8; +using ImageTile = std::array; + +/// Converts an image strip from the source YUV format into individual 8x8 RGB32 tiles. +static void ConvertYUVToRGB(InputFormat input_format, + const u8* input_Y, const u8* input_U, const u8* input_V, ImageTile output[], + unsigned int width, unsigned int height, const CoefficientSet& coefficients) { + + for (unsigned int y = 0; y < height; ++y) { + for (unsigned int x = 0; x < width; ++x) { + s32 Y, U, V; + switch (input_format) { + case InputFormat::YUV422_Indiv8: + case InputFormat::YUV422_Indiv16: + Y = input_Y[y * width + x]; + U = input_U[(y * width + x) / 2]; + V = input_V[(y * width + x) / 2]; + break; + case InputFormat::YUV420_Indiv8: + case InputFormat::YUV420_Indiv16: + Y = input_Y[y * width + x]; + U = input_U[((y / 2) * width + x) / 2]; + V = input_V[((y / 2) * width + x) / 2]; + break; + case InputFormat::YUYV422_Interleaved: + Y = input_Y[(y * width + x) * 2]; + U = input_Y[(y * width + (x / 2) * 2) * 2 + 1]; + V = input_Y[(y * width + (x / 2) * 2) * 2 + 3]; + break; + } + + // This conversion process is bit-exact with hardware, as far as could be tested.
+ auto& c = coefficients; + s32 cY = c[0]*Y; + + s32 r = cY + c[1]*V; + s32 g = cY - c[3]*U - c[2]*V; + s32 b = cY + c[4]*U; + + const s32 rounding_offset = 0x18; + r = (r >> 3) + c[5] + rounding_offset; + g = (g >> 3) + c[6] + rounding_offset; + b = (b >> 3) + c[7] + rounding_offset; + + unsigned int tile = x / 8; + unsigned int tile_x = x % 8; + u32* out = &output[tile][y * 8 + tile_x]; + + using MathUtil::Clamp; + *out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) | + ((u32)Clamp(g >> 5, 0, 0xFF) << 16) | + ((u32)Clamp(b >> 5, 0, 0xFF) << 8); + } + } +} + +/// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit formats to 8-bit. +template +static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) { + const u8* input = Memory::GetPointer(buf.address); + + size_t output_unit = buf.transfer_unit / N; + ASSERT(amount_of_data % output_unit == 0); + + while (amount_of_data > 0) { + for (size_t i = 0; i < output_unit; ++i) { + output[i] = input[i * N]; + } + + output += output_unit; + input += buf.transfer_unit + buf.gap; + + buf.address += buf.transfer_unit + buf.gap; + buf.image_size -= buf.transfer_unit; + amount_of_data -= output_unit; + } +} + +/// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA transfer. 
+static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data, + OutputFormat output_format, u8 alpha) { + + u8* output = Memory::GetPointer(buf.address); + + while (amount_of_data > 0) { + u8* unit_end = output + buf.transfer_unit; + while (output < unit_end) { + u32 color = *input++; + Math::Vec4 col_vec{ + (color >> 24) & 0xFF, (color >> 16) & 0xFF, (color >> 8) & 0xFF, alpha, + }; + + switch (output_format) { + case OutputFormat::RGBA8: + Color::EncodeRGBA8(col_vec, output); + output += 4; + break; + case OutputFormat::RGB8: + Color::EncodeRGB8(col_vec, output); + output += 3; + break; + case OutputFormat::RGB5A1: + Color::EncodeRGB5A1(col_vec, output); + output += 2; + break; + case OutputFormat::RGB565: + Color::EncodeRGB565(col_vec, output); + output += 2; + break; + } + + amount_of_data -= 1; + } + + output += buf.gap; + buf.address += buf.transfer_unit + buf.gap; + buf.image_size -= buf.transfer_unit; + } +} + +static const u8 linear_lut[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, +}; + +static const u8 morton_lut[64] = { + 0, 1, 4, 5, 16, 17, 20, 21, + 2, 3, 6, 7, 18, 19, 22, 23, + 8, 9, 12, 13, 24, 25, 28, 29, + 10, 11, 14, 15, 26, 27, 30, 31, + 32, 33, 36, 37, 48, 49, 52, 53, + 34, 35, 38, 39, 50, 51, 54, 55, + 40, 41, 44, 45, 56, 57, 60, 61, + 42, 43, 46, 47, 58, 59, 62, 63, +}; + +static void RotateTile0(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { + for (int i = 0; i < height * 8; ++i) { + output[out_map[i]] = input[i]; + } +} + +static void RotateTile90(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { + int out_i = 0; + for (int x = 0; x < 8; ++x) { + for (int y = height - 1; y >= 0; --y) { + output[out_map[out_i++]] = input[y * 8 + x]; + } + } 
+} + +static void RotateTile180(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { + int out_i = 0; + for (int i = height * 8 - 1; i >= 0; --i) { + output[out_map[out_i++]] = input[i]; + } +} + +static void RotateTile270(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { + int out_i = 0; + for (int x = 8-1; x >= 0; --x) { + for (int y = 0; y < height; ++y) { + output[out_map[out_i++]] = input[y * 8 + x]; + } + } +} + +static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < 8; ++x) { + output[y * line_stride + x] = tile[y * 8 + x]; + } + } +} + +/** + * Performs a Y2R colorspace conversion. + * + * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most + * commonly used for video playback or to display camera input to the screen. + * + * The conversion process is quite configurable, and can be divided into distinct steps. From + * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data + * internally and converts it in one go before writing to the output and loading the next strip. + * + * The steps taken to convert one strip of image data are: + * + * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is + * presumably stored in one or more internal buffers. This process can be done in several separate + * transfers, as long as they don't exceed the size of the internal image buffer. This allows + * flexibility in input strides. + * - The input data is decoded into a YUV tuple. Several formats are supported, see the `InputFormat` + * enum. + * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured + * using a set of coefficients to support different colorspace standards. See `CoefficientSet`. + * - The strip can be optionally rotated 90, 180 or 270 degrees.
Since each strip is processed + * independently, this notably rotates each *strip*, not the entire image. This means that for 90 + * or 270 degree rotations, the output will be in terms of several 8 x height images, and for any + * non-zero rotation the strips will have to be re-arranged so that the parts of the image will + * not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90 + * or 270 degree rotations in images with non-even height don't seem to work properly. + * - The data is converted to the output RGB format. See the `OutputFormat` enum. + * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by + * the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the + * image must have a height divisible by 8. The image width must always be divisible by 8. + * - The final data is then CDMAed out to main memory and the next image strip is processed. This + * offers the same flexibility as the input stage. + * + * In this implementation, to avoid the combinatorial explosion of parameter combinations, common + * intermediate formats are used and where possible tables or parameters are used instead of + * diverging code paths to keep the amount of branches in check. Some steps are also merged to + * increase efficiency. + * + * Output for all valid settings combinations matches hardware, however output in some edge-cases + * differs: + * + * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last + * strip, especially when combined with rotation. + * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation + * produces misaligned output on the last strip. This implementation produces output with the + * correct "expected" alignment.
+ * + * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases, + * so they are believed to be invalid configurations anyway. + */ +void PerformConversion(ConversionConfiguration& cvt) { + ASSERT(cvt.input_line_width % 8 == 0); + ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0); + // Tiles per row + size_t num_tiles = cvt.input_line_width / 8; + ASSERT(num_tiles < MAX_TILES); + + // Buffer used as a CDMA source/target. + std::unique_ptr data_buffer(new u8[cvt.input_line_width * 8 * 4]); + // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32. + std::unique_ptr tiles(new ImageTile[num_tiles]); + ImageTile tmp_tile; + + // LUT used to remap writes to a tile. Used to allow linear or swizzled output without + // requiring two different code paths. + const u8* tile_remap; + switch (cvt.block_alignment) { + case BlockAlignment::Linear: + tile_remap = linear_lut; break; + case BlockAlignment::Block8x8: + tile_remap = morton_lut; break; + } + + for (unsigned int y = 0; y < cvt.input_lines; y += 8) { + unsigned int row_height = std::min(cvt.input_lines - y, 8u); + + // Total size in pixels of incoming data required for this strip. 
+ const size_t row_data_size = row_height * cvt.input_line_width; + + u8* input_Y = data_buffer.get(); + u8* input_U = input_Y + 8 * cvt.input_line_width; + u8* input_V = input_U + 8 * cvt.input_line_width / 2; + + switch (cvt.input_format) { + case InputFormat::YUV422_Indiv8: + ReceiveData<1>(input_Y, cvt.src_Y, row_data_size); + ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2); + ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2); + break; + case InputFormat::YUV420_Indiv8: + ReceiveData<1>(input_Y, cvt.src_Y, row_data_size); + ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4); + ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4); + break; + case InputFormat::YUV422_Indiv16: + ReceiveData<2>(input_Y, cvt.src_Y, row_data_size); + ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2); + ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2); + break; + case InputFormat::YUV420_Indiv16: + ReceiveData<2>(input_Y, cvt.src_Y, row_data_size); + ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4); + ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4); + break; + case InputFormat::YUYV422_Interleaved: + input_U = nullptr; + input_V = nullptr; + ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2); + break; + } + + // Note(yuriks): If additional optimization is required, input_format can be moved to a + // template parameter, so that its dispatch can be moved to outside the inner loop. 
+ ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(), + cvt.input_line_width, row_height, cvt.coefficients); + + u32* output_buffer = reinterpret_cast(data_buffer.get()); + + for (int i = 0; i < num_tiles; ++i) { + int image_strip_width, output_stride; + + switch (cvt.rotation) { + case Rotation::None: + RotateTile0(tiles[i], tmp_tile, row_height, tile_remap); + image_strip_width = cvt.input_line_width; + output_stride = 8; + break; + case Rotation::Clockwise_90: + RotateTile90(tiles[i], tmp_tile, row_height, tile_remap); + image_strip_width = 8; + output_stride = 8 * row_height; + break; + case Rotation::Clockwise_180: + // For 180 and 270 degree rotations we also invert the order of tiles in the strip, + // since the rotates are done individually on each tile. + RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap); + image_strip_width = cvt.input_line_width; + output_stride = 8; + break; + case Rotation::Clockwise_270: + RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap); + image_strip_width = 8; + output_stride = 8 * row_height; + break; + } + + switch (cvt.block_alignment) { + case BlockAlignment::Linear: + WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width); + output_buffer += output_stride; + break; + case BlockAlignment::Block8x8: + WriteTileToOutput(output_buffer, tmp_tile, 8, 8); + output_buffer += TILE_SIZE; + break; + } + } + + // Note(yuriks): If additional optimization is required, output_format can be moved to a + // template parameter, so that its dispatch can be moved to outside the inner loop. 
+ SendData(reinterpret_cast(data_buffer.get()), cvt.dst, (int)row_data_size, cvt.output_format, (u8)cvt.alpha); + } +} + +} +} diff --git a/src/core/hw/y2r.h b/src/core/hw/y2r.h new file mode 100644 index 000000000..729e1eee3 --- /dev/null +++ b/src/core/hw/y2r.h @@ -0,0 +1,15 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +namespace Y2R_U { + struct ConversionConfiguration; +} + +namespace HW { +namespace Y2R { + +void PerformConversion(Y2R_U::ConversionConfiguration& cvt); + +} +} -- cgit v1.2.3