diff options
Diffstat (limited to '')
66 files changed, 2436 insertions, 1339 deletions
diff --git a/src/citra/citra.cpp b/src/citra/citra.cpp index d6fcb66a5..46f4a07c9 100644 --- a/src/citra/citra.cpp +++ b/src/citra/citra.cpp @@ -6,6 +6,9 @@ #include <thread> #include <iostream> +// This needs to be included before getopt.h because the latter #defines symbols used by it +#include "common/microprofile.h" + #ifdef _MSC_VER #include <getopt.h> #else @@ -59,6 +62,8 @@ int main(int argc, char **argv) { Log::Filter log_filter(Log::Level::Debug); Log::SetFilter(&log_filter); + MicroProfileOnThreadCreate("EmuThread"); + if (boot_filename.empty()) { LOG_CRITICAL(Frontend, "Failed to load ROM: No ROM specified"); return -1; @@ -89,5 +94,7 @@ int main(int argc, char **argv) { delete emu_window; + MicroProfileShutdown(); + return 0; } diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt index 0c0515054..a82e8a85b 100644 --- a/src/citra_qt/CMakeLists.txt +++ b/src/citra_qt/CMakeLists.txt @@ -18,6 +18,7 @@ set(SRCS debugger/ramview.cpp debugger/registers.cpp util/spinbox.cpp + util/util.cpp bootmanager.cpp hotkeys.cpp main.cpp @@ -42,6 +43,7 @@ set(HEADERS debugger/ramview.h debugger/registers.h util/spinbox.h + util/util.h bootmanager.h hotkeys.h main.h diff --git a/src/citra_qt/bootmanager.cpp b/src/citra_qt/bootmanager.cpp index a96fbea5f..f8aacb527 100644 --- a/src/citra_qt/bootmanager.cpp +++ b/src/citra_qt/bootmanager.cpp @@ -14,6 +14,7 @@ #include "common/string_util.h" #include "common/scm_rev.h" #include "common/key_map.h" +#include "common/microprofile.h" #include "core/core.h" #include "core/settings.h" @@ -37,6 +38,8 @@ EmuThread::EmuThread(GRenderWindow* render_window) : void EmuThread::run() { render_window->MakeCurrent(); + MicroProfileOnThreadCreate("EmuThread"); + stop_run = false; // holds whether the cpu was running during the last iteration, @@ -69,6 +72,8 @@ void EmuThread::run() { } } + MicroProfileOnThreadExit(); + render_window->moveContext(); } diff --git a/src/citra_qt/debugger/graphics.cpp b/src/citra_qt/debugger/graphics.cpp index 7424671f1..7d15028f0 100644 --- a/src/citra_qt/debugger/graphics.cpp +++ b/src/citra_qt/debugger/graphics.cpp @@ -7,6 +7,8 @@ #include <QVBoxLayout> #include <QDebug> +#include "citra_qt/util/util.h" + extern GraphicsDebugger g_debugger; GPUCommandStreamItemModel::GPUCommandStreamItemModel(QObject* parent) : QAbstractListModel(parent), command_count(0) @@ -79,7 +81,7 @@ GPUCommandStreamWidget::GPUCommandStreamWidget(QWidget* parent) : QDockWidget(tr QListView* command_list = new QListView; command_list->setModel(command_model); - command_list->setFont(QFont("monospace")); + command_list->setFont(GetMonospaceFont()); setWidget(command_list); } diff --git a/src/citra_qt/debugger/graphics_cmdlists.cpp b/src/citra_qt/debugger/graphics_cmdlists.cpp index 35a3140b2..025434687 100644 --- a/src/citra_qt/debugger/graphics_cmdlists.cpp +++ b/src/citra_qt/debugger/graphics_cmdlists.cpp @@ -14,6 +14,8 @@ #include <QSpinBox> #include <QComboBox> +#include "citra_qt/util/util.h" + #include "common/vector_math.h" #include "video_core/debug_utils/debug_utils.h" @@ -303,9 +305,7 @@ GPUCommandListWidget::GPUCommandListWidget(QWidget* parent) : QDockWidget(tr("Pi list_widget = new QTreeView; list_widget->setModel(model); - QFont font("monospace"); - font.setStyleHint(QFont::Monospace); // Automatic fallback to a monospace font on on platforms without a font called "monospace" - list_widget->setFont(font); + list_widget->setFont(GetMonospaceFont()); list_widget->setRootIsDecorated(false); list_widget->setUniformRowHeights(true); diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp index 0c17edee0..1d9a00e89 100644 --- a/src/citra_qt/debugger/graphics_vertex_shader.cpp +++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp @@ -15,6 +15,8 @@ #include <QSpinBox> #include <QTreeView> +#include "citra_qt/util/util.h" + #include "video_core/shader/shader.h" #include "graphics_vertex_shader.h" @@ -245,7 +247,7 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con } case Qt::FontRole: - return QFont("monospace"); + return GetMonospaceFont(); case Qt::BackgroundRole: // Highlight instructions which have no debug data associated to them diff --git a/src/citra_qt/debugger/profiler.cpp b/src/citra_qt/debugger/profiler.cpp index 89b28c2f4..5261d4836 100644 --- a/src/citra_qt/debugger/profiler.cpp +++ b/src/citra_qt/debugger/profiler.cpp @@ -2,9 +2,21 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <QMouseEvent> +#include <QPainter> +#include <QString> + #include "profiler.h" +#include "citra_qt/util/util.h" + #include "common/profiler_reporting.h" +#include "common/microprofile.h" + +// Include the implementation of the UI in this file. This isn't in microprofile.cpp because the +// non-Qt frontends don't need it (and don't implement the UI drawing hooks either). +#define MICROPROFILEUI_IMPL 1 +#include "common/microprofileui.h" using namespace Common::Profiling; @@ -136,3 +148,193 @@ void ProfilerWidget::setProfilingInfoUpdateEnabled(bool enable) update_timer.stop(); } } + +class MicroProfileWidget : public QWidget { +public: + MicroProfileWidget(QWidget* parent = 0); + +protected: + void paintEvent(QPaintEvent* ev) override; + void showEvent(QShowEvent* ev) override; + void hideEvent(QHideEvent* ev) override; + + void mouseMoveEvent(QMouseEvent* ev) override; + void mousePressEvent(QMouseEvent* ev) override; + void mouseReleaseEvent(QMouseEvent* ev) override; + void wheelEvent(QWheelEvent* ev) override; + + void keyPressEvent(QKeyEvent* ev) override; + void keyReleaseEvent(QKeyEvent* ev) override; + +private: + /// This timer is used to redraw the widget's contents continuously. To save resources, it only + /// runs while the widget is visible. + QTimer update_timer; +}; + +MicroProfileDialog::MicroProfileDialog(QWidget* parent) + : QWidget(parent, Qt::Dialog) +{ + setObjectName("MicroProfile"); + setWindowTitle(tr("MicroProfile")); + resize(1000, 600); + // Remove the "?" button from the titlebar and enable the maximize button + setWindowFlags(windowFlags() & ~Qt::WindowContextHelpButtonHint | Qt::WindowMaximizeButtonHint); + + MicroProfileWidget* widget = new MicroProfileWidget(this); + + QLayout* layout = new QVBoxLayout(this); + layout->setContentsMargins(0, 0, 0, 0); + layout->addWidget(widget); + setLayout(layout); + + // Configure focus so that widget is focusable and the dialog automatically forwards focus to it. + setFocusProxy(widget); + widget->setFocusPolicy(Qt::StrongFocus); + widget->setFocus(); +} + +QAction* MicroProfileDialog::toggleViewAction() { + if (toggle_view_action == nullptr) { + toggle_view_action = new QAction(windowTitle(), this); + toggle_view_action->setCheckable(true); + toggle_view_action->setChecked(isVisible()); + connect(toggle_view_action, SIGNAL(toggled(bool)), SLOT(setVisible(bool))); + } + + return toggle_view_action; +} + +void MicroProfileDialog::showEvent(QShowEvent* ev) { + if (toggle_view_action) { + toggle_view_action->setChecked(isVisible()); + } + QWidget::showEvent(ev); +} + +void MicroProfileDialog::hideEvent(QHideEvent* ev) { + if (toggle_view_action) { + toggle_view_action->setChecked(isVisible()); + } + QWidget::hideEvent(ev); +} + +/// There's no way to pass a user pointer to MicroProfile, so this variable is used to make the +/// QPainter available inside the drawing callbacks. +static QPainter* mp_painter = nullptr; + +MicroProfileWidget::MicroProfileWidget(QWidget* parent) : QWidget(parent) { + // Send mouse motion events even when not dragging. + setMouseTracking(true); + + MicroProfileSetDisplayMode(1); // Timers screen + MicroProfileInitUI(); + + connect(&update_timer, SIGNAL(timeout()), SLOT(update())); +} + +void MicroProfileWidget::paintEvent(QPaintEvent* ev) { + QPainter painter(this); + + painter.setBackground(Qt::black); + painter.eraseRect(rect()); + + QFont font = GetMonospaceFont(); + font.setPixelSize(MICROPROFILE_TEXT_HEIGHT); + painter.setFont(font); + + mp_painter = &painter; + MicroProfileDraw(rect().width(), rect().height()); + mp_painter = nullptr; +} + +void MicroProfileWidget::showEvent(QShowEvent* ev) { + update_timer.start(15); // ~60 Hz + QWidget::showEvent(ev); +} + +void MicroProfileWidget::hideEvent(QHideEvent* ev) { + update_timer.stop(); + QWidget::hideEvent(ev); +} + +void MicroProfileWidget::mouseMoveEvent(QMouseEvent* ev) { + MicroProfileMousePosition(ev->x(), ev->y(), 0); + ev->accept(); +} + +void MicroProfileWidget::mousePressEvent(QMouseEvent* ev) { + MicroProfileMousePosition(ev->x(), ev->y(), 0); + MicroProfileMouseButton(ev->buttons() & Qt::LeftButton, ev->buttons() & Qt::RightButton); + ev->accept(); +} + +void MicroProfileWidget::mouseReleaseEvent(QMouseEvent* ev) { + MicroProfileMousePosition(ev->x(), ev->y(), 0); + MicroProfileMouseButton(ev->buttons() & Qt::LeftButton, ev->buttons() & Qt::RightButton); + ev->accept(); +} + +void MicroProfileWidget::wheelEvent(QWheelEvent* ev) { + MicroProfileMousePosition(ev->x(), ev->y(), ev->delta() / 120); + ev->accept(); +} + +void MicroProfileWidget::keyPressEvent(QKeyEvent* ev) { + if (ev->key() == Qt::Key_Control) { + // Inform MicroProfile that the user is holding Ctrl. + MicroProfileModKey(1); + } + QWidget::keyPressEvent(ev); +} + +void MicroProfileWidget::keyReleaseEvent(QKeyEvent* ev) { + if (ev->key() == Qt::Key_Control) { + MicroProfileModKey(0); + } + QWidget::keyReleaseEvent(ev); +} + +// These functions are called by MicroProfileDraw to draw the interface elements on the screen. + +void MicroProfileDrawText(int x, int y, u32 hex_color, const char* text, u32 text_length) { + // hex_color does not include an alpha, so it must be assumed to be 255 + mp_painter->setPen(QColor::fromRgb(hex_color)); + + // It's impossible to draw a string using a monospaced font with a fixed width per cell in a + // way that's reliable across different platforms and fonts as far as I (yuriks) can tell, so + // draw each character individually in order to precisely control the text advance. + for (u32 i = 0; i < text_length; ++i) { + // Position the text baseline 1 pixel above the bottom of the text cell, this gives nice + // vertical alignment of text for a wide range of tested fonts. + mp_painter->drawText(x, y + MICROPROFILE_TEXT_HEIGHT - 2, QChar(text[i])); + x += MICROPROFILE_TEXT_WIDTH + 1; + } +} + +void MicroProfileDrawBox(int left, int top, int right, int bottom, u32 hex_color, MicroProfileBoxType type) { + QColor color = QColor::fromRgba(hex_color); + QBrush brush = color; + if (type == MicroProfileBoxTypeBar) { + QLinearGradient gradient(left, top, left, bottom); + gradient.setColorAt(0.f, color.lighter(125)); + gradient.setColorAt(1.f, color.darker(125)); + brush = gradient; + } + mp_painter->fillRect(left, top, right - left, bottom - top, brush); +} + +void MicroProfileDrawLine2D(u32 vertices_length, float* vertices, u32 hex_color) { + // Temporary vector used to convert between the float array and QPointF. Marked static to reuse + // the allocation across calls. + static std::vector<QPointF> point_buf; + + for (u32 i = 0; i < vertices_length; ++i) { + point_buf.emplace_back(vertices[i*2 + 0], vertices[i*2 + 1]); + } + + // hex_color does not include an alpha, so it must be assumed to be 255 + mp_painter->setPen(QColor::fromRgb(hex_color)); + mp_painter->drawPolyline(point_buf.data(), vertices_length); + point_buf.clear(); +} diff --git a/src/citra_qt/debugger/profiler.h b/src/citra_qt/debugger/profiler.h index fabf279b8..2199eaef1 100644 --- a/src/citra_qt/debugger/profiler.h +++ b/src/citra_qt/debugger/profiler.h @@ -48,3 +48,20 @@ private: QTimer update_timer; }; + +class MicroProfileDialog : public QWidget { + Q_OBJECT + +public: + MicroProfileDialog(QWidget* parent = 0); + + /// Returns a QAction that can be used to toggle visibility of this dialog. + QAction* toggleViewAction(); + +protected: + void showEvent(QShowEvent* ev) override; + void hideEvent(QHideEvent* ev) override; + +private: + QAction* toggle_view_action = nullptr; +}; diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp index a1a4865bd..7fb1b0dcb 100644 --- a/src/citra_qt/main.cpp +++ b/src/citra_qt/main.cpp @@ -17,6 +17,7 @@ #include "common/logging/backend.h" #include "common/logging/filter.h" #include "common/make_unique.h" +#include "common/microprofile.h" #include "common/platform.h" #include "common/scm_rev.h" #include "common/scope_exit.h" @@ -64,6 +65,9 @@ GMainWindow::GMainWindow() : emu_thread(nullptr) addDockWidget(Qt::BottomDockWidgetArea, profilerWidget); profilerWidget->hide(); + microProfileDialog = new MicroProfileDialog(this); + microProfileDialog->hide(); + disasmWidget = new DisassemblerWidget(this, emu_thread.get()); addDockWidget(Qt::BottomDockWidgetArea, disasmWidget); disasmWidget->hide(); @@ -102,6 +106,7 @@ GMainWindow::GMainWindow() : emu_thread(nullptr) QMenu* debug_menu = ui.menu_View->addMenu(tr("Debugging")); debug_menu->addAction(profilerWidget->toggleViewAction()); + debug_menu->addAction(microProfileDialog->toggleViewAction()); debug_menu->addAction(disasmWidget->toggleViewAction()); debug_menu->addAction(registersWidget->toggleViewAction()); debug_menu->addAction(callstackWidget->toggleViewAction()); @@ -128,6 +133,8 @@ GMainWindow::GMainWindow() : emu_thread(nullptr) restoreGeometry(settings.value("geometry").toByteArray()); restoreState(settings.value("state").toByteArray()); render_window->restoreGeometry(settings.value("geometryRenderWindow").toByteArray()); + microProfileDialog->restoreGeometry(settings.value("microProfileDialogGeometry").toByteArray()); + microProfileDialog->setVisible(settings.value("microProfileDialogVisible").toBool()); ui.action_Use_Hardware_Renderer->setChecked(Settings::values.use_hw_renderer); SetHardwareRendererEnabled(ui.action_Use_Hardware_Renderer->isChecked()); @@ -287,6 +294,17 @@ void GMainWindow::ShutdownGame() { render_window->hide(); } +void GMainWindow::StoreRecentFile(const QString& filename) +{ + QSettings settings; + QStringList recent_files = settings.value("recentFiles").toStringList(); + recent_files.prepend(filename); + recent_files.removeDuplicates(); + settings.setValue("recentFiles", recent_files); + + UpdateRecentFiles(); +} + void GMainWindow::UpdateRecentFiles() { QSettings settings; QStringList recent_files = settings.value("recentFiles").toStringList(); @@ -297,6 +315,7 @@ void GMainWindow::UpdateRecentFiles() { QString text = QString("&%1. %2").arg(i + 1).arg(QFileInfo(recent_files[i]).fileName()); actions_recent_files[i]->setText(text); actions_recent_files[i]->setData(recent_files[i]); + actions_recent_files[i]->setToolTip(recent_files[i]); actions_recent_files[i]->setVisible(true); } @@ -319,11 +338,7 @@ void GMainWindow::OnMenuLoadFile() { QString filename = QFileDialog::getOpenFileName(this, tr("Load File"), rom_path, tr("3DS executable (*.3ds *.3dsx *.elf *.axf *.cci *.cxi)")); if (filename.size()) { settings.setValue("romsPath", QFileInfo(filename).path()); - // Update recent files list - QStringList recent_files = settings.value("recentFiles").toStringList(); - recent_files.prepend(filename); - settings.setValue("recentFiles", recent_files); - UpdateRecentFiles(); // Update UI + StoreRecentFile(filename); BootGame(filename.toLatin1().data()); } @@ -349,6 +364,7 @@ void GMainWindow::OnMenuRecentFile() { QFileInfo file_info(filename); if (file_info.exists()) { BootGame(filename.toLatin1().data()); + StoreRecentFile(filename); // Put the filename on top of the list } else { // Display an error message and remove the file from the list. QMessageBox::information(this, tr("File not found"), tr("File \"%1\" not found").arg(filename)); @@ -357,12 +373,7 @@ void GMainWindow::OnMenuRecentFile() { QStringList recent_files = settings.value("recentFiles").toStringList(); recent_files.removeOne(filename); settings.setValue("recentFiles", recent_files); - - action->setVisible(false); - // Grey out the recent files menu if the list is empty - if (ui.menu_recent_files->isEmpty()) { - ui.menu_recent_files->setEnabled(false); - } + UpdateRecentFiles(); } } @@ -430,6 +441,8 @@ void GMainWindow::closeEvent(QCloseEvent* event) { settings.setValue("geometry", saveGeometry()); settings.setValue("state", saveState()); settings.setValue("geometryRenderWindow", render_window->saveGeometry()); + settings.setValue("microProfileDialogGeometry", microProfileDialog->saveGeometry()); + settings.setValue("microProfileDialogVisible", microProfileDialog->isVisible()); settings.setValue("singleWindowMode", ui.action_Single_Window_Mode->isChecked()); settings.setValue("displayTitleBars", ui.actionDisplay_widget_title_bars->isChecked()); settings.setValue("firstStart", false); @@ -452,6 +465,11 @@ int main(int argc, char* argv[]) { Log::Filter log_filter(Log::Level::Info); Log::SetFilter(&log_filter); + MicroProfileOnThreadCreate("Frontend"); + SCOPE_EXIT({ + MicroProfileShutdown(); + }); + // Init settings params QSettings::setDefaultFormat(QSettings::IniFormat); QCoreApplication::setOrganizationName("Citra team"); diff --git a/src/citra_qt/main.h b/src/citra_qt/main.h index 4b260ae8b..32523fded 100644 --- a/src/citra_qt/main.h +++ b/src/citra_qt/main.h @@ -14,6 +14,7 @@ class GImageInfo; class GRenderWindow; class EmuThread; class ProfilerWidget; +class MicroProfileDialog; class DisassemblerWidget; class RegistersWidget; class CallstackWidget; @@ -60,6 +61,24 @@ private: void BootGame(const std::string& filename); void ShutdownGame(); + /** + * Stores the filename in the recently loaded files list. + * The new filename is stored at the beginning of the recently loaded files list. + * After inserting the new entry, duplicates are removed meaning that if + * this was inserted from \a OnMenuRecentFile(), the entry will be put on top + * and remove from its previous position. + * + * Finally, this function calls \a UpdateRecentFiles() to update the UI. + * + * @param filename the filename to store + */ + void StoreRecentFile(const QString& filename); + + /** + * Updates the recent files menu. + * Menu entries are rebuilt from the configuration file. + * If there is no entry in the menu, the menu is greyed out. + */ void UpdateRecentFiles(); void closeEvent(QCloseEvent* event) override; @@ -86,6 +105,7 @@ private: std::unique_ptr<EmuThread> emu_thread; ProfilerWidget* profilerWidget; + MicroProfileDialog* microProfileDialog; DisassemblerWidget* disasmWidget; RegistersWidget* registersWidget; CallstackWidget* callstackWidget; diff --git a/src/citra_qt/util/util.cpp b/src/citra_qt/util/util.cpp new file mode 100644 index 000000000..2cb939af1 --- /dev/null +++ b/src/citra_qt/util/util.cpp @@ -0,0 +1,13 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "util.h" + +QFont GetMonospaceFont() { + QFont font("monospace"); + // Automatic fallback to a monospace font on on platforms without a font called "monospace" + font.setStyleHint(QFont::Monospace); + font.setFixedPitch(true); + return font; +} diff --git a/src/citra_qt/util/util.h b/src/citra_qt/util/util.h new file mode 100644 index 000000000..98a944047 --- /dev/null +++ b/src/citra_qt/util/util.h @@ -0,0 +1,10 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <QFont> + +/// Returns a QFont object appropriate to use as a monospace font for debugging widgets, etc. +QFont GetMonospaceFont(); diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index e743a026d..7f3712efa 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -11,6 +11,7 @@ set(SRCS logging/text_formatter.cpp logging/backend.cpp memory_util.cpp + microprofile.cpp misc.cpp profiler.cpp scm_rev.cpp @@ -43,6 +44,8 @@ set(HEADERS make_unique.h math_util.h memory_util.h + microprofile.h + microprofileui.h platform.h profiler.h profiler_reporting.h diff --git a/src/common/common_funcs.h b/src/common/common_funcs.h index 88e452a16..ed20c3629 100644 --- a/src/common/common_funcs.h +++ b/src/common/common_funcs.h @@ -45,14 +45,20 @@ // GCC 4.8 defines all the rotate functions now // Small issue with GCC's lrotl/lrotr intrinsics is they are still 32bit while we require 64bit -#ifndef _rotl -inline u32 _rotl(u32 x, int shift) { +#ifdef _rotl +#define rotl _rotl +#else +inline u32 rotl(u32 x, int shift) { shift &= 31; if (!shift) return x; return (x << shift) | (x >> (32 - shift)); } +#endif -inline u32 _rotr(u32 x, int shift) { +#ifdef _rotr +#define rotr _rotr +#else +inline u32 rotr(u32 x, int shift) { shift &= 31; if (!shift) return x; return (x >> shift) | (x << (32 - shift)); diff --git a/src/common/file_util.h b/src/common/file_util.h index d0dccdf69..e71a9b2fa 100644 --- a/src/common/file_util.h +++ b/src/common/file_util.h @@ -244,7 +244,7 @@ private: template <typename T> void OpenFStream(T& fstream, const std::string& filename, std::ios_base::openmode openmode) { -#ifdef _WIN32 +#ifdef _MSC_VER fstream.open(Common::UTF8ToTStr(filename).c_str(), openmode); #else fstream.open(filename.c_str(), openmode); diff --git a/src/common/logging/log.h b/src/common/logging/log.h index e16dde7fc..5fd3bd7f5 100644 --- a/src/common/logging/log.h +++ b/src/common/logging/log.h @@ -91,17 +91,16 @@ void LogMessage(Class log_class, Level log_level, } // namespace Log #define LOG_GENERIC(log_class, log_level, ...) \ - ::Log::LogMessage(::Log::Class::log_class, ::Log::Level::log_level, \ - __FILE__, __LINE__, __func__, __VA_ARGS__) + ::Log::LogMessage(log_class, log_level, __FILE__, __LINE__, __func__, __VA_ARGS__) #ifdef _DEBUG -#define LOG_TRACE( log_class, ...) LOG_GENERIC(log_class, Trace, __VA_ARGS__) +#define LOG_TRACE( log_class, ...) LOG_GENERIC(::Log::Class::log_class, ::Log::Level::Trace, __VA_ARGS__) #else #define LOG_TRACE( log_class, ...) (void(0)) #endif -#define LOG_DEBUG( log_class, ...) LOG_GENERIC(log_class, Debug, __VA_ARGS__) -#define LOG_INFO( log_class, ...) LOG_GENERIC(log_class, Info, __VA_ARGS__) -#define LOG_WARNING( log_class, ...) LOG_GENERIC(log_class, Warning, __VA_ARGS__) -#define LOG_ERROR( log_class, ...) LOG_GENERIC(log_class, Error, __VA_ARGS__) -#define LOG_CRITICAL(log_class, ...) LOG_GENERIC(log_class, Critical, __VA_ARGS__) +#define LOG_DEBUG( log_class, ...) LOG_GENERIC(::Log::Class::log_class, ::Log::Level::Debug, __VA_ARGS__) +#define LOG_INFO( log_class, ...) LOG_GENERIC(::Log::Class::log_class, ::Log::Level::Info, __VA_ARGS__) +#define LOG_WARNING( log_class, ...) LOG_GENERIC(::Log::Class::log_class, ::Log::Level::Warning, __VA_ARGS__) +#define LOG_ERROR( log_class, ...) LOG_GENERIC(::Log::Class::log_class, ::Log::Level::Error, __VA_ARGS__) +#define LOG_CRITICAL(log_class, ...) LOG_GENERIC(::Log::Class::log_class, ::Log::Level::Critical, __VA_ARGS__) diff --git a/src/common/microprofile.cpp b/src/common/microprofile.cpp new file mode 100644 index 000000000..ee25dd37f --- /dev/null +++ b/src/common/microprofile.cpp @@ -0,0 +1,7 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +// Includes the MicroProfile implementation in this file for compilation +#define MICROPROFILE_IMPL 1 +#include "common/microprofile.h" diff --git a/src/common/microprofile.h b/src/common/microprofile.h new file mode 100644 index 000000000..9eb6016a8 --- /dev/null +++ b/src/common/microprofile.h @@ -0,0 +1,25 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +// Customized Citra settings. +// This file wraps the MicroProfile header so that these are consistent everywhere. +#define MICROPROFILE_WEBSERVER 0 +#define MICROPROFILE_GPU_TIMERS 0 // TODO: Implement timer queries when we upgrade to OpenGL 3.3 +#define MICROPROFILE_CONTEXT_SWITCH_TRACE 0 +#define MICROPROFILE_PER_THREAD_BUFFER_SIZE (2048<<12) // 8 MB + +#include <microprofile.h> + +#define MP_RGB(r, g, b) ((r) << 16 | (g) << 8 | (b) << 0) + +// On OS X, some Mach header included by MicroProfile defines these as macros, conflicting with +// identifiers we use. +#ifdef PAGE_SIZE +#undef PAGE_SIZE +#endif +#ifdef PAGE_MASK +#undef PAGE_MASK +#endif diff --git a/src/common/microprofileui.h b/src/common/microprofileui.h new file mode 100644 index 000000000..97c369bd9 --- /dev/null +++ b/src/common/microprofileui.h @@ -0,0 +1,16 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/microprofile.h" + +// Customized Citra settings. +// This file wraps the MicroProfile header so that these are consistent everywhere. +#define MICROPROFILE_TEXT_WIDTH 6 +#define MICROPROFILE_TEXT_HEIGHT 12 +#define MICROPROFILE_HELP_ALT "Right-Click" +#define MICROPROFILE_HELP_MOD "Ctrl" + +#include <microprofileui.h> diff --git a/src/common/x64/emitter.cpp b/src/common/x64/emitter.cpp index 4b79acd1f..939df210e 100644 --- a/src/common/x64/emitter.cpp +++ b/src/common/x64/emitter.cpp @@ -15,6 +15,7 @@ // Official SVN repository and contact information can be found at // http://code.google.com/p/dolphin-emu/ +#include <cinttypes> #include <cstring> #include "common/assert.h" @@ -25,11 +26,6 @@ #include "cpu_detect.h" #include "emitter.h" -#define PRIx64 "llx" - -// Minimize the diff against Dolphin -#define DYNA_REC JIT - namespace Gen { @@ -113,6 +109,29 @@ u8 *XEmitter::GetWritableCodePtr() return code; } +void XEmitter::Write8(u8 value) +{ + *code++ = value; +} + +void XEmitter::Write16(u16 value) +{ + std::memcpy(code, &value, sizeof(u16)); + code += sizeof(u16); +} + +void XEmitter::Write32(u32 value) +{ + std::memcpy(code, &value, sizeof(u32)); + code += sizeof(u32); +} + +void XEmitter::Write64(u64 value) +{ + std::memcpy(code, &value, sizeof(u64)); + code += sizeof(u64); +} + void XEmitter::ReserveCodeSpace(int bytes) { for (int i = 0; i < bytes; i++) @@ -374,7 +393,7 @@ void XEmitter::Rex(int w, int r, int x, int b) Write8(rx); } -void XEmitter::JMP(const u8 *addr, bool force5Bytes) +void XEmitter::JMP(const u8* addr, bool force5Bytes) { u64 fn = (u64)addr; if (!force5Bytes) @@ -398,7 +417,7 @@ void XEmitter::JMP(const u8 *addr, bool force5Bytes) } } -void XEmitter::JMPptr(const OpArg &arg2) +void XEmitter::JMPptr(const OpArg& arg2) { OpArg arg = arg2; if (arg.IsImm()) ASSERT_MSG(0, "JMPptr - Imm argument"); @@ -425,7 +444,7 @@ void XEmitter::CALLptr(OpArg arg) arg.WriteRest(this); } -void XEmitter::CALL(const void *fnptr) +void XEmitter::CALL(const void* fnptr) { u64 distance = u64(fnptr) - (u64(code) + 5); ASSERT_MSG( @@ -496,7 +515,7 @@ void XEmitter::J_CC(CCFlags conditionCode, const u8* addr, bool force5bytes) } } -void XEmitter::SetJumpTarget(const FixupBranch &branch) +void XEmitter::SetJumpTarget(const FixupBranch& branch) { if (branch.type == 0) { @@ -512,30 +531,6 @@ void XEmitter::SetJumpTarget(const FixupBranch &branch) } } -// INC/DEC considered harmful on newer CPUs due to partial flag set. -// Use ADD, SUB instead. - -/* -void XEmitter::INC(int bits, OpArg arg) -{ - if (arg.IsImm()) ASSERT_MSG(0, "INC - Imm argument"); - arg.operandReg = 0; - if (bits == 16) {Write8(0x66);} - arg.WriteRex(this, bits, bits); - Write8(bits == 8 ? 0xFE : 0xFF); - arg.WriteRest(this); -} -void XEmitter::DEC(int bits, OpArg arg) -{ - if (arg.IsImm()) ASSERT_MSG(0, "DEC - Imm argument"); - arg.operandReg = 1; - if (bits == 16) {Write8(0x66);} - arg.WriteRex(this, bits, bits); - Write8(bits == 8 ? 0xFE : 0xFF); - arg.WriteRest(this); -} -*/ - //Single byte opcodes //There is no PUSHAD/POPAD in 64-bit mode. void XEmitter::INT3() {Write8(0xCC);} @@ -667,7 +662,7 @@ void XEmitter::CBW(int bits) void XEmitter::PUSH(X64Reg reg) {WriteSimple1Byte(32, 0x50, reg);} void XEmitter::POP(X64Reg reg) {WriteSimple1Byte(32, 0x58, reg);} -void XEmitter::PUSH(int bits, const OpArg ®) +void XEmitter::PUSH(int bits, const OpArg& reg) { if (reg.IsSimpleReg()) PUSH(reg.GetSimpleReg()); @@ -703,7 +698,7 @@ void XEmitter::PUSH(int bits, const OpArg ®) } } -void XEmitter::POP(int /*bits*/, const OpArg ®) +void XEmitter::POP(int /*bits*/, const OpArg& reg) { if (reg.IsSimpleReg()) POP(reg.GetSimpleReg()); @@ -791,12 +786,12 @@ void XEmitter::WriteMulDivType(int bits, OpArg src, int ext) src.WriteRest(this); } -void XEmitter::MUL(int bits, OpArg src) {WriteMulDivType(bits, src, 4);} -void XEmitter::DIV(int bits, OpArg src) {WriteMulDivType(bits, src, 6);} -void XEmitter::IMUL(int bits, OpArg src) {WriteMulDivType(bits, src, 5);} -void XEmitter::IDIV(int bits, OpArg src) {WriteMulDivType(bits, src, 7);} -void XEmitter::NEG(int bits, OpArg src) {WriteMulDivType(bits, src, 3);} -void XEmitter::NOT(int bits, OpArg src) {WriteMulDivType(bits, src, 2);} +void XEmitter::MUL(int bits, const OpArg& src) {WriteMulDivType(bits, src, 4);} +void XEmitter::DIV(int bits, const OpArg& src) {WriteMulDivType(bits, src, 6);} +void XEmitter::IMUL(int bits, const OpArg& src) {WriteMulDivType(bits, src, 5);} +void XEmitter::IDIV(int bits, const OpArg& src) {WriteMulDivType(bits, src, 7);} +void XEmitter::NEG(int bits, const OpArg& src) {WriteMulDivType(bits, src, 3);} +void XEmitter::NOT(int bits, const OpArg& src) {WriteMulDivType(bits, src, 2);} void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep) { @@ -813,24 +808,24 @@ void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bo src.WriteRest(this); } -void XEmitter::MOVNTI(int bits, OpArg dest, X64Reg src) +void XEmitter::MOVNTI(int bits, const OpArg& dest, X64Reg src) { if (bits <= 16) ASSERT_MSG(0, "MOVNTI - bits<=16"); WriteBitSearchType(bits, src, dest, 0xC3); } -void XEmitter::BSF(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBC);} //bottom bit to top bit -void XEmitter::BSR(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBD);} //top bit to bottom bit +void XEmitter::BSF(int bits, X64Reg dest, const OpArg& src) {WriteBitSearchType(bits,dest,src,0xBC);} // Bottom bit to top bit +void XEmitter::BSR(int bits, X64Reg dest, const OpArg& src) {WriteBitSearchType(bits,dest,src,0xBD);} // Top bit to bottom bit -void XEmitter::TZCNT(int bits, X64Reg dest, OpArg src) +void XEmitter::TZCNT(int bits, X64Reg dest, const OpArg& src) { CheckFlags(); if (!Common::GetCPUCaps().bmi1) ASSERT_MSG(0, "Trying to use BMI1 on a system that doesn't support it. Bad programmer."); WriteBitSearchType(bits, dest, src, 0xBC, true); } -void XEmitter::LZCNT(int bits, X64Reg dest, OpArg src) +void XEmitter::LZCNT(int bits, X64Reg dest, const OpArg& src) { CheckFlags(); if (!Common::GetCPUCaps().lzcnt) @@ -950,7 +945,7 @@ void XEmitter::LEA(int bits, X64Reg dest, OpArg src) } //shift can be either imm8 or cl -void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext) +void XEmitter::WriteShift(int bits, OpArg dest, const OpArg& shift, int ext) { CheckFlags(); bool writeImm = false; @@ -991,16 +986,16 @@ void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext) // large rotates and shift are slower on intel than amd // intel likes to rotate by 1, and the op is smaller too -void XEmitter::ROL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 0);} -void XEmitter::ROR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 1);} -void XEmitter::RCL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 2);} -void XEmitter::RCR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 3);} -void XEmitter::SHL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 4);} -void XEmitter::SHR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 5);} -void XEmitter::SAR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 7);} +void XEmitter::ROL(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 0);} +void XEmitter::ROR(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 1);} +void XEmitter::RCL(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 2);} +void XEmitter::RCR(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 3);} +void XEmitter::SHL(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 4);} +void XEmitter::SHR(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 5);} +void XEmitter::SAR(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 7);} // index can be either imm8 or register, don't use memory destination because it's slow -void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext) +void XEmitter::WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext) { CheckFlags(); if (dest.IsImm()) @@ -1029,13 +1024,13 @@ void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext) } } -void XEmitter::BT(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 4);} -void XEmitter::BTS(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 5);} -void XEmitter::BTR(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 6);} -void XEmitter::BTC(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 7);} +void XEmitter::BT(int bits, const OpArg& dest, const OpArg& index) {WriteBitTest(bits, dest, index, 4);} +void XEmitter::BTS(int bits, const OpArg& dest, const OpArg& index) {WriteBitTest(bits, dest, index, 5);} +void XEmitter::BTR(int bits, const OpArg& dest, const OpArg& index) {WriteBitTest(bits, dest, index, 6);} +void XEmitter::BTC(int bits, const OpArg& dest, const OpArg& index) {WriteBitTest(bits, dest, index, 7);} //shift can be either imm8 or cl -void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift) +void XEmitter::SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift) { CheckFlags(); if (dest.IsImm()) @@ -1067,7 +1062,7 @@ void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift) } } -void XEmitter::SHLD(int bits, OpArg dest, OpArg src, OpArg shift) +void XEmitter::SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift) { CheckFlags(); if (dest.IsImm()) @@ -1111,7 +1106,7 @@ void OpArg::WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg _operandReg, int bit } //operand can either be immediate or register -void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const +void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg& operand, int bits) const { X64Reg _operandReg; if (IsImm()) @@ -1257,7 +1252,7 @@ void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &o } } -void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2) +void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2) { if (a1.IsImm()) { @@ -1283,24 +1278,24 @@ void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg } } -void XEmitter::ADD (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADD, a1, a2);} -void XEmitter::ADC (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADC, a1, a2);} -void XEmitter::SUB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSUB, a1, a2);} -void XEmitter::SBB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSBB, a1, a2);} -void XEmitter::AND (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmAND, a1, a2);} -void XEmitter::OR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmOR , a1, a2);} -void XEmitter::XOR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmXOR, a1, a2);} -void XEmitter::MOV (int bits, const OpArg &a1, const OpArg &a2) +void XEmitter::ADD (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADD, a1, a2);} +void XEmitter::ADC (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADC, a1, a2);} +void XEmitter::SUB (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSUB, a1, a2);} +void XEmitter::SBB (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSBB, a1, a2);} +void XEmitter::AND (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmAND, a1, a2);} +void XEmitter::OR (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmOR , a1, a2);} +void XEmitter::XOR (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmXOR, a1, a2);} +void XEmitter::MOV (int bits, const OpArg& a1, const OpArg& a2) { if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg()) LOG_ERROR(Common, "Redundant MOV @ %p - bug in JIT?", code); WriteNormalOp(this, bits, nrmMOV, a1, a2); } -void XEmitter::TEST(int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmTEST, a1, a2);} -void XEmitter::CMP (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmCMP, a1, a2);} -void XEmitter::XCHG(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXCHG, a1, a2);} +void XEmitter::TEST(int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmTEST, a1, a2);} +void XEmitter::CMP (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmCMP, a1, a2);} +void XEmitter::XCHG(int bits, const OpArg& a1, const OpArg& a2) {WriteNormalOp(this, bits, nrmXCHG, a1, a2);} -void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2) +void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a1, const OpArg& a2) { CheckFlags(); if (bits == 8) @@ -1353,7 +1348,7 @@ void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2) } } -void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a) +void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a) { CheckFlags(); if (bits == 8) @@ -1390,7 +1385,7 @@ void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extr arg.WriteRest(this, extrabytes); } -void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes) { WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes); } @@ -1400,25 +1395,25 @@ static int GetVEXmmmmm(u16 op) // Currently, only 0x38 and 0x3A are used as secondary escape byte. if ((op >> 8) == 0x3A) return 3; - else if ((op >> 8) == 0x38) + if ((op >> 8) == 0x38) return 2; - else - return 1; + + return 1; } static int GetVEXpp(u8 opPrefix) { if (opPrefix == 0x66) return 1; - else if (opPrefix == 0xF3) + if (opPrefix == 0xF3) return 2; - else if (opPrefix == 0xF2) + if (opPrefix == 0xF2) return 3; - else - return 0; + + return 0; } -void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes) { if (!Common::GetCPUCaps().avx) ASSERT_MSG(0, "Trying to use AVX on a system that doesn't support it. Bad programmer."); @@ -1431,7 +1426,7 @@ void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpA } // Like the above, but more general; covers GPR-based VEX operations, like BMI1/2 -void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes) { if (size != 32 && size != 64) ASSERT_MSG(0, "VEX GPR instructions only support 32-bit and 64-bit modes!"); @@ -1442,7 +1437,7 @@ void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg r arg.WriteRest(this, extrabytes, regOp1); } -void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes) { CheckFlags(); if (!Common::GetCPUCaps().bmi1) @@ -1450,7 +1445,7 @@ void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); } -void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes) { CheckFlags(); if (!Common::GetCPUCaps().bmi2) @@ -1517,135 +1512,136 @@ void XEmitter::WriteMXCSR(OpArg arg, int ext) arg.WriteRest(this); } -void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);} -void XEmitter::LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);} - -void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);} -void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);} -void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);} - -void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseADD, regOp, arg);} -void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseADD, regOp, arg);} -void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSUB, regOp, arg);} -void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSUB, regOp, arg);} -void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);} -void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);} -void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMUL, regOp, arg);} -void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMUL, regOp, arg);} -void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseDIV, regOp, arg);} -void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseDIV, regOp, arg);} -void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMIN, regOp, arg);} -void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMIN, regOp, arg);} -void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMAX, regOp, arg);} -void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMAX, regOp, arg);} -void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSQRT, regOp, arg);} -void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSQRT, regOp, arg);} -void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);} - -void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseADD, regOp, arg);} -void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseADD, regOp, arg);} -void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSUB, regOp, arg);} -void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSUB, regOp, arg);} -void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);} -void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);} -void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseAND, regOp, arg);} -void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseAND, regOp, arg);} -void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseANDN, regOp, arg);} -void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseANDN, regOp, arg);} -void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseOR, regOp, arg);} -void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseOR, regOp, arg);} -void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseXOR, regOp, arg);} -void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseXOR, regOp, arg);} -void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMUL, regOp, arg);} -void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMUL, regOp, arg);} -void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseDIV, regOp, arg);} -void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseDIV, regOp, arg);} -void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMIN, regOp, arg);} -void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMIN, regOp, arg);} -void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMAX, regOp, arg);} -void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMAX, regOp, arg);} -void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSQRT, regOp, arg);} -void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSQRT, regOp, arg);} -void XEmitter::RCPPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseRCP, regOp, arg); } -void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);} -void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);} -void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);} - -void XEmitter::HADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseHADD, regOp, arg);} - -void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //weird that these should be packed -void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered -void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered -void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);} - -void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);} -void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);} -void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);} -void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);} - -void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);} -void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);} -void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);} -void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);} - -void XEmitter::MOVDQA(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);} -void XEmitter::MOVDQA(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);} -void XEmitter::MOVDQU(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);} -void XEmitter::MOVDQU(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);} - -void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);} -void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);} -void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);} -void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);} - -void XEmitter::MOVLPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); } -void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); } -void XEmitter::MOVLPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); } -void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); } - -void XEmitter::MOVHPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); } -void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); } -void XEmitter::MOVHPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); } -void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); } +void XEmitter::STMXCSR(const OpArg& memloc) {WriteMXCSR(memloc, 3);} +void XEmitter::LDMXCSR(const OpArg& memloc) {WriteMXCSR(memloc, 2);} + +void XEmitter::MOVNTDQ(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);} +void XEmitter::MOVNTPS(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);} +void XEmitter::MOVNTPD(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);} + +void XEmitter::ADDSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseADD, regOp, arg);} +void XEmitter::ADDSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseADD, regOp, arg);} +void XEmitter::SUBSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseSUB, regOp, arg);} +void XEmitter::SUBSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseSUB, regOp, arg);} +void XEmitter::CMPSS(X64Reg regOp, const OpArg& arg, u8 compare) {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::CMPSD(X64Reg regOp, const OpArg& arg, u8 compare) {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::MULSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseMUL, regOp, arg);} +void XEmitter::MULSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseMUL, regOp, arg);} +void XEmitter::DIVSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseDIV, regOp, arg);} +void XEmitter::DIVSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseDIV, regOp, arg);} +void XEmitter::MINSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseMIN, regOp, arg);} +void XEmitter::MINSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseMIN, regOp, arg);} +void XEmitter::MAXSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseMAX, regOp, arg);} +void XEmitter::MAXSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseMAX, regOp, arg);} +void XEmitter::SQRTSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseSQRT, regOp, arg);} +void XEmitter::SQRTSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseSQRT, regOp, arg);} +void XEmitter::RCPSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseRCP, regOp, arg);} +void XEmitter::RSQRTSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);} + +void XEmitter::ADDPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseADD, regOp, arg);} +void XEmitter::ADDPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseADD, regOp, arg);} +void XEmitter::SUBPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseSUB, regOp, arg);} +void XEmitter::SUBPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseSUB, regOp, arg);} +void XEmitter::CMPPS(X64Reg regOp, const OpArg& arg, u8 compare) {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::CMPPD(X64Reg regOp, const OpArg& arg, u8 compare) {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::ANDPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseAND, regOp, arg);} +void XEmitter::ANDPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseAND, regOp, arg);} +void XEmitter::ANDNPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseANDN, regOp, arg);} +void XEmitter::ANDNPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseANDN, regOp, arg);} +void XEmitter::ORPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseOR, regOp, arg);} +void XEmitter::ORPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseOR, regOp, arg);} +void XEmitter::XORPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseXOR, regOp, arg);} +void XEmitter::XORPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseXOR, regOp, arg);} +void XEmitter::MULPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseMUL, regOp, arg);} +void XEmitter::MULPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMUL, regOp, arg);} +void XEmitter::DIVPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseDIV, regOp, arg);} +void XEmitter::DIVPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseDIV, regOp, arg);} +void XEmitter::MINPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseMIN, regOp, arg);} +void XEmitter::MINPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMIN, regOp, arg);} +void XEmitter::MAXPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseMAX, regOp, arg);} +void XEmitter::MAXPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMAX, regOp, arg);} +void XEmitter::SQRTPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseSQRT, regOp, arg);} +void XEmitter::SQRTPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseSQRT, regOp, arg);} +void XEmitter::RCPPS(X64Reg regOp, const OpArg& arg) { WriteSSEOp(0x00, sseRCP, regOp, arg); } +void XEmitter::RSQRTPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);} +void XEmitter::SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);} +void XEmitter::SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);} + +void XEmitter::HADDPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseHADD, regOp, arg);} + +void XEmitter::COMISS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //weird that these should be packed +void XEmitter::COMISD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered +void XEmitter::UCOMISS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered +void XEmitter::UCOMISD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);} + +void XEmitter::MOVAPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);} +void XEmitter::MOVAPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);} +void XEmitter::MOVAPS(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);} +void XEmitter::MOVAPD(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);} + +void XEmitter::MOVUPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVUPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVUPS(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);} +void XEmitter::MOVUPD(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);} + +void XEmitter::MOVDQA(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);} +void XEmitter::MOVDQA(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);} +void XEmitter::MOVDQU(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);} +void XEmitter::MOVDQU(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);} + +void XEmitter::MOVSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVSS(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);} +void XEmitter::MOVSD(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);} + +void XEmitter::MOVLPS(X64Reg regOp, const OpArg& arg) { WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); } +void XEmitter::MOVLPD(X64Reg regOp, const OpArg& arg) { WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); } +void XEmitter::MOVLPS(const OpArg& arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); } +void XEmitter::MOVLPD(const OpArg& arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); } + +void XEmitter::MOVHPS(X64Reg regOp, const OpArg& arg) { WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); } +void XEmitter::MOVHPD(X64Reg regOp, const OpArg& arg) { WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); } +void XEmitter::MOVHPS(const OpArg& arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); } +void XEmitter::MOVHPD(const OpArg& arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); } void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));} void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));} -void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);} -void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);} +void XEmitter::CVTPS2PD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);} +void XEmitter::CVTPD2PS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);} -void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);} -void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);} -void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);} -void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);} -void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);} -void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);} +void XEmitter::CVTSD2SS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);} +void XEmitter::CVTSS2SD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);} +void XEmitter::CVTSD2SI(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);} +void XEmitter::CVTSS2SI(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);} +void XEmitter::CVTSI2SD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);} +void XEmitter::CVTSI2SS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);} -void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);} -void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);} -void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);} -void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);} +void XEmitter::CVTDQ2PD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);} +void XEmitter::CVTDQ2PS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);} +void XEmitter::CVTPD2DQ(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);} +void XEmitter::CVTPS2DQ(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);} -void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);} -void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);} -void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);} -void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);} +void XEmitter::CVTTSD2SI(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);} +void XEmitter::CVTTSS2SI(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);} +void XEmitter::CVTTPS2DQ(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);} +void XEmitter::CVTTPD2DQ(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);} void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));} -void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);} -void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);} +void XEmitter::MOVMSKPS(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x00, 0x50, dest, arg);} +void XEmitter::MOVMSKPD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x50, dest, arg);} -void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only +void XEmitter::LDDQU(X64Reg dest, const OpArg& arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only // THESE TWO ARE UNTESTED. -void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);} -void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);} +void XEmitter::UNPCKLPS(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x00, 0x14, dest, arg);} +void XEmitter::UNPCKHPS(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x00, 0x15, dest, arg);} -void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);} -void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);} +void XEmitter::UNPCKLPD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x14, dest, arg);} +void XEmitter::UNPCKHPD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x15, dest, arg);} -void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg) +void XEmitter::MOVDDUP(X64Reg regOp, const OpArg& arg) { if (Common::GetCPUCaps().sse3) { @@ -1663,9 +1659,9 @@ void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg) //There are a few more left // Also some integer instructions are missing -void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);} -void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);} -void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);} +void XEmitter::PACKSSDW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x6B, dest, arg);} +void XEmitter::PACKSSWB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x63, dest, arg);} +void XEmitter::PACKUSWB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x67, dest, arg);} void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);} void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);} @@ -1690,7 +1686,7 @@ void XEmitter::PSRLQ(X64Reg reg, int shift) Write8(shift); } -void XEmitter::PSRLQ(X64Reg reg, OpArg arg) +void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg) { WriteSSEOp(0x66, 0xd3, reg, arg); } @@ -1735,212 +1731,212 @@ void XEmitter::PSRAD(X64Reg reg, int shift) Write8(shift); } -void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes) { if (!Common::GetCPUCaps().ssse3) ASSERT_MSG(0, "Trying to use SSSE3 on a system that doesn't support it. Bad programmer."); WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); } -void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes) { if (!Common::GetCPUCaps().sse4_1) ASSERT_MSG(0, "Trying to use SSE4.1 on a system that doesn't support it. Bad programmer."); WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); } -void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);} -void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);} -void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);} -void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);} - -void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);} -void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);} -void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);} -void XEmitter::PMINUD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383b, dest, arg);} -void XEmitter::PMAXSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383c, dest, arg);} -void XEmitter::PMAXSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383d, dest, arg);} -void XEmitter::PMAXUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383e, dest, arg);} -void XEmitter::PMAXUD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383f, dest, arg);} - -void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);} -void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);} -void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);} -void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);} -void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);} -void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);} -void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);} -void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);} -void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);} -void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);} -void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);} -void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);} - -void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);} -void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);} -void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);} +void XEmitter::PSHUFB(X64Reg dest, const OpArg& arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);} +void XEmitter::PTEST(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);} +void XEmitter::PACKUSDW(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);} +void XEmitter::DPPS(X64Reg dest, const OpArg& arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);} + +void XEmitter::PMINSB(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);} +void XEmitter::PMINSD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);} +void XEmitter::PMINUW(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);} +void XEmitter::PMINUD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383b, dest, arg);} +void XEmitter::PMAXSB(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383c, dest, arg);} +void XEmitter::PMAXSD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383d, dest, arg);} +void XEmitter::PMAXUW(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383e, dest, arg);} +void XEmitter::PMAXUD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383f, dest, arg);} + +void XEmitter::PMOVSXBW(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);} +void XEmitter::PMOVSXBD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);} +void XEmitter::PMOVSXBQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);} +void XEmitter::PMOVSXWD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);} +void XEmitter::PMOVSXWQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);} +void XEmitter::PMOVSXDQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);} +void XEmitter::PMOVZXBW(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);} +void XEmitter::PMOVZXBD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);} +void XEmitter::PMOVZXBQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);} +void XEmitter::PMOVZXWD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);} +void XEmitter::PMOVZXWQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);} +void XEmitter::PMOVZXDQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);} + +void XEmitter::PBLENDVB(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);} +void XEmitter::BLENDVPS(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);} +void XEmitter::BLENDVPD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);} void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); Write8(blend); } void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); Write8(blend); } -void XEmitter::ROUNDSS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0A, dest, arg, 1); Write8(mode);} -void XEmitter::ROUNDSD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0B, dest, arg, 1); Write8(mode);} -void XEmitter::ROUNDPS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A08, dest, arg, 1); Write8(mode);} -void XEmitter::ROUNDPD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A09, dest, arg, 1); Write8(mode);} +void XEmitter::ROUNDSS(X64Reg dest, const OpArg& arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0A, dest, arg, 1); Write8(mode);} +void XEmitter::ROUNDSD(X64Reg dest, const OpArg& arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0B, dest, arg, 1); Write8(mode);} +void XEmitter::ROUNDPS(X64Reg dest, const OpArg& arg, u8 mode) {WriteSSE41Op(0x66, 0x3A08, dest, arg, 1); Write8(mode);} +void XEmitter::ROUNDPD(X64Reg dest, const OpArg& arg, u8 mode) {WriteSSE41Op(0x66, 0x3A09, dest, arg, 1); Write8(mode);} -void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDB, dest, arg);} -void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDF, dest, arg);} -void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEF, dest, arg);} -void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEB, dest, arg);} +void XEmitter::PAND(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDB, dest, arg);} +void XEmitter::PANDN(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDF, dest, arg);} +void XEmitter::PXOR(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xEF, dest, arg);} +void XEmitter::POR(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xEB, dest, arg);} -void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFC, dest, arg);} -void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFD, dest, arg);} -void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFE, dest, arg);} -void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD4, dest, arg);} +void XEmitter::PADDB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xFC, dest, arg);} +void XEmitter::PADDW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xFD, dest, arg);} +void XEmitter::PADDD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xFE, dest, arg);} +void XEmitter::PADDQ(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xD4, dest, arg);} -void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEC, dest, arg);} -void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xED, dest, arg);} -void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDC, dest, arg);} -void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDD, dest, arg);} +void XEmitter::PADDSB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xEC, dest, arg);} +void XEmitter::PADDSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xED, dest, arg);} +void XEmitter::PADDUSB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDC, dest, arg);} +void XEmitter::PADDUSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDD, dest, arg);} -void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF8, dest, arg);} -void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF9, dest, arg);} -void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFA, dest, arg);} -void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFB, dest, arg);} +void XEmitter::PSUBB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xF8, dest, arg);} +void XEmitter::PSUBW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xF9, dest, arg);} +void XEmitter::PSUBD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xFA, dest, arg);} +void XEmitter::PSUBQ(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xFB, dest, arg);} -void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE8, dest, arg);} -void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE9, dest, arg);} -void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD8, dest, arg);} -void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD9, dest, arg);} +void XEmitter::PSUBSB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xE8, dest, arg);} +void XEmitter::PSUBSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xE9, dest, arg);} +void XEmitter::PSUBUSB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xD8, dest, arg);} +void XEmitter::PSUBUSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xD9, dest, arg);} -void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE0, dest, arg);} -void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE3, dest, arg);} +void XEmitter::PAVGB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xE0, dest, arg);} +void XEmitter::PAVGW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xE3, dest, arg);} -void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x74, dest, arg);} -void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x75, dest, arg);} -void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x76, dest, arg);} +void XEmitter::PCMPEQB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x74, dest, arg);} +void XEmitter::PCMPEQW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x75, dest, arg);} +void XEmitter::PCMPEQD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x76, dest, arg);} -void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x64, dest, arg);} -void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x65, dest, arg);} -void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x66, dest, arg);} +void XEmitter::PCMPGTB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x64, dest, arg);} +void XEmitter::PCMPGTW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x65, dest, arg);} +void XEmitter::PCMPGTD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x66, dest, arg);} -void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC5, dest, arg, 1); Write8(subreg);} -void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC4, dest, arg, 1); Write8(subreg);} +void XEmitter::PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg) {WriteSSEOp(0x66, 0xC5, dest, arg, 1); Write8(subreg);} +void XEmitter::PINSRW(X64Reg dest, const OpArg& arg, u8 subreg) {WriteSSEOp(0x66, 0xC4, dest, arg, 1); Write8(subreg);} -void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF5, dest, arg); } -void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF6, dest, arg);} +void XEmitter::PMADDWD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xF5, dest, arg); } +void XEmitter::PSADBW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xF6, dest, arg);} -void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEE, dest, arg); } -void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDE, dest, arg); } -void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEA, dest, arg); } -void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDA, dest, arg); } +void XEmitter::PMAXSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xEE, dest, arg); } +void XEmitter::PMAXUB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDE, dest, arg); } +void XEmitter::PMINSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xEA, dest, arg); } +void XEmitter::PMINUB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDA, dest, arg); } -void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD7, dest, arg); } -void XEmitter::PSHUFD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, 0x70, regOp, arg, 1); Write8(shuffle);} -void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);} -void XEmitter::PSHUFHW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF3, 0x70, regOp, arg, 1); Write8(shuffle);} +void XEmitter::PMOVMSKB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xD7, dest, arg); } +void XEmitter::PSHUFD(X64Reg regOp, const OpArg& arg, u8 shuffle) {WriteSSEOp(0x66, 0x70, regOp, arg, 1); Write8(shuffle);} +void XEmitter::PSHUFLW(X64Reg regOp, const OpArg& arg, u8 shuffle) {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);} +void XEmitter::PSHUFHW(X64Reg regOp, const OpArg& arg, u8 shuffle) {WriteSSEOp(0xF3, 0x70, regOp, arg, 1); Write8(shuffle);} // VEX -void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);} -void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);} -void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);} -void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);} -void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);} -void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);} -void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);} -void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);} -void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);} -void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);} -void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);} -void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);} - -void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); } -void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); } -void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); } -void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); } -void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); } -void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); } -void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); } -void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); } - -void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); } -void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); } -void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); } -void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); } - -void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg); } -void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg); } -void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg); } -void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1); } -void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1); } -void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg, 1); } -void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg); } -void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg); } -void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg); } -void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1); } -void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1); } -void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1); } -void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg); } -void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg); } -void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg); } -void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1); } -void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1); } -void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1); } -void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg); } -void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg); } -void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg); } -void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1); } -void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1); } -void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg); } -void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg); } -void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg); } -void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg); } -void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg); } -void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg); } -void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg); } -void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg); } -void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg); } -void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg); } -void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg); } -void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg); } -void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1); } -void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1); } -void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg); } -void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg); } -void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg); } -void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1); } -void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1); } -void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1); } -void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg); } -void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg); } -void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg); } -void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1); } -void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1); } -void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1); } - -void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);} -void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);} -void XEmitter::SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);} -void XEmitter::RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate) {WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); Write8(rotate);} -void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);} -void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);} -void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);} -void XEmitter::BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);} -void XEmitter::BLSR(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);} -void XEmitter::BLSMSK(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);} -void XEmitter::BLSI(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);} -void XEmitter::BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2){WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);} -void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);} +void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);} +void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);} +void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);} +void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);} +void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);} +void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);} +void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);} +void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);} +void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);} +void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);} +void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);} +void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);} + +void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); } +void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); } +void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); } +void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); } +void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); } +void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); } +void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); } +void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); } + +void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); } +void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); } +void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); } +void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); } + +void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg); } +void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg); } +void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg); } +void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg); } +void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg); } +void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg); } +void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg); } +void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg); } +void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg); } +void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg); } +void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg); } +void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg); } +void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg); } +void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg); } +void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg); } +void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg); } +void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg); } +void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg); } +void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg); } +void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg); } +void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg); } +void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg); } +void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg); } +void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg); } +void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1); } + +void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate) {WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); Write8(rotate);} +void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);} +void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);} +void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);} +void XEmitter::BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);} +void XEmitter::BLSR(int bits, X64Reg regOp, const OpArg& arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);} +void XEmitter::BLSMSK(int bits, X64Reg regOp, const OpArg& arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);} +void XEmitter::BLSI(int bits, X64Reg regOp, const OpArg& arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);} +void XEmitter::BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2){WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);} // Prefixes @@ -1956,7 +1952,7 @@ void XEmitter::FWAIT() } // TODO: make this more generic -void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg) +void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg) { int mf = 0; ASSERT_MSG(!(bits == 80 && op_80b == floatINVALID), "WriteFloatLoadStore: 80 bits not supported for this instruction"); @@ -1974,9 +1970,9 @@ void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg a arg.WriteRest(this, 0, (X64Reg) op); } -void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, floatLD80, src);} -void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, floatINVALID, dest);} -void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, floatSTP80, dest);} +void XEmitter::FLD(int bits, const OpArg& src) {WriteFloatLoadStore(bits, floatLD, floatLD80, src);} +void XEmitter::FST(int bits, const OpArg& dest) {WriteFloatLoadStore(bits, floatST, floatINVALID, dest);} +void XEmitter::FSTP(int bits, const OpArg& dest) {WriteFloatLoadStore(bits, floatSTP, floatSTP80, dest);} void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); } void XEmitter::RDTSC() { Write8(0x0F); Write8(0x31); } diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h index e9c924126..a49cd2cf1 100644 --- a/src/common/x64/emitter.h +++ b/src/common/x64/emitter.h @@ -328,8 +328,6 @@ enum SSECompare ORD, }; -typedef const u8* JumpTarget; - class XEmitter { friend struct OpArg; // for Write8 etc @@ -344,27 +342,27 @@ private: void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg); void WriteMulDivType(int bits, OpArg src, int ext); void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false); - void WriteShift(int bits, OpArg dest, OpArg &shift, int ext); - void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext); + void WriteShift(int bits, OpArg dest, const OpArg& shift, int ext); + void WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext); void WriteMXCSR(OpArg arg, int ext); void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); - void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); - void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); - void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); - void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); - void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); - void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); - void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); - void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg); - void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2); + void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0); + void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0); + void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0); + void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0); + void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg); + void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2); void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); protected: - inline void Write8(u8 value) {*code++ = value;} - inline void Write16(u16 value) {*(u16*)code = (value); code += 2;} - inline void Write32(u32 value) {*(u32*)code = (value); code += 4;} - inline void Write64(u64 value) {*(u64*)code = (value); code += 8;} + void Write8(u8 value); + void Write16(u16 value); + void Write32(u32 value); + void Write64(u64 value); public: XEmitter() { code = nullptr; flags_locked = false; } @@ -413,8 +411,8 @@ public: // Stack control void PUSH(X64Reg reg); void POP(X64Reg reg); - void PUSH(int bits, const OpArg ®); - void POP(int bits, const OpArg ®); + void PUSH(int bits, const OpArg& reg); + void POP(int bits, const OpArg& reg); void PUSHF(); void POPF(); @@ -424,21 +422,19 @@ public: void UD2(); FixupBranch J(bool force5bytes = false); - void JMP(const u8 * addr, bool force5Bytes = false); - void JMP(OpArg arg); - void JMPptr(const OpArg &arg); + void JMP(const u8* addr, bool force5Bytes = false); + void JMPptr(const OpArg& arg); void JMPself(); //infinite loop! #ifdef CALL #undef CALL #endif - void CALL(const void *fnptr); + void CALL(const void* fnptr); void CALLptr(OpArg arg); FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); - //void J_CC(CCFlags conditionCode, JumpTarget target); - void J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes = false); + void J_CC(CCFlags conditionCode, const u8* addr, bool force5Bytes = false); - void SetJumpTarget(const FixupBranch &branch); + void SetJumpTarget(const FixupBranch& branch); void SETcc(CCFlags flag, OpArg dest); // Note: CMOV brings small if any benefit on current cpus. @@ -450,8 +446,8 @@ public: void SFENCE(); // Bit scan - void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit - void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit + void BSF(int bits, X64Reg dest, const OpArg& src); // Bottom bit to top bit + void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit // Cache control enum PrefetchLevel @@ -462,67 +458,67 @@ public: PF_T2, //Levels 3+ (aliased to T0 on AMD) }; void PREFETCH(PrefetchLevel level, OpArg arg); - void MOVNTI(int bits, OpArg dest, X64Reg src); - void MOVNTDQ(OpArg arg, X64Reg regOp); - void MOVNTPS(OpArg arg, X64Reg regOp); - void MOVNTPD(OpArg arg, X64Reg regOp); + void MOVNTI(int bits, const OpArg& dest, X64Reg src); + void MOVNTDQ(const OpArg& arg, X64Reg regOp); + void MOVNTPS(const OpArg& arg, X64Reg regOp); + void MOVNTPD(const OpArg& arg, X64Reg regOp); // Multiplication / division - void MUL(int bits, OpArg src); //UNSIGNED - void IMUL(int bits, OpArg src); //SIGNED - void IMUL(int bits, X64Reg regOp, OpArg src); - void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm); - void DIV(int bits, OpArg src); - void IDIV(int bits, OpArg src); + void MUL(int bits, const OpArg& src); //UNSIGNED + void IMUL(int bits, const OpArg& src); //SIGNED + void IMUL(int bits, X64Reg regOp, const OpArg& src); + void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm); + void DIV(int bits, const OpArg& src); + void IDIV(int bits, const OpArg& src); // Shift - void ROL(int bits, OpArg dest, OpArg shift); - void ROR(int bits, OpArg dest, OpArg shift); - void RCL(int bits, OpArg dest, OpArg shift); - void RCR(int bits, OpArg dest, OpArg shift); - void SHL(int bits, OpArg dest, OpArg shift); - void SHR(int bits, OpArg dest, OpArg shift); - void SAR(int bits, OpArg dest, OpArg shift); + void ROL(int bits, const OpArg& dest, const OpArg& shift); + void ROR(int bits, const OpArg& dest, const OpArg& shift); + void RCL(int bits, const OpArg& dest, const OpArg& shift); + void RCR(int bits, const OpArg& dest, const OpArg& shift); + void SHL(int bits, const OpArg& dest, const OpArg& shift); + void SHR(int bits, const OpArg& dest, const OpArg& shift); + void SAR(int bits, const OpArg& dest, const OpArg& shift); // Bit Test - void BT(int bits, OpArg dest, OpArg index); - void BTS(int bits, OpArg dest, OpArg index); - void BTR(int bits, OpArg dest, OpArg index); - void BTC(int bits, OpArg dest, OpArg index); + void BT(int bits, const OpArg& dest, const OpArg& index); + void BTS(int bits, const OpArg& dest, const OpArg& index); + void BTR(int bits, const OpArg& dest, const OpArg& index); + void BTC(int bits, const OpArg& dest, const OpArg& index); // Double-Precision Shift - void SHRD(int bits, OpArg dest, OpArg src, OpArg shift); - void SHLD(int bits, OpArg dest, OpArg src, OpArg shift); + void SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift); + void SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift); // Extend EAX into EDX in various ways void CWD(int bits = 16); - inline void CDQ() {CWD(32);} - inline void CQO() {CWD(64);} + void CDQ() {CWD(32);} + void CQO() {CWD(64);} void CBW(int bits = 8); - inline void CWDE() {CBW(16);} - inline void CDQE() {CBW(32);} + void CWDE() {CBW(16);} + void CDQE() {CBW(32);} // Load effective address void LEA(int bits, X64Reg dest, OpArg src); // Integer arithmetic - void NEG (int bits, OpArg src); - void ADD (int bits, const OpArg &a1, const OpArg &a2); - void ADC (int bits, const OpArg &a1, const OpArg &a2); - void SUB (int bits, const OpArg &a1, const OpArg &a2); - void SBB (int bits, const OpArg &a1, const OpArg &a2); - void AND (int bits, const OpArg &a1, const OpArg &a2); - void CMP (int bits, const OpArg &a1, const OpArg &a2); + void NEG(int bits, const OpArg& src); + void ADD(int bits, const OpArg& a1, const OpArg& a2); + void ADC(int bits, const OpArg& a1, const OpArg& a2); + void SUB(int bits, const OpArg& a1, const OpArg& a2); + void SBB(int bits, const OpArg& a1, const OpArg& a2); + void AND(int bits, const OpArg& a1, const OpArg& a2); + void CMP(int bits, const OpArg& a1, const OpArg& a2); // Bit operations - void NOT (int bits, OpArg src); - void OR (int bits, const OpArg &a1, const OpArg &a2); - void XOR (int bits, const OpArg &a1, const OpArg &a2); - void MOV (int bits, const OpArg &a1, const OpArg &a2); - void TEST(int bits, const OpArg &a1, const OpArg &a2); + void NOT (int bits, const OpArg& src); + void OR(int bits, const OpArg& a1, const OpArg& a2); + void XOR(int bits, const OpArg& a1, const OpArg& a2); + void MOV(int bits, const OpArg& a1, const OpArg& a2); + void TEST(int bits, const OpArg& a1, const OpArg& a2); // Are these useful at all? Consider removing. - void XCHG(int bits, const OpArg &a1, const OpArg &a2); + void XCHG(int bits, const OpArg& a1, const OpArg& a2); void XCHG_AHAL(); // Byte swapping (32 and 64-bit only). @@ -536,13 +532,13 @@ public: void MOVBE(int dbits, const OpArg& dest, const OpArg& src); // Available only on AMD >= Phenom or Intel >= Haswell - void LZCNT(int bits, X64Reg dest, OpArg src); + void LZCNT(int bits, X64Reg dest, const OpArg& src); // Note: this one is actually part of BMI1 - void TZCNT(int bits, X64Reg dest, OpArg src); + void TZCNT(int bits, X64Reg dest, const OpArg& src); // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64) - void STMXCSR(OpArg memloc); - void LDMXCSR(OpArg memloc); + void STMXCSR(const OpArg& memloc); + void LDMXCSR(const OpArg& memloc); // Prefixes void LOCK(); @@ -569,259 +565,243 @@ public: x87_FPUBusy = 0x8000, }; - void FLD(int bits, OpArg src); - void FST(int bits, OpArg dest); - void FSTP(int bits, OpArg dest); + void FLD(int bits, const OpArg& src); + void FST(int bits, const OpArg& dest); + void FSTP(int bits, const OpArg& dest); void FNSTSW_AX(); void FWAIT(); // SSE/SSE2: Floating point arithmetic - void ADDSS(X64Reg regOp, OpArg arg); - void ADDSD(X64Reg regOp, OpArg arg); - void SUBSS(X64Reg regOp, OpArg arg); - void SUBSD(X64Reg regOp, OpArg arg); - void MULSS(X64Reg regOp, OpArg arg); - void MULSD(X64Reg regOp, OpArg arg); - void DIVSS(X64Reg regOp, OpArg arg); - void DIVSD(X64Reg regOp, OpArg arg); - void MINSS(X64Reg regOp, OpArg arg); - void MINSD(X64Reg regOp, OpArg arg); - void MAXSS(X64Reg regOp, OpArg arg); - void MAXSD(X64Reg regOp, OpArg arg); - void SQRTSS(X64Reg regOp, OpArg arg); - void SQRTSD(X64Reg regOp, OpArg arg); - void RSQRTSS(X64Reg regOp, OpArg arg); + void ADDSS(X64Reg regOp, const OpArg& arg); + void ADDSD(X64Reg regOp, const OpArg& arg); + void SUBSS(X64Reg regOp, const OpArg& arg); + void SUBSD(X64Reg regOp, const OpArg& arg); + void MULSS(X64Reg regOp, const OpArg& arg); + void MULSD(X64Reg regOp, const OpArg& arg); + void DIVSS(X64Reg regOp, const OpArg& arg); + void DIVSD(X64Reg regOp, const OpArg& arg); + void MINSS(X64Reg regOp, const OpArg& arg); + void MINSD(X64Reg regOp, const OpArg& arg); + void MAXSS(X64Reg regOp, const OpArg& arg); + void MAXSD(X64Reg regOp, const OpArg& arg); + void SQRTSS(X64Reg regOp, const OpArg& arg); + void SQRTSD(X64Reg regOp, const OpArg& arg); + void RCPSS(X64Reg regOp, const OpArg& arg); + void RSQRTSS(X64Reg regOp, const OpArg& arg); // SSE/SSE2: Floating point bitwise (yes) - void CMPSS(X64Reg regOp, OpArg arg, u8 compare); - void CMPSD(X64Reg regOp, OpArg arg, u8 compare); + void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare); + void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare); - inline void CMPEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_EQ); } - inline void CMPLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LT); } - inline void CMPLESS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LE); } - inline void CMPUNORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_UNORD); } - inline void CMPNEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NEQ); } - inline void CMPNLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NLT); } - inline void CMPORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_ORD); } + void CMPEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_EQ); } + void CMPLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LT); } + void CMPLESS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LE); } + void CMPUNORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_UNORD); } + void CMPNEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NEQ); } + void CMPNLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NLT); } + void CMPORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_ORD); } // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double) - void ADDPS(X64Reg regOp, OpArg arg); - void ADDPD(X64Reg regOp, OpArg arg); - void SUBPS(X64Reg regOp, OpArg arg); - void SUBPD(X64Reg regOp, OpArg arg); - void CMPPS(X64Reg regOp, OpArg arg, u8 compare); - void CMPPD(X64Reg regOp, OpArg arg, u8 compare); - void MULPS(X64Reg regOp, OpArg arg); - void MULPD(X64Reg regOp, OpArg arg); - void DIVPS(X64Reg regOp, OpArg arg); - void DIVPD(X64Reg regOp, OpArg arg); - void MINPS(X64Reg regOp, OpArg arg); - void MINPD(X64Reg regOp, OpArg arg); - void MAXPS(X64Reg regOp, OpArg arg); - void MAXPD(X64Reg regOp, OpArg arg); - void SQRTPS(X64Reg regOp, OpArg arg); - void SQRTPD(X64Reg regOp, OpArg arg); - void RCPPS(X64Reg regOp, OpArg arg); - void RSQRTPS(X64Reg regOp, OpArg arg); + void ADDPS(X64Reg regOp, const OpArg& arg); + void ADDPD(X64Reg regOp, const OpArg& arg); + void SUBPS(X64Reg regOp, const OpArg& arg); + void SUBPD(X64Reg regOp, const OpArg& arg); + void CMPPS(X64Reg regOp, const OpArg& arg, u8 compare); + void CMPPD(X64Reg regOp, const OpArg& arg, u8 compare); + void MULPS(X64Reg regOp, const OpArg& arg); + void MULPD(X64Reg regOp, const OpArg& arg); + void DIVPS(X64Reg regOp, const OpArg& arg); + void DIVPD(X64Reg regOp, const OpArg& arg); + void MINPS(X64Reg regOp, const OpArg& arg); + void MINPD(X64Reg regOp, const OpArg& arg); + void MAXPS(X64Reg regOp, const OpArg& arg); + void MAXPD(X64Reg regOp, const OpArg& arg); + void SQRTPS(X64Reg regOp, const OpArg& arg); + void SQRTPD(X64Reg regOp, const OpArg& arg); + void RCPPS(X64Reg regOp, const OpArg& arg); + void RSQRTPS(X64Reg regOp, const OpArg& arg); // SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double) - void ANDPS(X64Reg regOp, OpArg arg); - void ANDPD(X64Reg regOp, OpArg arg); - void ANDNPS(X64Reg regOp, OpArg arg); - void ANDNPD(X64Reg regOp, OpArg arg); - void ORPS(X64Reg regOp, OpArg arg); - void ORPD(X64Reg regOp, OpArg arg); - void XORPS(X64Reg regOp, OpArg arg); - void XORPD(X64Reg regOp, OpArg arg); + void ANDPS(X64Reg regOp, const OpArg& arg); + void ANDPD(X64Reg regOp, const OpArg& arg); + void ANDNPS(X64Reg regOp, const OpArg& arg); + void ANDNPD(X64Reg regOp, const OpArg& arg); + void ORPS(X64Reg regOp, const OpArg& arg); + void ORPD(X64Reg regOp, const OpArg& arg); + void XORPS(X64Reg regOp, const OpArg& arg); + void XORPD(X64Reg regOp, const OpArg& arg); // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation. - void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle); - void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle); + void SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle); + void SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle); // SSE/SSE2: Useful alternative to shuffle in some cases. - void MOVDDUP(X64Reg regOp, OpArg arg); - - // TODO: Actually implement -#if 0 - // SSE3: Horizontal operations in SIMD registers. Could be useful for various VFPU things like dot products... - void ADDSUBPS(X64Reg dest, OpArg src); - void ADDSUBPD(X64Reg dest, OpArg src); - void HADDPD(X64Reg dest, OpArg src); - void HSUBPS(X64Reg dest, OpArg src); - void HSUBPD(X64Reg dest, OpArg src); - - // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask". - void DPPD(X64Reg dest, OpArg src, u8 arg); - - // These are probably useful for VFPU emulation. - void INSERTPS(X64Reg dest, OpArg src, u8 arg); - void EXTRACTPS(OpArg dest, X64Reg src, u8 arg); -#endif + void MOVDDUP(X64Reg regOp, const OpArg& arg); // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy. - void HADDPS(X64Reg dest, OpArg src); + void HADDPS(X64Reg dest, const OpArg& src); // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask". - void DPPS(X64Reg dest, OpArg src, u8 arg); + void DPPS(X64Reg dest, const OpArg& src, u8 arg); - void UNPCKLPS(X64Reg dest, OpArg src); - void UNPCKHPS(X64Reg dest, OpArg src); - void UNPCKLPD(X64Reg dest, OpArg src); - void UNPCKHPD(X64Reg dest, OpArg src); + void UNPCKLPS(X64Reg dest, const OpArg& src); + void UNPCKHPS(X64Reg dest, const OpArg& src); + void UNPCKLPD(X64Reg dest, const OpArg& src); + void UNPCKHPD(X64Reg dest, const OpArg& src); // SSE/SSE2: Compares. - void COMISS(X64Reg regOp, OpArg arg); - void COMISD(X64Reg regOp, OpArg arg); - void UCOMISS(X64Reg regOp, OpArg arg); - void UCOMISD(X64Reg regOp, OpArg arg); + void COMISS(X64Reg regOp, const OpArg& arg); + void COMISD(X64Reg regOp, const OpArg& arg); + void UCOMISS(X64Reg regOp, const OpArg& arg); + void UCOMISD(X64Reg regOp, const OpArg& arg); // SSE/SSE2: Moves. Use the right data type for your data, in most cases. - void MOVAPS(X64Reg regOp, OpArg arg); - void MOVAPD(X64Reg regOp, OpArg arg); - void MOVAPS(OpArg arg, X64Reg regOp); - void MOVAPD(OpArg arg, X64Reg regOp); - - void MOVUPS(X64Reg regOp, OpArg arg); - void MOVUPD(X64Reg regOp, OpArg arg); - void MOVUPS(OpArg arg, X64Reg regOp); - void MOVUPD(OpArg arg, X64Reg regOp); - - void MOVDQA(X64Reg regOp, OpArg arg); - void MOVDQA(OpArg arg, X64Reg regOp); - void MOVDQU(X64Reg regOp, OpArg arg); - void MOVDQU(OpArg arg, X64Reg regOp); - - void MOVSS(X64Reg regOp, OpArg arg); - void MOVSD(X64Reg regOp, OpArg arg); - void MOVSS(OpArg arg, X64Reg regOp); - void MOVSD(OpArg arg, X64Reg regOp); - - void MOVLPS(X64Reg regOp, OpArg arg); - void MOVLPD(X64Reg regOp, OpArg arg); - void MOVLPS(OpArg arg, X64Reg regOp); - void MOVLPD(OpArg arg, X64Reg regOp); - - void MOVHPS(X64Reg regOp, OpArg arg); - void MOVHPD(X64Reg regOp, OpArg arg); - void MOVHPS(OpArg arg, X64Reg regOp); - void MOVHPD(OpArg arg, X64Reg regOp); + void MOVAPS(X64Reg regOp, const OpArg& arg); + void MOVAPD(X64Reg regOp, const OpArg& arg); + void MOVAPS(const OpArg& arg, X64Reg regOp); + void MOVAPD(const OpArg& arg, X64Reg regOp); + + void MOVUPS(X64Reg regOp, const OpArg& arg); + void MOVUPD(X64Reg regOp, const OpArg& arg); + void MOVUPS(const OpArg& arg, X64Reg regOp); + void MOVUPD(const OpArg& arg, X64Reg regOp); + + void MOVDQA(X64Reg regOp, const OpArg& arg); + void MOVDQA(const OpArg& arg, X64Reg regOp); + void MOVDQU(X64Reg regOp, const OpArg& arg); + void MOVDQU(const OpArg& arg, X64Reg regOp); + + void MOVSS(X64Reg regOp, const OpArg& arg); + void MOVSD(X64Reg regOp, const OpArg& arg); + void MOVSS(const OpArg& arg, X64Reg regOp); + void MOVSD(const OpArg& arg, X64Reg regOp); + + void MOVLPS(X64Reg regOp, const OpArg& arg); + void MOVLPD(X64Reg regOp, const OpArg& arg); + void MOVLPS(const OpArg& arg, X64Reg regOp); + void MOVLPD(const OpArg& arg, X64Reg regOp); + + void MOVHPS(X64Reg regOp, const OpArg& arg); + void MOVHPD(X64Reg regOp, const OpArg& arg); + void MOVHPS(const OpArg& arg, X64Reg regOp); + void MOVHPD(const OpArg& arg, X64Reg regOp); void MOVHLPS(X64Reg regOp1, X64Reg regOp2); void MOVLHPS(X64Reg regOp1, X64Reg regOp2); - void MOVD_xmm(X64Reg dest, const OpArg &arg); + void MOVD_xmm(X64Reg dest, const OpArg& arg); void MOVQ_xmm(X64Reg dest, OpArg arg); - void MOVD_xmm(const OpArg &arg, X64Reg src); + void MOVD_xmm(const OpArg& arg, X64Reg src); void MOVQ_xmm(OpArg arg, X64Reg src); // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question. - void MOVMSKPS(X64Reg dest, OpArg arg); - void MOVMSKPD(X64Reg dest, OpArg arg); + void MOVMSKPS(X64Reg dest, const OpArg& arg); + void MOVMSKPD(X64Reg dest, const OpArg& arg); // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one. void MASKMOVDQU(X64Reg dest, X64Reg src); - void LDDQU(X64Reg dest, OpArg src); + void LDDQU(X64Reg dest, const OpArg& src); // SSE/SSE2: Data type conversions. - void CVTPS2PD(X64Reg dest, OpArg src); - void CVTPD2PS(X64Reg dest, OpArg src); - void CVTSS2SD(X64Reg dest, OpArg src); - void CVTSI2SS(X64Reg dest, OpArg src); - void CVTSD2SS(X64Reg dest, OpArg src); - void CVTSI2SD(X64Reg dest, OpArg src); - void CVTDQ2PD(X64Reg regOp, OpArg arg); - void CVTPD2DQ(X64Reg regOp, OpArg arg); - void CVTDQ2PS(X64Reg regOp, OpArg arg); - void CVTPS2DQ(X64Reg regOp, OpArg arg); - - void CVTTPS2DQ(X64Reg regOp, OpArg arg); - void CVTTPD2DQ(X64Reg regOp, OpArg arg); + void CVTPS2PD(X64Reg dest, const OpArg& src); + void CVTPD2PS(X64Reg dest, const OpArg& src); + void CVTSS2SD(X64Reg dest, const OpArg& src); + void CVTSI2SS(X64Reg dest, const OpArg& src); + void CVTSD2SS(X64Reg dest, const OpArg& src); + void CVTSI2SD(X64Reg dest, const OpArg& src); + void CVTDQ2PD(X64Reg regOp, const OpArg& arg); + void CVTPD2DQ(X64Reg regOp, const OpArg& arg); + void CVTDQ2PS(X64Reg regOp, const OpArg& arg); + void CVTPS2DQ(X64Reg regOp, const OpArg& arg); + + void CVTTPS2DQ(X64Reg regOp, const OpArg& arg); + void CVTTPD2DQ(X64Reg regOp, const OpArg& arg); // Destinations are X64 regs (rax, rbx, ...) for these instructions. - void CVTSS2SI(X64Reg xregdest, OpArg src); - void CVTSD2SI(X64Reg xregdest, OpArg src); - void CVTTSS2SI(X64Reg xregdest, OpArg arg); - void CVTTSD2SI(X64Reg xregdest, OpArg arg); + void CVTSS2SI(X64Reg xregdest, const OpArg& src); + void CVTSD2SI(X64Reg xregdest, const OpArg& src); + void CVTTSS2SI(X64Reg xregdest, const OpArg& arg); + void CVTTSD2SI(X64Reg xregdest, const OpArg& arg); // SSE2: Packed integer instructions - void PACKSSDW(X64Reg dest, OpArg arg); - void PACKSSWB(X64Reg dest, OpArg arg); - void PACKUSDW(X64Reg dest, OpArg arg); - void PACKUSWB(X64Reg dest, OpArg arg); + void PACKSSDW(X64Reg dest, const OpArg& arg); + void PACKSSWB(X64Reg dest, const OpArg& arg); + void PACKUSDW(X64Reg dest, const OpArg& arg); + void PACKUSWB(X64Reg dest, const OpArg& arg); void PUNPCKLBW(X64Reg dest, const OpArg &arg); void PUNPCKLWD(X64Reg dest, const OpArg &arg); void PUNPCKLDQ(X64Reg dest, const OpArg &arg); void PUNPCKLQDQ(X64Reg dest, const OpArg &arg); - void PTEST(X64Reg dest, OpArg arg); - void PAND(X64Reg dest, OpArg arg); - void PANDN(X64Reg dest, OpArg arg); - void PXOR(X64Reg dest, OpArg arg); - void POR(X64Reg dest, OpArg arg); - - void PADDB(X64Reg dest, OpArg arg); - void PADDW(X64Reg dest, OpArg arg); - void PADDD(X64Reg dest, OpArg arg); - void PADDQ(X64Reg dest, OpArg arg); - - void PADDSB(X64Reg dest, OpArg arg); - void PADDSW(X64Reg dest, OpArg arg); - void PADDUSB(X64Reg dest, OpArg arg); - void PADDUSW(X64Reg dest, OpArg arg); - - void PSUBB(X64Reg dest, OpArg arg); - void PSUBW(X64Reg dest, OpArg arg); - void PSUBD(X64Reg dest, OpArg arg); - void PSUBQ(X64Reg dest, OpArg arg); - - void PSUBSB(X64Reg dest, OpArg arg); - void PSUBSW(X64Reg dest, OpArg arg); - void PSUBUSB(X64Reg dest, OpArg arg); - void PSUBUSW(X64Reg dest, OpArg arg); - - void PAVGB(X64Reg dest, OpArg arg); - void PAVGW(X64Reg dest, OpArg arg); - - void PCMPEQB(X64Reg dest, OpArg arg); - void PCMPEQW(X64Reg dest, OpArg arg); - void PCMPEQD(X64Reg dest, OpArg arg); - - void PCMPGTB(X64Reg dest, OpArg arg); - void PCMPGTW(X64Reg dest, OpArg arg); - void PCMPGTD(X64Reg dest, OpArg arg); - - void PEXTRW(X64Reg dest, OpArg arg, u8 subreg); - void PINSRW(X64Reg dest, OpArg arg, u8 subreg); - - void PMADDWD(X64Reg dest, OpArg arg); - void PSADBW(X64Reg dest, OpArg arg); - - void PMAXSW(X64Reg dest, OpArg arg); - void PMAXUB(X64Reg dest, OpArg arg); - void PMINSW(X64Reg dest, OpArg arg); - void PMINUB(X64Reg dest, OpArg arg); + void PTEST(X64Reg dest, const OpArg& arg); + void PAND(X64Reg dest, const OpArg& arg); + void PANDN(X64Reg dest, const OpArg& arg); + void PXOR(X64Reg dest, const OpArg& arg); + void POR(X64Reg dest, const OpArg& arg); + + void PADDB(X64Reg dest, const OpArg& arg); + void PADDW(X64Reg dest, const OpArg& arg); + void PADDD(X64Reg dest, const OpArg& arg); + void PADDQ(X64Reg dest, const OpArg& arg); + + void PADDSB(X64Reg dest, const OpArg& arg); + void PADDSW(X64Reg dest, const OpArg& arg); + void PADDUSB(X64Reg dest, const OpArg& arg); + void PADDUSW(X64Reg dest, const OpArg& arg); + + void PSUBB(X64Reg dest, const OpArg& arg); + void PSUBW(X64Reg dest, const OpArg& arg); + void PSUBD(X64Reg dest, const OpArg& arg); + void PSUBQ(X64Reg dest, const OpArg& arg); + + void PSUBSB(X64Reg dest, const OpArg& arg); + void PSUBSW(X64Reg dest, const OpArg& arg); + void PSUBUSB(X64Reg dest, const OpArg& arg); + void PSUBUSW(X64Reg dest, const OpArg& arg); + + void PAVGB(X64Reg dest, const OpArg& arg); + void PAVGW(X64Reg dest, const OpArg& arg); + + void PCMPEQB(X64Reg dest, const OpArg& arg); + void PCMPEQW(X64Reg dest, const OpArg& arg); + void PCMPEQD(X64Reg dest, const OpArg& arg); + + void PCMPGTB(X64Reg dest, const OpArg& arg); + void PCMPGTW(X64Reg dest, const OpArg& arg); + void PCMPGTD(X64Reg dest, const OpArg& arg); + + void PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg); + void PINSRW(X64Reg dest, const OpArg& arg, u8 subreg); + + void PMADDWD(X64Reg dest, const OpArg& arg); + void PSADBW(X64Reg dest, const OpArg& arg); + + void PMAXSW(X64Reg dest, const OpArg& arg); + void PMAXUB(X64Reg dest, const OpArg& arg); + void PMINSW(X64Reg dest, const OpArg& arg); + void PMINUB(X64Reg dest, const OpArg& arg); // SSE4: More MAX/MIN instructions. - void PMINSB(X64Reg dest, OpArg arg); - void PMINSD(X64Reg dest, OpArg arg); - void PMINUW(X64Reg dest, OpArg arg); - void PMINUD(X64Reg dest, OpArg arg); - void PMAXSB(X64Reg dest, OpArg arg); - void PMAXSD(X64Reg dest, OpArg arg); - void PMAXUW(X64Reg dest, OpArg arg); - void PMAXUD(X64Reg dest, OpArg arg); - - void PMOVMSKB(X64Reg dest, OpArg arg); - void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle); - void PSHUFB(X64Reg dest, OpArg arg); - - void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle); - void PSHUFHW(X64Reg dest, OpArg arg, u8 shuffle); + void PMINSB(X64Reg dest, const OpArg& arg); + void PMINSD(X64Reg dest, const OpArg& arg); + void PMINUW(X64Reg dest, const OpArg& arg); + void PMINUD(X64Reg dest, const OpArg& arg); + void PMAXSB(X64Reg dest, const OpArg& arg); + void PMAXSD(X64Reg dest, const OpArg& arg); + void PMAXUW(X64Reg dest, const OpArg& arg); + void PMAXUD(X64Reg dest, const OpArg& arg); + + void PMOVMSKB(X64Reg dest, const OpArg& arg); + void PSHUFD(X64Reg dest, const OpArg& arg, u8 shuffle); + void PSHUFB(X64Reg dest, const OpArg& arg); + + void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle); + void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle); void PSRLW(X64Reg reg, int shift); void PSRLD(X64Reg reg, int shift); void PSRLQ(X64Reg reg, int shift); - void PSRLQ(X64Reg reg, OpArg arg); + void PSRLQ(X64Reg reg, const OpArg& arg); void PSRLDQ(X64Reg reg, int shift); void PSLLW(X64Reg reg, int shift); @@ -833,198 +813,198 @@ public: void PSRAD(X64Reg reg, int shift); // SSE4: data type conversions - void PMOVSXBW(X64Reg dest, OpArg arg); - void PMOVSXBD(X64Reg dest, OpArg arg); - void PMOVSXBQ(X64Reg dest, OpArg arg); - void PMOVSXWD(X64Reg dest, OpArg arg); - void PMOVSXWQ(X64Reg dest, OpArg arg); - void PMOVSXDQ(X64Reg dest, OpArg arg); - void PMOVZXBW(X64Reg dest, OpArg arg); - void PMOVZXBD(X64Reg dest, OpArg arg); - void PMOVZXBQ(X64Reg dest, OpArg arg); - void PMOVZXWD(X64Reg dest, OpArg arg); - void PMOVZXWQ(X64Reg dest, OpArg arg); - void PMOVZXDQ(X64Reg dest, OpArg arg); + void PMOVSXBW(X64Reg dest, const OpArg& arg); + void PMOVSXBD(X64Reg dest, const OpArg& arg); + void PMOVSXBQ(X64Reg dest, const OpArg& arg); + void PMOVSXWD(X64Reg dest, const OpArg& arg); + void PMOVSXWQ(X64Reg dest, const OpArg& arg); + void PMOVSXDQ(X64Reg dest, const OpArg& arg); + void PMOVZXBW(X64Reg dest, const OpArg& arg); + void PMOVZXBD(X64Reg dest, const OpArg& arg); + void PMOVZXBQ(X64Reg dest, const OpArg& arg); + void PMOVZXWD(X64Reg dest, const OpArg& arg); + void PMOVZXWQ(X64Reg dest, const OpArg& arg); + void PMOVZXDQ(X64Reg dest, const OpArg& arg); // SSE4: variable blend instructions (xmm0 implicit argument) - void PBLENDVB(X64Reg dest, OpArg arg); - void BLENDVPS(X64Reg dest, OpArg arg); - void BLENDVPD(X64Reg dest, OpArg arg); + void PBLENDVB(X64Reg dest, const OpArg& arg); + void BLENDVPS(X64Reg dest, const OpArg& arg); + void BLENDVPD(X64Reg dest, const OpArg& arg); void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend); void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend); // SSE4: rounding (see FloatRound for mode or use ROUNDNEARSS, etc. helpers.) - void ROUNDSS(X64Reg dest, OpArg arg, u8 mode); - void ROUNDSD(X64Reg dest, OpArg arg, u8 mode); - void ROUNDPS(X64Reg dest, OpArg arg, u8 mode); - void ROUNDPD(X64Reg dest, OpArg arg, u8 mode); - - inline void ROUNDNEARSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_NEAREST); } - inline void ROUNDFLOORSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_FLOOR); } - inline void ROUNDCEILSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_CEIL); } - inline void ROUNDZEROSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_ZERO); } - - inline void ROUNDNEARSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_NEAREST); } - inline void ROUNDFLOORSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_FLOOR); } - inline void ROUNDCEILSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_CEIL); } - inline void ROUNDZEROSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_ZERO); } - - inline void ROUNDNEARPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_NEAREST); } - inline void ROUNDFLOORPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_FLOOR); } - inline void ROUNDCEILPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_CEIL); } - inline void ROUNDZEROPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_ZERO); } - - inline void ROUNDNEARPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_NEAREST); } - inline void ROUNDFLOORPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_FLOOR); } - inline void ROUNDCEILPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_CEIL); } - inline void ROUNDZEROPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_ZERO); } + void ROUNDSS(X64Reg dest, const OpArg& arg, u8 mode); + void ROUNDSD(X64Reg dest, const OpArg& arg, u8 mode); + void ROUNDPS(X64Reg dest, const OpArg& arg, u8 mode); + void ROUNDPD(X64Reg dest, const OpArg& arg, u8 mode); + + void ROUNDNEARSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_NEAREST); } + void ROUNDFLOORSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_FLOOR); } + void ROUNDCEILSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_CEIL); } + void ROUNDZEROSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_ZERO); } + + void ROUNDNEARSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_NEAREST); } + void ROUNDFLOORSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_FLOOR); } + void ROUNDCEILSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_CEIL); } + void ROUNDZEROSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_ZERO); } + + void ROUNDNEARPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_NEAREST); } + void ROUNDFLOORPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_FLOOR); } + void ROUNDCEILPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_CEIL); } + void ROUNDZEROPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_ZERO); } + + void ROUNDNEARPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_NEAREST); } + void ROUNDFLOORPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_FLOOR); } + void ROUNDCEILPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_CEIL); } + void ROUNDZEROPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_ZERO); } // AVX - void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle); - void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - - void VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - - void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle); + void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + void VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + void VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); // FMA3 - void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); - void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); // VEX GPR instructions - void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); - void SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); - void SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); - void RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate); - void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); - void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); - void MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); - void BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); - void BLSR(int bits, X64Reg regOp, OpArg arg); - void BLSMSK(int bits, X64Reg regOp, OpArg arg); - void BLSI(int bits, X64Reg regOp, OpArg arg); - void BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); - void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); + void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate); + void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void BLSR(int bits, X64Reg regOp, const OpArg& arg); + void BLSMSK(int bits, X64Reg regOp, const OpArg& arg); + void BLSI(int bits, X64Reg regOp, const OpArg& arg); + void BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); void RDTSC(); // Utility functions // The difference between this and CALL is that this aligns the stack // where appropriate. - void ABI_CallFunction(const void *func); + void ABI_CallFunction(const void* func); template <typename T> void ABI_CallFunction(T (*func)()) { - ABI_CallFunction((const void *)func); + ABI_CallFunction((const void*)func); } - void ABI_CallFunction(const u8 *func) { - ABI_CallFunction((const void *)func); + void ABI_CallFunction(const u8* func) { + ABI_CallFunction((const void*)func); } - void ABI_CallFunctionC16(const void *func, u16 param1); - void ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2); + void ABI_CallFunctionC16(const void* func, u16 param1); + void ABI_CallFunctionCC16(const void* func, u32 param1, u16 param2); // These only support u32 parameters, but that's enough for a lot of uses. // These will destroy the 1 or 2 first "parameter regs". - void ABI_CallFunctionC(const void *func, u32 param1); - void ABI_CallFunctionCC(const void *func, u32 param1, u32 param2); - void ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3); - void ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3); - void ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4); - void ABI_CallFunctionP(const void *func, void *param1); - void ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2); - void ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3); - void ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3); - void ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2); - void ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3); - void ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1); - void ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2); + void ABI_CallFunctionC(const void* func, u32 param1); + void ABI_CallFunctionCC(const void* func, u32 param1, u32 param2); + void ABI_CallFunctionCCC(const void* func, u32 param1, u32 param2, u32 param3); + void ABI_CallFunctionCCP(const void* func, u32 param1, u32 param2, void* param3); + void ABI_CallFunctionCCCP(const void* func, u32 param1, u32 param2, u32 param3, void* param4); + void ABI_CallFunctionP(const void* func, void* param1); + void ABI_CallFunctionPA(const void* func, void* param1, const OpArg& arg2); + void ABI_CallFunctionPAA(const void* func, void* param1, const OpArg& arg2, const OpArg& arg3); + void ABI_CallFunctionPPC(const void* func, void* param1, void* param2, u32 param3); + void ABI_CallFunctionAC(const void* func, const OpArg& arg1, u32 param2); + void ABI_CallFunctionACC(const void* func, const OpArg& arg1, u32 param2, u32 param3); + void ABI_CallFunctionA(const void* func, const OpArg& arg1); + void ABI_CallFunctionAA(const void* func, const OpArg& arg1, const OpArg& arg2); // Pass a register as a parameter. - void ABI_CallFunctionR(const void *func, X64Reg reg1); - void ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2); + void ABI_CallFunctionR(const void* func, X64Reg reg1); + void ABI_CallFunctionRR(const void* func, X64Reg reg1, X64Reg reg2); template <typename Tr, typename T1> void ABI_CallFunctionC(Tr (*func)(T1), u32 param1) { - ABI_CallFunctionC((const void *)func, param1); + ABI_CallFunctionC((const void*)func, param1); } // A function that doesn't have any control over what it will do to regs, @@ -1048,9 +1028,9 @@ public: void ABI_EmitEpilogue(int maxCallParams); #ifdef _M_IX86 - inline int ABI_GetNumXMMRegs() { return 8; } + static int ABI_GetNumXMMRegs() { return 8; } #else - inline int ABI_GetNumXMMRegs() { return 16; } + static int ABI_GetNumXMMRegs() { return 16; } #endif }; // class XEmitter diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 6cc60fd58..c17290b9b 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -29,6 +29,7 @@ set(SRCS hle/kernel/address_arbiter.cpp hle/kernel/event.cpp hle/kernel/kernel.cpp + hle/kernel/memory.cpp hle/kernel/mutex.cpp hle/kernel/process.cpp hle/kernel/resource_limit.cpp @@ -115,7 +116,6 @@ set(SRCS loader/loader.cpp loader/ncch.cpp tracer/recorder.cpp - mem_map.cpp memory.cpp settings.cpp system.cpp @@ -157,6 +157,7 @@ set(HEADERS hle/kernel/address_arbiter.h hle/kernel/event.h hle/kernel/kernel.h + hle/kernel/memory.h hle/kernel/mutex.h hle/kernel/process.h hle/kernel/resource_limit.h @@ -245,7 +246,6 @@ set(HEADERS loader/ncch.h tracer/recorder.h tracer/citrace.h - mem_map.h memory.h memory_setup.h settings.h diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp index 422e80b50..0fddb07a0 100644 --- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp +++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp @@ -9,6 +9,7 @@ #include "common/common_types.h" #include "common/logging/log.h" +#include "common/microprofile.h" #include "common/profiler.h" #include "core/memory.h" @@ -48,65 +49,47 @@ enum { typedef unsigned int (*shtop_fp_t)(ARMul_State* cpu, unsigned int sht_oper); -static int CondPassed(ARMul_State* cpu, unsigned int cond) { - const u32 NFLAG = cpu->NFlag; - const u32 ZFLAG = cpu->ZFlag; - const u32 CFLAG = cpu->CFlag; - const u32 VFLAG = cpu->VFlag; - - int temp = 0; +static bool CondPassed(ARMul_State* cpu, unsigned int cond) { + const bool n_flag = cpu->NFlag != 0; + const bool z_flag = cpu->ZFlag != 0; + const bool c_flag = cpu->CFlag != 0; + const bool v_flag = cpu->VFlag != 0; switch (cond) { - case 0x0: - temp = ZFLAG; - break; - case 0x1: // NE - temp = !ZFLAG; - break; - case 0x2: // CS - temp = CFLAG; - break; - case 0x3: // CC - temp = !CFLAG; - break; - case 0x4: // MI - temp = NFLAG; - break; - case 0x5: // PL - temp = !NFLAG; - break; - case 0x6: // VS - temp = VFLAG; - break; - case 0x7: // VC - temp = !VFLAG; - break; - case 0x8: // HI - temp = (CFLAG && !ZFLAG); - break; - case 0x9: // LS - temp = (!CFLAG || ZFLAG); - break; - case 0xa: // GE - temp = ((!NFLAG && !VFLAG) || (NFLAG && VFLAG)); - break; - case 0xb: // LT - temp = ((NFLAG && !VFLAG) || (!NFLAG && VFLAG)); - break; - case 0xc: // GT - temp = ((!NFLAG && !VFLAG && !ZFLAG) || (NFLAG && VFLAG && !ZFLAG)); - break; - case 0xd: // LE - temp = ((NFLAG && !VFLAG) || (!NFLAG && VFLAG)) || ZFLAG; - break; - case 0xe: // AL - temp = 1; - break; - case 0xf: - temp = 1; - break; - } - return temp; + case ConditionCode::EQ: + return z_flag; + case ConditionCode::NE: + return !z_flag; + case ConditionCode::CS: + return c_flag; + case ConditionCode::CC: + return !c_flag; + case ConditionCode::MI: + return n_flag; + case ConditionCode::PL: + return !n_flag; + case ConditionCode::VS: + return v_flag; + case ConditionCode::VC: + return !v_flag; + case ConditionCode::HI: + return (c_flag && !z_flag); + case ConditionCode::LS: + return (!c_flag || z_flag); + case ConditionCode::GE: + return (n_flag == v_flag); + case ConditionCode::LT: + return (n_flag != v_flag); + case ConditionCode::GT: + return (!z_flag && (n_flag == v_flag)); + case ConditionCode::LE: + return (z_flag || (n_flag != v_flag)); + case ConditionCode::AL: + case ConditionCode::NV: // Unconditional + return true; + } + + return false; } static unsigned int DPO(Immediate)(ARMul_State* cpu, unsigned int sht_oper) { @@ -3522,8 +3505,11 @@ enum { FETCH_EXCEPTION }; +MICROPROFILE_DEFINE(DynCom_Decode, "DynCom", "Decode", MP_RGB(255, 64, 64)); + static int InterpreterTranslate(ARMul_State* cpu, int& bb_start, u32 addr) { Common::Profiling::ScopeTimer timer_decode(profile_decode); + MICROPROFILE_SCOPE(DynCom_Decode); // Decode instruction, get index // Allocate memory and init InsCream @@ -3588,8 +3574,11 @@ static int clz(unsigned int x) { return n; } +MICROPROFILE_DEFINE(DynCom_Execute, "DynCom", "Execute", MP_RGB(255, 0, 0)); + unsigned InterpreterMainLoop(ARMul_State* cpu) { Common::Profiling::ScopeTimer timer_execute(profile_execute); + MICROPROFILE_SCOPE(DynCom_Execute); #undef RM #undef RS diff --git a/src/core/arm/skyeye_common/armstate.cpp b/src/core/arm/skyeye_common/armstate.cpp index ccb2eb0eb..0491717dc 100644 --- a/src/core/arm/skyeye_common/armstate.cpp +++ b/src/core/arm/skyeye_common/armstate.cpp @@ -4,7 +4,6 @@ #include "common/swap.h" #include "common/logging/log.h" -#include "core/mem_map.h" #include "core/memory.h" #include "core/arm/skyeye_common/armstate.h" #include "core/arm/skyeye_common/vfp/vfp.h" diff --git a/src/core/arm/skyeye_common/armsupp.cpp b/src/core/arm/skyeye_common/armsupp.cpp index d31fb9449..883713e86 100644 --- a/src/core/arm/skyeye_common/armsupp.cpp +++ b/src/core/arm/skyeye_common/armsupp.cpp @@ -17,7 +17,6 @@ #include "common/logging/log.h" -#include "core/mem_map.h" #include "core/arm/skyeye_common/arm_regformat.h" #include "core/arm/skyeye_common/armstate.h" #include "core/arm/skyeye_common/armsupp.h" diff --git a/src/core/hle/config_mem.cpp b/src/core/hle/config_mem.cpp index aea936d2d..b1a72dc0c 100644 --- a/src/core/hle/config_mem.cpp +++ b/src/core/hle/config_mem.cpp @@ -25,10 +25,6 @@ void Init() { config_mem.sys_core_ver = 0x2; config_mem.unit_info = 0x1; // Bit 0 set for Retail config_mem.prev_firm = 0; - config_mem.app_mem_type = 0x2; // Default app mem type is 0 - config_mem.app_mem_alloc = 0x06000000; // Set to 96MB, since some games use more than the default (64MB) - config_mem.base_mem_alloc = 0x01400000; // Default base memory is 20MB - config_mem.sys_mem_alloc = Memory::FCRAM_SIZE - (config_mem.app_mem_alloc + config_mem.base_mem_alloc); config_mem.firm_unk = 0; config_mem.firm_version_rev = 0; config_mem.firm_version_min = 0x40; @@ -36,7 +32,4 @@ void Init() { config_mem.firm_sys_core_ver = 0x2; } -void Shutdown() { -} - } // namespace diff --git a/src/core/hle/config_mem.h b/src/core/hle/config_mem.h index 9825a09e8..24a1254f2 100644 --- a/src/core/hle/config_mem.h +++ b/src/core/hle/config_mem.h @@ -52,6 +52,5 @@ static_assert(sizeof(ConfigMemDef) == Memory::CONFIG_MEMORY_SIZE, "Config Memory extern ConfigMemDef config_mem; void Init(); -void Shutdown(); } // namespace diff --git a/src/core/hle/function_wrappers.h b/src/core/hle/function_wrappers.h index 1a0518926..5846a161b 100644 --- a/src/core/hle/function_wrappers.h +++ b/src/core/hle/function_wrappers.h @@ -172,6 +172,14 @@ template<ResultCode func(u32, s64, s64)> void Wrap() { FuncReturn(func(PARAM(0), param1, param2).raw); } +template<ResultCode func(s64*, Handle, u32)> void Wrap() { + s64 param_1 = 0; + u32 retval = func(¶m_1, PARAM(1), PARAM(2)).raw; + Core::g_app_core->SetReg(1, (u32)param_1); + Core::g_app_core->SetReg(2, (u32)(param_1 >> 32)); + FuncReturn(retval); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// // Function wrappers that return type u32 diff --git a/src/core/hle/hle.cpp b/src/core/hle/hle.cpp index cd0a400dc..331b1b22a 100644 --- a/src/core/hle/hle.cpp +++ b/src/core/hle/hle.cpp @@ -34,8 +34,6 @@ void Reschedule(const char *reason) { void Init() { Service::Init(); - ConfigMem::Init(); - SharedPage::Init(); g_reschedule = false; @@ -43,8 +41,6 @@ void Init() { } void Shutdown() { - ConfigMem::Shutdown(); - SharedPage::Shutdown(); Service::Shutdown(); LOG_DEBUG(Kernel, "shutdown OK"); diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp index 5711c0405..7a401a965 100644 --- a/src/core/hle/kernel/kernel.cpp +++ b/src/core/hle/kernel/kernel.cpp @@ -7,11 +7,14 @@ #include "common/assert.h" #include "common/logging/log.h" +#include "core/hle/config_mem.h" #include "core/hle/kernel/kernel.h" -#include "core/hle/kernel/resource_limit.h" +#include "core/hle/kernel/memory.h" #include "core/hle/kernel/process.h" +#include "core/hle/kernel/resource_limit.h" #include "core/hle/kernel/thread.h" #include "core/hle/kernel/timer.h" +#include "core/hle/shared_page.h" namespace Kernel { @@ -119,6 +122,13 @@ void HandleTable::Clear() { /// Initialize the kernel void Init() { + ConfigMem::Init(); + SharedPage::Init(); + + // TODO(yuriks): The memory type parameter needs to be determined by the ExHeader field instead + // For now it defaults to the one with a largest allocation to the app + Kernel::MemoryInit(2); // Allocates 96MB to the application + Kernel::ResourceLimitsInit(); Kernel::ThreadingInit(); Kernel::TimersInit(); @@ -131,11 +141,14 @@ void Init() { /// Shutdown the kernel void Shutdown() { + g_handle_table.Clear(); // Free all kernel objects + Kernel::ThreadingShutdown(); + g_current_process = nullptr; + Kernel::TimersShutdown(); Kernel::ResourceLimitsShutdown(); - g_handle_table.Clear(); // Free all kernel objects - g_current_process = nullptr; + Kernel::MemoryShutdown(); } } // namespace diff --git a/src/core/hle/kernel/memory.cpp b/src/core/hle/kernel/memory.cpp new file mode 100644 index 000000000..e4fc5f3c4 --- /dev/null +++ b/src/core/hle/kernel/memory.cpp @@ -0,0 +1,136 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <map> +#include <memory> +#include <utility> +#include <vector> + +#include "common/common_types.h" +#include "common/logging/log.h" + +#include "core/hle/config_mem.h" +#include "core/hle/kernel/memory.h" +#include "core/hle/kernel/vm_manager.h" +#include "core/hle/result.h" +#include "core/hle/shared_page.h" +#include "core/memory.h" +#include "core/memory_setup.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace Kernel { + +static MemoryRegionInfo memory_regions[3]; + +/// Size of the APPLICATION, SYSTEM and BASE memory regions (respectively) for each sytem +/// memory configuration type. +static const u32 memory_region_sizes[8][3] = { + // Old 3DS layouts + {0x04000000, 0x02C00000, 0x01400000}, // 0 + { /* This appears to be unused. */ }, // 1 + {0x06000000, 0x00C00000, 0x01400000}, // 2 + {0x05000000, 0x01C00000, 0x01400000}, // 3 + {0x04800000, 0x02400000, 0x01400000}, // 4 + {0x02000000, 0x04C00000, 0x01400000}, // 5 + + // New 3DS layouts + {0x07C00000, 0x06400000, 0x02000000}, // 6 + {0x0B200000, 0x02E00000, 0x02000000}, // 7 +}; + +void MemoryInit(u32 mem_type) { + // TODO(yuriks): On the n3DS, all o3DS configurations (<=5) are forced to 6 instead. + ASSERT_MSG(mem_type <= 5, "New 3DS memory configuration aren't supported yet!"); + ASSERT(mem_type != 1); + + // The kernel allocation regions (APPLICATION, SYSTEM and BASE) are laid out in sequence, with + // the sizes specified in the memory_region_sizes table. + VAddr base = 0; + for (int i = 0; i < 3; ++i) { + memory_regions[i].base = base; + memory_regions[i].size = memory_region_sizes[mem_type][i]; + memory_regions[i].linear_heap_memory = std::make_shared<std::vector<u8>>(); + + base += memory_regions[i].size; + } + + // We must've allocated the entire FCRAM by the end + ASSERT(base == Memory::FCRAM_SIZE); + + using ConfigMem::config_mem; + config_mem.app_mem_type = mem_type; + // app_mem_malloc does not always match the configured size for memory_region[0]: in case the + // n3DS type override is in effect it reports the size the game expects, not the real one. + config_mem.app_mem_alloc = memory_region_sizes[mem_type][0]; + config_mem.sys_mem_alloc = memory_regions[1].size; + config_mem.base_mem_alloc = memory_regions[2].size; +} + +void MemoryShutdown() { + for (auto& region : memory_regions) { + region.base = 0; + region.size = 0; + region.linear_heap_memory = nullptr; + } +} + +MemoryRegionInfo* GetMemoryRegion(MemoryRegion region) { + switch (region) { + case MemoryRegion::APPLICATION: + return &memory_regions[0]; + case MemoryRegion::SYSTEM: + return &memory_regions[1]; + case MemoryRegion::BASE: + return &memory_regions[2]; + default: + UNREACHABLE(); + } +} + +} + +namespace Memory { + +namespace { + +struct MemoryArea { + u32 base; + u32 size; + const char* name; +}; + +// We don't declare the IO regions in here since its handled by other means. +static MemoryArea memory_areas[] = { + {SHARED_MEMORY_VADDR, SHARED_MEMORY_SIZE, "Shared Memory"}, // Shared memory + {VRAM_VADDR, VRAM_SIZE, "VRAM"}, // Video memory (VRAM) + {DSP_RAM_VADDR, DSP_RAM_SIZE, "DSP RAM"}, // DSP memory + {TLS_AREA_VADDR, TLS_AREA_SIZE, "TLS Area"}, // TLS memory +}; + +} + +void Init() { + InitMemoryMap(); + LOG_DEBUG(HW_Memory, "initialized OK"); +} + +void InitLegacyAddressSpace(Kernel::VMManager& address_space) { + using namespace Kernel; + + for (MemoryArea& area : memory_areas) { + auto block = std::make_shared<std::vector<u8>>(area.size); + address_space.MapMemoryBlock(area.base, std::move(block), 0, area.size, MemoryState::Private).Unwrap(); + } + + auto cfg_mem_vma = address_space.MapBackingMemory(CONFIG_MEMORY_VADDR, + (u8*)&ConfigMem::config_mem, CONFIG_MEMORY_SIZE, MemoryState::Shared).MoveFrom(); + address_space.Reprotect(cfg_mem_vma, VMAPermission::Read); + + auto shared_page_vma = address_space.MapBackingMemory(SHARED_PAGE_VADDR, + (u8*)&SharedPage::shared_page, SHARED_PAGE_SIZE, MemoryState::Shared).MoveFrom(); + address_space.Reprotect(shared_page_vma, VMAPermission::Read); +} + +} // namespace diff --git a/src/core/hle/kernel/memory.h b/src/core/hle/kernel/memory.h new file mode 100644 index 000000000..36690b091 --- /dev/null +++ b/src/core/hle/kernel/memory.h @@ -0,0 +1,35 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> + +#include "common/common_types.h" + +#include "core/hle/kernel/process.h" + +namespace Kernel { + +class VMManager; + +struct MemoryRegionInfo { + u32 base; // Not an address, but offset from start of FCRAM + u32 size; + + std::shared_ptr<std::vector<u8>> linear_heap_memory; +}; + +void MemoryInit(u32 mem_type); +void MemoryShutdown(); +MemoryRegionInfo* GetMemoryRegion(MemoryRegion region); + +} + +namespace Memory { + +void Init(); +void InitLegacyAddressSpace(Kernel::VMManager& address_space); + +} // namespace diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index a7892c652..c2b4963d4 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -7,11 +7,11 @@ #include "common/logging/log.h" #include "common/make_unique.h" +#include "core/hle/kernel/memory.h" #include "core/hle/kernel/process.h" #include "core/hle/kernel/resource_limit.h" #include "core/hle/kernel/thread.h" #include "core/hle/kernel/vm_manager.h" -#include "core/mem_map.h" #include "core/memory.h" namespace Kernel { @@ -36,8 +36,7 @@ SharedPtr<Process> Process::Create(SharedPtr<CodeSet> code_set) { process->codeset = std::move(code_set); process->flags.raw = 0; process->flags.memory_region = MemoryRegion::APPLICATION; - process->address_space = Common::make_unique<VMManager>(); - Memory::InitLegacyAddressSpace(*process->address_space); + Memory::InitLegacyAddressSpace(process->vm_manager); return process; } @@ -93,9 +92,11 @@ void Process::ParseKernelCaps(const u32* kernel_caps, size_t len) { mapping.unk_flag = false; } else if ((type & 0xFE0) == 0xFC0) { // 0x01FF // Kernel version - int minor = descriptor & 0xFF; - int major = (descriptor >> 8) & 0xFF; - LOG_INFO(Loader, "ExHeader kernel version ignored: %d.%d", major, minor); + kernel_version = descriptor & 0xFFFF; + + int minor = kernel_version & 0xFF; + int major = (kernel_version >> 8) & 0xFF; + LOG_INFO(Loader, "ExHeader kernel version: %d.%d", major, minor); } else { LOG_ERROR(Loader, "Unhandled kernel caps descriptor: 0x%08X", descriptor); } @@ -103,20 +104,161 @@ void Process::ParseKernelCaps(const u32* kernel_caps, size_t len) { } void Process::Run(s32 main_thread_priority, u32 stack_size) { + memory_region = GetMemoryRegion(flags.memory_region); + auto MapSegment = [&](CodeSet::Segment& segment, VMAPermission permissions, MemoryState memory_state) { - auto vma = address_space->MapMemoryBlock(segment.addr, codeset->memory, + auto vma = vm_manager.MapMemoryBlock(segment.addr, codeset->memory, segment.offset, segment.size, memory_state).Unwrap(); - address_space->Reprotect(vma, permissions); + vm_manager.Reprotect(vma, permissions); + misc_memory_used += segment.size; }; + // Map CodeSet segments MapSegment(codeset->code, VMAPermission::ReadExecute, MemoryState::Code); MapSegment(codeset->rodata, VMAPermission::Read, MemoryState::Code); MapSegment(codeset->data, VMAPermission::ReadWrite, MemoryState::Private); - address_space->LogLayout(); + // Allocate and map stack + vm_manager.MapMemoryBlock(Memory::HEAP_VADDR_END - stack_size, + std::make_shared<std::vector<u8>>(stack_size, 0), 0, stack_size, MemoryState::Locked + ).Unwrap(); + misc_memory_used += stack_size; + + vm_manager.LogLayout(Log::Level::Debug); Kernel::SetupMainThread(codeset->entrypoint, main_thread_priority); } +VAddr Process::GetLinearHeapBase() const { + return (kernel_version < 0x22C ? Memory::LINEAR_HEAP_VADDR : Memory::NEW_LINEAR_HEAP_VADDR) + + memory_region->base; +} + +VAddr Process::GetLinearHeapLimit() const { + return GetLinearHeapBase() + memory_region->size; +} + +ResultVal<VAddr> Process::HeapAllocate(VAddr target, u32 size, VMAPermission perms) { + if (target < Memory::HEAP_VADDR || target + size > Memory::HEAP_VADDR_END || target + size < target) { + return ERR_INVALID_ADDRESS; + } + + if (heap_memory == nullptr) { + // Initialize heap + heap_memory = std::make_shared<std::vector<u8>>(); + heap_start = heap_end = target; + } + + // If necessary, expand backing vector to cover new heap extents. + if (target < heap_start) { + heap_memory->insert(begin(*heap_memory), heap_start - target, 0); + heap_start = target; + vm_manager.RefreshMemoryBlockMappings(heap_memory.get()); + } + if (target + size > heap_end) { + heap_memory->insert(end(*heap_memory), (target + size) - heap_end, 0); + heap_end = target + size; + vm_manager.RefreshMemoryBlockMappings(heap_memory.get()); + } + ASSERT(heap_end - heap_start == heap_memory->size()); + + CASCADE_RESULT(auto vma, vm_manager.MapMemoryBlock(target, heap_memory, target - heap_start, size, MemoryState::Private)); + vm_manager.Reprotect(vma, perms); + + heap_used += size; + + return MakeResult<VAddr>(heap_end - size); +} + +ResultCode Process::HeapFree(VAddr target, u32 size) { + if (target < Memory::HEAP_VADDR || target + size > Memory::HEAP_VADDR_END || target + size < target) { + return ERR_INVALID_ADDRESS; + } + + if (size == 0) { + return RESULT_SUCCESS; + } + + ResultCode result = vm_manager.UnmapRange(target, size); + if (result.IsError()) return result; + + heap_used -= size; + + return RESULT_SUCCESS; +} + +ResultVal<VAddr> Process::LinearAllocate(VAddr target, u32 size, VMAPermission perms) { + auto& linheap_memory = memory_region->linear_heap_memory; + + VAddr heap_end = GetLinearHeapBase() + (u32)linheap_memory->size(); + // Games and homebrew only ever seem to pass 0 here (which lets the kernel decide the address), + // but explicit addresses are also accepted and respected. + if (target == 0) { + target = heap_end; + } + + if (target < GetLinearHeapBase() || target + size > GetLinearHeapLimit() || + target > heap_end || target + size < target) { + + return ERR_INVALID_ADDRESS; + } + + // Expansion of the linear heap is only allowed if you do an allocation immediatelly at its + // end. It's possible to free gaps in the middle of the heap and then reallocate them later, + // but expansions are only allowed at the end. + if (target == heap_end) { + linheap_memory->insert(linheap_memory->end(), size, 0); + vm_manager.RefreshMemoryBlockMappings(linheap_memory.get()); + } + + // TODO(yuriks): As is, this lets processes map memory allocated by other processes from the + // same region. It is unknown if or how the 3DS kernel checks against this. + size_t offset = target - GetLinearHeapBase(); + CASCADE_RESULT(auto vma, vm_manager.MapMemoryBlock(target, linheap_memory, offset, size, MemoryState::Continuous)); + vm_manager.Reprotect(vma, perms); + + linear_heap_used += size; + + return MakeResult<VAddr>(target); +} + +ResultCode Process::LinearFree(VAddr target, u32 size) { + auto& linheap_memory = memory_region->linear_heap_memory; + + if (target < GetLinearHeapBase() || target + size > GetLinearHeapLimit() || + target + size < target) { + + return ERR_INVALID_ADDRESS; + } + + if (size == 0) { + return RESULT_SUCCESS; + } + + VAddr heap_end = GetLinearHeapBase() + (u32)linheap_memory->size(); + if (target + size > heap_end) { + return ERR_INVALID_ADDRESS_STATE; + } + + ResultCode result = vm_manager.UnmapRange(target, size); + if (result.IsError()) return result; + + linear_heap_used -= size; + + if (target + size == heap_end) { + // End of linear heap has been freed, so check what's the last allocated block in it and + // reduce the size. + auto vma = vm_manager.FindVMA(target); + ASSERT(vma != vm_manager.vma_map.end()); + ASSERT(vma->second.type == VMAType::Free); + VAddr new_end = vma->second.base; + if (new_end >= GetLinearHeapBase()) { + linheap_memory->resize(new_end - GetLinearHeapBase()); + } + } + + return RESULT_SUCCESS; +} + Kernel::Process::Process() {} Kernel::Process::~Process() {} diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h index 83d3aceae..60e17f251 100644 --- a/src/core/hle/kernel/process.h +++ b/src/core/hle/kernel/process.h @@ -15,6 +15,7 @@ #include "common/common_types.h" #include "core/hle/kernel/kernel.h" +#include "core/hle/kernel/vm_manager.h" namespace Kernel { @@ -48,7 +49,7 @@ union ProcessFlags { }; class ResourceLimit; -class VMManager; +struct MemoryRegionInfo; struct CodeSet final : public Object { static SharedPtr<CodeSet> Create(std::string name, u64 program_id); @@ -104,14 +105,12 @@ public: /// processes access to specific I/O regions and device memory. boost::container::static_vector<AddressMapping, 8> address_mappings; ProcessFlags flags; + /// Kernel compatibility version for this process + u16 kernel_version = 0; /// The id of this process u32 process_id = next_process_id++; - /// Bitmask of the used TLS slots - std::bitset<300> used_tls_slots; - std::unique_ptr<VMManager> address_space; - /** * Parses a list of kernel capability descriptors (as found in the ExHeader) and applies them * to this process. @@ -123,6 +122,36 @@ public: */ void Run(s32 main_thread_priority, u32 stack_size); + + /////////////////////////////////////////////////////////////////////////////////////////////// + // Memory Management + + VMManager vm_manager; + + // Memory used to back the allocations in the regular heap. A single vector is used to cover + // the entire virtual address space extents that bound the allocations, including any holes. + // This makes deallocation and reallocation of holes fast and keeps process memory contiguous + // in the emulator address space, allowing Memory::GetPointer to be reasonably safe. + std::shared_ptr<std::vector<u8>> heap_memory; + // The left/right bounds of the address space covered by heap_memory. + VAddr heap_start = 0, heap_end = 0; + + u32 heap_used = 0, linear_heap_used = 0, misc_memory_used = 0; + + MemoryRegionInfo* memory_region = nullptr; + + /// Bitmask of the used TLS slots + std::bitset<300> used_tls_slots; + + VAddr GetLinearHeapBase() const; + VAddr GetLinearHeapLimit() const; + + ResultVal<VAddr> HeapAllocate(VAddr target, u32 size, VMAPermission perms); + ResultCode HeapFree(VAddr target, u32 size); + + ResultVal<VAddr> LinearAllocate(VAddr target, u32 size, VMAPermission perms); + ResultCode LinearFree(VAddr target, u32 size); + private: Process(); ~Process() override; diff --git a/src/core/hle/kernel/resource_limit.cpp b/src/core/hle/kernel/resource_limit.cpp index 94b3e3298..67dde08c2 100644 --- a/src/core/hle/kernel/resource_limit.cpp +++ b/src/core/hle/kernel/resource_limit.cpp @@ -6,7 +6,6 @@ #include "common/logging/log.h" -#include "core/mem_map.h" #include "core/hle/kernel/resource_limit.h" namespace Kernel { diff --git a/src/core/hle/kernel/shared_memory.cpp b/src/core/hle/kernel/shared_memory.cpp index 4137683b5..1f477664b 100644 --- a/src/core/hle/kernel/shared_memory.cpp +++ b/src/core/hle/kernel/shared_memory.cpp @@ -20,6 +20,7 @@ SharedPtr<SharedMemory> SharedMemory::Create(u32 size, MemoryPermission permissi shared_memory->name = std::move(name); shared_memory->base_address = 0x0; + shared_memory->fixed_address = 0x0; shared_memory->size = size; shared_memory->permissions = permissions; shared_memory->other_permissions = other_permissions; @@ -30,9 +31,31 @@ SharedPtr<SharedMemory> SharedMemory::Create(u32 size, MemoryPermission permissi ResultCode SharedMemory::Map(VAddr address, MemoryPermission permissions, MemoryPermission other_permissions) { + if (base_address != 0) { + LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s: already mapped at 0x%08X!", + GetObjectId(), address, name.c_str(), base_address); + // TODO: Verify error code with hardware + return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel, + ErrorSummary::InvalidArgument, ErrorLevel::Permanent); + } + + if (fixed_address != 0) { + if (address != 0 && address != fixed_address) { + LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s: fixed_addres is 0x%08X!", + GetObjectId(), address, name.c_str(), fixed_address); + // TODO: Verify error code with hardware + return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel, + ErrorSummary::InvalidArgument, ErrorLevel::Permanent); + } + + // HACK(yuriks): This is only here to support the APT shared font mapping right now. + // Later, this should actually map the memory block onto the address space. + return RESULT_SUCCESS; + } + if (address < Memory::SHARED_MEMORY_VADDR || address + size >= Memory::SHARED_MEMORY_VADDR_END) { - LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X outside of shared mem bounds!", - GetObjectId(), address); + LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s outside of shared mem bounds!", + GetObjectId(), address, name.c_str()); // TODO: Verify error code with hardware return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel, ErrorSummary::InvalidArgument, ErrorLevel::Permanent); diff --git a/src/core/hle/kernel/shared_memory.h b/src/core/hle/kernel/shared_memory.h index 7a2922776..35b550d12 100644 --- a/src/core/hle/kernel/shared_memory.h +++ b/src/core/hle/kernel/shared_memory.h @@ -61,6 +61,8 @@ public: /// Address of shared memory block in the process. VAddr base_address; + /// Fixed address to allow mapping to. Used for blocks created from the linear heap. + VAddr fixed_address; /// Size of the memory block. Page-aligned. u32 size; /// Permission restrictions applied to the process which created the block. diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp index 29ea6d531..c10126513 100644 --- a/src/core/hle/kernel/thread.cpp +++ b/src/core/hle/kernel/thread.cpp @@ -117,6 +117,7 @@ void Thread::Stop() { wait_objects.clear(); Kernel::g_current_process->used_tls_slots[tls_index] = false; + g_current_process->misc_memory_used -= Memory::TLS_ENTRY_SIZE; HLE::Reschedule(__func__); } @@ -414,6 +415,7 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point, } ASSERT_MSG(thread->tls_index != -1, "Out of TLS space"); + g_current_process->misc_memory_used += Memory::TLS_ENTRY_SIZE; // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used // to initialize the context @@ -504,7 +506,7 @@ void Thread::SetWaitSynchronizationOutput(s32 output) { } VAddr Thread::GetTLSAddress() const { - return Memory::TLS_AREA_VADDR + tls_index * 0x200; + return Memory::TLS_AREA_VADDR + tls_index * Memory::TLS_ENTRY_SIZE; } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/core/hle/kernel/vm_manager.cpp b/src/core/hle/kernel/vm_manager.cpp index 205cc7b53..2610acf76 100644 --- a/src/core/hle/kernel/vm_manager.cpp +++ b/src/core/hle/kernel/vm_manager.cpp @@ -11,6 +11,15 @@ namespace Kernel { +static const char* GetMemoryStateName(MemoryState state) { + static const char* names[] = { + "Free", "Reserved", "IO", "Static", "Code", "Private", "Shared", "Continuous", "Aliased", + "Alias", "AliasCode", "Locked", + }; + + return names[(int)state]; +} + bool VirtualMemoryArea::CanBeMergedWith(const VirtualMemoryArea& next) const { ASSERT(base + size == next.base); if (permissions != next.permissions || @@ -51,11 +60,15 @@ void VMManager::Reset() { } VMManager::VMAHandle VMManager::FindVMA(VAddr target) const { - return std::prev(vma_map.upper_bound(target)); + if (target >= MAX_ADDRESS) { + return vma_map.end(); + } else { + return std::prev(vma_map.upper_bound(target)); + } } ResultVal<VMManager::VMAHandle> VMManager::MapMemoryBlock(VAddr target, - std::shared_ptr<std::vector<u8>> block, u32 offset, u32 size, MemoryState state) { + std::shared_ptr<std::vector<u8>> block, size_t offset, u32 size, MemoryState state) { ASSERT(block != nullptr); ASSERT(offset + size <= block->size()); @@ -106,10 +119,8 @@ ResultVal<VMManager::VMAHandle> VMManager::MapMMIO(VAddr target, PAddr paddr, u3 return MakeResult<VMAHandle>(MergeAdjacent(vma_handle)); } -void VMManager::Unmap(VMAHandle vma_handle) { - VMAIter iter = StripIterConstness(vma_handle); - - VirtualMemoryArea& vma = iter->second; +VMManager::VMAIter VMManager::Unmap(VMAIter vma_handle) { + VirtualMemoryArea& vma = vma_handle->second; vma.type = VMAType::Free; vma.permissions = VMAPermission::None; vma.meminfo_state = MemoryState::Free; @@ -121,26 +132,67 @@ void VMManager::Unmap(VMAHandle vma_handle) { UpdatePageTableForVMA(vma); - MergeAdjacent(iter); + return MergeAdjacent(vma_handle); +} + +ResultCode VMManager::UnmapRange(VAddr target, u32 size) { + CASCADE_RESULT(VMAIter vma, CarveVMARange(target, size)); + VAddr target_end = target + size; + + VMAIter end = vma_map.end(); + // The comparison against the end of the range must be done using addresses since VMAs can be + // merged during this process, causing invalidation of the iterators. + while (vma != end && vma->second.base < target_end) { + vma = std::next(Unmap(vma)); + } + + ASSERT(FindVMA(target)->second.size >= size); + return RESULT_SUCCESS; } -void VMManager::Reprotect(VMAHandle vma_handle, VMAPermission new_perms) { +VMManager::VMAHandle VMManager::Reprotect(VMAHandle vma_handle, VMAPermission new_perms) { VMAIter iter = StripIterConstness(vma_handle); VirtualMemoryArea& vma = iter->second; vma.permissions = new_perms; UpdatePageTableForVMA(vma); - MergeAdjacent(iter); + return MergeAdjacent(iter); +} + +ResultCode VMManager::ReprotectRange(VAddr target, u32 size, VMAPermission new_perms) { + CASCADE_RESULT(VMAIter vma, CarveVMARange(target, size)); + VAddr target_end = target + size; + + VMAIter end = vma_map.end(); + // The comparison against the end of the range must be done using addresses since VMAs can be + // merged during this process, causing invalidation of the iterators. + while (vma != end && vma->second.base < target_end) { + vma = std::next(StripIterConstness(Reprotect(vma, new_perms))); + } + + return RESULT_SUCCESS; } -void VMManager::LogLayout() const { +void VMManager::RefreshMemoryBlockMappings(const std::vector<u8>* block) { + // If this ever proves to have a noticeable performance impact, allow users of the function to + // specify a specific range of addresses to limit the scan to. for (const auto& p : vma_map) { const VirtualMemoryArea& vma = p.second; - LOG_DEBUG(Kernel, "%08X - %08X size: %8X %c%c%c", vma.base, vma.base + vma.size, vma.size, + if (block == vma.backing_block.get()) { + UpdatePageTableForVMA(vma); + } + } +} + +void VMManager::LogLayout(Log::Level log_level) const { + for (const auto& p : vma_map) { + const VirtualMemoryArea& vma = p.second; + LOG_GENERIC(Log::Class::Kernel, log_level, "%08X - %08X size: %8X %c%c%c %s", + vma.base, vma.base + vma.size, vma.size, (u8)vma.permissions & (u8)VMAPermission::Read ? 'R' : '-', (u8)vma.permissions & (u8)VMAPermission::Write ? 'W' : '-', - (u8)vma.permissions & (u8)VMAPermission::Execute ? 'X' : '-'); + (u8)vma.permissions & (u8)VMAPermission::Execute ? 'X' : '-', GetMemoryStateName(vma.meminfo_state)); } } @@ -151,21 +203,19 @@ VMManager::VMAIter VMManager::StripIterConstness(const VMAHandle & iter) { } ResultVal<VMManager::VMAIter> VMManager::CarveVMA(VAddr base, u32 size) { - ASSERT_MSG((size & Memory::PAGE_MASK) == 0, "non-page aligned size: %8X", size); - ASSERT_MSG((base & Memory::PAGE_MASK) == 0, "non-page aligned base: %08X", base); + ASSERT_MSG((size & Memory::PAGE_MASK) == 0, "non-page aligned size: 0x%8X", size); + ASSERT_MSG((base & Memory::PAGE_MASK) == 0, "non-page aligned base: 0x%08X", base); VMAIter vma_handle = StripIterConstness(FindVMA(base)); if (vma_handle == vma_map.end()) { // Target address is outside the range managed by the kernel - return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::OS, - ErrorSummary::InvalidArgument, ErrorLevel::Usage); // 0xE0E01BF5 + return ERR_INVALID_ADDRESS; } VirtualMemoryArea& vma = vma_handle->second; if (vma.type != VMAType::Free) { // Region is already allocated - return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::OS, - ErrorSummary::InvalidState, ErrorLevel::Usage); // 0xE0A01BF5 + return ERR_INVALID_ADDRESS_STATE; } u32 start_in_vma = base - vma.base; @@ -173,8 +223,7 @@ ResultVal<VMManager::VMAIter> VMManager::CarveVMA(VAddr base, u32 size) { if (end_in_vma > vma.size) { // Requested allocation doesn't fit inside VMA - return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::OS, - ErrorSummary::InvalidState, ErrorLevel::Usage); // 0xE0A01BF5 + return ERR_INVALID_ADDRESS_STATE; } if (end_in_vma != vma.size) { @@ -189,6 +238,35 @@ ResultVal<VMManager::VMAIter> VMManager::CarveVMA(VAddr base, u32 size) { return MakeResult<VMAIter>(vma_handle); } +ResultVal<VMManager::VMAIter> VMManager::CarveVMARange(VAddr target, u32 size) { + ASSERT_MSG((size & Memory::PAGE_MASK) == 0, "non-page aligned size: 0x%8X", size); + ASSERT_MSG((target & Memory::PAGE_MASK) == 0, "non-page aligned base: 0x%08X", target); + + VAddr target_end = target + size; + ASSERT(target_end >= target); + ASSERT(target_end <= MAX_ADDRESS); + ASSERT(size > 0); + + VMAIter begin_vma = StripIterConstness(FindVMA(target)); + VMAIter i_end = vma_map.lower_bound(target_end); + for (auto i = begin_vma; i != i_end; ++i) { + if (i->second.type == VMAType::Free) { + return ERR_INVALID_ADDRESS_STATE; + } + } + + if (target != begin_vma->second.base) { + begin_vma = SplitVMA(begin_vma, target - begin_vma->second.base); + } + + VMAIter end_vma = StripIterConstness(FindVMA(target_end)); + if (end_vma != vma_map.end() && target_end != end_vma->second.base) { + end_vma = SplitVMA(end_vma, target_end - end_vma->second.base); + } + + return MakeResult<VMAIter>(begin_vma); +} + VMManager::VMAIter VMManager::SplitVMA(VMAIter vma_handle, u32 offset_in_vma) { VirtualMemoryArea& old_vma = vma_handle->second; VirtualMemoryArea new_vma = old_vma; // Make a copy of the VMA diff --git a/src/core/hle/kernel/vm_manager.h b/src/core/hle/kernel/vm_manager.h index b3795a94a..4e95f1f0c 100644 --- a/src/core/hle/kernel/vm_manager.h +++ b/src/core/hle/kernel/vm_manager.h @@ -14,6 +14,14 @@ namespace Kernel { +const ResultCode ERR_INVALID_ADDRESS{ // 0xE0E01BF5 + ErrorDescription::InvalidAddress, ErrorModule::OS, + ErrorSummary::InvalidArgument, ErrorLevel::Usage}; + +const ResultCode ERR_INVALID_ADDRESS_STATE{ // 0xE0A01BF5 + ErrorDescription::InvalidAddress, ErrorModule::OS, + ErrorSummary::InvalidState, ErrorLevel::Usage}; + enum class VMAType : u8 { /// VMA represents an unmapped region of the address space. Free, @@ -75,7 +83,7 @@ struct VirtualMemoryArea { /// Memory block backing this VMA. std::shared_ptr<std::vector<u8>> backing_block = nullptr; /// Offset into the backing_memory the mapping starts from. - u32 offset = 0; + size_t offset = 0; // Settings for type = BackingMemory /// Pointer backing this VMA. It will not be destroyed or freed when the VMA is removed. @@ -141,7 +149,7 @@ public: * @param state MemoryState tag to attach to the VMA. */ ResultVal<VMAHandle> MapMemoryBlock(VAddr target, std::shared_ptr<std::vector<u8>> block, - u32 offset, u32 size, MemoryState state); + size_t offset, u32 size, MemoryState state); /** * Maps an unmanaged host memory pointer at a given address. @@ -163,14 +171,23 @@ public: */ ResultVal<VMAHandle> MapMMIO(VAddr target, PAddr paddr, u32 size, MemoryState state); - /// Unmaps the given VMA. - void Unmap(VMAHandle vma); + /// Unmaps a range of addresses, splitting VMAs as necessary. + ResultCode UnmapRange(VAddr target, u32 size); /// Changes the permissions of the given VMA. - void Reprotect(VMAHandle vma, VMAPermission new_perms); + VMAHandle Reprotect(VMAHandle vma, VMAPermission new_perms); + + /// Changes the permissions of a range of addresses, splitting VMAs as necessary. + ResultCode ReprotectRange(VAddr target, u32 size, VMAPermission new_perms); + + /** + * Scans all VMAs and updates the page table range of any that use the given vector as backing + * memory. This should be called after any operation that causes reallocation of the vector. + */ + void RefreshMemoryBlockMappings(const std::vector<u8>* block); /// Dumps the address space layout to the log, for debugging - void LogLayout() const; + void LogLayout(Log::Level log_level) const; private: using VMAIter = decltype(vma_map)::iterator; @@ -178,6 +195,9 @@ private: /// Converts a VMAHandle to a mutable VMAIter. VMAIter StripIterConstness(const VMAHandle& iter); + /// Unmaps the given VMA. + VMAIter Unmap(VMAIter vma); + /** * Carves a VMA of a specific size at the specified address by splitting Free VMAs while doing * the appropriate error checking. @@ -185,6 +205,12 @@ private: ResultVal<VMAIter> CarveVMA(VAddr base, u32 size); /** + * Splits the edges of the given range of non-Free VMAs so that there is a VMA split at each + * end of the range. + */ + ResultVal<VMAIter> CarveVMARange(VAddr base, u32 size); + + /** * Splits a VMA in two, at the specified offset. * @returns the right side of the split, with the original iterator becoming the left side. */ diff --git a/src/core/hle/service/apt/apt.cpp b/src/core/hle/service/apt/apt.cpp index 35402341b..ba66569b4 100644 --- a/src/core/hle/service/apt/apt.cpp +++ b/src/core/hle/service/apt/apt.cpp @@ -16,6 +16,7 @@ #include "core/hle/hle.h" #include "core/hle/kernel/event.h" #include "core/hle/kernel/mutex.h" +#include "core/hle/kernel/process.h" #include "core/hle/kernel/shared_memory.h" #include "core/hle/kernel/thread.h" @@ -37,7 +38,7 @@ static Kernel::SharedPtr<Kernel::Mutex> lock; static Kernel::SharedPtr<Kernel::Event> notification_event; ///< APT notification event static Kernel::SharedPtr<Kernel::Event> parameter_event; ///< APT parameter event -static std::vector<u8> shared_font; +static std::shared_ptr<std::vector<u8>> shared_font; static u32 cpu_percent; ///< CPU time available to the running application @@ -74,11 +75,12 @@ void Initialize(Service::Interface* self) { void GetSharedFont(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - if (!shared_font.empty()) { - // TODO(bunnei): This function shouldn't copy the shared font every time it's called. - // Instead, it should probably map the shared font as RO memory. We don't currently have - // an easy way to do this, but the copy should be sufficient for now. - memcpy(Memory::GetPointer(SHARED_FONT_VADDR), shared_font.data(), shared_font.size()); + if (shared_font != nullptr) { + // TODO(yuriks): This is a hack to keep this working right now even with our completely + // broken shared memory system. + shared_font_mem->fixed_address = SHARED_FONT_VADDR; + Kernel::g_current_process->vm_manager.MapMemoryBlock(shared_font_mem->fixed_address, + shared_font, 0, shared_font_mem->size, Kernel::MemoryState::Shared); cmd_buff[0] = IPC::MakeHeader(0x44, 2, 2); cmd_buff[1] = RESULT_SUCCESS.raw; // No error @@ -391,7 +393,6 @@ void Init() { // a homebrew app to do this: https://github.com/citra-emu/3dsutils. Put the resulting file // "shared_font.bin" in the Citra "sysdata" directory. - shared_font.clear(); std::string filepath = FileUtil::GetUserPath(D_SYSDATA_IDX) + SHARED_FONT; FileUtil::CreateFullPath(filepath); // Create path if not already created @@ -399,8 +400,8 @@ void Init() { if (file.IsOpen()) { // Read shared font data - shared_font.resize((size_t)file.GetSize()); - file.ReadBytes(shared_font.data(), (size_t)file.GetSize()); + shared_font = std::make_shared<std::vector<u8>>((size_t)file.GetSize()); + file.ReadBytes(shared_font->data(), shared_font->size()); // Create shared font memory object using Kernel::MemoryPermission; @@ -424,7 +425,7 @@ void Init() { } void Shutdown() { - shared_font.clear(); + shared_font = nullptr; shared_font_mem = nullptr; lock = nullptr; notification_event = nullptr; diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp index e93c1b436..c3d0d28a5 100644 --- a/src/core/hle/service/gsp_gpu.cpp +++ b/src/core/hle/service/gsp_gpu.cpp @@ -3,8 +3,8 @@ // Refer to the license.txt file included. #include "common/bit_field.h" +#include "common/microprofile.h" -#include "core/mem_map.h" #include "core/memory.h" #include "core/hle/kernel/event.h" #include "core/hle/kernel/shared_memory.h" @@ -230,6 +230,10 @@ void SetBufferSwap(u32 screen_id, const FrameBufferInfo& info) { if (Pica::g_debug_context) Pica::g_debug_context->OnEvent(Pica::DebugContext::Event::BufferSwapped, nullptr); + + if (screen_id == 0) { + MicroProfileFlip(); + } } /** @@ -418,7 +422,7 @@ static void ExecuteCommand(const Command& command, u32 thread_id) { case CommandId::SET_DISPLAY_TRANSFER: { - auto& params = command.image_copy; + auto& params = command.display_transfer; WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3); WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), @@ -433,17 +437,22 @@ static void ExecuteCommand(const Command& command, u32 thread_id) { // TODO: Check if texture copies are implemented correctly.. case CommandId::SET_TEXTURE_COPY: { - auto& params = command.image_copy; - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), + auto& params = command.texture_copy; + WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.input_address), Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), + WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.output_address), Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.flags)), params.flags); - - // TODO: Should this register be set to 1 or should instead its value be OR-ed with 1? - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.trigger)), 1); + WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.size), + params.size); + WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.input_size), + params.in_width_gap); + WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.output_size), + params.out_width_gap); + WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.flags), + params.flags); + + // NOTE: Actual GSP ORs 1 with current register instead of overwriting. Doesn't seem to matter. + WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.trigger), 1); break; } diff --git a/src/core/hle/service/gsp_gpu.h b/src/core/hle/service/gsp_gpu.h index c89d0a467..8bcb30ad1 100644 --- a/src/core/hle/service/gsp_gpu.h +++ b/src/core/hle/service/gsp_gpu.h @@ -127,7 +127,16 @@ struct Command { u32 in_buffer_size; u32 out_buffer_size; u32 flags; - } image_copy; + } display_transfer; + + struct { + u32 in_buffer_address; + u32 out_buffer_address; + u32 size; + u32 in_width_gap; + u32 out_width_gap; + u32 flags; + } texture_copy; u8 raw_data[0x1C]; }; diff --git a/src/core/hle/service/y2r_u.cpp b/src/core/hle/service/y2r_u.cpp index 6e7dafaad..6b1b71fe4 100644 --- a/src/core/hle/service/y2r_u.cpp +++ b/src/core/hle/service/y2r_u.cpp @@ -10,7 +10,6 @@ #include "core/hle/kernel/event.h" #include "core/hle/service/y2r_u.h" #include "core/hw/y2r.h" -#include "core/mem_map.h" #include "video_core/renderer_base.h" #include "video_core/utils.h" diff --git a/src/core/hle/shared_page.cpp b/src/core/hle/shared_page.cpp index 26d87c7e2..50c5bc01b 100644 --- a/src/core/hle/shared_page.cpp +++ b/src/core/hle/shared_page.cpp @@ -18,7 +18,4 @@ void Init() { shared_page.running_hw = 0x1; // product } -void Shutdown() { -} - } // namespace diff --git a/src/core/hle/shared_page.h b/src/core/hle/shared_page.h index db6a5340b..379bb7b63 100644 --- a/src/core/hle/shared_page.h +++ b/src/core/hle/shared_page.h @@ -54,6 +54,5 @@ static_assert(sizeof(SharedPageDef) == Memory::SHARED_PAGE_SIZE, "Shared page st extern SharedPageDef shared_page; void Init(); -void Shutdown(); } // namespace diff --git a/src/core/hle/svc.cpp b/src/core/hle/svc.cpp index bb64fdfb7..19f750d72 100644 --- a/src/core/hle/svc.cpp +++ b/src/core/hle/svc.cpp @@ -5,16 +5,17 @@ #include <map> #include "common/logging/log.h" +#include "common/microprofile.h" #include "common/profiler.h" #include "common/string_util.h" #include "common/symbols.h" #include "core/core_timing.h" -#include "core/mem_map.h" #include "core/arm/arm_interface.h" #include "core/hle/kernel/address_arbiter.h" #include "core/hle/kernel/event.h" +#include "core/hle/kernel/memory.h" #include "core/hle/kernel/mutex.h" #include "core/hle/kernel/process.h" #include "core/hle/kernel/resource_limit.h" @@ -41,32 +42,114 @@ const ResultCode ERR_NOT_FOUND(ErrorDescription::NotFound, ErrorModule::Kernel, const ResultCode ERR_PORT_NAME_TOO_LONG(ErrorDescription(30), ErrorModule::OS, ErrorSummary::InvalidArgument, ErrorLevel::Usage); // 0xE0E0181E +const ResultCode ERR_MISALIGNED_ADDRESS{ // 0xE0E01BF1 + ErrorDescription::MisalignedAddress, ErrorModule::OS, + ErrorSummary::InvalidArgument, ErrorLevel::Usage}; +const ResultCode ERR_MISALIGNED_SIZE{ // 0xE0E01BF2 + ErrorDescription::MisalignedSize, ErrorModule::OS, + ErrorSummary::InvalidArgument, ErrorLevel::Usage}; +const ResultCode ERR_INVALID_COMBINATION{ // 0xE0E01BEE + ErrorDescription::InvalidCombination, ErrorModule::OS, + ErrorSummary::InvalidArgument, ErrorLevel::Usage}; + enum ControlMemoryOperation { - MEMORY_OPERATION_HEAP = 0x00000003, - MEMORY_OPERATION_GSP_HEAP = 0x00010003, + MEMOP_FREE = 1, + MEMOP_RESERVE = 2, // This operation seems to be unsupported in the kernel + MEMOP_COMMIT = 3, + MEMOP_MAP = 4, + MEMOP_UNMAP = 5, + MEMOP_PROTECT = 6, + MEMOP_OPERATION_MASK = 0xFF, + + MEMOP_REGION_APP = 0x100, + MEMOP_REGION_SYSTEM = 0x200, + MEMOP_REGION_BASE = 0x300, + MEMOP_REGION_MASK = 0xF00, + + MEMOP_LINEAR = 0x10000, }; /// Map application or GSP heap memory static ResultCode ControlMemory(u32* out_addr, u32 operation, u32 addr0, u32 addr1, u32 size, u32 permissions) { - LOG_TRACE(Kernel_SVC,"called operation=0x%08X, addr0=0x%08X, addr1=0x%08X, size=%08X, permissions=0x%08X", + using namespace Kernel; + + LOG_DEBUG(Kernel_SVC,"called operation=0x%08X, addr0=0x%08X, addr1=0x%08X, size=0x%X, permissions=0x%08X", operation, addr0, addr1, size, permissions); - switch (operation) { + if ((addr0 & Memory::PAGE_MASK) != 0 || (addr1 & Memory::PAGE_MASK) != 0) { + return ERR_MISALIGNED_ADDRESS; + } + if ((size & Memory::PAGE_MASK) != 0) { + return ERR_MISALIGNED_SIZE; + } + + u32 region = operation & MEMOP_REGION_MASK; + operation &= ~MEMOP_REGION_MASK; + + if (region != 0) { + LOG_WARNING(Kernel_SVC, "ControlMemory with specified region not supported, region=%X", region); + } - // Map normal heap memory - case MEMORY_OPERATION_HEAP: - *out_addr = Memory::MapBlock_Heap(size, operation, permissions); + if ((permissions & (u32)MemoryPermission::ReadWrite) != permissions) { + return ERR_INVALID_COMBINATION; + } + VMAPermission vma_permissions = (VMAPermission)permissions; + + auto& process = *g_current_process; + + switch (operation & MEMOP_OPERATION_MASK) { + case MEMOP_FREE: + { + if (addr0 >= Memory::HEAP_VADDR && addr0 < Memory::HEAP_VADDR_END) { + ResultCode result = process.HeapFree(addr0, size); + if (result.IsError()) return result; + } else if (addr0 >= process.GetLinearHeapBase() && addr0 < process.GetLinearHeapLimit()) { + ResultCode result = process.LinearFree(addr0, size); + if (result.IsError()) return result; + } else { + return ERR_INVALID_ADDRESS; + } + *out_addr = addr0; break; + } - // Map GSP heap memory - case MEMORY_OPERATION_GSP_HEAP: - *out_addr = Memory::MapBlock_HeapLinear(size, operation, permissions); + case MEMOP_COMMIT: + { + if (operation & MEMOP_LINEAR) { + CASCADE_RESULT(*out_addr, process.LinearAllocate(addr0, size, vma_permissions)); + } else { + CASCADE_RESULT(*out_addr, process.HeapAllocate(addr0, size, vma_permissions)); + } + break; + } + + case MEMOP_MAP: // TODO: This is just a hack to avoid regressions until memory aliasing is implemented + { + CASCADE_RESULT(*out_addr, process.HeapAllocate(addr0, size, vma_permissions)); break; + } + + case MEMOP_UNMAP: // TODO: This is just a hack to avoid regressions until memory aliasing is implemented + { + ResultCode result = process.HeapFree(addr0, size); + if (result.IsError()) return result; + break; + } + + case MEMOP_PROTECT: + { + ResultCode result = process.vm_manager.ReprotectRange(addr0, size, vma_permissions); + if (result.IsError()) return result; + break; + } - // Unknown ControlMemory operation default: LOG_ERROR(Kernel_SVC, "unknown operation=0x%08X", operation); + return ERR_INVALID_COMBINATION; } + + process.vm_manager.LogLayout(Log::Level::Trace); + return RESULT_SUCCESS; } @@ -537,9 +620,9 @@ static ResultCode QueryProcessMemory(MemoryInfo* memory_info, PageInfo* page_inf if (process == nullptr) return ERR_INVALID_HANDLE; - auto vma = process->address_space->FindVMA(addr); + auto vma = process->vm_manager.FindVMA(addr); - if (vma == process->address_space->vma_map.end()) + if (vma == Kernel::g_current_process->vm_manager.vma_map.end()) return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::OS, ErrorSummary::InvalidArgument, ErrorLevel::Usage); memory_info->base_address = vma->second.base; @@ -692,6 +775,52 @@ static ResultCode CreateMemoryBlock(Handle* out_handle, u32 addr, u32 size, u32 return RESULT_SUCCESS; } +static ResultCode GetProcessInfo(s64* out, Handle process_handle, u32 type) { + LOG_TRACE(Kernel_SVC, "called process=0x%08X type=%u", process_handle, type); + + using Kernel::Process; + Kernel::SharedPtr<Process> process = Kernel::g_handle_table.Get<Process>(process_handle); + if (process == nullptr) + return ERR_INVALID_HANDLE; + + switch (type) { + case 0: + case 2: + // TODO(yuriks): Type 0 returns a slightly higher number than type 2, but I'm not sure + // what's the difference between them. + *out = process->heap_used + process->linear_heap_used + process->misc_memory_used; + break; + case 1: + case 3: + case 4: + case 5: + case 6: + case 7: + case 8: + // These are valid, but not implemented yet + LOG_ERROR(Kernel_SVC, "unimplemented GetProcessInfo type=%u", type); + break; + case 20: + *out = Memory::FCRAM_PADDR - process->GetLinearHeapBase(); + break; + default: + LOG_ERROR(Kernel_SVC, "unknown GetProcessInfo type=%u", type); + + if (type >= 21 && type <= 23) { + return ResultCode( // 0xE0E01BF4 + ErrorDescription::NotImplemented, ErrorModule::OS, + ErrorSummary::InvalidArgument, ErrorLevel::Usage); + } else { + return ResultCode( // 0xD8E007ED + ErrorDescription::InvalidEnumValue, ErrorModule::Kernel, + ErrorSummary::InvalidArgument, ErrorLevel::Permanent); + } + break; + } + + return RESULT_SUCCESS; +} + namespace { struct FunctionDef { using Func = void(); @@ -746,7 +875,7 @@ static const FunctionDef SVC_Table[] = { {0x28, HLE::Wrap<GetSystemTick>, "GetSystemTick"}, {0x29, nullptr, "GetHandleInfo"}, {0x2A, nullptr, "GetSystemInfo"}, - {0x2B, nullptr, "GetProcessInfo"}, + {0x2B, HLE::Wrap<GetProcessInfo>, "GetProcessInfo"}, {0x2C, nullptr, "GetThreadInfo"}, {0x2D, HLE::Wrap<ConnectToPort>, "ConnectToPort"}, {0x2E, nullptr, "SendSyncRequest1"}, @@ -841,8 +970,11 @@ static const FunctionDef* GetSVCInfo(u32 func_num) { return &SVC_Table[func_num]; } +MICROPROFILE_DEFINE(Kernel_SVC, "Kernel", "SVC", MP_RGB(70, 200, 70)); + void CallSVC(u32 immediate) { Common::Profiling::ScopeTimer timer_svc(profiler_svc); + MICROPROFILE_SCOPE(Kernel_SVC); const FunctionDef* info = GetSVCInfo(immediate); if (info) { diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp index 3ccbc03b2..bc7bde903 100644 --- a/src/core/hw/gpu.cpp +++ b/src/core/hw/gpu.cpp @@ -3,11 +3,13 @@ // Refer to the license.txt file included. #include <cstring> +#include <numeric> #include <type_traits> #include "common/color.h" #include "common/common_types.h" #include "common/logging/log.h" +#include "common/microprofile.h" #include "common/vector_math.h" #include "core/settings.h" @@ -84,6 +86,9 @@ static Math::Vec4<u8> DecodePixel(Regs::PixelFormat input_format, const u8* src_ } } +MICROPROFILE_DEFINE(GPU_DisplayTransfer, "GPU", "DisplayTransfer", MP_RGB(100, 100, 255)); +MICROPROFILE_DEFINE(GPU_CmdlistProcessing, "GPU", "Cmdlist Processing", MP_RGB(100, 255, 100)); + template <typename T> inline void Write(u32 addr, const T data) { addr -= HW::VADDR_GPU; @@ -149,6 +154,8 @@ inline void Write(u32 addr, const T data) { case GPU_REG_INDEX(display_transfer_config.trigger): { + MICROPROFILE_SCOPE(GPU_DisplayTransfer); + const auto& config = g_regs.display_transfer_config; if (config.trigger & 1) { @@ -158,14 +165,59 @@ inline void Write(u32 addr, const T data) { u8* src_pointer = Memory::GetPhysicalPointer(config.GetPhysicalInputAddress()); u8* dst_pointer = Memory::GetPhysicalPointer(config.GetPhysicalOutputAddress()); + if (config.is_texture_copy) { + u32 input_width = config.texture_copy.input_width * 16; + u32 input_gap = config.texture_copy.input_gap * 16; + u32 output_width = config.texture_copy.output_width * 16; + u32 output_gap = config.texture_copy.output_gap * 16; + + size_t contiguous_input_size = config.texture_copy.size / input_width * (input_width + input_gap); + VideoCore::g_renderer->hw_rasterizer->NotifyPreRead(config.GetPhysicalInputAddress(), contiguous_input_size); + + u32 remaining_size = config.texture_copy.size; + u32 remaining_input = input_width; + u32 remaining_output = output_width; + while (remaining_size > 0) { + u32 copy_size = std::min({ remaining_input, remaining_output, remaining_size }); + + std::memcpy(dst_pointer, src_pointer, copy_size); + src_pointer += copy_size; + dst_pointer += copy_size; + + remaining_input -= copy_size; + remaining_output -= copy_size; + remaining_size -= copy_size; + + if (remaining_input == 0) { + remaining_input = input_width; + src_pointer += input_gap; + } + if (remaining_output == 0) { + remaining_output = output_width; + dst_pointer += output_gap; + } + } + + LOG_TRACE(HW_GPU, "TextureCopy: 0x%X bytes from 0x%08X(%u+%u)-> 0x%08X(%u+%u), flags 0x%08X", + config.texture_copy.size, + config.GetPhysicalInputAddress(), input_width, input_gap, + config.GetPhysicalOutputAddress(), output_width, output_gap, + config.flags); + + size_t contiguous_output_size = config.texture_copy.size / output_width * (output_width + output_gap); + VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetPhysicalOutputAddress(), contiguous_output_size); + + GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PPF); + break; + } + if (config.scaling > config.ScaleXY) { LOG_CRITICAL(HW_GPU, "Unimplemented display transfer scaling mode %u", config.scaling.Value()); UNIMPLEMENTED(); break; } - if (config.output_tiled && - (config.scaling == config.ScaleXY || config.scaling == config.ScaleX)) { + if (config.input_linear && config.scaling != config.NoScale) { LOG_CRITICAL(HW_GPU, "Scaling is only implemented on tiled input"); UNIMPLEMENTED(); break; @@ -182,23 +234,6 @@ inline void Write(u32 addr, const T data) { VideoCore::g_renderer->hw_rasterizer->NotifyPreRead(config.GetPhysicalInputAddress(), input_size); - if (config.raw_copy) { - // Raw copies do not perform color conversion nor tiled->linear / linear->tiled conversions - // TODO(Subv): Verify if raw copies perform scaling - memcpy(dst_pointer, src_pointer, output_size); - - LOG_TRACE(HW_GPU, "DisplayTriggerTransfer: 0x%08x bytes from 0x%08x(%ux%u)-> 0x%08x(%ux%u), output format: %x, flags 0x%08X, Raw copy", - output_size, - config.GetPhysicalInputAddress(), config.input_width.Value(), config.input_height.Value(), - config.GetPhysicalOutputAddress(), config.output_width.Value(), config.output_height.Value(), - config.output_format.Value(), config.flags); - - GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PPF); - - VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetPhysicalOutputAddress(), output_size); - break; - } - for (u32 y = 0; y < output_height; ++y) { for (u32 x = 0; x < output_width; ++x) { Math::Vec4<u8> src_color; @@ -220,7 +255,7 @@ inline void Write(u32 addr, const T data) { u32 src_offset; u32 dst_offset; - if (config.output_tiled) { + if (config.input_linear) { if (!config.dont_swizzle) { // Interpret the input as linear and the output as tiled u32 coarse_y = y & ~7; @@ -315,6 +350,8 @@ inline void Write(u32 addr, const T data) { const auto& config = g_regs.command_processor_config; if (config.trigger & 1) { + MICROPROFILE_SCOPE(GPU_CmdlistProcessing); + u32* buffer = (u32*)Memory::GetPhysicalPointer(config.GetPhysicalAddress()); if (Pica::g_debug_context && Pica::g_debug_context->recorder) { diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h index daad506fe..2e3a9f779 100644 --- a/src/core/hw/gpu.h +++ b/src/core/hw/gpu.h @@ -201,12 +201,14 @@ struct Regs { u32 flags; BitField< 0, 1, u32> flip_vertically; // flips input data vertically - BitField< 1, 1, u32> output_tiled; // Converts from linear to tiled format - BitField< 3, 1, u32> raw_copy; // Copies the data without performing any processing + BitField< 1, 1, u32> input_linear; // Converts from linear to tiled format + BitField< 2, 1, u32> crop_input_lines; + BitField< 3, 1, u32> is_texture_copy; // Copies the data without performing any processing and respecting texture copy fields BitField< 5, 1, u32> dont_swizzle; BitField< 8, 3, PixelFormat> input_format; BitField<12, 3, PixelFormat> output_format; - + /// Uses some kind of 32x32 block swizzling mode, instead of the usual 8x8 one. + BitField<16, 1, u32> block_32; // TODO(yuriks): unimplemented BitField<24, 2, ScalingMode> scaling; // Determines the scaling mode of the transfer }; @@ -214,10 +216,30 @@ struct Regs { // it seems that writing to this field triggers the display transfer u32 trigger; + + INSERT_PADDING_WORDS(0x1); + + struct { + u32 size; + + union { + u32 input_size; + + BitField< 0, 16, u32> input_width; + BitField<16, 16, u32> input_gap; + }; + + union { + u32 output_size; + + BitField< 0, 16, u32> output_width; + BitField<16, 16, u32> output_gap; + }; + } texture_copy; } display_transfer_config; - ASSERT_MEMBER_SIZE(display_transfer_config, 0x1c); + ASSERT_MEMBER_SIZE(display_transfer_config, 0x2c); - INSERT_PADDING_WORDS(0x331); + INSERT_PADDING_WORDS(0x32D); struct { // command list size (in bytes) diff --git a/src/core/mem_map.cpp b/src/core/mem_map.cpp deleted file mode 100644 index cbe993fbe..000000000 --- a/src/core/mem_map.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <map> -#include <memory> -#include <utility> -#include <vector> - -#include "common/common_types.h" -#include "common/logging/log.h" - -#include "core/hle/config_mem.h" -#include "core/hle/kernel/vm_manager.h" -#include "core/hle/result.h" -#include "core/hle/shared_page.h" -#include "core/mem_map.h" -#include "core/memory.h" -#include "core/memory_setup.h" - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace Memory { - -namespace { - -struct MemoryArea { - u32 base; - u32 size; - const char* name; -}; - -// We don't declare the IO regions in here since its handled by other means. -static MemoryArea memory_areas[] = { - {HEAP_VADDR, HEAP_SIZE, "Heap"}, // Application heap (main memory) - {SHARED_MEMORY_VADDR, SHARED_MEMORY_SIZE, "Shared Memory"}, // Shared memory - {LINEAR_HEAP_VADDR, LINEAR_HEAP_SIZE, "Linear Heap"}, // Linear heap (main memory) - {VRAM_VADDR, VRAM_SIZE, "VRAM"}, // Video memory (VRAM) - {DSP_RAM_VADDR, DSP_RAM_SIZE, "DSP RAM"}, // DSP memory - {TLS_AREA_VADDR, TLS_AREA_SIZE, "TLS Area"}, // TLS memory -}; - -/// Represents a block of memory mapped by ControlMemory/MapMemoryBlock -struct MemoryBlock { - MemoryBlock() : handle(0), base_address(0), address(0), size(0), operation(0), permissions(0) { - } - u32 handle; - u32 base_address; - u32 address; - u32 size; - u32 operation; - u32 permissions; - - const u32 GetVirtualAddress() const{ - return base_address + address; - } -}; - -static std::map<u32, MemoryBlock> heap_map; -static std::map<u32, MemoryBlock> heap_linear_map; - -} - -u32 MapBlock_Heap(u32 size, u32 operation, u32 permissions) { - MemoryBlock block; - - block.base_address = HEAP_VADDR; - block.size = size; - block.operation = operation; - block.permissions = permissions; - - if (heap_map.size() > 0) { - const MemoryBlock last_block = heap_map.rbegin()->second; - block.address = last_block.address + last_block.size; - } - heap_map[block.GetVirtualAddress()] = block; - - return block.GetVirtualAddress(); -} - -u32 MapBlock_HeapLinear(u32 size, u32 operation, u32 permissions) { - MemoryBlock block; - - block.base_address = LINEAR_HEAP_VADDR; - block.size = size; - block.operation = operation; - block.permissions = permissions; - - if (heap_linear_map.size() > 0) { - const MemoryBlock last_block = heap_linear_map.rbegin()->second; - block.address = last_block.address + last_block.size; - } - heap_linear_map[block.GetVirtualAddress()] = block; - - return block.GetVirtualAddress(); -} - -PAddr VirtualToPhysicalAddress(const VAddr addr) { - if (addr == 0) { - return 0; - } else if (addr >= VRAM_VADDR && addr < VRAM_VADDR_END) { - return addr - VRAM_VADDR + VRAM_PADDR; - } else if (addr >= LINEAR_HEAP_VADDR && addr < LINEAR_HEAP_VADDR_END) { - return addr - LINEAR_HEAP_VADDR + FCRAM_PADDR; - } else if (addr >= DSP_RAM_VADDR && addr < DSP_RAM_VADDR_END) { - return addr - DSP_RAM_VADDR + DSP_RAM_PADDR; - } else if (addr >= IO_AREA_VADDR && addr < IO_AREA_VADDR_END) { - return addr - IO_AREA_VADDR + IO_AREA_PADDR; - } - - LOG_ERROR(HW_Memory, "Unknown virtual address @ 0x%08x", addr); - // To help with debugging, set bit on address so that it's obviously invalid. - return addr | 0x80000000; -} - -VAddr PhysicalToVirtualAddress(const PAddr addr) { - if (addr == 0) { - return 0; - } else if (addr >= VRAM_PADDR && addr < VRAM_PADDR_END) { - return addr - VRAM_PADDR + VRAM_VADDR; - } else if (addr >= FCRAM_PADDR && addr < FCRAM_PADDR_END) { - return addr - FCRAM_PADDR + LINEAR_HEAP_VADDR; - } else if (addr >= DSP_RAM_PADDR && addr < DSP_RAM_PADDR_END) { - return addr - DSP_RAM_PADDR + DSP_RAM_VADDR; - } else if (addr >= IO_AREA_PADDR && addr < IO_AREA_PADDR_END) { - return addr - IO_AREA_PADDR + IO_AREA_VADDR; - } - - LOG_ERROR(HW_Memory, "Unknown physical address @ 0x%08x", addr); - // To help with debugging, set bit on address so that it's obviously invalid. - return addr | 0x80000000; -} - -void Init() { - InitMemoryMap(); - LOG_DEBUG(HW_Memory, "initialized OK"); -} - -void InitLegacyAddressSpace(Kernel::VMManager& address_space) { - using namespace Kernel; - - for (MemoryArea& area : memory_areas) { - auto block = std::make_shared<std::vector<u8>>(area.size); - address_space.MapMemoryBlock(area.base, std::move(block), 0, area.size, MemoryState::Private).Unwrap(); - } - - auto cfg_mem_vma = address_space.MapBackingMemory(CONFIG_MEMORY_VADDR, - (u8*)&ConfigMem::config_mem, CONFIG_MEMORY_SIZE, MemoryState::Shared).MoveFrom(); - address_space.Reprotect(cfg_mem_vma, VMAPermission::Read); - - auto shared_page_vma = address_space.MapBackingMemory(SHARED_PAGE_VADDR, - (u8*)&SharedPage::shared_page, SHARED_PAGE_SIZE, MemoryState::Shared).MoveFrom(); - address_space.Reprotect(shared_page_vma, VMAPermission::Read); -} - -void Shutdown() { - heap_map.clear(); - heap_linear_map.clear(); - - LOG_DEBUG(HW_Memory, "shutdown OK"); -} - -} // namespace diff --git a/src/core/mem_map.h b/src/core/mem_map.h deleted file mode 100644 index 229ef82c5..000000000 --- a/src/core/mem_map.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "common/common_types.h" - -namespace Kernel { -class VMManager; -} - -namespace Memory { - -void Init(); -void InitLegacyAddressSpace(Kernel::VMManager& address_space); -void Shutdown(); - -/** - * Maps a block of memory on the heap - * @param size Size of block in bytes - * @param operation Memory map operation type - * @param permissions Memory allocation permissions - */ -u32 MapBlock_Heap(u32 size, u32 operation, u32 permissions); - -/** - * Maps a block of memory on the GSP heap - * @param size Size of block in bytes - * @param operation Memory map operation type - * @param permissions Control memory permissions - */ -u32 MapBlock_HeapLinear(u32 size, u32 operation, u32 permissions); - -/** - * Converts a virtual address inside a region with 1:1 mapping to physical memory to a physical - * address. This should be used by services to translate addresses for use by the hardware. - */ -PAddr VirtualToPhysicalAddress(VAddr addr); - -/** - * Undoes a mapping performed by VirtualToPhysicalAddress(). - */ -VAddr PhysicalToVirtualAddress(PAddr addr); - -} // namespace diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 1f66bb27d..cde390b8a 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -9,7 +9,7 @@ #include "common/logging/log.h" #include "common/swap.h" -#include "core/mem_map.h" +#include "core/hle/kernel/process.h" #include "core/memory.h" #include "core/memory_setup.h" @@ -198,4 +198,42 @@ void WriteBlock(const VAddr addr, const u8* data, const size_t size) { Write8(addr + offset, data[offset]); } +PAddr VirtualToPhysicalAddress(const VAddr addr) { + if (addr == 0) { + return 0; + } else if (addr >= VRAM_VADDR && addr < VRAM_VADDR_END) { + return addr - VRAM_VADDR + VRAM_PADDR; + } else if (addr >= LINEAR_HEAP_VADDR && addr < LINEAR_HEAP_VADDR_END) { + return addr - LINEAR_HEAP_VADDR + FCRAM_PADDR; + } else if (addr >= DSP_RAM_VADDR && addr < DSP_RAM_VADDR_END) { + return addr - DSP_RAM_VADDR + DSP_RAM_PADDR; + } else if (addr >= IO_AREA_VADDR && addr < IO_AREA_VADDR_END) { + return addr - IO_AREA_VADDR + IO_AREA_PADDR; + } else if (addr >= NEW_LINEAR_HEAP_VADDR && addr < NEW_LINEAR_HEAP_VADDR_END) { + return addr - NEW_LINEAR_HEAP_VADDR + FCRAM_PADDR; + } + + LOG_ERROR(HW_Memory, "Unknown virtual address @ 0x%08X", addr); + // To help with debugging, set bit on address so that it's obviously invalid. + return addr | 0x80000000; +} + +VAddr PhysicalToVirtualAddress(const PAddr addr) { + if (addr == 0) { + return 0; + } else if (addr >= VRAM_PADDR && addr < VRAM_PADDR_END) { + return addr - VRAM_PADDR + VRAM_VADDR; + } else if (addr >= FCRAM_PADDR && addr < FCRAM_PADDR_END) { + return addr - FCRAM_PADDR + Kernel::g_current_process->GetLinearHeapBase(); + } else if (addr >= DSP_RAM_PADDR && addr < DSP_RAM_PADDR_END) { + return addr - DSP_RAM_PADDR + DSP_RAM_VADDR; + } else if (addr >= IO_AREA_PADDR && addr < IO_AREA_PADDR_END) { + return addr - IO_AREA_PADDR + IO_AREA_VADDR; + } + + LOG_ERROR(HW_Memory, "Unknown physical address @ 0x%08X", addr); + // To help with debugging, set bit on address so that it's obviously invalid. + return addr | 0x80000000; +} + } // namespace diff --git a/src/core/memory.h b/src/core/memory.h index 418609de0..5af72b7a7 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -15,6 +15,8 @@ namespace Memory { * be mapped. */ const u32 PAGE_SIZE = 0x1000; +const u32 PAGE_MASK = PAGE_SIZE - 1; +const int PAGE_BITS = 12; /// Physical memory regions as seen from the ARM11 enum : PAddr { @@ -103,8 +105,15 @@ enum : VAddr { // hardcoded value. /// Area where TLS (Thread-Local Storage) buffers are allocated. TLS_AREA_VADDR = 0x1FF82000, - TLS_AREA_SIZE = 0x00030000, // Each TLS buffer is 0x200 bytes, allows for 300 threads + TLS_ENTRY_SIZE = 0x200, + TLS_AREA_SIZE = 300 * TLS_ENTRY_SIZE + 0x800, // Space for up to 300 threads + round to page size TLS_AREA_VADDR_END = TLS_AREA_VADDR + TLS_AREA_SIZE, + + + /// Equivalent to LINEAR_HEAP_VADDR, but expanded to cover the extra memory in the New 3DS. + NEW_LINEAR_HEAP_VADDR = 0x30000000, + NEW_LINEAR_HEAP_SIZE = 0x10000000, + NEW_LINEAR_HEAP_VADDR_END = NEW_LINEAR_HEAP_VADDR + NEW_LINEAR_HEAP_SIZE, }; u8 Read8(VAddr addr); @@ -122,6 +131,17 @@ void WriteBlock(VAddr addr, const u8* data, size_t size); u8* GetPointer(VAddr virtual_address); /** +* Converts a virtual address inside a region with 1:1 mapping to physical memory to a physical +* address. This should be used by services to translate addresses for use by the hardware. +*/ +PAddr VirtualToPhysicalAddress(VAddr addr); + +/** +* Undoes a mapping performed by VirtualToPhysicalAddress(). +*/ +VAddr PhysicalToVirtualAddress(PAddr addr); + +/** * Gets a pointer to the memory region beginning at the specified physical address. * * @note This is currently implemented using PhysicalToVirtualAddress(). diff --git a/src/core/memory_setup.h b/src/core/memory_setup.h index 361bfc816..84ff30120 100644 --- a/src/core/memory_setup.h +++ b/src/core/memory_setup.h @@ -10,9 +10,6 @@ namespace Memory { -const u32 PAGE_MASK = PAGE_SIZE - 1; -const int PAGE_BITS = 12; - void InitMemoryMap(); /** diff --git a/src/core/system.cpp b/src/core/system.cpp index 561ff82f0..3cd84bf5e 100644 --- a/src/core/system.cpp +++ b/src/core/system.cpp @@ -4,11 +4,11 @@ #include "core/core.h" #include "core/core_timing.h" -#include "core/mem_map.h" #include "core/system.h" #include "core/hw/hw.h" #include "core/hle/hle.h" #include "core/hle/kernel/kernel.h" +#include "core/hle/kernel/memory.h" #include "video_core/video_core.h" @@ -29,7 +29,6 @@ void Shutdown() { HLE::Shutdown(); Kernel::Shutdown(); HW::Shutdown(); - Memory::Shutdown(); CoreTiming::Shutdown(); Core::Shutdown(); } diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index d82e20f86..a78985510 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -4,6 +4,7 @@ #include <boost/range/algorithm/fill.hpp> +#include "common/microprofile.h" #include "common/profiler.h" #include "core/hle/service/gsp_gpu.h" @@ -43,6 +44,8 @@ static const u32 expand_bits_to_bytes[] = { 0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff }; +MICROPROFILE_DEFINE(GPU_Drawing, "GPU", "Drawing", MP_RGB(50, 50, 240)); + static void WritePicaReg(u32 id, u32 value, u32 mask) { auto& regs = g_state.regs; @@ -126,6 +129,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX(trigger_draw_indexed): { Common::Profiling::ScopeTimer scope_timer(category_drawing); + MICROPROFILE_SCOPE(GPU_Drawing); #if PICA_LOG_TEV DebugUtils::DumpTevStageConfig(regs.GetTevStages()); diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp index 8ad77f0c8..059445f7d 100644 --- a/src/video_core/debug_utils/debug_utils.cpp +++ b/src/video_core/debug_utils/debug_utils.cpp @@ -25,6 +25,8 @@ #include "common/math_util.h" #include "common/vector_math.h" +#include "core/settings.h" + #include "video_core/pica.h" #include "video_core/renderer_base.h" #include "video_core/utils.h" @@ -45,8 +47,10 @@ void DebugContext::OnEvent(Event event, void* data) { { std::unique_lock<std::mutex> lock(breakpoint_mutex); - // Commit the hardware renderer's framebuffer so it will show on debug widgets - VideoCore::g_renderer->hw_rasterizer->CommitFramebuffer(); + if (Settings::values.use_hw_renderer) { + // Commit the hardware renderer's framebuffer so it will show on debug widgets + VideoCore::g_renderer->hw_rasterizer->CommitFramebuffer(); + } // TODO: Should stop the CPU thread here once we multithread emulation. diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 87cf705e7..f40684d83 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -1033,12 +1033,20 @@ struct float24 { return ret; } + static float24 Zero() { + return FromFloat32(0.f); + } + // Not recommended for anything but logging float ToFloat32() const { return value; } float24 operator * (const float24& flt) const { + if ((this->value == 0.f && !std::isnan(flt.value)) || + (flt.value == 0.f && !std::isnan(this->value))) + // PICA gives 0 instead of NaN when multiplying by inf + return Zero(); return float24::FromFloat32(ToFloat32() * flt.ToFloat32()); } @@ -1055,7 +1063,11 @@ struct float24 { } float24& operator *= (const float24& flt) { - value *= flt.ToFloat32(); + if ((this->value == 0.f && !std::isnan(flt.value)) || + (flt.value == 0.f && !std::isnan(this->value))) + // PICA gives 0 instead of NaN when multiplying by inf + *this = Zero(); + else value *= flt.ToFloat32(); return *this; } diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index b6c0e1bff..77eadda9e 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -7,6 +7,7 @@ #include "common/color.h" #include "common/common_types.h" #include "common/math_util.h" +#include "common/microprofile.h" #include "common/profiler.h" #include "core/hw/gpu.h" @@ -286,6 +287,7 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1, }; static Common::Profiling::TimingCategory rasterization_category("Rasterization"); +MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240)); /** * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing @@ -298,6 +300,7 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, { const auto& regs = g_state.regs; Common::Profiling::ScopeTimer timer(rasterization_category); + MICROPROFILE_SCOPE(GPU_Rasterization); // vertex positions in rasterizer coordinates static auto FloatToFix = [](float24 flt) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index c3829d5c6..d29049508 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -7,6 +7,7 @@ #include "common/color.h" #include "common/math_util.h" +#include "common/microprofile.h" #include "common/profiler.h" #include "core/hw/gpu.h" @@ -230,8 +231,8 @@ void RasterizerOpenGL::DrawTriangles() { u32 cur_fb_depth_size = Pica::Regs::BytesPerDepthPixel(regs.framebuffer.depth_format) * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight(); - res_cache.NotifyFlush(cur_fb_color_addr, cur_fb_color_size); - res_cache.NotifyFlush(cur_fb_depth_addr, cur_fb_depth_size); + res_cache.NotifyFlush(cur_fb_color_addr, cur_fb_color_size, true); + res_cache.NotifyFlush(cur_fb_depth_addr, cur_fb_depth_size, true); } void RasterizerOpenGL::CommitFramebuffer() { @@ -786,12 +787,16 @@ void RasterizerOpenGL::SyncDrawState() { state.Apply(); } +MICROPROFILE_DEFINE(OpenGL_FramebufferReload, "OpenGL", "FB Reload", MP_RGB(70, 70, 200)); + void RasterizerOpenGL::ReloadColorBuffer() { u8* color_buffer = Memory::GetPhysicalPointer(Pica::g_state.regs.framebuffer.GetColorBufferPhysicalAddress()); if (color_buffer == nullptr) return; + MICROPROFILE_SCOPE(OpenGL_FramebufferReload); + u32 bytes_per_pixel = Pica::Regs::BytesPerColorPixel(fb_color_texture.format); std::unique_ptr<u8[]> temp_fb_color_buffer(new u8[fb_color_texture.width * fb_color_texture.height * bytes_per_pixel]); @@ -831,6 +836,8 @@ void RasterizerOpenGL::ReloadDepthBuffer() { if (depth_buffer == nullptr) return; + MICROPROFILE_SCOPE(OpenGL_FramebufferReload); + u32 bytes_per_pixel = Pica::Regs::BytesPerDepthPixel(fb_depth_texture.format); // OpenGL needs 4 bpp alignment for D24 @@ -884,6 +891,7 @@ void RasterizerOpenGL::ReloadDepthBuffer() { } Common::Profiling::TimingCategory buffer_commit_category("Framebuffer Commit"); +MICROPROFILE_DEFINE(OpenGL_FramebufferCommit, "OpenGL", "FB Commit", MP_RGB(70, 70, 200)); void RasterizerOpenGL::CommitColorBuffer() { if (last_fb_color_addr != 0) { @@ -891,6 +899,7 @@ void RasterizerOpenGL::CommitColorBuffer() { if (color_buffer != nullptr) { Common::Profiling::ScopeTimer timer(buffer_commit_category); + MICROPROFILE_SCOPE(OpenGL_FramebufferCommit); u32 bytes_per_pixel = Pica::Regs::BytesPerColorPixel(fb_color_texture.format); @@ -927,6 +936,7 @@ void RasterizerOpenGL::CommitDepthBuffer() { if (depth_buffer != nullptr) { Common::Profiling::ScopeTimer timer(buffer_commit_category); + MICROPROFILE_SCOPE(OpenGL_FramebufferCommit); u32 bytes_per_pixel = Pica::Regs::BytesPerDepthPixel(fb_depth_texture.format); diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index 70f0ba5f1..1e38c2e6d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -2,8 +2,10 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/hash.h" #include "common/make_unique.h" #include "common/math_util.h" +#include "common/microprofile.h" #include "common/vector_math.h" #include "core/memory.h" @@ -16,15 +18,18 @@ RasterizerCacheOpenGL::~RasterizerCacheOpenGL() { FullFlush(); } +MICROPROFILE_DEFINE(OpenGL_TextureUpload, "OpenGL", "Texture Upload", MP_RGB(128, 64, 192)); + void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned texture_unit, const Pica::Regs::FullTextureConfig& config) { PAddr texture_addr = config.config.GetPhysicalAddress(); - const auto cached_texture = texture_cache.find(texture_addr); if (cached_texture != texture_cache.end()) { state.texture_units[texture_unit].texture_2d = cached_texture->second->texture.handle; state.Apply(); } else { + MICROPROFILE_SCOPE(OpenGL_TextureUpload); + std::unique_ptr<CachedTexture> new_texture = Common::make_unique<CachedTexture>(); new_texture->texture.Create(); @@ -46,12 +51,14 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text } const auto info = Pica::DebugUtils::TextureInfo::FromPicaRegister(config.config, config.format); + u8* texture_src_data = Memory::GetPhysicalPointer(texture_addr); new_texture->width = info.width; new_texture->height = info.height; - new_texture->size = info.width * info.height * Pica::Regs::NibblesPerPixel(info.format); + new_texture->size = info.stride * info.height; + new_texture->addr = texture_addr; + new_texture->hash = Common::ComputeHash64(texture_src_data, new_texture->size); - u8* texture_src_data = Memory::GetPhysicalPointer(texture_addr); std::unique_ptr<Math::Vec4<u8>[]> temp_texture_buffer_rgba(new Math::Vec4<u8>[info.width * info.height]); for (int y = 0; y < info.height; ++y) { @@ -66,12 +73,18 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text } } -void RasterizerCacheOpenGL::NotifyFlush(PAddr addr, u32 size) { +void RasterizerCacheOpenGL::NotifyFlush(PAddr addr, u32 size, bool ignore_hash) { // Flush any texture that falls in the flushed region // TODO: Optimize by also inserting upper bound (addr + size) of each texture into the same map and also narrow using lower_bound auto cache_upper_bound = texture_cache.upper_bound(addr + size); + for (auto it = texture_cache.begin(); it != cache_upper_bound;) { - if (MathUtil::IntervalsIntersect(addr, size, it->first, it->second->size)) { + const auto& info = *it->second; + + // Flush the texture only if the memory region intersects and a change is detected + if (MathUtil::IntervalsIntersect(addr, size, info.addr, info.size) && + (ignore_hash || info.hash != Common::ComputeHash64(Memory::GetPhysicalPointer(info.addr), info.size))) { + it = texture_cache.erase(it); } else { ++it; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index 96f3a925c..d8f9edf59 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -19,7 +19,7 @@ public: void LoadAndBindTexture(OpenGLState &state, unsigned texture_unit, const Pica::Regs::FullTextureConfig& config); /// Flush any cached resource that touches the flushed region - void NotifyFlush(PAddr addr, u32 size); + void NotifyFlush(PAddr addr, u32 size, bool ignore_hash = false); /// Flush all cached OpenGL resources tracked by this cache manager void FullFlush(); @@ -30,6 +30,8 @@ private: GLuint width; GLuint height; u32 size; + u64 hash; + PAddr addr; }; std::map<PAddr, std::unique_ptr<CachedTexture>> texture_cache; diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 4e9836c80..f89117521 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -9,6 +9,7 @@ #include "common/hash.h" #include "common/make_unique.h" +#include "common/microprofile.h" #include "common/profiler.h" #include "video_core/debug_utils/debug_utils.h" @@ -51,15 +52,19 @@ void Setup(UnitState<false>& state) { } void Shutdown() { +#ifdef ARCHITECTURE_x86_64 shader_map.clear(); +#endif // ARCHITECTURE_x86_64 } static Common::Profiling::TimingCategory shader_category("Vertex Shader"); +MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240)); OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) { auto& config = g_state.regs.vs; Common::Profiling::ScopeTimer timer(shader_category); + MICROPROFILE_SCOPE(GPU_VertexShader); state.program_counter = config.main_offset; state.debug.max_offset = 0; diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index e14de0768..69e4efa68 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) { if (!swizzle.DestComponentEnabled(i)) continue; - dest[i] = std::max(src1[i], src2[i]); + // NOTE: Exact form required to match NaN semantics to hardware: + // max(0, NaN) -> NaN + // max(NaN, 0) -> 0 + dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i]; } Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; @@ -190,19 +193,29 @@ void RunInterpreter(UnitState<Debug>& state) { if (!swizzle.DestComponentEnabled(i)) continue; - dest[i] = std::min(src1[i], src2[i]); + // NOTE: Exact form required to match NaN semantics to hardware: + // min(0, NaN) -> NaN + // min(NaN, 0) -> 0 + dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i]; } Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; case OpCode::Id::DP3: case OpCode::Id::DP4: + case OpCode::Id::DPH: + case OpCode::Id::DPHI: { Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + + OpCode::Id opcode = instr.opcode.Value().EffectiveOpCode(); + if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI) + src1[3] = float24::FromFloat32(1.0f); + float24 dot = float24::FromFloat32(0.f); - int num_components = (instr.opcode.Value() == OpCode::Id::DP3) ? 3 : 4; + int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4; for (int i = 0; i < num_components; ++i) dot = dot + src1[i] * src2[i]; @@ -221,13 +234,12 @@ void RunInterpreter(UnitState<Debug>& state) { { Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32()); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; - // TODO: Be stable against division by zero! - // TODO: I think this might be wrong... we should only use one component here - dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32()); + dest[i] = rcp_res; } Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; @@ -238,13 +250,12 @@ void RunInterpreter(UnitState<Debug>& state) { { Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + float24 rsq_res = float24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32())); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; - // TODO: Be stable against division by zero! - // TODO: I think this might be wrong... we should only use one component here - dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32())); + dest[i] = rsq_res; } Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; @@ -278,6 +289,20 @@ void RunInterpreter(UnitState<Debug>& state) { break; } + case OpCode::Id::SGE: + case OpCode::Id::SGEI: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f); + } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); + break; + case OpCode::Id::SLT: case OpCode::Id::SLTI: Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); @@ -334,6 +359,42 @@ void RunInterpreter(UnitState<Debug>& state) { Record<DebugDataRecord::CMP_RESULT>(state.debug, iteration, state.conditional_code); break; + case OpCode::Id::EX2: + { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + + // EX2 only takes first component exp2 and writes it to all dest components + float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32())); + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = ex2_res; + } + + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); + break; + } + + case OpCode::Id::LG2: + { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + + // LG2 only takes the first component log2 and writes it to all dest components + float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32())); + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = lg2_res; + } + + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); + break; + } + default: LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x", (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex); diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 836942c6b..d3cfe109e 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -23,14 +23,14 @@ const JitFunction instr_table[64] = { &JitCompiler::Compile_ADD, // add &JitCompiler::Compile_DP3, // dp3 &JitCompiler::Compile_DP4, // dp4 - nullptr, // dph + &JitCompiler::Compile_DPH, // dph nullptr, // unknown - nullptr, // ex2 - nullptr, // lg2 + &JitCompiler::Compile_EX2, // ex2 + &JitCompiler::Compile_LG2, // lg2 nullptr, // unknown &JitCompiler::Compile_MUL, // mul - nullptr, // lge - nullptr, // slt + &JitCompiler::Compile_SGE, // sge + &JitCompiler::Compile_SLT, // slt &JitCompiler::Compile_FLR, // flr &JitCompiler::Compile_MAX, // max &JitCompiler::Compile_MIN, // min @@ -44,10 +44,10 @@ const JitFunction instr_table[64] = { nullptr, // unknown nullptr, // unknown nullptr, // unknown - nullptr, // dphi + &JitCompiler::Compile_DPH, // dphi nullptr, // unknown - nullptr, // sgei - &JitCompiler::Compile_SLTI, // slti + &JitCompiler::Compile_SGE, // sgei + &JitCompiler::Compile_SLT, // slti nullptr, // unknown nullptr, // unknown nullptr, // unknown @@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1; static const X64Reg SRC2 = XMM2; /// Loaded with the third swizzled source register, otherwise can be used as a scratch register static const X64Reg SRC3 = XMM3; +/// Additional scratch register +static const X64Reg SCRATCH2 = XMM4; /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one static const X64Reg ONE = XMM14; /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR @@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); BLENDPS(SCRATCH, R(src), mask); } else { - MOVAPS(XMM4, R(src)); - UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination + MOVAPS(SCRATCH2, R(src)); + UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination // Compute selector to selectively copy source components to destination for SHUFPS instruction @@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); - SHUFPS(SCRATCH, R(XMM4), sel); + SHUFPS(SCRATCH, R(SCRATCH2), sel); } // Store dest back to memory @@ -244,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { } } +void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { + MOVAPS(scratch, R(src1)); + CMPPS(scratch, R(src2), CMP_ORD); + + MULPS(src1, R(src2)); + + MOVAPS(src2, R(src1)); + CMPPS(src2, R(src2), CMP_UNORD); + + XORPS(scratch, R(src2)); + ANDPS(src1, R(scratch)); +} + void JitCompiler::Compile_EvaluateCondition(Instruction instr) { // Note: NXOR is used below to check for equality switch (instr.flow_control.op) { @@ -280,6 +295,22 @@ void JitCompiler::Compile_UniformCondition(Instruction instr) { CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); } +void JitCompiler::Compile_PushCallerSavedXMM() { +#ifndef _WIN32 + SUB(64, R(RSP), Imm8(2 * 16)); + MOVUPS(MDisp(RSP, 16), ONE); + MOVUPS(MDisp(RSP, 0), NEGBIT); +#endif +} + +void JitCompiler::Compile_PopCallerSavedXMM() { +#ifndef _WIN32 + MOVUPS(NEGBIT, MDisp(RSP, 0)); + MOVUPS(ONE, MDisp(RSP, 16)); + ADD(64, R(RSP), Imm8(2 * 16)); +#endif +} + void JitCompiler::Compile_ADD(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); @@ -291,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - if (Common::GetCPUCaps().sse4_1) { - DPPS(SRC1, R(SRC2), 0x7f); - } else { - MULPS(SRC1, R(SRC2)); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); - MOVAPS(SRC3, R(SRC1)); - SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); + MOVAPS(SRC3, R(SRC1)); + SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); - ADDPS(SRC1, R(SRC2)); - ADDPS(SRC1, R(SRC3)); - } + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); + ADDPS(SRC1, R(SRC2)); + ADDPS(SRC1, R(SRC3)); Compile_DestEnable(instr, SRC1); } @@ -314,27 +341,117 @@ void JitCompiler::Compile_DP4(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + ADDPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + ADDPS(SRC1, R(SRC2)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_DPH(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + if (Common::GetCPUCaps().sse4_1) { - DPPS(SRC1, R(SRC2), 0xff); + // Set 4th component to 1.0 + BLENDPS(SRC1, R(ONE), 0x8); // 0b1000 } else { - MULPS(SRC1, R(SRC2)); + // Set 4th component to 1.0 + MOVAPS(SCRATCH, R(SRC1)); + UNPCKHPS(SCRATCH, R(ONE)); // XYZW, 1111 -> Z1__ + UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1 + } - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY - ADDPS(SRC1, R(SRC2)); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX - ADDPS(SRC1, R(SRC2)); - } + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + ADDPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + ADDPS(SRC1, R(SRC2)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_EX2(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + MOVSS(XMM0, R(SRC1)); + // The following will actually break the stack alignment + ABI_PushAllCallerSavedRegsAndAdjustStack(); + Compile_PushCallerSavedXMM(); + ABI_CallFunction(reinterpret_cast<const void*>(exp2f)); + Compile_PopCallerSavedXMM(); + ABI_PopAllCallerSavedRegsAndAdjustStack(); + + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); + MOVAPS(SRC1, R(XMM0)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_LG2(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + MOVSS(XMM0, R(SRC1)); + + // The following will actually break the stack alignment + ABI_PushAllCallerSavedRegsAndAdjustStack(); + Compile_PushCallerSavedXMM(); + ABI_CallFunction(reinterpret_cast<const void*>(log2f)); + Compile_PopCallerSavedXMM(); + ABI_PopAllCallerSavedRegsAndAdjustStack(); + + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); + MOVAPS(SRC1, R(XMM0)); Compile_DestEnable(instr, SRC1); } void JitCompiler::Compile_MUL(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - MULPS(SRC1, R(SRC2)); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_SGE(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + + CMPPS(SRC1, R(SRC2), CMP_NLT); + ANDPS(SRC1, R(ONE)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_SLT(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + + CMPPS(SRC1, R(SRC2), CMP_LT); + ANDPS(SRC1, R(ONE)); + Compile_DestEnable(instr, SRC1); } @@ -354,6 +471,7 @@ void JitCompiler::Compile_FLR(Instruction instr) { void JitCompiler::Compile_MAX(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. MAXPS(SRC1, R(SRC2)); Compile_DestEnable(instr, SRC1); } @@ -361,6 +479,7 @@ void JitCompiler::Compile_MAX(Instruction instr) { void JitCompiler::Compile_MIN(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. MINPS(SRC1, R(SRC2)); Compile_DestEnable(instr, SRC1); } @@ -374,8 +493,8 @@ void JitCompiler::Compile_MOVA(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - // Convert floats to integers (only care about X and Y components) - CVTPS2DQ(SRC1, R(SRC1)); + // Convert floats to integers using truncation (only care about X and Y components) + CVTTPS2DQ(SRC1, R(SRC1)); // Get result MOVQ_xmm(R(RAX), SRC1); @@ -415,22 +534,13 @@ void JitCompiler::Compile_MOV(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_SLTI(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 1, instr.common.src2i, SRC2); - - CMPSS(SRC1, R(SRC2), CMP_LT); - ANDPS(SRC1, R(ONE)); - - Compile_DestEnable(instr, SRC1); -} - void JitCompiler::Compile_RCP(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica + // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica // performs this operation more accurately. This should be checked on hardware. - RCPPS(SRC1, R(SRC1)); + RCPSS(SRC1, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX Compile_DestEnable(instr, SRC1); } @@ -438,9 +548,10 @@ void JitCompiler::Compile_RCP(Instruction instr) { void JitCompiler::Compile_RSQ(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica + // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica // performs this operation more accurately. This should be checked on hardware. - RSQRTPS(SRC1, R(SRC1)); + RSQRTSS(SRC1, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX Compile_DestEnable(instr, SRC1); } @@ -475,27 +586,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) { } void JitCompiler::Compile_CMP(Instruction instr) { + using Op = Instruction::Common::CompareOpType::Op; + Op op_x = instr.common.compare_op.x; + Op op_y = instr.common.compare_op.y; + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; + // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to + // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here + // because they don't match when used with NaNs. + static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE }; + + bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); + Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1; + Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2; - if (instr.common.compare_op.x == instr.common.compare_op.y) { + if (op_x == op_y) { // Compare X-component and Y-component together - CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); + CMPPS(lhs_x, R(rhs_x), cmp[op_x]); + MOVQ_xmm(R(COND0), lhs_x); - MOVQ_xmm(R(COND0), SRC1); MOV(64, R(COND1), R(COND0)); } else { + bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); + Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1; + Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2; + // Compare X-component - MOVAPS(SCRATCH, R(SRC1)); - CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); + MOVAPS(SCRATCH, R(lhs_x)); + CMPSS(SCRATCH, R(rhs_x), cmp[op_x]); // Compare Y-component - CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); + CMPPS(lhs_y, R(rhs_y), cmp[op_y]); MOVQ_xmm(R(COND0), SCRATCH); - MOVQ_xmm(R(COND1), SRC1); + MOVQ_xmm(R(COND1), lhs_y); } SHR(32, R(COND0), Imm8(31)); @@ -513,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) { Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); } - if (Common::GetCPUCaps().fma) { - VFMADD213PS(SRC1, SRC2, R(SRC3)); - } else { - MULPS(SRC1, R(SRC2)); - ADDPS(SRC1, R(SRC3)); - } + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + ADDPS(SRC1, R(SRC3)); Compile_DestEnable(instr, SRC1); } @@ -646,12 +768,12 @@ CompiledShader* JitCompiler::Compile() { // Used to set a register to one static const __m128 one = { 1.f, 1.f, 1.f, 1.f }; MOV(PTRBITS, R(RAX), ImmPtr(&one)); - MOVAPS(ONE, MDisp(RAX, 0)); + MOVAPS(ONE, MatR(RAX)); // Used to negate registers static const __m128 neg = { -0.f, -0.f, -0.f, -0.f }; MOV(PTRBITS, R(RAX), ImmPtr(&neg)); - MOVAPS(NEGBIT, MDisp(RAX, 0)); + MOVAPS(NEGBIT, MatR(RAX)); looping = false; diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index b88f2a0d2..58828ecc8 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -37,7 +37,12 @@ public: void Compile_ADD(Instruction instr); void Compile_DP3(Instruction instr); void Compile_DP4(Instruction instr); + void Compile_DPH(Instruction instr); + void Compile_EX2(Instruction instr); + void Compile_LG2(Instruction instr); void Compile_MUL(Instruction instr); + void Compile_SGE(Instruction instr); + void Compile_SLT(Instruction instr); void Compile_FLR(Instruction instr); void Compile_MAX(Instruction instr); void Compile_MIN(Instruction instr); @@ -45,7 +50,6 @@ public: void Compile_RSQ(Instruction instr); void Compile_MOVA(Instruction instr); void Compile_MOV(Instruction instr); - void Compile_SLTI(Instruction instr); void Compile_NOP(Instruction instr); void Compile_END(Instruction instr); void Compile_CALL(Instruction instr); @@ -64,9 +68,18 @@ private: void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); + /** + * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying + * zero by inf. Clobbers `src2` and `scratch`. + */ + void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch); + void Compile_EvaluateCondition(Instruction instr); void Compile_UniformCondition(Instruction instr); + void Compile_PushCallerSavedXMM(); + void Compile_PopCallerSavedXMM(); + /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks. unsigned* offset_ptr = nullptr; |