summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/citra_qt/debugger/graphics_cmdlists.cpp 44
-rw-r--r-- src/citra_qt/debugger/graphics_vertex_shader.cpp 256
-rw-r--r-- src/citra_qt/debugger/graphics_vertex_shader.h 51
-rw-r--r-- src/citra_qt/main.cpp 97
-rw-r--r-- src/citra_qt/main.h 25
-rw-r--r-- src/citra_qt/main.ui 7
-rw-r--r-- src/common/color.h 18
-rw-r--r-- src/common/common_funcs.h 12
-rw-r--r-- src/common/file_util.h 2
-rw-r--r-- src/common/x64/emitter.cpp 746
-rw-r--r-- src/common/x64/emitter.h 849
-rw-r--r-- src/core/arm/skyeye_common/vfp/vfpdouble.cpp 9
-rw-r--r-- src/core/arm/skyeye_common/vfp/vfpsingle.cpp 9
-rw-r--r-- src/core/hle/service/gsp_gpu.cpp 25
-rw-r--r-- src/core/hle/service/gsp_gpu.h 11
-rw-r--r-- src/core/hw/gpu.cpp 69
-rw-r--r-- src/core/hw/gpu.h 32
-rw-r--r-- src/core/loader/loader.cpp 6
-rw-r--r-- src/video_core/command_processor.cpp 33
-rw-r--r-- src/video_core/debug_utils/debug_utils.cpp 105
-rw-r--r-- src/video_core/debug_utils/debug_utils.h 19
-rw-r--r-- src/video_core/pica.h 14
-rw-r--r-- src/video_core/shader/shader.cpp 55
-rw-r--r-- src/video_core/shader/shader.h 197
-rw-r--r-- src/video_core/shader/shader_interpreter.cpp 154
-rw-r--r-- src/video_core/shader/shader_interpreter.h 3
-rw-r--r-- src/video_core/shader/shader_jit_x64.cpp 108
-rw-r--r-- src/video_core/shader/shader_jit_x64.h 8
28 files changed, 1905 insertions, 1059 deletions
diff --git a/src/citra_qt/debugger/graphics_cmdlists.cpp b/src/citra_qt/debugger/graphics_cmdlists.cpp
index 7ac3ea542..35a3140b2 100644
--- a/src/citra_qt/debugger/graphics_cmdlists.cpp
+++ b/src/citra_qt/debugger/graphics_cmdlists.cpp
@@ -73,7 +73,7 @@ TextureInfoDockWidget::TextureInfoDockWidget(const Pica::DebugUtils::TextureInfo
format_choice->addItem(tr("RGB565"));
format_choice->addItem(tr("RGBA4"));
format_choice->addItem(tr("IA8"));
- format_choice->addItem(tr("UNK6"));
+ format_choice->addItem(tr("RG8"));
format_choice->addItem(tr("I8"));
format_choice->addItem(tr("A8"));
format_choice->addItem(tr("IA4"));
@@ -175,29 +175,29 @@ int GPUCommandListModel::rowCount(const QModelIndex& parent) const {
}
int GPUCommandListModel::columnCount(const QModelIndex& parent) const {
- return 3;
+ return 4;
}
QVariant GPUCommandListModel::data(const QModelIndex& index, int role) const {
if (!index.isValid())
return QVariant();
- const auto& writes = pica_trace.writes;
- const Pica::CommandProcessor::CommandHeader cmd{writes[index.row()].Id()};
- const u32 val{writes[index.row()].Value()};
+ const auto& write = pica_trace.writes[index.row()];
if (role == Qt::DisplayRole) {
QString content;
switch ( index.column() ) {
case 0:
- return QString::fromLatin1(Pica::Regs::GetCommandName(cmd.cmd_id).c_str());
+ return QString::fromLatin1(Pica::Regs::GetCommandName(write.cmd_id).c_str());
case 1:
- return QString("%1").arg(cmd.cmd_id, 3, 16, QLatin1Char('0'));
+ return QString("%1").arg(write.cmd_id, 3, 16, QLatin1Char('0'));
case 2:
- return QString("%1").arg(val, 8, 16, QLatin1Char('0'));
+ return QString("%1").arg(write.mask, 4, 2, QLatin1Char('0'));
+ case 3:
+ return QString("%1").arg(write.value, 8, 16, QLatin1Char('0'));
}
} else if (role == CommandIdRole) {
- return QVariant::fromValue<int>(cmd.cmd_id.Value());
+ return QVariant::fromValue<int>(write.cmd_id);
}
return QVariant();
@@ -213,6 +213,8 @@ QVariant GPUCommandListModel::headerData(int section, Qt::Orientation orientatio
case 1:
return tr("Register");
case 2:
+ return tr("Mask");
+ case 3:
return tr("New Value");
}
@@ -260,7 +262,7 @@ void GPUCommandListWidget::OnCommandDoubleClicked(const QModelIndex& index) {
}
void GPUCommandListWidget::SetCommandInfo(const QModelIndex& index) {
- QWidget* new_info_widget;
+ QWidget* new_info_widget = nullptr;
const unsigned int command_id = list_widget->model()->data(index, GPUCommandListModel::CommandIdRole).toUInt();
if (COMMAND_IN_RANGE(command_id, texture0) ||
@@ -281,14 +283,15 @@ void GPUCommandListWidget::SetCommandInfo(const QModelIndex& index) {
auto info = Pica::DebugUtils::TextureInfo::FromPicaRegister(config, format);
u8* src = Memory::GetPhysicalPointer(config.GetPhysicalAddress());
new_info_widget = new TextureInfoWidget(src, info);
- } else {
- new_info_widget = new QWidget;
}
-
- widget()->layout()->removeWidget(command_info_widget);
- delete command_info_widget;
- widget()->layout()->addWidget(new_info_widget);
- command_info_widget = new_info_widget;
+ if (command_info_widget) {
+ delete command_info_widget;
+ command_info_widget = nullptr;
+ }
+ if (new_info_widget) {
+ widget()->layout()->addWidget(new_info_widget);
+ command_info_widget = new_info_widget;
+ }
}
#undef COMMAND_IN_RANGE
@@ -300,7 +303,9 @@ GPUCommandListWidget::GPUCommandListWidget(QWidget* parent) : QDockWidget(tr("Pi
list_widget = new QTreeView;
list_widget->setModel(model);
- list_widget->setFont(QFont("monospace"));
+ QFont font("monospace");
+ font.setStyleHint(QFont::Monospace); // Automatic fallback to a monospace font on platforms without a font called "monospace"
+ list_widget->setFont(font);
list_widget->setRootIsDecorated(false);
list_widget->setUniformRowHeights(true);
@@ -324,7 +329,7 @@ GPUCommandListWidget::GPUCommandListWidget(QWidget* parent) : QDockWidget(tr("Pi
connect(copy_all, SIGNAL(clicked()), this, SLOT(CopyAllToClipboard()));
- command_info_widget = new QWidget;
+ command_info_widget = nullptr;
QVBoxLayout* main_layout = new QVBoxLayout;
main_layout->addWidget(list_widget);
@@ -334,7 +339,6 @@ GPUCommandListWidget::GPUCommandListWidget(QWidget* parent) : QDockWidget(tr("Pi
sub_layout->addWidget(copy_all);
main_layout->addLayout(sub_layout);
}
- main_layout->addWidget(command_info_widget);
main_widget->setLayout(main_layout);
setWidget(main_widget);
diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp
index 302e22d7a..0c17edee0 100644
--- a/src/citra_qt/debugger/graphics_vertex_shader.cpp
+++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp
@@ -6,9 +6,16 @@
#include <sstream>
#include <QBoxLayout>
+#include <QFileDialog>
+#include <QGroupBox>
+#include <QLabel>
+#include <QLineEdit>
+#include <QPushButton>
+#include <QSignalMapper>
+#include <QSpinBox>
#include <QTreeView>
-#include "video_core/shader/shader_interpreter.h"
+#include "video_core/shader/shader.h"
#include "graphics_vertex_shader.h"
@@ -17,7 +24,7 @@ using nihstro::Instruction;
using nihstro::SourceRegister;
using nihstro::SwizzlePattern;
-GraphicsVertexShaderModel::GraphicsVertexShaderModel(QObject* parent): QAbstractItemModel(parent) {
+GraphicsVertexShaderModel::GraphicsVertexShaderModel(GraphicsVertexShaderWidget* parent): QAbstractItemModel(parent), par(parent) {
}
@@ -34,7 +41,7 @@ int GraphicsVertexShaderModel::columnCount(const QModelIndex& parent) const {
}
int GraphicsVertexShaderModel::rowCount(const QModelIndex& parent) const {
- return static_cast<int>(info.code.size());
+ return static_cast<int>(par->info.code.size());
}
QVariant GraphicsVertexShaderModel::headerData(int section, Qt::Orientation orientation, int role) const {
@@ -62,21 +69,21 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con
{
switch (index.column()) {
case 0:
- if (info.HasLabel(index.row()))
- return QString::fromStdString(info.GetLabel(index.row()));
+ if (par->info.HasLabel(index.row()))
+ return QString::fromStdString(par->info.GetLabel(index.row()));
return QString("%1").arg(4*index.row(), 4, 16, QLatin1Char('0'));
case 1:
- return QString("%1").arg(info.code[index.row()].hex, 8, 16, QLatin1Char('0'));
+ return QString("%1").arg(par->info.code[index.row()].hex, 8, 16, QLatin1Char('0'));
case 2:
{
std::stringstream output;
output.flags(std::ios::hex);
- Instruction instr = info.code[index.row()];
- const SwizzlePattern& swizzle = info.swizzle_info[instr.common.operand_desc_id].pattern;
+ Instruction instr = par->info.code[index.row()];
+ const SwizzlePattern& swizzle = par->info.swizzle_info[instr.common.operand_desc_id].pattern;
// longest known instruction name: "setemit "
output << std::setw(8) << std::left << instr.opcode.Value().GetInfo().name;
@@ -130,13 +137,13 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con
print_input_indexed_compact(output, src1, swizzle.negate_src1, swizzle.SelectorToString(false).substr(0,1), instr.common.AddressRegisterName());
output << " " << instr.common.compare_op.ToString(instr.common.compare_op.x) << " ";
- print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(false).substr(0,1));
+ print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(true).substr(0,1));
output << ", ";
print_input_indexed_compact(output, src1, swizzle.negate_src1, swizzle.SelectorToString(false).substr(1,1), instr.common.AddressRegisterName());
output << " " << instr.common.compare_op.ToString(instr.common.compare_op.y) << " ";
- print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(false).substr(1,1));
+ print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(true).substr(1,1));
break;
}
@@ -167,7 +174,7 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con
// TODO: In some cases, the Address Register is used as an index for SRC2 instead of SRC1
if (instr.opcode.Value().GetInfo().subtype & OpCode::Info::Src2) {
SourceRegister src2 = instr.common.GetSrc2(src_is_inverted);
- print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(false));
+ print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(true));
}
break;
}
@@ -240,6 +247,18 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con
case Qt::FontRole:
return QFont("monospace");
+ case Qt::BackgroundRole:
+ // Highlight instructions which have no debug data associated to them
+ for (const auto& record : par->debug_data.records)
+ if (index.row() == record.instruction_offset)
+ return QVariant();
+
+ return QBrush(QColor(255, 255, 127));
+
+
+ // TODO: Draw arrows for each "reachable" instruction to visualize control flow
+
+
default:
break;
}
@@ -247,53 +266,232 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con
return QVariant();
}
-void GraphicsVertexShaderModel::OnUpdate()
-{
- beginResetModel();
-
- info.Clear();
-
- for (auto instr : Pica::g_state.vs.program_code)
- info.code.push_back({instr});
+void GraphicsVertexShaderWidget::DumpShader() {
+ QString filename = QFileDialog::getSaveFileName(this, tr("Save Shader Dump"), "shader_dump.shbin",
+ tr("Shader Binary (*.shbin)"));
- for (auto pattern : Pica::g_state.vs.swizzle_data)
- info.swizzle_info.push_back({pattern});
+ if (filename.isEmpty()) {
+ // If the user canceled the dialog, don't dump anything.
+ return;
+ }
- info.labels.insert({ Pica::g_state.regs.vs.main_offset, "main" });
+ auto& setup = Pica::g_state.vs;
+ auto& config = Pica::g_state.regs.vs;
- endResetModel();
+ Pica::DebugUtils::DumpShader(filename.toStdString(), config, setup, Pica::g_state.regs.vs_output_attributes);
}
-
GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::DebugContext > debug_context,
QWidget* parent)
: BreakPointObserverDock(debug_context, "Pica Vertex Shader", parent) {
setObjectName("PicaVertexShader");
- auto binary_model = new GraphicsVertexShaderModel(this);
- auto binary_list = new QTreeView;
- binary_list->setModel(binary_model);
+ auto input_data_mapper = new QSignalMapper(this);
+
+ // TODO: Support inputting data in hexadecimal raw format
+ for (unsigned i = 0; i < ARRAY_SIZE(input_data); ++i) {
+ input_data[i] = new QLineEdit;
+ input_data[i]->setValidator(new QDoubleValidator(input_data[i]));
+ }
+
+ breakpoint_warning = new QLabel(tr("(data only available at VertexLoaded breakpoints)"));
+
+ // TODO: Add some button for jumping to the shader entry point
+
+ model = new GraphicsVertexShaderModel(this);
+ binary_list = new QTreeView;
+ binary_list->setModel(model);
binary_list->setRootIsDecorated(false);
binary_list->setAlternatingRowColors(true);
- connect(this, SIGNAL(Update()), binary_model, SLOT(OnUpdate()));
+ auto dump_shader = new QPushButton(QIcon::fromTheme("document-save"), tr("Dump"));
+
+ instruction_description = new QLabel;
+
+ cycle_index = new QSpinBox;
+
+ connect(this, SIGNAL(SelectCommand(const QModelIndex&, QItemSelectionModel::SelectionFlags)),
+ binary_list->selectionModel(), SLOT(select(const QModelIndex&, QItemSelectionModel::SelectionFlags)));
+
+ connect(dump_shader, SIGNAL(clicked()), this, SLOT(DumpShader()));
+
+ connect(cycle_index, SIGNAL(valueChanged(int)), this, SLOT(OnCycleIndexChanged(int)));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(input_data); ++i) {
+ connect(input_data[i], SIGNAL(textEdited(const QString&)), input_data_mapper, SLOT(map()));
+ input_data_mapper->setMapping(input_data[i], i);
+ }
+ connect(input_data_mapper, SIGNAL(mapped(int)), this, SLOT(OnInputAttributeChanged(int)));
auto main_widget = new QWidget;
auto main_layout = new QVBoxLayout;
{
+ auto input_data_group = new QGroupBox(tr("Input Data"));
+
+ // For each vertex attribute, add a QHBoxLayout consisting of:
+ // - A QLabel denoting the source attribute index
+ // - Four QLineEdits for showing and manipulating attribute data
+ // - A QLabel denoting the shader input attribute index
+ auto sub_layout = new QVBoxLayout;
+ for (unsigned i = 0; i < 16; ++i) {
+ // Create an HBoxLayout to store the widgets used to specify a particular attribute
+ // and store it in a QWidget to allow for easy hiding and unhiding.
+ auto row_layout = new QHBoxLayout;
+ row_layout->addWidget(new QLabel(tr("Attribute %1").arg(i, 2)));
+ for (unsigned comp = 0; comp < 4; ++comp)
+ row_layout->addWidget(input_data[4 * i + comp]);
+
+ row_layout->addWidget(input_data_mapping[i] = new QLabel);
+
+ input_data_container[i] = new QWidget;
+ input_data_container[i]->setLayout(row_layout);
+ input_data_container[i]->hide();
+
+ sub_layout->addWidget(input_data_container[i]);
+ }
+
+ sub_layout->addWidget(breakpoint_warning);
+ breakpoint_warning->hide();
+
+ input_data_group->setLayout(sub_layout);
+ main_layout->addWidget(input_data_group);
+ }
+ {
auto sub_layout = new QHBoxLayout;
sub_layout->addWidget(binary_list);
main_layout->addLayout(sub_layout);
}
+ main_layout->addWidget(dump_shader);
+ {
+ auto sub_layout = new QHBoxLayout;
+ sub_layout->addWidget(new QLabel(tr("Cycle Index:")));
+ sub_layout->addWidget(cycle_index);
+ main_layout->addLayout(sub_layout);
+ }
+ main_layout->addWidget(instruction_description);
+ main_layout->addStretch();
main_widget->setLayout(main_layout);
setWidget(main_widget);
+
+ widget()->setEnabled(false);
}
void GraphicsVertexShaderWidget::OnBreakPointHit(Pica::DebugContext::Event event, void* data) {
- emit Update();
+ auto input = static_cast<Pica::Shader::InputVertex*>(data);
+ if (event == Pica::DebugContext::Event::VertexLoaded) {
+ Reload(true, data);
+ } else {
+ // No vertex data is retrievable => invalidate currently stored vertex data
+ Reload(true, nullptr);
+ }
widget()->setEnabled(true);
}
+void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_data) {
+ model->beginResetModel();
+
+ if (replace_vertex_data) {
+ if (vertex_data) {
+ memcpy(&input_vertex, vertex_data, sizeof(input_vertex));
+ for (unsigned attr = 0; attr < 16; ++attr) {
+ for (unsigned comp = 0; comp < 4; ++comp) {
+ input_data[4 * attr + comp]->setText(QString("%1").arg(input_vertex.attr[attr][comp].ToFloat32()));
+ }
+ }
+ breakpoint_warning->hide();
+ } else {
+ for (unsigned attr = 0; attr < 16; ++attr) {
+ for (unsigned comp = 0; comp < 4; ++comp) {
+ input_data[4 * attr + comp]->setText(QString("???"));
+ }
+ }
+ breakpoint_warning->show();
+ }
+ }
+
+ // Reload shader code
+ info.Clear();
+
+ auto& shader_setup = Pica::g_state.vs;
+ auto& shader_config = Pica::g_state.regs.vs;
+ for (auto instr : shader_setup.program_code)
+ info.code.push_back({instr});
+
+ for (auto pattern : shader_setup.swizzle_data)
+ info.swizzle_info.push_back({pattern});
+
+ u32 entry_point = Pica::g_state.regs.vs.main_offset;
+ info.labels.insert({ entry_point, "main" });
+
+ // Generate debug information
+ debug_data = Pica::Shader::ProduceDebugInfo(input_vertex, 1, shader_config, shader_setup);
+
+ // Reload widget state
+
+ // Only show input attributes which are used as input to the shader
+ for (unsigned int attr = 0; attr < 16; ++attr) {
+ input_data_container[attr]->setVisible(false);
+ }
+ for (unsigned int attr = 0; attr < Pica::g_state.regs.vertex_attributes.GetNumTotalAttributes(); ++attr) {
+ unsigned source_attr = shader_config.input_register_map.GetRegisterForAttribute(attr);
+ input_data_mapping[source_attr]->setText(QString("-> v%1").arg(attr));
+ input_data_container[source_attr]->setVisible(true);
+ }
+
+ // Initialize debug info text for current cycle count
+ cycle_index->setMaximum(debug_data.records.size() - 1);
+ OnCycleIndexChanged(cycle_index->value());
+
+ model->endResetModel();
+}
+
void GraphicsVertexShaderWidget::OnResumed() {
widget()->setEnabled(false);
}
+
+void GraphicsVertexShaderWidget::OnInputAttributeChanged(int index) {
+ float value = input_data[index]->text().toFloat();
+ Reload();
+}
+
+void GraphicsVertexShaderWidget::OnCycleIndexChanged(int index) {
+ QString text;
+
+ auto& record = debug_data.records[index];
+ if (record.mask & Pica::Shader::DebugDataRecord::SRC1)
+ text += tr("SRC1: %1, %2, %3, %4\n").arg(record.src1.x.ToFloat32()).arg(record.src1.y.ToFloat32()).arg(record.src1.z.ToFloat32()).arg(record.src1.w.ToFloat32());
+ if (record.mask & Pica::Shader::DebugDataRecord::SRC2)
+ text += tr("SRC2: %1, %2, %3, %4\n").arg(record.src2.x.ToFloat32()).arg(record.src2.y.ToFloat32()).arg(record.src2.z.ToFloat32()).arg(record.src2.w.ToFloat32());
+ if (record.mask & Pica::Shader::DebugDataRecord::SRC3)
+ text += tr("SRC3: %1, %2, %3, %4\n").arg(record.src3.x.ToFloat32()).arg(record.src3.y.ToFloat32()).arg(record.src3.z.ToFloat32()).arg(record.src3.w.ToFloat32());
+ if (record.mask & Pica::Shader::DebugDataRecord::DEST_IN)
+ text += tr("DEST_IN: %1, %2, %3, %4\n").arg(record.dest_in.x.ToFloat32()).arg(record.dest_in.y.ToFloat32()).arg(record.dest_in.z.ToFloat32()).arg(record.dest_in.w.ToFloat32());
+ if (record.mask & Pica::Shader::DebugDataRecord::DEST_OUT)
+ text += tr("DEST_OUT: %1, %2, %3, %4\n").arg(record.dest_out.x.ToFloat32()).arg(record.dest_out.y.ToFloat32()).arg(record.dest_out.z.ToFloat32()).arg(record.dest_out.w.ToFloat32());
+
+ if (record.mask & Pica::Shader::DebugDataRecord::ADDR_REG_OUT)
+ text += tr("Addres Registers: %1, %2\n").arg(record.address_registers[0]).arg(record.address_registers[1]);
+ if (record.mask & Pica::Shader::DebugDataRecord::CMP_RESULT)
+ text += tr("Compare Result: %1, %2\n").arg(record.conditional_code[0] ? "true" : "false").arg(record.conditional_code[1] ? "true" : "false");
+
+ if (record.mask & Pica::Shader::DebugDataRecord::COND_BOOL_IN)
+ text += tr("Static Condition: %1\n").arg(record.cond_bool ? "true" : "false");
+ if (record.mask & Pica::Shader::DebugDataRecord::COND_CMP_IN)
+ text += tr("Dynamic Conditions: %1, %2\n").arg(record.cond_cmp[0] ? "true" : "false").arg(record.cond_cmp[1] ? "true" : "false");
+ if (record.mask & Pica::Shader::DebugDataRecord::LOOP_INT_IN)
+ text += tr("Loop Parameters: %1 (repeats), %2 (initializer), %3 (increment), %4\n").arg(record.loop_int.x).arg(record.loop_int.y).arg(record.loop_int.z).arg(record.loop_int.w);
+
+ text += tr("Instruction offset: 0x%1").arg(4 * record.instruction_offset, 4, 16, QLatin1Char('0'));
+ if (record.mask & Pica::Shader::DebugDataRecord::NEXT_INSTR) {
+ text += tr(" -> 0x%2").arg(4 * record.next_instruction, 4, 16, QLatin1Char('0'));
+ } else {
+ text += tr(" (last instruction)");
+ }
+
+ instruction_description->setText(text);
+
+ // Scroll to current instruction
+ const QModelIndex& instr_index = model->index(record.instruction_offset, 0);
+ emit SelectCommand(instr_index, QItemSelectionModel::ClearAndSelect | QItemSelectionModel::Rows);
+ binary_list->scrollTo(instr_index, QAbstractItemView::EnsureVisible);
+}
diff --git a/src/citra_qt/debugger/graphics_vertex_shader.h b/src/citra_qt/debugger/graphics_vertex_shader.h
index 38339dc05..1b3f1f7ec 100644
--- a/src/citra_qt/debugger/graphics_vertex_shader.h
+++ b/src/citra_qt/debugger/graphics_vertex_shader.h
@@ -10,11 +10,18 @@
#include "nihstro/parser_shbin.h"
+#include "video_core/shader/shader.h"
+
+class QLabel;
+class QSpinBox;
+
+class GraphicsVertexShaderWidget;
+
class GraphicsVertexShaderModel : public QAbstractItemModel {
Q_OBJECT
public:
- GraphicsVertexShaderModel(QObject* parent);
+ GraphicsVertexShaderModel(GraphicsVertexShaderWidget* parent);
QModelIndex index(int row, int column, const QModelIndex& parent = QModelIndex()) const override;
QModelIndex parent(const QModelIndex& child) const override;
@@ -23,11 +30,10 @@ public:
QVariant data(const QModelIndex& index, int role = Qt::DisplayRole) const override;
QVariant headerData(int section, Qt::Orientation orientation, int role = Qt::DisplayRole) const override;
-public slots:
- void OnUpdate();
-
private:
- nihstro::ShaderInfo info;
+ GraphicsVertexShaderWidget* par;
+
+ friend class GraphicsVertexShaderWidget;
};
class GraphicsVertexShaderWidget : public BreakPointObserverDock {
@@ -43,9 +49,42 @@ private slots:
void OnBreakPointHit(Pica::DebugContext::Event event, void* data) override;
void OnResumed() override;
+ void OnInputAttributeChanged(int index);
+
+ void OnCycleIndexChanged(int index);
+
+ void DumpShader();
+
+ /**
+ * Reload widget based on the current PICA200 state
+ * @param replace_vertex_data If true, invalidate all current vertex data
+ * @param vertex_data New vertex data to use, as passed to OnBreakPointHit. May be nullptr to specify that no valid vertex data can be retrieved currently. Only used if replace_vertex_data is true.
+ */
+ void Reload(bool replace_vertex_data = false, void* vertex_data = nullptr);
+
+
signals:
- void Update();
+ // Call this to change the current command selection in the disassembly view
+ void SelectCommand(const QModelIndex&, QItemSelectionModel::SelectionFlags);
private:
+ QLabel* instruction_description;
+ QTreeView* binary_list;
+ GraphicsVertexShaderModel* model;
+
+ /// TODO: Move these into a single struct
+ std::array<QLineEdit*, 4*16> input_data; // A text box for each of the 4 components of up to 16 vertex attributes
+ std::array<QWidget*, 16> input_data_container; // QWidget containing the QLayout containing each vertex attribute
+ std::array<QLabel*, 16> input_data_mapping; // A QLabel denoting the shader input attribute which the vertex attribute maps to
+
+ // Text to be shown when input vertex data is not retrievable
+ QLabel* breakpoint_warning;
+
+ QSpinBox* cycle_index;
+
+ nihstro::ShaderInfo info;
+ Pica::Shader::DebugData<true> debug_data;
+ Pica::Shader::InputVertex input_vertex;
+ friend class GraphicsVertexShaderModel;
};
diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp
index 4c3edf87a..8bf2a3e13 100644
--- a/src/citra_qt/main.cpp
+++ b/src/citra_qt/main.cpp
@@ -7,6 +7,7 @@
#include <QtGui>
#include <QDesktopWidget>
#include <QFileDialog>
+#include <QMessageBox>
#include "qhexedit.h"
#include "main.h"
@@ -140,6 +141,16 @@ GMainWindow::GMainWindow() : emu_thread(nullptr)
ui.actionDisplay_widget_title_bars->setChecked(settings.value("displayTitleBars", true).toBool());
OnDisplayTitleBars(ui.actionDisplay_widget_title_bars->isChecked());
+ // Prepare actions for recent files
+ for (int i = 0; i < max_recent_files_item; ++i) {
+ actions_recent_files[i] = new QAction(this);
+ actions_recent_files[i]->setVisible(false);
+ connect(actions_recent_files[i], SIGNAL(triggered()), this, SLOT(OnMenuRecentFile()));
+
+ ui.menu_recent_files->addAction(actions_recent_files[i]);
+ }
+ UpdateRecentFiles();
+
// Setup connections
connect(ui.action_Load_File, SIGNAL(triggered()), this, SLOT(OnMenuLoadFile()));
connect(ui.action_Load_Symbol_Map, SIGNAL(triggered()), this, SLOT(OnMenuLoadSymbolMap()));
@@ -213,6 +224,10 @@ void GMainWindow::OnDisplayTitleBars(bool show)
void GMainWindow::BootGame(const std::string& filename) {
LOG_INFO(Frontend, "Citra starting...\n");
+ // Shutdown previous session if the emu thread is still active...
+ if (emu_thread != nullptr)
+ ShutdownGame();
+
// Initialize the core emulation
System::Init(render_window);
@@ -272,18 +287,51 @@ void GMainWindow::ShutdownGame() {
render_window->hide();
}
-void GMainWindow::OnMenuLoadFile()
+void GMainWindow::StoreRecentFile(const QString& filename)
{
QSettings settings;
+ QStringList recent_files = settings.value("recentFiles").toStringList();
+ recent_files.prepend(filename);
+ recent_files.removeDuplicates();
+ settings.setValue("recentFiles", recent_files);
+
+ UpdateRecentFiles();
+}
+
+void GMainWindow::UpdateRecentFiles() {
+ QSettings settings;
+ QStringList recent_files = settings.value("recentFiles").toStringList();
+
+ unsigned int num_recent_files = std::min(recent_files.size(), static_cast<int>(max_recent_files_item));
+
+ for (unsigned int i = 0; i < num_recent_files; i++) {
+ QString text = QString("&%1. %2").arg(i + 1).arg(QFileInfo(recent_files[i]).fileName());
+ actions_recent_files[i]->setText(text);
+ actions_recent_files[i]->setData(recent_files[i]);
+ actions_recent_files[i]->setToolTip(recent_files[i]);
+ actions_recent_files[i]->setVisible(true);
+ }
+
+ for (int j = num_recent_files; j < max_recent_files_item; ++j) {
+ actions_recent_files[j]->setVisible(false);
+ }
+
+ // Grey out the recent files menu if the list is empty
+ if (num_recent_files == 0) {
+ ui.menu_recent_files->setEnabled(false);
+ } else {
+ ui.menu_recent_files->setEnabled(true);
+ }
+}
+
+void GMainWindow::OnMenuLoadFile() {
+ QSettings settings;
QString rom_path = settings.value("romsPath", QString()).toString();
QString filename = QFileDialog::getOpenFileName(this, tr("Load File"), rom_path, tr("3DS executable (*.3ds *.3dsx *.elf *.axf *.cci *.cxi)"));
if (filename.size()) {
settings.setValue("romsPath", QFileInfo(filename).path());
-
- // Shutdown previous session if the emu thread is still active...
- if (emu_thread != nullptr)
- ShutdownGame();
+ StoreRecentFile(filename);
BootGame(filename.toLatin1().data());
}
@@ -301,8 +349,28 @@ void GMainWindow::OnMenuLoadSymbolMap() {
}
}
-void GMainWindow::OnStartGame()
-{
+void GMainWindow::OnMenuRecentFile() {
+ QAction* action = qobject_cast<QAction*>(sender());
+ assert(action);
+
+ QString filename = action->data().toString();
+ QFileInfo file_info(filename);
+ if (file_info.exists()) {
+ BootGame(filename.toLatin1().data());
+ StoreRecentFile(filename); // Put the filename on top of the list
+ } else {
+ // Display an error message and remove the file from the list.
+ QMessageBox::information(this, tr("File not found"), tr("File \"%1\" not found").arg(filename));
+
+ QSettings settings;
+ QStringList recent_files = settings.value("recentFiles").toStringList();
+ recent_files.removeOne(filename);
+ settings.setValue("recentFiles", recent_files);
+ UpdateRecentFiles();
+ }
+}
+
+void GMainWindow::OnStartGame() {
emu_thread->SetRunning(true);
ui.action_Start->setEnabled(false);
@@ -312,8 +380,7 @@ void GMainWindow::OnStartGame()
ui.action_Stop->setEnabled(true);
}
-void GMainWindow::OnPauseGame()
-{
+void GMainWindow::OnPauseGame() {
emu_thread->SetRunning(false);
ui.action_Start->setEnabled(true);
@@ -325,8 +392,7 @@ void GMainWindow::OnStopGame() {
ShutdownGame();
}
-void GMainWindow::OnOpenHotkeysDialog()
-{
+void GMainWindow::OnOpenHotkeysDialog() {
GHotkeysDialog dialog(this);
dialog.exec();
}
@@ -358,13 +424,11 @@ void GMainWindow::ToggleWindowMode() {
}
}
-void GMainWindow::OnConfigure()
-{
+void GMainWindow::OnConfigure() {
//GControllerConfigDialog* dialog = new GControllerConfigDialog(controller_ports, this);
}
-void GMainWindow::closeEvent(QCloseEvent* event)
-{
+void GMainWindow::closeEvent(QCloseEvent* event) {
// Save window layout
QSettings settings(QSettings::IniFormat, QSettings::UserScope, "Citra team", "Citra");
settings.setValue("geometry", saveGeometry());
@@ -388,8 +452,7 @@ void GMainWindow::closeEvent(QCloseEvent* event)
#undef main
#endif
-int main(int argc, char* argv[])
-{
+int main(int argc, char* argv[]) {
Log::Filter log_filter(Log::Level::Info);
Log::SetFilter(&log_filter);
diff --git a/src/citra_qt/main.h b/src/citra_qt/main.h
index 61114a04d..6f1292295 100644
--- a/src/citra_qt/main.h
+++ b/src/citra_qt/main.h
@@ -24,6 +24,8 @@ class GMainWindow : public QMainWindow
{
Q_OBJECT
+ static const int max_recent_files_item = 10; ///< Max number of recently loaded items to keep track
+
// TODO: Make use of this!
enum {
UI_IDLE,
@@ -58,6 +60,26 @@ private:
void BootGame(const std::string& filename);
void ShutdownGame();
+ /**
+ * Stores the filename in the recently loaded files list.
+ * The new filename is stored at the beginning of the recently loaded files list.
+ * After inserting the new entry, duplicates are removed meaning that if
+ * this was inserted from \a OnMenuRecentFile(), the entry will be put on top
+ * and removed from its previous position.
+ *
+ * Finally, this function calls \a UpdateRecentFiles() to update the UI.
+ *
+ * @param filename the filename to store
+ */
+ void StoreRecentFile(const QString& filename);
+
+ /**
+ * Updates the recent files menu.
+ * Menu entries are rebuilt from the configuration file.
+ * If there is no entry in the menu, the menu is greyed out.
+ */
+ void UpdateRecentFiles();
+
void closeEvent(QCloseEvent* event) override;
private slots:
@@ -66,6 +88,7 @@ private slots:
void OnStopGame();
void OnMenuLoadFile();
void OnMenuLoadSymbolMap();
+ void OnMenuRecentFile();
void OnOpenHotkeysDialog();
void OnConfigure();
void OnDisplayTitleBars(bool);
@@ -86,6 +109,8 @@ private:
CallstackWidget* callstackWidget;
GPUCommandStreamWidget* graphicsWidget;
GPUCommandListWidget* graphicsCommandsWidget;
+
+ QAction* actions_recent_files[max_recent_files_item];
};
#endif // _CITRA_QT_MAIN_HXX_
diff --git a/src/citra_qt/main.ui b/src/citra_qt/main.ui
index b2ce8167d..1ba700a3a 100644
--- a/src/citra_qt/main.ui
+++ b/src/citra_qt/main.ui
@@ -52,9 +52,16 @@
<property name="title">
<string>&amp;File</string>
</property>
+ <widget class="QMenu" name="menu_recent_files">
+ <property name="title">
+ <string>Recent Files</string>
+ </property>
+ </widget>
<addaction name="action_Load_File"/>
<addaction name="action_Load_Symbol_Map"/>
<addaction name="separator"/>
+ <addaction name="menu_recent_files"/>
+ <addaction name="separator"/>
<addaction name="action_Exit"/>
</widget>
<widget class="QMenu" name="menu_Emulation">
diff --git a/src/common/color.h b/src/common/color.h
index 9dafdca0c..eb199e308 100644
--- a/src/common/color.h
+++ b/src/common/color.h
@@ -69,6 +69,15 @@ inline const Math::Vec4<u8> DecodeRGB8(const u8* bytes) {
}
/**
+ * Decode a color stored in RG8 (aka HILO8) format
+ * @param bytes Pointer to encoded source color
+ * @return Result color decoded as Math::Vec4<u8>
+ */
+inline const Math::Vec4<u8> DecodeRG8(const u8* bytes) {
+ return { bytes[1], bytes[0], 0, 255 };
+}
+
+/**
* Decode a color stored in RGB565 format
* @param bytes Pointer to encoded source color
* @return Result color decoded as Math::Vec4<u8>
@@ -152,6 +161,15 @@ inline void EncodeRGB8(const Math::Vec4<u8>& color, u8* bytes) {
}
/**
+ * Encode a color as RG8 (aka HILO8) format
+ * @param color Source color to encode
+ * @param bytes Destination pointer to store encoded color
+ */
+inline void EncodeRG8(const Math::Vec4<u8>& color, u8* bytes) {
+ bytes[1] = color.r();
+ bytes[0] = color.g();
+}
+/**
* Encode a color as RGB565 format
* @param color Source color to encode
* @param bytes Destination pointer to store encoded color
diff --git a/src/common/common_funcs.h b/src/common/common_funcs.h
index 88e452a16..ed20c3629 100644
--- a/src/common/common_funcs.h
+++ b/src/common/common_funcs.h
@@ -45,14 +45,20 @@
// GCC 4.8 defines all the rotate functions now
// Small issue with GCC's lrotl/lrotr intrinsics is they are still 32bit while we require 64bit
-#ifndef _rotl
-inline u32 _rotl(u32 x, int shift) {
+#ifdef _rotl
+#define rotl _rotl
+#else
+inline u32 rotl(u32 x, int shift) {
shift &= 31;
if (!shift) return x;
return (x << shift) | (x >> (32 - shift));
}
+#endif
-inline u32 _rotr(u32 x, int shift) {
+#ifdef _rotr
+#define rotr _rotr
+#else
+inline u32 rotr(u32 x, int shift) {
shift &= 31;
if (!shift) return x;
return (x >> shift) | (x << (32 - shift));
diff --git a/src/common/file_util.h b/src/common/file_util.h
index d0dccdf69..e71a9b2fa 100644
--- a/src/common/file_util.h
+++ b/src/common/file_util.h
@@ -244,7 +244,7 @@ private:
template <typename T>
void OpenFStream(T& fstream, const std::string& filename, std::ios_base::openmode openmode)
{
-#ifdef _WIN32
+#ifdef _MSC_VER
fstream.open(Common::UTF8ToTStr(filename).c_str(), openmode);
#else
fstream.open(filename.c_str(), openmode);
diff --git a/src/common/x64/emitter.cpp b/src/common/x64/emitter.cpp
index 4b79acd1f..cf31f8d69 100644
--- a/src/common/x64/emitter.cpp
+++ b/src/common/x64/emitter.cpp
@@ -15,6 +15,7 @@
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
+#include <cinttypes>
#include <cstring>
#include "common/assert.h"
@@ -25,11 +26,6 @@
#include "cpu_detect.h"
#include "emitter.h"
-#define PRIx64 "llx"
-
-// Minimize the diff against Dolphin
-#define DYNA_REC JIT
-
namespace Gen
{
@@ -374,7 +370,7 @@ void XEmitter::Rex(int w, int r, int x, int b)
Write8(rx);
}
-void XEmitter::JMP(const u8 *addr, bool force5Bytes)
+void XEmitter::JMP(const u8* addr, bool force5Bytes)
{
u64 fn = (u64)addr;
if (!force5Bytes)
@@ -398,7 +394,7 @@ void XEmitter::JMP(const u8 *addr, bool force5Bytes)
}
}
-void XEmitter::JMPptr(const OpArg &arg2)
+void XEmitter::JMPptr(const OpArg& arg2)
{
OpArg arg = arg2;
if (arg.IsImm()) ASSERT_MSG(0, "JMPptr - Imm argument");
@@ -425,7 +421,7 @@ void XEmitter::CALLptr(OpArg arg)
arg.WriteRest(this);
}
-void XEmitter::CALL(const void *fnptr)
+void XEmitter::CALL(const void* fnptr)
{
u64 distance = u64(fnptr) - (u64(code) + 5);
ASSERT_MSG(
@@ -496,7 +492,7 @@ void XEmitter::J_CC(CCFlags conditionCode, const u8* addr, bool force5bytes)
}
}
-void XEmitter::SetJumpTarget(const FixupBranch &branch)
+void XEmitter::SetJumpTarget(const FixupBranch& branch)
{
if (branch.type == 0)
{
@@ -512,30 +508,6 @@ void XEmitter::SetJumpTarget(const FixupBranch &branch)
}
}
-// INC/DEC considered harmful on newer CPUs due to partial flag set.
-// Use ADD, SUB instead.
-
-/*
-void XEmitter::INC(int bits, OpArg arg)
-{
- if (arg.IsImm()) ASSERT_MSG(0, "INC - Imm argument");
- arg.operandReg = 0;
- if (bits == 16) {Write8(0x66);}
- arg.WriteRex(this, bits, bits);
- Write8(bits == 8 ? 0xFE : 0xFF);
- arg.WriteRest(this);
-}
-void XEmitter::DEC(int bits, OpArg arg)
-{
- if (arg.IsImm()) ASSERT_MSG(0, "DEC - Imm argument");
- arg.operandReg = 1;
- if (bits == 16) {Write8(0x66);}
- arg.WriteRex(this, bits, bits);
- Write8(bits == 8 ? 0xFE : 0xFF);
- arg.WriteRest(this);
-}
-*/
-
//Single byte opcodes
//There is no PUSHAD/POPAD in 64-bit mode.
void XEmitter::INT3() {Write8(0xCC);}
@@ -667,7 +639,7 @@ void XEmitter::CBW(int bits)
void XEmitter::PUSH(X64Reg reg) {WriteSimple1Byte(32, 0x50, reg);}
void XEmitter::POP(X64Reg reg) {WriteSimple1Byte(32, 0x58, reg);}
-void XEmitter::PUSH(int bits, const OpArg &reg)
+void XEmitter::PUSH(int bits, const OpArg& reg)
{
if (reg.IsSimpleReg())
PUSH(reg.GetSimpleReg());
@@ -703,7 +675,7 @@ void XEmitter::PUSH(int bits, const OpArg &reg)
}
}
-void XEmitter::POP(int /*bits*/, const OpArg &reg)
+void XEmitter::POP(int /*bits*/, const OpArg& reg)
{
if (reg.IsSimpleReg())
POP(reg.GetSimpleReg());
@@ -791,12 +763,12 @@ void XEmitter::WriteMulDivType(int bits, OpArg src, int ext)
src.WriteRest(this);
}
-void XEmitter::MUL(int bits, OpArg src) {WriteMulDivType(bits, src, 4);}
-void XEmitter::DIV(int bits, OpArg src) {WriteMulDivType(bits, src, 6);}
-void XEmitter::IMUL(int bits, OpArg src) {WriteMulDivType(bits, src, 5);}
-void XEmitter::IDIV(int bits, OpArg src) {WriteMulDivType(bits, src, 7);}
-void XEmitter::NEG(int bits, OpArg src) {WriteMulDivType(bits, src, 3);}
-void XEmitter::NOT(int bits, OpArg src) {WriteMulDivType(bits, src, 2);}
+void XEmitter::MUL(int bits, const OpArg& src) {WriteMulDivType(bits, src, 4);}
+void XEmitter::DIV(int bits, const OpArg& src) {WriteMulDivType(bits, src, 6);}
+void XEmitter::IMUL(int bits, const OpArg& src) {WriteMulDivType(bits, src, 5);}
+void XEmitter::IDIV(int bits, const OpArg& src) {WriteMulDivType(bits, src, 7);}
+void XEmitter::NEG(int bits, const OpArg& src) {WriteMulDivType(bits, src, 3);}
+void XEmitter::NOT(int bits, const OpArg& src) {WriteMulDivType(bits, src, 2);}
void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep)
{
@@ -813,24 +785,24 @@ void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bo
src.WriteRest(this);
}
-void XEmitter::MOVNTI(int bits, OpArg dest, X64Reg src)
+void XEmitter::MOVNTI(int bits, const OpArg& dest, X64Reg src)
{
if (bits <= 16)
ASSERT_MSG(0, "MOVNTI - bits<=16");
WriteBitSearchType(bits, src, dest, 0xC3);
}
-void XEmitter::BSF(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBC);} //bottom bit to top bit
-void XEmitter::BSR(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBD);} //top bit to bottom bit
+void XEmitter::BSF(int bits, X64Reg dest, const OpArg& src) {WriteBitSearchType(bits,dest,src,0xBC);} // Bottom bit to top bit
+void XEmitter::BSR(int bits, X64Reg dest, const OpArg& src) {WriteBitSearchType(bits,dest,src,0xBD);} // Top bit to bottom bit
-void XEmitter::TZCNT(int bits, X64Reg dest, OpArg src)
+void XEmitter::TZCNT(int bits, X64Reg dest, const OpArg& src)
{
CheckFlags();
if (!Common::GetCPUCaps().bmi1)
ASSERT_MSG(0, "Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
WriteBitSearchType(bits, dest, src, 0xBC, true);
}
-void XEmitter::LZCNT(int bits, X64Reg dest, OpArg src)
+void XEmitter::LZCNT(int bits, X64Reg dest, const OpArg& src)
{
CheckFlags();
if (!Common::GetCPUCaps().lzcnt)
@@ -950,7 +922,7 @@ void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
}
//shift can be either imm8 or cl
-void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext)
+void XEmitter::WriteShift(int bits, OpArg dest, const OpArg& shift, int ext)
{
CheckFlags();
bool writeImm = false;
@@ -991,16 +963,16 @@ void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext)
// large rotates and shift are slower on intel than amd
// intel likes to rotate by 1, and the op is smaller too
-void XEmitter::ROL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 0);}
-void XEmitter::ROR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 1);}
-void XEmitter::RCL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 2);}
-void XEmitter::RCR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 3);}
-void XEmitter::SHL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 4);}
-void XEmitter::SHR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 5);}
-void XEmitter::SAR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 7);}
+void XEmitter::ROL(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 0);}
+void XEmitter::ROR(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 1);}
+void XEmitter::RCL(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 2);}
+void XEmitter::RCR(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 3);}
+void XEmitter::SHL(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 4);}
+void XEmitter::SHR(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 5);}
+void XEmitter::SAR(int bits, const OpArg& dest, const OpArg& shift) {WriteShift(bits, dest, shift, 7);}
// index can be either imm8 or register, don't use memory destination because it's slow
-void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext)
+void XEmitter::WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext)
{
CheckFlags();
if (dest.IsImm())
@@ -1029,13 +1001,13 @@ void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext)
}
}
-void XEmitter::BT(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 4);}
-void XEmitter::BTS(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 5);}
-void XEmitter::BTR(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 6);}
-void XEmitter::BTC(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 7);}
+void XEmitter::BT(int bits, const OpArg& dest, const OpArg& index) {WriteBitTest(bits, dest, index, 4);}
+void XEmitter::BTS(int bits, const OpArg& dest, const OpArg& index) {WriteBitTest(bits, dest, index, 5);}
+void XEmitter::BTR(int bits, const OpArg& dest, const OpArg& index) {WriteBitTest(bits, dest, index, 6);}
+void XEmitter::BTC(int bits, const OpArg& dest, const OpArg& index) {WriteBitTest(bits, dest, index, 7);}
//shift can be either imm8 or cl
-void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift)
+void XEmitter::SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift)
{
CheckFlags();
if (dest.IsImm())
@@ -1067,7 +1039,7 @@ void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift)
}
}
-void XEmitter::SHLD(int bits, OpArg dest, OpArg src, OpArg shift)
+void XEmitter::SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift)
{
CheckFlags();
if (dest.IsImm())
@@ -1111,7 +1083,7 @@ void OpArg::WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg _operandReg, int bit
}
//operand can either be immediate or register
-void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const
+void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg& operand, int bits) const
{
X64Reg _operandReg;
if (IsImm())
@@ -1257,7 +1229,7 @@ void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &o
}
}
-void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2)
+void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2)
{
if (a1.IsImm())
{
@@ -1283,24 +1255,24 @@ void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg
}
}
-void XEmitter::ADD (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADD, a1, a2);}
-void XEmitter::ADC (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADC, a1, a2);}
-void XEmitter::SUB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSUB, a1, a2);}
-void XEmitter::SBB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSBB, a1, a2);}
-void XEmitter::AND (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmAND, a1, a2);}
-void XEmitter::OR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmOR , a1, a2);}
-void XEmitter::XOR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmXOR, a1, a2);}
-void XEmitter::MOV (int bits, const OpArg &a1, const OpArg &a2)
+void XEmitter::ADD (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADD, a1, a2);}
+void XEmitter::ADC (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADC, a1, a2);}
+void XEmitter::SUB (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSUB, a1, a2);}
+void XEmitter::SBB (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSBB, a1, a2);}
+void XEmitter::AND (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmAND, a1, a2);}
+void XEmitter::OR (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmOR , a1, a2);}
+void XEmitter::XOR (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmXOR, a1, a2);}
+void XEmitter::MOV (int bits, const OpArg& a1, const OpArg& a2)
{
if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg())
LOG_ERROR(Common, "Redundant MOV @ %p - bug in JIT?", code);
WriteNormalOp(this, bits, nrmMOV, a1, a2);
}
-void XEmitter::TEST(int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmTEST, a1, a2);}
-void XEmitter::CMP (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmCMP, a1, a2);}
-void XEmitter::XCHG(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXCHG, a1, a2);}
+void XEmitter::TEST(int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmTEST, a1, a2);}
+void XEmitter::CMP (int bits, const OpArg& a1, const OpArg& a2) {CheckFlags(); WriteNormalOp(this, bits, nrmCMP, a1, a2);}
+void XEmitter::XCHG(int bits, const OpArg& a1, const OpArg& a2) {WriteNormalOp(this, bits, nrmXCHG, a1, a2);}
-void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2)
+void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a1, const OpArg& a2)
{
CheckFlags();
if (bits == 8)
@@ -1353,7 +1325,7 @@ void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2)
}
}
-void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a)
+void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a)
{
CheckFlags();
if (bits == 8)
@@ -1390,7 +1362,7 @@ void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extr
arg.WriteRest(this, extrabytes);
}
-void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes)
{
WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes);
}
@@ -1400,25 +1372,25 @@ static int GetVEXmmmmm(u16 op)
// Currently, only 0x38 and 0x3A are used as secondary escape byte.
if ((op >> 8) == 0x3A)
return 3;
- else if ((op >> 8) == 0x38)
+ if ((op >> 8) == 0x38)
return 2;
- else
- return 1;
+
+ return 1;
}
static int GetVEXpp(u8 opPrefix)
{
if (opPrefix == 0x66)
return 1;
- else if (opPrefix == 0xF3)
+ if (opPrefix == 0xF3)
return 2;
- else if (opPrefix == 0xF2)
+ if (opPrefix == 0xF2)
return 3;
- else
- return 0;
+
+ return 0;
}
-void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes)
{
if (!Common::GetCPUCaps().avx)
ASSERT_MSG(0, "Trying to use AVX on a system that doesn't support it. Bad programmer.");
@@ -1431,7 +1403,7 @@ void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpA
}
// Like the above, but more general; covers GPR-based VEX operations, like BMI1/2
-void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes)
{
if (size != 32 && size != 64)
ASSERT_MSG(0, "VEX GPR instructions only support 32-bit and 64-bit modes!");
@@ -1442,7 +1414,7 @@ void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg r
arg.WriteRest(this, extrabytes, regOp1);
}
-void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes)
{
CheckFlags();
if (!Common::GetCPUCaps().bmi1)
@@ -1450,7 +1422,7 @@ void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg
WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
}
-void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes)
{
CheckFlags();
if (!Common::GetCPUCaps().bmi2)
@@ -1517,135 +1489,135 @@ void XEmitter::WriteMXCSR(OpArg arg, int ext)
arg.WriteRest(this);
}
-void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);}
-void XEmitter::LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);}
-
-void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);}
-void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);}
-void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);}
-
-void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseADD, regOp, arg);}
-void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseADD, regOp, arg);}
-void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSUB, regOp, arg);}
-void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSUB, regOp, arg);}
-void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);}
-void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);}
-void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMUL, regOp, arg);}
-void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMUL, regOp, arg);}
-void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseDIV, regOp, arg);}
-void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseDIV, regOp, arg);}
-void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMIN, regOp, arg);}
-void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMIN, regOp, arg);}
-void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMAX, regOp, arg);}
-void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMAX, regOp, arg);}
-void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSQRT, regOp, arg);}
-void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSQRT, regOp, arg);}
-void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);}
-
-void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseADD, regOp, arg);}
-void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseADD, regOp, arg);}
-void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSUB, regOp, arg);}
-void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSUB, regOp, arg);}
-void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);}
-void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);}
-void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseAND, regOp, arg);}
-void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseAND, regOp, arg);}
-void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseANDN, regOp, arg);}
-void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseANDN, regOp, arg);}
-void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseOR, regOp, arg);}
-void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseOR, regOp, arg);}
-void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseXOR, regOp, arg);}
-void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseXOR, regOp, arg);}
-void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMUL, regOp, arg);}
-void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMUL, regOp, arg);}
-void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseDIV, regOp, arg);}
-void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseDIV, regOp, arg);}
-void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMIN, regOp, arg);}
-void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMIN, regOp, arg);}
-void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMAX, regOp, arg);}
-void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMAX, regOp, arg);}
-void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSQRT, regOp, arg);}
-void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSQRT, regOp, arg);}
-void XEmitter::RCPPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseRCP, regOp, arg); }
-void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);}
-void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);}
-void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);}
-
-void XEmitter::HADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseHADD, regOp, arg);}
-
-void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //weird that these should be packed
-void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered
-void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered
-void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);}
-
-void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);}
-void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);}
-void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);}
-void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);}
-
-void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);}
-void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);}
-void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);}
-void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);}
-
-void XEmitter::MOVDQA(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);}
-void XEmitter::MOVDQA(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);}
-void XEmitter::MOVDQU(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);}
-void XEmitter::MOVDQU(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);}
-
-void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);}
-void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);}
-void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);}
-void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);}
-
-void XEmitter::MOVLPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); }
-void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); }
-void XEmitter::MOVLPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); }
-void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); }
-
-void XEmitter::MOVHPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); }
-void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); }
-void XEmitter::MOVHPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); }
-void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); }
+void XEmitter::STMXCSR(const OpArg& memloc) {WriteMXCSR(memloc, 3);}
+void XEmitter::LDMXCSR(const OpArg& memloc) {WriteMXCSR(memloc, 2);}
+
+void XEmitter::MOVNTDQ(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);}
+void XEmitter::MOVNTPS(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);}
+void XEmitter::MOVNTPD(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);}
+
+void XEmitter::ADDSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseADD, regOp, arg);}
+void XEmitter::ADDSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseADD, regOp, arg);}
+void XEmitter::SUBSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseSUB, regOp, arg);}
+void XEmitter::SUBSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseSUB, regOp, arg);}
+void XEmitter::CMPSS(X64Reg regOp, const OpArg& arg, u8 compare) {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);}
+void XEmitter::CMPSD(X64Reg regOp, const OpArg& arg, u8 compare) {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);}
+void XEmitter::MULSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseMUL, regOp, arg);}
+void XEmitter::MULSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseMUL, regOp, arg);}
+void XEmitter::DIVSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseDIV, regOp, arg);}
+void XEmitter::DIVSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseDIV, regOp, arg);}
+void XEmitter::MINSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseMIN, regOp, arg);}
+void XEmitter::MINSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseMIN, regOp, arg);}
+void XEmitter::MAXSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseMAX, regOp, arg);}
+void XEmitter::MAXSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseMAX, regOp, arg);}
+void XEmitter::SQRTSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseSQRT, regOp, arg);}
+void XEmitter::SQRTSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseSQRT, regOp, arg);}
+void XEmitter::RSQRTSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);}
+
+void XEmitter::ADDPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseADD, regOp, arg);}
+void XEmitter::ADDPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseADD, regOp, arg);}
+void XEmitter::SUBPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseSUB, regOp, arg);}
+void XEmitter::SUBPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseSUB, regOp, arg);}
+void XEmitter::CMPPS(X64Reg regOp, const OpArg& arg, u8 compare) {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);}
+void XEmitter::CMPPD(X64Reg regOp, const OpArg& arg, u8 compare) {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);}
+void XEmitter::ANDPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseAND, regOp, arg);}
+void XEmitter::ANDPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseAND, regOp, arg);}
+void XEmitter::ANDNPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseANDN, regOp, arg);}
+void XEmitter::ANDNPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseANDN, regOp, arg);}
+void XEmitter::ORPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseOR, regOp, arg);}
+void XEmitter::ORPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseOR, regOp, arg);}
+void XEmitter::XORPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseXOR, regOp, arg);}
+void XEmitter::XORPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseXOR, regOp, arg);}
+void XEmitter::MULPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseMUL, regOp, arg);}
+void XEmitter::MULPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMUL, regOp, arg);}
+void XEmitter::DIVPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseDIV, regOp, arg);}
+void XEmitter::DIVPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseDIV, regOp, arg);}
+void XEmitter::MINPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseMIN, regOp, arg);}
+void XEmitter::MINPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMIN, regOp, arg);}
+void XEmitter::MAXPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseMAX, regOp, arg);}
+void XEmitter::MAXPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMAX, regOp, arg);}
+void XEmitter::SQRTPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseSQRT, regOp, arg);}
+void XEmitter::SQRTPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseSQRT, regOp, arg);}
+void XEmitter::RCPPS(X64Reg regOp, const OpArg& arg) { WriteSSEOp(0x00, sseRCP, regOp, arg); }
+void XEmitter::RSQRTPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);}
+void XEmitter::SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);}
+void XEmitter::SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);}
+
+void XEmitter::HADDPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseHADD, regOp, arg);}
+
+void XEmitter::COMISS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //weird that these should be packed
+void XEmitter::COMISD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered
+void XEmitter::UCOMISS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered
+void XEmitter::UCOMISD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);}
+
+void XEmitter::MOVAPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);}
+void XEmitter::MOVAPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);}
+void XEmitter::MOVAPS(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);}
+void XEmitter::MOVAPD(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);}
+
+void XEmitter::MOVUPS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);}
+void XEmitter::MOVUPD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);}
+void XEmitter::MOVUPS(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);}
+void XEmitter::MOVUPD(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);}
+
+void XEmitter::MOVDQA(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);}
+void XEmitter::MOVDQA(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);}
+void XEmitter::MOVDQU(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);}
+void XEmitter::MOVDQU(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);}
+
+void XEmitter::MOVSS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);}
+void XEmitter::MOVSD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);}
+void XEmitter::MOVSS(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);}
+void XEmitter::MOVSD(const OpArg& arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);}
+
+void XEmitter::MOVLPS(X64Reg regOp, const OpArg& arg) { WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); }
+void XEmitter::MOVLPD(X64Reg regOp, const OpArg& arg) { WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); }
+void XEmitter::MOVLPS(const OpArg& arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); }
+void XEmitter::MOVLPD(const OpArg& arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); }
+
+void XEmitter::MOVHPS(X64Reg regOp, const OpArg& arg) { WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); }
+void XEmitter::MOVHPD(X64Reg regOp, const OpArg& arg) { WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); }
+void XEmitter::MOVHPS(const OpArg& arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); }
+void XEmitter::MOVHPD(const OpArg& arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); }
void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));}
void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));}
-void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);}
-void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);}
+void XEmitter::CVTPS2PD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);}
+void XEmitter::CVTPD2PS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);}
-void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);}
-void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);}
-void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);}
-void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);}
-void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);}
-void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);}
+void XEmitter::CVTSD2SS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);}
+void XEmitter::CVTSS2SD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);}
+void XEmitter::CVTSD2SI(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);}
+void XEmitter::CVTSS2SI(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);}
+void XEmitter::CVTSI2SD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);}
+void XEmitter::CVTSI2SS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);}
-void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);}
-void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);}
-void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);}
-void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);}
+void XEmitter::CVTDQ2PD(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);}
+void XEmitter::CVTDQ2PS(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);}
+void XEmitter::CVTPD2DQ(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);}
+void XEmitter::CVTPS2DQ(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);}
-void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);}
-void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);}
-void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);}
-void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);}
+void XEmitter::CVTTSD2SI(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);}
+void XEmitter::CVTTSS2SI(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);}
+void XEmitter::CVTTPS2DQ(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);}
+void XEmitter::CVTTPD2DQ(X64Reg regOp, const OpArg& arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);}
void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));}
-void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);}
-void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);}
+void XEmitter::MOVMSKPS(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x00, 0x50, dest, arg);}
+void XEmitter::MOVMSKPD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x50, dest, arg);}
-void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
+void XEmitter::LDDQU(X64Reg dest, const OpArg& arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
// THESE TWO ARE UNTESTED.
-void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
-void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
+void XEmitter::UNPCKLPS(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
+void XEmitter::UNPCKHPS(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
-void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);}
-void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);}
+void XEmitter::UNPCKLPD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x14, dest, arg);}
+void XEmitter::UNPCKHPD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x15, dest, arg);}
-void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
+void XEmitter::MOVDDUP(X64Reg regOp, const OpArg& arg)
{
if (Common::GetCPUCaps().sse3)
{
@@ -1663,9 +1635,9 @@ void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
//There are a few more left
// Also some integer instructions are missing
-void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);}
-void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);}
-void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);}
+void XEmitter::PACKSSDW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x6B, dest, arg);}
+void XEmitter::PACKSSWB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x63, dest, arg);}
+void XEmitter::PACKUSWB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x67, dest, arg);}
void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);}
void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);}
@@ -1690,7 +1662,7 @@ void XEmitter::PSRLQ(X64Reg reg, int shift)
Write8(shift);
}
-void XEmitter::PSRLQ(X64Reg reg, OpArg arg)
+void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg)
{
WriteSSEOp(0x66, 0xd3, reg, arg);
}
@@ -1735,212 +1707,212 @@ void XEmitter::PSRAD(X64Reg reg, int shift)
Write8(shift);
}
-void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes)
{
if (!Common::GetCPUCaps().ssse3)
ASSERT_MSG(0, "Trying to use SSSE3 on a system that doesn't support it. Bad programmer.");
WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
}
-void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes)
{
if (!Common::GetCPUCaps().sse4_1)
ASSERT_MSG(0, "Trying to use SSE4.1 on a system that doesn't support it. Bad programmer.");
WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
}
-void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);}
-void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);}
-void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
-void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}
-
-void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
-void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
-void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
-void XEmitter::PMINUD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383b, dest, arg);}
-void XEmitter::PMAXSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383c, dest, arg);}
-void XEmitter::PMAXSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383d, dest, arg);}
-void XEmitter::PMAXUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383e, dest, arg);}
-void XEmitter::PMAXUD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383f, dest, arg);}
-
-void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);}
-void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);}
-void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);}
-void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);}
-void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);}
-void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);}
-void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);}
-void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);}
-void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);}
-void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);}
-void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);}
-void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);}
-
-void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);}
-void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);}
-void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);}
+void XEmitter::PSHUFB(X64Reg dest, const OpArg& arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);}
+void XEmitter::PTEST(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);}
+void XEmitter::PACKUSDW(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
+void XEmitter::DPPS(X64Reg dest, const OpArg& arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}
+
+void XEmitter::PMINSB(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
+void XEmitter::PMINSD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
+void XEmitter::PMINUW(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
+void XEmitter::PMINUD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383b, dest, arg);}
+void XEmitter::PMAXSB(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383c, dest, arg);}
+void XEmitter::PMAXSD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383d, dest, arg);}
+void XEmitter::PMAXUW(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383e, dest, arg);}
+void XEmitter::PMAXUD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x383f, dest, arg);}
+
+void XEmitter::PMOVSXBW(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);}
+void XEmitter::PMOVSXBD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);}
+void XEmitter::PMOVSXBQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);}
+void XEmitter::PMOVSXWD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);}
+void XEmitter::PMOVSXWQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);}
+void XEmitter::PMOVSXDQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);}
+void XEmitter::PMOVZXBW(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);}
+void XEmitter::PMOVZXBD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);}
+void XEmitter::PMOVZXBQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);}
+void XEmitter::PMOVZXWD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);}
+void XEmitter::PMOVZXWQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);}
+void XEmitter::PMOVZXDQ(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);}
+
+void XEmitter::PBLENDVB(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);}
+void XEmitter::BLENDVPS(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);}
+void XEmitter::BLENDVPD(X64Reg dest, const OpArg& arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);}
void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); Write8(blend); }
void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); Write8(blend); }
-void XEmitter::ROUNDSS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0A, dest, arg, 1); Write8(mode);}
-void XEmitter::ROUNDSD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0B, dest, arg, 1); Write8(mode);}
-void XEmitter::ROUNDPS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A08, dest, arg, 1); Write8(mode);}
-void XEmitter::ROUNDPD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A09, dest, arg, 1); Write8(mode);}
+void XEmitter::ROUNDSS(X64Reg dest, const OpArg& arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0A, dest, arg, 1); Write8(mode);}
+void XEmitter::ROUNDSD(X64Reg dest, const OpArg& arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0B, dest, arg, 1); Write8(mode);}
+void XEmitter::ROUNDPS(X64Reg dest, const OpArg& arg, u8 mode) {WriteSSE41Op(0x66, 0x3A08, dest, arg, 1); Write8(mode);}
+void XEmitter::ROUNDPD(X64Reg dest, const OpArg& arg, u8 mode) {WriteSSE41Op(0x66, 0x3A09, dest, arg, 1); Write8(mode);}
-void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDB, dest, arg);}
-void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDF, dest, arg);}
-void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEF, dest, arg);}
-void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEB, dest, arg);}
+void XEmitter::PAND(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDB, dest, arg);}
+void XEmitter::PANDN(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDF, dest, arg);}
+void XEmitter::PXOR(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xEF, dest, arg);}
+void XEmitter::POR(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xEB, dest, arg);}
-void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFC, dest, arg);}
-void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFD, dest, arg);}
-void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFE, dest, arg);}
-void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD4, dest, arg);}
+void XEmitter::PADDB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xFC, dest, arg);}
+void XEmitter::PADDW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xFD, dest, arg);}
+void XEmitter::PADDD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xFE, dest, arg);}
+void XEmitter::PADDQ(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xD4, dest, arg);}
-void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEC, dest, arg);}
-void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xED, dest, arg);}
-void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDC, dest, arg);}
-void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDD, dest, arg);}
+void XEmitter::PADDSB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xEC, dest, arg);}
+void XEmitter::PADDSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xED, dest, arg);}
+void XEmitter::PADDUSB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDC, dest, arg);}
+void XEmitter::PADDUSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDD, dest, arg);}
-void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF8, dest, arg);}
-void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF9, dest, arg);}
-void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFA, dest, arg);}
-void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFB, dest, arg);}
+void XEmitter::PSUBB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xF8, dest, arg);}
+void XEmitter::PSUBW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xF9, dest, arg);}
+void XEmitter::PSUBD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xFA, dest, arg);}
+void XEmitter::PSUBQ(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xFB, dest, arg);}
-void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE8, dest, arg);}
-void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE9, dest, arg);}
-void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD8, dest, arg);}
-void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD9, dest, arg);}
+void XEmitter::PSUBSB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xE8, dest, arg);}
+void XEmitter::PSUBSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xE9, dest, arg);}
+void XEmitter::PSUBUSB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xD8, dest, arg);}
+void XEmitter::PSUBUSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xD9, dest, arg);}
-void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE0, dest, arg);}
-void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE3, dest, arg);}
+void XEmitter::PAVGB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xE0, dest, arg);}
+void XEmitter::PAVGW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xE3, dest, arg);}
-void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x74, dest, arg);}
-void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x75, dest, arg);}
-void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x76, dest, arg);}
+void XEmitter::PCMPEQB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x74, dest, arg);}
+void XEmitter::PCMPEQW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x75, dest, arg);}
+void XEmitter::PCMPEQD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x76, dest, arg);}
-void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x64, dest, arg);}
-void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x65, dest, arg);}
-void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x66, dest, arg);}
+void XEmitter::PCMPGTB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x64, dest, arg);}
+void XEmitter::PCMPGTW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x65, dest, arg);}
+void XEmitter::PCMPGTD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x66, dest, arg);}
-void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC5, dest, arg, 1); Write8(subreg);}
-void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC4, dest, arg, 1); Write8(subreg);}
+void XEmitter::PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg) {WriteSSEOp(0x66, 0xC5, dest, arg, 1); Write8(subreg);}
+void XEmitter::PINSRW(X64Reg dest, const OpArg& arg, u8 subreg) {WriteSSEOp(0x66, 0xC4, dest, arg, 1); Write8(subreg);}
-void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF5, dest, arg); }
-void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF6, dest, arg);}
+void XEmitter::PMADDWD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xF5, dest, arg); }
+void XEmitter::PSADBW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xF6, dest, arg);}
-void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEE, dest, arg); }
-void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDE, dest, arg); }
-void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEA, dest, arg); }
-void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDA, dest, arg); }
+void XEmitter::PMAXSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xEE, dest, arg); }
+void XEmitter::PMAXUB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDE, dest, arg); }
+void XEmitter::PMINSW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xEA, dest, arg); }
+void XEmitter::PMINUB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xDA, dest, arg); }
-void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD7, dest, arg); }
-void XEmitter::PSHUFD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, 0x70, regOp, arg, 1); Write8(shuffle);}
-void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);}
-void XEmitter::PSHUFHW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF3, 0x70, regOp, arg, 1); Write8(shuffle);}
+void XEmitter::PMOVMSKB(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xD7, dest, arg); }
+void XEmitter::PSHUFD(X64Reg regOp, const OpArg& arg, u8 shuffle) {WriteSSEOp(0x66, 0x70, regOp, arg, 1); Write8(shuffle);}
+void XEmitter::PSHUFLW(X64Reg regOp, const OpArg& arg, u8 shuffle) {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);}
+void XEmitter::PSHUFHW(X64Reg regOp, const OpArg& arg, u8 shuffle) {WriteSSEOp(0xF3, 0x70, regOp, arg, 1); Write8(shuffle);}
// VEX
-void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);}
-void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);}
-void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);}
-void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);}
-void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);}
-void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);}
-void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);}
-void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);}
-void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);}
-void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);}
-void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);}
-void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);}
-
-void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); }
-void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); }
-void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); }
-void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); }
-void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); }
-void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); }
-void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); }
-void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); }
-
-void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); }
-void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); }
-void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); }
-void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); }
-
-void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg); }
-void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg); }
-void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg); }
-void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg); }
-void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg); }
-void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg); }
-void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg); }
-void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg); }
-void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg); }
-void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg); }
-void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg); }
-void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg); }
-void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg); }
-void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg); }
-void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg); }
-void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg); }
-void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg); }
-void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg); }
-void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg); }
-void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg); }
-void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg); }
-void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg); }
-void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg); }
-void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg); }
-void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1); }
-void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg); }
-void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg); }
-void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg); }
-void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg); }
-void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg); }
-void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg); }
-void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1); }
-void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1); }
-
-void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);}
-void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);}
-void XEmitter::SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);}
-void XEmitter::RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate) {WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); Write8(rotate);}
-void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);}
-void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);}
-void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);}
-void XEmitter::BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);}
-void XEmitter::BLSR(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);}
-void XEmitter::BLSMSK(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);}
-void XEmitter::BLSI(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);}
-void XEmitter::BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2){WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);}
-void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);}
// AVX double-precision arithmetic in three-operand form (regOp1, regOp2, arg).
// Each emitter forwards to WriteAVXOp: prefix byte 0xF2 for the *SD forms, 0x66 for the *PD forms.
// VSHUFPD passes 1 as the extra-byte count because it writes the shuffle immediate right afterwards.
+void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);}
+void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);}
+void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);}
+void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);}
+void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);}
+void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);}
+void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);}
+void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);}
+void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);}
+void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);}
+void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);}
+void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);}
+
// AVX floating-point bitwise ops (AND / ANDN / OR / XOR).
// The *PS forms pass 0x00 as the prefix byte; the *PD forms pass 0x66 with the same sse* opcode constant.
+void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); }
+void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); }
+void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); }
+void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); }
+void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); }
+void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); }
+void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); }
+void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); }
+
// AVX integer bitwise ops: all use prefix 0x66, with opcodes 0xDB (AND), 0xDF (ANDN), 0xEB (OR), 0xEF (XOR).
+void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); }
+void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); }
+void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); }
+void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); }
+
// FMA3 emitters. Every one forwards to WriteAVXOp with prefix 0x66 and a 0x38xx opcode.
// NOTE(review): each *PD/*SD form differs from its *PS/*SS sibling only by a trailing `1`.
// The six-parameter WriteAVXOp overload declares that slot as `extrabytes`, yet no extra byte
// is written afterwards here. It looks like the `1` was meant to select the 64-bit-element
// (VEX.W=1) encoding; as written the double forms appear to encode the same opcode as the
// single forms. Confirm against WriteAVXOp/WriteVEXOp and the Intel SDM before relying on them.
+void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg); }
+void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg); }
+void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg); }
+void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg); }
+void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg); }
+void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg); }
+void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg); }
+void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg); }
+void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg); }
+void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg); }
+void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg); }
+void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg); }
+void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg); }
+void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg); }
+void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg); }
+void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg); }
+void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg); }
+void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg); }
+void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg); }
+void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg); }
+void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg); }
+void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg); }
+void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg); }
+void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg); }
+void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1); }
+void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg); }
+void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg); }
+void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg); }
+void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg); }
+void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg); }
+void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg); }
+void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1); }
+void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1); }
+
// BMI1/BMI2 emitters; `bits` is forwarded as the size argument of WriteBMI1Op/WriteBMI2Op.
// - RORX passes INVALID_REG as the second register and reserves one extra byte for the
//   rotate immediate, which it writes immediately afterwards.
// - MULX forwards regOp2 before regOp1, i.e. swapped relative to its own parameter order.
// - BLSR/BLSMSK/BLSI place the constants 1/2/3 in the first register slot (presumably the
//   opcode-extension field — confirm) and the real register in the second.
+void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate) {WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); Write8(rotate);}
+void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);}
+void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);}
+void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);}
+void XEmitter::BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);}
+void XEmitter::BLSR(int bits, X64Reg regOp, const OpArg& arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);}
+void XEmitter::BLSMSK(int bits, X64Reg regOp, const OpArg& arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);}
+void XEmitter::BLSI(int bits, X64Reg regOp, const OpArg& arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);}
+void XEmitter::BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2){WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);}
// Prefixes
@@ -1956,7 +1928,7 @@ void XEmitter::FWAIT()
}
// TODO: make this more generic
-void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg)
+void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg)
{
int mf = 0;
ASSERT_MSG(!(bits == 80 && op_80b == floatINVALID), "WriteFloatLoadStore: 80 bits not supported for this instruction");
@@ -1974,9 +1946,9 @@ void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg a
arg.WriteRest(this, 0, (X64Reg) op);
}
// x87 load/store wrappers around WriteFloatLoadStore.
// FLD and FSTP supply an 80-bit opcode (floatLD80 / floatSTP80); FST passes floatINVALID,
// so calling FST with bits == 80 trips the ASSERT_MSG at the top of WriteFloatLoadStore.
-void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, floatLD80, src);}
-void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, floatINVALID, dest);}
-void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, floatSTP80, dest);}
+void XEmitter::FLD(int bits, const OpArg& src) {WriteFloatLoadStore(bits, floatLD, floatLD80, src);}
+void XEmitter::FST(int bits, const OpArg& dest) {WriteFloatLoadStore(bits, floatST, floatINVALID, dest);}
+void XEmitter::FSTP(int bits, const OpArg& dest) {WriteFloatLoadStore(bits, floatSTP, floatSTP80, dest);}
// Fixed two-byte encodings, written out directly: DF E0 and 0F 31.
void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); }
void XEmitter::RDTSC() { Write8(0x0F); Write8(0x31); }
diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h
index e9c924126..86f4a1fff 100644
--- a/src/common/x64/emitter.h
+++ b/src/common/x64/emitter.h
@@ -328,8 +328,6 @@ enum SSECompare
ORD,
};
-typedef const u8* JumpTarget;
-
class XEmitter
{
friend struct OpArg; // for Write8 etc
@@ -344,27 +342,27 @@ private:
// Private encoding helpers.
// NOTE(review): after this change most of them take OpArg by const reference, but
// WriteMulDivType, WriteBitSearchType, WriteMXCSR, WriteSSEOp and the `dest` of WriteShift
// still take it by value. Confirm whether that is intentional.
void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg);
void WriteMulDivType(int bits, OpArg src, int ext);
void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false);
- void WriteShift(int bits, OpArg dest, OpArg &shift, int ext);
- void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext);
+ void WriteShift(int bits, OpArg dest, const OpArg& shift, int ext);
+ void WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext);
void WriteMXCSR(OpArg arg, int ext);
void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
- void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
- void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
- void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
- void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
- void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
- void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
- void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
- void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg);
- void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
+ void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
+ void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
+ void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
+ void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
+ void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
+ void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
+ void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
+ void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg);
+ void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
protected:
// Append `value` at the current write position `code` and advance `code` by the value's size.
// The 16/32/64-bit variants store through a cast of `code` with no alignment handling.
// NOTE(review): that assumes the host tolerates unaligned stores — confirm for every build target.
// Dropping `inline` changes nothing: member functions defined in the class body are implicitly inline.
- inline void Write8(u8 value) {*code++ = value;}
- inline void Write16(u16 value) {*(u16*)code = (value); code += 2;}
- inline void Write32(u32 value) {*(u32*)code = (value); code += 4;}
- inline void Write64(u64 value) {*(u64*)code = (value); code += 8;}
+ void Write8(u8 value) {*code++ = value;}
+ void Write16(u16 value) {*(u16*)code = (value); code += 2;}
+ void Write32(u32 value) {*(u32*)code = (value); code += 4;}
+ void Write64(u64 value) {*(u64*)code = (value); code += 8;}
public:
// Default state: no code buffer attached (code == nullptr) and flags_locked cleared.
XEmitter() { code = nullptr; flags_locked = false; }
@@ -413,8 +411,8 @@ public:
// Stack control
// PUSH/POP each have a register-only overload and an overload taking an explicit width in
// bits plus an OpArg. PUSHF/POPF take no operands.
void PUSH(X64Reg reg);
void POP(X64Reg reg);
- void PUSH(int bits, const OpArg &reg);
- void POP(int bits, const OpArg &reg);
+ void PUSH(int bits, const OpArg& reg);
+ void POP(int bits, const OpArg& reg);
void PUSHF();
void POPF();
@@ -424,21 +422,19 @@ public:
// Flow control.
// J() and the address-less J_CC() return a FixupBranch; SetJumpTarget accepts one
// (presumably to patch the branch once its destination is known — confirm in emitter.cpp).
// The overloads taking `const u8* addr` are given the destination up front.
// NOTE(review): CALLptr and SETcc still take OpArg by value, unlike JMPptr.
void UD2();
FixupBranch J(bool force5bytes = false);
- void JMP(const u8 * addr, bool force5Bytes = false);
- void JMP(OpArg arg);
- void JMPptr(const OpArg &arg);
+ void JMP(const u8* addr, bool force5Bytes = false);
+ void JMPptr(const OpArg& arg);
void JMPself(); //infinite loop!
// Drop any macro named CALL before declaring the member function of the same name.
#ifdef CALL
#undef CALL
#endif
- void CALL(const void *fnptr);
+ void CALL(const void* fnptr);
void CALLptr(OpArg arg);
FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
- //void J_CC(CCFlags conditionCode, JumpTarget target);
- void J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes = false);
+ void J_CC(CCFlags conditionCode, const u8* addr, bool force5Bytes = false);
- void SetJumpTarget(const FixupBranch &branch);
+ void SetJumpTarget(const FixupBranch& branch);
void SETcc(CCFlags flag, OpArg dest);
// Note: CMOV brings little, if any, benefit on current CPUs.
@@ -450,8 +446,8 @@ public:
void SFENCE();
// Bit scan
// Both scans take the operand width in bits, the result register `dest`, and the operand `src` to scan.
- void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit
- void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit
+ void BSF(int bits, X64Reg dest, const OpArg& src); // Bottom bit to top bit
+ void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit
// Cache control
enum PrefetchLevel
@@ -462,67 +458,67 @@ public:
PF_T2, //Levels 3+ (aliased to T0 on AMD)
};
// MOVNTI names its operands (dest, src); the MOVNT* vector forms use (arg, regOp) in the same order.
// NOTE(review): PREFETCH still takes its OpArg by value.
void PREFETCH(PrefetchLevel level, OpArg arg);
- void MOVNTI(int bits, OpArg dest, X64Reg src);
- void MOVNTDQ(OpArg arg, X64Reg regOp);
- void MOVNTPS(OpArg arg, X64Reg regOp);
- void MOVNTPD(OpArg arg, X64Reg regOp);
+ void MOVNTI(int bits, const OpArg& dest, X64Reg src);
+ void MOVNTDQ(const OpArg& arg, X64Reg regOp);
+ void MOVNTPS(const OpArg& arg, X64Reg regOp);
+ void MOVNTPD(const OpArg& arg, X64Reg regOp);
// Multiplication / division
// IMUL is overloaded three ways: a single operand, register + operand, and
// register + operand + `imm` (the immediate is itself passed as an OpArg).
- void MUL(int bits, OpArg src); //UNSIGNED
- void IMUL(int bits, OpArg src); //SIGNED
- void IMUL(int bits, X64Reg regOp, OpArg src);
- void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
- void DIV(int bits, OpArg src);
- void IDIV(int bits, OpArg src);
+ void MUL(int bits, const OpArg& src); //UNSIGNED
+ void IMUL(int bits, const OpArg& src); //SIGNED
+ void IMUL(int bits, X64Reg regOp, const OpArg& src);
+ void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm);
+ void DIV(int bits, const OpArg& src);
+ void IDIV(int bits, const OpArg& src);
// Shift
// Rotates (ROL/ROR/RCL/RCR) and shifts (SHL/SHR/SAR) share one signature:
// operand width in bits, the operand `dest`, and the count operand `shift`.
- void ROL(int bits, OpArg dest, OpArg shift);
- void ROR(int bits, OpArg dest, OpArg shift);
- void RCL(int bits, OpArg dest, OpArg shift);
- void RCR(int bits, OpArg dest, OpArg shift);
- void SHL(int bits, OpArg dest, OpArg shift);
- void SHR(int bits, OpArg dest, OpArg shift);
- void SAR(int bits, OpArg dest, OpArg shift);
+ void ROL(int bits, const OpArg& dest, const OpArg& shift);
+ void ROR(int bits, const OpArg& dest, const OpArg& shift);
+ void RCL(int bits, const OpArg& dest, const OpArg& shift);
+ void RCR(int bits, const OpArg& dest, const OpArg& shift);
+ void SHL(int bits, const OpArg& dest, const OpArg& shift);
+ void SHR(int bits, const OpArg& dest, const OpArg& shift);
+ void SAR(int bits, const OpArg& dest, const OpArg& shift);
// Bit Test
// BT/BTS/BTR/BTC share one signature: operand width in bits, the operand `dest`, and the bit selector `index`.
- void BT(int bits, OpArg dest, OpArg index);
- void BTS(int bits, OpArg dest, OpArg index);
- void BTR(int bits, OpArg dest, OpArg index);
- void BTC(int bits, OpArg dest, OpArg index);
+ void BT(int bits, const OpArg& dest, const OpArg& index);
+ void BTS(int bits, const OpArg& dest, const OpArg& index);
+ void BTR(int bits, const OpArg& dest, const OpArg& index);
+ void BTC(int bits, const OpArg& dest, const OpArg& index);
// Double-Precision Shift
// Three operands each: `dest`, `src`, and the count operand `shift`.
- void SHRD(int bits, OpArg dest, OpArg src, OpArg shift);
- void SHLD(int bits, OpArg dest, OpArg src, OpArg shift);
+ void SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift);
+ void SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift);
// Extend EAX into EDX in various ways
// CDQ and CQO are CWD called with 32 and 64; CWDE and CDQE are CBW called with 16 and 32.
// Defaults are 16 for CWD and 8 for CBW.
void CWD(int bits = 16);
- inline void CDQ() {CWD(32);}
- inline void CQO() {CWD(64);}
+ void CDQ() {CWD(32);}
+ void CQO() {CWD(64);}
void CBW(int bits = 8);
- inline void CWDE() {CBW(16);}
- inline void CDQE() {CBW(32);}
+ void CWDE() {CBW(16);}
+ void CDQE() {CBW(32);}
// Load effective address
// NOTE(review): `src` is still taken by value here, unlike the neighbouring declarations — confirm intent.
void LEA(int bits, X64Reg dest, OpArg src);
// Integer arithmetic
// NEG takes a single operand; the rest take two operands (a1, a2) plus the operand width in bits.
- void NEG (int bits, OpArg src);
- void ADD (int bits, const OpArg &a1, const OpArg &a2);
- void ADC (int bits, const OpArg &a1, const OpArg &a2);
- void SUB (int bits, const OpArg &a1, const OpArg &a2);
- void SBB (int bits, const OpArg &a1, const OpArg &a2);
- void AND (int bits, const OpArg &a1, const OpArg &a2);
- void CMP (int bits, const OpArg &a1, const OpArg &a2);
+ void NEG(int bits, const OpArg& src);
+ void ADD(int bits, const OpArg& a1, const OpArg& a2);
+ void ADC(int bits, const OpArg& a1, const OpArg& a2);
+ void SUB(int bits, const OpArg& a1, const OpArg& a2);
+ void SBB(int bits, const OpArg& a1, const OpArg& a2);
+ void AND(int bits, const OpArg& a1, const OpArg& a2);
+ void CMP(int bits, const OpArg& a1, const OpArg& a2);
// Bit operations
// NOT takes a single operand; OR/XOR/MOV/TEST take two operands (a1, a2) plus the operand width in bits.
// NOTE(review): the new NOT declaration keeps a space before '(' while its reformatted
// neighbours dropped it — cosmetic inconsistency only.
- void NOT (int bits, OpArg src);
- void OR (int bits, const OpArg &a1, const OpArg &a2);
- void XOR (int bits, const OpArg &a1, const OpArg &a2);
- void MOV (int bits, const OpArg &a1, const OpArg &a2);
- void TEST(int bits, const OpArg &a1, const OpArg &a2);
+ void NOT (int bits, const OpArg& src);
+ void OR(int bits, const OpArg& a1, const OpArg& a2);
+ void XOR(int bits, const OpArg& a1, const OpArg& a2);
+ void MOV(int bits, const OpArg& a1, const OpArg& a2);
+ void TEST(int bits, const OpArg& a1, const OpArg& a2);
// Are these useful at all? Consider removing.
// XCHG takes two operands plus the operand width in bits; XCHG_AHAL takes no arguments.
- void XCHG(int bits, const OpArg &a1, const OpArg &a2);
+ void XCHG(int bits, const OpArg& a1, const OpArg& a2);
void XCHG_AHAL();
// Byte swapping (32 and 64-bit only).
@@ -536,13 +532,13 @@ public:
void MOVBE(int dbits, const OpArg& dest, const OpArg& src);
// Available only on AMD >= Phenom or Intel >= Haswell
// Same (bits, dest, src) parameter shape as BSF/BSR.
- void LZCNT(int bits, X64Reg dest, OpArg src);
+ void LZCNT(int bits, X64Reg dest, const OpArg& src);
// Note: this one is actually part of BMI1
- void TZCNT(int bits, X64Reg dest, OpArg src);
+ void TZCNT(int bits, X64Reg dest, const OpArg& src);
// WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
// Both take a single operand, `memloc` (by name, the memory location used for the MXCSR value).
- void STMXCSR(OpArg memloc);
- void LDMXCSR(OpArg memloc);
+ void STMXCSR(const OpArg& memloc);
+ void LDMXCSR(const OpArg& memloc);
// Prefixes
void LOCK();
@@ -569,259 +565,242 @@ public:
x87_FPUBusy = 0x8000,
};
// x87 load/store. Per the definitions in emitter.cpp, FLD and FSTP supply an 80-bit opcode
// while FST passes floatINVALID, so FST with bits == 80 asserts in WriteFloatLoadStore.
- void FLD(int bits, OpArg src);
- void FST(int bits, OpArg dest);
- void FSTP(int bits, OpArg dest);
+ void FLD(int bits, const OpArg& src);
+ void FST(int bits, const OpArg& dest);
+ void FSTP(int bits, const OpArg& dest);
void FNSTSW_AX();
void FWAIT();
// SSE/SSE2: Floating point arithmetic
// Two-operand form throughout: a register `regOp` and a second operand `arg`.
// Each operation is declared in an *SS and an *SD variant, except RSQRTSS, which has no *SD counterpart here.
- void ADDSS(X64Reg regOp, OpArg arg);
- void ADDSD(X64Reg regOp, OpArg arg);
- void SUBSS(X64Reg regOp, OpArg arg);
- void SUBSD(X64Reg regOp, OpArg arg);
- void MULSS(X64Reg regOp, OpArg arg);
- void MULSD(X64Reg regOp, OpArg arg);
- void DIVSS(X64Reg regOp, OpArg arg);
- void DIVSD(X64Reg regOp, OpArg arg);
- void MINSS(X64Reg regOp, OpArg arg);
- void MINSD(X64Reg regOp, OpArg arg);
- void MAXSS(X64Reg regOp, OpArg arg);
- void MAXSD(X64Reg regOp, OpArg arg);
- void SQRTSS(X64Reg regOp, OpArg arg);
- void SQRTSD(X64Reg regOp, OpArg arg);
- void RSQRTSS(X64Reg regOp, OpArg arg);
+ void ADDSS(X64Reg regOp, const OpArg& arg);
+ void ADDSD(X64Reg regOp, const OpArg& arg);
+ void SUBSS(X64Reg regOp, const OpArg& arg);
+ void SUBSD(X64Reg regOp, const OpArg& arg);
+ void MULSS(X64Reg regOp, const OpArg& arg);
+ void MULSD(X64Reg regOp, const OpArg& arg);
+ void DIVSS(X64Reg regOp, const OpArg& arg);
+ void DIVSD(X64Reg regOp, const OpArg& arg);
+ void MINSS(X64Reg regOp, const OpArg& arg);
+ void MINSD(X64Reg regOp, const OpArg& arg);
+ void MAXSS(X64Reg regOp, const OpArg& arg);
+ void MAXSD(X64Reg regOp, const OpArg& arg);
+ void SQRTSS(X64Reg regOp, const OpArg& arg);
+ void SQRTSD(X64Reg regOp, const OpArg& arg);
+ void RSQRTSS(X64Reg regOp, const OpArg& arg);
// SSE/SSE2: Floating point bitwise (yes)
// CMPSS/CMPSD take the comparison predicate as the u8 `compare`.
// The CMPxxSS wrappers below forward to CMPSS with the matching CMP_* constant.
// NOTE(review): there is a CMPNLTSS wrapper but no CMPNLESS counterpart in this list, and no
// *SD wrappers at all — confirm whether that is intentional.
- void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
- void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
+ void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare);
+ void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare);
- inline void CMPEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_EQ); }
- inline void CMPLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LT); }
- inline void CMPLESS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LE); }
- inline void CMPUNORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_UNORD); }
- inline void CMPNEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NEQ); }
- inline void CMPNLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NLT); }
- inline void CMPORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_ORD); }
+ void CMPEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_EQ); }
+ void CMPLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LT); }
+ void CMPLESS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LE); }
+ void CMPUNORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_UNORD); }
+ void CMPNEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NEQ); }
+ void CMPNLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NLT); }
+ void CMPORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_ORD); }
// SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
// Packed counterparts of the scalar arithmetic declarations: same (regOp, arg) shape, with
// CMPPS/CMPPD taking the predicate as the extra u8 `compare`.
// RCPPS and RSQRTPS are declared without *PD counterparts.
- void ADDPS(X64Reg regOp, OpArg arg);
- void ADDPD(X64Reg regOp, OpArg arg);
- void SUBPS(X64Reg regOp, OpArg arg);
- void SUBPD(X64Reg regOp, OpArg arg);
- void CMPPS(X64Reg regOp, OpArg arg, u8 compare);
- void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
- void MULPS(X64Reg regOp, OpArg arg);
- void MULPD(X64Reg regOp, OpArg arg);
- void DIVPS(X64Reg regOp, OpArg arg);
- void DIVPD(X64Reg regOp, OpArg arg);
- void MINPS(X64Reg regOp, OpArg arg);
- void MINPD(X64Reg regOp, OpArg arg);
- void MAXPS(X64Reg regOp, OpArg arg);
- void MAXPD(X64Reg regOp, OpArg arg);
- void SQRTPS(X64Reg regOp, OpArg arg);
- void SQRTPD(X64Reg regOp, OpArg arg);
- void RCPPS(X64Reg regOp, OpArg arg);
- void RSQRTPS(X64Reg regOp, OpArg arg);
+ void ADDPS(X64Reg regOp, const OpArg& arg);
+ void ADDPD(X64Reg regOp, const OpArg& arg);
+ void SUBPS(X64Reg regOp, const OpArg& arg);
+ void SUBPD(X64Reg regOp, const OpArg& arg);
+ void CMPPS(X64Reg regOp, const OpArg& arg, u8 compare);
+ void CMPPD(X64Reg regOp, const OpArg& arg, u8 compare);
+ void MULPS(X64Reg regOp, const OpArg& arg);
+ void MULPD(X64Reg regOp, const OpArg& arg);
+ void DIVPS(X64Reg regOp, const OpArg& arg);
+ void DIVPD(X64Reg regOp, const OpArg& arg);
+ void MINPS(X64Reg regOp, const OpArg& arg);
+ void MINPD(X64Reg regOp, const OpArg& arg);
+ void MAXPS(X64Reg regOp, const OpArg& arg);
+ void MAXPD(X64Reg regOp, const OpArg& arg);
+ void SQRTPS(X64Reg regOp, const OpArg& arg);
+ void SQRTPD(X64Reg regOp, const OpArg& arg);
+ void RCPPS(X64Reg regOp, const OpArg& arg);
+ void RSQRTPS(X64Reg regOp, const OpArg& arg);
// SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double)
// AND / ANDN / OR / XOR, each declared in a *PS and a *PD variant with the (regOp, arg) shape.
- void ANDPS(X64Reg regOp, OpArg arg);
- void ANDPD(X64Reg regOp, OpArg arg);
- void ANDNPS(X64Reg regOp, OpArg arg);
- void ANDNPD(X64Reg regOp, OpArg arg);
- void ORPS(X64Reg regOp, OpArg arg);
- void ORPD(X64Reg regOp, OpArg arg);
- void XORPS(X64Reg regOp, OpArg arg);
- void XORPD(X64Reg regOp, OpArg arg);
+ void ANDPS(X64Reg regOp, const OpArg& arg);
+ void ANDPD(X64Reg regOp, const OpArg& arg);
+ void ANDNPS(X64Reg regOp, const OpArg& arg);
+ void ANDNPD(X64Reg regOp, const OpArg& arg);
+ void ORPS(X64Reg regOp, const OpArg& arg);
+ void ORPD(X64Reg regOp, const OpArg& arg);
+ void XORPS(X64Reg regOp, const OpArg& arg);
+ void XORPD(X64Reg regOp, const OpArg& arg);
// SSE/SSE2: Shuffle components. These are tricky - see Intel documentation.
// The selection is passed as the trailing u8 `shuffle`.
- void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle);
- void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle);
+ void SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle);
+ void SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle);
// SSE/SSE2: Useful alternative to shuffle in some cases.
// Besides switching MOVDDUP to const OpArg&, this hunk deletes the `#if 0` block of
// declarations marked "TODO: Actually implement" (ADDSUBPS ... EXTRACTPS).
- void MOVDDUP(X64Reg regOp, OpArg arg);
-
- // TODO: Actually implement
-#if 0
- // SSE3: Horizontal operations in SIMD registers. Could be useful for various VFPU things like dot products...
- void ADDSUBPS(X64Reg dest, OpArg src);
- void ADDSUBPD(X64Reg dest, OpArg src);
- void HADDPD(X64Reg dest, OpArg src);
- void HSUBPS(X64Reg dest, OpArg src);
- void HSUBPD(X64Reg dest, OpArg src);
-
- // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
- void DPPD(X64Reg dest, OpArg src, u8 arg);
-
- // These are probably useful for VFPU emulation.
- void INSERTPS(X64Reg dest, OpArg src, u8 arg);
- void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
-#endif
+ void MOVDDUP(X64Reg regOp, const OpArg& arg);
// SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
// These declarations name their parameters (dest, src) rather than the (regOp, arg) used by
// most SSE declarations in this class; DPPS additionally takes a u8 named `arg`.
- void HADDPS(X64Reg dest, OpArg src);
+ void HADDPS(X64Reg dest, const OpArg& src);
// SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
- void DPPS(X64Reg dest, OpArg src, u8 arg);
+ void DPPS(X64Reg dest, const OpArg& src, u8 arg);
- void UNPCKLPS(X64Reg dest, OpArg src);
- void UNPCKHPS(X64Reg dest, OpArg src);
- void UNPCKLPD(X64Reg dest, OpArg src);
- void UNPCKHPD(X64Reg dest, OpArg src);
+ void UNPCKLPS(X64Reg dest, const OpArg& src);
+ void UNPCKHPS(X64Reg dest, const OpArg& src);
+ void UNPCKLPD(X64Reg dest, const OpArg& src);
+ void UNPCKHPD(X64Reg dest, const OpArg& src);
// SSE/SSE2: Compares.
// COMIS* and UCOMIS* are each declared in an *SS and an *SD variant with the (regOp, arg) shape.
- void COMISS(X64Reg regOp, OpArg arg);
- void COMISD(X64Reg regOp, OpArg arg);
- void UCOMISS(X64Reg regOp, OpArg arg);
- void UCOMISD(X64Reg regOp, OpArg arg);
+ void COMISS(X64Reg regOp, const OpArg& arg);
+ void COMISD(X64Reg regOp, const OpArg& arg);
+ void UCOMISS(X64Reg regOp, const OpArg& arg);
+ void UCOMISD(X64Reg regOp, const OpArg& arg);
// SSE/SSE2: Moves. Use the right data type for your data, in most cases.
- void MOVAPS(X64Reg regOp, OpArg arg);
- void MOVAPD(X64Reg regOp, OpArg arg);
- void MOVAPS(OpArg arg, X64Reg regOp);
- void MOVAPD(OpArg arg, X64Reg regOp);
-
- void MOVUPS(X64Reg regOp, OpArg arg);
- void MOVUPD(X64Reg regOp, OpArg arg);
- void MOVUPS(OpArg arg, X64Reg regOp);
- void MOVUPD(OpArg arg, X64Reg regOp);
-
- void MOVDQA(X64Reg regOp, OpArg arg);
- void MOVDQA(OpArg arg, X64Reg regOp);
- void MOVDQU(X64Reg regOp, OpArg arg);
- void MOVDQU(OpArg arg, X64Reg regOp);
-
- void MOVSS(X64Reg regOp, OpArg arg);
- void MOVSD(X64Reg regOp, OpArg arg);
- void MOVSS(OpArg arg, X64Reg regOp);
- void MOVSD(OpArg arg, X64Reg regOp);
-
- void MOVLPS(X64Reg regOp, OpArg arg);
- void MOVLPD(X64Reg regOp, OpArg arg);
- void MOVLPS(OpArg arg, X64Reg regOp);
- void MOVLPD(OpArg arg, X64Reg regOp);
-
- void MOVHPS(X64Reg regOp, OpArg arg);
- void MOVHPD(X64Reg regOp, OpArg arg);
- void MOVHPS(OpArg arg, X64Reg regOp);
- void MOVHPD(OpArg arg, X64Reg regOp);
+ void MOVAPS(X64Reg regOp, const OpArg& arg);
+ void MOVAPD(X64Reg regOp, const OpArg& arg);
+ void MOVAPS(const OpArg& arg, X64Reg regOp);
+ void MOVAPD(const OpArg& arg, X64Reg regOp);
+
+ void MOVUPS(X64Reg regOp, const OpArg& arg);
+ void MOVUPD(X64Reg regOp, const OpArg& arg);
+ void MOVUPS(const OpArg& arg, X64Reg regOp);
+ void MOVUPD(const OpArg& arg, X64Reg regOp);
+
+ void MOVDQA(X64Reg regOp, const OpArg& arg);
+ void MOVDQA(const OpArg& arg, X64Reg regOp);
+ void MOVDQU(X64Reg regOp, const OpArg& arg);
+ void MOVDQU(const OpArg& arg, X64Reg regOp);
+
+ void MOVSS(X64Reg regOp, const OpArg& arg);
+ void MOVSD(X64Reg regOp, const OpArg& arg);
+ void MOVSS(const OpArg& arg, X64Reg regOp);
+ void MOVSD(const OpArg& arg, X64Reg regOp);
+
+ void MOVLPS(X64Reg regOp, const OpArg& arg);
+ void MOVLPD(X64Reg regOp, const OpArg& arg);
+ void MOVLPS(const OpArg& arg, X64Reg regOp);
+ void MOVLPD(const OpArg& arg, X64Reg regOp);
+
+ void MOVHPS(X64Reg regOp, const OpArg& arg);
+ void MOVHPD(X64Reg regOp, const OpArg& arg);
+ void MOVHPS(const OpArg& arg, X64Reg regOp);
+ void MOVHPD(const OpArg& arg, X64Reg regOp);
void MOVHLPS(X64Reg regOp1, X64Reg regOp2);
void MOVLHPS(X64Reg regOp1, X64Reg regOp2);
- void MOVD_xmm(X64Reg dest, const OpArg &arg);
+ void MOVD_xmm(X64Reg dest, const OpArg& arg);
void MOVQ_xmm(X64Reg dest, OpArg arg);
- void MOVD_xmm(const OpArg &arg, X64Reg src);
+ void MOVD_xmm(const OpArg& arg, X64Reg src);
void MOVQ_xmm(OpArg arg, X64Reg src);
// SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
- void MOVMSKPS(X64Reg dest, OpArg arg);
- void MOVMSKPD(X64Reg dest, OpArg arg);
+ void MOVMSKPS(X64Reg dest, const OpArg& arg);
+ void MOVMSKPD(X64Reg dest, const OpArg& arg);
// SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one.
void MASKMOVDQU(X64Reg dest, X64Reg src);
- void LDDQU(X64Reg dest, OpArg src);
+ void LDDQU(X64Reg dest, const OpArg& src);
// SSE/SSE2: Data type conversions.
- void CVTPS2PD(X64Reg dest, OpArg src);
- void CVTPD2PS(X64Reg dest, OpArg src);
- void CVTSS2SD(X64Reg dest, OpArg src);
- void CVTSI2SS(X64Reg dest, OpArg src);
- void CVTSD2SS(X64Reg dest, OpArg src);
- void CVTSI2SD(X64Reg dest, OpArg src);
- void CVTDQ2PD(X64Reg regOp, OpArg arg);
- void CVTPD2DQ(X64Reg regOp, OpArg arg);
- void CVTDQ2PS(X64Reg regOp, OpArg arg);
- void CVTPS2DQ(X64Reg regOp, OpArg arg);
-
- void CVTTPS2DQ(X64Reg regOp, OpArg arg);
- void CVTTPD2DQ(X64Reg regOp, OpArg arg);
+ void CVTPS2PD(X64Reg dest, const OpArg& src);
+ void CVTPD2PS(X64Reg dest, const OpArg& src);
+ void CVTSS2SD(X64Reg dest, const OpArg& src);
+ void CVTSI2SS(X64Reg dest, const OpArg& src);
+ void CVTSD2SS(X64Reg dest, const OpArg& src);
+ void CVTSI2SD(X64Reg dest, const OpArg& src);
+ void CVTDQ2PD(X64Reg regOp, const OpArg& arg);
+ void CVTPD2DQ(X64Reg regOp, const OpArg& arg);
+ void CVTDQ2PS(X64Reg regOp, const OpArg& arg);
+ void CVTPS2DQ(X64Reg regOp, const OpArg& arg);
+
+ void CVTTPS2DQ(X64Reg regOp, const OpArg& arg);
+ void CVTTPD2DQ(X64Reg regOp, const OpArg& arg);
// Destinations are X64 regs (rax, rbx, ...) for these instructions.
- void CVTSS2SI(X64Reg xregdest, OpArg src);
- void CVTSD2SI(X64Reg xregdest, OpArg src);
- void CVTTSS2SI(X64Reg xregdest, OpArg arg);
- void CVTTSD2SI(X64Reg xregdest, OpArg arg);
+ void CVTSS2SI(X64Reg xregdest, const OpArg& src);
+ void CVTSD2SI(X64Reg xregdest, const OpArg& src);
+ void CVTTSS2SI(X64Reg xregdest, const OpArg& arg);
+ void CVTTSD2SI(X64Reg xregdest, const OpArg& arg);
// SSE2: Packed integer instructions
- void PACKSSDW(X64Reg dest, OpArg arg);
- void PACKSSWB(X64Reg dest, OpArg arg);
- void PACKUSDW(X64Reg dest, OpArg arg);
- void PACKUSWB(X64Reg dest, OpArg arg);
+ void PACKSSDW(X64Reg dest, const OpArg& arg);
+ void PACKSSWB(X64Reg dest, const OpArg& arg);
+ void PACKUSDW(X64Reg dest, const OpArg& arg);
+ void PACKUSWB(X64Reg dest, const OpArg& arg);
void PUNPCKLBW(X64Reg dest, const OpArg &arg);
void PUNPCKLWD(X64Reg dest, const OpArg &arg);
void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
void PUNPCKLQDQ(X64Reg dest, const OpArg &arg);
- void PTEST(X64Reg dest, OpArg arg);
- void PAND(X64Reg dest, OpArg arg);
- void PANDN(X64Reg dest, OpArg arg);
- void PXOR(X64Reg dest, OpArg arg);
- void POR(X64Reg dest, OpArg arg);
-
- void PADDB(X64Reg dest, OpArg arg);
- void PADDW(X64Reg dest, OpArg arg);
- void PADDD(X64Reg dest, OpArg arg);
- void PADDQ(X64Reg dest, OpArg arg);
-
- void PADDSB(X64Reg dest, OpArg arg);
- void PADDSW(X64Reg dest, OpArg arg);
- void PADDUSB(X64Reg dest, OpArg arg);
- void PADDUSW(X64Reg dest, OpArg arg);
-
- void PSUBB(X64Reg dest, OpArg arg);
- void PSUBW(X64Reg dest, OpArg arg);
- void PSUBD(X64Reg dest, OpArg arg);
- void PSUBQ(X64Reg dest, OpArg arg);
-
- void PSUBSB(X64Reg dest, OpArg arg);
- void PSUBSW(X64Reg dest, OpArg arg);
- void PSUBUSB(X64Reg dest, OpArg arg);
- void PSUBUSW(X64Reg dest, OpArg arg);
-
- void PAVGB(X64Reg dest, OpArg arg);
- void PAVGW(X64Reg dest, OpArg arg);
-
- void PCMPEQB(X64Reg dest, OpArg arg);
- void PCMPEQW(X64Reg dest, OpArg arg);
- void PCMPEQD(X64Reg dest, OpArg arg);
-
- void PCMPGTB(X64Reg dest, OpArg arg);
- void PCMPGTW(X64Reg dest, OpArg arg);
- void PCMPGTD(X64Reg dest, OpArg arg);
-
- void PEXTRW(X64Reg dest, OpArg arg, u8 subreg);
- void PINSRW(X64Reg dest, OpArg arg, u8 subreg);
-
- void PMADDWD(X64Reg dest, OpArg arg);
- void PSADBW(X64Reg dest, OpArg arg);
-
- void PMAXSW(X64Reg dest, OpArg arg);
- void PMAXUB(X64Reg dest, OpArg arg);
- void PMINSW(X64Reg dest, OpArg arg);
- void PMINUB(X64Reg dest, OpArg arg);
+ void PTEST(X64Reg dest, const OpArg& arg);
+ void PAND(X64Reg dest, const OpArg& arg);
+ void PANDN(X64Reg dest, const OpArg& arg);
+ void PXOR(X64Reg dest, const OpArg& arg);
+ void POR(X64Reg dest, const OpArg& arg);
+
+ void PADDB(X64Reg dest, const OpArg& arg);
+ void PADDW(X64Reg dest, const OpArg& arg);
+ void PADDD(X64Reg dest, const OpArg& arg);
+ void PADDQ(X64Reg dest, const OpArg& arg);
+
+ void PADDSB(X64Reg dest, const OpArg& arg);
+ void PADDSW(X64Reg dest, const OpArg& arg);
+ void PADDUSB(X64Reg dest, const OpArg& arg);
+ void PADDUSW(X64Reg dest, const OpArg& arg);
+
+ void PSUBB(X64Reg dest, const OpArg& arg);
+ void PSUBW(X64Reg dest, const OpArg& arg);
+ void PSUBD(X64Reg dest, const OpArg& arg);
+ void PSUBQ(X64Reg dest, const OpArg& arg);
+
+ void PSUBSB(X64Reg dest, const OpArg& arg);
+ void PSUBSW(X64Reg dest, const OpArg& arg);
+ void PSUBUSB(X64Reg dest, const OpArg& arg);
+ void PSUBUSW(X64Reg dest, const OpArg& arg);
+
+ void PAVGB(X64Reg dest, const OpArg& arg);
+ void PAVGW(X64Reg dest, const OpArg& arg);
+
+ void PCMPEQB(X64Reg dest, const OpArg& arg);
+ void PCMPEQW(X64Reg dest, const OpArg& arg);
+ void PCMPEQD(X64Reg dest, const OpArg& arg);
+
+ void PCMPGTB(X64Reg dest, const OpArg& arg);
+ void PCMPGTW(X64Reg dest, const OpArg& arg);
+ void PCMPGTD(X64Reg dest, const OpArg& arg);
+
+ void PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg);
+ void PINSRW(X64Reg dest, const OpArg& arg, u8 subreg);
+
+ void PMADDWD(X64Reg dest, const OpArg& arg);
+ void PSADBW(X64Reg dest, const OpArg& arg);
+
+ void PMAXSW(X64Reg dest, const OpArg& arg);
+ void PMAXUB(X64Reg dest, const OpArg& arg);
+ void PMINSW(X64Reg dest, const OpArg& arg);
+ void PMINUB(X64Reg dest, const OpArg& arg);
// SSE4: More MAX/MIN instructions.
- void PMINSB(X64Reg dest, OpArg arg);
- void PMINSD(X64Reg dest, OpArg arg);
- void PMINUW(X64Reg dest, OpArg arg);
- void PMINUD(X64Reg dest, OpArg arg);
- void PMAXSB(X64Reg dest, OpArg arg);
- void PMAXSD(X64Reg dest, OpArg arg);
- void PMAXUW(X64Reg dest, OpArg arg);
- void PMAXUD(X64Reg dest, OpArg arg);
-
- void PMOVMSKB(X64Reg dest, OpArg arg);
- void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle);
- void PSHUFB(X64Reg dest, OpArg arg);
-
- void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle);
- void PSHUFHW(X64Reg dest, OpArg arg, u8 shuffle);
+ void PMINSB(X64Reg dest, const OpArg& arg);
+ void PMINSD(X64Reg dest, const OpArg& arg);
+ void PMINUW(X64Reg dest, const OpArg& arg);
+ void PMINUD(X64Reg dest, const OpArg& arg);
+ void PMAXSB(X64Reg dest, const OpArg& arg);
+ void PMAXSD(X64Reg dest, const OpArg& arg);
+ void PMAXUW(X64Reg dest, const OpArg& arg);
+ void PMAXUD(X64Reg dest, const OpArg& arg);
+
+ void PMOVMSKB(X64Reg dest, const OpArg& arg);
+ void PSHUFD(X64Reg dest, const OpArg& arg, u8 shuffle);
+ void PSHUFB(X64Reg dest, const OpArg& arg);
+
+ void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle);
+ void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle);
void PSRLW(X64Reg reg, int shift);
void PSRLD(X64Reg reg, int shift);
void PSRLQ(X64Reg reg, int shift);
- void PSRLQ(X64Reg reg, OpArg arg);
+ void PSRLQ(X64Reg reg, const OpArg& arg);
void PSRLDQ(X64Reg reg, int shift);
void PSLLW(X64Reg reg, int shift);
@@ -833,198 +812,198 @@ public:
void PSRAD(X64Reg reg, int shift);
// SSE4: data type conversions
- void PMOVSXBW(X64Reg dest, OpArg arg);
- void PMOVSXBD(X64Reg dest, OpArg arg);
- void PMOVSXBQ(X64Reg dest, OpArg arg);
- void PMOVSXWD(X64Reg dest, OpArg arg);
- void PMOVSXWQ(X64Reg dest, OpArg arg);
- void PMOVSXDQ(X64Reg dest, OpArg arg);
- void PMOVZXBW(X64Reg dest, OpArg arg);
- void PMOVZXBD(X64Reg dest, OpArg arg);
- void PMOVZXBQ(X64Reg dest, OpArg arg);
- void PMOVZXWD(X64Reg dest, OpArg arg);
- void PMOVZXWQ(X64Reg dest, OpArg arg);
- void PMOVZXDQ(X64Reg dest, OpArg arg);
+ void PMOVSXBW(X64Reg dest, const OpArg& arg);
+ void PMOVSXBD(X64Reg dest, const OpArg& arg);
+ void PMOVSXBQ(X64Reg dest, const OpArg& arg);
+ void PMOVSXWD(X64Reg dest, const OpArg& arg);
+ void PMOVSXWQ(X64Reg dest, const OpArg& arg);
+ void PMOVSXDQ(X64Reg dest, const OpArg& arg);
+ void PMOVZXBW(X64Reg dest, const OpArg& arg);
+ void PMOVZXBD(X64Reg dest, const OpArg& arg);
+ void PMOVZXBQ(X64Reg dest, const OpArg& arg);
+ void PMOVZXWD(X64Reg dest, const OpArg& arg);
+ void PMOVZXWQ(X64Reg dest, const OpArg& arg);
+ void PMOVZXDQ(X64Reg dest, const OpArg& arg);
// SSE4: variable blend instructions (xmm0 implicit argument)
- void PBLENDVB(X64Reg dest, OpArg arg);
- void BLENDVPS(X64Reg dest, OpArg arg);
- void BLENDVPD(X64Reg dest, OpArg arg);
+ void PBLENDVB(X64Reg dest, const OpArg& arg);
+ void BLENDVPS(X64Reg dest, const OpArg& arg);
+ void BLENDVPD(X64Reg dest, const OpArg& arg);
void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend);
void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend);
// SSE4: rounding (see FloatRound for mode or use ROUNDNEARSS, etc. helpers.)
- void ROUNDSS(X64Reg dest, OpArg arg, u8 mode);
- void ROUNDSD(X64Reg dest, OpArg arg, u8 mode);
- void ROUNDPS(X64Reg dest, OpArg arg, u8 mode);
- void ROUNDPD(X64Reg dest, OpArg arg, u8 mode);
-
- inline void ROUNDNEARSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_NEAREST); }
- inline void ROUNDFLOORSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_FLOOR); }
- inline void ROUNDCEILSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_CEIL); }
- inline void ROUNDZEROSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_ZERO); }
-
- inline void ROUNDNEARSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_NEAREST); }
- inline void ROUNDFLOORSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_FLOOR); }
- inline void ROUNDCEILSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_CEIL); }
- inline void ROUNDZEROSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_ZERO); }
-
- inline void ROUNDNEARPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_NEAREST); }
- inline void ROUNDFLOORPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_FLOOR); }
- inline void ROUNDCEILPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_CEIL); }
- inline void ROUNDZEROPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_ZERO); }
-
- inline void ROUNDNEARPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_NEAREST); }
- inline void ROUNDFLOORPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_FLOOR); }
- inline void ROUNDCEILPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_CEIL); }
- inline void ROUNDZEROPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_ZERO); }
+ void ROUNDSS(X64Reg dest, const OpArg& arg, u8 mode);
+ void ROUNDSD(X64Reg dest, const OpArg& arg, u8 mode);
+ void ROUNDPS(X64Reg dest, const OpArg& arg, u8 mode);
+ void ROUNDPD(X64Reg dest, const OpArg& arg, u8 mode);
+
+ void ROUNDNEARSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_NEAREST); }
+ void ROUNDFLOORSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_FLOOR); }
+ void ROUNDCEILSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_CEIL); }
+ void ROUNDZEROSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_ZERO); }
+
+ void ROUNDNEARSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_NEAREST); }
+ void ROUNDFLOORSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_FLOOR); }
+ void ROUNDCEILSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_CEIL); }
+ void ROUNDZEROSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_ZERO); }
+
+ void ROUNDNEARPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_NEAREST); }
+ void ROUNDFLOORPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_FLOOR); }
+ void ROUNDCEILPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_CEIL); }
+ void ROUNDZEROPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_ZERO); }
+
+ void ROUNDNEARPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_NEAREST); }
+ void ROUNDFLOORPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_FLOOR); }
+ void ROUNDCEILPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_CEIL); }
+ void ROUNDZEROPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_ZERO); }
// AVX
- void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle);
- void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
-
- void VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
-
- void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+ void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle);
+ void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+
+ void VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+
+ void VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
// FMA3
- void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+ void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
// VEX GPR instructions
- void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
- void SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
- void SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
- void RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate);
- void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
- void BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
- void BLSR(int bits, X64Reg regOp, OpArg arg);
- void BLSMSK(int bits, X64Reg regOp, OpArg arg);
- void BLSI(int bits, X64Reg regOp, OpArg arg);
- void BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
- void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
+ void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+ void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+ void SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+ void RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate);
+ void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+ void BLSR(int bits, X64Reg regOp, const OpArg& arg);
+ void BLSMSK(int bits, X64Reg regOp, const OpArg& arg);
+ void BLSI(int bits, X64Reg regOp, const OpArg& arg);
+ void BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+ void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void RDTSC();
// Utility functions
// The difference between this and CALL is that this aligns the stack
// where appropriate.
- void ABI_CallFunction(const void *func);
+ void ABI_CallFunction(const void* func);
template <typename T>
void ABI_CallFunction(T (*func)()) {
- ABI_CallFunction((const void *)func);
+ ABI_CallFunction((const void*)func);
}
- void ABI_CallFunction(const u8 *func) {
- ABI_CallFunction((const void *)func);
+ void ABI_CallFunction(const u8* func) {
+ ABI_CallFunction((const void*)func);
}
- void ABI_CallFunctionC16(const void *func, u16 param1);
- void ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2);
+ void ABI_CallFunctionC16(const void* func, u16 param1);
+ void ABI_CallFunctionCC16(const void* func, u32 param1, u16 param2);
// These only support u32 parameters, but that's enough for a lot of uses.
// These will destroy the 1 or 2 first "parameter regs".
- void ABI_CallFunctionC(const void *func, u32 param1);
- void ABI_CallFunctionCC(const void *func, u32 param1, u32 param2);
- void ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3);
- void ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3);
- void ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4);
- void ABI_CallFunctionP(const void *func, void *param1);
- void ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2);
- void ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3);
- void ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3);
- void ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2);
- void ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3);
- void ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1);
- void ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2);
+ void ABI_CallFunctionC(const void* func, u32 param1);
+ void ABI_CallFunctionCC(const void* func, u32 param1, u32 param2);
+ void ABI_CallFunctionCCC(const void* func, u32 param1, u32 param2, u32 param3);
+ void ABI_CallFunctionCCP(const void* func, u32 param1, u32 param2, void* param3);
+ void ABI_CallFunctionCCCP(const void* func, u32 param1, u32 param2, u32 param3, void* param4);
+ void ABI_CallFunctionP(const void* func, void* param1);
+ void ABI_CallFunctionPA(const void* func, void* param1, const OpArg& arg2);
+ void ABI_CallFunctionPAA(const void* func, void* param1, const OpArg& arg2, const OpArg& arg3);
+ void ABI_CallFunctionPPC(const void* func, void* param1, void* param2, u32 param3);
+ void ABI_CallFunctionAC(const void* func, const OpArg& arg1, u32 param2);
+ void ABI_CallFunctionACC(const void* func, const OpArg& arg1, u32 param2, u32 param3);
+ void ABI_CallFunctionA(const void* func, const OpArg& arg1);
+ void ABI_CallFunctionAA(const void* func, const OpArg& arg1, const OpArg& arg2);
// Pass a register as a parameter.
- void ABI_CallFunctionR(const void *func, X64Reg reg1);
- void ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2);
+ void ABI_CallFunctionR(const void* func, X64Reg reg1);
+ void ABI_CallFunctionRR(const void* func, X64Reg reg1, X64Reg reg2);
template <typename Tr, typename T1>
void ABI_CallFunctionC(Tr (*func)(T1), u32 param1) {
- ABI_CallFunctionC((const void *)func, param1);
+ ABI_CallFunctionC((const void*)func, param1);
}
// A function that doesn't have any control over what it will do to regs,
@@ -1048,9 +1027,9 @@ public:
void ABI_EmitEpilogue(int maxCallParams);
#ifdef _M_IX86
- inline int ABI_GetNumXMMRegs() { return 8; }
+ static int ABI_GetNumXMMRegs() { return 8; }
#else
- inline int ABI_GetNumXMMRegs() { return 16; }
+ static int ABI_GetNumXMMRegs() { return 16; }
#endif
}; // class XEmitter
diff --git a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
index 1d844a66e..47a9fe804 100644
--- a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
+++ b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
@@ -51,6 +51,7 @@
* ===========================================================================
*/
+#include <algorithm>
#include "common/logging/log.h"
#include "core/arm/skyeye_common/vfp/vfp.h"
#include "core/arm/skyeye_common/vfp/vfp_helper.h"
@@ -785,9 +786,7 @@ u32 vfp_double_add(struct vfp_double *vdd, struct vfp_double *vdn,struct vfp_dou
* This ensures that NaN propagation works correctly.
*/
if (vdn->exponent < vdm->exponent) {
- struct vfp_double *t = vdn;
- vdn = vdm;
- vdm = t;
+ std::swap(vdm, vdn);
}
/*
@@ -843,9 +842,7 @@ vfp_double_multiply(struct vfp_double *vdd, struct vfp_double *vdn,
* This ensures that NaN propagation works correctly.
*/
if (vdn->exponent < vdm->exponent) {
- struct vfp_double *t = vdn;
- vdn = vdm;
- vdm = t;
+ std::swap(vdm, vdn);
LOG_TRACE(Core_ARM11, "VFP: swapping M <-> N\n");
}
diff --git a/src/core/arm/skyeye_common/vfp/vfpsingle.cpp b/src/core/arm/skyeye_common/vfp/vfpsingle.cpp
index e5d339252..0fb3c3bf1 100644
--- a/src/core/arm/skyeye_common/vfp/vfpsingle.cpp
+++ b/src/core/arm/skyeye_common/vfp/vfpsingle.cpp
@@ -51,6 +51,7 @@
* ===========================================================================
*/
+#include <algorithm>
#include <cinttypes>
#include "common/common_funcs.h"
@@ -815,9 +816,7 @@ vfp_single_add(struct vfp_single *vsd, struct vfp_single *vsn,
* This ensures that NaN propagation works correctly.
*/
if (vsn->exponent < vsm->exponent) {
- struct vfp_single *t = vsn;
- vsn = vsm;
- vsm = t;
+ std::swap(vsm, vsn);
}
/*
@@ -872,9 +871,7 @@ vfp_single_multiply(struct vfp_single *vsd, struct vfp_single *vsn, struct vfp_s
* This ensures that NaN propagation works correctly.
*/
if (vsn->exponent < vsm->exponent) {
- struct vfp_single *t = vsn;
- vsn = vsm;
- vsm = t;
+ std::swap(vsm, vsn);
LOG_TRACE(Core_ARM11, "swapping M <-> N");
}
diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index 5050b9f7d..fde508a13 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -417,7 +417,7 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
case CommandId::SET_DISPLAY_TRANSFER:
{
- auto& params = command.image_copy;
+ auto& params = command.display_transfer;
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
@@ -432,17 +432,22 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
// TODO: Check if texture copies are implemented correctly..
case CommandId::SET_TEXTURE_COPY:
{
- auto& params = command.image_copy;
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
+ auto& params = command.texture_copy;
+ WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.input_address),
Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
+ WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.output_address),
Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.flags)), params.flags);
-
- // TODO: Should this register be set to 1 or should instead its value be OR-ed with 1?
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.trigger)), 1);
+ WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.size),
+ params.size);
+ WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.input_size),
+ params.in_width_gap);
+ WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.output_size),
+ params.out_width_gap);
+ WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.flags),
+ params.flags);
+
+ // NOTE: Actual GSP ORs 1 with current register instead of overwriting. Doesn't seem to matter.
+ WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.trigger), 1);
break;
}
diff --git a/src/core/hle/service/gsp_gpu.h b/src/core/hle/service/gsp_gpu.h
index c89d0a467..8bcb30ad1 100644
--- a/src/core/hle/service/gsp_gpu.h
+++ b/src/core/hle/service/gsp_gpu.h
@@ -127,7 +127,16 @@ struct Command {
u32 in_buffer_size;
u32 out_buffer_size;
u32 flags;
- } image_copy;
+ } display_transfer;
+
+ struct {
+ u32 in_buffer_address;
+ u32 out_buffer_address;
+ u32 size;
+ u32 in_width_gap;
+ u32 out_width_gap;
+ u32 flags;
+ } texture_copy;
u8 raw_data[0x1C];
};
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index 3ccbc03b2..68ae38289 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -3,6 +3,7 @@
// Refer to the license.txt file included.
#include <cstring>
+#include <numeric>
#include <type_traits>
#include "common/color.h"
@@ -158,14 +159,59 @@ inline void Write(u32 addr, const T data) {
u8* src_pointer = Memory::GetPhysicalPointer(config.GetPhysicalInputAddress());
u8* dst_pointer = Memory::GetPhysicalPointer(config.GetPhysicalOutputAddress());
+ if (config.is_texture_copy) {
+ u32 input_width = config.texture_copy.input_width * 16;
+ u32 input_gap = config.texture_copy.input_gap * 16;
+ u32 output_width = config.texture_copy.output_width * 16;
+ u32 output_gap = config.texture_copy.output_gap * 16;
+
+ size_t contiguous_input_size = config.texture_copy.size / input_width * (input_width + input_gap);
+ VideoCore::g_renderer->hw_rasterizer->NotifyPreRead(config.GetPhysicalInputAddress(), contiguous_input_size);
+
+ u32 remaining_size = config.texture_copy.size;
+ u32 remaining_input = input_width;
+ u32 remaining_output = output_width;
+ while (remaining_size > 0) {
+ u32 copy_size = std::min({ remaining_input, remaining_output, remaining_size });
+
+ std::memcpy(dst_pointer, src_pointer, copy_size);
+ src_pointer += copy_size;
+ dst_pointer += copy_size;
+
+ remaining_input -= copy_size;
+ remaining_output -= copy_size;
+ remaining_size -= copy_size;
+
+ if (remaining_input == 0) {
+ remaining_input = input_width;
+ src_pointer += input_gap;
+ }
+ if (remaining_output == 0) {
+ remaining_output = output_width;
+ dst_pointer += output_gap;
+ }
+ }
+
+ LOG_TRACE(HW_GPU, "TextureCopy: 0x%X bytes from 0x%08X(%u+%u)-> 0x%08X(%u+%u), flags 0x%08X",
+ config.texture_copy.size,
+ config.GetPhysicalInputAddress(), input_width, input_gap,
+ config.GetPhysicalOutputAddress(), output_width, output_gap,
+ config.flags);
+
+ size_t contiguous_output_size = config.texture_copy.size / output_width * (output_width + output_gap);
+ VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetPhysicalOutputAddress(), contiguous_output_size);
+
+ GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PPF);
+ break;
+ }
+
if (config.scaling > config.ScaleXY) {
LOG_CRITICAL(HW_GPU, "Unimplemented display transfer scaling mode %u", config.scaling.Value());
UNIMPLEMENTED();
break;
}
- if (config.output_tiled &&
- (config.scaling == config.ScaleXY || config.scaling == config.ScaleX)) {
+ if (config.input_linear && config.scaling != config.NoScale) {
LOG_CRITICAL(HW_GPU, "Scaling is only implemented on tiled input");
UNIMPLEMENTED();
break;
@@ -182,23 +228,6 @@ inline void Write(u32 addr, const T data) {
VideoCore::g_renderer->hw_rasterizer->NotifyPreRead(config.GetPhysicalInputAddress(), input_size);
- if (config.raw_copy) {
- // Raw copies do not perform color conversion nor tiled->linear / linear->tiled conversions
- // TODO(Subv): Verify if raw copies perform scaling
- memcpy(dst_pointer, src_pointer, output_size);
-
- LOG_TRACE(HW_GPU, "DisplayTriggerTransfer: 0x%08x bytes from 0x%08x(%ux%u)-> 0x%08x(%ux%u), output format: %x, flags 0x%08X, Raw copy",
- output_size,
- config.GetPhysicalInputAddress(), config.input_width.Value(), config.input_height.Value(),
- config.GetPhysicalOutputAddress(), config.output_width.Value(), config.output_height.Value(),
- config.output_format.Value(), config.flags);
-
- GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PPF);
-
- VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetPhysicalOutputAddress(), output_size);
- break;
- }
-
for (u32 y = 0; y < output_height; ++y) {
for (u32 x = 0; x < output_width; ++x) {
Math::Vec4<u8> src_color;
@@ -220,7 +249,7 @@ inline void Write(u32 addr, const T data) {
u32 src_offset;
u32 dst_offset;
- if (config.output_tiled) {
+ if (config.input_linear) {
if (!config.dont_swizzle) {
// Interpret the input as linear and the output as tiled
u32 coarse_y = y & ~7;
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index daad506fe..2e3a9f779 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -201,12 +201,14 @@ struct Regs {
u32 flags;
BitField< 0, 1, u32> flip_vertically; // flips input data vertically
- BitField< 1, 1, u32> output_tiled; // Converts from linear to tiled format
- BitField< 3, 1, u32> raw_copy; // Copies the data without performing any processing
+ BitField< 1, 1, u32> input_linear; // Input is in linear format; converts from linear to tiled format
+ BitField< 2, 1, u32> crop_input_lines;
+ BitField< 3, 1, u32> is_texture_copy; // Copies the data without performing any processing, honoring the texture copy fields
BitField< 5, 1, u32> dont_swizzle;
BitField< 8, 3, PixelFormat> input_format;
BitField<12, 3, PixelFormat> output_format;
-
+ /// Uses some kind of 32x32 block swizzling mode, instead of the usual 8x8 one.
+ BitField<16, 1, u32> block_32; // TODO(yuriks): unimplemented
BitField<24, 2, ScalingMode> scaling; // Determines the scaling mode of the transfer
};
@@ -214,10 +216,30 @@ struct Regs {
// it seems that writing to this field triggers the display transfer
u32 trigger;
+
+ INSERT_PADDING_WORDS(0x1);
+
+ struct {
+ u32 size;
+
+ union {
+ u32 input_size;
+
+ BitField< 0, 16, u32> input_width;
+ BitField<16, 16, u32> input_gap;
+ };
+
+ union {
+ u32 output_size;
+
+ BitField< 0, 16, u32> output_width;
+ BitField<16, 16, u32> output_gap;
+ };
+ } texture_copy;
} display_transfer_config;
- ASSERT_MEMBER_SIZE(display_transfer_config, 0x1c);
+ ASSERT_MEMBER_SIZE(display_transfer_config, 0x2c);
- INSERT_PADDING_WORDS(0x331);
+ INSERT_PADDING_WORDS(0x32D);
struct {
// command list size (in bytes)
diff --git a/src/core/loader/loader.cpp b/src/core/loader/loader.cpp
index f5b349a77..062291006 100644
--- a/src/core/loader/loader.cpp
+++ b/src/core/loader/loader.cpp
@@ -77,6 +77,8 @@ static const char* GetFileTypeString(FileType type) {
return "NCSD";
case FileType::CXI:
return "NCCH";
+ case FileType::CIA:
+ return "CIA";
case FileType::ELF:
return "ELF";
case FileType::THREEDSX:
@@ -134,6 +136,10 @@ ResultStatus LoadFile(const std::string& filename) {
break;
}
+ // CIA file format...
+ case FileType::CIA:
+ return ResultStatus::ErrorNotImplemented;
+
// Error occurred durring IdentifyFile...
case FileType::Error:
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 374c4748d..d82e20f86 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -35,7 +35,15 @@ static u32 default_attr_write_buffer[3];
Common::Profiling::TimingCategory category_drawing("Drawing");
-static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
+// Expand a 4-bit mask to 4-byte mask, e.g. 0b0101 -> 0x00FF00FF
+static const u32 expand_bits_to_bytes[] = {
+ 0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff,
+ 0x00ff0000, 0x00ff00ff, 0x00ffff00, 0x00ffffff,
+ 0xff000000, 0xff0000ff, 0xff00ff00, 0xff00ffff,
+ 0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff
+};
+
+static void WritePicaReg(u32 id, u32 value, u32 mask) {
auto& regs = g_state.regs;
if (id >= regs.NumIds())
@@ -47,13 +55,16 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
// TODO: Figure out how register masking acts on e.g. vs.uniform_setup.set_value
u32 old_value = regs[id];
- regs[id] = (old_value & ~mask) | (value & mask);
+
+ const u32 write_mask = expand_bits_to_bytes[mask];
+
+ regs[id] = (old_value & ~write_mask) | (value & write_mask);
+
+ DebugUtils::OnPicaRegWrite({ (u16)id, (u16)mask, regs[id] });
if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::PicaCommandLoaded, reinterpret_cast<void*>(&id));
- DebugUtils::OnPicaRegWrite(id, regs[id]);
-
switch(id) {
// Trigger IRQ
case PICA_REG_INDEX(trigger_irq):
@@ -215,7 +226,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
unsigned int vertex_cache_pos = 0;
vertex_cache_ids.fill(-1);
- Shader::UnitState shader_unit;
+ Shader::UnitState<false> shader_unit;
Shader::Setup(shader_unit);
for (unsigned int index = 0; index < regs.num_vertices; ++index)
@@ -469,13 +480,6 @@ void ProcessCommandList(const u32* list, u32 size) {
g_state.cmd_list.length = size / sizeof(u32);
while (g_state.cmd_list.current_ptr < g_state.cmd_list.head_ptr + g_state.cmd_list.length) {
- // Expand a 4-bit mask to 4-byte mask, e.g. 0b0101 -> 0x00FF00FF
- static const u32 expand_bits_to_bytes[] = {
- 0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff,
- 0x00ff0000, 0x00ff00ff, 0x00ffff00, 0x00ffffff,
- 0xff000000, 0xff0000ff, 0xff00ff00, 0xff00ffff,
- 0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff
- };
// Align read pointer to 8 bytes
if ((g_state.cmd_list.head_ptr - g_state.cmd_list.current_ptr) % 2 != 0)
@@ -483,14 +487,13 @@ void ProcessCommandList(const u32* list, u32 size) {
u32 value = *g_state.cmd_list.current_ptr++;
const CommandHeader header = { *g_state.cmd_list.current_ptr++ };
- const u32 write_mask = expand_bits_to_bytes[header.parameter_mask];
u32 cmd = header.cmd_id;
- WritePicaReg(cmd, value, write_mask);
+ WritePicaReg(cmd, value, header.parameter_mask);
for (unsigned i = 0; i < header.extra_data_length; ++i) {
u32 cmd = header.cmd_id + (header.group_commands ? i + 1 : 0);
- WritePicaReg(cmd, *g_state.cmd_list.current_ptr++, write_mask);
+ WritePicaReg(cmd, *g_state.cmd_list.current_ptr++, header.parameter_mask);
}
}
}
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 572b4fd62..8ad77f0c8 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -4,9 +4,10 @@
#include <algorithm>
#include <condition_variable>
+#include <cstring>
+#include <fstream>
#include <list>
#include <map>
-#include <fstream>
#include <mutex>
#include <string>
@@ -14,6 +15,7 @@
#include <png.h>
#endif
+#include <nihstro/float24.h>
#include <nihstro/shader_binary.h>
#include "common/assert.h"
@@ -63,7 +65,7 @@ void DebugContext::OnEvent(Event event, void* data) {
void DebugContext::Resume() {
{
- std::unique_lock<std::mutex> lock(breakpoint_mutex);
+ std::lock_guard<std::mutex> lock(breakpoint_mutex);
// Tell all observers that we are about to resume
for (auto& breakpoint_observer : breakpoint_observers) {
@@ -110,8 +112,7 @@ void GeometryDumper::Dump() {
}
-void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data, u32 swizzle_size,
- u32 main_offset, const Regs::VSOutputAttributes* output_attributes)
+void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, const State::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes)
{
struct StuffToWrite {
u8* pointer;
@@ -131,11 +132,14 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
// into shbin format (separate type and component mask).
union OutputRegisterInfo {
enum Type : u64 {
- POSITION = 0,
- COLOR = 2,
- TEXCOORD0 = 3,
- TEXCOORD1 = 5,
- TEXCOORD2 = 6,
+ POSITION = 0,
+ QUATERNION = 1,
+ COLOR = 2,
+ TEXCOORD0 = 3,
+ TEXCOORD1 = 5,
+ TEXCOORD2 = 6,
+
+ VIEW = 8,
};
BitField< 0, 64, u64> hex;
@@ -157,6 +161,10 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
{ OutputAttributes::POSITION_Y, { OutputRegisterInfo::POSITION, 2} },
{ OutputAttributes::POSITION_Z, { OutputRegisterInfo::POSITION, 4} },
{ OutputAttributes::POSITION_W, { OutputRegisterInfo::POSITION, 8} },
+ { OutputAttributes::QUATERNION_X, { OutputRegisterInfo::QUATERNION, 1} },
+ { OutputAttributes::QUATERNION_Y, { OutputRegisterInfo::QUATERNION, 2} },
+ { OutputAttributes::QUATERNION_Z, { OutputRegisterInfo::QUATERNION, 4} },
+ { OutputAttributes::QUATERNION_W, { OutputRegisterInfo::QUATERNION, 8} },
{ OutputAttributes::COLOR_R, { OutputRegisterInfo::COLOR, 1} },
{ OutputAttributes::COLOR_G, { OutputRegisterInfo::COLOR, 2} },
{ OutputAttributes::COLOR_B, { OutputRegisterInfo::COLOR, 4} },
@@ -166,7 +174,10 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
{ OutputAttributes::TEXCOORD1_U, { OutputRegisterInfo::TEXCOORD1, 1} },
{ OutputAttributes::TEXCOORD1_V, { OutputRegisterInfo::TEXCOORD1, 2} },
{ OutputAttributes::TEXCOORD2_U, { OutputRegisterInfo::TEXCOORD2, 1} },
- { OutputAttributes::TEXCOORD2_V, { OutputRegisterInfo::TEXCOORD2, 2} }
+ { OutputAttributes::TEXCOORD2_V, { OutputRegisterInfo::TEXCOORD2, 2} },
+ { OutputAttributes::VIEW_X, { OutputRegisterInfo::VIEW, 1} },
+ { OutputAttributes::VIEW_Y, { OutputRegisterInfo::VIEW, 2} },
+ { OutputAttributes::VIEW_Z, { OutputRegisterInfo::VIEW, 4} }
};
for (const auto& semantic : std::vector<OutputAttributes::Semantic>{
@@ -221,28 +232,69 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
// TODO: Reduce the amount of binary code written to relevant portions
dvlp.binary_offset = write_offset - dvlp_offset;
- dvlp.binary_size_words = binary_size;
- QueueForWriting((u8*)binary_data, binary_size * sizeof(u32));
+ dvlp.binary_size_words = setup.program_code.size();
+ QueueForWriting((u8*)setup.program_code.data(), setup.program_code.size() * sizeof(u32));
dvlp.swizzle_info_offset = write_offset - dvlp_offset;
- dvlp.swizzle_info_num_entries = swizzle_size;
+ dvlp.swizzle_info_num_entries = setup.swizzle_data.size();
u32 dummy = 0;
- for (unsigned int i = 0; i < swizzle_size; ++i) {
- QueueForWriting((u8*)&swizzle_data[i], sizeof(swizzle_data[i]));
+ for (unsigned int i = 0; i < setup.swizzle_data.size(); ++i) {
+ QueueForWriting((u8*)&setup.swizzle_data[i], sizeof(setup.swizzle_data[i]));
QueueForWriting((u8*)&dummy, sizeof(dummy));
}
- dvle.main_offset_words = main_offset;
+ dvle.main_offset_words = config.main_offset;
dvle.output_register_table_offset = write_offset - dvlb.dvle_offset;
dvle.output_register_table_size = static_cast<u32>(output_info_table.size());
QueueForWriting((u8*)output_info_table.data(), static_cast<u32>(output_info_table.size() * sizeof(OutputRegisterInfo)));
// TODO: Create a label table for "main"
+ std::vector<nihstro::ConstantInfo> constant_table;
+ for (unsigned i = 0; i < setup.uniforms.b.size(); ++i) {
+ nihstro::ConstantInfo constant;
+ memset(&constant, 0, sizeof(constant));
+ constant.type = nihstro::ConstantInfo::Bool;
+ constant.regid = i;
+ constant.b = setup.uniforms.b[i];
+ constant_table.emplace_back(constant);
+ }
+ for (unsigned i = 0; i < setup.uniforms.i.size(); ++i) {
+ nihstro::ConstantInfo constant;
+ memset(&constant, 0, sizeof(constant));
+ constant.type = nihstro::ConstantInfo::Int;
+ constant.regid = i;
+ constant.i.x = setup.uniforms.i[i].x;
+ constant.i.y = setup.uniforms.i[i].y;
+ constant.i.z = setup.uniforms.i[i].z;
+ constant.i.w = setup.uniforms.i[i].w;
+ constant_table.emplace_back(constant);
+ }
+ for (unsigned i = 0; i < sizeof(setup.uniforms.f) / sizeof(setup.uniforms.f[0]); ++i) {
+ nihstro::ConstantInfo constant;
+ memset(&constant, 0, sizeof(constant));
+ constant.type = nihstro::ConstantInfo::Float;
+ constant.regid = i;
+ constant.f.x = nihstro::to_float24(setup.uniforms.f[i].x.ToFloat32());
+ constant.f.y = nihstro::to_float24(setup.uniforms.f[i].y.ToFloat32());
+ constant.f.z = nihstro::to_float24(setup.uniforms.f[i].z.ToFloat32());
+ constant.f.w = nihstro::to_float24(setup.uniforms.f[i].w.ToFloat32());
+
+ // Store constant if it's different from zero..
+ if (setup.uniforms.f[i].x.ToFloat32() != 0.0 ||
+ setup.uniforms.f[i].y.ToFloat32() != 0.0 ||
+ setup.uniforms.f[i].z.ToFloat32() != 0.0 ||
+ setup.uniforms.f[i].w.ToFloat32() != 0.0)
+ constant_table.emplace_back(constant);
+ }
+ dvle.constant_table_offset = write_offset - dvlb.dvle_offset;
+ dvle.constant_table_size = constant_table.size();
+ for (const auto& constant : constant_table) {
+ QueueForWriting((uint8_t*)&constant, sizeof(constant));
+ }
// Write data to file
static int dump_index = 0;
- std::string filename = std::string("shader_dump") + std::to_string(++dump_index) + std::string(".shbin");
std::ofstream file(filename, std::ios_base::out | std::ios_base::binary);
for (auto& chunk : writing_queue) {
@@ -261,11 +313,10 @@ void StartPicaTracing()
return;
}
- pica_trace_mutex.lock();
+ std::lock_guard<std::mutex> lock(pica_trace_mutex);
pica_trace = std::unique_ptr<PicaTrace>(new PicaTrace);
is_pica_tracing = true;
- pica_trace_mutex.unlock();
}
bool IsPicaTracing()
@@ -273,18 +324,18 @@ bool IsPicaTracing()
return is_pica_tracing != 0;
}
-void OnPicaRegWrite(u32 id, u32 value)
+void OnPicaRegWrite(PicaTrace::Write write)
{
// Double check for is_pica_tracing to avoid pointless locking overhead
if (!is_pica_tracing)
return;
- std::unique_lock<std::mutex> lock(pica_trace_mutex);
+ std::lock_guard<std::mutex> lock(pica_trace_mutex);
if (!is_pica_tracing)
return;
- pica_trace->writes.emplace_back(id, value);
+ pica_trace->writes.push_back(write);
}
std::unique_ptr<PicaTrace> FinishPicaTracing()
@@ -298,9 +349,9 @@ std::unique_ptr<PicaTrace> FinishPicaTracing()
is_pica_tracing = false;
// Wait until running tracing is finished
- pica_trace_mutex.lock();
+ std::lock_guard<std::mutex> lock(pica_trace_mutex);
std::unique_ptr<PicaTrace> ret(std::move(pica_trace));
- pica_trace_mutex.unlock();
+
return std::move(ret);
}
@@ -359,6 +410,12 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
}
}
+ case Regs::TextureFormat::RG8:
+ {
+ auto res = Color::DecodeRG8(source + VideoCore::GetMortonOffset(x, y, 2));
+ return { res.r(), res.g(), 0, 255 };
+ }
+
case Regs::TextureFormat::I8:
{
const u8* source_ptr = source + VideoCore::GetMortonOffset(x, y, 1);
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index 81eea30a9..85762f5b4 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -158,7 +158,6 @@ extern std::shared_ptr<DebugContext> g_debug_context; // TODO: Get rid of this g
namespace DebugUtils {
#define PICA_DUMP_GEOMETRY 0
-#define PICA_DUMP_SHADERS 0
#define PICA_DUMP_TEXTURES 0
#define PICA_LOG_TEV 0
@@ -182,27 +181,23 @@ private:
std::vector<Face> faces;
};
-void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data, u32 swizzle_size,
- u32 main_offset, const Regs::VSOutputAttributes* output_attributes);
+void DumpShader(const std::string& filename, const Regs::ShaderConfig& config,
+ const State::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes);
// Utility class to log Pica commands.
struct PicaTrace {
- struct Write : public std::pair<u32,u32> {
- Write(u32 id, u32 value) : std::pair<u32,u32>(id, value) {}
-
- u32& Id() { return first; }
- const u32& Id() const { return first; }
-
- u32& Value() { return second; }
- const u32& Value() const { return second; }
+ struct Write {
+ u16 cmd_id;
+ u16 mask;
+ u32 value;
};
std::vector<Write> writes;
};
void StartPicaTracing();
bool IsPicaTracing();
-void OnPicaRegWrite(u32 id, u32 value);
+void OnPicaRegWrite(PicaTrace::Write write);
std::unique_ptr<PicaTrace> FinishPicaTracing();
struct TextureInfo {
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 6ce90f95a..58b924f9e 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -80,6 +80,11 @@ struct Regs {
POSITION_Z = 2,
POSITION_W = 3,
+ QUATERNION_X = 4,
+ QUATERNION_Y = 5,
+ QUATERNION_Z = 6,
+ QUATERNION_W = 7,
+
COLOR_R = 8,
COLOR_G = 9,
COLOR_B = 10,
@@ -89,6 +94,12 @@ struct Regs {
TEXCOORD0_V = 13,
TEXCOORD1_U = 14,
TEXCOORD1_V = 15,
+
+ // TODO: Not verified
+ VIEW_X = 18,
+ VIEW_Y = 19,
+ VIEW_Z = 20,
+
TEXCOORD2_U = 22,
TEXCOORD2_V = 23,
@@ -163,7 +174,7 @@ struct Regs {
RGB565 = 3,
RGBA4 = 4,
IA8 = 5,
-
+ RG8 = 6, ///< @note Also called HILO8 in 3DBrew.
I8 = 7,
A8 = 8,
IA4 = 9,
@@ -204,6 +215,7 @@ struct Regs {
case TextureFormat::RGB565:
case TextureFormat::RGBA4:
case TextureFormat::IA8:
+ case TextureFormat::RG8:
return 4;
case TextureFormat::I4:
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 6a27a8015..4e9836c80 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -5,6 +5,8 @@
#include <memory>
#include <unordered_map>
+#include <boost/range/algorithm/fill.hpp>
+
#include "common/hash.h"
#include "common/make_unique.h"
#include "common/profiler.h"
@@ -30,7 +32,7 @@ static JitCompiler jit;
static CompiledShader* jit_shader;
#endif // ARCHITECTURE_x86_64
-void Setup(UnitState& state) {
+void Setup(UnitState<false>& state) {
#ifdef ARCHITECTURE_x86_64
if (VideoCore::g_shader_jit_enabled) {
u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
@@ -54,9 +56,8 @@ void Shutdown() {
static Common::Profiling::TimingCategory shader_category("Vertex Shader");
-OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) {
+OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
auto& config = g_state.regs.vs;
- auto& setup = g_state.vs;
Common::Profiling::ScopeTimer timer(shader_category);
@@ -67,6 +68,8 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes)
// Setup input register table
const auto& attribute_register_map = config.input_register_map;
+ // TODO: Instead of this cumbersome logic, just load the input data directly like
+ // for (int attr = 0; attr < num_attributes; ++attr) { input_attr[0] = state.registers.input[attribute_register_map.attribute0_register]; }
if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0];
if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1];
if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2];
@@ -96,12 +99,6 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes)
RunInterpreter(state);
#endif // ARCHITECTURE_x86_64
-#if PICA_DUMP_SHADERS
- DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(),
- state.debug.max_opdesc_id, config.main_offset,
- g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here
-#endif
-
// Setup output data
OutputVertex ret;
// TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
@@ -132,14 +129,52 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes)
std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f));
}
- LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
+ LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), quat (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
+ ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(),
ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());
return ret;
}
+DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) {
+ UnitState<true> state;
+
+ const auto& shader_memory = setup.program_code;
+ state.program_counter = config.main_offset;
+ state.debug.max_offset = 0;
+ state.debug.max_opdesc_id = 0;
+
+ // Setup input register table
+ const auto& attribute_register_map = config.input_register_map;
+ float24 dummy_register;
+ boost::fill(state.registers.input, &dummy_register);
+
+ if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = &input.attr[0].x;
+ if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = &input.attr[1].x;
+ if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = &input.attr[2].x;
+ if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = &input.attr[3].x;
+ if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = &input.attr[4].x;
+ if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = &input.attr[5].x;
+ if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = &input.attr[6].x;
+ if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = &input.attr[7].x;
+ if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = &input.attr[8].x;
+ if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = &input.attr[9].x;
+ if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = &input.attr[10].x;
+ if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = &input.attr[11].x;
+ if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = &input.attr[12].x;
+ if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = &input.attr[13].x;
+ if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = &input.attr[14].x;
+ if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = &input.attr[15].x;
+
+ state.conditional_code[0] = false;
+ state.conditional_code[1] = false;
+
+ RunInterpreter(state);
+ return state.debug;
+}
+
} // namespace Shader
} // namespace Pica
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 2007a2844..bac51ddd8 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -4,7 +4,10 @@
#pragma once
+#include <vector>
+
#include <boost/container/static_vector.hpp>
+
#include <nihstro/shader_binary.h>
#include "common/common_funcs.h"
@@ -30,7 +33,7 @@ struct OutputVertex {
// VS output attributes
Math::Vec4<float24> pos;
- Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
+ Math::Vec4<float24> quat;
Math::Vec4<float24> color;
Math::Vec2<float24> tc0;
Math::Vec2<float24> tc1;
@@ -72,12 +75,185 @@ struct OutputVertex {
static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
+
+// Helper structure used to keep track of data useful for inspection of shader emulation
+template<bool full_debugging>
+struct DebugData;
+
+/**
+ * Debug bookkeeping used when full shader debugging is disabled. Only two high-water marks are
+ * tracked; no per-instruction records are stored.
+ */
+template<>
+struct DebugData<false> {
+    // TODO: Hide these behind an interface and move them to DebugData<true>
+    // Both counters start at zero: the interpreter folds into max_offset with std::max, which
+    // would otherwise read an indeterminate value if a caller did not reset it first.
+    u32 max_offset = 0;    // maximum program counter ever reached
+    u32 max_opdesc_id = 0; // maximum swizzle pattern index ever used
+};
+
+/**
+ * Debug bookkeeping used when full shader debugging is enabled. In addition to the two
+ * high-water marks it keeps one Record per recorded slot, describing the operands and results
+ * that were captured for that slot.
+ */
+template<>
+struct DebugData<true> {
+    // Records store the input and output operands of a particular instruction.
+    struct Record {
+        // Bit flags identifying which fields of a Record have been written.
+        enum Type {
+            // Floating point arithmetic operands
+            SRC1 = 0x1,
+            SRC2 = 0x2,
+            SRC3 = 0x4,
+
+            // Initial and final output operand value
+            DEST_IN = 0x8,
+            DEST_OUT = 0x10,
+
+            // Current and next instruction offset (in words)
+            CUR_INSTR = 0x20,
+            NEXT_INSTR = 0x40,
+
+            // Output address register value
+            ADDR_REG_OUT = 0x80,
+
+            // Result of a comparison instruction
+            CMP_RESULT = 0x100,
+
+            // Input values for conditional flow control instructions
+            COND_BOOL_IN = 0x200,
+            COND_CMP_IN = 0x400,
+
+            // Input values for a loop
+            LOOP_INT_IN = 0x800,
+        };
+
+        // The fields below are only meaningful when the matching Type bit is set in `mask`.
+        Math::Vec4<float24> src1;
+        Math::Vec4<float24> src2;
+        Math::Vec4<float24> src3;
+
+        Math::Vec4<float24> dest_in;
+        Math::Vec4<float24> dest_out;
+
+        s32 address_registers[2];
+        bool conditional_code[2];
+        bool cond_bool;
+        bool cond_cmp[2];
+        Math::Vec4<u8> loop_int;
+
+        u32 instruction_offset;
+        u32 next_instruction;
+
+        // set of enabled fields (as a combination of Type flags)
+        unsigned mask = 0;
+    };
+
+    // Both counters start at zero: the interpreter folds into max_offset with std::max, which
+    // would otherwise read an indeterminate value if a caller did not reset it first.
+    u32 max_offset = 0;    // maximum program counter ever reached
+    u32 max_opdesc_id = 0; // maximum swizzle pattern index ever used
+
+    // List of records for each executed shader instruction
+    std::vector<DebugData<true>::Record> records;
+};
+
+// Type alias for better readability
+using DebugDataRecord = DebugData<true>::Record;
+
+// Helper function to set a DebugData<true>::Record field based on the template enum parameter.
+template<DebugDataRecord::Type type, typename ValueType>
+inline void SetField(DebugDataRecord& record, ValueType value);
+
+// SRC1: copies the four components pointed to by `value` into the record's first source operand.
+// `value` must point to at least four float24 elements.
+template<>
+inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) {
+    record.src1 = Math::Vec4<float24>(value[0], value[1], value[2], value[3]);
+}
+
+// SRC2: copies the four components pointed to by `value` into the record's second source operand.
+// `value` must point to at least four float24 elements.
+template<>
+inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) {
+    record.src2 = Math::Vec4<float24>(value[0], value[1], value[2], value[3]);
+}
+
+// SRC3: copies the four components pointed to by `value` into the record's third source operand.
+// `value` must point to at least four float24 elements.
+template<>
+inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) {
+    record.src3 = Math::Vec4<float24>(value[0], value[1], value[2], value[3]);
+}
+
+// DEST_IN: stores the four components pointed to by `value` as the destination operand's value
+// before the instruction. `value` must point to at least four float24 elements.
+template<>
+inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) {
+    record.dest_in = Math::Vec4<float24>(value[0], value[1], value[2], value[3]);
+}
+
+// DEST_OUT: stores the four components pointed to by `value` as the destination operand's value
+// after the instruction. `value` must point to at least four float24 elements.
+template<>
+inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) {
+    record.dest_out = Math::Vec4<float24>(value[0], value[1], value[2], value[3]);
+}
+
+// ADDR_REG_OUT: copies the first two address register values from `value` into the record.
+// `value` must point to at least two s32 elements.
+template<>
+inline void SetField<DebugDataRecord::ADDR_REG_OUT>(DebugDataRecord& record, s32* value) {
+    for (int i = 0; i < 2; ++i) {
+        record.address_registers[i] = value[i];
+    }
+}
+
+// CMP_RESULT: copies the two comparison results from `value` into the record.
+// `value` must point to at least two bool elements.
+template<>
+inline void SetField<DebugDataRecord::CMP_RESULT>(DebugDataRecord& record, bool* value) {
+    for (int i = 0; i < 2; ++i) {
+        record.conditional_code[i] = value[i];
+    }
+}
+
+// COND_BOOL_IN: stores the boolean input of a conditional flow control instruction.
+template<>
+inline void SetField<DebugDataRecord::COND_BOOL_IN>(DebugDataRecord& record, bool value) {
+ record.cond_bool = value;
+}
+
+// COND_CMP_IN: copies the two comparison flags consumed by a conditional flow control
+// instruction. `value` must point to at least two bool elements.
+template<>
+inline void SetField<DebugDataRecord::COND_CMP_IN>(DebugDataRecord& record, bool* value) {
+    for (int i = 0; i < 2; ++i) {
+        record.cond_cmp[i] = value[i];
+    }
+}
+
+// LOOP_INT_IN: stores the four-component integer value that was used as input for a loop.
+template<>
+inline void SetField<DebugDataRecord::LOOP_INT_IN>(DebugDataRecord& record, Math::Vec4<u8> value) {
+ record.loop_int = value;
+}
+
+// CUR_INSTR: stores the offset of the instruction this record describes.
+template<>
+inline void SetField<DebugDataRecord::CUR_INSTR>(DebugDataRecord& record, u32 value) {
+ record.instruction_offset = value;
+}
+
+// NEXT_INSTR: stores the offset of the instruction that follows the one this record describes.
+template<>
+inline void SetField<DebugDataRecord::NEXT_INSTR>(DebugDataRecord& record, u32 value) {
+ record.next_instruction = value;
+}
+
+// Records one piece of debug information for the shader iteration identified by `offset`.
+// This overload is selected when debugging is disabled and intentionally does nothing.
+template<DebugDataRecord::Type type, typename ValueType>
+inline void Record(DebugData<false>& debug_data, u32 offset, ValueType value) {
+    // Debugging disabled => nothing to do
+}
+
+// Overload used when debugging is enabled: grows the record list so that slot `offset` exists,
+// stores `value` in the field selected by `type`, and flags that field as valid in the mask.
+template<DebugDataRecord::Type type, typename ValueType>
+inline void Record(DebugData<true>& debug_data, u32 offset, ValueType value) {
+    auto& records = debug_data.records;
+    if (records.size() <= offset) {
+        records.resize(offset + 1);
+    }
+
+    DebugDataRecord& entry = records[offset];
+    SetField<type, ValueType>(entry, value);
+    entry.mask |= type;
+}
+
+
/**
* This structure contains the state information that needs to be unique for a shader unit. The 3DS
* has four shader units that process shaders in parallel. At the present, Citra only implements a
* single shader unit that processes all shaders serially. Putting the state information in a struct
* here will make it easier for us to parallelize the shader processing later.
*/
+template<bool Debug>
struct UnitState {
struct Registers {
// The registers are accessed by the shader JIT using SSE instructions, and are therefore
@@ -111,10 +287,7 @@ struct UnitState {
// TODO: Is there a maximal size for this?
boost::container::static_vector<CallStackElement, 16> call_stack;
- struct {
- u32 max_offset; // maximum program counter ever reached
- u32 max_opdesc_id; // maximum swizzle pattern index ever used
- } debug;
+ DebugData<Debug> debug;
static int InputOffset(const SourceRegister& reg) {
switch (reg.GetRegisterType()) {
@@ -150,7 +323,7 @@ struct UnitState {
* vertex, which would happen within the `Run` function).
* @param state Shader unit state, must be setup per shader and per shader unit
*/
-void Setup(UnitState& state);
+void Setup(UnitState<false>& state);
/// Performs any cleanup when the emulator is shutdown
void Shutdown();
@@ -162,7 +335,17 @@ void Shutdown();
* @param num_attributes The number of vertex shader attributes
* @return The output vertex, after having been processed by the vertex shader
*/
-OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes);
+OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+
+/**
+ * Produce debug information based on the given shader and input vertex
+ * @param input Input vertex into the shader
+ * @param num_attributes The number of vertex shader attributes
+ * @param config Configuration object for the shader pipeline
+ * @param setup Setup object for the shader pipeline
+ * @return Debug information for this shader with regards to the given vertex
+ */
+DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup);
} // namespace Shader
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index c8489f920..063cc38f0 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -21,7 +21,8 @@ namespace Pica {
namespace Shader {
-void RunInterpreter(UnitState& state) {
+template<bool Debug>
+void RunInterpreter(UnitState<Debug>& state) {
const auto& uniforms = g_state.vs.uniforms;
const auto& swizzle_data = g_state.vs.swizzle_data;
const auto& program_code = g_state.vs.program_code;
@@ -29,7 +30,9 @@ void RunInterpreter(UnitState& state) {
// Placeholder for invalid inputs
static float24 dummy_vec4_float24[4];
- while (true) {
+ unsigned iteration = 0;
+ bool exit_loop = false;
+ while (!exit_loop) {
if (!state.call_stack.empty()) {
auto& top = state.call_stack.back();
if (state.program_counter == top.final_address) {
@@ -47,16 +50,19 @@ void RunInterpreter(UnitState& state) {
}
}
- bool exit_loop = false;
const Instruction instr = { program_code[state.program_counter] };
const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] };
- static auto call = [](UnitState& state, u32 offset, u32 num_instructions,
+ static auto call = [](UnitState<Debug>& state, u32 offset, u32 num_instructions,
u32 return_offset, u8 repeat_count, u8 loop_increment) {
state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
ASSERT(state.call_stack.size() < state.call_stack.capacity());
state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
};
+ Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, state.program_counter);
+ if (iteration > 0)
+ Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, state.program_counter);
+
state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter);
auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
@@ -123,58 +129,78 @@ void RunInterpreter(UnitState& state) {
switch (instr.opcode.Value().EffectiveOpCode()) {
case OpCode::Id::ADD:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = src1[i] + src2[i];
}
-
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
case OpCode::Id::MUL:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = src1[i] * src2[i];
}
-
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
case OpCode::Id::FLR:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32()));
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
case OpCode::Id::MAX:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = std::max(src1[i], src2[i]);
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
case OpCode::Id::MIN:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = std::min(src1[i], src2[i]);
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
case OpCode::Id::DP3:
case OpCode::Id::DP4:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
float24 dot = float24::FromFloat32(0.f);
int num_components = (instr.opcode.Value() == OpCode::Id::DP3) ? 3 : 4;
for (int i = 0; i < num_components; ++i)
@@ -186,12 +212,15 @@ void RunInterpreter(UnitState& state) {
dest[i] = dot;
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
// Reciprocal
case OpCode::Id::RCP:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
@@ -200,13 +229,15 @@ void RunInterpreter(UnitState& state) {
// TODO: I think this might be wrong... we should only use one component here
dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32());
}
-
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
// Reciprocal Square Root
case OpCode::Id::RSQ:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
@@ -215,12 +246,13 @@ void RunInterpreter(UnitState& state) {
// TODO: I think this might be wrong... we should only use one component here
dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32()));
}
-
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
case OpCode::Id::MOVA:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
for (int i = 0; i < 2; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
@@ -228,32 +260,55 @@ void RunInterpreter(UnitState& state) {
// TODO: Figure out how the rounding is done on hardware
state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32());
}
-
+ Record<DebugDataRecord::ADDR_REG_OUT>(state.debug, iteration, state.address_registers);
break;
}
case OpCode::Id::MOV:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = src1[i];
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
+ case OpCode::Id::SGE:
+ case OpCode::Id::SGEI:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+ for (int i = 0; i < 4; ++i) {
+ if (!swizzle.DestComponentEnabled(i))
+ continue;
+
+ dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
+ }
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
+ break;
+
case OpCode::Id::SLT:
case OpCode::Id::SLTI:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
case OpCode::Id::CMP:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
for (int i = 0; i < 2; ++i) {
// TODO: Can you restrict to one compare via dest masking?
@@ -261,27 +316,27 @@ void RunInterpreter(UnitState& state) {
auto op = (i == 0) ? compare_op.x.Value() : compare_op.y.Value();
switch (op) {
- case compare_op.Equal:
+ case Instruction::Common::CompareOpType::Equal:
state.conditional_code[i] = (src1[i] == src2[i]);
break;
- case compare_op.NotEqual:
+ case Instruction::Common::CompareOpType::NotEqual:
state.conditional_code[i] = (src1[i] != src2[i]);
break;
- case compare_op.LessThan:
+ case Instruction::Common::CompareOpType::LessThan:
state.conditional_code[i] = (src1[i] < src2[i]);
break;
- case compare_op.LessEqual:
+ case Instruction::Common::CompareOpType::LessEqual:
state.conditional_code[i] = (src1[i] <= src2[i]);
break;
- case compare_op.GreaterThan:
+ case Instruction::Common::CompareOpType::GreaterThan:
state.conditional_code[i] = (src1[i] > src2[i]);
break;
- case compare_op.GreaterEqual:
+ case Instruction::Common::CompareOpType::GreaterEqual:
state.conditional_code[i] = (src1[i] >= src2[i]);
break;
@@ -290,7 +345,44 @@ void RunInterpreter(UnitState& state) {
break;
}
}
+ Record<DebugDataRecord::CMP_RESULT>(state.debug, iteration, state.conditional_code);
+ break;
+
+ case OpCode::Id::EX2:
+ {
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+
+ // EX2 only takes first component exp2 and writes it to all dest components
+ float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32()));
+ for (int i = 0; i < 4; ++i) {
+ if (!swizzle.DestComponentEnabled(i))
+ continue;
+
+ dest[i] = ex2_res;
+ }
+
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
+ }
+
+ case OpCode::Id::LG2:
+ {
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+
+ // LG2 only takes the first component log2 and writes it to all dest components
+ float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32()));
+ for (int i = 0; i < 4; ++i) {
+ if (!swizzle.DestComponentEnabled(i))
+ continue;
+
+ dest[i] = lg2_res;
+ }
+
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
+ break;
+ }
default:
LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x",
@@ -359,12 +451,17 @@ void RunInterpreter(UnitState& state) {
: (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
: dummy_vec4_float24;
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::SRC3>(state.debug, iteration, src3);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = src1[i] * src2[i] + src3[i];
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
} else {
LOG_ERROR(HW_GPU, "Unhandled multiply-add instruction: 0x%02x (%s): 0x%08x",
(int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex);
@@ -374,7 +471,7 @@ void RunInterpreter(UnitState& state) {
default:
{
- static auto evaluate_condition = [](const UnitState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) {
+ static auto evaluate_condition = [](const UnitState<Debug>& state, bool refx, bool refy, Instruction::FlowControlType flow_control) {
bool results[2] = { refx == state.conditional_code[0],
refy == state.conditional_code[1] };
@@ -400,12 +497,14 @@ void RunInterpreter(UnitState& state) {
break;
case OpCode::Id::JMPC:
+ Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
state.program_counter = instr.flow_control.dest_offset - 1;
}
break;
case OpCode::Id::JMPU:
+ Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
if (uniforms.b[instr.flow_control.bool_uniform_id]) {
state.program_counter = instr.flow_control.dest_offset - 1;
}
@@ -419,6 +518,7 @@ void RunInterpreter(UnitState& state) {
break;
case OpCode::Id::CALLU:
+ Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
if (uniforms.b[instr.flow_control.bool_uniform_id]) {
call(state,
instr.flow_control.dest_offset,
@@ -428,6 +528,7 @@ void RunInterpreter(UnitState& state) {
break;
case OpCode::Id::CALLC:
+ Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
call(state,
instr.flow_control.dest_offset,
@@ -440,6 +541,7 @@ void RunInterpreter(UnitState& state) {
break;
case OpCode::Id::IFU:
+ Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
if (uniforms.b[instr.flow_control.bool_uniform_id]) {
call(state,
state.program_counter + 1,
@@ -458,6 +560,7 @@ void RunInterpreter(UnitState& state) {
{
// TODO: Do we need to consider swizzlers here?
+ Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
call(state,
state.program_counter + 1,
@@ -475,14 +578,19 @@ void RunInterpreter(UnitState& state) {
case OpCode::Id::LOOP:
{
- state.address_registers[2] = uniforms.i[instr.flow_control.int_uniform_id].y;
+ Math::Vec4<u8> loop_param(uniforms.i[instr.flow_control.int_uniform_id].x,
+ uniforms.i[instr.flow_control.int_uniform_id].y,
+ uniforms.i[instr.flow_control.int_uniform_id].z,
+ uniforms.i[instr.flow_control.int_uniform_id].w);
+ state.address_registers[2] = loop_param.y;
+ Record<DebugDataRecord::LOOP_INT_IN>(state.debug, iteration, loop_param);
call(state,
state.program_counter + 1,
instr.flow_control.dest_offset - state.program_counter + 1,
instr.flow_control.dest_offset + 1,
- uniforms.i[instr.flow_control.int_uniform_id].x,
- uniforms.i[instr.flow_control.int_uniform_id].z);
+ loop_param.x,
+ loop_param.z);
break;
}
@@ -497,12 +605,14 @@ void RunInterpreter(UnitState& state) {
}
++state.program_counter;
-
- if (exit_loop)
- break;
+ ++iteration;
}
}
+// Explicit instantiation
+template void RunInterpreter(UnitState<false>& state);
+template void RunInterpreter(UnitState<true>& state);
+
} // namespace
} // namespace
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index ad6e58e39..71bcad5ac 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -12,7 +12,8 @@ namespace Pica {
namespace Shader {
-void RunInterpreter(UnitState& state);
+template<bool Debug>
+void RunInterpreter(UnitState<Debug>& state);
} // namespace
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index ce47774d5..a1bdd8456 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -25,12 +25,12 @@ const JitFunction instr_table[64] = {
&JitCompiler::Compile_DP4, // dp4
nullptr, // dph
nullptr, // unknown
- nullptr, // ex2
- nullptr, // lg2
+ &JitCompiler::Compile_EX2, // ex2
+ &JitCompiler::Compile_LG2, // lg2
nullptr, // unknown
&JitCompiler::Compile_MUL, // mul
- nullptr, // lge
- nullptr, // slt
+ &JitCompiler::Compile_SGE, // sge
+ &JitCompiler::Compile_SLT, // slt
&JitCompiler::Compile_FLR, // flr
&JitCompiler::Compile_MAX, // max
&JitCompiler::Compile_MIN, // min
@@ -46,8 +46,8 @@ const JitFunction instr_table[64] = {
nullptr, // unknown
nullptr, // dphi
nullptr, // unknown
- nullptr, // sgei
- &JitCompiler::Compile_SLTI, // slti
+ &JitCompiler::Compile_SGE, // sgei
+ &JitCompiler::Compile_SLT, // slti
nullptr, // unknown
nullptr, // unknown
nullptr, // unknown
@@ -141,7 +141,7 @@ void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, Source
src_offset = src_reg.GetIndex() * sizeof(float24) * 4;
} else {
src_ptr = REGISTERS;
- src_offset = UnitState::InputOffset(src_reg);
+ src_offset = UnitState<false>::InputOffset(src_reg);
}
unsigned operand_desc_id;
@@ -217,11 +217,11 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
// If all components are enabled, write the result to the destination register
if (swiz.dest_mask == NO_DEST_REG_MASK) {
// Store dest back to memory
- MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), src);
+ MOVAPS(MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)), src);
} else {
// Not all components are enabled, so mask the result when storing to the destination register...
- MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState::OutputOffset(dest)));
+ MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)));
if (Common::GetCPUCaps().sse4_1) {
u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
@@ -240,7 +240,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
}
// Store dest back to memory
- MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), SCRATCH);
+ MOVAPS(MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)), SCRATCH);
}
}
@@ -280,6 +280,22 @@ void JitCompiler::Compile_UniformCondition(Instruction instr) {
CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
}
+// Emits code that reserves 2 * 16 bytes on the stack and stores the ONE and NEGBIT registers
+// into that area (ONE at RSP+16, NEGBIT at RSP+0). Compile_PopCallerSavedXMM reloads them from
+// the same slots. Nothing is emitted when _WIN32 is defined.
+// NOTE(review): presumably skipped on Windows because these registers are callee-saved under
+// the Win64 calling convention - confirm against the register assignment of ONE/NEGBIT.
+void JitCompiler::Compile_PushCallerSavedXMM() {
+#ifndef _WIN32
+ SUB(64, R(RSP), Imm8(2 * 16));
+ MOVUPS(MDisp(RSP, 16), ONE);
+ MOVUPS(MDisp(RSP, 0), NEGBIT);
+#endif
+}
+
+// Emits code that reloads NEGBIT (from RSP+0) and ONE (from RSP+16) and then releases the
+// 2 * 16 bytes of stack, mirroring the layout written by Compile_PushCallerSavedXMM.
+// Nothing is emitted when _WIN32 is defined.
+void JitCompiler::Compile_PopCallerSavedXMM() {
+#ifndef _WIN32
+ MOVUPS(NEGBIT, MDisp(RSP, 0));
+ MOVUPS(ONE, MDisp(RSP, 16));
+ ADD(64, R(RSP), Imm8(2 * 16));
+#endif
+}
+
void JitCompiler::Compile_ADD(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
@@ -331,6 +347,38 @@ void JitCompiler::Compile_DP4(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
+// Compiles EX2: swizzles src1 into SRC1, moves its lowest lane into XMM0, emits a call to exp2f,
+// then broadcasts lane 0 of XMM0 to all four lanes and writes the result through
+// Compile_DestEnable.
+// NOTE(review): relies on exp2f taking its argument and returning its result in XMM0, and on
+// the pop helpers not overwriting XMM0 - confirm for every supported ABI.
+void JitCompiler::Compile_EX2(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ MOVSS(XMM0, R(SRC1));
+
+ // The following will actually break the stack alignment
+ ABI_PushAllCallerSavedRegsAndAdjustStack();
+ Compile_PushCallerSavedXMM();
+ ABI_CallFunction(reinterpret_cast<const void*>(exp2f));
+ Compile_PopCallerSavedXMM();
+ ABI_PopAllCallerSavedRegsAndAdjustStack();
+
+ // Replicate the scalar result into every component before masking it into dest
+ SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
+ MOVAPS(SRC1, R(XMM0));
+ Compile_DestEnable(instr, SRC1);
+}
+
+// Compiles LG2: swizzles src1 into SRC1, moves its lowest lane into XMM0, emits a call to log2f,
+// then broadcasts lane 0 of XMM0 to all four lanes and writes the result through
+// Compile_DestEnable.
+// NOTE(review): relies on log2f taking its argument and returning its result in XMM0, and on
+// the pop helpers not overwriting XMM0 - confirm for every supported ABI.
+void JitCompiler::Compile_LG2(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ MOVSS(XMM0, R(SRC1));
+
+ // The following will actually break the stack alignment
+ ABI_PushAllCallerSavedRegsAndAdjustStack();
+ Compile_PushCallerSavedXMM();
+ ABI_CallFunction(reinterpret_cast<const void*>(log2f));
+ Compile_PopCallerSavedXMM();
+ ABI_PopAllCallerSavedRegsAndAdjustStack();
+
+ // Replicate the scalar result into every component before masking it into dest
+ SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
+ MOVAPS(SRC1, R(XMM0));
+ Compile_DestEnable(instr, SRC1);
+}
+
void JitCompiler::Compile_MUL(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
@@ -338,6 +386,36 @@ void JitCompiler::Compile_MUL(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
+/**
+ * Compiles SGE/SGEI: every enabled destination component becomes 1.0 when src1 >= src2 and 0.0
+ * otherwise. SGEI reads its operands from the src1i/src2i encoding of the instruction.
+ */
+void JitCompiler::Compile_SGE(Instruction instr) {
+    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
+        Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+        Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+    } else {
+        Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+        Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    }
+
+    // Evaluate src1 >= src2 as src2 <= src1. The "not less than" predicate (CMP_NLT) is also
+    // true when either operand is NaN, whereas the interpreter's `src1[i] >= src2[i]` yields
+    // 0.0 in that case; the ordered CMP_LE predicate keeps both backends in agreement.
+    CMPPS(SRC2, R(SRC1), CMP_LE);
+    // Turn the all-ones comparison mask into 1.0f per lane
+    ANDPS(SRC2, R(ONE));
+
+    Compile_DestEnable(instr, SRC2);
+}
+
+// Compiles SLT/SLTI: every enabled destination component becomes 1.0 when src1 < src2 and 0.0
+// otherwise. SLTI reads its operands from the src1i/src2i encoding of the instruction.
+void JitCompiler::Compile_SLT(Instruction instr) {
+    const bool is_slti = (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI);
+
+    if (!is_slti) {
+        Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+        Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    } else {
+        Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+        Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+    }
+
+    // Packed compare yields an all-ones mask per lane; masking with ONE converts it to 1.0f
+    CMPPS(SRC1, R(SRC2), CMP_LT);
+    ANDPS(SRC1, R(ONE));
+
+    Compile_DestEnable(instr, SRC1);
+}
+
void JitCompiler::Compile_FLR(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
@@ -415,16 +493,6 @@ void JitCompiler::Compile_MOV(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_SLTI(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
- Compile_SwizzleSrc(instr, 1, instr.common.src2i, SRC2);
-
- CMPSS(SRC1, R(SRC2), CMP_LT);
- ANDPS(SRC1, R(ONE));
-
- Compile_DestEnable(instr, SRC1);
-}
-
void JitCompiler::Compile_RCP(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index b88f2a0d2..b2aa5293c 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -37,7 +37,11 @@ public:
void Compile_ADD(Instruction instr);
void Compile_DP3(Instruction instr);
void Compile_DP4(Instruction instr);
+ void Compile_EX2(Instruction instr);
+ void Compile_LG2(Instruction instr);
void Compile_MUL(Instruction instr);
+ void Compile_SGE(Instruction instr);
+ void Compile_SLT(Instruction instr);
void Compile_FLR(Instruction instr);
void Compile_MAX(Instruction instr);
void Compile_MIN(Instruction instr);
@@ -45,7 +49,6 @@ public:
void Compile_RSQ(Instruction instr);
void Compile_MOVA(Instruction instr);
void Compile_MOV(Instruction instr);
- void Compile_SLTI(Instruction instr);
void Compile_NOP(Instruction instr);
void Compile_END(Instruction instr);
void Compile_CALL(Instruction instr);
@@ -67,6 +70,9 @@ private:
void Compile_EvaluateCondition(Instruction instr);
void Compile_UniformCondition(Instruction instr);
+ void Compile_PushCallerSavedXMM();
+ void Compile_PopCallerSavedXMM();
+
/// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
unsigned* offset_ptr = nullptr;