From 625c30d8bdee4f60c8437dc6004368f685435775 Mon Sep 17 00:00:00 2001 From: faketruth Date: Tue, 25 Oct 2011 18:57:38 +0000 Subject: Using SSE instructions for noise (terrain generation) Unfortunately the noise functions are only about 7% faster, so you won't even notice git-svn-id: http://mc-server.googlecode.com/svn/trunk@9 0a769ca7-a7f5-676a-18bf-c427514a06d6 --- source/cChunk.cpp | 18 +++++++------- source/cNoise.cpp | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ source/cNoise.h | 5 ++++ 3 files changed, 84 insertions(+), 9 deletions(-) (limited to 'source') diff --git a/source/cChunk.cpp b/source/cChunk.cpp index a8e8938a0..8e03ff6c7 100644 --- a/source/cChunk.cpp +++ b/source/cChunk.cpp @@ -678,13 +678,13 @@ void cChunk::SpreadLight(char* a_LightBuffer) float GetNoise( float x, float y, cNoise & a_Noise ) { - float oct1 = a_Noise.CubicNoise2D( x*cGenSettings::HeightFreq1, y*cGenSettings::HeightFreq1 )*cGenSettings::HeightAmp1; - float oct2 = a_Noise.CubicNoise2D( x*cGenSettings::HeightFreq2, y*cGenSettings::HeightFreq2 )*cGenSettings::HeightAmp2; - float oct3 = a_Noise.CubicNoise2D( x*cGenSettings::HeightFreq3, y*cGenSettings::HeightFreq3 )*cGenSettings::HeightAmp3; + float oct1 = a_Noise.SSE_CubicNoise2D( x*cGenSettings::HeightFreq1, y*cGenSettings::HeightFreq1 )*cGenSettings::HeightAmp1; + float oct2 = a_Noise.SSE_CubicNoise2D( x*cGenSettings::HeightFreq2, y*cGenSettings::HeightFreq2 )*cGenSettings::HeightAmp2; + float oct3 = a_Noise.SSE_CubicNoise2D( x*cGenSettings::HeightFreq3, y*cGenSettings::HeightFreq3 )*cGenSettings::HeightAmp3; - float height = a_Noise.CubicNoise2D( x*0.1f, y*0.1f )*2; + float height = a_Noise.SSE_CubicNoise2D( x*0.1f, y*0.1f )*2; - float flatness = ((a_Noise.CubicNoise2D( x*0.5f, y*0.5f ) + 1.f ) * 0.5f) * 1.1f; // 0 ... 1.5 + float flatness = ((a_Noise.SSE_CubicNoise2D( x*0.5f, y*0.5f ) + 1.f ) * 0.5f) * 1.1f; // 0 ... 
1.5 flatness *= flatness * flatness; return (oct1 + oct2 + oct3) * flatness + height; @@ -836,8 +836,8 @@ void cChunk::GenerateTerrain() // int yy = TopY; int zz = z + m_PosZ*16; - float val1 = m_Noise.CubicNoise2D( xx*0.1f, zz*0.1f ); - float val2 = m_Noise.CubicNoise2D( xx*0.01f, zz*0.01f ); + float val1 = m_Noise.SSE_CubicNoise2D( xx*0.1f, zz*0.1f ); + float val2 = m_Noise.SSE_CubicNoise2D( xx*0.01f, zz*0.01f ); if( m_BlockType[index] == SandID ) { if( (val1 + val2 > 0.f) && (rand()%128) > 124 && m_BlockType[index] == E_BLOCK_SAND ) @@ -852,8 +852,8 @@ void cChunk::GenerateTerrain() } else if( m_BlockType[index] == GrassID ) { - float val3 = m_Noise.CubicNoise2D( xx*0.01f+10, zz*0.01f+10 ); - float val4 = m_Noise.CubicNoise2D( xx*0.05f+20, zz*0.05f+20 ); + float val3 = m_Noise.SSE_CubicNoise2D( xx*0.01f+10, zz*0.01f+10 ); + float val4 = m_Noise.SSE_CubicNoise2D( xx*0.05f+20, zz*0.05f+20 ); if( val1 + val2 > 0.2f && (rand()%128) > 124 ) cRoot::Get()->GetWorld()->GrowTree( xx, TopY, zz ); else if( val3 > 0.2f && (rand()%128) > 124 ) diff --git a/source/cNoise.cpp b/source/cNoise.cpp index 1d321c089..1bdc52698 100644 --- a/source/cNoise.cpp +++ b/source/cNoise.cpp @@ -1,6 +1,8 @@ #include "cNoise.h" #include +#include //_mm_mullo_epi32 + #define FAST_FLOOR( x ) ( (x) < 0 ? 
((int)x)-1 : ((int)x) ) cNoise::cNoise( unsigned int a_Seed ) @@ -36,6 +38,41 @@ float cNoise::IntNoise3D( int a_X, int a_Y, int a_Z ) const return ( 1.0f - ( (n * (n * n * 15731 + 789221) + 1376312589) & 0x7fffffff) / 1073741824.0f); } +/**************** + * SSE Random value generator + **/ +__m128 cNoise::SSE_IntNoise2D( int a_X1, int a_Y1, int a_X2, int a_Y2, int a_X3, int a_Y3, int a_X4, int a_Y4 ) const +{ + const __m128i X4 = _mm_set_epi32(a_X4, a_X3, a_X2, a_X1); + const __m128i Y4 = _mm_set_epi32(a_Y4, a_Y3, a_Y2, a_Y1); + + const __m128 One4 = _mm_set_ps1( 1.f ); + const __m128i YScale4 = _mm_set1_epi32( 57 ); + + const __m128i i15731 = _mm_set1_epi32( 15731 ); + const __m128i i789221 = _mm_set1_epi32( 789221 ); + const __m128i i1376312589 = _mm_set1_epi32(1376312589); + const __m128i MaskValue4 = _mm_set1_epi32(0x7fffffff); + const __m128 f1073741824 = _mm_set_ps1( 1073741824.0f ); + + const __m128i Seed4 = _mm_mullo_epi32( _mm_mullo_epi32( _mm_set1_epi32( m_Seed ), YScale4 ), YScale4 ); + + const __m128i ScaledY4 = _mm_mullo_epi32( Y4, YScale4 ); + const __m128i n4 = _mm_add_epi32( _mm_add_epi32( X4, ScaledY4 ), Seed4 ); + const __m128i nn4 = _mm_slli_epi32( n4, 13 ); + const __m128i nnn4 = _mm_xor_si128( nn4, n4 ); + + const __m128i StepA4 = _mm_mullo_epi32( nnn4, nnn4 ); + const __m128i StepAA4 = _mm_add_epi32( _mm_mullo_epi32( StepA4, i15731 ), i789221 ); + const __m128i StepB4 = _mm_add_epi32( _mm_mullo_epi32( nnn4, StepAA4 ), i1376312589 ); + const __m128i StepC4 = _mm_and_si128( StepB4, MaskValue4 ); + const __m128 StepD4 = _mm_div_ps( _mm_cvtepi32_ps( StepC4 ), f1073741824 ); + const __m128 Result4 = _mm_sub_ps( One4, StepD4 ); + + return Result4; +} + + /*************** * Interpolated (and 1 smoothed) noise in 1-dimension **/ @@ -129,6 +166,26 @@ float cNoise::CubicNoise2D( float a_X, float a_Y ) const return CubicInterpolate( interp1, interp2, interp3, interp4, FracY ); } +float cNoise::SSE_CubicNoise2D( float a_X, float a_Y ) const +{ + const 
int BaseX = FAST_FLOOR( a_X ); + const int BaseY = FAST_FLOOR( a_Y ); + + __m128 points4[4] = { + SSE_IntNoise2D( BaseX-1, BaseY-1, BaseX-1, BaseY, BaseX-1, BaseY+1, BaseX-1, BaseY+2 ), + SSE_IntNoise2D( BaseX, BaseY-1, BaseX, BaseY, BaseX, BaseY+1, BaseX, BaseY+2 ), + SSE_IntNoise2D( BaseX+1, BaseY-1, BaseX+1, BaseY, BaseX+1, BaseY+1, BaseX+1, BaseY+2 ), + SSE_IntNoise2D( BaseX+2, BaseY-1, BaseX+2, BaseY, BaseX+2, BaseY+1, BaseX+2, BaseY+2 ), + }; + + const float FracX = (a_X) - BaseX; + union { __m128 p4; float p[4]; } + AllInterp = { CubicInterpolate4( points4[0], points4[1], points4[2], points4[3], FracX ) }; + + const float FracY = (a_Y) - BaseY; + return CubicInterpolate( AllInterp.p[0], AllInterp.p[1], AllInterp.p[2], AllInterp.p[3], FracY ); +} + /****************** * Interpolated (and 1 smoothed) noise in 3-dimensions **/ @@ -240,6 +297,19 @@ float cNoise::CubicInterpolate( float a_A, float a_B, float a_C, float a_D, floa return P*(a_Pct*a_Pct*a_Pct) + Q*(a_Pct*a_Pct) + R*a_Pct + S; } +__m128 cNoise::CubicInterpolate4( const __m128 & a_A, const __m128 & a_B, const __m128 & a_C, const __m128 & a_D, float a_Pct ) const +{ + const __m128 P = _mm_sub_ps( _mm_sub_ps( a_D, a_C ), _mm_sub_ps( a_A, a_B ) ); + const __m128 Q = _mm_sub_ps( _mm_sub_ps( a_A, a_B ), P ); + const __m128 R = _mm_sub_ps( a_C, a_A ); + + const __m128 Pct = _mm_set_ps1( a_Pct ); + const __m128 Pct2 = _mm_mul_ps( Pct, Pct ); + const __m128 Pct3 = _mm_mul_ps( Pct2, Pct ); + + return _mm_add_ps( _mm_add_ps( _mm_add_ps( _mm_mul_ps(P, Pct3), _mm_mul_ps( Q, Pct2 ) ), _mm_mul_ps( R, Pct ) ), a_B ); +} + float cNoise::CosineInterpolate( float a_A, float a_B, float a_Pct ) const { const float ft = a_Pct * 3.1415927f; diff --git a/source/cNoise.h b/source/cNoise.h index 538970da3..9511ab6e4 100644 --- a/source/cNoise.h +++ b/source/cNoise.h @@ -1,5 +1,7 @@ #pragma once +#include + class cNoise { public: @@ -8,6 +10,7 @@ public: float IntNoise( int a_X ) const; float IntNoise2D( int a_X, int a_Y ) 
const; + __m128 SSE_IntNoise2D( int a_X1, int a_Y1, int a_X2, int a_Y2, int a_X3, int a_Y3, int a_X4, int a_Y4 ) const; float IntNoise3D( int a_X, int a_Y, int a_Z ) const; float LinearNoise1D( float a_X ) const; @@ -18,6 +21,7 @@ public: float LinearNoise2D( float a_X, float a_Y ) const; float CosineNoise2D( float a_X, float a_Y ) const; float CubicNoise2D( float a_X, float a_Y ) const; + float SSE_CubicNoise2D( float a_X, float a_Y ) const; float CosineNoise3D( float a_X, float a_Y, float a_Z ) const; float CubicNoise3D( float a_X, float a_Y, float a_Z ) const; @@ -25,6 +29,7 @@ public: void SetSeed( unsigned int a_Seed ) { m_Seed = a_Seed; } private: float CubicInterpolate( float a_A, float a_B, float a_C, float a_D, float a_Pct ) const; + __m128 CubicInterpolate4( const __m128 & a_A, const __m128 & a_B, const __m128 & a_C, const __m128 & a_D, float a_Pct ) const; float CosineInterpolate( float a_A, float a_B, float a_Pct ) const; float LinearInterpolate( float a_A, float a_B, float a_Pct ) const; -- cgit v1.2.3