How to get better performance out of Lua when iterating over an array of structs

174 views Asked by At

Problem: Iterating over an array of structs in Lua and manipulating the data, which is then used in C++ later on.

Backstory: I did some performance testing this week and I am kind of disappointed by the performance of Lua. My struggle began while integrating Lua as a scripting interface into my game engine. I started using luabridge for simplicity but quickly switched to sol2 because of some convenience features. Then I measured the performance for the first time and was quite shocked by how bad it was.

Testcase: I extracted a standalone testcase (see Code:) to compare native C++ performance to sol2 performance. Still the same bad results. Then I also added another testcase that uses plain Lua and just Light Userdata to do the same thing. Performance is a little better but far from good as shown below.

Timings:

C++                  elapsed time: 0.002736s
Sol (Container)      elapsed time: 0.999166s
Lua (Light Userdata) elapsed time: 0.338946s

Question: Is this something to expect, or is there any chance to get close to native C++ performance for a use case like this?

Info:

  • LuaJit (latest master branch)
  • sol2 (latest master branch)
  • Compiler: MSVC19
  • OS: Windows 11

Code:

#define SOL_ALL_SAFETIES_ON  0
#define SOL_USING_CXX_LUAJIT 1
#include <sol/sol.hpp>
#include <chrono>


// Plain data record used as the benchmark payload: three position floats
// followed by three scale floats.
//
// The LuaJIT FFI test in this file declares a struct with the same fields in
// the same order via ffi.cdef and casts the array pointer to it, and the sol2
// test binds each member by name, so field names, order and types need to stay
// in sync with those declarations.
struct Transform
{
    float position_x;
    float position_y;
    float position_z;
    float scale_x;
    float scale_y;
    float scale_z;
};

// Base address of the transform array shared by every perf test in this file.
// Null until main() points it at the benchmark storage.
Transform* p_transforms = nullptr;


std::vector<Transform*> GetTransformPointerArray( int32_t count )
{
    std::vector<Transform*> transform_pointers( count );
    for( int i = 0; i < transform_pointers.size(); ++i )
        transform_pointers[ i ] = &p_transforms[ i ];

    return transform_pointers;
}


void c_Update( int32_t count )
{
    for( int i = 0; i < count; ++i )
    {
        Transform* p_transform = &p_transforms[ i ];

        p_transform->position_x += 0.01f;
        p_transform->scale_x += 0.01f;
    }
}


// Runs c_Update( count ) `iterations` times and prints the elapsed time in
// seconds, measured with std::chrono::high_resolution_clock.
void c_perf_test( int32_t iterations, int32_t count )
{
    using bench_clock = std::chrono::high_resolution_clock;

    const bench_clock::time_point t_begin = bench_clock::now();

    int32_t pass = 0;
    while( pass < iterations )
    {
        c_Update( count );
        ++pass;
    }

    const bench_clock::time_point t_end = bench_clock::now();

    // duration<double> with the default period expresses the span in seconds.
    const double seconds = std::chrono::duration<double>( t_end - t_begin ).count();

    printf( "C++                  elapsed time: %fs\n", seconds );
}


void sol_perf_test( int32_t iterations, int32_t count )
{
    sol::state lua;
    lua.open_libraries();

    lua.new_usertype<Transform>( "Transform",
                                 "position_x", &Transform::position_x,
                                 "position_y", &Transform::position_y,
                                 "position_z", &Transform::position_z,
                                 "scale_x", &Transform::scale_x,
                                 "scale_y", &Transform::scale_y,
                                 "scale_z", &Transform::scale_z );

    lua.script( R"(
        function Update( transforms )
            for i = 1, #transforms, 1 do
                local transform = transforms[i]

                local position_x = transform.position_x
                local scale_x    = transform.scale_x

                position_x = position_x + 0.01
                scale_x    = scale_x + 0.01

                transform.position_x = position_x
                transform.scale_x    = scale_x
            end
        end
    )" );

    sol::function update_func = lua[ "Update" ];

    std::vector<Transform*> transform_pointers = GetTransformPointerArray( count );

    auto start = std::chrono::high_resolution_clock::now();

    for( int i = 0; i < iterations; ++i )
        update_func( transform_pointers );

    auto                          end             = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed_seconds = end - start;
    double                        elapsed         = elapsed_seconds.count();

    printf( "Sol (Container)      elapsed time: %fs\n", elapsed );
}


// Lua C function: pushes the global `p_transforms` array base as a light
// userdata and returns it as the single result.
static int get_light_transform_array( lua_State* L )
{
    void* const array_base = p_transforms;
    lua_pushlightuserdata( L, array_base );
    return 1;
}


// Lua C function: given an array base (argument 2, light userdata) and a
// 1-based index (argument 3), pushes the address of that element as a light
// userdata. Argument 1 is skipped; the scripts in this file call the library
// with ':' so that slot holds the library table.
static int get_light_transform( lua_State* L )
{
    Transform* const array_base = static_cast<Transform*>( lua_touserdata( L, 2 ) );
    const int        one_based  = luaL_checkint( L, 3 );

    // Lua indices start at 1, the C array at 0.
    Transform* const element = array_base + ( one_based - 1 );

    lua_pushlightuserdata( L, element );
    return 1;
}


// Lua C function: reads position_x from the Transform passed as light
// userdata in argument 2 and returns it as a Lua number.
static int get_position_x( lua_State* L )
{
    const Transform* const transform = static_cast<const Transform*>( lua_touserdata( L, 2 ) );
    lua_pushnumber( L, transform->position_x );
    return 1;
}


// Lua C function: stores the number in argument 3 into position_x of the
// Transform passed as light userdata in argument 2. Returns nothing to Lua.
static int set_position_x( lua_State* L )
{
    Transform* const transform = static_cast<Transform*>( lua_touserdata( L, 2 ) );
    const auto       value     = lua_tonumber( L, 3 );

    // The field is a float, so the Lua number is narrowed on store.
    transform->position_x = static_cast<float>( value );
    return 0;
}


// Lua C function: reads scale_x from the Transform passed as light userdata
// in argument 2 and returns it as a Lua number.
static int get_scale_x( lua_State* L )
{
    const Transform* const transform = static_cast<const Transform*>( lua_touserdata( L, 2 ) );
    lua_pushnumber( L, transform->scale_x );
    return 1;
}


// Lua C function: stores the number in argument 3 into scale_x of the
// Transform passed as light userdata in argument 2. Returns nothing to Lua.
static int set_scale_x( lua_State* L )
{
    Transform* const transform = static_cast<Transform*>( lua_touserdata( L, 2 ) );
    const auto       value     = lua_tonumber( L, 3 );

    // The field is a float, so the Lua number is narrowed on store.
    transform->scale_x = static_cast<float>( value );
    return 0;
}


// Registers the C accessor functions above as the global Lua table
// "Transform" in the given state.
//
// L  Lua state to register the library in. The Lua stack is left exactly as
//    it was found.
static void create_transform_library( lua_State* L )
{
    static const luaL_Reg transform_library[] = {
        { "GetLightTransformArray", get_light_transform_array },
        { "GetLightTransform",      get_light_transform },
        { "GetPositionX",           get_position_x },
        { "SetPositionX",           set_position_x },
        { "GetScaleX",              get_scale_x },
        { "SetScaleX",              set_scale_x },
        { nullptr,                  nullptr }  // end-of-list sentinel
    };

    // luaL_register is the documented Lua 5.1 spelling of
    // luaL_openlib( L, name, list, 0 ).
    luaL_register( L, "Transform", transform_library );

    // Registration leaves the library table on top of the stack; this helper
    // returns nothing, so drop it to keep the caller's stack balanced.
    lua_pop( L, 1 );
}


void lightuserdata_perf_test( int32_t iterations, int32_t count )
{
    lua_State* p_lua = luaL_newstate();
    luaL_openlibs( p_lua );

    create_transform_library( p_lua );

    int status = luaL_dostring( p_lua, R"(
        function Update( count )
            local transforms = Transform:GetLightTransformArray()

            for i = 1, count, 1 do
                local light_transform = Transform:GetLightTransform( transforms, i )
                local position_x      = Transform:GetPositionX( light_transform )
                local scale_x         = Transform:GetScaleX( light_transform )

                position_x = position_x + 0.01
                scale_x    = scale_x + 0.01

                Transform:SetPositionX( light_transform, position_x )
                Transform:SetScaleX( light_transform, scale_x )
            end
        end
    )" );

    if( status != 0 )
    {
        printf( "Error: %s\n", lua_tostring( p_lua, -1 ) );
        return;
    }

    auto start = std::chrono::high_resolution_clock::now();

    for( int i = 0; i < iterations; ++i )
    {
        lua_getglobal( p_lua, "Update" );
        lua_pushinteger( p_lua, count );
        lua_pcall( p_lua, 1, 0, 0 );
    }

    auto                          end             = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed_seconds = end - start;
    double                        elapsed         = elapsed_seconds.count();

    printf( "Lua (Light Userdata) elapsed time: %fs\n", elapsed );

    lua_close( p_lua );
}


int main( int argc, char* argv[] )
{
    int32_t iterations = 1000;
    int32_t count      = 5000;

    p_transforms = new Transform[ count ];
    memset( p_transforms, 0, sizeof( Transform ) * count );

    c_perf_test( iterations, count );
    sol_perf_test( iterations, count );
    lightuserdata_perf_test( iterations, count );

    delete[] p_transforms;
    return 0;
}
1

There are 1 answers

0
thegabman On

So, a few more hours in, I found out about ffi.cast and wrote another testcase that combines Light Userdata and ffi. Timings are very close to native C++ now. For very performance-critical stuff this could work for me, but I fear I am giving up on safety. Maybe somebody has another idea on how to improve performance without using ffi.

Timings:

C++                  elapsed time: 0.001683s
Sol (Container)      elapsed time: 1.020745s
Lua (Light Userdata) elapsed time: 0.337135s
LuaJit ffi           elapsed time: 0.004741s

Code:

// Benchmarks the LuaJIT FFI path: the transform array is fetched as light
// userdata, cast to `Transform*` with ffi.cast, and its fields are read and
// written directly from Lua. Prints the time for `iterations` calls of the
// Lua `Update( count )` function in seconds.
//
// On any Lua error the message is printed and no timing is reported.
void ffi_perf_test( int32_t iterations, int32_t count )
{
    sol::state lua;
    lua.open_libraries( sol::lib::base, sol::lib::package, sol::lib::jit, sol::lib::ffi );
    lua_State* p_lua = lua.lua_state();

    create_transform_library( p_lua );

    // The ffi.cdef struct below has to match the C++ Transform struct, since
    // the script reinterprets the raw array pointer as that type.
    int status = luaL_dostring( p_lua, R"(
        local ffi = require( "ffi" )

        ffi.cdef[[
            typedef struct Transform
            {
                float position_x;
                float position_y;
                float position_z;
                float scale_x;
                float scale_y;
                float scale_z;
            } Transform;
        ]]

        function Update( count )
            local transforms     = Transform:GetLightTransformArray()
            local ffi_transforms = ffi.cast( "Transform*", transforms )

            for i = 0, count-1, 1 do
                local position_x = ffi_transforms[i].position_x
                local scale_x    = ffi_transforms[i].scale_x

                position_x = position_x + 0.01
                scale_x    = scale_x + 0.01

                ffi_transforms[i].position_x = position_x
                ffi_transforms[i].scale_x    = scale_x
            end
        end
    )" );

    if( status != 0 )
    {
        // lua_tostring yields NULL when the error object is not a string.
        const char* message = lua_tostring( p_lua, -1 );
        printf( "Error: %s\n", message ? message : "(error object is not a string)" );
        return;
    }

    auto start = std::chrono::high_resolution_clock::now();

    for( int i = 0; i < iterations; ++i )
    {
        lua_getglobal( p_lua, "Update" );
        lua_pushinteger( p_lua, count );

        // A failed call leaves its error object on the stack; stop right away
        // instead of piling up errors and printing a meaningless timing.
        if( lua_pcall( p_lua, 1, 0, 0 ) != 0 )
        {
            const char* message = lua_tostring( p_lua, -1 );
            printf( "Error: %s\n", message ? message : "(error object is not a string)" );
            return;
        }
    }

    auto                          end             = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed_seconds = end - start;
    double                        elapsed         = elapsed_seconds.count();

    printf( "LuaJit ffi           elapsed time: %fs\n", elapsed );
}