Problem: Iterating over an array of structs in Lua and manipulating the data, which is then used in C++ later on.
Backstory: I did some performance testing this week and I am kind of disappointed by the performance of Lua. My struggle began while integrating Lua as a scripting interface into my game engine. I started out using luabridge for simplicity but quickly switched to sol2 because of some convenience features. Then I measured the performance for the first time and was quite shocked by how bad it was.
Testcase: I extracted a standalone testcase (see Code:) to compare native C++ performance to sol2 performance. Still the same bad results. Then I also added another testcase that uses plain Lua and just Light Userdata to do the same thing. Performance is a little better but far from good as shown below.
Timings:
C++ elapsed time: 0.002736s
Sol (Container) elapsed time: 0.999166s
Lua (Light Userdata) elapsed time: 0.338946s
Question: Is this to be expected, or is there any chance of getting close to native C++ performance for a use case like this?
Info:
- LuaJit (latest master branch)
- sol2 (latest master branch)
- Compiler: MSVC19
- OS: Windows 11
Code:
#define SOL_ALL_SAFETIES_ON 0
#define SOL_USING_CXX_LUAJIT 1
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>
#include <sol/sol.hpp>
/// Plain aggregate of six floats: three position components followed by
/// three scale components. The benchmarks in this file only modify
/// position_x and scale_x.
struct Transform
{
    // Position components.
    float position_x;
    float position_y;
    float position_z;

    // Scale components.
    float scale_x;
    float scale_y;
    float scale_z;
};
// Global pointer to the transform array that every benchmark reads and
// writes. Null until main() points it at the test data.
Transform* p_transforms = nullptr;
std::vector<Transform*> GetTransformPointerArray( int32_t count )
{
std::vector<Transform*> transform_pointers( count );
for( int i = 0; i < transform_pointers.size(); ++i )
transform_pointers[ i ] = &p_transforms[ i ];
return transform_pointers;
}
void c_Update( int32_t count )
{
for( int i = 0; i < count; ++i )
{
Transform* p_transform = &p_transforms[ i ];
p_transform->position_x += 0.01f;
p_transform->scale_x += 0.01f;
}
}
/// Times `iterations` consecutive calls of c_Update( count ) and prints the
/// elapsed wall-clock time in seconds.
///
/// @param iterations Number of times c_Update is invoked.
/// @param count      Forwarded unchanged to c_Update.
void c_perf_test( int32_t iterations, int32_t count )
{
    using clock = std::chrono::high_resolution_clock;

    const clock::time_point begin = clock::now();
    for( int32_t pass = 0; pass < iterations; ++pass )
    {
        c_Update( count );
    }
    const clock::time_point finish = clock::now();

    const double seconds = std::chrono::duration<double>( finish - begin ).count();
    printf( "C++ elapsed time: %fs\n", seconds );
}
void sol_perf_test( int32_t iterations, int32_t count )
{
sol::state lua;
lua.open_libraries();
lua.new_usertype<Transform>( "Transform",
"position_x", &Transform::position_x,
"position_y", &Transform::position_y,
"position_z", &Transform::position_z,
"scale_x", &Transform::scale_x,
"scale_y", &Transform::scale_y,
"scale_z", &Transform::scale_z );
lua.script( R"(
function Update( transforms )
for i = 1, #transforms, 1 do
local transform = transforms[i]
local position_x = transform.position_x
local scale_x = transform.scale_x
position_x = position_x + 0.01
scale_x = scale_x + 0.01
transform.position_x = position_x
transform.scale_x = scale_x
end
end
)" );
sol::function update_func = lua[ "Update" ];
std::vector<Transform*> transform_pointers = GetTransformPointerArray( count );
auto start = std::chrono::high_resolution_clock::now();
for( int i = 0; i < iterations; ++i )
update_func( transform_pointers );
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_seconds = end - start;
double elapsed = elapsed_seconds.count();
printf( "Sol (Container) elapsed time: %fs\n", elapsed );
}
/// Lua C function: pushes the global p_transforms pointer as a light
/// userdata.
///
/// @return 1, the number of values left on the Lua stack for the caller.
static int get_light_transform_array( lua_State* L )
{
    void* const p_array = p_transforms;
    lua_pushlightuserdata( L, p_array );
    return 1;
}
/// Lua C function: given a transform array (light userdata, stack slot 2)
/// and a 1-based index (stack slot 3), pushes a light userdata pointing at
/// that element.
///
/// Slot 1 is not read; the script in this file calls the function with `:`
/// syntax, so slot 1 holds the `Transform` library table. The index is not
/// range-checked.
///
/// @return 1, the number of values left on the Lua stack for the caller.
static int get_light_transform( lua_State* L )
{
    Transform* const p_array = static_cast<Transform*>( lua_touserdata( L, 2 ) );
    const int lua_index = luaL_checkint( L, 3 );

    // Lua indices start at 1, C++ array indices at 0.
    Transform* const p_element = &p_array[ lua_index - 1 ];
    lua_pushlightuserdata( L, p_element );
    return 1;
}
/// Lua C function: reads position_x from the Transform whose light userdata
/// sits in stack slot 2 and pushes it as a Lua number.
///
/// Slot 1 is not read (the script in this file calls with `:` syntax).
///
/// @return 1, the number of values left on the Lua stack for the caller.
static int get_position_x( lua_State* L )
{
    const Transform* const p_source = static_cast<Transform*>( lua_touserdata( L, 2 ) );
    const lua_Number value = p_source->position_x;
    lua_pushnumber( L, value );
    return 1;
}
/// Lua C function: writes the number in stack slot 3 into position_x of the
/// Transform whose light userdata sits in stack slot 2.
///
/// Slot 1 is not read (the script in this file calls with `:` syntax).
///
/// @return 0; nothing is pushed for the caller.
static int set_position_x( lua_State* L )
{
    Transform* const p_target = static_cast<Transform*>( lua_touserdata( L, 2 ) );
    const lua_Number value = lua_tonumber( L, 3 );

    // Lua numbers are wider than the float field, so narrow explicitly.
    p_target->position_x = static_cast<float>( value );
    return 0;
}
/// Lua C function: reads scale_x from the Transform whose light userdata
/// sits in stack slot 2 and pushes it as a Lua number.
///
/// Slot 1 is not read (the script in this file calls with `:` syntax).
///
/// @return 1, the number of values left on the Lua stack for the caller.
static int get_scale_x( lua_State* L )
{
    const Transform* const p_source = static_cast<Transform*>( lua_touserdata( L, 2 ) );
    const lua_Number value = p_source->scale_x;
    lua_pushnumber( L, value );
    return 1;
}
/// Lua C function: writes the number in stack slot 3 into scale_x of the
/// Transform whose light userdata sits in stack slot 2.
///
/// Slot 1 is not read (the script in this file calls with `:` syntax).
///
/// @return 0; nothing is pushed for the caller.
static int set_scale_x( lua_State* L )
{
    Transform* const p_target = static_cast<Transform*>( lua_touserdata( L, 2 ) );
    const lua_Number value = lua_tonumber( L, 3 );

    // Lua numbers are wider than the float field, so narrow explicitly.
    p_target->scale_x = static_cast<float>( value );
    return 0;
}
/// Registers the light-userdata accessor functions under the global Lua
/// name "Transform".
///
/// NOTE(review): luaL_openlib presumably leaves the library table on the
/// stack and it is not popped here — confirm against the Lua/LuaJIT docs.
static void create_transform_library( lua_State* L )
{
    // Name -> C function table; the all-null entry terminates the list.
    static const luaL_Reg k_functions[] = {
        { "GetLightTransformArray", get_light_transform_array },
        { "GetLightTransform", get_light_transform },
        { "GetPositionX", get_position_x },
        { "SetPositionX", set_position_x },
        { "GetScaleX", get_scale_x },
        { "SetScaleX", set_scale_x },
        { nullptr, nullptr },
    };

    luaL_openlib( L, "Transform", k_functions, 0 );
}
void lightuserdata_perf_test( int32_t iterations, int32_t count )
{
lua_State* p_lua = luaL_newstate();
luaL_openlibs( p_lua );
create_transform_library( p_lua );
int status = luaL_dostring( p_lua, R"(
function Update( count )
local transforms = Transform:GetLightTransformArray()
for i = 1, count, 1 do
local light_transform = Transform:GetLightTransform( transforms, i )
local position_x = Transform:GetPositionX( light_transform )
local scale_x = Transform:GetScaleX( light_transform )
position_x = position_x + 0.01
scale_x = scale_x + 0.01
Transform:SetPositionX( light_transform, position_x )
Transform:SetScaleX( light_transform, scale_x )
end
end
)" );
if( status != 0 )
{
printf( "Error: %s\n", lua_tostring( p_lua, -1 ) );
return;
}
auto start = std::chrono::high_resolution_clock::now();
for( int i = 0; i < iterations; ++i )
{
lua_getglobal( p_lua, "Update" );
lua_pushinteger( p_lua, count );
lua_pcall( p_lua, 1, 0, 0 );
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_seconds = end - start;
double elapsed = elapsed_seconds.count();
printf( "Lua (Light Userdata) elapsed time: %fs\n", elapsed );
lua_close( p_lua );
}
int main( int argc, char* argv[] )
{
int32_t iterations = 1000;
int32_t count = 5000;
p_transforms = new Transform[ count ];
memset( p_transforms, 0, sizeof( Transform ) * count );
c_perf_test( iterations, count );
sol_perf_test( iterations, count );
lightuserdata_perf_test( iterations, count );
delete[] p_transforms;
return 0;
}
So a few more hours in, I found out about ffi.cast and wrote another test case that combines light userdata and FFI. Timings are very close to native C++ now. For very performance-critical stuff this could work for me, but I fear I am giving up on safety. Maybe somebody has another idea on how to improve performance without using FFI.
Timings:
Code: