Multithreaded simulation orders of magnitude slower than single-threaded

618 views Asked by At

I'm using Verilator to simulate a circuit from a very simple program that just repeatedly sets the clock line high, and then low, until some output conditions are met:

#include "VSim.h"
#include <cstdio>
#include <iostream>
#include <memory>

// Simulated time in ticks; vstep() increments it after every eval() call.
vluint64_t main_time = 0;

// Reports the current simulated time, converted to double.
// NOTE(review): presumably this is the time callback the Verilator runtime
// resolves by name at link time -- confirm against the Verilator documentation.
double sc_time_stamp()
{
    return static_cast<double>(main_time);
}

// Advances the model by one full clock period: a high phase followed by a
// low phase, each evaluated once and each counted as one tick of main_time.
// RESET is written to 0 on every call, so this never asserts reset.
void vstep(VSim* top)
{
    // Drive CLK_25MHZ to `level`, let the model settle, then advance time.
    const auto drive_clock = [top](int level) {
        top->CLK_25MHZ = level;
        top->eval();
        ++main_time;
    };

    top->RESET = 0;
    drive_clock(1);
    drive_clock(0);
}

int main(int argc, char** argv, char** env)
{
    VSim* top = new VSim();

    int cycles = 0;

    for (int j = 0; j < 10; ++j)
    {
        for (;;)
        {
            vstep(top);
            cycles++;
            if (top->VGA_HSYNC == 0 && top->VGA_VSYNC == 0) break;
        }

        for (;;)
        {
            vstep(top);
            cycles++;
            if (top->VGA_DE) break;
        }
    }

    printf("Verilator, from C: %d cycles\n", cycles);
    delete top;
    return 0;
}

So the problem is that if I run Verilator in single-threaded mode (i.e. I run verilator without a --threads N flag, I don't set VL_THREADED during compilation, and I don't link -lpthread and verilated_threads.o into the result), then this program takes ~150 ms:

$ time ../_build/verilator/SimMain 
Verilator, from C: 4192001 cycles

real    0m0.137s
user    0m0.133s
sys     0m0.004s

But if I use 4 threads on the same machine (which has 4 physical cores, i.e. 8 hyper-threaded cores in total), the process runs at 400% CPU, consumes more than 250x as much CPU time, and the wall-clock time goes up by about 70x:

$ time ../_build/verilator/SimMain 
Verilator, from C: 4192001 cycles

real    0m9.528s
user    0m37.965s
sys     0m0.016s

What is causing this, and how do I fix it?

EDITED TO ADD: This question is about Verilator. It seems the c++ question tag has brought in people who know nothing about Verilator and think they should be able to reason this out from first principles. You will not be able to, because the actual multithreading occurs inside the code generated by Verilator.

EDITED TO ADD: The RTL I am simulating is not the nicest Verilog in the world, since it is generated by Clash; but it has come up in the comments that this behaviour might be caused by some of its properties. So here's the full Verilog code:

/* AUTOMATICALLY GENERATED VERILOG-2001 SOURCE CODE.
** GENERATED BY CLASH 1.3.0. DO NOT MODIFY.
*/
`timescale 100fs/100fs
// topEntity: VGA pattern generator that draws a 15x15 rectangle ("ball",
// per the ballX/ballY signal names) bouncing around a 640x480 visible area.
//
// Structure, as established by the logic below:
//   * result_17 / result_14 - two identical horizontal state machines.
//     Bits [11:10] select the phase and the low bits hold a phase-local
//     counter: 640 visible clocks, then 16, 96 and 48 clocks (800 per line).
//   * result_15 - vertical state machine, advanced once per line (result_11).
//     Bits [10:9] select the phase: 480 visible lines, then 11, 2 and 31
//     lines (524 per frame).
//   * ds / ds_0 - {position, velocity} registers for the ball's Y and X
//     coordinates, updated once per frame (result_9).
//   * VGA_HSYNC / VGA_VSYNC are driven low during phase 2'b10 of the
//     horizontal / vertical state machine; VGA_DE is high only while both
//     the X and the Y coordinate are valid.
// All registers reset asynchronously on a high RESET.
module topEntity
    ( // Inputs
      input  CLK_25MHZ // clock
    , input  RESET // reset

      // Outputs
    , output wire  VGA_HSYNC
    , output wire  VGA_VSYNC
    , output wire  VGA_DE
    , output wire [7:0] VGA_RED
    , output wire [7:0] VGA_GREEN
    , output wire [7:0] VGA_BLUE
    );
  wire [23:0] result;
  wire  b1;
  wire [23:0] result_0;
  wire  result_1;
  wire  result_2;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [9:0] \x' ;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [63:0] \c$x'_app_arg ;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire [8:0] x;
  // ../src/Bounce.hs:(52,1)-(58,54)
  reg [19:0] ds = {10'sd0,   10'sd2};
  // ../src/Bounce.hs:84:1-66
  wire signed [9:0] dx;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] x_0;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] dx_0;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] diff;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] ds2;
  reg [19:0] result_3;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] c$ds2_case_alt;
  // ../src/Bounce.hs:89:1-74
  wire [19:0] ds1;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] x_1;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] dx_1;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] diff_0;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] ds2_0;
  reg [19:0] result_4;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] c$ds2_case_alt_0;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [9:0] ballY;
  wire  result_5;
  wire  result_6;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [10:0] \x'_0 ;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [63:0] \c$x'_app_arg_0 ;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire [9:0] x_2;
  // ../src/Bounce.hs:(52,1)-(58,54)
  reg [21:0] ds_0 = {11'sd0,   11'sd3};
  // ../src/Bounce.hs:84:1-66
  wire signed [10:0] dx_2;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] x_3;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] dx_3;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] diff_1;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] ds2_1;
  reg [21:0] result_7;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] c$ds2_case_alt_1;
  // ../src/Bounce.hs:89:1-74
  wire [21:0] ds1_0;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] x_4;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] dx_4;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] diff_2;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] ds2_2;
  reg [21:0] result_8;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] c$ds2_case_alt_2;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [10:0] ballX;
  wire  result_9;
  // ../src/Bounce.hs:(52,1)-(58,54)
  reg  old = 1'b0;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire  c$frameEnd_case_alt;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire  s;
  wire [23:0] result_10;
  wire  c$app_arg;
  reg [9:0] vgaY;
  wire  c$app_arg_0;
  reg [10:0] vgaX;
  wire [0:0] c$app_arg_1;
  reg  eta;
  wire [0:0] c$app_arg_2;
  reg  eta_0;
  wire [8:0] coord;
  reg  result_11;
  wire [10:0] c$case_alt;
  wire [10:0] c$case_alt_0;
  wire [10:0] c$case_alt_1;
  wire [10:0] c$case_alt_2;
  reg [10:0] result_12;
  reg [11:0] result_13;
  wire [5:0] cnt;
  wire [11:0] c$case_alt_3;
  wire [6:0] cnt_0;
  wire [11:0] c$case_alt_4;
  wire [3:0] cnt_1;
  wire [11:0] c$case_alt_5;
  wire [9:0] cnt_2;
  wire [11:0] c$case_alt_6;
  reg [11:0] result_14 = {2'b00,10'd0};
  wire [5:0] cnt_3;
  wire [8:0] cnt_4;
  wire [3:0] cnt_5;
  wire [0:0] cnt_6;
  wire [4:0] cnt_7;
  reg [10:0] result_15 = {2'b00,9'd0};
  wire [9:0] coord_0;
  reg [11:0] result_16;
  wire [5:0] cnt_8;
  wire [11:0] c$case_alt_7;
  wire [6:0] cnt_9;
  wire [11:0] c$case_alt_8;
  wire [3:0] cnt_10;
  wire [11:0] c$case_alt_9;
  wire [9:0] cnt_11;
  wire [11:0] c$case_alt_10;
  reg [11:0] result_17 = {2'b00,10'd0};
  wire [9:0] result_selection_3;
  wire [10:0] result_selection_9;
  wire [9:0] s_selection_1;
  wire [26:0] VGA;
  wire [2:0] VGA_0;

  // Pixel colour: forced to black whenever data-enable (result_10[21]) is low.
  assign result = b1 ? {8'd0,   8'd0,
                        8'd0} : result_0;

  // VGA bundles {hsync, vsync, de} with the 24-bit RGB value.
  assign VGA = {result_10[23:21],
                result[23:16],   result[15:8],   result[7:0]};

  assign b1 = ~ result_10[21:21];

  // Ball colour where the pixel lies inside the ball on both axes,
  // background colour everywhere else.
  assign result_0 = (result_5 & result_1) ? {8'd240,
                                             8'd224,   8'd64} : {8'd48,   8'd48,   8'd48};

  assign result_selection_3 = result_10[9:0];

  // Vertical hit test; only meaningful while the Y coordinate is valid (bit 9).
  assign result_1 = result_selection_3[9:9] ? result_2 : 1'b0;

  // True when ballY <= y < ballY + 15.
  assign result_2 = (ballY <= \x' ) ? (\x'  < (ballY + 10'sd15)) : 1'b0;

  assign \x'  = $signed(\c$x'_app_arg [0+:10]);

  assign \c$x'_app_arg  = $unsigned({{(64-9) {1'b0}},x});

  assign x = result_10[8:0];

  // ds = {ballY, vertical velocity}; loaded once per frame when result_9 is high.
  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : ds_register
    if ( RESET) begin
      ds <= {10'sd0,   10'sd2};
    end else if (result_9) begin
      ds <= result_3;
    end
  end
  // register end

  assign dx = $signed(ds[9:0]);

  assign x_0 = $signed(result_4[19:10]);

  assign dx_0 = $signed(result_4[9:0]);

  assign diff = 10'sd0 - x_0;

  assign ds2 = (10'sd0 == diff) ? 2'd1 : c$ds2_case_alt;

  // Lower-bound reflection: selected when diff > 0 (position below 0);
  // the position becomes 0 + diff and the velocity is negated.
  always @(*) begin
    case(ds2)
      2'b00 : result_3 = {10'sd0 + diff,   -dx_0};
      default : result_3 = result_4;
    endcase
  end

  assign c$ds2_case_alt = (10'sd0 <= diff) ? 2'd0 : 2'd2;

  // Candidate next state: position advanced by the velocity.
  assign ds1 = {ballY + dx,   dx};

  assign x_1 = $signed(ds1[19:10]);

  assign dx_1 = $signed(ds1[9:0]);

  assign diff_0 = 10'sd464 - x_1;

  assign ds2_0 = (10'sd0 == diff_0) ? 2'd1 : c$ds2_case_alt_0;

  // Upper-bound reflection: selected when diff_0 < 0 (position above 464);
  // the position becomes 464 + diff_0 and the velocity is negated.
  always @(*) begin
    case(ds2_0)
      2'b10 : result_4 = {10'sd464 + diff_0,
                          -dx_1};
      default : result_4 = ds1;
    endcase
  end

  assign c$ds2_case_alt_0 = (10'sd0 <= diff_0) ? 2'd0 : 2'd2;

  assign ballY = $signed(ds[19:10]);

  assign result_selection_9 = result_10[20:10];

  // Horizontal hit test; only meaningful while the X coordinate is valid (bit 10).
  assign result_5 = result_selection_9[10:10] ? result_6 : 1'b0;

  // True when ballX <= x < ballX + 15.
  assign result_6 = (ballX <= \x'_0 ) ? (\x'_0  < (ballX + 11'sd15)) : 1'b0;

  assign \x'_0  = $signed(\c$x'_app_arg_0 [0+:11]);

  assign \c$x'_app_arg_0  = $unsigned({{(64-10) {1'b0}},x_2});

  assign x_2 = result_10[19:10];

  // ds_0 = {ballX, horizontal velocity}; loaded once per frame when result_9 is high.
  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : ds_0_register
    if ( RESET) begin
      ds_0 <= {11'sd0,   11'sd3};
    end else if (result_9) begin
      ds_0 <= result_7;
    end
  end
  // register end

  assign dx_2 = $signed(ds_0[10:0]);

  assign x_3 = $signed(result_8[21:11]);

  assign dx_3 = $signed(result_8[10:0]);

  assign diff_1 = 11'sd0 - x_3;

  assign ds2_1 = (11'sd0 == diff_1) ? 2'd1 : c$ds2_case_alt_1;

  // Lower-bound reflection for X (same scheme as result_3).
  always @(*) begin
    case(ds2_1)
      2'b00 : result_7 = {11'sd0 + diff_1,   -dx_3};
      default : result_7 = result_8;
    endcase
  end

  assign c$ds2_case_alt_1 = (11'sd0 <= diff_1) ? 2'd0 : 2'd2;

  assign ds1_0 = {ballX + dx_2,   dx_2};

  assign x_4 = $signed(ds1_0[21:11]);

  assign dx_4 = $signed(ds1_0[10:0]);

  assign diff_2 = 11'sd624 - x_4;

  assign ds2_2 = (11'sd0 == diff_2) ? 2'd1 : c$ds2_case_alt_2;

  // Upper-bound reflection for X at 624 (same scheme as result_4).
  always @(*) begin
    case(ds2_2)
      2'b10 : result_8 = {11'sd624 + diff_2,
                          -dx_4};
      default : result_8 = ds1_0;
    endcase
  end

  assign c$ds2_case_alt_2 = (11'sd0 <= diff_2) ? 2'd0 : 2'd2;

  assign ballX = $signed(ds_0[21:11]);

  // Frame-end strobe: high when s was 1 on the previous clock and is 0 now.
  assign result_9 = old ? c$frameEnd_case_alt : 1'b0;

  // old holds the value s had one clock earlier.
  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : old_register
    if ( RESET) begin
      old <= 1'b0;
    end else if (1'b1) begin
      old <= s;
    end
  end
  // register end

  assign c$frameEnd_case_alt = s ? 1'b0 : 1'b1;

  assign s_selection_1 = result_10[9:0];

  // s mirrors the valid bit of the Y coordinate (vgaY[9]).
  assign s = s_selection_1[9:9] ? 1'b1 : 1'b0;

  // result_10 = {~hsync-phase, ~vsync-phase, data-enable, vgaX, vgaY}.
  assign result_10 = {{~ (c$app_arg_2),
                       ~ (c$app_arg_1),   c$app_arg_0 & c$app_arg},
                      vgaX,   vgaY};

  assign c$app_arg = vgaY[9:9] ? 1'b1 : 1'b0;

  // vgaY = {valid, line number}; valid only in vertical phase 2'b00.
  always @(*) begin
    case(result_15[10:9])
      2'b00 : vgaY = {1'b1,coord};
      default : vgaY = {1'b0,9'bxxxxxxxxx};
    endcase
  end

  assign c$app_arg_0 = vgaX[10:10] ? 1'b1 : 1'b0;

  // vgaX = {valid, column number}; valid only in horizontal phase 2'b00.
  always @(*) begin
    case(result_17[11:10])
      2'b00 : vgaX = {1'b1,coord_0};
      default : vgaX = {1'b0,10'bxxxxxxxxxx};
    endcase
  end

  assign c$app_arg_1 = eta ? 1'b1 : 1'b0;

  // eta is high during vertical phase 2'b10 (inverted to form VGA_VSYNC).
  always @(*) begin
    case(result_15[10:9])
      2'b10 : eta = 1'b1;
      default : eta = 1'b0;
    endcase
  end

  assign c$app_arg_2 = eta_0 ? 1'b1 : 1'b0;

  // eta_0 is high during horizontal phase 2'b10 (inverted to form VGA_HSYNC).
  always @(*) begin
    case(result_17[11:10])
      2'b10 : eta_0 = 1'b1;
      default : eta_0 = 1'b0;
    endcase
  end

  assign coord = result_15[8:0];

  // result_11 is high on the last clock of a line (phase 2'b11, count 47);
  // it is the enable of the vertical state register result_15.
  always @(*) begin
    case(result_14[11:10])
      2'b11 : result_11 = cnt_3 == 6'd47;
      default : result_11 = 1'b0;
    endcase
  end

  // Vertical next-state candidates, one per phase (480, 11, 2 and 31 lines).
  assign c$case_alt = (cnt_4 == 9'd479) ? {2'b01,4'd0,5'bxxxxx} : {2'b00,cnt_4 + 9'd1};

  assign c$case_alt_0 = (cnt_5 == 4'd10) ? {2'b10,1'd0,8'bxxxxxxxx} : {2'b01,cnt_5 + 4'd1,5'bxxxxx};

  assign c$case_alt_1 = (cnt_6 == 1'd1) ? {2'b11,5'd0,4'bxxxx} : {2'b10,cnt_6 + 1'd1,8'bxxxxxxxx};

  assign c$case_alt_2 = (cnt_7 == 5'd30) ? {2'b00,9'd0} : {2'b11,cnt_7 + 5'd1,4'bxxxx};

  always @(*) begin
    case(result_15[10:9])
      2'b00 : result_12 = c$case_alt;
      2'b01 : result_12 = c$case_alt_0;
      2'b10 : result_12 = c$case_alt_1;
      default : result_12 = c$case_alt_2;
    endcase
  end

  // Next state of the horizontal state machine held in result_14.
  always @(*) begin
    case(result_14[11:10])
      2'b00 : result_13 = c$case_alt_6;
      2'b01 : result_13 = c$case_alt_5;
      2'b10 : result_13 = c$case_alt_4;
      default : result_13 = c$case_alt_3;
    endcase
  end

  assign cnt = result_14[9:4];

  assign c$case_alt_3 = (cnt == 6'd47) ? {2'b00,10'd0} : {2'b11,cnt + 6'd1,4'bxxxx};

  assign cnt_0 = result_14[9:3];

  assign c$case_alt_4 = (cnt_0 == 7'd95) ? {2'b11,6'd0,4'bxxxx} : {2'b10,cnt_0 + 7'd1,3'bxxx};

  assign cnt_1 = result_14[9:6];

  assign c$case_alt_5 = (cnt_1 == 4'd15) ? {2'b10,7'd0,3'bxxx} : {2'b01,cnt_1 + 4'd1,6'bxxxxxx};

  assign cnt_2 = result_14[9:0];

  assign c$case_alt_6 = (cnt_2 == 10'd639) ? {2'b01,4'd0,6'bxxxxxx} : {2'b00,cnt_2 + 10'd1};

  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : result_14_register
    if ( RESET) begin
      result_14 <= {2'b00,10'd0};
    end else if (1'b1) begin
      result_14 <= result_13;
    end
  end
  // register end

  assign cnt_3 = result_14[9:4];

  assign cnt_4 = result_15[8:0];

  assign cnt_5 = result_15[8:5];

  assign cnt_6 = result_15[8:8];

  assign cnt_7 = result_15[8:4];

  // Vertical state register; only advances when result_11 marks the end of a line.
  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : result_15_register
    if ( RESET) begin
      result_15 <= {2'b00,9'd0};
    end else if (result_11) begin
      result_15 <= result_12;
    end
  end
  // register end

  assign coord_0 = result_17[9:0];

  // Next state of the horizontal state machine held in result_17
  // (same transitions as result_13 / result_14).
  always @(*) begin
    case(result_17[11:10])
      2'b00 : result_16 = c$case_alt_10;
      2'b01 : result_16 = c$case_alt_9;
      2'b10 : result_16 = c$case_alt_8;
      default : result_16 = c$case_alt_7;
    endcase
  end

  assign cnt_8 = result_17[9:4];

  assign c$case_alt_7 = (cnt_8 == 6'd47) ? {2'b00,10'd0} : {2'b11,cnt_8 + 6'd1,4'bxxxx};

  assign cnt_9 = result_17[9:3];

  assign c$case_alt_8 = (cnt_9 == 7'd95) ? {2'b11,6'd0,4'bxxxx} : {2'b10,cnt_9 + 7'd1,3'bxxx};

  assign cnt_10 = result_17[9:6];

  assign c$case_alt_9 = (cnt_10 == 4'd15) ? {2'b10,7'd0,3'bxxx} : {2'b01,cnt_10 + 4'd1,6'bxxxxxx};

  assign cnt_11 = result_17[9:0];

  assign c$case_alt_10 = (cnt_11 == 10'd639) ? {2'b01,4'd0,6'bxxxxxx} : {2'b00,cnt_11 + 10'd1};

  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : result_17_register
    if ( RESET) begin
      result_17 <= {2'b00,10'd0};
    end else if (1'b1) begin
      result_17 <= result_16;
    end
  end
  // register end

  // Unpack the VGA bundle onto the output ports.
  assign VGA_0 = VGA[26:24];

  assign VGA_RED = VGA[23:16];

  assign VGA_GREEN = VGA[15:8];

  assign VGA_BLUE = VGA[7:0];

  assign VGA_HSYNC = VGA_0[2:2];

  assign VGA_VSYNC = VGA_0[1:1];

  assign VGA_DE = VGA_0[0:0];


endmodule
1

There are 1 answers

2
Devolus On BEST ANSWER

The Verilator developers gave this answer:

Multithreading will only show speedups on much larger designs. In small designs the communication between cores will be much larger than leaving it on one core.

So it seems that the initial guesses were correct: the design in question is too small to exhibit a speedup. The overhead involved is too high for it to benefit from multithreading.