I'm using Verilator to simulate a circuit from a very simple program that just repeatedly sets the clock line high, and then low, until some output conditions are met:
#include "VSim.h"
#include <iostream>
// Simulation time counter; vstep() increments it once after every eval().
vluint64_t main_time = 0;

// Reports the current simulation time (the value of main_time) as a double.
double sc_time_stamp()
{
    return static_cast<double>(main_time);
}
// Runs one full clock period on the model: RESET is held at 0, then
// CLK_25MHZ is driven to 1 and afterwards to 0. The model is evaluated
// and main_time is advanced by one after each of the two clock levels.
void vstep(VSim* top)
{
    top->RESET = 0;

    // level takes the values 1 (rising edge) and then 0 (falling edge).
    for (int level = 1; level >= 0; --level)
    {
        top->CLK_25MHZ = level;
        top->eval();
        ++main_time;
    }
}
// Test bench entry point.
//
// Steps the model through ten rounds. Each round clocks the design until
// VGA_HSYNC and VGA_VSYNC are both 0, and then until VGA_DE is non-zero.
// Every step is checked only after it has been taken, so each wait performs
// at least one clock period. The total number of clock periods is printed
// to standard output.
//
// Returns 0 on completion. The command-line arguments and environment are
// accepted for signature compatibility but are not used.
int main(int argc, char** argv, char** env)
{
    (void)argc;
    (void)argv;
    (void)env;

    VSim* top = new VSim();
    int cycles = 0;

    for (int round = 0; round < 10; ++round)
    {
        // Wait for both sync outputs to be low at the same time.
        do
        {
            vstep(top);
            ++cycles;
        } while (!(top->VGA_HSYNC == 0 && top->VGA_VSYNC == 0));

        // Then wait for the data-enable output to be asserted.
        do
        {
            vstep(top);
            ++cycles;
        } while (!top->VGA_DE);
    }

    // <iostream> is the header this file includes, so report through
    // std::cout rather than relying on printf being declared transitively.
    std::cout << "Verilator, from C: " << cycles << " cycles\n";

    // Let the model run its final blocks before it is destroyed.
    top->final();
    delete top;
    return 0;
}
So the problem is that if I run Verilator in single-threaded mode (i.e. I run `verilator` without a `--threads N` flag, I don't set `VL_THREADED` during compilation, and I don't link `-lpthread` and `verilated_threads.o` into the result), then this program takes ~150 ms:
$ time ../_build/verilator/SimMain
Verilator, from C: 4192001 cycles
real 0m0.137s
user 0m0.133s
sys 0m0.004s
But if I use 4 threads on the same machine (which has 4 physical cores, i.e. 8 logical cores with Hyper-Threading), the process uses 400% CPU, yet it consumes more than 250x the CPU time and the wall-clock time goes up by 70x:
$ time ../_build/verilator/SimMain
Verilator, from C: 4192001 cycles
real 0m9.528s
user 0m37.965s
sys 0m0.016s
What is causing this, and how do I fix it?
EDITED TO ADD: This question is about Verilator. It seems the `c++` question tag has brought in people who know nothing about Verilator and think they should be able to reason this out from first principles. You will not be able to, because the actual multithreading occurs inside the code generated by Verilator.
EDITED TO ADD: The RTL I am simulating is not the nicest Verilog in the world, since it is generated by Clash; but it has come up in the comments that this behaviour might be caused by some property of the design itself. So here's the full Verilog code:
/* AUTOMATICALLY GENERATED VERILOG-2001 SOURCE CODE.
** GENERATED BY CLASH 1.3.0. DO NOT MODIFY.
*/
`timescale 100fs/100fs
// topEntity: clocked by CLK_25MHZ with an asynchronous, active-high RESET.
// It drives two sync outputs, a data-enable output and an 8-bit-per-channel
// colour. Three counter state machines (result_14, result_15, result_17)
// produce the coordinates and sync flags; two position/step registers
// (ds, ds_0) describe a 15x15 box that is painted in a different colour
// from the background and is moved whenever result_9 is high.
module topEntity
( // Inputs
input CLK_25MHZ // clock
, input RESET // reset
// Outputs
, output wire VGA_HSYNC
, output wire VGA_VSYNC
, output wire VGA_DE
, output wire [7:0] VGA_RED
, output wire [7:0] VGA_GREEN
, output wire [7:0] VGA_BLUE
);
wire [23:0] result;
wire b1;
wire [23:0] result_0;
wire result_1;
wire result_2;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [9:0] \x' ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [63:0] \c$x'_app_arg ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire [8:0] x;
// ../src/Bounce.hs:(52,1)-(58,54)
reg [19:0] ds = {10'sd0, 10'sd2};
// ../src/Bounce.hs:84:1-66
wire signed [9:0] dx;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] x_0;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] dx_0;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] diff;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2;
reg [19:0] result_3;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt;
// ../src/Bounce.hs:89:1-74
wire [19:0] ds1;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] x_1;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] dx_1;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] diff_0;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2_0;
reg [19:0] result_4;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt_0;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [9:0] ballY;
wire result_5;
wire result_6;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [10:0] \x'_0 ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [63:0] \c$x'_app_arg_0 ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire [9:0] x_2;
// ../src/Bounce.hs:(52,1)-(58,54)
reg [21:0] ds_0 = {11'sd0, 11'sd3};
// ../src/Bounce.hs:84:1-66
wire signed [10:0] dx_2;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] x_3;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] dx_3;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] diff_1;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2_1;
reg [21:0] result_7;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt_1;
// ../src/Bounce.hs:89:1-74
wire [21:0] ds1_0;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] x_4;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] dx_4;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] diff_2;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2_2;
reg [21:0] result_8;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt_2;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [10:0] ballX;
wire result_9;
// ../src/Bounce.hs:(52,1)-(58,54)
reg old = 1'b0;
// ../src/Bounce.hs:(52,1)-(58,54)
wire c$frameEnd_case_alt;
// ../src/Bounce.hs:(52,1)-(58,54)
wire s;
wire [23:0] result_10;
wire c$app_arg;
reg [9:0] vgaY;
wire c$app_arg_0;
reg [10:0] vgaX;
wire [0:0] c$app_arg_1;
reg eta;
wire [0:0] c$app_arg_2;
reg eta_0;
wire [8:0] coord;
reg result_11;
wire [10:0] c$case_alt;
wire [10:0] c$case_alt_0;
wire [10:0] c$case_alt_1;
wire [10:0] c$case_alt_2;
reg [10:0] result_12;
reg [11:0] result_13;
wire [5:0] cnt;
wire [11:0] c$case_alt_3;
wire [6:0] cnt_0;
wire [11:0] c$case_alt_4;
wire [3:0] cnt_1;
wire [11:0] c$case_alt_5;
wire [9:0] cnt_2;
wire [11:0] c$case_alt_6;
reg [11:0] result_14 = {2'b00,10'd0};
wire [5:0] cnt_3;
wire [8:0] cnt_4;
wire [3:0] cnt_5;
wire [0:0] cnt_6;
wire [4:0] cnt_7;
reg [10:0] result_15 = {2'b00,9'd0};
wire [9:0] coord_0;
reg [11:0] result_16;
wire [5:0] cnt_8;
wire [11:0] c$case_alt_7;
wire [6:0] cnt_9;
wire [11:0] c$case_alt_8;
wire [3:0] cnt_10;
wire [11:0] c$case_alt_9;
wire [9:0] cnt_11;
wire [11:0] c$case_alt_10;
reg [11:0] result_17 = {2'b00,10'd0};
wire [9:0] result_selection_3;
wire [10:0] result_selection_9;
wire [9:0] s_selection_1;
wire [26:0] VGA;
wire [2:0] VGA_0;
// Output colour: all zeroes while b1 is set (result_10[21], the
// data-enable bit, is low); otherwise result_0.
assign result = b1 ? {8'd0, 8'd0,
8'd0} : result_0;
// VGA = {three flag bits of result_10, 24-bit colour}.
assign VGA = {result_10[23:21],
result[23:16], result[15:8], result[7:0]};
assign b1 = ~ result_10[21:21];
// {240,224,64} when the pixel is inside the box on both axes,
// {48,48,48} otherwise.
assign result_0 = (result_5 & result_1) ? {8'd240,
8'd224, 8'd64} : {8'd48, 8'd48, 8'd48};
// Vertical hit test: requires the valid bit of vgaY (result_10[9]) and
// ballY <= y < ballY + 15, with y taken from result_10[8:0].
assign result_selection_3 = result_10[9:0];
assign result_1 = result_selection_3[9:9] ? result_2 : 1'b0;
assign result_2 = (ballY <= \x' ) ? (\x' < (ballY + 10'sd15)) : 1'b0;
assign \x' = $signed(\c$x'_app_arg [0+:10]);
assign \c$x'_app_arg = $unsigned({{(64-9) {1'b0}},x});
assign x = result_10[8:0];
// ds = {ballY, vertical step}. Reset value {0, 2}; loaded from result_3
// on a clock edge where result_9 is high.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : ds_register
if ( RESET) begin
ds <= {10'sd0, 10'sd2};
end else if (result_9) begin
ds <= result_3;
end
end
// register end
assign dx = $signed(ds[9:0]);
// Second reflection stage, applied to result_4: when the position field
// is below 0 (diff > 0) the position becomes diff and the step is negated.
assign x_0 = $signed(result_4[19:10]);
assign dx_0 = $signed(result_4[9:0]);
assign diff = 10'sd0 - x_0;
assign ds2 = (10'sd0 == diff) ? 2'd1 : c$ds2_case_alt;
always @(*) begin
case(ds2)
2'b00 : result_3 = {10'sd0 + diff, -dx_0};
default : result_3 = result_4;
endcase
end
assign c$ds2_case_alt = (10'sd0 <= diff) ? 2'd0 : 2'd2;
// First reflection stage: ds1 is the position advanced by the step. When
// the new position exceeds 464 (diff_0 < 0) it is mirrored about 464 and
// the step is negated.
assign ds1 = {ballY + dx, dx};
assign x_1 = $signed(ds1[19:10]);
assign dx_1 = $signed(ds1[9:0]);
assign diff_0 = 10'sd464 - x_1;
assign ds2_0 = (10'sd0 == diff_0) ? 2'd1 : c$ds2_case_alt_0;
always @(*) begin
case(ds2_0)
2'b10 : result_4 = {10'sd464 + diff_0,
-dx_1};
default : result_4 = ds1;
endcase
end
assign c$ds2_case_alt_0 = (10'sd0 <= diff_0) ? 2'd0 : 2'd2;
assign ballY = $signed(ds[19:10]);
// Horizontal hit test: requires the valid bit of vgaX (result_10[20]) and
// ballX <= x < ballX + 15, with x taken from result_10[19:10].
assign result_selection_9 = result_10[20:10];
assign result_5 = result_selection_9[10:10] ? result_6 : 1'b0;
assign result_6 = (ballX <= \x'_0 ) ? (\x'_0 < (ballX + 11'sd15)) : 1'b0;
assign \x'_0 = $signed(\c$x'_app_arg_0 [0+:11]);
assign \c$x'_app_arg_0 = $unsigned({{(64-10) {1'b0}},x_2});
assign x_2 = result_10[19:10];
// ds_0 = {ballX, horizontal step}. Reset value {0, 3}; loaded from
// result_7 on a clock edge where result_9 is high.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : ds_0_register
if ( RESET) begin
ds_0 <= {11'sd0, 11'sd3};
end else if (result_9) begin
ds_0 <= result_7;
end
end
// register end
assign dx_2 = $signed(ds_0[10:0]);
// Same two-stage reflection as for ds, with bounds 0 and 624.
assign x_3 = $signed(result_8[21:11]);
assign dx_3 = $signed(result_8[10:0]);
assign diff_1 = 11'sd0 - x_3;
assign ds2_1 = (11'sd0 == diff_1) ? 2'd1 : c$ds2_case_alt_1;
always @(*) begin
case(ds2_1)
2'b00 : result_7 = {11'sd0 + diff_1, -dx_3};
default : result_7 = result_8;
endcase
end
assign c$ds2_case_alt_1 = (11'sd0 <= diff_1) ? 2'd0 : 2'd2;
assign ds1_0 = {ballX + dx_2, dx_2};
assign x_4 = $signed(ds1_0[21:11]);
assign dx_4 = $signed(ds1_0[10:0]);
assign diff_2 = 11'sd624 - x_4;
assign ds2_2 = (11'sd0 == diff_2) ? 2'd1 : c$ds2_case_alt_2;
always @(*) begin
case(ds2_2)
2'b10 : result_8 = {11'sd624 + diff_2,
-dx_4};
default : result_8 = ds1_0;
endcase
end
assign c$ds2_case_alt_2 = (11'sd0 <= diff_2) ? 2'd0 : 2'd2;
assign ballX = $signed(ds_0[21:11]);
// result_9 is high when s was 1 on the previous clock (held in old) and
// is 0 now, i.e. on the falling edge of the vgaY valid bit.
assign result_9 = old ? c$frameEnd_case_alt : 1'b0;
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : old_register
if ( RESET) begin
old <= 1'b0;
end else if (1'b1) begin
old <= s;
end
end
// register end
assign c$frameEnd_case_alt = s ? 1'b0 : 1'b1;
assign s_selection_1 = result_10[9:0];
assign s = s_selection_1[9:9] ? 1'b1 : 1'b0;
// result_10 = {~eta_0, ~eta, vgaX valid & vgaY valid, vgaX, vgaY}.
assign result_10 = {{~ (c$app_arg_2),
~ (c$app_arg_1), c$app_arg_0 & c$app_arg},
vgaX, vgaY};
assign c$app_arg = vgaY[9:9] ? 1'b1 : 1'b0;
// vgaY: {1, coord} while result_15 is in state 2'b00; otherwise the top
// (valid) bit is 0 and the payload is don't-care.
always @(*) begin
case(result_15[10:9])
2'b00 : vgaY = {1'b1,coord};
default : vgaY = {1'b0,9'bxxxxxxxxx};
endcase
end
assign c$app_arg_0 = vgaX[10:10] ? 1'b1 : 1'b0;
// vgaX: {1, coord_0} while result_17 is in state 2'b00; otherwise the top
// (valid) bit is 0 and the payload is don't-care.
always @(*) begin
case(result_17[11:10])
2'b00 : vgaX = {1'b1,coord_0};
default : vgaX = {1'b0,10'bxxxxxxxxxx};
endcase
end
assign c$app_arg_1 = eta ? 1'b1 : 1'b0;
// eta is 1 only while result_15 is in state 2'b10.
always @(*) begin
case(result_15[10:9])
2'b10 : eta = 1'b1;
default : eta = 1'b0;
endcase
end
assign c$app_arg_2 = eta_0 ? 1'b1 : 1'b0;
// eta_0 is 1 only while result_17 is in state 2'b10.
always @(*) begin
case(result_17[11:10])
2'b10 : eta_0 = 1'b1;
default : eta_0 = 1'b0;
endcase
end
assign coord = result_15[8:0];
// result_11 enables the result_15 update: high when result_14 is in
// state 2'b11 with its count equal to 47.
always @(*) begin
case(result_14[11:10])
2'b11 : result_11 = cnt_3 == 6'd47;
default : result_11 = 1'b0;
endcase
end
// Next-state candidates for result_15, one per state: state 00 counts up
// to 479, state 01 to 10, state 10 to 1 and state 11 to 30; each then
// moves to the next state, with state 11 wrapping back to state 00.
assign c$case_alt = (cnt_4 == 9'd479) ? {2'b01,4'd0,5'bxxxxx} : {2'b00,cnt_4 + 9'd1};
assign c$case_alt_0 = (cnt_5 == 4'd10) ? {2'b10,1'd0,8'bxxxxxxxx} : {2'b01,cnt_5 + 4'd1,5'bxxxxx};
assign c$case_alt_1 = (cnt_6 == 1'd1) ? {2'b11,5'd0,4'bxxxx} : {2'b10,cnt_6 + 1'd1,8'bxxxxxxxx};
assign c$case_alt_2 = (cnt_7 == 5'd30) ? {2'b00,9'd0} : {2'b11,cnt_7 + 5'd1,4'bxxxx};
always @(*) begin
case(result_15[10:9])
2'b00 : result_12 = c$case_alt;
2'b01 : result_12 = c$case_alt_0;
2'b10 : result_12 = c$case_alt_1;
default : result_12 = c$case_alt_2;
endcase
end
// Next state of result_14, selected by its current state bits.
always @(*) begin
case(result_14[11:10])
2'b00 : result_13 = c$case_alt_6;
2'b01 : result_13 = c$case_alt_5;
2'b10 : result_13 = c$case_alt_4;
default : result_13 = c$case_alt_3;
endcase
end
// Next-state candidates for result_14: state 00 counts up to 639, state 01
// to 15, state 10 to 95 and state 11 to 47, then wraps back to state 00.
assign cnt = result_14[9:4];
assign c$case_alt_3 = (cnt == 6'd47) ? {2'b00,10'd0} : {2'b11,cnt + 6'd1,4'bxxxx};
assign cnt_0 = result_14[9:3];
assign c$case_alt_4 = (cnt_0 == 7'd95) ? {2'b11,6'd0,4'bxxxx} : {2'b10,cnt_0 + 7'd1,3'bxxx};
assign cnt_1 = result_14[9:6];
assign c$case_alt_5 = (cnt_1 == 4'd15) ? {2'b10,7'd0,3'bxxx} : {2'b01,cnt_1 + 4'd1,6'bxxxxxx};
assign cnt_2 = result_14[9:0];
assign c$case_alt_6 = (cnt_2 == 10'd639) ? {2'b01,4'd0,6'bxxxxxx} : {2'b00,cnt_2 + 10'd1};
// result_14 advances on every clock edge.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : result_14_register
if ( RESET) begin
result_14 <= {2'b00,10'd0};
end else if (1'b1) begin
result_14 <= result_13;
end
end
// register end
assign cnt_3 = result_14[9:4];
assign cnt_4 = result_15[8:0];
assign cnt_5 = result_15[8:5];
assign cnt_6 = result_15[8:8];
assign cnt_7 = result_15[8:4];
// result_15 advances only on clock edges where result_11 is high.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : result_15_register
if ( RESET) begin
result_15 <= {2'b00,9'd0};
end else if (result_11) begin
result_15 <= result_12;
end
end
// register end
assign coord_0 = result_17[9:0];
// result_17 uses the same next-state logic as result_14 (limits 639, 15,
// 95, 47) on its own register, and also advances on every clock edge.
always @(*) begin
case(result_17[11:10])
2'b00 : result_16 = c$case_alt_10;
2'b01 : result_16 = c$case_alt_9;
2'b10 : result_16 = c$case_alt_8;
default : result_16 = c$case_alt_7;
endcase
end
assign cnt_8 = result_17[9:4];
assign c$case_alt_7 = (cnt_8 == 6'd47) ? {2'b00,10'd0} : {2'b11,cnt_8 + 6'd1,4'bxxxx};
assign cnt_9 = result_17[9:3];
assign c$case_alt_8 = (cnt_9 == 7'd95) ? {2'b11,6'd0,4'bxxxx} : {2'b10,cnt_9 + 7'd1,3'bxxx};
assign cnt_10 = result_17[9:6];
assign c$case_alt_9 = (cnt_10 == 4'd15) ? {2'b10,7'd0,3'bxxx} : {2'b01,cnt_10 + 4'd1,6'bxxxxxx};
assign cnt_11 = result_17[9:0];
assign c$case_alt_10 = (cnt_11 == 10'd639) ? {2'b01,4'd0,6'bxxxxxx} : {2'b00,cnt_11 + 10'd1};
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : result_17_register
if ( RESET) begin
result_17 <= {2'b00,10'd0};
end else if (1'b1) begin
result_17 <= result_16;
end
end
// register end
// Unpack VGA: bits [26:24] are {HSYNC, VSYNC, DE}, bits [23:0] are
// {RED, GREEN, BLUE}.
assign VGA_0 = VGA[26:24];
assign VGA_RED = VGA[23:16];
assign VGA_GREEN = VGA[15:8];
assign VGA_BLUE = VGA[7:0];
assign VGA_HSYNC = VGA_0[2:2];
assign VGA_VSYNC = VGA_0[1:1];
assign VGA_DE = VGA_0[0:0];
endmodule
The Verilator developers gave this answer:
So it seems that the initial guesses were correct, and the code in question is not sufficient to exhibit a speedup. The overhead involved is too high, so the design doesn't benefit from multithreading.