diff --git a/fpga/common/rtl/stats_counter.v b/fpga/common/rtl/stats_counter.v index c023a802d..09af6e307 100644 --- a/fpga/common/rtl/stats_counter.v +++ b/fpga/common/rtl/stats_counter.v @@ -44,7 +44,9 @@ module stats_counter # // Width of AXI lite address bus in bits parameter AXIL_ADDR_WIDTH = STAT_ID_WIDTH+$clog2(((AXIL_DATA_WIDTH > STAT_COUNT_WIDTH ? AXIL_DATA_WIDTH : STAT_COUNT_WIDTH)+7)/8), // Width of AXI lite wstrb (width of data bus in words) - parameter AXIL_STRB_WIDTH = (AXIL_DATA_WIDTH/8) + parameter AXIL_STRB_WIDTH = (AXIL_DATA_WIDTH/8), + // Pipeline length + parameter PIPELINE = 2 ) ( input wire clk, @@ -86,7 +88,7 @@ parameter ID_SHIFT = $clog2(((AXIL_DATA_WIDTH > STAT_COUNT_WIDTH ? AXIL_DATA_WID parameter WORD_SELECT_SHIFT = $clog2(AXIL_DATA_WIDTH/8); parameter WORD_SELECT_WIDTH = STAT_COUNT_WIDTH > AXIL_DATA_WIDTH ? $clog2((STAT_COUNT_WIDTH+7)/8) - $clog2(AXIL_DATA_WIDTH/8) : 0; -// bus width assertions +// check configuration initial begin if (AXIL_STRB_WIDTH * 8 != AXIL_DATA_WIDTH) begin $error("Error: AXI lite interface requires byte (8-bit) granularity (instance %m)"); @@ -97,45 +99,44 @@ initial begin $error("Error: AXI lite address width too narrow (instance %m)"); $finish; end + + if (PIPELINE < 2) begin + $error("Error: PIPELINE must be at least 2 (instance %m)"); + $finish; + end end -localparam [1:0] - STATE_INIT = 2'd0, - STATE_IDLE = 2'd1, - STATE_READ = 2'd2, - STATE_WRITE = 2'd3; +reg init_reg = 1'b1, init_next; +reg [STAT_ID_WIDTH-1:0] init_ptr_reg = 0, init_ptr_next; -reg [1:0] state_reg = STATE_INIT, state_next; +reg op_acc_pipe_hazard; +reg stage_active; + +reg [PIPELINE-1:0] op_axil_read_pipe_reg = 0, op_axil_read_pipe_next; +reg [PIPELINE-1:0] op_acc_pipe_reg = 0, op_acc_pipe_next; + +reg [STAT_ID_WIDTH-1:0] mem_addr_pipeline_reg[PIPELINE-1:0], mem_addr_pipeline_next[PIPELINE-1:0]; +reg [WORD_SELECT_WIDTH-1:0] axil_shift_pipeline_reg[PIPELINE-1:0], axil_shift_pipeline_next[PIPELINE-1:0]; +reg [STAT_INC_WIDTH-1:0] stat_inc_pipeline_reg[PIPELINE-1:0], stat_inc_pipeline_next[PIPELINE-1:0]; reg s_axis_stat_tready_reg = 1'b0, s_axis_stat_tready_next; -reg s_axil_awready_reg = 1'b0, s_axil_awready_next; -reg s_axil_wready_reg = 1'b0, s_axil_wready_next; -reg s_axil_bvalid_reg = 1'b0, s_axil_bvalid_next; -reg s_axil_arready_reg = 1'b0, s_axil_arready_next; -reg [AXIL_DATA_WIDTH-1:0] s_axil_rdata_reg = {AXIL_DATA_WIDTH{1'b0}}, s_axil_rdata_next; -reg s_axil_rvalid_reg = 1'b0, s_axil_rvalid_next; - -reg [STAT_ID_WIDTH-1:0] id_reg = {STAT_ID_WIDTH{1'b0}}, id_next; -reg [STAT_INC_WIDTH-1:0] inc_reg = {STAT_INC_WIDTH{1'b0}}, inc_next; - -reg rd_data_valid_reg = 1'b0, rd_data_valid_next; -reg [WORD_SELECT_WIDTH-1:0] rd_data_shift_reg = 0, rd_data_shift_next; +reg s_axil_awready_reg = 0, s_axil_awready_next; +reg s_axil_wready_reg = 0, s_axil_wready_next; +reg s_axil_bvalid_reg = 0, s_axil_bvalid_next; +reg s_axil_arready_reg = 0, s_axil_arready_next; +reg [AXIL_DATA_WIDTH-1:0] s_axil_rdata_reg = 0, s_axil_rdata_next; +reg s_axil_rvalid_reg = 0, s_axil_rvalid_next; (* ramstyle = "no_rw_check" *) -reg [STAT_COUNT_WIDTH-1:0] mem_reg[(2**STAT_ID_WIDTH)-1:0]; +reg [STAT_COUNT_WIDTH-1:0] mem[2**STAT_ID_WIDTH-1:0]; -reg [STAT_COUNT_WIDTH-1:0] mem_rd_data_reg = {STAT_COUNT_WIDTH{1'b0}}; -reg [STAT_COUNT_WIDTH-1:0] mem_rd_data_axil_reg = {STAT_COUNT_WIDTH{1'b0}}; - -reg mem_rd_en; -reg mem_wr_en; +reg [STAT_ID_WIDTH-1:0] mem_rd_addr; +reg [STAT_ID_WIDTH-1:0] mem_wr_addr; reg [STAT_COUNT_WIDTH-1:0] mem_wr_data; - -reg mem_rd_en_axil; - -wire [STAT_ID_WIDTH-1:0] s_axil_araddr_id = s_axil_araddr >> ID_SHIFT; -wire [WORD_SELECT_WIDTH-1:0] s_axil_araddr_word = s_axil_araddr >> WORD_SELECT_SHIFT; +reg mem_wr_en; +reg [STAT_COUNT_WIDTH-1:0] mem_read_data_reg = 0; +reg [STAT_COUNT_WIDTH-1:0] mem_read_data_pipeline_reg[PIPELINE-1:1]; assign s_axis_stat_tready = s_axis_stat_tready_reg; @@ -148,169 +149,168 @@ assign s_axil_rdata = s_axil_rdata_reg; assign s_axil_rresp = 2'b00; assign s_axil_rvalid = s_axil_rvalid_reg; +wire [STAT_ID_WIDTH-1:0] s_axil_araddr_id = s_axil_araddr >> ID_SHIFT; +wire [WORD_SELECT_WIDTH-1:0] s_axil_araddr_shift = s_axil_araddr >> WORD_SELECT_SHIFT; + integer i, j; initial begin - // two nested loops for smaller number of iterations per loop - // workaround for synthesizer complaints about large loop counts + // break up loop to work around iteration termination for (i = 0; i < 2**STAT_ID_WIDTH; i = i + 2**(STAT_ID_WIDTH/2)) begin for (j = i; j < i + 2**(STAT_ID_WIDTH/2); j = j + 1) begin - mem_reg[j] = 0; + mem[j] = 0; end end + + for (i = 0; i < PIPELINE; i = i + 1) begin + mem_addr_pipeline_reg[i] = 0; + axil_shift_pipeline_reg[i] = 0; + stat_inc_pipeline_reg[i] = 0; + end end -// accumulate always @* begin - state_next = STATE_IDLE; + init_next = init_reg; + init_ptr_next = init_ptr_reg; + + op_axil_read_pipe_next = {op_axil_read_pipe_reg, 1'b0}; + op_acc_pipe_next = {op_acc_pipe_reg, 1'b0}; + + mem_addr_pipeline_next[0] = 0; + axil_shift_pipeline_next[0] = 0; + stat_inc_pipeline_next[0] = 0; + for (j = 1; j < PIPELINE; j = j + 1) begin + mem_addr_pipeline_next[j] = mem_addr_pipeline_reg[j-1]; + axil_shift_pipeline_next[j] = axil_shift_pipeline_reg[j-1]; + stat_inc_pipeline_next[j] = stat_inc_pipeline_reg[j-1]; + end s_axis_stat_tready_next = 1'b0; - id_next = id_reg; - inc_next = inc_reg; - - mem_rd_en = 1'b0; - mem_wr_en = 1'b0; - mem_wr_data = mem_rd_data_reg + inc_reg; - - case (state_reg) - STATE_INIT: begin - id_next = id_reg + 1; - mem_wr_en = 1'b1; - mem_wr_data = 0; - - if (id_reg == {STAT_ID_WIDTH{1'b1}}) begin - state_next = STATE_IDLE; - end else begin - state_next = STATE_INIT; - end - end - STATE_IDLE: begin - s_axis_stat_tready_next = 1'b1; - - if (s_axis_stat_tvalid && s_axis_stat_tready) begin - inc_next = s_axis_stat_tdata; - id_next = s_axis_stat_tid; - s_axis_stat_tready_next = 1'b0; - state_next = STATE_READ; - end else begin - state_next = STATE_IDLE; - end - end - STATE_READ: begin - s_axis_stat_tready_next = 1'b1; - mem_rd_en = 1'b1; - state_next = STATE_WRITE; - end - STATE_WRITE: begin - s_axis_stat_tready_next = 1'b1; - mem_wr_en = 1'b1; - mem_wr_data = mem_rd_data_reg + inc_reg; - - if (s_axis_stat_tvalid && s_axis_stat_tready) begin - inc_next = s_axis_stat_tdata; - id_next = s_axis_stat_tid; - s_axis_stat_tready_next = 1'b0; - state_next = STATE_READ; - end else begin - state_next = STATE_IDLE; - end - end - endcase -end - -always @(posedge clk) begin - state_reg <= state_next; - - s_axis_stat_tready_reg <= s_axis_stat_tready_next; - - id_reg <= id_next; - inc_reg <= inc_next; - - if (mem_wr_en) begin - mem_reg[id_reg] <= mem_wr_data; - end else if (mem_rd_en) begin - mem_rd_data_reg <= mem_reg[id_reg]; - end - - if (rst) begin - state_reg <= STATE_INIT; - s_axis_stat_tready_reg <= 1'b0; - id_reg <= {STAT_ID_WIDTH{1'b0}}; - end -end - -// register interface -always @* begin s_axil_awready_next = 1'b0; s_axil_wready_next = 1'b0; s_axil_bvalid_next = s_axil_bvalid_reg && !s_axil_bready; + s_axil_arready_next = 1'b0; + s_axil_rdata_next = s_axil_rdata_reg; + s_axil_rvalid_next = s_axil_rvalid_reg && !s_axil_rready; + + mem_rd_addr = 0; + mem_wr_addr = mem_addr_pipeline_reg[PIPELINE-1]; + mem_wr_data = mem_read_data_pipeline_reg[PIPELINE-1] + stat_inc_pipeline_reg[PIPELINE-1]; + mem_wr_en = 0; + + op_acc_pipe_hazard = 1'b0; + stage_active = 1'b0; + + for (j = 0; j < PIPELINE; j = j + 1) begin + stage_active = op_axil_read_pipe_reg[j] || op_acc_pipe_reg[j]; + op_acc_pipe_hazard = op_acc_pipe_hazard || (stage_active && mem_addr_pipeline_reg[j] == s_axis_stat_tid); + end + + // discard writes if (s_axil_awvalid && s_axil_wvalid && (!s_axil_bvalid || s_axil_bready) && (!s_axil_awready && !s_axil_wready)) begin s_axil_awready_next = 1'b1; s_axil_wready_next = 1'b1; s_axil_bvalid_next = 1'b1; end + + // pipeline stage 0 - accept request + if (init_reg) begin + init_ptr_next = init_ptr_reg + 1; + + mem_wr_addr = init_ptr_reg; + mem_wr_data = 0; + mem_wr_en = 1'b1; + + if (&init_ptr_reg) begin + init_next = 1'b0; + end + end else if (s_axil_arvalid && (!s_axil_rvalid || s_axil_rready) && !op_axil_read_pipe_reg) begin + // AXIL read + op_axil_read_pipe_next[0] = 1'b1; + + s_axil_arready_next = 1'b1; + + mem_rd_addr = s_axil_araddr_id; + mem_addr_pipeline_next[0] = s_axil_araddr_id; + axil_shift_pipeline_next[0] = s_axil_araddr_shift; + end else if (s_axis_stat_tvalid && !s_axis_stat_tready && !op_acc_pipe_hazard) begin + // accumulate + op_acc_pipe_next[0] = 1'b1; + + s_axis_stat_tready_next = 1'b1; + + stat_inc_pipeline_next[0] = s_axis_stat_tdata; + + mem_rd_addr = s_axis_stat_tid; + mem_addr_pipeline_next[0] = s_axis_stat_tid; + end + + // read complete, perform operation + if (op_acc_pipe_reg[PIPELINE-1]) begin + // accumulate + mem_wr_addr = mem_addr_pipeline_reg[PIPELINE-1]; + mem_wr_data = mem_read_data_pipeline_reg[PIPELINE-1] + stat_inc_pipeline_reg[PIPELINE-1]; + mem_wr_en = 1'b1; + end else if (op_axil_read_pipe_reg[PIPELINE-1]) begin + // AXIL read + s_axil_rvalid_next = 1'b1; + s_axil_rdata_next = 0; + + if (STAT_COUNT_WIDTH > AXIL_DATA_WIDTH) begin + s_axil_rdata_next = mem_read_data_pipeline_reg[PIPELINE-1] >> axil_shift_pipeline_reg[PIPELINE-1]*AXIL_DATA_WIDTH; + end else begin + s_axil_rdata_next = mem_read_data_pipeline_reg[PIPELINE-1]; + end + end end always @(posedge clk) begin + init_reg <= init_next; + init_ptr_reg <= init_ptr_next; + + op_axil_read_pipe_reg <= op_axil_read_pipe_next; + op_acc_pipe_reg <= op_acc_pipe_next; + + s_axis_stat_tready_reg <= s_axis_stat_tready_next; + s_axil_awready_reg <= s_axil_awready_next; s_axil_wready_reg <= s_axil_wready_next; s_axil_bvalid_reg <= s_axil_bvalid_next; + s_axil_arready_reg <= s_axil_arready_next; + s_axil_rdata_reg <= s_axil_rdata_next; + s_axil_rvalid_reg <= s_axil_rvalid_next; + + for (i = 0; i < PIPELINE; i = i + 1) begin + mem_addr_pipeline_reg[i] <= mem_addr_pipeline_next[i]; + axil_shift_pipeline_reg[i] <= axil_shift_pipeline_next[i]; + stat_inc_pipeline_reg[i] <= stat_inc_pipeline_next[i]; + end + + if (mem_wr_en) begin + mem[mem_wr_addr] <= mem_wr_data; + end + mem_read_data_reg <= mem[mem_rd_addr]; + mem_read_data_pipeline_reg[1] <= mem_read_data_reg; + for (i = 2; i < PIPELINE; i = i + 1) begin + mem_read_data_pipeline_reg[i] <= mem_read_data_pipeline_reg[i-1]; + end if (rst) begin + init_reg <= 1'b1; + init_ptr_reg <= 0; + + op_axil_read_pipe_reg <= 0; + op_acc_pipe_reg <= 0; + + s_axis_stat_tready_reg <= 1'b0; + s_axil_awready_reg <= 1'b0; s_axil_wready_reg <= 1'b0; s_axil_bvalid_reg <= 1'b0; - end -end - -always @* begin - s_axil_arready_next = 1'b0; - s_axil_rvalid_next = s_axil_rvalid_reg && !s_axil_rready; - s_axil_rdata_next = s_axil_rdata_reg; - - rd_data_valid_next = rd_data_valid_reg; - rd_data_shift_next = rd_data_shift_reg; - - mem_rd_en_axil = 1'b0; - - if (rd_data_valid_reg && (!s_axil_rvalid || s_axil_rready)) begin - s_axil_rvalid_next = 1'b1; - rd_data_valid_next = 1'b0; - - if (STAT_COUNT_WIDTH > AXIL_DATA_WIDTH) begin - s_axil_rdata_next = mem_rd_data_axil_reg >> rd_data_shift_reg*AXIL_DATA_WIDTH; - end else begin - s_axil_rdata_next = mem_rd_data_axil_reg; - end - end - - if (s_axil_arvalid && (!s_axil_rvalid || s_axil_rready || !rd_data_valid_reg) && !s_axil_arready) begin - s_axil_arready_next = 1'b1; - rd_data_valid_next = 1'b1; - rd_data_shift_next = s_axil_araddr_word; - - mem_rd_en_axil = 1'b1; - end -end - -always @(posedge clk) begin - s_axil_arready_reg <= s_axil_arready_next; - s_axil_rvalid_reg <= s_axil_rvalid_next; - s_axil_rdata_reg <= s_axil_rdata_next; - - rd_data_valid_reg <= rd_data_valid_next; - rd_data_shift_reg <= rd_data_shift_next; - - if (mem_rd_en_axil) begin - mem_rd_data_axil_reg <= mem_reg[s_axil_araddr_id]; - end - - if (rst) begin s_axil_arready_reg <= 1'b0; s_axil_rvalid_reg <= 1'b0; - rd_data_valid_reg <= 1'b0; end end diff --git a/fpga/common/tb/stats_counter/Makefile b/fpga/common/tb/stats_counter/Makefile index b347cf775..5fd20a24f 100644 --- a/fpga/common/tb/stats_counter/Makefile +++ b/fpga/common/tb/stats_counter/Makefile @@ -38,7 +38,7 @@ export PARAM_STAT_COUNT_WIDTH ?= 32 export PARAM_AXIL_DATA_WIDTH ?= 32 export PARAM_AXIL_ADDR_WIDTH ?= $(shell python -c "print($(PARAM_STAT_ID_WIDTH) + (($(PARAM_STAT_COUNT_WIDTH)+7)//8-1).bit_length())") export PARAM_AXIL_STRB_WIDTH ?= $(shell expr $(PARAM_AXIL_DATA_WIDTH) / 8 ) -export PARAM_PIPELINE ?= 0 +export PARAM_PIPELINE ?= 2 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/common/tb/stats_counter/test_stats_counter.py b/fpga/common/tb/stats_counter/test_stats_counter.py index 1c69733c1..1b2de5f1b 100644 --- a/fpga/common/tb/stats_counter/test_stats_counter.py +++ b/fpga/common/tb/stats_counter/test_stats_counter.py @@ -211,7 +211,7 @@ def test_stats_counter(request, stat_count_width): parameters['AXIL_DATA_WIDTH'] = 32 parameters['AXIL_ADDR_WIDTH'] = parameters['STAT_ID_WIDTH'] + ((parameters['STAT_COUNT_WIDTH']+7)//8-1).bit_length() parameters['AXIL_STRB_WIDTH'] = parameters['AXIL_DATA_WIDTH'] // 8 - parameters['PIPELINE'] = 1 + parameters['PIPELINE'] = 2 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()}