diff --git a/rtl/dma_if_desc_mux.v b/rtl/dma_if_desc_mux.v
index 47892121b..14d9bc429 100644
--- a/rtl/dma_if_desc_mux.v
+++ b/rtl/dma_if_desc_mux.v
@@ -140,6 +140,16 @@ wire [PORTS-1:0] grant;
 wire grant_valid;
 wire [CL_PORTS-1:0] grant_encoded;
 
+// input registers to pipeline arbitration delay
+reg [PORTS*DMA_ADDR_WIDTH-1:0] s_axis_desc_dma_addr_reg = 0;
+reg [PORTS*S_RAM_SEL_WIDTH-1:0] s_axis_desc_ram_sel_reg = 0;
+reg [PORTS*RAM_ADDR_WIDTH-1:0] s_axis_desc_ram_addr_reg = 0;
+reg [PORTS*IMM_WIDTH-1:0] s_axis_desc_imm_reg = 0;
+reg [PORTS-1:0] s_axis_desc_imm_en_reg = 0;
+reg [PORTS*LEN_WIDTH-1:0] s_axis_desc_len_reg = 0;
+reg [PORTS*S_TAG_WIDTH-1:0] s_axis_desc_tag_reg = 0;
+reg [PORTS-1:0] s_axis_desc_valid_reg = 0;
+
 // internal datapath
 reg [DMA_ADDR_WIDTH-1:0] m_axis_desc_dma_addr_int;
 reg [M_RAM_SEL_WIDTH-1:0] m_axis_desc_ram_sel_int;
@@ -152,17 +162,17 @@ reg m_axis_desc_valid_int;
 reg m_axis_desc_ready_int_reg = 1'b0;
 wire m_axis_desc_ready_int_early;
 
-assign s_axis_desc_ready = (m_axis_desc_ready_int_reg && grant_valid) << grant_encoded;
+assign s_axis_desc_ready = ~s_axis_desc_valid_reg | ({PORTS{m_axis_desc_ready_int_reg}} & grant); // accept when the input register is empty, or when this port is granted and the output stage is ready
 
 // mux for incoming packet
-wire [DMA_ADDR_WIDTH-1:0] current_s_desc_dma_addr = s_axis_desc_dma_addr[grant_encoded*DMA_ADDR_WIDTH +: DMA_ADDR_WIDTH];
-wire [S_RAM_SEL_WIDTH-1:0] current_s_desc_ram_sel = s_axis_desc_ram_sel[grant_encoded*S_RAM_SEL_WIDTH +: S_RAM_SEL_WIDTH_INT];
-wire [RAM_ADDR_WIDTH-1:0] current_s_desc_ram_addr = s_axis_desc_ram_addr[grant_encoded*RAM_ADDR_WIDTH +: RAM_ADDR_WIDTH];
-wire [IMM_WIDTH-1:0] current_s_desc_imm = s_axis_desc_imm[grant_encoded*IMM_WIDTH +: IMM_WIDTH];
-wire current_s_desc_imm_en = s_axis_desc_imm_en[grant_encoded];
-wire [LEN_WIDTH-1:0] current_s_desc_len = s_axis_desc_len[grant_encoded*LEN_WIDTH +: LEN_WIDTH];
-wire [S_TAG_WIDTH-1:0] current_s_desc_tag = s_axis_desc_tag[grant_encoded*S_TAG_WIDTH +: S_TAG_WIDTH];
-wire current_s_desc_valid = s_axis_desc_valid[grant_encoded];
+wire [DMA_ADDR_WIDTH-1:0] current_s_desc_dma_addr = s_axis_desc_dma_addr_reg[grant_encoded*DMA_ADDR_WIDTH +: DMA_ADDR_WIDTH];
+wire [S_RAM_SEL_WIDTH-1:0] current_s_desc_ram_sel = s_axis_desc_ram_sel_reg[grant_encoded*S_RAM_SEL_WIDTH +: S_RAM_SEL_WIDTH_INT];
+wire [RAM_ADDR_WIDTH-1:0] current_s_desc_ram_addr = s_axis_desc_ram_addr_reg[grant_encoded*RAM_ADDR_WIDTH +: RAM_ADDR_WIDTH];
+wire [IMM_WIDTH-1:0] current_s_desc_imm = s_axis_desc_imm_reg[grant_encoded*IMM_WIDTH +: IMM_WIDTH];
+wire current_s_desc_imm_en = s_axis_desc_imm_en_reg[grant_encoded];
+wire [LEN_WIDTH-1:0] current_s_desc_len = s_axis_desc_len_reg[grant_encoded*LEN_WIDTH +: LEN_WIDTH];
+wire [S_TAG_WIDTH-1:0] current_s_desc_tag = s_axis_desc_tag_reg[grant_encoded*S_TAG_WIDTH +: S_TAG_WIDTH];
+wire current_s_desc_valid = s_axis_desc_valid_reg[grant_encoded];
 wire current_s_desc_ready = s_axis_desc_ready[grant_encoded];
 
 // arbiter instance
@@ -183,8 +193,8 @@ arb_inst (
     .grant_encoded(grant_encoded)
 );
 
-assign request = s_axis_desc_valid & ~grant;
-assign acknowledge = grant & s_axis_desc_valid & s_axis_desc_ready;
+assign request = (s_axis_desc_valid_reg & ~grant) | (s_axis_desc_valid & grant);
+assign acknowledge = grant & s_axis_desc_valid_reg & {PORTS{m_axis_desc_ready_int_reg}};
 
 always @* begin
     // pass through selected packet data
@@ -204,6 +214,28 @@ always @* begin
     m_axis_desc_valid_int = current_s_desc_valid && m_axis_desc_ready_int_reg && grant_valid;
 end
 
+integer i;
+
+always @(posedge clk) begin
+    // register inputs
+    for (i = 0; i < PORTS; i = i + 1) begin
+        if (s_axis_desc_ready[i]) begin
+            s_axis_desc_dma_addr_reg[i*DMA_ADDR_WIDTH +: DMA_ADDR_WIDTH] <= s_axis_desc_dma_addr[i*DMA_ADDR_WIDTH +: DMA_ADDR_WIDTH];
+            s_axis_desc_ram_sel_reg[i*S_RAM_SEL_WIDTH +: S_RAM_SEL_WIDTH_INT] <= s_axis_desc_ram_sel[i*S_RAM_SEL_WIDTH +: S_RAM_SEL_WIDTH_INT];
+            s_axis_desc_ram_addr_reg[i*RAM_ADDR_WIDTH +: RAM_ADDR_WIDTH] <= s_axis_desc_ram_addr[i*RAM_ADDR_WIDTH +: RAM_ADDR_WIDTH];
+            s_axis_desc_imm_reg[i*IMM_WIDTH +: IMM_WIDTH] <= s_axis_desc_imm[i*IMM_WIDTH +: IMM_WIDTH];
+            s_axis_desc_imm_en_reg[i] <= s_axis_desc_imm_en[i];
+            s_axis_desc_len_reg[i*LEN_WIDTH +: LEN_WIDTH] <= s_axis_desc_len[i*LEN_WIDTH +: LEN_WIDTH];
+            s_axis_desc_tag_reg[i*S_TAG_WIDTH +: S_TAG_WIDTH] <= s_axis_desc_tag[i*S_TAG_WIDTH +: S_TAG_WIDTH];
+            s_axis_desc_valid_reg[i] <= s_axis_desc_valid[i];
+        end
+    end
+
+    if (rst) begin
+        s_axis_desc_valid_reg <= 0;
+    end
+end
+
 // output datapath logic
 reg [DMA_ADDR_WIDTH-1:0] m_axis_desc_dma_addr_reg = {DMA_ADDR_WIDTH{1'b0}};
 reg [M_RAM_SEL_WIDTH-1:0] m_axis_desc_ram_sel_reg = {M_RAM_SEL_WIDTH{1'b0}};
diff --git a/rtl/pcie_axi_dma_desc_mux.v b/rtl/pcie_axi_dma_desc_mux.v
index fa0a3cea7..69c805056 100644
--- a/rtl/pcie_axi_dma_desc_mux.v
+++ b/rtl/pcie_axi_dma_desc_mux.v
@@ -107,6 +107,13 @@ wire [PORTS-1:0] grant;
 wire grant_valid;
 wire [CL_PORTS-1:0] grant_encoded;
 
+// input registers to pipeline arbitration delay
+reg [PORTS*PCIE_ADDR_WIDTH-1:0] s_axis_desc_pcie_addr_reg = 0;
+reg [PORTS*AXI_ADDR_WIDTH-1:0] s_axis_desc_axi_addr_reg = 0;
+reg [PORTS*LEN_WIDTH-1:0] s_axis_desc_len_reg = 0;
+reg [PORTS*S_TAG_WIDTH-1:0] s_axis_desc_tag_reg = 0;
+reg [PORTS-1:0] s_axis_desc_valid_reg = 0;
+
 // internal datapath
 reg [PCIE_ADDR_WIDTH-1:0] m_axis_desc_pcie_addr_int;
 reg [AXI_ADDR_WIDTH-1:0] m_axis_desc_axi_addr_int;
@@ -116,14 +123,14 @@ reg m_axis_desc_valid_int;
 reg m_axis_desc_ready_int_reg = 1'b0;
 wire m_axis_desc_ready_int_early;
 
-assign s_axis_desc_ready = (m_axis_desc_ready_int_reg && grant_valid) << grant_encoded;
+assign s_axis_desc_ready = ~s_axis_desc_valid_reg | ({PORTS{m_axis_desc_ready_int_reg}} & grant); // accept when the input register is empty, or when this port is granted and the output stage is ready
 
 // mux for incoming packet
-wire [PCIE_ADDR_WIDTH-1:0] current_s_desc_pcie_addr = s_axis_desc_pcie_addr[grant_encoded*PCIE_ADDR_WIDTH +: PCIE_ADDR_WIDTH];
-wire [AXI_ADDR_WIDTH-1:0] current_s_desc_axi_addr = s_axis_desc_axi_addr[grant_encoded*AXI_ADDR_WIDTH +: AXI_ADDR_WIDTH];
-wire [LEN_WIDTH-1:0] current_s_desc_len = s_axis_desc_len[grant_encoded*LEN_WIDTH +: LEN_WIDTH];
-wire [S_TAG_WIDTH-1:0] current_s_desc_tag = s_axis_desc_tag[grant_encoded*S_TAG_WIDTH +: S_TAG_WIDTH];
-wire current_s_desc_valid = s_axis_desc_valid[grant_encoded];
+wire [PCIE_ADDR_WIDTH-1:0] current_s_desc_pcie_addr = s_axis_desc_pcie_addr_reg[grant_encoded*PCIE_ADDR_WIDTH +: PCIE_ADDR_WIDTH];
+wire [AXI_ADDR_WIDTH-1:0] current_s_desc_axi_addr = s_axis_desc_axi_addr_reg[grant_encoded*AXI_ADDR_WIDTH +: AXI_ADDR_WIDTH];
+wire [LEN_WIDTH-1:0] current_s_desc_len = s_axis_desc_len_reg[grant_encoded*LEN_WIDTH +: LEN_WIDTH];
+wire [S_TAG_WIDTH-1:0] current_s_desc_tag = s_axis_desc_tag_reg[grant_encoded*S_TAG_WIDTH +: S_TAG_WIDTH];
+wire current_s_desc_valid = s_axis_desc_valid_reg[grant_encoded];
 wire current_s_desc_ready = s_axis_desc_ready[grant_encoded];
 
 // arbiter instance
@@ -144,8 +151,8 @@ arb_inst (
     .grant_encoded(grant_encoded)
 );
 
-assign request = s_axis_desc_valid & ~grant;
-assign acknowledge = grant & s_axis_desc_valid & s_axis_desc_ready;
+assign request = (s_axis_desc_valid_reg & ~grant) | (s_axis_desc_valid & grant);
+assign acknowledge = grant & s_axis_desc_valid_reg & {PORTS{m_axis_desc_ready_int_reg}};
 
 always @* begin
     // pass through selected packet data
@@ -159,6 +166,25 @@ always @* begin
     m_axis_desc_valid_int = current_s_desc_valid && m_axis_desc_ready_int_reg && grant_valid;
 end
 
+integer i;
+
+always @(posedge clk) begin
+    // register inputs
+    for (i = 0; i < PORTS; i = i + 1) begin
+        if (s_axis_desc_ready[i]) begin
+            s_axis_desc_pcie_addr_reg[i*PCIE_ADDR_WIDTH +: PCIE_ADDR_WIDTH] <= s_axis_desc_pcie_addr[i*PCIE_ADDR_WIDTH +: PCIE_ADDR_WIDTH];
+            s_axis_desc_axi_addr_reg[i*AXI_ADDR_WIDTH +: AXI_ADDR_WIDTH] <= s_axis_desc_axi_addr[i*AXI_ADDR_WIDTH +: AXI_ADDR_WIDTH];
+            s_axis_desc_len_reg[i*LEN_WIDTH +: LEN_WIDTH] <= s_axis_desc_len[i*LEN_WIDTH +: LEN_WIDTH];
+            s_axis_desc_tag_reg[i*S_TAG_WIDTH +: S_TAG_WIDTH] <= s_axis_desc_tag[i*S_TAG_WIDTH +: S_TAG_WIDTH];
+            s_axis_desc_valid_reg[i] <= s_axis_desc_valid[i];
+        end
+    end
+
+    if (rst) begin
+        s_axis_desc_valid_reg <= 0;
+    end
+end
+
 // output datapath logic
 reg [PCIE_ADDR_WIDTH-1:0] m_axis_desc_pcie_addr_reg = {PCIE_ADDR_WIDTH{1'b0}};
 reg [AXI_ADDR_WIDTH-1:0] m_axis_desc_axi_addr_reg = {AXI_ADDR_WIDTH{1'b0}};
diff --git a/rtl/pcie_tlp_mux.v b/rtl/pcie_tlp_mux.v
index e1311106a..5b12cc969 100644
--- a/rtl/pcie_tlp_mux.v
+++ b/rtl/pcie_tlp_mux.v
@@ -107,6 +107,17 @@ wire [PORTS-1:0] grant;
 wire grant_valid;
 wire [CL_PORTS-1:0] grant_encoded;
 
+// input registers to pipeline arbitration delay
+reg [PORTS*TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH-1:0] in_tlp_data_reg = 0;
+reg [PORTS*TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH-1:0] in_tlp_strb_reg = 0;
+reg [PORTS*TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH-1:0] in_tlp_hdr_reg = 0;
+reg [PORTS*TLP_SEG_COUNT*3-1:0] in_tlp_bar_id_reg = 0;
+reg [PORTS*TLP_SEG_COUNT*8-1:0] in_tlp_func_num_reg = 0;
+reg [PORTS*TLP_SEG_COUNT*4-1:0] in_tlp_error_reg = 0;
+reg [PORTS*TLP_SEG_COUNT-1:0] in_tlp_valid_reg = 0;
+reg [PORTS*TLP_SEG_COUNT-1:0] in_tlp_sop_reg = 0;
+reg [PORTS*TLP_SEG_COUNT-1:0] in_tlp_eop_reg = 0;
+
 // internal datapath
 reg [TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH-1:0] out_tlp_data_int;
 reg [TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH-1:0] out_tlp_strb_int;
@@ -120,18 +131,18 @@ reg [TLP_SEG_COUNT-1:0] out_tlp_eop_int;
 reg out_tlp_ready_int_reg = 1'b0;
 wire out_tlp_ready_int_early;
 
-assign in_tlp_ready = (out_tlp_ready_int_reg && grant_valid) << grant_encoded;
+assign in_tlp_ready = ~in_tlp_valid_reg | ({PORTS{out_tlp_ready_int_reg}} & grant); // accept when the input register is empty, or when this port is granted and the output stage is ready
 
 // mux for incoming packet
-wire [TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH-1:0] current_in_tlp_data = in_tlp_data[grant_encoded*TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH +: TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH];
-wire [TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH-1:0] current_in_tlp_strb = in_tlp_strb[grant_encoded*TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH +: TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH];
-wire [TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH-1:0] current_in_tlp_hdr = in_tlp_hdr[grant_encoded*TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH +: TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH];
-wire [TLP_SEG_COUNT*3-1:0] current_in_tlp_bar_id = in_tlp_bar_id[grant_encoded*TLP_SEG_COUNT*3 +: TLP_SEG_COUNT*3];
-wire [TLP_SEG_COUNT*8-1:0] current_in_tlp_func_num = in_tlp_func_num[grant_encoded*TLP_SEG_COUNT*8 +: TLP_SEG_COUNT*8];
-wire [TLP_SEG_COUNT*4-1:0] current_in_tlp_error = in_tlp_error[grant_encoded*TLP_SEG_COUNT*4 +: TLP_SEG_COUNT*4];
-wire [TLP_SEG_COUNT-1:0] current_in_tlp_valid = in_tlp_valid[grant_encoded*TLP_SEG_COUNT +: TLP_SEG_COUNT];
-wire [TLP_SEG_COUNT-1:0] current_in_tlp_sop = in_tlp_sop[grant_encoded*TLP_SEG_COUNT +: TLP_SEG_COUNT];
-wire [TLP_SEG_COUNT-1:0] current_in_tlp_eop = in_tlp_eop[grant_encoded*TLP_SEG_COUNT +: TLP_SEG_COUNT];
+wire [TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH-1:0] current_in_tlp_data = in_tlp_data_reg[grant_encoded*TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH +: TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH];
+wire [TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH-1:0] current_in_tlp_strb = in_tlp_strb_reg[grant_encoded*TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH +: TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH];
+wire [TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH-1:0] current_in_tlp_hdr = in_tlp_hdr_reg[grant_encoded*TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH +: TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH];
+wire [TLP_SEG_COUNT*3-1:0] current_in_tlp_bar_id = in_tlp_bar_id_reg[grant_encoded*TLP_SEG_COUNT*3 +: TLP_SEG_COUNT*3];
+wire [TLP_SEG_COUNT*8-1:0] current_in_tlp_func_num = in_tlp_func_num_reg[grant_encoded*TLP_SEG_COUNT*8 +: TLP_SEG_COUNT*8];
+wire [TLP_SEG_COUNT*4-1:0] current_in_tlp_error = in_tlp_error_reg[grant_encoded*TLP_SEG_COUNT*4 +: TLP_SEG_COUNT*4];
+wire [TLP_SEG_COUNT-1:0] current_in_tlp_valid = in_tlp_valid_reg[grant_encoded*TLP_SEG_COUNT +: TLP_SEG_COUNT];
+wire [TLP_SEG_COUNT-1:0] current_in_tlp_sop = in_tlp_sop_reg[grant_encoded*TLP_SEG_COUNT +: TLP_SEG_COUNT];
+wire [TLP_SEG_COUNT-1:0] current_in_tlp_eop = in_tlp_eop_reg[grant_encoded*TLP_SEG_COUNT +: TLP_SEG_COUNT];
 wire current_in_tlp_ready = in_tlp_ready[grant_encoded];
 
 // arbiter instance
@@ -152,8 +163,8 @@ arb_inst (
     .grant_encoded(grant_encoded)
 );
 
-assign request = in_tlp_valid & ~grant;
-assign acknowledge = grant & in_tlp_valid & in_tlp_ready & in_tlp_eop;
+assign request = (in_tlp_valid_reg & ~grant) | (in_tlp_valid & grant);
+assign acknowledge = grant & in_tlp_valid_reg & {PORTS{out_tlp_ready_int_reg}} & in_tlp_eop_reg;
 
 always @* begin
     // pass through selected packet data
@@ -163,11 +174,34 @@ always @* begin
     out_tlp_bar_id_int = current_in_tlp_bar_id;
     out_tlp_func_num_int = current_in_tlp_func_num;
     out_tlp_error_int = current_in_tlp_error;
-    out_tlp_valid_int = out_tlp_ready_int_reg && grant_valid ? current_in_tlp_valid : 0;
+    out_tlp_valid_int = current_in_tlp_valid & {TLP_SEG_COUNT{out_tlp_ready_int_reg && grant_valid}}; // bitwise gate keeps the per-segment valid bits; a logical && would reduce the vector to one bit
     out_tlp_sop_int = current_in_tlp_sop;
     out_tlp_eop_int = current_in_tlp_eop;
 end
 
+integer i;
+
+always @(posedge clk) begin
+    // register inputs
+    for (i = 0; i < PORTS; i = i + 1) begin
+        if (in_tlp_ready[i]) begin
+            in_tlp_data_reg[i*TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH +: TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH] <= in_tlp_data[i*TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH +: TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH];
+            in_tlp_strb_reg[i*TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH +: TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH] <= in_tlp_strb[i*TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH +: TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH];
+            in_tlp_hdr_reg[i*TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH +: TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH] <= in_tlp_hdr[i*TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH +: TLP_SEG_COUNT*TLP_SEG_HDR_WIDTH];
+            in_tlp_bar_id_reg[i*TLP_SEG_COUNT*3 +: TLP_SEG_COUNT*3] <= in_tlp_bar_id[i*TLP_SEG_COUNT*3 +: TLP_SEG_COUNT*3];
+            in_tlp_func_num_reg[i*TLP_SEG_COUNT*8 +: TLP_SEG_COUNT*8] <= in_tlp_func_num[i*TLP_SEG_COUNT*8 +: TLP_SEG_COUNT*8];
+            in_tlp_error_reg[i*TLP_SEG_COUNT*4 +: TLP_SEG_COUNT*4] <= in_tlp_error[i*TLP_SEG_COUNT*4 +: TLP_SEG_COUNT*4];
+            in_tlp_valid_reg[i*TLP_SEG_COUNT +: TLP_SEG_COUNT] <= in_tlp_valid[i*TLP_SEG_COUNT +: TLP_SEG_COUNT];
+            in_tlp_sop_reg[i*TLP_SEG_COUNT +: TLP_SEG_COUNT] <= in_tlp_sop[i*TLP_SEG_COUNT +: TLP_SEG_COUNT];
+            in_tlp_eop_reg[i*TLP_SEG_COUNT +: TLP_SEG_COUNT] <= in_tlp_eop[i*TLP_SEG_COUNT +: TLP_SEG_COUNT];
+        end
+    end
+
+    if (rst) begin
+        in_tlp_valid_reg <= 0;
+    end
+end
+
 // output datapath logic
 reg [TLP_SEG_COUNT*TLP_SEG_DATA_WIDTH-1:0] out_tlp_data_reg = 0;
 reg [TLP_SEG_COUNT*TLP_SEG_STRB_WIDTH-1:0] out_tlp_strb_reg = 0;