diff --git a/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga/rtl/fpga.v b/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga/rtl/fpga.v index d8c59a846..a4bcc4141 100644 --- a/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga/rtl/fpga.v +++ b/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga/rtl/fpga.v @@ -54,7 +54,7 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 512; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); parameter AXIS_PCIE_RC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 75 : 161; -parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 60 : 137; +parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 62 : 137; parameter AXIS_PCIE_CQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 85 : 183; parameter AXIS_PCIE_CC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 33 : 81; parameter RC_STRADDLE = AXIS_PCIE_DATA_WIDTH >= 256; diff --git a/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga/rtl/fpga_core.v index 78f28f9a0..fb1c82576 100644 --- a/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga/rtl/fpga_core.v @@ -159,8 +159,8 @@ example_core_pcie_us #( .PCIE_TAG_COUNT(PCIE_TAG_COUNT), .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), - .READ_CPLH_FC_LIMIT(128), - .READ_CPLD_FC_LIMIT(2048), + .READ_CPLH_FC_LIMIT(256), + .READ_CPLD_FC_LIMIT(2048-256), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -263,8 +263,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US+ PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga_axi/tb/fpga_core/Makefile index e730af529..b09d0d3bf 100644 --- a/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_ export PARAM_AXIS_PCIE_RC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),75,161) export PARAM_AXIS_PCIE_CQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),88,183) export PARAM_AXIS_PCIE_CC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),33,81) -export PARAM_RQ_SEQ_NUM_WIDTH := 6 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga_axi/tb/fpga_core/test_fpga_core.py index 80b4057a8..ee8e602bc 100644 --- a/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/ADM_PCIE_9V3/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -396,7 +396,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 161 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 88 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 183 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 81 - parameters['RQ_SEQ_NUM_WIDTH'] = 6 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/AU200/fpga/rtl/fpga.v b/fpga/lib/pcie/example/AU200/fpga/rtl/fpga.v index 426b56c8c..6d267f5cb 100644 --- a/fpga/lib/pcie/example/AU200/fpga/rtl/fpga.v +++ b/fpga/lib/pcie/example/AU200/fpga/rtl/fpga.v @@ -53,7 +53,7 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 512; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); parameter AXIS_PCIE_RC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 75 : 161; -parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 60 : 137; +parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 62 : 137; parameter AXIS_PCIE_CQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 85 : 183; parameter AXIS_PCIE_CC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 33 : 81; parameter RC_STRADDLE = AXIS_PCIE_DATA_WIDTH >= 256; diff --git a/fpga/lib/pcie/example/AU200/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/AU200/fpga/rtl/fpga_core.v index 39d8ab260..68701b8c8 100644 --- a/fpga/lib/pcie/example/AU200/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/AU200/fpga/rtl/fpga_core.v @@ -156,8 +156,8 @@ example_core_pcie_us #( .PCIE_TAG_COUNT(PCIE_TAG_COUNT), .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), - .READ_CPLH_FC_LIMIT(128), - .READ_CPLD_FC_LIMIT(2048), + .READ_CPLH_FC_LIMIT(256), + .READ_CPLD_FC_LIMIT(2048-256), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -260,8 +260,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US+ PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/AU200/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/AU200/fpga_axi/tb/fpga_core/Makefile index e730af529..b09d0d3bf 100644 --- a/fpga/lib/pcie/example/AU200/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/AU200/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_ export PARAM_AXIS_PCIE_RC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),75,161) export PARAM_AXIS_PCIE_CQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),88,183) export PARAM_AXIS_PCIE_CC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),33,81) -export PARAM_RQ_SEQ_NUM_WIDTH := 6 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/AU200/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/AU200/fpga_axi/tb/fpga_core/test_fpga_core.py index f662d6cf6..8e67c0332 100644 --- a/fpga/lib/pcie/example/AU200/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/AU200/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -398,7 +398,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 161 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 88 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 183 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 81 - parameters['RQ_SEQ_NUM_WIDTH'] = 6 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/AU250/fpga/rtl/fpga.v b/fpga/lib/pcie/example/AU250/fpga/rtl/fpga.v index 426b56c8c..6d267f5cb 100644 --- a/fpga/lib/pcie/example/AU250/fpga/rtl/fpga.v +++ b/fpga/lib/pcie/example/AU250/fpga/rtl/fpga.v @@ -53,7 +53,7 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 512; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); parameter AXIS_PCIE_RC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 75 : 161; -parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 60 : 137; +parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 62 : 137; parameter AXIS_PCIE_CQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 85 : 183; parameter AXIS_PCIE_CC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 33 : 81; parameter RC_STRADDLE = AXIS_PCIE_DATA_WIDTH >= 256; diff --git a/fpga/lib/pcie/example/AU250/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/AU250/fpga/rtl/fpga_core.v index 39d8ab260..68701b8c8 100644 --- a/fpga/lib/pcie/example/AU250/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/AU250/fpga/rtl/fpga_core.v @@ -156,8 +156,8 @@ example_core_pcie_us #( .PCIE_TAG_COUNT(PCIE_TAG_COUNT), .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), - .READ_CPLH_FC_LIMIT(128), - .READ_CPLD_FC_LIMIT(2048), + .READ_CPLH_FC_LIMIT(256), + .READ_CPLD_FC_LIMIT(2048-256), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -260,8 +260,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US+ PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/AU250/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/AU250/fpga_axi/tb/fpga_core/Makefile index e730af529..b09d0d3bf 100644 --- a/fpga/lib/pcie/example/AU250/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/AU250/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_ export PARAM_AXIS_PCIE_RC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),75,161) export PARAM_AXIS_PCIE_CQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),88,183) export PARAM_AXIS_PCIE_CC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),33,81) -export PARAM_RQ_SEQ_NUM_WIDTH := 6 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/AU250/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/AU250/fpga_axi/tb/fpga_core/test_fpga_core.py index f662d6cf6..8e67c0332 100644 --- a/fpga/lib/pcie/example/AU250/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/AU250/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -398,7 +398,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 161 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 88 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 183 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 81 - parameters['RQ_SEQ_NUM_WIDTH'] = 6 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/AU280/fpga/rtl/fpga.v b/fpga/lib/pcie/example/AU280/fpga/rtl/fpga.v index 52ae618c2..1d9cc0c53 100644 --- a/fpga/lib/pcie/example/AU280/fpga/rtl/fpga.v +++ b/fpga/lib/pcie/example/AU280/fpga/rtl/fpga.v @@ -52,7 +52,7 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 512; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); parameter AXIS_PCIE_RC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 75 : 161; -parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 60 : 137; +parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 62 : 137; parameter AXIS_PCIE_CQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 85 : 183; parameter AXIS_PCIE_CC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 33 : 81; parameter RC_STRADDLE = AXIS_PCIE_DATA_WIDTH >= 256; diff --git a/fpga/lib/pcie/example/AU280/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/AU280/fpga/rtl/fpga_core.v index 644ee09fd..ed244292b 100644 --- a/fpga/lib/pcie/example/AU280/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/AU280/fpga/rtl/fpga_core.v @@ -148,8 +148,8 @@ example_core_pcie_us #( .PCIE_TAG_COUNT(PCIE_TAG_COUNT), .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), - .READ_CPLH_FC_LIMIT(128), - .READ_CPLD_FC_LIMIT(2048), + .READ_CPLH_FC_LIMIT(256), + .READ_CPLD_FC_LIMIT(2048-256), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -252,8 +252,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US+ PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/AU280/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/AU280/fpga_axi/tb/fpga_core/Makefile index e730af529..b09d0d3bf 100644 --- a/fpga/lib/pcie/example/AU280/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/AU280/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_ export PARAM_AXIS_PCIE_RC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),75,161) export PARAM_AXIS_PCIE_CQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),88,183) export PARAM_AXIS_PCIE_CC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),33,81) -export PARAM_RQ_SEQ_NUM_WIDTH := 6 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/AU280/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/AU280/fpga_axi/tb/fpga_core/test_fpga_core.py index 80b4057a8..ee8e602bc 100644 --- a/fpga/lib/pcie/example/AU280/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/AU280/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -396,7 +396,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 161 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 88 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 183 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 81 - parameters['RQ_SEQ_NUM_WIDTH'] = 6 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/AU50/fpga/rtl/fpga.v b/fpga/lib/pcie/example/AU50/fpga/rtl/fpga.v index f02aa7984..a45ddfb5c 100644 --- a/fpga/lib/pcie/example/AU50/fpga/rtl/fpga.v +++ b/fpga/lib/pcie/example/AU50/fpga/rtl/fpga.v @@ -55,7 +55,7 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 512; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); parameter AXIS_PCIE_RC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 75 : 161; -parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 60 : 137; +parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 62 : 137; parameter AXIS_PCIE_CQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 85 : 183; parameter AXIS_PCIE_CC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 33 : 81; parameter RC_STRADDLE = AXIS_PCIE_DATA_WIDTH >= 256; diff --git a/fpga/lib/pcie/example/AU50/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/AU50/fpga/rtl/fpga_core.v index 1fff150c5..75875b0b6 100644 --- a/fpga/lib/pcie/example/AU50/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/AU50/fpga/rtl/fpga_core.v @@ -159,8 +159,8 @@ example_core_pcie_us #( .PCIE_TAG_COUNT(PCIE_TAG_COUNT), .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), - .READ_CPLH_FC_LIMIT(128), - .READ_CPLD_FC_LIMIT(2048), + .READ_CPLH_FC_LIMIT(256), + .READ_CPLD_FC_LIMIT(2048-256), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -263,8 +263,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US+ PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/AU50/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/AU50/fpga_axi/tb/fpga_core/Makefile index e730af529..b09d0d3bf 100644 --- a/fpga/lib/pcie/example/AU50/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/AU50/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_ export PARAM_AXIS_PCIE_RC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),75,161) export PARAM_AXIS_PCIE_CQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),88,183) export PARAM_AXIS_PCIE_CC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),33,81) -export PARAM_RQ_SEQ_NUM_WIDTH := 6 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/AU50/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/AU50/fpga_axi/tb/fpga_core/test_fpga_core.py index 80b4057a8..ee8e602bc 100644 --- a/fpga/lib/pcie/example/AU50/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/AU50/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -396,7 +396,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 161 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 88 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 183 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 81 - parameters['RQ_SEQ_NUM_WIDTH'] = 6 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/ExaNIC_X10/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/ExaNIC_X10/fpga/rtl/fpga_core.v index d33065d87..c0078c3b7 100644 --- a/fpga/lib/pcie/example/ExaNIC_X10/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/ExaNIC_X10/fpga/rtl/fpga_core.v @@ -155,7 +155,7 @@ example_core_pcie_us #( .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .READ_CPLH_FC_LIMIT(64), - .READ_CPLD_FC_LIMIT(992), + .READ_CPLD_FC_LIMIT(1024-64), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -258,8 +258,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/ExaNIC_X10/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/ExaNIC_X10/fpga_axi/tb/fpga_core/Makefile index 83abb0568..ebdba6e8f 100644 --- a/fpga/lib/pcie/example/ExaNIC_X10/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/ExaNIC_X10/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := 60 export PARAM_AXIS_PCIE_RC_USER_WIDTH := 75 export PARAM_AXIS_PCIE_CQ_USER_WIDTH := 85 export PARAM_AXIS_PCIE_CC_USER_WIDTH := 33 -export PARAM_RQ_SEQ_NUM_WIDTH := 4 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/ExaNIC_X10/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/ExaNIC_X10/fpga_axi/tb/fpga_core/test_fpga_core.py index 942cda074..8a8449e2c 100644 --- a/fpga/lib/pcie/example/ExaNIC_X10/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/ExaNIC_X10/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -370,7 +370,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 85 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 - parameters['RQ_SEQ_NUM_WIDTH'] = 4 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/ExaNIC_X25/fpga/rtl/fpga.v b/fpga/lib/pcie/example/ExaNIC_X25/fpga/rtl/fpga.v index 30463f830..c59f38be4 100644 --- a/fpga/lib/pcie/example/ExaNIC_X25/fpga/rtl/fpga.v +++ b/fpga/lib/pcie/example/ExaNIC_X25/fpga/rtl/fpga.v @@ -54,7 +54,7 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 256; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); parameter AXIS_PCIE_RC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 75 : 161; -parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 60 : 137; +parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 62 : 137; parameter AXIS_PCIE_CQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 85 : 183; parameter AXIS_PCIE_CC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 33 : 81; parameter RC_STRADDLE = AXIS_PCIE_DATA_WIDTH >= 256; diff --git a/fpga/lib/pcie/example/ExaNIC_X25/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/ExaNIC_X25/fpga/rtl/fpga_core.v index 08a1bef83..7617c4390 100644 --- a/fpga/lib/pcie/example/ExaNIC_X25/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/ExaNIC_X25/fpga/rtl/fpga_core.v @@ -159,8 +159,8 @@ example_core_pcie_us #( .PCIE_TAG_COUNT(PCIE_TAG_COUNT), .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), - .READ_CPLH_FC_LIMIT(128), - .READ_CPLD_FC_LIMIT(2048), + .READ_CPLH_FC_LIMIT(256), + .READ_CPLD_FC_LIMIT(2048-256), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -263,8 +263,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US+ PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/ExaNIC_X25/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/ExaNIC_X25/fpga_axi/tb/fpga_core/Makefile index 8df1050af..09f0cec0e 100644 --- a/fpga/lib/pcie/example/ExaNIC_X25/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/ExaNIC_X25/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_ export PARAM_AXIS_PCIE_RC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),75,161) export PARAM_AXIS_PCIE_CQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),88,183) export PARAM_AXIS_PCIE_CC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),33,81) -export PARAM_RQ_SEQ_NUM_WIDTH := 6 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/ExaNIC_X25/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/ExaNIC_X25/fpga_axi/tb/fpga_core/test_fpga_core.py index 0d0d0c5b1..9fbce2122 100644 --- a/fpga/lib/pcie/example/ExaNIC_X25/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/ExaNIC_X25/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -396,7 +396,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 161 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 88 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 183 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 81 - parameters['RQ_SEQ_NUM_WIDTH'] = 6 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/VCU108/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/VCU108/fpga/rtl/fpga_core.v index 510ea1bba..a6b213963 100644 --- a/fpga/lib/pcie/example/VCU108/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/VCU108/fpga/rtl/fpga_core.v @@ -157,7 +157,7 @@ example_core_pcie_us #( .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .READ_CPLH_FC_LIMIT(64), - .READ_CPLD_FC_LIMIT(992), + .READ_CPLD_FC_LIMIT(1024-64), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -260,8 +260,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/VCU108/fpga_axi/rtl/fpga.v b/fpga/lib/pcie/example/VCU108/fpga_axi/rtl/fpga.v index 93ddb1620..125e8a0b6 100644 --- a/fpga/lib/pcie/example/VCU108/fpga_axi/rtl/fpga.v +++ b/fpga/lib/pcie/example/VCU108/fpga_axi/rtl/fpga.v @@ -57,6 +57,10 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 256; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); +parameter AXIS_PCIE_RC_USER_WIDTH = 75; +parameter AXIS_PCIE_RQ_USER_WIDTH = 60; +parameter AXIS_PCIE_CQ_USER_WIDTH = 85; +parameter AXIS_PCIE_CC_USER_WIDTH = 33; // Clock and reset wire pcie_user_clk; @@ -107,33 +111,33 @@ ibufds_gte3_pcie_mgt_refclk_inst ( .ODIV2 (pcie_sys_clk) ); -wire [AXIS_PCIE_DATA_WIDTH-1:0] axis_rq_tdata; -wire [AXIS_PCIE_KEEP_WIDTH-1:0] axis_rq_tkeep; -wire axis_rq_tlast; -wire axis_rq_tready; -wire [59:0] axis_rq_tuser; -wire axis_rq_tvalid; +wire [AXIS_PCIE_DATA_WIDTH-1:0] axis_rq_tdata; +wire [AXIS_PCIE_KEEP_WIDTH-1:0] axis_rq_tkeep; +wire axis_rq_tlast; +wire axis_rq_tready; +wire [AXIS_PCIE_RQ_USER_WIDTH-1:0] axis_rq_tuser; +wire axis_rq_tvalid; -wire [AXIS_PCIE_DATA_WIDTH-1:0] axis_rc_tdata; -wire [AXIS_PCIE_KEEP_WIDTH-1:0] axis_rc_tkeep; -wire axis_rc_tlast; -wire axis_rc_tready; -wire [74:0] axis_rc_tuser; -wire axis_rc_tvalid; +wire [AXIS_PCIE_DATA_WIDTH-1:0] axis_rc_tdata; +wire [AXIS_PCIE_KEEP_WIDTH-1:0] axis_rc_tkeep; +wire axis_rc_tlast; +wire axis_rc_tready; +wire [AXIS_PCIE_RC_USER_WIDTH-1:0] axis_rc_tuser; +wire axis_rc_tvalid; -wire [AXIS_PCIE_DATA_WIDTH-1:0] axis_cq_tdata; -wire [AXIS_PCIE_KEEP_WIDTH-1:0] axis_cq_tkeep; -wire axis_cq_tlast; -wire axis_cq_tready; -wire [84:0] axis_cq_tuser; -wire axis_cq_tvalid; +wire [AXIS_PCIE_DATA_WIDTH-1:0] axis_cq_tdata; +wire [AXIS_PCIE_KEEP_WIDTH-1:0] axis_cq_tkeep; +wire axis_cq_tlast; +wire axis_cq_tready; +wire [AXIS_PCIE_CQ_USER_WIDTH-1:0] axis_cq_tuser; +wire axis_cq_tvalid; -wire [AXIS_PCIE_DATA_WIDTH-1:0] axis_cc_tdata; -wire [AXIS_PCIE_KEEP_WIDTH-1:0] axis_cc_tkeep; -wire axis_cc_tlast; -wire axis_cc_tready; -wire [32:0] axis_cc_tuser; -wire axis_cc_tvalid; +wire [AXIS_PCIE_DATA_WIDTH-1:0] axis_cc_tdata; +wire [AXIS_PCIE_KEEP_WIDTH-1:0] axis_cc_tkeep; +wire axis_cc_tlast; +wire axis_cc_tready; +wire [AXIS_PCIE_CC_USER_WIDTH-1:0] axis_cc_tuser; +wire axis_cc_tvalid; // ila_0 rq_ila ( // .clk(pcie_user_clk), @@ -357,7 +361,12 @@ pcie3_ultrascale_inst ( ); fpga_core #( - .AXIS_PCIE_DATA_WIDTH(AXIS_PCIE_DATA_WIDTH) + .AXIS_PCIE_DATA_WIDTH(AXIS_PCIE_DATA_WIDTH), + .AXIS_PCIE_KEEP_WIDTH(AXIS_PCIE_KEEP_WIDTH), + .AXIS_PCIE_RC_USER_WIDTH(AXIS_PCIE_RC_USER_WIDTH), + .AXIS_PCIE_RQ_USER_WIDTH(AXIS_PCIE_RQ_USER_WIDTH), + .AXIS_PCIE_CQ_USER_WIDTH(AXIS_PCIE_CQ_USER_WIDTH), + .AXIS_PCIE_CC_USER_WIDTH(AXIS_PCIE_CC_USER_WIDTH) ) core_inst ( /* diff --git a/fpga/lib/pcie/example/VCU108/fpga_axi/rtl/fpga_core.v b/fpga/lib/pcie/example/VCU108/fpga_axi/rtl/fpga_core.v index c5819b9d5..742db0bc5 100644 --- a/fpga/lib/pcie/example/VCU108/fpga_axi/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/VCU108/fpga_axi/rtl/fpga_core.v @@ -34,89 +34,93 @@ THE SOFTWARE. module fpga_core # ( parameter AXIS_PCIE_DATA_WIDTH = 256, - parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32) + parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32), + parameter AXIS_PCIE_RC_USER_WIDTH = 75, + parameter AXIS_PCIE_RQ_USER_WIDTH = 60, + parameter AXIS_PCIE_CQ_USER_WIDTH = 85, + parameter AXIS_PCIE_CC_USER_WIDTH = 33 ) ( /* * Clock: 250 MHz * Synchronous reset */ - input wire clk, - input wire rst, + input wire clk, + input wire rst, /* * GPIO */ - input wire btnu, - input wire btnl, - input wire btnd, - input wire btnr, - input wire btnc, - input wire [3:0] sw, - output wire [7:0] led, + input wire btnu, + input wire btnl, + input wire btnd, + input wire btnr, + input wire btnc, + input wire [3:0] sw, + output wire [7:0] led, /* * PCIe */ - output wire [AXIS_PCIE_DATA_WIDTH-1:0] m_axis_rq_tdata, - output wire [AXIS_PCIE_KEEP_WIDTH-1:0] m_axis_rq_tkeep, - output wire m_axis_rq_tlast, - input wire m_axis_rq_tready, - output wire [59:0] m_axis_rq_tuser, - output wire m_axis_rq_tvalid, + output wire [AXIS_PCIE_DATA_WIDTH-1:0] m_axis_rq_tdata, + output wire [AXIS_PCIE_KEEP_WIDTH-1:0] m_axis_rq_tkeep, + output wire m_axis_rq_tlast, + input wire m_axis_rq_tready, + output wire [AXIS_PCIE_RQ_USER_WIDTH-1:0] m_axis_rq_tuser, + output wire m_axis_rq_tvalid, - input wire [AXIS_PCIE_DATA_WIDTH-1:0] s_axis_rc_tdata, - input wire [AXIS_PCIE_KEEP_WIDTH-1:0] s_axis_rc_tkeep, - input wire s_axis_rc_tlast, - output wire s_axis_rc_tready, - input wire [74:0] s_axis_rc_tuser, - input wire s_axis_rc_tvalid, + input wire [AXIS_PCIE_DATA_WIDTH-1:0] s_axis_rc_tdata, + input wire [AXIS_PCIE_KEEP_WIDTH-1:0] s_axis_rc_tkeep, + input wire s_axis_rc_tlast, + output wire s_axis_rc_tready, + input wire [AXIS_PCIE_RC_USER_WIDTH-1:0] s_axis_rc_tuser, + input wire s_axis_rc_tvalid, - input wire [AXIS_PCIE_DATA_WIDTH-1:0] s_axis_cq_tdata, - input wire [AXIS_PCIE_KEEP_WIDTH-1:0] s_axis_cq_tkeep, - input wire s_axis_cq_tlast, - output wire s_axis_cq_tready, - input wire [84:0] s_axis_cq_tuser, - input wire s_axis_cq_tvalid, + input wire [AXIS_PCIE_DATA_WIDTH-1:0] s_axis_cq_tdata, + input wire [AXIS_PCIE_KEEP_WIDTH-1:0] s_axis_cq_tkeep, + input wire s_axis_cq_tlast, + output wire s_axis_cq_tready, + input wire [AXIS_PCIE_CQ_USER_WIDTH-1:0] s_axis_cq_tuser, + input wire s_axis_cq_tvalid, - output wire [AXIS_PCIE_DATA_WIDTH-1:0] m_axis_cc_tdata, - output wire [AXIS_PCIE_KEEP_WIDTH-1:0] m_axis_cc_tkeep, - output wire m_axis_cc_tlast, - input wire m_axis_cc_tready, - output wire [32:0] m_axis_cc_tuser, - output wire m_axis_cc_tvalid, + output wire [AXIS_PCIE_DATA_WIDTH-1:0] m_axis_cc_tdata, + output wire [AXIS_PCIE_KEEP_WIDTH-1:0] m_axis_cc_tkeep, + output wire m_axis_cc_tlast, + input wire m_axis_cc_tready, + output wire [AXIS_PCIE_CC_USER_WIDTH-1:0] m_axis_cc_tuser, + output wire m_axis_cc_tvalid, - input wire [2:0] cfg_max_payload, - input wire [2:0] cfg_max_read_req, + input wire [2:0] cfg_max_payload, + input wire [2:0] cfg_max_read_req, - output wire [18:0] cfg_mgmt_addr, - output wire cfg_mgmt_write, - output wire [31:0] cfg_mgmt_write_data, - output wire [3:0] cfg_mgmt_byte_enable, - output wire cfg_mgmt_read, - input wire [31:0] cfg_mgmt_read_data, - input wire cfg_mgmt_read_write_done, + output wire [18:0] cfg_mgmt_addr, + output wire cfg_mgmt_write, + output wire [31:0] cfg_mgmt_write_data, + output wire [3:0] cfg_mgmt_byte_enable, + output wire cfg_mgmt_read, + input wire [31:0] cfg_mgmt_read_data, + input wire cfg_mgmt_read_write_done, - input wire [3:0] cfg_interrupt_msi_enable, - input wire [7:0] cfg_interrupt_msi_vf_enable, - input wire [11:0] cfg_interrupt_msi_mmenable, - input wire cfg_interrupt_msi_mask_update, - input wire [31:0] cfg_interrupt_msi_data, - output wire [3:0] cfg_interrupt_msi_select, - output wire [31:0] cfg_interrupt_msi_int, - output wire [31:0] cfg_interrupt_msi_pending_status, - output wire cfg_interrupt_msi_pending_status_data_enable, - output wire [3:0] cfg_interrupt_msi_pending_status_function_num, - input wire cfg_interrupt_msi_sent, - input wire cfg_interrupt_msi_fail, - output wire [2:0] cfg_interrupt_msi_attr, - output wire cfg_interrupt_msi_tph_present, - output wire [1:0] cfg_interrupt_msi_tph_type, - output wire [8:0] cfg_interrupt_msi_tph_st_tag, - output wire [3:0] cfg_interrupt_msi_function_number, + input wire [3:0] cfg_interrupt_msi_enable, + input wire [7:0] cfg_interrupt_msi_vf_enable, + input wire [11:0] cfg_interrupt_msi_mmenable, + input wire cfg_interrupt_msi_mask_update, + input wire [31:0] cfg_interrupt_msi_data, + output wire [3:0] cfg_interrupt_msi_select, + output wire [31:0] cfg_interrupt_msi_int, + output wire [31:0] cfg_interrupt_msi_pending_status, + output wire cfg_interrupt_msi_pending_status_data_enable, + output wire [3:0] cfg_interrupt_msi_pending_status_function_num, + input wire cfg_interrupt_msi_sent, + input wire cfg_interrupt_msi_fail, + output wire [2:0] cfg_interrupt_msi_attr, + output wire cfg_interrupt_msi_tph_present, + output wire [1:0] cfg_interrupt_msi_tph_type, + output wire [8:0] cfg_interrupt_msi_tph_st_tag, + output wire [3:0] cfg_interrupt_msi_function_number, - output wire status_error_cor, - output wire status_error_uncor + output wire status_error_cor, + output wire status_error_uncor ); parameter PCIE_ADDR_WIDTH = 64; diff --git a/fpga/lib/pcie/example/VCU108/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/VCU108/fpga_axi/tb/fpga_core/test_fpga_core.py index b292bc974..562568c0b 100644 --- a/fpga/lib/pcie/example/VCU108/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/VCU108/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -377,7 +377,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 85 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 - parameters['RQ_SEQ_NUM_WIDTH'] = 4 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/VCU118/fpga/rtl/fpga.v b/fpga/lib/pcie/example/VCU118/fpga/rtl/fpga.v index eceb81788..86a746ce9 100644 --- a/fpga/lib/pcie/example/VCU118/fpga/rtl/fpga.v +++ b/fpga/lib/pcie/example/VCU118/fpga/rtl/fpga.v @@ -58,7 +58,7 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 512; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); parameter AXIS_PCIE_RC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 75 : 161; -parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 60 : 137; +parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 62 : 137; parameter AXIS_PCIE_CQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 85 : 183; parameter AXIS_PCIE_CC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 33 : 81; parameter RC_STRADDLE = AXIS_PCIE_DATA_WIDTH >= 256; diff --git a/fpga/lib/pcie/example/VCU118/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/VCU118/fpga/rtl/fpga_core.v index 573ce020f..10d0c665e 100644 --- a/fpga/lib/pcie/example/VCU118/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/VCU118/fpga/rtl/fpga_core.v @@ -161,8 +161,8 @@ example_core_pcie_us #( .PCIE_TAG_COUNT(PCIE_TAG_COUNT), .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), - .READ_CPLH_FC_LIMIT(128), - .READ_CPLD_FC_LIMIT(2048), + .READ_CPLH_FC_LIMIT(256), + .READ_CPLD_FC_LIMIT(2048-256), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -265,8 +265,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US+ PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/VCU118/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/VCU118/fpga_axi/tb/fpga_core/Makefile index e730af529..b09d0d3bf 100644 --- a/fpga/lib/pcie/example/VCU118/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/VCU118/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_ export PARAM_AXIS_PCIE_RC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),75,161) export PARAM_AXIS_PCIE_CQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),88,183) export PARAM_AXIS_PCIE_CC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),33,81) -export PARAM_RQ_SEQ_NUM_WIDTH := 6 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/VCU118/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/VCU118/fpga_axi/tb/fpga_core/test_fpga_core.py index 74ef95548..a2de697b6 100644 --- a/fpga/lib/pcie/example/VCU118/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/VCU118/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -403,7 +403,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 161 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 88 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 183 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 81 - parameters['RQ_SEQ_NUM_WIDTH'] = 6 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/VCU1525/fpga/rtl/fpga.v b/fpga/lib/pcie/example/VCU1525/fpga/rtl/fpga.v index 426b56c8c..6d267f5cb 100644 --- a/fpga/lib/pcie/example/VCU1525/fpga/rtl/fpga.v +++ b/fpga/lib/pcie/example/VCU1525/fpga/rtl/fpga.v @@ -53,7 +53,7 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 512; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); parameter AXIS_PCIE_RC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 75 : 161; -parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 60 : 137; +parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 62 : 137; parameter AXIS_PCIE_CQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 85 : 183; parameter AXIS_PCIE_CC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 33 : 81; parameter RC_STRADDLE = AXIS_PCIE_DATA_WIDTH >= 256; diff --git a/fpga/lib/pcie/example/VCU1525/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/VCU1525/fpga/rtl/fpga_core.v index 39d8ab260..68701b8c8 100644 --- a/fpga/lib/pcie/example/VCU1525/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/VCU1525/fpga/rtl/fpga_core.v @@ -156,8 +156,8 @@ example_core_pcie_us #( .PCIE_TAG_COUNT(PCIE_TAG_COUNT), .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), - .READ_CPLH_FC_LIMIT(128), - .READ_CPLD_FC_LIMIT(2048), + .READ_CPLH_FC_LIMIT(256), + .READ_CPLD_FC_LIMIT(2048-256), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -260,8 +260,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US+ PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/VCU1525/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/VCU1525/fpga_axi/tb/fpga_core/Makefile index e730af529..b09d0d3bf 100644 --- a/fpga/lib/pcie/example/VCU1525/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/VCU1525/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_ export PARAM_AXIS_PCIE_RC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),75,161) export PARAM_AXIS_PCIE_CQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),88,183) export PARAM_AXIS_PCIE_CC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),33,81) -export PARAM_RQ_SEQ_NUM_WIDTH := 6 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/VCU1525/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/VCU1525/fpga_axi/tb/fpga_core/test_fpga_core.py index f662d6cf6..8e67c0332 100644 --- a/fpga/lib/pcie/example/VCU1525/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/VCU1525/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -398,7 +398,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 161 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 88 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 183 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 81 - parameters['RQ_SEQ_NUM_WIDTH'] = 6 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/ZCU106/fpga/rtl/fpga.v b/fpga/lib/pcie/example/ZCU106/fpga/rtl/fpga.v index 6e9f9e77d..038c1125c 100644 --- a/fpga/lib/pcie/example/ZCU106/fpga/rtl/fpga.v +++ b/fpga/lib/pcie/example/ZCU106/fpga/rtl/fpga.v @@ -58,7 +58,7 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 128; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); parameter AXIS_PCIE_RC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 75 : 161; -parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 60 : 137; +parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 62 : 137; parameter AXIS_PCIE_CQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 85 : 183; parameter AXIS_PCIE_CC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 33 : 81; parameter RC_STRADDLE = AXIS_PCIE_DATA_WIDTH >= 256; diff --git a/fpga/lib/pcie/example/ZCU106/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/ZCU106/fpga/rtl/fpga_core.v index b3879c240..e7581c496 100644 --- a/fpga/lib/pcie/example/ZCU106/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/ZCU106/fpga/rtl/fpga_core.v @@ -161,8 +161,8 @@ example_core_pcie_us #( .PCIE_TAG_COUNT(PCIE_TAG_COUNT), .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), - .READ_CPLH_FC_LIMIT(128), - .READ_CPLD_FC_LIMIT(2048), + .READ_CPLH_FC_LIMIT(256), + .READ_CPLD_FC_LIMIT(2048-256), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -265,8 +265,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US+ PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/ZCU106/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/ZCU106/fpga_axi/tb/fpga_core/Makefile index 6d78bf572..55ed40bff 100644 --- a/fpga/lib/pcie/example/ZCU106/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/ZCU106/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_ export PARAM_AXIS_PCIE_RC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),75,161) export PARAM_AXIS_PCIE_CQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),88,183) export PARAM_AXIS_PCIE_CC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),33,81) -export PARAM_RQ_SEQ_NUM_WIDTH := 6 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/ZCU106/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/ZCU106/fpga_axi/tb/fpga_core/test_fpga_core.py index e27299f62..3e1712a8e 100644 --- a/fpga/lib/pcie/example/ZCU106/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/ZCU106/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -403,7 +403,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 161 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 88 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 183 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 81 - parameters['RQ_SEQ_NUM_WIDTH'] = 6 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/example/common/driver/example/example_driver.c b/fpga/lib/pcie/example/common/driver/example/example_driver.c index 17d863ad7..6996bdeaf 100644 --- a/fpga/lib/pcie/example/common/driver/example/example_driver.c +++ b/fpga/lib/pcie/example/common/driver/example/example_driver.c @@ -103,6 +103,8 @@ static void dma_block_read(struct example_dev *edev, if ((ioread32(edev->bar[0] + 0x001000) & 1) != 0) dev_warn(edev->dev, "%s: operation timed out", __func__); + if ((ioread32(edev->bar[0] + 0x000000) & 0x300) != 0) + dev_warn(edev->dev, "%s: DMA engine busy", __func__); } static void dma_block_write(struct example_dev *edev, @@ -157,15 +159,22 @@ static void dma_block_write(struct example_dev *edev, if ((ioread32(edev->bar[0] + 0x001100) & 1) != 0) dev_warn(edev->dev, "%s: operation timed out", __func__); + if ((ioread32(edev->bar[0] + 0x000000) & 0x300) != 0) + dev_warn(edev->dev, "%s: DMA engine busy", __func__); } static void dma_block_read_bench(struct example_dev *edev, dma_addr_t dma_addr, u64 size, u64 stride, u64 count) { u64 cycles; + u32 rd_req; + u32 rd_cpl; udelay(5); + rd_req = ioread32(edev->bar[0] + 0x000020); + rd_cpl = ioread32(edev->bar[0] + 0x000024); + dma_block_read(edev, dma_addr, 0, 0x3fff, stride, 0, 0, 0x3fff, stride, size, count); @@ -173,17 +182,23 @@ static void dma_block_read_bench(struct example_dev *edev, udelay(5); - dev_info(edev->dev, "read %lld blocks of %lld bytes (stride %lld) in %lld ns: %lld Mbps", - count, size, stride, cycles * 4, size * count * 8 * 1000 / (cycles * 4)); + rd_req = ioread32(edev->bar[0] + 0x000020) - rd_req; + rd_cpl = ioread32(edev->bar[0] + 0x000024) - rd_cpl; + + dev_info(edev->dev, "read %lld blocks of %lld bytes (total %lld B, stride %lld) in %lld ns (%d req %d cpl): %lld Mbps", + count, size, count*size, stride, cycles * 4, rd_req, rd_cpl, size * count * 8 * 1000 / (cycles * 4)); } static void dma_block_write_bench(struct example_dev *edev, dma_addr_t dma_addr, u64 size, u64 stride, u64 count) { u64 cycles; + u32 wr_req; udelay(5); + wr_req = ioread32(edev->bar[0] + 0x000028); + dma_block_write(edev, dma_addr, 0, 0x3fff, stride, 0, 0, 0x3fff, stride, size, count); @@ -191,8 +206,83 @@ static void dma_block_write_bench(struct example_dev *edev, udelay(5); - dev_info(edev->dev, "wrote %lld blocks of %lld bytes (stride %lld) in %lld ns: %lld Mbps", - count, size, stride, cycles * 4, size * count * 8 * 1000 / (cycles * 4)); + wr_req = ioread32(edev->bar[0] + 0x000028) - wr_req; + + dev_info(edev->dev, "wrote %lld blocks of %lld bytes (total %lld B, stride %lld) in %lld ns (%d req): %lld Mbps", + count, size, count*size, stride, cycles * 4, wr_req, size * count * 8 * 1000 / (cycles * 4)); +} + +static void dma_cpl_buf_test(struct example_dev *edev, dma_addr_t dma_addr, + u64 size, u64 stride, u64 count, int stall) +{ + unsigned long t; + u64 cycles; + u32 rd_req; + u32 rd_cpl; + + rd_req = ioread32(edev->bar[0] + 0x000020); + rd_cpl = ioread32(edev->bar[0] + 0x000024); + + // DMA base address + iowrite32(dma_addr & 0xffffffff, edev->bar[0] + 0x001080); + iowrite32((dma_addr >> 32) & 0xffffffff, edev->bar[0] + 0x001084); + // DMA offset address + iowrite32(0, edev->bar[0] + 0x001088); + iowrite32(0, edev->bar[0] + 0x00108c); + // DMA offset mask + iowrite32(0x3fff, edev->bar[0] + 0x001090); + iowrite32(0, edev->bar[0] + 0x001094); + // DMA stride + iowrite32(stride & 0xffffffff, edev->bar[0] + 0x001098); + iowrite32((stride >> 32) & 0xffffffff, edev->bar[0] + 0x00109c); + // RAM base address + iowrite32(0, edev->bar[0] + 0x0010c0); + iowrite32(0, edev->bar[0] + 0x0010c4); + // RAM offset address + iowrite32(0, edev->bar[0] + 0x0010c8); + iowrite32(0, edev->bar[0] + 0x0010cc); + // RAM offset mask + iowrite32(0x3fff, edev->bar[0] + 0x0010d0); + iowrite32(0, edev->bar[0] + 0x0010d4); + // RAM stride + iowrite32(stride & 0xffffffff, edev->bar[0] + 0x0010d8); + iowrite32((stride >> 32) & 0xffffffff, edev->bar[0] + 0x0010dc); + // clear cycle count + iowrite32(0, edev->bar[0] + 0x001008); + iowrite32(0, edev->bar[0] + 0x00100c); + // block length + iowrite32(size, edev->bar[0] + 0x001010); + // block count + iowrite32(count, edev->bar[0] + 0x001018); + + if (stall) + iowrite32(stall, edev->bar[0] + 0x000040); + + // start + iowrite32(1, edev->bar[0] + 0x001000); + + if (stall) + msleep(10); + + // wait for transfer to complete + t = jiffies + msecs_to_jiffies(20000); + while (time_before(jiffies, t)) { + if ((ioread32(edev->bar[0] + 0x001000) & 1) == 0) + break; + } + + if ((ioread32(edev->bar[0] + 0x001000) & 1) != 0) + dev_warn(edev->dev, "%s: operation timed out", __func__); + if ((ioread32(edev->bar[0] + 0x000000) & 0x300) != 0) + dev_warn(edev->dev, "%s: DMA engine busy", __func__); + + cycles = ioread32(edev->bar[0] + 0x001008); + + rd_req = ioread32(edev->bar[0] + 0x000020) - rd_req; + rd_cpl = ioread32(edev->bar[0] + 0x000024) - rd_cpl; + + dev_info(edev->dev, "read %lld x %lld B (total %lld B %lld CPLD, stride %lld) in %lld ns (%d req %d cpl): %lld Mbps", + count, size, count*size, count*((size+15) / 16), stride, cycles * 4, rd_req, rd_cpl, size * count * 8 * 1000 / (cycles * 4)); } static irqreturn_t edev_intr(int irq, void *data) @@ -227,16 +317,20 @@ static int edev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (pdev->pcie_cap) { u16 devctl; u32 lnkcap; + u16 lnkctl; u16 lnksta; pci_read_config_word(pdev, pdev->pcie_cap + PCI_EXP_DEVCTL, &devctl); pci_read_config_dword(pdev, pdev->pcie_cap + PCI_EXP_LNKCAP, &lnkcap); + pci_read_config_word(pdev, pdev->pcie_cap + PCI_EXP_LNKCTL, &lnkctl); pci_read_config_word(pdev, pdev->pcie_cap + PCI_EXP_LNKSTA, &lnksta); dev_info(dev, " Max payload size: %d bytes", 128 << ((devctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5)); dev_info(dev, " Max read request size: %d bytes", 128 << ((devctl & PCI_EXP_DEVCTL_READRQ) >> 12)); + dev_info(dev, " Read completion boundary: %d bytes", + lnkctl & PCI_EXP_LNKCTL_RCB ? 128 : 64); dev_info(dev, " Link capability: gen %d x%d", lnkcap & PCI_EXP_LNKCAP_SLS, (lnkcap & PCI_EXP_LNKCAP_MLW) >> 4); dev_info(dev, " Link status: gen %d x%d", @@ -361,6 +455,7 @@ static int edev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) msleep(1); dev_info(dev, "Read status"); + dev_info(dev, "%08x", ioread32(edev->bar[0] + 0x000000)); dev_info(dev, "%08x", ioread32(edev->bar[0] + 0x000118)); dev_info(dev, "start copy to host"); @@ -374,6 +469,7 @@ static int edev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) msleep(1); dev_info(dev, "Read status"); + dev_info(dev, "%08x", ioread32(edev->bar[0] + 0x000000)); dev_info(dev, "%08x", ioread32(edev->bar[0] + 0x000218)); dev_info(dev, "read test data"); @@ -398,6 +494,7 @@ static int edev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) msleep(1); dev_info(dev, "Read status"); + dev_info(dev, "%08x", ioread32(edev->bar[0] + 0x000000)); dev_info(dev, "%08x", ioread32(edev->bar[0] + 0x000218)); dev_info(dev, "read data"); @@ -407,31 +504,90 @@ static int edev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (!mismatch) { u64 size; u64 stride; + u64 count; dev_info(dev, "disable interrupts"); iowrite32(0x0, edev->bar[0] + 0x000008); + dev_info(dev, "test RX completion buffer (CPLH, 8)"); + + size = 8; + stride = size; + for (count = 32; count <= 256; count += 8) { + dma_cpl_buf_test(edev, + edev->dma_region_addr + 0x0000, + size, stride, count, 100000); + if ((ioread32(edev->bar[0] + 0x000000) & 0x300) != 0) + goto out; + } + + dev_info(dev, "test RX completion buffer (CPLH, unaligned 8+64)"); + + size = 8+64; + stride = 0; + for (count = 8; count <= 256; count += 8) { + dma_cpl_buf_test(edev, + edev->dma_region_addr + 128 - 8, + size, stride, count, 400000); + if ((ioread32(edev->bar[0] + 0x000000) & 0x300) != 0) + goto out; + } + + dev_info(dev, "test RX completion buffer (CPLH, unaligned 8+128+8)"); + + size = 8+128+8; + stride = 0; + for (count = 8; count <= 256; count += 8) { + dma_cpl_buf_test(edev, + edev->dma_region_addr + 128 - 8, + size, stride, count, 100000); + if ((ioread32(edev->bar[0] + 0x000000) & 0x300) != 0) + goto out; + } + + dev_info(dev, "test RX completion buffer (CPLD)"); + + size = 512; + stride = size; + for (count = 8; count <= 256; count += 8) { + dma_cpl_buf_test(edev, + edev->dma_region_addr + 0x0000, + size, stride, count, 100000); + if ((ioread32(edev->bar[0] + 0x000000) & 0x300) != 0) + goto out; + } + dev_info(dev, "perform block reads (dma_alloc_coherent)"); + count = 10000; for (size = 1; size <= 8192; size *= 2) { for (stride = size; stride <= max(size, 256llu); stride *= 2) { dma_block_read_bench(edev, edev->dma_region_addr + 0x0000, - size, stride, 10000); + size, stride, count); + if ((ioread32(edev->bar[0] + 0x000000) & 0x300) != 0) + goto out; } } dev_info(dev, "perform block writes (dma_alloc_coherent)"); + count = 10000; for (size = 1; size <= 8192; size *= 2) { for (stride = size; stride <= max(size, 256llu); stride *= 2) { dma_block_write_bench(edev, edev->dma_region_addr + 0x0000, - size, stride, 10000); + size, stride, count); + if ((ioread32(edev->bar[0] + 0x000000) & 0x300) != 0) + goto out; } } } +out: + dev_info(dev, "Read status"); + dev_info(dev, "%08x", ioread32(edev->bar[0] + 0x000000)); + // probe complete return 0; diff --git a/fpga/lib/pcie/example/common/rtl/example_core.v b/fpga/lib/pcie/example/common/rtl/example_core.v index e0ae1578c..8ac66add8 100644 --- a/fpga/lib/pcie/example/common/rtl/example_core.v +++ b/fpga/lib/pcie/example/common/rtl/example_core.v @@ -152,7 +152,18 @@ module example_core # */ output wire [IRQ_INDEX_WIDTH-1:0] irq_index, output wire irq_valid, - input wire irq_ready + input wire irq_ready, + + /* + * Control and status + */ + output wire dma_enable, + input wire dma_rd_busy, + input wire dma_wr_busy, + input wire dma_rd_req, + input wire dma_rd_cpl, + input wire dma_wr_req, + output wire rx_cpl_stall ); localparam RAM_ADDR_IMM_WIDTH = (DMA_IMM_ENABLE && (DMA_IMM_WIDTH > RAM_ADDR_WIDTH)) ? DMA_IMM_WIDTH : RAM_ADDR_WIDTH; @@ -203,6 +214,9 @@ reg axil_ctrl_rvalid_reg = 1'b0, axil_ctrl_rvalid_next; reg [63:0] cycle_count_reg = 0; reg [15:0] dma_read_active_count_reg = 0; reg [15:0] dma_write_active_count_reg = 0; +reg [31:0] dma_rd_req_count_reg = 0; +reg [31:0] dma_rd_cpl_count_reg = 0; +reg [31:0] dma_wr_req_count_reg = 0; reg [DMA_ADDR_WIDTH-1:0] dma_read_desc_dma_addr_reg = 0, dma_read_desc_dma_addr_next; reg [RAM_ADDR_WIDTH-1:0] dma_read_desc_ram_addr_reg = 0, dma_read_desc_ram_addr_next; @@ -230,6 +244,9 @@ reg dma_rd_int_en_reg = 0, dma_rd_int_en_next; reg dma_wr_int_en_reg = 0, dma_wr_int_en_next; reg irq_valid_reg = 1'b0, irq_valid_next; +reg rx_cpl_stall_reg = 1'b0, rx_cpl_stall_next; +reg [23:0] rx_cpl_stall_count_reg = 0, rx_cpl_stall_count_next; + reg dma_read_block_run_reg = 1'b0, dma_read_block_run_next; reg [DMA_LEN_WIDTH-1:0] dma_read_block_len_reg = 0, dma_read_block_len_next; reg [31:0] dma_read_block_count_reg = 0, dma_read_block_count_next; @@ -284,6 +301,9 @@ assign m_axis_dma_write_desc_valid = dma_write_desc_valid_reg; assign irq_index = 0; assign irq_valid = irq_valid_reg; +assign dma_enable = dma_enable_reg; +assign rx_cpl_stall = rx_cpl_stall_reg; + always @* begin axil_ctrl_awready_next = 1'b0; axil_ctrl_wready_next = 1'b0; @@ -322,6 +342,9 @@ always @* begin irq_valid_next = irq_valid_reg && !irq_ready; + rx_cpl_stall_next = 1'b0; + rx_cpl_stall_count_next = rx_cpl_stall_count_reg; + dma_read_block_run_next = dma_read_block_run_reg; dma_read_block_len_next = dma_read_block_len_reg; dma_read_block_count_next = dma_read_block_count_reg; @@ -348,6 +371,11 @@ always @* begin dma_write_block_ram_offset_mask_next = dma_write_block_ram_offset_mask_reg; dma_write_block_ram_stride_next = dma_write_block_ram_stride_reg; + if (rx_cpl_stall_count_reg) begin + rx_cpl_stall_count_next = rx_cpl_stall_count_reg - 1; + rx_cpl_stall_next = 1'b1; + end + if (s_axil_ctrl_awvalid && s_axil_ctrl_wvalid && !axil_ctrl_bvalid_reg) begin // write operation axil_ctrl_awready_next = 1'b1; @@ -364,6 +392,7 @@ always @* begin dma_rd_int_en_next = s_axil_ctrl_wdata[0]; dma_wr_int_en_next = s_axil_ctrl_wdata[1]; end + 16'h0040: rx_cpl_stall_count_next = s_axil_ctrl_wdata; // single read 16'h0100: dma_read_desc_dma_addr_next[31:0] = s_axil_ctrl_wdata; 16'h0104: dma_read_desc_dma_addr_next[63:32] = s_axil_ctrl_wdata; @@ -437,6 +466,8 @@ always @* begin // control 16'h0000: begin axil_ctrl_rdata_next[0] = dma_enable_reg; + axil_ctrl_rdata_next[8] = dma_wr_busy; + axil_ctrl_rdata_next[9] = dma_rd_busy; end 16'h0008: begin axil_ctrl_rdata_next[0] = dma_rd_int_en_reg; @@ -444,8 +475,12 @@ always @* begin end 16'h0010: axil_ctrl_rdata_next = cycle_count_reg; 16'h0014: axil_ctrl_rdata_next = cycle_count_reg >> 32; - 16'h0020: axil_ctrl_rdata_next = dma_read_active_count_reg; - 16'h0028: axil_ctrl_rdata_next = dma_write_active_count_reg; + 16'h0018: axil_ctrl_rdata_next = dma_read_active_count_reg; + 16'h001c: axil_ctrl_rdata_next = dma_write_active_count_reg; + 16'h0020: axil_ctrl_rdata_next = dma_rd_req_count_reg; + 16'h0024: axil_ctrl_rdata_next = dma_rd_cpl_count_reg; + 16'h0028: axil_ctrl_rdata_next = dma_wr_req_count_reg; + 16'h0040: axil_ctrl_rdata_next = rx_cpl_stall_count_reg; // single read 16'h0100: axil_ctrl_rdata_next = dma_read_desc_dma_addr_reg; 16'h0104: axil_ctrl_rdata_next = dma_read_desc_dma_addr_reg >> 32; @@ -615,6 +650,10 @@ always @(posedge clk) begin + (m_axis_dma_write_desc_valid && m_axis_dma_write_desc_ready) - s_axis_dma_write_desc_status_valid; + dma_rd_req_count_reg <= dma_rd_req_count_reg + dma_rd_req; + dma_rd_cpl_count_reg <= dma_rd_cpl_count_reg + dma_rd_cpl; + dma_wr_req_count_reg <= dma_wr_req_count_reg + dma_wr_req; + dma_read_desc_dma_addr_reg <= dma_read_desc_dma_addr_next; dma_read_desc_ram_addr_reg <= dma_read_desc_ram_addr_next; dma_read_desc_len_reg <= dma_read_desc_len_next; @@ -643,6 +682,9 @@ always @(posedge clk) begin irq_valid_reg <= irq_valid_next; + rx_cpl_stall_reg <= rx_cpl_stall_next; + rx_cpl_stall_count_reg <= rx_cpl_stall_count_next; + dma_read_block_run_reg <= dma_read_block_run_next; dma_read_block_len_reg <= dma_read_block_len_next; dma_read_block_count_reg <= dma_read_block_count_next; @@ -679,6 +721,9 @@ always @(posedge clk) begin cycle_count_reg <= 0; dma_read_active_count_reg <= 0; dma_write_active_count_reg <= 0; + dma_rd_req_count_reg <= 0; + dma_rd_cpl_count_reg <= 0; + dma_wr_req_count_reg <= 0; dma_read_desc_valid_reg <= 1'b0; dma_read_desc_status_valid_reg <= 1'b0; @@ -688,6 +733,8 @@ always @(posedge clk) begin dma_rd_int_en_reg <= 1'b0; dma_wr_int_en_reg <= 1'b0; irq_valid_reg <= 1'b0; + rx_cpl_stall_reg <= 1'b0; + rx_cpl_stall_count_reg <= 0; dma_read_block_run_reg <= 1'b0; dma_write_block_run_reg <= 1'b0; end diff --git a/fpga/lib/pcie/example/common/rtl/example_core_pcie.v b/fpga/lib/pcie/example/common/rtl/example_core_pcie.v index db710450c..5d93ac847 100644 --- a/fpga/lib/pcie/example/common/rtl/example_core_pcie.v +++ b/fpga/lib/pcie/example/common/rtl/example_core_pcie.v @@ -172,7 +172,12 @@ module example_core_pcie # * Status */ output wire status_error_cor, - output wire status_error_uncor + output wire status_error_uncor, + + /* + * Control and status + */ + output wire rx_cpl_stall ); parameter AXIL_CTRL_DATA_WIDTH = 32; @@ -345,6 +350,11 @@ wire [IRQ_INDEX_WIDTH-1:0] irq_index; wire irq_valid; wire irq_ready; +// Control and status +wire dma_enable; +wire dma_rd_busy; +wire dma_wr_busy; + pcie_tlp_demux_bar #( .PORTS(3), .TLP_DATA_WIDTH(TLP_DATA_WIDTH), @@ -900,8 +910,8 @@ dma_if_pcie_inst ( /* * Configuration */ - .read_enable(1'b1), - .write_enable(1'b1), + .read_enable(dma_enable), + .write_enable(dma_enable), .ext_tag_enable(ext_tag_enable), .rcb_128b(rcb_128b), .requester_id({bus_num, 5'd0, 3'd0}), @@ -911,8 +921,8 @@ dma_if_pcie_inst ( /* * Status */ - .status_rd_busy(), - .status_wr_busy(), + .status_rd_busy(dma_rd_busy), + .status_wr_busy(dma_wr_busy), .status_error_cor(status_error_cor_int[3]), .status_error_uncor(status_error_uncor_int[3]) ); @@ -1109,7 +1119,18 @@ core_inst ( */ .irq_index(irq_index), .irq_valid(irq_valid), - .irq_ready(irq_ready) + .irq_ready(irq_ready), + + /* + * Control and status + */ + .dma_enable(dma_enable), + .dma_rd_busy(dma_rd_busy), + .dma_wr_busy(dma_wr_busy), + .dma_rd_req(tx_rd_req_tlp_valid && tx_rd_req_tlp_sop && tx_rd_req_tlp_ready), + .dma_rd_cpl(rx_cpl_tlp_valid && rx_cpl_tlp_sop && rx_cpl_tlp_ready), + .dma_wr_req(tx_wr_req_tlp_valid && tx_wr_req_tlp_sop && tx_wr_req_tlp_ready), + .rx_cpl_stall(rx_cpl_stall) ); endmodule diff --git a/fpga/lib/pcie/example/common/rtl/example_core_pcie_ptile.v b/fpga/lib/pcie/example/common/rtl/example_core_pcie_ptile.v index 877b2d351..232beab8e 100644 --- a/fpga/lib/pcie/example/common/rtl/example_core_pcie_ptile.v +++ b/fpga/lib/pcie/example/common/rtl/example_core_pcie_ptile.v @@ -200,6 +200,12 @@ wire [2:0] max_payload_size; wire msix_enable; wire msix_mask; +wire rx_cpl_stall; + +wire rx_st_ready_int; + +assign rx_st_ready = rx_st_ready_int & !rx_cpl_stall; + pcie_ptile_if #( .SEG_COUNT(SEG_COUNT), .SEG_DATA_WIDTH(SEG_DATA_WIDTH), @@ -226,7 +232,7 @@ pcie_ptile_if_inst ( .rx_st_sop(rx_st_sop), .rx_st_eop(rx_st_eop), .rx_st_valid(rx_st_valid), - .rx_st_ready(rx_st_ready), + .rx_st_ready(rx_st_ready_int), .rx_st_hdr(rx_st_hdr), .rx_st_tlp_prfx(rx_st_tlp_prfx), .rx_st_vf_active(rx_st_vf_active), @@ -488,7 +494,12 @@ core_pcie_inst ( * Status */ .status_error_cor(), - .status_error_uncor() + .status_error_uncor(), + + /* + * Control and status + */ + .rx_cpl_stall(rx_cpl_stall) ); endmodule diff --git a/fpga/lib/pcie/example/common/rtl/example_core_pcie_s10.v b/fpga/lib/pcie/example/common/rtl/example_core_pcie_s10.v index c51ec3ce1..8ceadc3d7 100644 --- a/fpga/lib/pcie/example/common/rtl/example_core_pcie_s10.v +++ b/fpga/lib/pcie/example/common/rtl/example_core_pcie_s10.v @@ -58,7 +58,7 @@ module example_core_pcie_s10 # // Completion header flow control credit limit (read) parameter READ_CPLH_FC_LIMIT = 770, // Completion data flow control credit limit (read) - parameter READ_CPLD_FC_LIMIT = 2500, + parameter READ_CPLD_FC_LIMIT = 2400, // Operation table size (write) parameter WRITE_OP_TABLE_SIZE = 2**TX_SEQ_NUM_WIDTH, // In-flight transmit limit (write) @@ -194,6 +194,12 @@ wire [2:0] max_payload_size; wire msix_enable; wire msix_mask; +wire rx_cpl_stall; + +wire rx_st_ready_int; + +assign rx_st_ready = rx_st_ready_int & !rx_cpl_stall; + pcie_s10_if #( .SEG_COUNT(SEG_COUNT), .SEG_DATA_WIDTH(SEG_DATA_WIDTH), @@ -222,7 +228,7 @@ pcie_s10_if_inst ( .rx_st_sop(rx_st_sop), .rx_st_eop(rx_st_eop), .rx_st_valid(rx_st_valid), - .rx_st_ready(rx_st_ready), + .rx_st_ready(rx_st_ready_int), .rx_st_vf_active(rx_st_vf_active), .rx_st_func_num(rx_st_func_num), .rx_st_vf_num(rx_st_vf_num), @@ -495,7 +501,12 @@ core_pcie_inst ( * Status */ .status_error_cor(), - .status_error_uncor() + .status_error_uncor(), + + /* + * Control and status + */ + .rx_cpl_stall(rx_cpl_stall) ); endmodule diff --git a/fpga/lib/pcie/example/common/rtl/example_core_pcie_us.v b/fpga/lib/pcie/example/common/rtl/example_core_pcie_us.v index c8fe2cfcf..9ecdb948c 100644 --- a/fpga/lib/pcie/example/common/rtl/example_core_pcie_us.v +++ b/fpga/lib/pcie/example/common/rtl/example_core_pcie_us.v @@ -68,9 +68,9 @@ module example_core_pcie_us # // In-flight transmit limit (read) parameter READ_TX_LIMIT = 2**(RQ_SEQ_NUM_WIDTH-1), // Completion header flow control credit limit (read) - parameter READ_CPLH_FC_LIMIT = AXIS_PCIE_RQ_USER_WIDTH == 60 ? 64 : 128, + parameter READ_CPLH_FC_LIMIT = AXIS_PCIE_RQ_USER_WIDTH == 60 ? 64 : 256, // Completion data flow control credit limit (read) - parameter READ_CPLD_FC_LIMIT = AXIS_PCIE_RQ_USER_WIDTH == 60 ? 992 : 2048, + parameter READ_CPLD_FC_LIMIT = AXIS_PCIE_RQ_USER_WIDTH == 60 ? 1024-64 : 2048-256, // Operation table size (write) parameter WRITE_OP_TABLE_SIZE = 2**(RQ_SEQ_NUM_WIDTH-1), // In-flight transmit limit (write) @@ -259,6 +259,14 @@ wire ext_tag_enable; wire msix_enable; wire msix_mask; +wire rx_cpl_stall; + +wire s_axis_rc_tvalid_int; +wire s_axis_rc_tready_int; + +assign s_axis_rc_tvalid_int = s_axis_rc_tvalid & ~rx_cpl_stall; +assign s_axis_rc_tready = s_axis_rc_tready_int & ~rx_cpl_stall; + pcie_us_if #( .AXIS_PCIE_DATA_WIDTH(AXIS_PCIE_DATA_WIDTH), .AXIS_PCIE_KEEP_WIDTH(AXIS_PCIE_KEEP_WIDTH), @@ -295,8 +303,8 @@ pcie_us_if_inst ( */ .s_axis_rc_tdata(s_axis_rc_tdata), .s_axis_rc_tkeep(s_axis_rc_tkeep), - .s_axis_rc_tvalid(s_axis_rc_tvalid), - .s_axis_rc_tready(s_axis_rc_tready), + .s_axis_rc_tvalid(s_axis_rc_tvalid_int), + .s_axis_rc_tready(s_axis_rc_tready_int), .s_axis_rc_tlast(s_axis_rc_tlast), .s_axis_rc_tuser(s_axis_rc_tuser), @@ -624,7 +632,12 @@ core_pcie_inst ( * Status */ .status_error_cor(status_error_cor), - .status_error_uncor(status_error_uncor) + .status_error_uncor(status_error_uncor), + + /* + * Control and status + */ + .rx_cpl_stall(rx_cpl_stall) ); endmodule diff --git a/fpga/lib/pcie/example/common/tb/example_core_pcie/test_example_core_pcie.py b/fpga/lib/pcie/example/common/tb/example_core_pcie/test_example_core_pcie.py index 70f1b9813..2d4634c54 100644 --- a/fpga/lib/pcie/example/common/tb/example_core_pcie/test_example_core_pcie.py +++ b/fpga/lib/pcie/example/common/tb/example_core_pcie/test_example_core_pcie.py @@ -224,6 +224,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000118) tb.log.info("Status: 0x%x", val) assert val == 0x800000AA @@ -238,6 +240,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000218) tb.log.info("Status: 0x%x", val) assert val == 0x80000055 @@ -258,6 +262,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000218) tb.log.info("Status: 0x%x", val) assert val == 0x800000AA @@ -321,11 +327,15 @@ async def run_test(dut): await dev_pf0_bar0.write_dword(0x001000, 1) for k in range(10): - cnt = await dev_pf0_bar0.read_dword(0x001018) await Timer(1000, 'ns') - if cnt == 0: + run = await dev_pf0_bar0.read_dword(0x001000) + if run == 0: break + # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) + # configure operation (write) # DMA base address await dev_pf0_bar0.write_dword(0x001180, (mem_base+dest_offset) & 0xffffffff) @@ -363,11 +373,17 @@ async def run_test(dut): await dev_pf0_bar0.write_dword(0x001100, 1) for k in range(10): - cnt = await dev_pf0_bar0.read_dword(0x001118) await Timer(1000, 'ns') - if cnt == 0: + run = await dev_pf0_bar0.read_dword(0x001100) + if run == 0: break + # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) + + assert status & 0x300 == 0 + tb.log.info("%s", mem.hexdump_str(dest_offset, region_len)) assert mem[src_offset:src_offset+region_len] == mem[dest_offset:dest_offset+region_len] diff --git a/fpga/lib/pcie/example/common/tb/example_core_pcie_ptile/test_example_core_pcie_ptile.py b/fpga/lib/pcie/example/common/tb/example_core_pcie_ptile/test_example_core_pcie_ptile.py index b82024cc6..42cce8133 100644 --- a/fpga/lib/pcie/example/common/tb/example_core_pcie_ptile/test_example_core_pcie_ptile.py +++ b/fpga/lib/pcie/example/common/tb/example_core_pcie_ptile/test_example_core_pcie_ptile.py @@ -258,6 +258,211 @@ class TB(object): await self.rc.enumerate() +async def dma_block_read_bench(tb, dev, addr, mask, size, stride, count): + dev_pf0_bar0 = dev.bar_window[0] + + rd_req = await dev_pf0_bar0.read_dword(0x000020) + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) + + # configure operation (read) + # DMA base address + await dev_pf0_bar0.write_dword(0x001080, addr & 0xffffffff) + await dev_pf0_bar0.write_dword(0x001084, (addr >> 32) & 0xffffffff) + # DMA offset address + await dev_pf0_bar0.write_dword(0x001088, 0) + await dev_pf0_bar0.write_dword(0x00108c, 0) + # DMA offset mask + await dev_pf0_bar0.write_dword(0x001090, mask) + await dev_pf0_bar0.write_dword(0x001094, 0) + # DMA stride + await dev_pf0_bar0.write_dword(0x001098, stride) + await dev_pf0_bar0.write_dword(0x00109c, 0) + # RAM base address + await dev_pf0_bar0.write_dword(0x0010c0, 0) + await dev_pf0_bar0.write_dword(0x0010c4, 0) + # RAM offset address + await dev_pf0_bar0.write_dword(0x0010c8, 0) + await dev_pf0_bar0.write_dword(0x0010cc, 0) + # RAM offset mask + await dev_pf0_bar0.write_dword(0x0010d0, mask) + await dev_pf0_bar0.write_dword(0x0010d4, 0) + # RAM stride + await dev_pf0_bar0.write_dword(0x0010d8, stride) + await dev_pf0_bar0.write_dword(0x0010dc, 0) + # clear cycle count + await dev_pf0_bar0.write_dword(0x001008, 0) + await dev_pf0_bar0.write_dword(0x00100c, 0) + # block length + await dev_pf0_bar0.write_dword(0x001010, size) + # block count + await dev_pf0_bar0.write_dword(0x001018, count) + await dev_pf0_bar0.write_dword(0x00101c, 0) + + # start + await dev_pf0_bar0.write_dword(0x001000, 1) + + for k in range(1000): + await Timer(1000, 'ns') + run = await dev_pf0_bar0.read_dword(0x001000) + status = await dev_pf0_bar0.read_dword(0x000000) + if run == 0 and status & 0x300 == 0: + break + + if run != 0: + tb.log.warning("Operation timed out") + if status & 0x300 != 0: + tb.log.warning("DMA engine busy") + + cycles = await dev_pf0_bar0.read_dword(0x001008) + + rd_req = await dev_pf0_bar0.read_dword(0x000020) - rd_req + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) - rd_cpl + + tb.log.info("read %d blocks of %d bytes (total %d B, stride %d) in %d ns (%d req %d cpl) %d Mbps", + count, size, count*size, stride, cycles*4, rd_req, rd_cpl, size * count * 8 * 1000 / (cycles * 4)) + + assert status & 0x300 == 0 + + +async def dma_block_write_bench(tb, dev, addr, mask, size, stride, count): + dev_pf0_bar0 = dev.bar_window[0] + + wr_req = await dev_pf0_bar0.read_dword(0x000028) + + # configure operation (read) + # DMA base address + await dev_pf0_bar0.write_dword(0x001180, addr & 0xffffffff) + await dev_pf0_bar0.write_dword(0x001184, (addr >> 32) & 0xffffffff) + # DMA offset address + await dev_pf0_bar0.write_dword(0x001188, 0) + await dev_pf0_bar0.write_dword(0x00118c, 0) + # DMA offset mask + await dev_pf0_bar0.write_dword(0x001190, mask) + await dev_pf0_bar0.write_dword(0x001194, 0) + # DMA stride + await dev_pf0_bar0.write_dword(0x001198, stride) + await dev_pf0_bar0.write_dword(0x00119c, 0) + # RAM base address + await dev_pf0_bar0.write_dword(0x0011c0, 0) + await dev_pf0_bar0.write_dword(0x0011c4, 0) + # RAM offset address + await dev_pf0_bar0.write_dword(0x0011c8, 0) + await dev_pf0_bar0.write_dword(0x0011cc, 0) + # RAM offset mask + await dev_pf0_bar0.write_dword(0x0011d0, mask) + await dev_pf0_bar0.write_dword(0x0011d4, 0) + # RAM stride + await dev_pf0_bar0.write_dword(0x0011d8, stride) + await dev_pf0_bar0.write_dword(0x0011dc, 0) + # clear cycle count + await dev_pf0_bar0.write_dword(0x001108, 0) + await dev_pf0_bar0.write_dword(0x00110c, 0) + # block length + await dev_pf0_bar0.write_dword(0x001110, size) + # block count + await dev_pf0_bar0.write_dword(0x001118, count) + await dev_pf0_bar0.write_dword(0x00111c, 0) + + # start + await dev_pf0_bar0.write_dword(0x001100, 1) + + for k in range(1000): + await Timer(1000, 'ns') + run = await dev_pf0_bar0.read_dword(0x001100) + status = await dev_pf0_bar0.read_dword(0x000000) + if run == 0 and status & 0x300 == 0: + break + + if run != 0: + tb.log.warning("Operation timed out") + if status & 0x300 != 0: + tb.log.warning("DMA engine busy") + + cycles = await dev_pf0_bar0.read_dword(0x001108) + + wr_req = await dev_pf0_bar0.read_dword(0x000028) - wr_req + + tb.log.info("wrote %d blocks of %d bytes (total %d B, stride %d) in %d ns (%d req) %d Mbps", + count, size, count*size, stride, cycles*4, wr_req, size * count * 8 * 1000 / (cycles * 4)) + + assert status & 0x300 == 0 + + +async def dma_cpl_buf_test(tb, dev, addr, mask, size, stride, count, stall): + dev_pf0_bar0 = dev.bar_window[0] + + rd_req = await dev_pf0_bar0.read_dword(0x000020) + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) + + # configure operation (read) + # DMA base address + await dev_pf0_bar0.write_dword(0x001080, addr & 0xffffffff) + await dev_pf0_bar0.write_dword(0x001084, (addr >> 32) & 0xffffffff) + # DMA offset address + await dev_pf0_bar0.write_dword(0x001088, 0) + await dev_pf0_bar0.write_dword(0x00108c, 0) + # DMA offset mask + await dev_pf0_bar0.write_dword(0x001090, mask) + await dev_pf0_bar0.write_dword(0x001094, 0) + # DMA stride + await dev_pf0_bar0.write_dword(0x001098, stride) + await dev_pf0_bar0.write_dword(0x00109c, 0) + # RAM base address + await dev_pf0_bar0.write_dword(0x0010c0, 0) + await dev_pf0_bar0.write_dword(0x0010c4, 0) + # RAM offset address + await dev_pf0_bar0.write_dword(0x0010c8, 0) + await dev_pf0_bar0.write_dword(0x0010cc, 0) + # RAM offset mask + await dev_pf0_bar0.write_dword(0x0010d0, mask) + await dev_pf0_bar0.write_dword(0x0010d4, 0) + # RAM stride + await dev_pf0_bar0.write_dword(0x0010d8, stride) + await dev_pf0_bar0.write_dword(0x0010dc, 0) + # clear cycle count + await dev_pf0_bar0.write_dword(0x001008, 0) + await dev_pf0_bar0.write_dword(0x00100c, 0) + # block length + await dev_pf0_bar0.write_dword(0x001010, size) + # block count + await dev_pf0_bar0.write_dword(0x001018, count) + await dev_pf0_bar0.write_dword(0x00101c, 0) + + if stall: + # stall RX + await dev_pf0_bar0.write_dword(0x000040, stall) + + # start + await dev_pf0_bar0.write_dword(0x001000, 1) + + # wait for stall + if stall: + for k in range(stall): + await RisingEdge(tb.dut.clk) + + for k in range(100): + await Timer(1000, 'ns') + run = await dev_pf0_bar0.read_dword(0x001000) + status = await dev_pf0_bar0.read_dword(0x000000) + if run == 0 and status & 0x300 == 0: + break + + if run != 0: + tb.log.warning("Operation timed out") + if status & 0x300 != 0: + tb.log.warning("DMA engine busy") + + cycles = await dev_pf0_bar0.read_dword(0x001008) + + rd_req = await dev_pf0_bar0.read_dword(0x000020) - rd_req + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) - rd_cpl + + tb.log.info("read %d x %d B (total %d B %d CPLD, stride %d) in %d ns (%d req %d cpl) %d Mbps", + count, size, count*size, count*((size+15)//16), stride, cycles*4, rd_req, rd_cpl, size * count * 8 * 1000 / (cycles * 4)) + + assert status & 0x300 == 0 + + @cocotb.test() async def run_test(dut): @@ -309,6 +514,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000118) tb.log.info("Status: 0x%x", val) assert val == 0x800000AA @@ -323,6 +530,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000218) tb.log.info("Status: 0x%x", val) assert val == 0x80000055 @@ -343,6 +552,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000218) tb.log.info("Status: 0x%x", val) assert val == 0x800000AA @@ -353,110 +564,66 @@ async def run_test(dut): tb.log.info("Test DMA block operations") + # disable interrupts + await dev_pf0_bar0.write_dword(0x000008, 0) + region_len = 0x2000 src_offset = 0x0000 dest_offset = 0x4000 - block_size = 256 - block_stride = block_size - block_count = 32 - - # write packet data - mem[src_offset:src_offset+region_len] = bytearray([x % 256 for x in range(region_len)]) - - # enable DMA - await dev_pf0_bar0.write_dword(0x000000, 1) - # disable interrupts - await dev_pf0_bar0.write_dword(0x000008, 0) - - # configure operation (read) - # DMA base address - await dev_pf0_bar0.write_dword(0x001080, (mem_base+src_offset) & 0xffffffff) - await dev_pf0_bar0.write_dword(0x001084, (mem_base+src_offset >> 32) & 0xffffffff) - # DMA offset address - await dev_pf0_bar0.write_dword(0x001088, 0) - await dev_pf0_bar0.write_dword(0x00108c, 0) - # DMA offset mask - await dev_pf0_bar0.write_dword(0x001090, region_len-1) - await dev_pf0_bar0.write_dword(0x001094, 0) - # DMA stride - await dev_pf0_bar0.write_dword(0x001098, block_stride) - await dev_pf0_bar0.write_dword(0x00109c, 0) - # RAM base address - await dev_pf0_bar0.write_dword(0x0010c0, 0) - await dev_pf0_bar0.write_dword(0x0010c4, 0) - # RAM offset address - await dev_pf0_bar0.write_dword(0x0010c8, 0) - await dev_pf0_bar0.write_dword(0x0010cc, 0) - # RAM offset mask - await dev_pf0_bar0.write_dword(0x0010d0, region_len-1) - await dev_pf0_bar0.write_dword(0x0010d4, 0) - # RAM stride - await dev_pf0_bar0.write_dword(0x0010d8, block_stride) - await dev_pf0_bar0.write_dword(0x0010dc, 0) - # clear cycle count - await dev_pf0_bar0.write_dword(0x001008, 0) - await dev_pf0_bar0.write_dword(0x00100c, 0) - # block length - await dev_pf0_bar0.write_dword(0x001010, block_size) - # block count - await dev_pf0_bar0.write_dword(0x001018, block_count) - await dev_pf0_bar0.write_dword(0x00101c, 0) - # start - await dev_pf0_bar0.write_dword(0x001000, 1) - - for k in range(10): - cnt = await dev_pf0_bar0.read_dword(0x001018) - await Timer(1000, 'ns') - if cnt == 0: - break - - # configure operation (write) - # DMA base address - await dev_pf0_bar0.write_dword(0x001180, (mem_base+dest_offset) & 0xffffffff) - await dev_pf0_bar0.write_dword(0x001184, (mem_base+dest_offset >> 32) & 0xffffffff) - # DMA offset address - await dev_pf0_bar0.write_dword(0x001188, 0) - await dev_pf0_bar0.write_dword(0x00118c, 0) - # DMA offset mask - await dev_pf0_bar0.write_dword(0x001190, region_len-1) - await dev_pf0_bar0.write_dword(0x001194, 0) - # DMA stride - await dev_pf0_bar0.write_dword(0x001198, block_stride) - await dev_pf0_bar0.write_dword(0x00119c, 0) - # RAM base address - await dev_pf0_bar0.write_dword(0x0011c0, 0) - await dev_pf0_bar0.write_dword(0x0011c4, 0) - # RAM offset address - await dev_pf0_bar0.write_dword(0x0011c8, 0) - await dev_pf0_bar0.write_dword(0x0011cc, 0) - # RAM offset mask - await dev_pf0_bar0.write_dword(0x0011d0, region_len-1) - await dev_pf0_bar0.write_dword(0x0011d4, 0) - # RAM stride - await dev_pf0_bar0.write_dword(0x0011d8, block_stride) - await dev_pf0_bar0.write_dword(0x0011dc, 0) - # clear cycle count - await dev_pf0_bar0.write_dword(0x001108, 0) - await dev_pf0_bar0.write_dword(0x00110c, 0) - # block length - await dev_pf0_bar0.write_dword(0x001110, block_size) - # block count - await dev_pf0_bar0.write_dword(0x001118, block_count) - await dev_pf0_bar0.write_dword(0x00111c, 0) - # start - await dev_pf0_bar0.write_dword(0x001100, 1) - - for k in range(10): - cnt = await dev_pf0_bar0.read_dword(0x001118) - await Timer(1000, 'ns') - if cnt == 0: - break + await dma_block_read_bench(tb, dev, mem_base+src_offset, region_len-1, 256, 256, 32) + await dma_block_write_bench(tb, dev, mem_base+dest_offset, region_len-1, 256, 256, 32) tb.log.info("%s", mem.hexdump_str(dest_offset, region_len)) assert mem[src_offset:src_offset+region_len] == mem[dest_offset:dest_offset+region_len] + tb.log.info("Test RX completion buffer (CPLH, 8)") + + tb.rc.split_on_all_rcb = True + + size = 8 + stride = size + for count in range(32, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base, region_len-1, size, stride, count, 2000) + + tb.log.info("Test RX completion buffer (CPLH, 8+64)") + + size = 8+64 + stride = 0 + for count in range(8, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base+128-8, region_len-1, size, stride, count, 2000) + + tb.log.info("Test RX completion buffer (CPLH, 8+128+8)") + + size = 8+128+8 + stride = 0 + for count in range(8, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base+128-8, region_len-1, size, stride, count, 2000) + + tb.rc.split_on_all_rcb = False + + tb.log.info("Test RX completion buffer (CPLD)") + + size = 512 + stride = size + for count in range(8, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base, region_len-1, size, stride, count, 4000) + + tb.log.info("Perform block reads") + + count = 100 + for size in [2**x for x in range(14)]: + stride = size + await dma_block_read_bench(tb, dev, mem_base, region_len-1, size, stride, count) + + tb.log.info("Perform block writes") + + count = 100 + for size in [2**x for x in range(14)]: + stride = size + await dma_block_write_bench(tb, dev, mem_base, region_len-1, size, stride, count) + await RisingEdge(dut.clk) await RisingEdge(dut.clk) diff --git a/fpga/lib/pcie/example/common/tb/example_core_pcie_s10/Makefile b/fpga/lib/pcie/example/common/tb/example_core_pcie_s10/Makefile index 6b629949c..fbb5c4899 100644 --- a/fpga/lib/pcie/example/common/tb/example_core_pcie_s10/Makefile +++ b/fpga/lib/pcie/example/common/tb/example_core_pcie_s10/Makefile @@ -57,7 +57,7 @@ VERILOG_SOURCES += ../../../../rtl/priority_encoder.v VERILOG_SOURCES += ../../../../rtl/pulse_merge.v # module parameters -export PARAM_SEG_COUNT := 1 +export PARAM_SEG_COUNT := 2 export PARAM_SEG_DATA_WIDTH := 256 export PARAM_SEG_EMPTY_WIDTH := $(shell python -c "print((($(PARAM_SEG_DATA_WIDTH)//32)-1).bit_length())" ) export PARAM_TX_SEQ_NUM_WIDTH := 6 @@ -69,7 +69,7 @@ export PARAM_IMM_WIDTH := 32 export PARAM_READ_OP_TABLE_SIZE := $(PARAM_PCIE_TAG_COUNT) export PARAM_READ_TX_LIMIT := $(shell echo "$$(( 1 << $(PARAM_TX_SEQ_NUM_WIDTH) ))" ) export PARAM_READ_CPLH_FC_LIMIT := 770 -export PARAM_READ_CPLD_FC_LIMIT := 2500 +export PARAM_READ_CPLD_FC_LIMIT := 2400 export PARAM_WRITE_OP_TABLE_SIZE := $(shell echo "$$(( 1 << $(PARAM_TX_SEQ_NUM_WIDTH) ))" ) export PARAM_WRITE_TX_LIMIT := $(shell echo "$$(( 1 << $(PARAM_TX_SEQ_NUM_WIDTH) ))" ) export PARAM_BAR0_APERTURE := 24 diff --git a/fpga/lib/pcie/example/common/tb/example_core_pcie_s10/test_example_core_pcie_s10.py b/fpga/lib/pcie/example/common/tb/example_core_pcie_s10/test_example_core_pcie_s10.py index b74f4a58e..d6cfac5d0 100644 --- a/fpga/lib/pcie/example/common/tb/example_core_pcie_s10/test_example_core_pcie_s10.py +++ b/fpga/lib/pcie/example/common/tb/example_core_pcie_s10/test_example_core_pcie_s10.py @@ -206,6 +206,211 @@ class TB(object): await self.rc.enumerate() +async def dma_block_read_bench(tb, dev, addr, mask, size, stride, count): + dev_pf0_bar0 = dev.bar_window[0] + + rd_req = await dev_pf0_bar0.read_dword(0x000020) + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) + + # configure operation (read) + # DMA base address + await dev_pf0_bar0.write_dword(0x001080, addr & 0xffffffff) + await dev_pf0_bar0.write_dword(0x001084, (addr >> 32) & 0xffffffff) + # DMA offset address + await dev_pf0_bar0.write_dword(0x001088, 0) + await dev_pf0_bar0.write_dword(0x00108c, 0) + # DMA offset mask + await dev_pf0_bar0.write_dword(0x001090, mask) + await dev_pf0_bar0.write_dword(0x001094, 0) + # DMA stride + await dev_pf0_bar0.write_dword(0x001098, stride) + await dev_pf0_bar0.write_dword(0x00109c, 0) + # RAM base address + await dev_pf0_bar0.write_dword(0x0010c0, 0) + await dev_pf0_bar0.write_dword(0x0010c4, 0) + # RAM offset address + await dev_pf0_bar0.write_dword(0x0010c8, 0) + await dev_pf0_bar0.write_dword(0x0010cc, 0) + # RAM offset mask + await dev_pf0_bar0.write_dword(0x0010d0, mask) + await dev_pf0_bar0.write_dword(0x0010d4, 0) + # RAM stride + await dev_pf0_bar0.write_dword(0x0010d8, stride) + await dev_pf0_bar0.write_dword(0x0010dc, 0) + # clear cycle count + await dev_pf0_bar0.write_dword(0x001008, 0) + await dev_pf0_bar0.write_dword(0x00100c, 0) + # block length + await dev_pf0_bar0.write_dword(0x001010, size) + # block count + await dev_pf0_bar0.write_dword(0x001018, count) + await dev_pf0_bar0.write_dword(0x00101c, 0) + + # start + await dev_pf0_bar0.write_dword(0x001000, 1) + + for k in range(1000): + await Timer(1000, 'ns') + run = await dev_pf0_bar0.read_dword(0x001000) + status = await dev_pf0_bar0.read_dword(0x000000) + if run == 0 and status & 0x300 == 0: + break + + if run != 0: + tb.log.warning("Operation timed out") + if status & 0x300 != 0: + tb.log.warning("DMA engine busy") + + cycles = await dev_pf0_bar0.read_dword(0x001008) + + rd_req = await dev_pf0_bar0.read_dword(0x000020) - rd_req + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) - rd_cpl + + tb.log.info("read %d blocks of %d bytes (total %d B, stride %d) in %d ns (%d req %d cpl) %d Mbps", + count, size, count*size, stride, cycles*4, rd_req, rd_cpl, size * count * 8 * 1000 / (cycles * 4)) + + assert status & 0x300 == 0 + + +async def dma_block_write_bench(tb, dev, addr, mask, size, stride, count): + dev_pf0_bar0 = dev.bar_window[0] + + wr_req = await dev_pf0_bar0.read_dword(0x000028) + + # configure operation (read) + # DMA base address + await dev_pf0_bar0.write_dword(0x001180, addr & 0xffffffff) + await dev_pf0_bar0.write_dword(0x001184, (addr >> 32) & 0xffffffff) + # DMA offset address + await dev_pf0_bar0.write_dword(0x001188, 0) + await dev_pf0_bar0.write_dword(0x00118c, 0) + # DMA offset mask + await dev_pf0_bar0.write_dword(0x001190, mask) + await dev_pf0_bar0.write_dword(0x001194, 0) + # DMA stride + await dev_pf0_bar0.write_dword(0x001198, stride) + await dev_pf0_bar0.write_dword(0x00119c, 0) + # RAM base address + await dev_pf0_bar0.write_dword(0x0011c0, 0) + await dev_pf0_bar0.write_dword(0x0011c4, 0) + # RAM offset address + await dev_pf0_bar0.write_dword(0x0011c8, 0) + await dev_pf0_bar0.write_dword(0x0011cc, 0) + # RAM offset mask + await dev_pf0_bar0.write_dword(0x0011d0, mask) + await dev_pf0_bar0.write_dword(0x0011d4, 0) + # RAM stride + await dev_pf0_bar0.write_dword(0x0011d8, stride) + await dev_pf0_bar0.write_dword(0x0011dc, 0) + # clear cycle count + await dev_pf0_bar0.write_dword(0x001108, 0) + await dev_pf0_bar0.write_dword(0x00110c, 0) + # block length + await dev_pf0_bar0.write_dword(0x001110, size) + # block count + await dev_pf0_bar0.write_dword(0x001118, count) + await dev_pf0_bar0.write_dword(0x00111c, 0) + + # start + await dev_pf0_bar0.write_dword(0x001100, 1) + + for k in range(1000): + await Timer(1000, 'ns') + run = await dev_pf0_bar0.read_dword(0x001100) + status = await dev_pf0_bar0.read_dword(0x000000) + if run == 0 and status & 0x300 == 0: + break + + if run != 0: + tb.log.warning("Operation timed out") + if status & 0x300 != 0: + tb.log.warning("DMA engine busy") + + cycles = await dev_pf0_bar0.read_dword(0x001108) + + wr_req = await dev_pf0_bar0.read_dword(0x000028) - wr_req + + tb.log.info("wrote %d blocks of %d bytes (total %d B, stride %d) in %d ns (%d req) %d Mbps", + count, size, count*size, stride, cycles*4, wr_req, size * count * 8 * 1000 / (cycles * 4)) + + assert status & 0x300 == 0 + + +async def dma_cpl_buf_test(tb, dev, addr, mask, size, stride, count, stall): + dev_pf0_bar0 = dev.bar_window[0] + + rd_req = await dev_pf0_bar0.read_dword(0x000020) + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) + + # configure operation (read) + # DMA base address + await dev_pf0_bar0.write_dword(0x001080, addr & 0xffffffff) + await dev_pf0_bar0.write_dword(0x001084, (addr >> 32) & 0xffffffff) + # DMA offset address + await dev_pf0_bar0.write_dword(0x001088, 0) + await dev_pf0_bar0.write_dword(0x00108c, 0) + # DMA offset mask + await dev_pf0_bar0.write_dword(0x001090, mask) + await dev_pf0_bar0.write_dword(0x001094, 0) + # DMA stride + await dev_pf0_bar0.write_dword(0x001098, stride) + await dev_pf0_bar0.write_dword(0x00109c, 0) + # RAM base address + await dev_pf0_bar0.write_dword(0x0010c0, 0) + await dev_pf0_bar0.write_dword(0x0010c4, 0) + # RAM offset address + await dev_pf0_bar0.write_dword(0x0010c8, 0) + await dev_pf0_bar0.write_dword(0x0010cc, 0) + # RAM offset mask + await dev_pf0_bar0.write_dword(0x0010d0, mask) + await dev_pf0_bar0.write_dword(0x0010d4, 0) + # RAM stride + await dev_pf0_bar0.write_dword(0x0010d8, stride) + await dev_pf0_bar0.write_dword(0x0010dc, 0) + # clear cycle count + await dev_pf0_bar0.write_dword(0x001008, 0) + await dev_pf0_bar0.write_dword(0x00100c, 0) + # block length + await dev_pf0_bar0.write_dword(0x001010, size) + # block count + await dev_pf0_bar0.write_dword(0x001018, count) + await dev_pf0_bar0.write_dword(0x00101c, 0) + + if stall: + # stall RX + await dev_pf0_bar0.write_dword(0x000040, stall) + + # start + await dev_pf0_bar0.write_dword(0x001000, 1) + + # wait for stall + if stall: + for k in range(stall): + await RisingEdge(tb.dut.clk) + + for k in range(100): + await Timer(1000, 'ns') + run = await dev_pf0_bar0.read_dword(0x001000) + status = await dev_pf0_bar0.read_dword(0x000000) + if run == 0 and status & 0x300 == 0: + break + + if run != 0: + tb.log.warning("Operation timed out") + if status & 0x300 != 0: + tb.log.warning("DMA engine busy") + + cycles = await dev_pf0_bar0.read_dword(0x001008) + + rd_req = await dev_pf0_bar0.read_dword(0x000020) - rd_req + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) - rd_cpl + + tb.log.info("read %d x %d B (total %d B %d CPLD, stride %d) in %d ns (%d req %d cpl) %d Mbps", + count, size, count*size, count*((size+15)//16), stride, cycles*4, rd_req, rd_cpl, size * count * 8 * 1000 / (cycles * 4)) + + assert status & 0x300 == 0 + + @cocotb.test() async def run_test(dut): @@ -257,6 +462,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000118) tb.log.info("Status: 0x%x", val) assert val == 0x800000AA @@ -271,6 +478,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000218) tb.log.info("Status: 0x%x", val) assert val == 0x80000055 @@ -291,6 +500,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000218) tb.log.info("Status: 0x%x", val) assert val == 0x800000AA @@ -301,110 +512,66 @@ async def run_test(dut): tb.log.info("Test DMA block operations") + # disable interrupts + await dev_pf0_bar0.write_dword(0x000008, 0) + region_len = 0x2000 src_offset = 0x0000 dest_offset = 0x4000 - block_size = 256 - block_stride = block_size - block_count = 32 - - # write packet data - mem[src_offset:src_offset+region_len] = bytearray([x % 256 for x in range(region_len)]) - - # enable DMA - await dev_pf0_bar0.write_dword(0x000000, 1) - # disable interrupts - await dev_pf0_bar0.write_dword(0x000008, 0) - - # configure operation (read) - # DMA base address - await dev_pf0_bar0.write_dword(0x001080, (mem_base+src_offset) & 0xffffffff) - await dev_pf0_bar0.write_dword(0x001084, (mem_base+src_offset >> 32) & 0xffffffff) - # DMA offset address - await dev_pf0_bar0.write_dword(0x001088, 0) - await dev_pf0_bar0.write_dword(0x00108c, 0) - # DMA offset mask - await dev_pf0_bar0.write_dword(0x001090, region_len-1) - await dev_pf0_bar0.write_dword(0x001094, 0) - # DMA stride - await dev_pf0_bar0.write_dword(0x001098, block_stride) - await dev_pf0_bar0.write_dword(0x00109c, 0) - # RAM base address - await dev_pf0_bar0.write_dword(0x0010c0, 0) - await dev_pf0_bar0.write_dword(0x0010c4, 0) - # RAM offset address - await dev_pf0_bar0.write_dword(0x0010c8, 0) - await dev_pf0_bar0.write_dword(0x0010cc, 0) - # RAM offset mask - await dev_pf0_bar0.write_dword(0x0010d0, region_len-1) - await dev_pf0_bar0.write_dword(0x0010d4, 0) - # RAM stride - await dev_pf0_bar0.write_dword(0x0010d8, block_stride) - await dev_pf0_bar0.write_dword(0x0010dc, 0) - # clear cycle count - await dev_pf0_bar0.write_dword(0x001008, 0) - await dev_pf0_bar0.write_dword(0x00100c, 0) - # block length - await dev_pf0_bar0.write_dword(0x001010, block_size) - # block count - await dev_pf0_bar0.write_dword(0x001018, block_count) - await dev_pf0_bar0.write_dword(0x00101c, 0) - # start - await dev_pf0_bar0.write_dword(0x001000, 1) - - for k in range(10): - cnt = await dev_pf0_bar0.read_dword(0x001018) - await Timer(1000, 'ns') - if cnt == 0: - break - - # configure operation (write) - # DMA base address - await dev_pf0_bar0.write_dword(0x001180, (mem_base+dest_offset) & 0xffffffff) - await dev_pf0_bar0.write_dword(0x001184, (mem_base+dest_offset >> 32) & 0xffffffff) - # DMA offset address - await dev_pf0_bar0.write_dword(0x001188, 0) - await dev_pf0_bar0.write_dword(0x00118c, 0) - # DMA offset mask - await dev_pf0_bar0.write_dword(0x001190, region_len-1) - await dev_pf0_bar0.write_dword(0x001194, 0) - # DMA stride - await dev_pf0_bar0.write_dword(0x001198, block_stride) - await dev_pf0_bar0.write_dword(0x00119c, 0) - # RAM base address - await dev_pf0_bar0.write_dword(0x0011c0, 0) - await dev_pf0_bar0.write_dword(0x0011c4, 0) - # RAM offset address - await dev_pf0_bar0.write_dword(0x0011c8, 0) - await dev_pf0_bar0.write_dword(0x0011cc, 0) - # RAM offset mask - await dev_pf0_bar0.write_dword(0x0011d0, region_len-1) - await dev_pf0_bar0.write_dword(0x0011d4, 0) - # RAM stride - await dev_pf0_bar0.write_dword(0x0011d8, block_stride) - await dev_pf0_bar0.write_dword(0x0011dc, 0) - # clear cycle count - await dev_pf0_bar0.write_dword(0x001108, 0) - await dev_pf0_bar0.write_dword(0x00110c, 0) - # block length - await dev_pf0_bar0.write_dword(0x001110, block_size) - # block count - await dev_pf0_bar0.write_dword(0x001118, block_count) - await dev_pf0_bar0.write_dword(0x00111c, 0) - # start - await dev_pf0_bar0.write_dword(0x001100, 1) - - for k in range(10): - cnt = await dev_pf0_bar0.read_dword(0x001118) - await Timer(1000, 'ns') - if cnt == 0: - break + await dma_block_read_bench(tb, dev, mem_base+src_offset, region_len-1, 256, 256, 32) + await dma_block_write_bench(tb, dev, mem_base+dest_offset, region_len-1, 256, 256, 32) tb.log.info("%s", mem.hexdump_str(dest_offset, region_len)) assert mem[src_offset:src_offset+region_len] == mem[dest_offset:dest_offset+region_len] + tb.log.info("Test RX completion buffer (CPLH, 8)") + + tb.rc.split_on_all_rcb = True + + size = 8 + stride = size + for count in range(32, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base, region_len-1, size, stride, count, 2000) + + tb.log.info("Test RX completion buffer (CPLH, 8+64)") + + size = 8+64 + stride = 0 + for count in range(8, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base+128-8, region_len-1, size, stride, count, 2000) + + tb.log.info("Test RX completion buffer (CPLH, 8+128+8)") + + size = 8+128+8 + stride = 0 + for count in range(8, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base+128-8, region_len-1, size, stride, count, 2000) + + tb.rc.split_on_all_rcb = False + + tb.log.info("Test RX completion buffer (CPLD)") + + size = 512 + stride = size + for count in range(8, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base, region_len-1, size, stride, count, 4000) + + tb.log.info("Perform block reads") + + count = 100 + for size in [2**x for x in range(14)]: + stride = size + await dma_block_read_bench(tb, dev, mem_base, region_len-1, size, stride, count) + + tb.log.info("Perform block writes") + + count = 100 + for size in [2**x for x in range(14)]: + stride = size + await dma_block_write_bench(tb, dev, mem_base, region_len-1, size, stride, count) + await RisingEdge(dut.clk) await RisingEdge(dut.clk) @@ -466,7 +633,7 @@ def test_example_core_pcie_s10(request, data_width, l_tile): parameters['READ_OP_TABLE_SIZE'] = parameters['PCIE_TAG_COUNT'] parameters['READ_TX_LIMIT'] = 2**parameters['TX_SEQ_NUM_WIDTH'] parameters['READ_CPLH_FC_LIMIT'] = 770 - parameters['READ_CPLD_FC_LIMIT'] = 2500 + parameters['READ_CPLD_FC_LIMIT'] = 2400 parameters['WRITE_OP_TABLE_SIZE'] = 2**parameters['TX_SEQ_NUM_WIDTH'] parameters['WRITE_TX_LIMIT'] = 2**parameters['TX_SEQ_NUM_WIDTH'] parameters['BAR0_APERTURE'] = 24 diff --git a/fpga/lib/pcie/example/common/tb/example_core_pcie_us/Makefile b/fpga/lib/pcie/example/common/tb/example_core_pcie_us/Makefile index 4f6ced3e1..fe6923cb2 100644 --- a/fpga/lib/pcie/example/common/tb/example_core_pcie_us/Makefile +++ b/fpga/lib/pcie/example/common/tb/example_core_pcie_us/Makefile @@ -74,8 +74,8 @@ export PARAM_IMM_ENABLE := 1 export PARAM_IMM_WIDTH := 32 export PARAM_READ_OP_TABLE_SIZE := $(PARAM_PCIE_TAG_COUNT) export PARAM_READ_TX_LIMIT := $(shell echo "$$(( 1 << ($(PARAM_RQ_SEQ_NUM_WIDTH)-1) ))" ) -export PARAM_READ_CPLH_FC_LIMIT := $(if $(filter-out 60,$(PARAM_AXIS_PCIE_RQ_USER_WIDTH)),64,128) -export PARAM_READ_CPLD_FC_LIMIT := $(if $(filter-out 60,$(PARAM_AXIS_PCIE_RQ_USER_WIDTH)),992,2048) +export PARAM_READ_CPLH_FC_LIMIT := $(if $(filter-out 60,$(PARAM_AXIS_PCIE_RQ_USER_WIDTH)),256,64) +export PARAM_READ_CPLD_FC_LIMIT := $(if $(filter-out 60,$(PARAM_AXIS_PCIE_RQ_USER_WIDTH)),1792,960) export PARAM_WRITE_OP_TABLE_SIZE := $(shell echo "$$(( 1 << ($(PARAM_RQ_SEQ_NUM_WIDTH)-1) ))" ) export PARAM_WRITE_TX_LIMIT := $(shell echo "$$(( 1 << ($(PARAM_RQ_SEQ_NUM_WIDTH)-1) ))" ) export PARAM_BAR0_APERTURE := 24 diff --git a/fpga/lib/pcie/example/common/tb/example_core_pcie_us/test_example_core_pcie_us.py b/fpga/lib/pcie/example/common/tb/example_core_pcie_us/test_example_core_pcie_us.py index e728be19c..b20144336 100644 --- a/fpga/lib/pcie/example/common/tb/example_core_pcie_us/test_example_core_pcie_us.py +++ b/fpga/lib/pcie/example/common/tb/example_core_pcie_us/test_example_core_pcie_us.py @@ -299,6 +299,211 @@ class TB(object): await self.rc.enumerate() +async def dma_block_read_bench(tb, dev, addr, mask, size, stride, count): + dev_pf0_bar0 = dev.bar_window[0] + + rd_req = await dev_pf0_bar0.read_dword(0x000020) + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) + + # configure operation (read) + # DMA base address + await dev_pf0_bar0.write_dword(0x001080, addr & 0xffffffff) + await dev_pf0_bar0.write_dword(0x001084, (addr >> 32) & 0xffffffff) + # DMA offset address + await dev_pf0_bar0.write_dword(0x001088, 0) + await dev_pf0_bar0.write_dword(0x00108c, 0) + # DMA offset mask + await dev_pf0_bar0.write_dword(0x001090, mask) + await dev_pf0_bar0.write_dword(0x001094, 0) + # DMA stride + await dev_pf0_bar0.write_dword(0x001098, stride) + await dev_pf0_bar0.write_dword(0x00109c, 0) + # RAM base address + await dev_pf0_bar0.write_dword(0x0010c0, 0) + await dev_pf0_bar0.write_dword(0x0010c4, 0) + # RAM offset address + await dev_pf0_bar0.write_dword(0x0010c8, 0) + await dev_pf0_bar0.write_dword(0x0010cc, 0) + # RAM offset mask + await dev_pf0_bar0.write_dword(0x0010d0, mask) + await dev_pf0_bar0.write_dword(0x0010d4, 0) + # RAM stride + await dev_pf0_bar0.write_dword(0x0010d8, stride) + await dev_pf0_bar0.write_dword(0x0010dc, 0) + # clear cycle count + await dev_pf0_bar0.write_dword(0x001008, 0) + await dev_pf0_bar0.write_dword(0x00100c, 0) + # block length + await dev_pf0_bar0.write_dword(0x001010, size) + # block count + await dev_pf0_bar0.write_dword(0x001018, count) + await dev_pf0_bar0.write_dword(0x00101c, 0) + + # start + await dev_pf0_bar0.write_dword(0x001000, 1) + + for k in range(1000): + await Timer(1000, 'ns') + run = await dev_pf0_bar0.read_dword(0x001000) + status = await dev_pf0_bar0.read_dword(0x000000) + if run == 0 and status & 0x300 == 0: + break + + if run != 0: + tb.log.warning("Operation timed out") + if status & 0x300 != 0: + tb.log.warning("DMA engine busy") + + cycles = await dev_pf0_bar0.read_dword(0x001008) + + rd_req = await dev_pf0_bar0.read_dword(0x000020) - rd_req + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) - rd_cpl + + tb.log.info("read %d blocks of %d bytes (total %d B, stride %d) in %d ns (%d req %d cpl) %d Mbps", + count, size, count*size, stride, cycles*4, rd_req, rd_cpl, size * count * 8 * 1000 / (cycles * 4)) + + assert status & 0x300 == 0 + + +async def dma_block_write_bench(tb, dev, addr, mask, size, stride, count): + dev_pf0_bar0 = dev.bar_window[0] + + wr_req = await dev_pf0_bar0.read_dword(0x000028) + + # configure operation (read) + # DMA base address + await dev_pf0_bar0.write_dword(0x001180, addr & 0xffffffff) + await dev_pf0_bar0.write_dword(0x001184, (addr >> 32) & 0xffffffff) + # DMA offset address + await dev_pf0_bar0.write_dword(0x001188, 0) + await dev_pf0_bar0.write_dword(0x00118c, 0) + # DMA offset mask + await dev_pf0_bar0.write_dword(0x001190, mask) + await dev_pf0_bar0.write_dword(0x001194, 0) + # DMA stride + await dev_pf0_bar0.write_dword(0x001198, stride) + await dev_pf0_bar0.write_dword(0x00119c, 0) + # RAM base address + await dev_pf0_bar0.write_dword(0x0011c0, 0) + await dev_pf0_bar0.write_dword(0x0011c4, 0) + # RAM offset address + await dev_pf0_bar0.write_dword(0x0011c8, 0) + await dev_pf0_bar0.write_dword(0x0011cc, 0) + # RAM offset mask + await dev_pf0_bar0.write_dword(0x0011d0, mask) + await dev_pf0_bar0.write_dword(0x0011d4, 0) + # RAM stride + await dev_pf0_bar0.write_dword(0x0011d8, stride) + await dev_pf0_bar0.write_dword(0x0011dc, 0) + # clear cycle count + await dev_pf0_bar0.write_dword(0x001108, 0) + await dev_pf0_bar0.write_dword(0x00110c, 0) + # block length + await dev_pf0_bar0.write_dword(0x001110, size) + # block count + await dev_pf0_bar0.write_dword(0x001118, count) + await dev_pf0_bar0.write_dword(0x00111c, 0) + + # start + await dev_pf0_bar0.write_dword(0x001100, 1) + + for k in range(1000): + await Timer(1000, 'ns') + run = await dev_pf0_bar0.read_dword(0x001100) + status = await dev_pf0_bar0.read_dword(0x000000) + if run == 0 and status & 0x300 == 0: + break + + if run != 0: + tb.log.warning("Operation timed out") + if status & 0x300 != 0: + tb.log.warning("DMA engine busy") + + cycles = await dev_pf0_bar0.read_dword(0x001108) + + wr_req = await dev_pf0_bar0.read_dword(0x000028) - wr_req + + tb.log.info("wrote %d blocks of %d bytes (total %d B, stride %d) in %d ns (%d req) %d Mbps", + count, size, count*size, stride, cycles*4, wr_req, size * count * 8 * 1000 / (cycles * 4)) + + assert status & 0x300 == 0 + + +async def dma_cpl_buf_test(tb, dev, addr, mask, size, stride, count, stall): + dev_pf0_bar0 = dev.bar_window[0] + + rd_req = await dev_pf0_bar0.read_dword(0x000020) + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) + + # configure operation (read) + # DMA base address + await dev_pf0_bar0.write_dword(0x001080, addr & 0xffffffff) + await dev_pf0_bar0.write_dword(0x001084, (addr >> 32) & 0xffffffff) + # DMA offset address + await dev_pf0_bar0.write_dword(0x001088, 0) + await dev_pf0_bar0.write_dword(0x00108c, 0) + # DMA offset mask + await dev_pf0_bar0.write_dword(0x001090, mask) + await dev_pf0_bar0.write_dword(0x001094, 0) + # DMA stride + await dev_pf0_bar0.write_dword(0x001098, stride) + await dev_pf0_bar0.write_dword(0x00109c, 0) + # RAM base address + await dev_pf0_bar0.write_dword(0x0010c0, 0) + await dev_pf0_bar0.write_dword(0x0010c4, 0) + # RAM offset address + await dev_pf0_bar0.write_dword(0x0010c8, 0) + await dev_pf0_bar0.write_dword(0x0010cc, 0) + # RAM offset mask + await dev_pf0_bar0.write_dword(0x0010d0, mask) + await dev_pf0_bar0.write_dword(0x0010d4, 0) + # RAM stride + await dev_pf0_bar0.write_dword(0x0010d8, stride) + await dev_pf0_bar0.write_dword(0x0010dc, 0) + # clear cycle count + await dev_pf0_bar0.write_dword(0x001008, 0) + await dev_pf0_bar0.write_dword(0x00100c, 0) + # block length + await dev_pf0_bar0.write_dword(0x001010, size) + # block count + await dev_pf0_bar0.write_dword(0x001018, count) + await dev_pf0_bar0.write_dword(0x00101c, 0) + + if stall: + # stall RX + await dev_pf0_bar0.write_dword(0x000040, stall) + + # start + await dev_pf0_bar0.write_dword(0x001000, 1) + + # wait for stall + if stall: + for k in range(stall): + await RisingEdge(tb.dut.clk) + + for k in range(100): + await Timer(1000, 'ns') + run = await dev_pf0_bar0.read_dword(0x001000) + status = await dev_pf0_bar0.read_dword(0x000000) + if run == 0 and status & 0x300 == 0: + break + + if run != 0: + tb.log.warning("Operation timed out") + if status & 0x300 != 0: + tb.log.warning("DMA engine busy") + + cycles = await dev_pf0_bar0.read_dword(0x001008) + + rd_req = await dev_pf0_bar0.read_dword(0x000020) - rd_req + rd_cpl = await dev_pf0_bar0.read_dword(0x000024) - rd_cpl + + tb.log.info("read %d x %d B (total %d B %d CPLD, stride %d) in %d ns (%d req %d cpl) %d Mbps", + count, size, count*size, count*((size+15)//16), stride, cycles*4, rd_req, rd_cpl, size * count * 8 * 1000 / (cycles * 4)) + + assert status & 0x300 == 0 + + @cocotb.test() async def run_test(dut): @@ -350,6 +555,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000118) tb.log.info("Status: 0x%x", val) assert val == 0x800000AA @@ -364,6 +571,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000218) tb.log.info("Status: 0x%x", val) assert val == 0x80000055 @@ -384,6 +593,8 @@ async def run_test(dut): await Timer(2000, 'ns') # read status + status = await dev_pf0_bar0.read_dword(0x000000) + tb.log.info("DMA Status: 0x%x", status) val = await dev_pf0_bar0.read_dword(0x000218) tb.log.info("Status: 0x%x", val) assert val == 0x800000AA @@ -394,112 +605,66 @@ async def run_test(dut): tb.log.info("Test DMA block operations") + # disable interrupts + await dev_pf0_bar0.write_dword(0x000008, 0) + region_len = 0x2000 src_offset = 0x0000 dest_offset = 0x4000 - block_size = 256 - block_stride = block_size - block_count = 32 - - # write packet data - mem[src_offset:src_offset+region_len] = bytearray([x % 256 for x in range(region_len)]) - - # enable DMA - await dev_pf0_bar0.write_dword(0x000000, 1) - # disable interrupts - await dev_pf0_bar0.write_dword(0x000008, 0) - - # configure operation (read) - # DMA base address - await dev_pf0_bar0.write_dword(0x001080, (mem_base+src_offset) & 0xffffffff) - await dev_pf0_bar0.write_dword(0x001084, (mem_base+src_offset >> 32) & 0xffffffff) - # DMA offset address - await dev_pf0_bar0.write_dword(0x001088, 0) - await dev_pf0_bar0.write_dword(0x00108c, 0) - # DMA offset mask - await dev_pf0_bar0.write_dword(0x001090, region_len-1) - await dev_pf0_bar0.write_dword(0x001094, 0) - # DMA stride - await dev_pf0_bar0.write_dword(0x001098, block_stride) - await dev_pf0_bar0.write_dword(0x00109c, 0) - # RAM base address - await dev_pf0_bar0.write_dword(0x0010c0, 0) - await dev_pf0_bar0.write_dword(0x0010c4, 0) - # RAM offset address - await dev_pf0_bar0.write_dword(0x0010c8, 0) - await dev_pf0_bar0.write_dword(0x0010cc, 0) - # RAM offset mask - await dev_pf0_bar0.write_dword(0x0010d0, region_len-1) - await dev_pf0_bar0.write_dword(0x0010d4, 0) - # RAM stride - await dev_pf0_bar0.write_dword(0x0010d8, block_stride) - await dev_pf0_bar0.write_dword(0x0010dc, 0) - # clear cycle count - await dev_pf0_bar0.write_dword(0x001008, 0) - await dev_pf0_bar0.write_dword(0x00100c, 0) - # block length - await dev_pf0_bar0.write_dword(0x001010, block_size) - # block count - await dev_pf0_bar0.write_dword(0x001018, block_count) - await dev_pf0_bar0.write_dword(0x00101c, 0) - # start - await dev_pf0_bar0.write_dword(0x001000, 1) - - for k in range(10): - cnt = await dev_pf0_bar0.read_dword(0x001018) - await Timer(1000, 'ns') - if cnt == 0: - break - - # configure operation (write) - # DMA base address - await dev_pf0_bar0.write_dword(0x001180, (mem_base+dest_offset) & 0xffffffff) - await dev_pf0_bar0.write_dword(0x001184, (mem_base+dest_offset >> 32) & 0xffffffff) - # DMA offset address - await dev_pf0_bar0.write_dword(0x001188, 0) - await dev_pf0_bar0.write_dword(0x00118c, 0) - # DMA offset mask - await dev_pf0_bar0.write_dword(0x001190, region_len-1) - await dev_pf0_bar0.write_dword(0x001194, 0) - # DMA stride - await dev_pf0_bar0.write_dword(0x001198, block_stride) - await dev_pf0_bar0.write_dword(0x00119c, 0) - # RAM base address - await dev_pf0_bar0.write_dword(0x0011c0, 0) - await dev_pf0_bar0.write_dword(0x0011c4, 0) - # RAM offset address - await dev_pf0_bar0.write_dword(0x0011c8, 0) - await dev_pf0_bar0.write_dword(0x0011cc, 0) - # RAM offset mask - await dev_pf0_bar0.write_dword(0x0011d0, region_len-1) - await dev_pf0_bar0.write_dword(0x0011d4, 0) - # RAM stride - await dev_pf0_bar0.write_dword(0x0011d8, block_stride) - await dev_pf0_bar0.write_dword(0x0011dc, 0) - # clear cycle count - await dev_pf0_bar0.write_dword(0x001108, 0) - await dev_pf0_bar0.write_dword(0x00110c, 0) - # block length - await dev_pf0_bar0.write_dword(0x001110, block_size) - # block count - await dev_pf0_bar0.write_dword(0x001118, block_count) - await dev_pf0_bar0.write_dword(0x00111c, 0) - # start - await dev_pf0_bar0.write_dword(0x001100, 1) - - for k in range(10): - cnt = await dev_pf0_bar0.read_dword(0x001118) - await Timer(1000, 'ns') - if cnt == 0: - break - - await Timer(2000, 'ns') + await dma_block_read_bench(tb, dev, mem_base+src_offset, region_len-1, 256, 256, 32) + await dma_block_write_bench(tb, dev, mem_base+dest_offset, region_len-1, 256, 256, 32) tb.log.info("%s", mem.hexdump_str(dest_offset, region_len)) assert mem[src_offset:src_offset+region_len] == mem[dest_offset:dest_offset+region_len] + tb.log.info("Test RX completion buffer (CPLH, 8)") + + tb.rc.split_on_all_rcb = True + + size = 8 + stride = size + for count in range(32, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base, region_len-1, size, stride, count, 2000) + + tb.log.info("Test RX completion buffer (CPLH, 8+64)") + + size = 8+64 + stride = 0 + for count in range(8, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base+128-8, region_len-1, size, stride, count, 2000) + + tb.log.info("Test RX completion buffer (CPLH, 8+128+8)") + + size = 8+128+8 + stride = 0 + for count in range(8, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base+128-8, region_len-1, size, stride, count, 2000) + + tb.rc.split_on_all_rcb = False + + tb.log.info("Test RX completion buffer (CPLD)") + + size = 512 + stride = size + for count in range(8, 256+1, 8): + await dma_cpl_buf_test(tb, dev, mem_base, region_len-1, size, stride, count, 4000) + + tb.log.info("Perform block reads") + + count = 100 + for size in [2**x for x in range(14)]: + stride = size + await dma_block_read_bench(tb, dev, mem_base, region_len-1, size, stride, count) + + tb.log.info("Perform block writes") + + count = 100 + for size in [2**x for x in range(14)]: + stride = size + await dma_block_write_bench(tb, dev, mem_base, region_len-1, size, stride, count) + await RisingEdge(dut.clk) await RisingEdge(dut.clk) @@ -566,8 +731,8 @@ def test_example_core_pcie_us(request, axis_pcie_data_width, straddle): parameters['IMM_WIDTH'] = 32 parameters['READ_OP_TABLE_SIZE'] = parameters['PCIE_TAG_COUNT'] parameters['READ_TX_LIMIT'] = 2**(parameters['RQ_SEQ_NUM_WIDTH']-1) - parameters['READ_CPLH_FC_LIMIT'] = 64 if parameters['AXIS_PCIE_RQ_USER_WIDTH'] == 60 else 128 - parameters['READ_CPLD_FC_LIMIT'] = 992 if parameters['AXIS_PCIE_RQ_USER_WIDTH'] == 60 else 2048 + parameters['READ_CPLH_FC_LIMIT'] = 64 if parameters['AXIS_PCIE_RQ_USER_WIDTH'] == 60 else 256 + parameters['READ_CPLD_FC_LIMIT'] = 1024-64 if parameters['AXIS_PCIE_RQ_USER_WIDTH'] == 60 else 2048-256 parameters['WRITE_OP_TABLE_SIZE'] = 2**(parameters['RQ_SEQ_NUM_WIDTH']-1) parameters['WRITE_TX_LIMIT'] = 2**(parameters['RQ_SEQ_NUM_WIDTH']-1) parameters['BAR0_APERTURE'] = 24 diff --git a/fpga/lib/pcie/example/fb2CG/fpga/rtl/fpga.v b/fpga/lib/pcie/example/fb2CG/fpga/rtl/fpga.v index 78db5d5e9..04ed89ded 100644 --- a/fpga/lib/pcie/example/fb2CG/fpga/rtl/fpga.v +++ b/fpga/lib/pcie/example/fb2CG/fpga/rtl/fpga.v @@ -56,7 +56,7 @@ module fpga ( parameter AXIS_PCIE_DATA_WIDTH = 512; parameter AXIS_PCIE_KEEP_WIDTH = (AXIS_PCIE_DATA_WIDTH/32); parameter AXIS_PCIE_RC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 75 : 161; -parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 60 : 137; +parameter AXIS_PCIE_RQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 62 : 137; parameter AXIS_PCIE_CQ_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 85 : 183; parameter AXIS_PCIE_CC_USER_WIDTH = AXIS_PCIE_DATA_WIDTH < 512 ? 33 : 81; parameter RC_STRADDLE = AXIS_PCIE_DATA_WIDTH >= 256; diff --git a/fpga/lib/pcie/example/fb2CG/fpga/rtl/fpga_core.v b/fpga/lib/pcie/example/fb2CG/fpga/rtl/fpga_core.v index 10f534459..aa479324a 100644 --- a/fpga/lib/pcie/example/fb2CG/fpga/rtl/fpga_core.v +++ b/fpga/lib/pcie/example/fb2CG/fpga/rtl/fpga_core.v @@ -161,8 +161,8 @@ example_core_pcie_us #( .PCIE_TAG_COUNT(PCIE_TAG_COUNT), .READ_OP_TABLE_SIZE(PCIE_TAG_COUNT), .READ_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), - .READ_CPLH_FC_LIMIT(128), - .READ_CPLD_FC_LIMIT(2048), + .READ_CPLH_FC_LIMIT(256), + .READ_CPLD_FC_LIMIT(2048-256), .WRITE_OP_TABLE_SIZE(2**(RQ_SEQ_NUM_WIDTH-1)), .WRITE_TX_LIMIT(2**(RQ_SEQ_NUM_WIDTH-1)), .BAR0_APERTURE(BAR0_APERTURE), @@ -265,8 +265,7 @@ example_core_pcie_us_inst ( */ .cfg_max_read_req(cfg_max_read_req), .cfg_max_payload(cfg_max_payload), - // .cfg_rcb_status(cfg_rcb_status), - .cfg_rcb_status(1'b1), // force RCB 128 due to insufficient CPLH limit in US+ PCIe HIP + .cfg_rcb_status(cfg_rcb_status), /* * Status diff --git a/fpga/lib/pcie/example/fb2CG/fpga_axi/tb/fpga_core/Makefile b/fpga/lib/pcie/example/fb2CG/fpga_axi/tb/fpga_core/Makefile index e730af529..b09d0d3bf 100644 --- a/fpga/lib/pcie/example/fb2CG/fpga_axi/tb/fpga_core/Makefile +++ b/fpga/lib/pcie/example/fb2CG/fpga_axi/tb/fpga_core/Makefile @@ -54,7 +54,6 @@ export PARAM_AXIS_PCIE_RQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_ export PARAM_AXIS_PCIE_RC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),75,161) export PARAM_AXIS_PCIE_CQ_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),88,183) export PARAM_AXIS_PCIE_CC_USER_WIDTH := $(if $(filter-out 512,$(PARAM_AXIS_PCIE_DATA_WIDTH)),33,81) -export PARAM_RQ_SEQ_NUM_WIDTH := 6 ifeq ($(SIM), icarus) PLUSARGS += -fst diff --git a/fpga/lib/pcie/example/fb2CG/fpga_axi/tb/fpga_core/test_fpga_core.py b/fpga/lib/pcie/example/fb2CG/fpga_axi/tb/fpga_core/test_fpga_core.py index 80b4057a8..ee8e602bc 100644 --- a/fpga/lib/pcie/example/fb2CG/fpga_axi/tb/fpga_core/test_fpga_core.py +++ b/fpga/lib/pcie/example/fb2CG/fpga_axi/tb/fpga_core/test_fpga_core.py @@ -396,7 +396,6 @@ def test_fpga_core(request): parameters['AXIS_PCIE_RC_USER_WIDTH'] = 75 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 161 parameters['AXIS_PCIE_CQ_USER_WIDTH'] = 88 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 183 parameters['AXIS_PCIE_CC_USER_WIDTH'] = 33 if parameters['AXIS_PCIE_DATA_WIDTH'] < 512 else 81 - parameters['RQ_SEQ_NUM_WIDTH'] = 6 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/rtl/dma_if_pcie_rd.v b/fpga/lib/pcie/rtl/dma_if_pcie_rd.v index a238b0414..3e0801e7f 100644 --- a/fpga/lib/pcie/rtl/dma_if_pcie_rd.v +++ b/fpga/lib/pcie/rtl/dma_if_pcie_rd.v @@ -422,13 +422,13 @@ reg [OP_TAG_WIDTH+1-1:0] active_op_count_reg = 0; reg inc_active_op; reg dec_active_op; -reg [CL_CPLH_FC_LIMIT+1-1:0] active_cplh_fc_count_reg = 0; -reg active_cplh_fc_av_reg = 1'b1; +reg [CL_CPLH_FC_LIMIT+1-1:0] active_cplh_fc_count_reg = 0, active_cplh_fc_count_next; +reg active_cplh_fc_av_reg = 1'b1, active_cplh_fc_av_next; reg [6:0] inc_active_cplh_fc_count; reg [6:0] dec_active_cplh_fc_count; -reg [CL_CPLD_FC_LIMIT+1-1:0] active_cpld_fc_count_reg = 0; -reg active_cpld_fc_av_reg = 1'b1; +reg [CL_CPLD_FC_LIMIT+1-1:0] active_cpld_fc_count_reg = 0, active_cpld_fc_count_next; +reg active_cpld_fc_av_reg = 1'b1, active_cpld_fc_av_next; reg [8:0] inc_active_cpld_fc_count; reg [8:0] dec_active_cpld_fc_count; @@ -1382,6 +1382,12 @@ always @* begin end active_tx_count_av_next = active_tx_count_next < TX_LIMIT; + + active_cplh_fc_count_next <= active_cplh_fc_count_reg + inc_active_cplh_fc_count - dec_active_cplh_fc_count; + active_cplh_fc_av_next <= !CPLH_FC_LIMIT || active_cplh_fc_count_next < CPLH_FC_LIMIT; + + active_cpld_fc_count_next <= active_cpld_fc_count_reg + inc_active_cpld_fc_count - dec_active_cpld_fc_count; + active_cpld_fc_av_next <= !CPLD_FC_LIMIT || active_cpld_fc_count_next < CPLD_FC_LIMIT; end always @(posedge clk) begin @@ -1501,11 +1507,11 @@ always @(posedge clk) begin active_tag_count_reg <= active_tag_count_reg + inc_active_tag - dec_active_tag; active_op_count_reg <= active_op_count_reg + inc_active_op - dec_active_op; - active_cplh_fc_count_reg <= active_cplh_fc_count_reg + inc_active_cplh_fc_count - dec_active_cplh_fc_count; - active_cplh_fc_av_reg <= !CPLH_FC_LIMIT || active_cplh_fc_count_reg < CPLH_FC_LIMIT; + active_cplh_fc_count_reg <= active_cplh_fc_count_next; + active_cplh_fc_av_reg <= active_cplh_fc_av_next; - active_cpld_fc_count_reg <= active_cpld_fc_count_reg + inc_active_cpld_fc_count - dec_active_cpld_fc_count; - active_cpld_fc_av_reg <= !CPLD_FC_LIMIT || active_cpld_fc_count_reg < CPLD_FC_LIMIT; + active_cpld_fc_count_reg <= active_cpld_fc_count_next; + active_cpld_fc_av_reg <= active_cpld_fc_av_next; pcie_tag_table_start_ptr_reg <= pcie_tag_table_start_ptr_next; pcie_tag_table_start_ram_sel_reg <= pcie_tag_table_start_ram_sel_next; diff --git a/fpga/lib/pcie/rtl/pcie_tlp_fifo_mux.v b/fpga/lib/pcie/rtl/pcie_tlp_fifo_mux.v index f347d1d32..847b2325c 100644 --- a/fpga/lib/pcie/rtl/pcie_tlp_fifo_mux.v +++ b/fpga/lib/pcie/rtl/pcie_tlp_fifo_mux.v @@ -347,7 +347,7 @@ always @* begin // compute mux settings for (port = 0; port < PORTS; port = port + 1) begin - port_seg_valid[port] = pause[port] ? 0 : {2{fifo_ctrl_tlp_valid[port]}} >> fifo_ctrl_seg_offset[port]; + port_seg_valid[port] = {2{fifo_ctrl_tlp_valid[port]}} >> fifo_ctrl_seg_offset[port]; port_seg_eop[port] = {2{fifo_ctrl_tlp_eop[port]}} >> fifo_ctrl_seg_offset[port]; end @@ -383,7 +383,7 @@ always @* begin port_cyc = cur_port; seg_offset_cyc = port_seg_offset_cyc[cur_port]; seg_count_cyc = port_seg_count_cyc[cur_port]; - if (port_seg_valid[cur_port][0]) begin + if (!pause[cur_port] && port_seg_valid[cur_port][0]) begin // set frame frame_cyc = 1; sel_tlp_seq_valid_cyc[OUT_TLP_SEG_COUNT*cur_port+seg] = 1'b1; diff --git a/fpga/lib/pcie/tb/pcie_msix/Makefile b/fpga/lib/pcie/tb/pcie_msix/Makefile index d94fab9c5..f3b4d1d86 100644 --- a/fpga/lib/pcie/tb/pcie_msix/Makefile +++ b/fpga/lib/pcie/tb/pcie_msix/Makefile @@ -36,10 +36,7 @@ export PARAM_IRQ_INDEX_WIDTH := 11 export PARAM_AXIL_DATA_WIDTH := 32 export PARAM_AXIL_ADDR_WIDTH := $(shell expr $(PARAM_IRQ_INDEX_WIDTH) + 5 ) export PARAM_AXIL_STRB_WIDTH := $(shell expr $(PARAM_AXIL_DATA_WIDTH) / 8 ) -export PARAM_TLP_DATA_WIDTH := 64 -export PARAM_TLP_STRB_WIDTH := $(shell expr $(PARAM_TLP_DATA_WIDTH) / 32 ) export PARAM_TLP_HDR_WIDTH := 128 -export PARAM_TLP_SEG_COUNT := 1 export PARAM_TLP_FORCE_64_BIT_ADDR := 0 ifeq ($(SIM), icarus) diff --git a/fpga/lib/pcie/tb/pcie_msix/test_pcie_msix.py b/fpga/lib/pcie/tb/pcie_msix/test_pcie_msix.py index c1c054e28..2115dec60 100644 --- a/fpga/lib/pcie/tb/pcie_msix/test_pcie_msix.py +++ b/fpga/lib/pcie/tb/pcie_msix/test_pcie_msix.py @@ -319,8 +319,7 @@ rtl_dir = os.path.abspath(os.path.join(tests_dir, '..', '..', 'rtl')) @pytest.mark.parametrize("axil_data_width", [32, 64]) -@pytest.mark.parametrize("pcie_data_width", [64, 128]) -def test_pcie_msix(request, pcie_data_width, axil_data_width): +def test_pcie_msix(request, axil_data_width): dut = "pcie_msix" module = os.path.splitext(os.path.basename(__file__))[0] toplevel = dut @@ -335,10 +334,7 @@ def test_pcie_msix(request, pcie_data_width, axil_data_width): parameters['AXIL_DATA_WIDTH'] = axil_data_width parameters['AXIL_ADDR_WIDTH'] = parameters['IRQ_INDEX_WIDTH']+5 parameters['AXIL_STRB_WIDTH'] = (axil_data_width // 8) - parameters['TLP_DATA_WIDTH'] = pcie_data_width - parameters['TLP_STRB_WIDTH'] = pcie_data_width // 32 parameters['TLP_HDR_WIDTH'] = 128 - parameters['TLP_SEG_COUNT'] = 1 parameters['TLP_FORCE_64_BIT_ADDR'] = 0 extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()} diff --git a/fpga/lib/pcie/tb/pcie_us_axi_dma_wr/test_pcie_us_axi_dma_wr.py b/fpga/lib/pcie/tb/pcie_us_axi_dma_wr/test_pcie_us_axi_dma_wr.py index 4d6c9e7d4..472e0e51f 100644 --- a/fpga/lib/pcie/tb/pcie_us_axi_dma_wr/test_pcie_us_axi_dma_wr.py +++ b/fpga/lib/pcie/tb/pcie_us_axi_dma_wr/test_pcie_us_axi_dma_wr.py @@ -270,9 +270,6 @@ def test_pcie_us_axi_dma_wr(request, axis_pcie_data_width, pcie_offset): parameters['AXI_ID_WIDTH'] = 8 parameters['AXI_MAX_BURST_LEN'] = 256 parameters['PCIE_ADDR_WIDTH'] = 64 - parameters['PCIE_TAG_COUNT'] = 64 if parameters['AXIS_PCIE_RQ_USER_WIDTH'] == 60 else 256 - parameters['PCIE_TAG_WIDTH'] = (parameters['PCIE_TAG_COUNT']-1).bit_length() - parameters['PCIE_EXT_TAG_ENABLE'] = int(parameters['PCIE_TAG_COUNT'] > 32) parameters['LEN_WIDTH'] = 20 parameters['TAG_WIDTH'] = 8 parameters['OP_TABLE_SIZE'] = 2**(parameters['RQ_SEQ_NUM_WIDTH']-1) diff --git a/fpga/lib/pcie/tox.ini b/fpga/lib/pcie/tox.ini index 897b5a325..234ac0741 100644 --- a/fpga/lib/pcie/tox.ini +++ b/fpga/lib/pcie/tox.ini @@ -18,7 +18,7 @@ deps = cocotb-bus == 0.2.1 cocotb-test == 0.2.4 cocotbext-axi == 0.1.24 - cocotbext-pcie == 0.2.12 + cocotbext-pcie == 0.2.14 jinja2 == 3.1.2 commands =