1
0
mirror of https://github.com/KastnerRG/riffa.git synced 2025-01-30 23:02:54 +08:00
riffa/fpga/riffa_hdl/reorder_queue_input.v
2016-02-09 15:23:37 -08:00

347 lines
17 KiB
Verilog

// ----------------------------------------------------------------------
// Copyright (c) 2016, The Regents of the University of California All
// rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
//
// * Neither the name of The Regents of the University of California
// nor the names of its contributors may be used to endorse or
// promote products derived from this software without specific
// prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL REGENTS OF THE
// UNIVERSITY OF CALIFORNIA BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
// USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
// DAMAGE.
// ----------------------------------------------------------------------
//----------------------------------------------------------------------------
// Filename: reorder_queue_input.v
// Version: 1.00
// Verilog Standard: Verilog-2005
// Description: Input stage to the reorder-queue.
// Author: Dustin Richmond (@darichmond)
//-----------------------------------------------------------------------------
`include "trellis.vh"
`timescale 1ns / 1ps
module reorder_queue_input
#(parameter C_PCI_DATA_WIDTH = 9'd128,
parameter C_TAG_WIDTH = 5, // Number of outstanding requests
parameter C_TAG_DW_COUNT_WIDTH = 8,// Width of max count DWs per packet
parameter C_DATA_ADDR_STRIDE_WIDTH = 5,// Width of max num stored data addr positions per tag
parameter C_DATA_ADDR_WIDTH = 10, // Width of stored data address
// Local parameters
parameter C_PCI_DATA_WORD = C_PCI_DATA_WIDTH/32,
parameter C_PCI_DATA_WORD_WIDTH = clog2s(C_PCI_DATA_WORD),
parameter C_PCI_DATA_COUNT_WIDTH = clog2s(C_PCI_DATA_WORD+1),
parameter C_NUM_TAGS = 2**C_TAG_WIDTH)
(input CLK, // Clock
input RST, // Synchronous reset
input VALID, // Valid input packet
input [C_PCI_DATA_WIDTH-1:0] DATA, // Input packet payload data enable
input [(C_PCI_DATA_WIDTH/32)-1:0] DATA_EN, // Input packet payload data enable
input DATA_START_FLAG, // Input packet payload
input [clog2s(C_PCI_DATA_WIDTH/32)-1:0] DATA_START_OFFSET, // Input packet payload data enable count
input DATA_END_FLAG, // Input packet payload
input [clog2s(C_PCI_DATA_WIDTH/32)-1:0] DATA_END_OFFSET, // Input packet payload data enable count
input DONE, // Input packet done
input ERR, // Input packet has error
input [C_TAG_WIDTH-1:0] TAG, // Input packet tag (external tag)
output [C_NUM_TAGS-1:0] TAG_FINISH, // Bitmap of tags to finish
input [C_NUM_TAGS-1:0] TAG_CLEAR, // Bitmap of tags to clear
output [(C_DATA_ADDR_WIDTH*C_PCI_DATA_WORD)-1:0] STORED_DATA_ADDR, // Address of stored packet data for RAMs
output [C_PCI_DATA_WIDTH-1:0] STORED_DATA, // Stored packet data for RAMs
output [C_PCI_DATA_WORD-1:0] STORED_DATA_EN, // Stored packet data enable for RAMs
output PKT_VALID, // Valid flag for packet data
output [C_TAG_WIDTH-1:0] PKT_TAG, // Tag for stored packet data
output [C_TAG_DW_COUNT_WIDTH-1:0] PKT_WORDS, // Total count of stored packet payload in DWs
output PKT_WORDS_LTE1, // True if total count of stored packet payload is <= 4 DWs
output PKT_WORDS_LTE2, // True if total count of stored packet payload is <= 8 DWs
output PKT_DONE, // Stored packet done flag
output PKT_ERR); // Stored packet error flag
wire [C_PCI_DATA_COUNT_WIDTH-1:0] wDECount;
wire [C_PCI_DATA_WORD-1:0] wDE;
wire [C_PCI_DATA_WIDTH-1:0] wData;
wire [C_PCI_DATA_WORD-1:0] wStartMask;
wire [C_PCI_DATA_WORD-1:0] wEndMask;
reg [5:0] rValid=0;
reg [(C_PCI_DATA_WIDTH*5)-1:0] rData=0;
reg [(C_PCI_DATA_WORD*3)-1:0] rDE=0;
reg [(C_PCI_DATA_COUNT_WIDTH*2)-1:0] rDECount=0;
reg [5:0] rDone=0;
reg [5:0] rErr=0;
reg [(C_TAG_WIDTH*6)-1:0] rTag=0;
reg [C_PCI_DATA_WORD-1:0] rDEShift=0;
reg [(C_PCI_DATA_WORD*2)-1:0] rDEShifted=0;
reg rCountValid=0;
reg [C_NUM_TAGS-1:0] rCountRst=0;
reg [C_NUM_TAGS-1:0] rValidCount=0;
reg rUseCurrCount=0;
reg rUsePrevCount=0;
reg [C_TAG_DW_COUNT_WIDTH-1:0] rPrevCount=0;
reg [C_TAG_DW_COUNT_WIDTH-1:0] rCount=0;
wire [C_TAG_DW_COUNT_WIDTH-1:0] wCount;
wire [C_TAG_DW_COUNT_WIDTH-1:0] wCountClr = wCount & {C_TAG_DW_COUNT_WIDTH{rCountValid}};
reg [(C_TAG_DW_COUNT_WIDTH*3)-1:0] rWords=0;
reg [C_PCI_DATA_WORD_WIDTH-1:0] rShift=0;
reg [C_PCI_DATA_WORD_WIDTH-1:0] rShifted=0;
reg rPosValid=0;
reg [C_NUM_TAGS-1:0] rPosRst=0;
reg [C_NUM_TAGS-1:0] rValidPos=0;
reg rUseCurrPos=0;
reg rUsePrevPos=0;
reg [(C_DATA_ADDR_STRIDE_WIDTH*C_PCI_DATA_WORD)-1:0] rPrevPos=0;
reg [(C_DATA_ADDR_STRIDE_WIDTH*C_PCI_DATA_WORD)-1:0] rPosNow=0;
reg [(C_DATA_ADDR_STRIDE_WIDTH*C_PCI_DATA_WORD)-1:0] rPos=0;
wire [(C_DATA_ADDR_STRIDE_WIDTH*C_PCI_DATA_WORD)-1:0] wPos;
wire [(C_DATA_ADDR_STRIDE_WIDTH*C_PCI_DATA_WORD)-1:0] wPosClr = wPos & {C_DATA_ADDR_STRIDE_WIDTH*C_PCI_DATA_WORD{rPosValid}};
reg [(C_DATA_ADDR_WIDTH*C_PCI_DATA_WORD)-1:0] rAddr=0;
reg [(C_PCI_DATA_WORD_WIDTH+5)-1:0] rShiftUp=0;
reg [(C_PCI_DATA_WORD_WIDTH+5)-1:0] rShiftDown=0;
reg [C_DATA_ADDR_WIDTH-1:0] rBaseAddr=0;
reg [C_PCI_DATA_WIDTH-1:0] rDataShifted=0;
reg rLTE1Pkt=0;
reg rLTE2Pkt=0;
reg [C_NUM_TAGS-1:0] rFinish=0;
wire [31:0] wZero=32'd0;
integer i;
assign wDE = DATA_EN >> (DATA_START_FLAG ? DATA_START_OFFSET : 0);/* TODO: Could move this to the RX Engine*/
assign wData = DATA >> (DATA_START_FLAG ? {DATA_START_OFFSET,5'b0} : 0);
generate
if(C_PCI_DATA_WIDTH == 32) begin
assign wDECount = VALID ? 1 : 0;
end
if(C_PCI_DATA_WIDTH == 64) begin
assign wDECount = VALID ? DATA_EN[1] + DATA_EN[0] : 0;
end
if(C_PCI_DATA_WIDTH == 128) begin
assign wDECount = VALID ? DATA_EN[3] + DATA_EN[2] + DATA_EN[1] + DATA_EN[0] : 0;
end
if(C_PCI_DATA_WIDTH == 256) begin
assign wDECount = VALID ? DATA_EN[7] + DATA_EN[6] + DATA_EN[5] + DATA_EN[4] +
DATA_EN[3] + DATA_EN[2] + DATA_EN[1] + DATA_EN[0] : 0;
end
endgenerate
assign TAG_FINISH = rFinish;
assign STORED_DATA_ADDR = rAddr;
assign STORED_DATA = rDataShifted;
assign STORED_DATA_EN = rDEShifted[1*C_PCI_DATA_WORD +:C_PCI_DATA_WORD];
assign PKT_VALID = rValid[5];
assign PKT_TAG = rTag[5*C_TAG_WIDTH +:C_TAG_WIDTH];
assign PKT_WORDS = rWords[2*C_TAG_DW_COUNT_WIDTH +:C_TAG_DW_COUNT_WIDTH];
assign PKT_WORDS_LTE1 = rLTE1Pkt;
assign PKT_WORDS_LTE2 = rLTE2Pkt;
assign PKT_DONE = rDone[5];
assign PKT_ERR = rErr[5];
// Pipeline the input and intermediate data
always @ (posedge CLK) begin
if (RST) begin
rValid <= #1 0;
rTag <= #1 0;
end
else begin
rValid <= #1 (rValid<<1) | VALID;
rTag <= #1 (rTag<<C_TAG_WIDTH) | TAG;
end
rData <= #1 (rData<<C_PCI_DATA_WIDTH) | wData;
rDE <= #1 (rDE<<C_PCI_DATA_WORD) | wDE;//DATA_EN;
rDECount <= #1 (rDECount<<C_PCI_DATA_COUNT_WIDTH) | wDECount;//DATA_EN_COUNT;
rDone <= #1 (rDone<<1) | DONE;
rErr <= #1 (rErr<<1) | ERR;
rDEShifted <= #1 (rDEShifted<<C_PCI_DATA_WORD) | rDEShift;
rWords <= #1 (rWords<<C_TAG_DW_COUNT_WIDTH) | rCount;
rShifted <= #1 (rShifted<<C_PCI_DATA_WORD_WIDTH) | rShift;
end
// Input processing pipeline
always @ (posedge CLK) begin
// STAGE 0: Register the incoming data
// STAGE 1: Request existing count from RAM
// To cover the gap b/t reads and writes to RAM, next cycle we might need
// to use the existing or even the previous rCount value if the tags match.
rUseCurrCount <= #1 (rTag[0*C_TAG_WIDTH +:C_TAG_WIDTH] == rTag[1*C_TAG_WIDTH +:C_TAG_WIDTH] && rValid[1]);
rUsePrevCount <= #1 (rTag[0*C_TAG_WIDTH +:C_TAG_WIDTH] == rTag[2*C_TAG_WIDTH +:C_TAG_WIDTH] && rValid[2]);
rPrevCount <= #1 rCount;
// See if we need to reset the count
rCountValid <= #1 (RST ? 1'd0 : rCountRst>>rTag[0*C_TAG_WIDTH +:C_TAG_WIDTH]);
rValidCount <= #1 (RST ? 0 : rValid[0]<<rTag[0*C_TAG_WIDTH +:C_TAG_WIDTH]);
// STAGE 2: Calculate new count (saves next cycle)
if (rUseCurrCount) begin
rShift <= #1 rCount[0 +:C_PCI_DATA_WORD_WIDTH];
rCount <= #1 rCount + rDECount[1*C_PCI_DATA_COUNT_WIDTH +:C_PCI_DATA_COUNT_WIDTH];
end
else if (rUsePrevCount) begin
rShift <= #1 rPrevCount[0 +:C_PCI_DATA_WORD_WIDTH];
rCount <= #1 rPrevCount + rDECount[1*C_PCI_DATA_COUNT_WIDTH +:C_PCI_DATA_COUNT_WIDTH];
end
else begin
rShift <= #1 wCountClr[0 +:C_PCI_DATA_WORD_WIDTH];
rCount <= #1 wCountClr + rDECount[1*C_PCI_DATA_COUNT_WIDTH +:C_PCI_DATA_COUNT_WIDTH];
end
// STAGE 3: Request existing positions from RAM
// Barrel shift the DE
rDEShift <= #1 (rDE[2*C_PCI_DATA_WORD +:C_PCI_DATA_WORD]<<rShift) |
(rDE[2*C_PCI_DATA_WORD +:C_PCI_DATA_WORD]>>(C_PCI_DATA_WORD-rShift));
// To cover the gap b/t reads and writes to RAM, next cycle we might need
// to use the existing or even the previous rPos values if the tags match.
rUseCurrPos <= #1 (rTag[2*C_TAG_WIDTH +:C_TAG_WIDTH] == rTag[3*C_TAG_WIDTH +:C_TAG_WIDTH] && rValid[3]);
rUsePrevPos <= #1 (rTag[2*C_TAG_WIDTH +:C_TAG_WIDTH] == rTag[4*C_TAG_WIDTH +:C_TAG_WIDTH] && rValid[4]);
for (i = 0; i < C_PCI_DATA_WORD; i = i + 1) begin
rPrevPos[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH] <= #1
(RST ? wZero[C_DATA_ADDR_STRIDE_WIDTH-1:0] : rPos[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH]);
end
// See if we need to reset the positions
rPosValid <= #1 (RST ? 1'd0 : rPosRst>>rTag[2*C_TAG_WIDTH +:C_TAG_WIDTH]);
rValidPos <= #1 (RST ? 0 : rValid[2]<<rTag[2*C_TAG_WIDTH +:C_TAG_WIDTH]);
// STAGE 4: Calculate new positions (saves next cycle)
if (rUseCurrPos) begin
for (i = 0; i < C_PCI_DATA_WORD; i = i + 1) begin
rPosNow[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH] <= #1
(RST ?
wZero[C_DATA_ADDR_STRIDE_WIDTH-1:0] :
rPos[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH]);
rPos[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH] <= #1
(RST ?
wZero[C_DATA_ADDR_STRIDE_WIDTH-1:0] :
rPos[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH] + rDEShift[i]);
end
end
else if (rUsePrevPos) begin
for (i = 0; i < C_PCI_DATA_WORD; i = i + 1) begin
rPosNow[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH] <= #1
(RST ?
wZero[C_DATA_ADDR_STRIDE_WIDTH-1:0] :
rPrevPos[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH]);
rPos[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH] <= #1
(RST ?
wZero[C_DATA_ADDR_STRIDE_WIDTH-1:0] :
rPrevPos[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH] + rDEShift[i]);
end
end
else begin
for (i = 0; i < C_PCI_DATA_WORD; i = i + 1) begin
rPosNow[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH] <= #1
(RST ?
wZero[C_DATA_ADDR_STRIDE_WIDTH-1:0] :
wPosClr[i*C_DATA_ADDR_STRIDE_WIDTH +:C_DATA_ADDR_STRIDE_WIDTH]);
rPos[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH] <= #1
(RST ?
wZero[C_DATA_ADDR_STRIDE_WIDTH-1:0] :
wPosClr[i*C_DATA_ADDR_STRIDE_WIDTH +:C_DATA_ADDR_STRIDE_WIDTH] + rDEShift[i]);
end
end
// Calculate the base address offset
rBaseAddr <= #1 rTag[3*C_TAG_WIDTH +:C_TAG_WIDTH]<<C_DATA_ADDR_STRIDE_WIDTH;
// Calculate the shift amounts for barrel shifting payload data
rShiftUp <= #1 rShifted[0*C_PCI_DATA_WORD_WIDTH +:C_PCI_DATA_WORD_WIDTH]<<5;
rShiftDown <= #1 (C_PCI_DATA_WORD[C_PCI_DATA_WORD_WIDTH:0] - rShifted[0*C_PCI_DATA_WORD_WIDTH +:C_PCI_DATA_WORD_WIDTH])<<5;
// STAGE 5: Prepare to write data, final info
for (i = 0; i < C_PCI_DATA_WORD; i = i + 1) begin
rAddr[C_DATA_ADDR_WIDTH*i +:C_DATA_ADDR_WIDTH] <= #1
rPosNow[C_DATA_ADDR_STRIDE_WIDTH*i +:C_DATA_ADDR_STRIDE_WIDTH] + rBaseAddr;
end
rDataShifted <= #1 (rData[4*C_PCI_DATA_WIDTH +:C_PCI_DATA_WIDTH]<<rShiftUp) |
(rData[4*C_PCI_DATA_WIDTH +:C_PCI_DATA_WIDTH]>>rShiftDown);
rLTE1Pkt <= #1 (rWords[1*C_TAG_DW_COUNT_WIDTH +:C_TAG_DW_COUNT_WIDTH] <= C_PCI_DATA_WORD);
rLTE2Pkt <= #1 (rWords[1*C_TAG_DW_COUNT_WIDTH +:C_TAG_DW_COUNT_WIDTH] <= (C_PCI_DATA_WORD*2));
rFinish <= #1 (rValid[4] & (rDone[4] | rErr[4]))<<rTag[4*C_TAG_WIDTH +:C_TAG_WIDTH];
// STAGE 6: Write data, final info
end
// Reset the count and positions when needed
always @ (posedge CLK) begin
if (RST) begin
rCountRst <= #1 0;
rPosRst <= #1 0;
end
else begin
rCountRst <= #1 (rCountRst | rValidCount) & ~TAG_CLEAR;
rPosRst <= #1 (rPosRst | rValidPos) & ~TAG_CLEAR;
end
end
// RAM for counts
(* RAM_STYLE="DISTRIBUTED" *)
ram_1clk_1w_1r
#(
.C_RAM_WIDTH(C_TAG_DW_COUNT_WIDTH),
.C_RAM_DEPTH(C_NUM_TAGS))
countRam
(
.CLK(CLK),
.ADDRA(rTag[2*C_TAG_WIDTH +:C_TAG_WIDTH]),
.WEA(rValid[2]),
.DINA(rCount),
.ADDRB(rTag[0*C_TAG_WIDTH +:C_TAG_WIDTH]),
.DOUTB(wCount)
);
// RAM for positions
(* RAM_STYLE="DISTRIBUTED" *)
ram_1clk_1w_1r
#(
.C_RAM_WIDTH(C_PCI_DATA_WORD*C_DATA_ADDR_STRIDE_WIDTH),
.C_RAM_DEPTH(C_NUM_TAGS))
posRam
(
.CLK(CLK),
.ADDRA(rTag[4*C_TAG_WIDTH +:C_TAG_WIDTH]),
.WEA(rValid[4]),
.DINA(rPos),
.ADDRB(rTag[2*C_TAG_WIDTH +:C_TAG_WIDTH]),
.DOUTB(wPos)
);
endmodule
// Local Variables:
// verilog-library-directories:("." "registers/" "../common/")
// End: