// Copyright 2007 Altera Corporation. All rights reserved.  
// Altera products are protected under numerous U.S. and foreign patents, 
// maskwork rights, copyrights and other intellectual property laws.  
//
// This reference design file, and your use thereof, is subject to and governed
// by the terms and conditions of the applicable Altera Reference Design 
// License Agreement (either as signed by you or found at www.altera.com).  By
// using this reference design file, you indicate your acceptance of such terms
// and conditions between you and Altera Corporation.  In the event that you do
// not agree with such terms and conditions, you may not use the reference 
// design file and please promptly destroy any copies you have made.
//
// This reference design file is being provided on an "as-is" basis and as an 
// accommodation and therefore all warranties, representations or guarantees of 
// any kind (whether express, implied or statutory) including, without 
// limitation, warranties of merchantability, non-infringement, or fitness for
// a particular purpose, are specifically disclaimed.  By making this reference
// design file available, Altera expressly does not recommend, suggest or 
// require that this reference design file be used in combination with any 
// other product not provided by Altera.
/////////////////////////////////////////////////////////////////////////////

// baeckler - 05-01-2007

// Approximate single precision 1/sqrt(in).
//
//   in  : 32 bit word treated as {sign, 8 bit exponent, 23 bit fraction}
//   out : 32 bit word in the same layout; in the corrected variant the
//         sign bit is always driven to 0
//
// CORRECTION_ROUND == 0 : out is the registered first guess (one register
//                         between in and out).
// CORRECTION_ROUND != 0 : the guess is refined with one Newton step,
//                         1.5*guess - (in/2)*guess^3, passing through six
//                         register ranks in this module.  The scale_up
//                         instance has no clock connection here; its
//                         definition is not in this file.
//
module approx_fp_invsqrt #(
	parameter CORRECTION_ROUND = 1'b1
) (
	input  wire        clk,
	input  wire [31:0] in,
	output wire [31:0] out
);

// Constant used by the well known Quake 3 / Internet trick
localparam [31:0] INVSQRT_MAGIC = 32'h5F3759DF;

// Value subtracted whenever two biased exponents have been added together
localparam [7:0] EXP_BIAS = 8'h7f;

// First order approximation of 1 / sqrt(in):
//   magic constant minus the raw input word shifted right by one
reg [31:0] guess;
always @(posedge clk) begin
	guess <= INVSQRT_MAGIC - (in >> 1);
end

generate

if (!CORRECTION_ROUND) begin
	// no refinement requested - the first guess is the result
	assign out = guess;
end

else begin
	// Hold the operand for one clock so it lines up with 'guess'
	reg [31:0] arg_q;

	always @(posedge clk) begin
		arg_q <= in;
	end

	// Biased exponent fields
	wire [7:0] guess_exp = guess[30:23];
	wire [7:0] arg_exp = arg_q[30:23];

	// 18 bit multiplier operands : a leading one followed by the top
	// 17 fraction bits
	wire [17:0] guess_sig = {1'b1, guess[22:6]};
	wire [17:0] arg_sig = {1'b1, arg_q[22:6]};

	reg [35:0] sq_m, half_m;
	reg [8:0] sq_e, half_e;
	reg [24:0] lead_m;
	reg [8:0] lead_e;

	// rank 1 : the three partial terms
	always @(posedge clk) begin

		// guess * guess
		sq_m <= guess_sig * guess_sig;
		sq_e <= {guess_exp, 1'b0} - EXP_BIAS;

		// guess * in/2  (the extra -1 on the exponent is the halving)
		half_m <= guess_sig * arg_sig;
		half_e <= arg_exp + guess_exp - EXP_BIAS - 8'h1;

		// 1.5 * guess, formed as significand + significand/2.
		// The result is one bit wider than the operands to hold the carry.
		lead_m <= {1'b1, guess[22:0]} + {2'b01, guess[22:1]};
		lead_e <= guess_exp;
	end

	reg [35:0] corr_m;
	reg [8:0] corr_e;
	reg [24:0] lead_m_q;
	reg [8:0] lead_e_q;

	// rank 2 : guess^3 * in/2, with the 1.5 * guess term delayed to match
	always @(posedge clk) begin
		corr_m <= sq_m[35:18] * half_m[35:18];
		corr_e <= sq_e[7:0] + half_e[7:0] - EXP_BIAS;

		lead_m_q <= lead_m;
		lead_e_q <= lead_e;
	end

	// Line the correction term up with the 1.5 * guess term before the
	// subtract.  Only the low 4 bits of the exponent difference are used
	// as the shift count.
	wire [3:0] align_shift = lead_e_q[7:0] - corr_e[7:0];

	// Right shift first, then move up four places with zero fill; any
	// bits above position 24 are discarded.
	wire [22:0] corr_shifted = corr_m[35:13] >> align_shift;
	wire [24:0] corr_aligned = {corr_shifted[20:0], 4'b0000};

	reg [24:0] diff_m;
	reg [7:0] diff_e;

	// rank 3 : 1.5 * guess - correction, not yet normalized
	always @(posedge clk) begin
		diff_m <= lead_m_q - corr_aligned;
		diff_e <= lead_e_q[7:0];
	end

	// Renormalize the difference.
	// NOTE(review): scale_up is defined outside this file - presumably it
	// shifts the word left until the top bit is set and reports the shift
	// count on 'distance'; confirm against its source.
	wire [31:0] norm_m;
	wire [4:0] norm_dist;

	scale_up #(
		.WIDTH (32),
		.WIDTH_DIST (5)
	) sc (
		.in ({diff_m, 7'b0}),
		.out (norm_m),
		.distance (norm_dist)
	);

	reg [22:0] norm_m_q;
	reg [7:0] norm_dist_q;
	reg [7:0] diff_e_q;

	reg [22:0] out_m;
	reg [7:0] out_e;

	// ranks 4 and 5 : capture the normalizer outputs, then adjust the
	// exponent by the reported distance
	always @(posedge clk) begin
		// rank 4 - bit 31 of the normalized word is not stored
		norm_m_q <= norm_m[30:8];
		norm_dist_q <= {3'b000, norm_dist};
		diff_e_q <= diff_e;

		// rank 5
		out_m <= norm_m_q;
		out_e <= diff_e_q - norm_dist_q + 8'd1;
	end

	// sign bit is forced to zero
	assign out = {1'b0, out_e, out_m};
end

endgenerate

endmodule