diff --git a/Bender.yml b/Bender.yml index a51d9b4005..c725beb543 100644 --- a/Bender.yml +++ b/Bender.yml @@ -10,7 +10,7 @@ dependencies: axi: { git: "https://github.com/pulp-platform/axi.git", version: 0.31.0 } common_cells: { git: "https://github.com/pulp-platform/common_cells", version: 1.23.0 } - fpnew: { git: "https://github.com/openhwgroup/cvfpu.git", rev: 2c79477 } # branch: develop + fpnew: { git: "https://github.com/openhwgroup/cvfpu.git", rev: 58ca3c3 } # branch: develop tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.13 } diff --git a/core/cva6.sv b/core/cva6.sv index 3987faf086..03b6ecee4f 100644 --- a/core/cva6.sv +++ b/core/cva6.sv @@ -474,6 +474,7 @@ module cva6 logic [CVA6Cfg.XLEN-1:0] fpu_result_ex_id; logic fpu_valid_ex_id; exception_t fpu_exception_ex_id; + logic fpu_early_valid_ex_id; // ALU2 logic [CVA6Cfg.NrIssuePorts-1:0] alu2_valid_id_ex; // Accelerator @@ -890,6 +891,7 @@ module cva6 .fpu_valid_o (fpu_valid_id_ex), .fpu_fmt_o (fpu_fmt_id_ex), .fpu_rm_o (fpu_rm_id_ex), + .fpu_early_valid_i (fpu_early_valid_ex_id), // ALU2 .alu2_valid_o (alu2_valid_id_ex), // CSR @@ -1022,6 +1024,7 @@ module cva6 .fpu_result_o (fpu_result_ex_id), .fpu_valid_o (fpu_valid_ex_id), .fpu_exception_o (fpu_exception_ex_id), + .fpu_early_valid_o (fpu_early_valid_ex_id), // ALU2 .alu2_valid_i (alu2_valid_id_ex), .amo_valid_commit_i (amo_valid_commit), diff --git a/core/cvfpu b/core/cvfpu index 2c7947726a..58ca3c376b 160000 --- a/core/cvfpu +++ b/core/cvfpu @@ -1 +1 @@ -Subproject commit 2c7947726a0859c08cc27cbbd45662424dedb41e +Subproject commit 58ca3c376beb914b2b80b811d4b270c063d4e6f7 diff --git a/core/ex_stage.sv b/core/ex_stage.sv index c3bf6e2a33..f3e681bbdc 100644 --- a/core/ex_stage.sv +++ b/core/ex_stage.sv @@ -139,6 +139,8 @@ module ex_stage output logic fpu_valid_o, // FPU exception - ISSUE_STAGE output exception_t fpu_exception_o, + // FPU early valid - ISSUE_STAGE + output logic fpu_early_valid_o, // ALU2 instruction is valid - ISSUE_STAGE input logic [CVA6Cfg.NrIssuePorts-1:0] alu2_valid_i, // CVXIF instruction is valid - ISSUE_STAGE @@ -465,7 +467,8 @@ module ex_stage .fpu_trans_id_o(fpu_trans_id), .result_o(fpu_result), .fpu_valid_o(fpu_valid), - .fpu_exception_o + .fpu_exception_o, + .fpu_early_valid_o ); end else begin : no_fpu_gen assign fpu_ready_o = '0; diff --git a/core/fpu_wrap.sv b/core/fpu_wrap.sv index 75cab2b91a..bae2412989 100644 --- a/core/fpu_wrap.sv +++ b/core/fpu_wrap.sv @@ -34,7 +34,8 @@ module fpu_wrap output logic [CVA6Cfg.TRANS_ID_BITS-1:0] fpu_trans_id_o, output logic [ CVA6Cfg.FLen-1:0] result_o, output logic fpu_valid_o, - output exception_t fpu_exception_o + output exception_t fpu_exception_o, + output logic fpu_early_valid_o ); // this is a workaround @@ -553,7 +554,8 @@ module fpu_wrap .tag_o (fpu_trans_id_o), .out_valid_o (fpu_out_valid), .out_ready_i (fpu_out_ready), - .busy_o ( /* unused */) + .busy_o ( /* unused */), + .early_valid_o (fpu_early_valid_o) ); // Pack status flag into exception cause, tval ignored in wb, exception is always invalid diff --git a/core/include/config_pkg.sv b/core/include/config_pkg.sv index a0014722c4..e8802bd4bb 100644 --- a/core/include/config_pkg.sv +++ b/core/include/config_pkg.sv @@ -428,7 +428,6 @@ package config_pkg; assert (Cfg.NrExecuteRegionRules <= NrMaxRules); assert (Cfg.NrCachedRegionRules <= NrMaxRules); assert (Cfg.NrPMPEntries <= 64); - assert (!(Cfg.SuperscalarEn && Cfg.RVF)); assert (Cfg.FETCH_WIDTH == 32 || Cfg.FETCH_WIDTH == 64) else $fatal(1, "[frontend] fetch width != not supported"); // Support for disabling MIP.MSIP and MIE.MSIE in Hypervisor and Supervisor mode is not supported diff --git a/core/issue_read_operands.sv b/core/issue_read_operands.sv index efd58ebdd7..880cac0ced 100644 --- a/core/issue_read_operands.sv +++ b/core/issue_read_operands.sv @@ -82,6 +82,8 @@ module issue_read_operands output logic [CVA6Cfg.NrIssuePorts-1:0] mult_valid_o, // FPU FU is ready - EX_STAGE input logic fpu_ready_i, + // FPU FU will perform a writeback in the next cycle - EX_STAGE + input logic fpu_early_valid_i, // FPU FU is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] fpu_valid_o, // FPU fmt field - EX_STAGE @@ -152,10 +154,10 @@ module issue_read_operands rs3_len_t operand_c_fpr; // output flipflop (ID <-> EX) fu_data_t [CVA6Cfg.NrIssuePorts-1:0] fu_data_n, fu_data_q; - logic [CVA6Cfg.VLEN-1:0] pc_n; - logic is_compressed_instr_n; - branchpredict_sbe_t branch_predict_n; - logic [CVA6Cfg.XLEN-1:0] imm_forward_rs3; + logic [ CVA6Cfg.VLEN-1:0] pc_n; + logic is_compressed_instr_n; + branchpredict_sbe_t branch_predict_n; + logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] imm_forward_rs3; logic [CVA6Cfg.NrIssuePorts-1:0] alu_valid_n, alu_valid_q; logic [CVA6Cfg.NrIssuePorts-1:0] aes_valid_n, aes_valid_q; @@ -215,8 +217,10 @@ module issue_read_operands logic [CVA6Cfg.NrIssuePorts-1:0] forward_rs1, forward_rs2, forward_rs3; // original instruction - riscv::instruction_t orig_instr; - assign orig_instr = riscv::instruction_t'(orig_instr_i[0]); + riscv::instruction_t [CVA6Cfg.NrIssuePorts-1:0] orig_instr; + for (genvar i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin + assign orig_instr[i] = riscv::instruction_t'(orig_instr_i[i]); + end // ALU-ALU bypass signals alu_bypass_t alu_bypass, alu_bypass_n, alu_bypass_q; @@ -309,10 +313,11 @@ module issue_read_operands end if (CVA6Cfg.SuperscalarEn) begin - // When a bypass is possible, an instruction uses `alu2` only when `alu` is already busy, - // in all other scenarios `alu2` is preferred over `alu`, unless it is busy + // When a bypass is possible or an FPU instruction is present on the second issue port, + // an instruction uses `alu2` only when `alu` is already busy + // In all other scenarios `alu2` is preferred over `alu`, unless it is busy for (genvar i = 0; i < 2; i++) begin - assign use_alu2[i] = is_alu_bypass ? fus_busy[i].alu : !fus_busy[i].alu2; + assign use_alu2[i] = is_alu_bypass || (issue_instr_i[1].fu inside {FPU, FPU_VEC}) ? fus_busy[i].alu : !fus_busy[i].alu2; end end else begin assign use_alu2 = '0; @@ -347,7 +352,6 @@ module issue_read_operands if (CVA6Cfg.FpPresent && !fpu_ready_i) begin fus_busy[0].fpu = 1'b1; fus_busy[0].fpu_vec = 1'b1; - if (CVA6Cfg.SuperscalarEn) fus_busy[0].alu2 = 1'b1; end if (!lsu_ready_i) begin @@ -356,6 +360,11 @@ module issue_read_operands end if (CVA6Cfg.SuperscalarEn) begin + + if (fpu_early_valid_i) begin + fus_busy[0].alu2 = 1'b1; + end + fus_busy[1] = fus_busy[0]; // Never issue CSR instruction on second issue port. @@ -390,10 +399,6 @@ module issue_read_operands ALU: begin if (use_alu2[0]) begin fus_busy[1].alu2 = 1'b1; - // TODO is there a minimum float execution time? - // If so we could issue FPU & ALU2 the same cycle - fus_busy[1].fpu = 1'b1; - fus_busy[1].fpu_vec = 1'b1; end else begin fus_busy[1].alu = 1'b1; fus_busy[1].ctrl_flow = 1'b1; @@ -408,10 +413,18 @@ module issue_read_operands FPU, FPU_VEC: begin fus_busy[1].fpu = 1'b1; fus_busy[1].fpu_vec = 1'b1; + if (issue_instr_i[1].op inside {[FLD : FSB]}) begin + fus_busy[1].load = 1'b1; + fus_busy[1].store = 1'b1; + end end LOAD, STORE: begin fus_busy[1].load = 1'b1; fus_busy[1].store = 1'b1; + if (issue_instr_i[0].op inside {[FLD : FSB]}) begin + fus_busy[1].fpu = 1'b1; + fus_busy[1].fpu_vec = 1'b1; + end end CVXIF: ; default: ; @@ -658,10 +671,12 @@ module issue_read_operands end // third operand from fp regfile or gp regfile if NR_RGPR_PORTS == 3 - if (OPERANDS_PER_INSTR == 3) begin : gen_gp_rs3 - assign imm_forward_rs3 = rs3_res[0]; - end else begin : gen_fp_rs3 - assign imm_forward_rs3 = {{CVA6Cfg.XLEN - CVA6Cfg.FLen{1'b0}}, rs3_res[0]}; + for (genvar i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin + if (OPERANDS_PER_INSTR == 3) begin : gen_gp_rs3 + assign imm_forward_rs3[i] = rs3_res[i]; + end else begin : gen_fp_rs3 + assign imm_forward_rs3[i] = {{CVA6Cfg.XLEN - CVA6Cfg.FLen{1'b0}}, rs3_res[i]}; + end end // Forwarding/Output MUX @@ -696,7 +711,7 @@ module issue_read_operands fu_data_n[i].operand_b = rs2_res[i]; end if ((CVA6Cfg.FpPresent || (CVA6Cfg.CvxifEn && OPERANDS_PER_INSTR == 3)) && forward_rs3[i]) begin - fu_data_n[i].imm = imm_forward_rs3; + fu_data_n[i].imm = imm_forward_rs3[i]; end // use the PC as operand a @@ -760,12 +775,12 @@ module issue_read_operands default: begin if (issue_instr_i[i].fu == FPU && CVA6Cfg.FpPresent) begin fpu_valid_n[i] = 1'b1; - fpu_fmt_n = orig_instr.rftype.fmt; // fmt bits from instruction - fpu_rm_n = orig_instr.rftype.rm; // rm bits from instruction + fpu_fmt_n = orig_instr[i].rftype.fmt; // fmt bits from instruction + fpu_rm_n = orig_instr[i].rftype.rm; // rm bits from instruction end else if (issue_instr_i[i].fu == FPU_VEC && CVA6Cfg.FpPresent) begin fpu_valid_n[i] = 1'b1; - fpu_fmt_n = orig_instr.rvftype.vfmt; // vfmt bits from instruction - fpu_rm_n = {2'b0, orig_instr.rvftype.repl}; // repl bit from instruction + fpu_fmt_n = orig_instr[i].rvftype.vfmt; // vfmt bits from instruction + fpu_rm_n = {2'b0, orig_instr[i].rvftype.repl}; // repl bit from instruction end end endcase @@ -948,7 +963,7 @@ module issue_read_operands }; if (CVA6Cfg.SuperscalarEn) begin - if (!(issue_instr_i[0].fu inside {FPU, FPU_VEC})) begin + if (!(issue_instr_i[0].fu inside {FPU, FPU_VEC} || issue_instr_i[0].op inside {[FLD:FSB]})) begin fp_raddr_pack = { issue_instr_i[1].result[4:0], issue_instr_i[1].rs2[4:0], issue_instr_i[1].rs1[4:0] }; @@ -1101,20 +1116,6 @@ module issue_read_operands ); end - // FPU does not declare that it will return a result the subsequent cycle so - // it is not possible for issue stage to know when ALU2 can be used if there - // is an FPU. As there are discussions to change the FPU, I did not explore - // its architecture to create this "FPU returns next cycle" signal. Also, a - // "lookahead" optimization should be added to be performant with FPU: when - // issue port 2 is issuing to FPU, issue port 1 should issue to ALU1 instead - // of ALU2 so that FPU is not busy. However, if FPU has a minimum execution - // time of 2 cycles, it is possible to simply not raise fus_busy[1].alu2. - initial begin - assert (!(CVA6Cfg.SuperscalarEn && CVA6Cfg.FpPresent)) - else - $fatal(1, "FPU is not yet supported in superscalar CVA6, see comments above this assertion."); - end - for (genvar i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin assert property (@(posedge clk_i) (branch_valid_q) |-> (!$isunknown( fu_data_q[i].operand_a diff --git a/core/issue_stage.sv b/core/issue_stage.sv index e267e63970..09cbeeb862 100644 --- a/core/issue_stage.sv +++ b/core/issue_stage.sv @@ -94,6 +94,8 @@ module issue_stage output logic [1:0] fpu_fmt_o, // FPU rm field - EX_STAGE output logic [2:0] fpu_rm_o, + // FPU early valid - EX_STAGE + input logic fpu_early_valid_i, // ALU2 FU is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] alu2_valid_o, // CSR is valid - EX_STAGE @@ -283,6 +285,7 @@ module issue_stage .fpu_valid_o, .fpu_fmt_o, .fpu_rm_o, + .fpu_early_valid_i, .alu2_valid_o, .csr_valid_o, .cvxif_valid_o (xfu_valid_o),