这是一个简单的4级流水线,部分实现了RV32I ISA。

支持除以下指令之外的所有指令:jalr、与内存有关的指令(l*、l*u、s*、fence、fence.i)以及系统调用(sbreak、scall)。

流水线阶段或多或少是没有存储访问阶段的经典RISC阶段(即,获取指令,解码和获取操作数,计算结果,写入结果)。

我的最终目标是得到一个性能尚可的简单CPU,以便在FPGA上综合(我希望最终达到150-200MHz)。这是我尝试的第一个大型硬件设计项目,因此我很确定自己犯了很多初学者错误。

 // ALU operation selectors. They are explicitly 4 bits wide so that they match
// the width of alu.operation and decoder.op_alu without implicit truncation.
`define ALU_ADD   4'd0
`define ALU_SUB   4'd1
`define ALU_AND   4'd2
`define ALU_OR    4'd3
`define ALU_XOR   4'd4
`define ALU_SLL   4'd5
`define ALU_SRL   4'd6
`define ALU_SRA   4'd7
`define ALU_SEQ   4'd8
`define ALU_SNE   4'd9
`define ALU_SLT   4'd10
`define ALU_SGE   4'd11
`define ALU_SLTU  4'd12
`define ALU_SGEU  4'd13

// Major opcodes, compared against insn[6:0] in the decoder.
`define OPCODE_OP     7'b0110011
`define OPCODE_OP_IMM 7'b0010011
`define OPCODE_LUI    7'b0110111
`define OPCODE_AUIPC  7'b0010111
`define OPCODE_JAL    7'b1101111
`define OPCODE_JALR   7'b1100111
`define OPCODE_BRANCH 7'b1100011
`define OPCODE_SYSTEM 7'b1110011

// funct3 values (insn[14:12]) for OPCODE_OP / OPCODE_OP_IMM.
// ADD/SUB and SRL/SRA share a funct3; the decoder tells them apart with funct7[5].
`define FUNCT3_ADD_SUB 3'b000
`define FUNCT3_SLL     3'b001
`define FUNCT3_SLT     3'b010
`define FUNCT3_SLTU    3'b011
`define FUNCT3_XOR     3'b100
`define FUNCT3_SRL_SRA 3'b101
`define FUNCT3_OR      3'b110
`define FUNCT3_AND     3'b111

// funct3 values (insn[14:12]) for OPCODE_BRANCH.
`define FUNCT3_BEQ  3'b000
`define FUNCT3_BNE  3'b001
`define FUNCT3_BLT  3'b100
`define FUNCT3_BGE  3'b101
`define FUNCT3_BLTU 3'b110
`define FUNCT3_BGEU 3'b111

// Counter-read patterns for OPCODE_SYSTEM, compared against insn[31:12]
// (i.e. everything above the rd field) in the decoder.
`define SYSTEM_RDCYCLE    20'b11000000000000000010
`define SYSTEM_RDCYCLEH   20'b11001000000000000010
`define SYSTEM_RDTIME     20'b11000000000100000010
`define SYSTEM_RDTIMEH    20'b11001000000100000010
`define SYSTEM_RDINSTRET  20'b11000000001000000010
`define SYSTEM_RDINSTRETH 20'b11001000001000000010

// Combinational ALU.
//   operation : one of the `ALU_* selectors
//   s1, s2    : 32-bit operands
//   d         : 32-bit result; the set-on-compare operations produce 1 or 0,
//               and any selector without a case arm produces 0
module alu (input [3:0]       operation,
            input [31:0]      s1,
            input [31:0]      s2,
            output reg [31:0] d
            );
   // Shifts only look at the low five bits of the second operand.
   wire [4:0]                 shift_amount  = s2[4:0];

   // Comparison flags shared by the set-on-compare operations.
   wire                       is_equal      = (s1 == s2);
   wire                       less_signed   = ($signed(s1) < $signed(s2));
   wire                       less_unsigned = (s1 < s2);

   always @ * begin
      case(operation)
        // arithmetic
        `ALU_ADD:  d = s1 + s2;
        `ALU_SUB:  d = s1 - s2;
        // bitwise logic
        `ALU_AND:  d = s1 & s2;
        `ALU_OR:   d = s1 | s2;
        `ALU_XOR:  d = s1 ^ s2;
        // shifts; SRA replicates the sign bit of s1
        `ALU_SLL:  d = s1 << shift_amount;
        `ALU_SRL:  d = s1 >> shift_amount;
        `ALU_SRA:  d = $signed(s1) >>> shift_amount;
        // comparisons; the flag lands in bit 0, the upper 31 bits are zero
        `ALU_SEQ:  d = {31'b0,  is_equal};
        `ALU_SNE:  d = {31'b0, !is_equal};
        `ALU_SLT:  d = {31'b0,  less_signed};
        `ALU_SGE:  d = {31'b0, !less_signed};
        `ALU_SLTU: d = {31'b0,  less_unsigned};
        `ALU_SGEU: d = {31'b0, !less_unsigned};
        default:   d = 32'd0;
      endcase
   end
endmodule

// Combinational instruction decoder for the implemented RV32I subset.
//
// Inputs
//   insn     : instruction word
//   pc       : address of that instruction (used for AUIPC, JAL and jump_target)
//   cycle    : 64-bit counter returned by RDCYCLE[H] and RDTIME[H]
//   instret  : 64-bit counter returned by RDINSTRET[H]
// Outputs
//   rd                : destination register, forced to 0 when the instruction
//                       must not write a register
//   rs1, rs2          : source register fields of insn
//   s1_is_imm, s1_imm : when s1_is_imm is set, operand 1 is s1_imm, not regs[rs1]
//   s2_is_imm, s2_imm : same for operand 2
//   op_alu            : `ALU_* selector
//   is_jump           : instruction may redirect the PC (JAL or a branch)
//   is_branch         : the redirect is conditional on bit 0 of the ALU result
//   jump_target       : pc plus the branch or JAL offset
//
// Encodings without an implementation (unknown opcodes such as loads, stores,
// JALR and fences, and reserved branch funct3 values) decode to a NOP: no
// register write, no jump and no register operands. Previously they fell
// through as "rd = rs1 + rs2" or as a branch on bit 0 of a sum.
module decoder(input [31:0]      insn,
               input [31:0]      pc,
               input [63:0]      cycle,
               input [63:0]      instret,

               output [4:0]      rd,
               output reg        s1_is_imm,
               output [4:0]      rs1,
               output reg [31:0] s1_imm,
               output reg        s2_is_imm,
               output [4:0]      rs2,
               output reg [31:0] s2_imm,
               output reg [3:0]  op_alu,
               output reg        is_jump,
               output reg        is_branch,
               output [31:0]     jump_target
               );
   wire [6:0]                    opcode = insn[ 6: 0];
   wire [2:0]                    funct3 = insn[14:12];
   wire [6:0]                    funct7 = insn[31:25];
   // sign-extended 12-bit immediate from insn[31:20]
   wire [31:0]                   imm12  = {{21{insn[31]}}, insn[30:20]};
   // upper immediate: insn[31:12] placed in the top 20 bits
   wire [31:0]                   imm20  = {insn[31:12], 12'b0};
   // sign-extended branch offset, bit 0 is always zero
   wire [31:0]                   imm12b = {{20{insn[31]}}, insn[7], insn[30:25], insn[11:8], 1'b0};
   // sign-extended JAL offset, bit 0 is always zero
   wire [31:0]                   imm20j = {{12{insn[31]}}, insn[19:12], insn[20], insn[30:21], 1'b0};
   reg                           rd_write_disable;

   // Reporting rd as 0 suppresses the register write.
   assign rd = rd_write_disable ? 5'd0 : insn[11:7];
   assign rs2 = insn[24:20];
   assign rs1 = insn[19:15];
   assign jump_target = pc + (is_branch ? imm12b : imm20j);

   always @ * begin
      // Defaults: register-register ADD that writes rd and does not jump.
      rd_write_disable = 1'b0;
      s1_imm = 32'd0;
      s1_is_imm = 1'b0;
      s2_imm = 32'd0;
      s2_is_imm = 1'b0;
      is_jump = 1'b0;
      is_branch = 1'b0;
      op_alu = `ALU_ADD;
      case(opcode)
        `OPCODE_OP: begin
           case(funct3)
             `FUNCT3_ADD_SUB: op_alu = funct7[5] ? `ALU_SUB : `ALU_ADD;
             `FUNCT3_SLL:     op_alu = `ALU_SLL;
             `FUNCT3_SLT:     op_alu = `ALU_SLT;
             `FUNCT3_SLTU:    op_alu = `ALU_SLTU;
             `FUNCT3_XOR:     op_alu = `ALU_XOR;
             `FUNCT3_SRL_SRA: op_alu = funct7[5] ? `ALU_SRA : `ALU_SRL;
             `FUNCT3_OR:      op_alu = `ALU_OR;
             `FUNCT3_AND:     op_alu = `ALU_AND;
           endcase
        end
        `OPCODE_OP_IMM: begin
           s2_imm = imm12;
           s2_is_imm = 1'b1;
           case(funct3)
             // there is no subtract-immediate, so funct7[5] is not consulted here
             `FUNCT3_ADD_SUB: op_alu = `ALU_ADD;
             `FUNCT3_SLT:     op_alu = `ALU_SLT;
             `FUNCT3_SLTU:    op_alu = `ALU_SLTU;
             `FUNCT3_XOR:     op_alu = `ALU_XOR;
             `FUNCT3_OR:      op_alu = `ALU_OR;
             `FUNCT3_AND:     op_alu = `ALU_AND;
             `FUNCT3_SLL:     op_alu = `ALU_SLL;
             `FUNCT3_SRL_SRA: op_alu = funct7[5] ? `ALU_SRA : `ALU_SRL;
           endcase
        end
        `OPCODE_LUI: begin
           // rd = imm20 + 0
           s1_imm = imm20;
           s1_is_imm = 1'b1;
           s2_is_imm = 1'b1;
        end
        `OPCODE_AUIPC: begin
           // rd = imm20 + pc
           s1_imm = imm20;
           s1_is_imm = 1'b1;
           s2_imm = pc;
           s2_is_imm = 1'b1;
        end
        `OPCODE_JAL: begin
           // rd = pc + 4 (link address); the PC redirect goes through jump_target
           s1_is_imm = 1'b1;
           s1_imm = pc;
           s2_is_imm = 1'b1;
           s2_imm = 32'd4;
           is_jump = 1'b1;
        end
        // `OPCODE_JALR: TODO
        `OPCODE_BRANCH: begin
           // The ALU compares rs1 with rs2; bit 0 of its result decides the branch.
           is_jump = 1'b1;
           is_branch = 1'b1;
           rd_write_disable = 1'b1;
           case(funct3)
             `FUNCT3_BEQ:  op_alu = `ALU_SEQ;
             `FUNCT3_BNE:  op_alu = `ALU_SNE;
             `FUNCT3_BLT:  op_alu = `ALU_SLT;
             `FUNCT3_BGE:  op_alu = `ALU_SGE;
             `FUNCT3_BLTU: op_alu = `ALU_SLTU;
             `FUNCT3_BGEU: op_alu = `ALU_SGEU;
             default: begin
                // Reserved funct3: never branch, and drop the register
                // operands so the pipeline does not stall on them.
                is_jump = 1'b0;
                is_branch = 1'b0;
                s1_is_imm = 1'b1;
                s2_is_imm = 1'b1;
             end
           endcase
        end
        `OPCODE_SYSTEM: begin
           // rd = selected counter half + 0; other SYSTEM encodings yield 0
           s1_is_imm = 1'b1;
           s2_is_imm = 1'b1;
           case(insn[31:12])
             `SYSTEM_RDCYCLE:    s1_imm = cycle[31:0];
             `SYSTEM_RDCYCLEH:   s1_imm = cycle[63:32];
             // RDTIME/RDTIMEH return the cycle counter as well
             `SYSTEM_RDTIME:     s1_imm = cycle[31:0];
             `SYSTEM_RDTIMEH:    s1_imm = cycle[63:32];
             `SYSTEM_RDINSTRET:  s1_imm = instret[31:0];
             `SYSTEM_RDINSTRETH: s1_imm = instret[63:32];
           endcase
        end
        default: begin
           // Unimplemented opcode: behave as a NOP. Without this the defaults
           // above would write rs1 + rs2 into whatever register insn[11:7]
           // happens to name, and report false register dependencies.
           rd_write_disable = 1'b1;
           s1_is_imm = 1'b1;
           s2_is_imm = 1'b1;
        end
      endcase
   end
endmodule

// ezpipe: four-stage pipeline (FETCH, DECODE, EXECUTE, WRITE) built around one
// decoder instance and one alu instance.
//
//   clk       : all state is updated on the rising edge
//   reset     : synchronous; clears pc, the three valid bits and both counters
//   ibus_addr : driven with the current pc
//   ibus_data : instruction word, latched into f_insn on the same edge that
//               advances pc
//               NOTE(review): this presumably expects the instruction memory to
//               answer combinationally within the cycle -- confirm against the
//               memory model used.
//
// Hazard handling visible in this module: no forwarding. A decoded instruction
// that reads a register still pending in the d_* or e_* registers stalls FETCH
// and DECODE and inserts a bubble. A taken jump/branch is detected when it
// sits in the e_* registers; it invalidates the younger instructions and
// reloads pc.
module ezpipe (input         clk,
               input         reset,
               output [31:0] ibus_addr,
               input [31:0]  ibus_data
               // output reg [31:0] dbus_addr,
               // output reg [31:0] dbus_data_wr,
               // input [31:0]      dbus_data_rd,
               // input [31:0]      dbus_data_ready,
               // output reg        dbus_rd,
               // output reg        dbus_wr
               );
   /* registers and counters */
   // Register 0 has no storage; reads of it are replaced by 0 in DECODE and
   // writes to it are skipped in WRITE.
   reg [31:0]                regs [1:31];
   reg [31:0]                pc;
   reg [63:0]                cycle;
   reg [63:0]                instret;
   // There is no counter for RDTIME/RDTIMEH, those instructions just use the cycle register.

   /* pipeline registers */
   // Each stage carries a *_valid bit; a stage whose valid bit is 0 holds a bubble.
   // from FETCH to DECODE
   reg [31:0]                f_insn;
   reg [31:0]                f_pc;
   reg                       f_valid;
   // from DECODE to EXECUTE
   reg [31:0]                d_s1;
   reg [31:0]                d_s2;
   reg [3:0]                 d_op_alu;
   reg [4:0]                 d_rd;
   reg                       d_is_jump;
   reg                       d_is_branch;
   reg [31:0]                d_jump_target;
   reg                       d_valid;
   // from EXECUTE to WRITE
   reg [4:0]                 e_rd;
   reg [31:0]                e_d;
   reg                       e_is_jump;
   reg                       e_is_branch;
   reg [31:0]                e_jump_target;
   reg                       e_valid;

   /* instances */
   // The decoder works combinationally on the FETCH->DECODE registers.
   wire [4:0]                dec_rd;
   wire [4:0]                dec_rs1;
   wire [31:0]               dec_s1_imm;
   wire                      dec_s1_is_imm;
   wire [4:0]                dec_rs2;
   wire [31:0]               dec_s2_imm;
   wire                      dec_s2_is_imm;
   wire [3:0]                dec_op_alu;
   wire                      dec_is_jump;
   wire                      dec_is_branch;
   wire [31:0]               dec_jump_target;
   decoder dec(.pc(f_pc),
               .insn(f_insn),
               .cycle(cycle),
               .instret(instret),
               .op_alu(dec_op_alu),
               .rd(dec_rd),
               .rs1(dec_rs1),
               .s1_is_imm(dec_s1_is_imm),
               .s1_imm(dec_s1_imm),
               .rs2(dec_rs2),
               .s2_is_imm(dec_s2_is_imm),
               .s2_imm(dec_s2_imm),
               .is_jump(dec_is_jump),
               .is_branch(dec_is_branch),
               .jump_target(dec_jump_target)
               );

   // The ALU works combinationally on the DECODE->EXECUTE registers.
   wire [31:0]               alu_d;
   alu alu(.s1(d_s1),
           .s2(d_s2),
           .operation(d_op_alu),
           .d(alu_d)
           );

   assign ibus_addr = pc;

   /* the actual pipeline */
   // Combinational control signals derived from the current pipeline contents.
   reg                       jumping;
   reg                       stall;
   always @ * begin
      // does the decoded instruction depend on a instruction in the d_* or e_* registers?
      // A source only counts when it is a real register read: non-zero register
      // number and not replaced by an immediate.
      stall = 0;
      if(d_valid && |d_rd) begin
         if(|dec_rs1 && !dec_s1_is_imm && dec_rs1==d_rd)
           stall = 1;
         if(|dec_rs2 && !dec_s2_is_imm && dec_rs2==d_rd)
           stall = 1;
      end
      if(e_valid && |e_rd) begin
         if(|dec_rs1 && !dec_s1_is_imm && dec_rs1==e_rd)
           stall = 1;
         if(|dec_rs2 && !dec_s2_is_imm && dec_rs2==e_rd)
           stall = 1;
      end
      // is there a taken branch/jump sitting in the e_* registers?
      // For a branch, bit 0 of the stored ALU result is the comparison outcome;
      // an unconditional jump is always taken.
      jumping = 0;
      if(e_valid)
        if(e_is_jump) begin
           if(e_is_branch)
             jumping = e_d[0];
           else
             jumping = 1;
        end
   end

   always @(posedge clk) begin
      if(reset) begin
         // Only pc, the valid bits and the counters are reset. The remaining
         // pipeline registers and regs keep their contents; elsewhere in this
         // module they are only acted upon under the matching *_valid bit.
         pc <= 0;
         f_valid <= 0;
         d_valid <= 0;
         e_valid <= 0;
         cycle <= 0;
         instret <= 0;
      end else begin
         cycle <= cycle + 1;

         /* FETCH */
         // The fetched word is marked invalid when a jump is being taken this cycle.
         f_valid <= !jumping;
         if(!stall) begin
            f_insn <= ibus_data;
            f_pc <= pc;
            // May be overridden by the WRITE stage below when a jump is taken.
            pc <= pc + 4;
         end else begin
            // don't fetch a new instruction when we can't complete the one in the D stage
         end

         /* DECODE */
         if(!stall) begin
            // fetch operands
            // (register number 0 reads as constant 0, it has no entry in regs)
            if(dec_s1_is_imm) d_s1 <= dec_s1_imm;
            else              d_s1 <= |dec_rs1 ? regs[dec_rs1] : 0;
            if(dec_s2_is_imm) d_s2 <= dec_s2_imm;
            else              d_s2 <= |dec_rs2 ? regs[dec_rs2] : 0;
            // store decoded instruction
            d_rd <= dec_rd;
            d_op_alu <= dec_op_alu;
            d_jump_target <= dec_jump_target;
            d_is_branch <= dec_is_branch;
            d_is_jump <= dec_is_jump;
            // squash the instruction if a jump is being taken
            d_valid <= f_valid && !jumping;
         end else begin
            // can't issue this instruction yet; send a bubble down the pipeline
            d_valid <= 0;
         end

         /* EXECUTE */
         // store ALU result
         e_d <= alu_d;
         // send remaining info down the pipeline
         e_rd <= d_rd;
         e_is_jump <= d_is_jump;
         e_is_branch <= d_is_branch;
         e_jump_target <= d_jump_target;
         // squash the instruction if a jump is being taken
         e_valid <= d_valid && !jumping;

         /* WRITE */
         if(e_valid) begin
            // This assignment comes after the "pc <= pc + 4" in FETCH, so it
            // is the one that takes effect when both execute on the same edge.
            if(jumping)
              pc <= e_jump_target;
            // e_rd == 0 means "no destination" (register 0 or write disabled)
            if(|e_rd)
              regs[e_rd] <= e_d;
            // one instruction retires for every valid WRITE-stage entry
            instret <= instret + 1;
         end
      end
   end
endmodule
 


我的主要问题是:


我是否犯了任何菜鸟错误或犯了Verilog风格的重大罪行?
有什么方法可以在不使代码过于复杂的情况下提高最大时钟/总体性能?
在值无关紧要的地方显式使用未定义的值(x),是否真的能帮助综合工具生成更少的逻辑?(一个示例是ALU中的default分支。)


评论

您正在考虑使用哪种FPGA?如果有FPGA的话,通常会捆绑有很多很好的Verilog和VHDL示例。 AFAIK还可以绘制这些电路,并在支持Verilog的Quartus II等CAD / CAM程序中编译该图。

#1 楼



我是否犯了任何菜鸟错误或犯了Verilog风格的重大罪行?——总体看起来不错(假设代码是用于综合的)。代码使用了ANSI风格的端口声明和@*,Verilog-2001语法很干净。
我能发现的唯一潜在错误(没有构建测试平台)是:f_pc、regs、e_* 以及大多数 d_* 寄存器没有在复位分支中赋值。在FPGA上,它们通常会被初始化为0,但如果之后任何时候再次出现 reset,它们不会被复位。通常,带复位的触发器和不带复位的触发器应分别放在各自独立的 always 块中。
为了更容易发现意外遗漏的复位,Emacs有一个名为Verilog-mode的插件,可以使用/*AUTORESET*/自动生成复位赋值,并且还有其他扩展功能。Vim可以通过包装脚本来使用它。
我建议确保所有数字字面量都具有明确的宽度和基数(例如,`ALU_* 的值应以 4'd 开头;cycle <= cycle + 1'b1;;pc <= pc + 4'd4;)。这不会改变任何行为,但可以减少警告(尤其是在lint工具中)。


有什么方法可以在不使代码过于复杂的情况下提高最大时钟频率/总体性能?

可以通过时序报告了解瓶颈所在。
如果瓶颈与对多路复用器进行解码有关,请考虑采用单热并行解码。这将需要更多的门,但可以节省时序。
如果瓶颈与繁重的计算有关,则可以考虑将某些逻辑移到更早的阶段;准备好数据,即使它会被忽略。这也将占用更多的门。这也可能会使代码变得比预期的要复杂,但是如果需要,则需要它。
存在一个收益递减的临界点,超过之后更多的调整反而有害。添加过多的逻辑会使布线更具挑战性,这也会影响时序/性能。而且,如果设计变得太大,FPGA将容纳不下。综合报告应该能为此提供一些线索。


在值无关紧要的地方显式使用未定义的值(x),是否真的能帮助综合工具生成更少的逻辑?(一个例子就是ALU中的default分支。)——有时可能会,但根据我的经验,这样做带来的麻烦多于好处。当X在仿真中传播时,它在条件语句中会被判定为假;而硬件中没有X,它只会是1或0,因此在条件判断时,硬件可能走与仿真不同的分支。有一些X传播仿真工具/附加组件/插件可以提供帮助,但它们需要付费。
如果测试平台很可靠,则可以使用X-prop替代方法(例如:d = `ifdef SYNTHESIS 32'dx `else $random(...) `endif ;)。
将其赋为一个已知值通常不会产生负面影响,并且会使调试容易一些。为了保持简单,可以将FETCH、DECODE、EXECUTE和WRITE的算法逻辑移到组合逻辑块中,这样可以把当前状态值和下一状态值分开。这在一定程度上是个人选择,也取决于教你的人(以及老师的老师)的观点。Cliff Cummings的这篇论文(以及他的其他论文)对我和我的许多同事的编码风格产生了重大影响。
如果FPGA工具支持,请考虑启用SystemVerilog。使用一个package,并用枚举替换宏(在较大的项目中宏容易发生名称冲突,特别是在使用其他人的代码时)。通过 always_ff 和 always_comb 可以更明确地表达意图。可以使用结构体和联合体来简化 decoder 的一部分。