支持除 jalr、与内存有关的指令(l*、l*u、s*、fence 和 fence.i)以及系统调用(sbreak 和 scall)之外的所有指令。流水线阶段大体上是去掉了存储访问阶段的经典RISC流水线阶段(即:取指令、解码并获取操作数、计算结果、写回结果)。
我的最终目标是得到一个性能还不错的简单CPU,以便在FPGA上综合(我希望最终能达到150-200MHz)。这是我尝试的第一个大型硬件设计项目,因此,我很确定自己犯了很多初学者错误。
// ---------------------------------------------------------------------------
// ALU operation selectors.
// These are carried on 4-bit signals (decoder.op_alu, alu.operation), so they
// are given an explicit 4-bit width instead of being unsized 32-bit integers.
// ---------------------------------------------------------------------------
`define ALU_ADD  4'd0
`define ALU_SUB  4'd1
`define ALU_AND  4'd2
`define ALU_OR   4'd3
`define ALU_XOR  4'd4
`define ALU_SLL  4'd5
`define ALU_SRL  4'd6
`define ALU_SRA  4'd7
`define ALU_SEQ  4'd8
`define ALU_SNE  4'd9
`define ALU_SLT  4'd10
`define ALU_SGE  4'd11
`define ALU_SLTU 4'd12
`define ALU_SGEU 4'd13

// ---------------------------------------------------------------------------
// Major opcodes, i.e. the value of insn[6:0].
// ---------------------------------------------------------------------------
`define OPCODE_OP     7'b0110011
`define OPCODE_OP_IMM 7'b0010011
`define OPCODE_LUI    7'b0110111
`define OPCODE_AUIPC  7'b0010111
`define OPCODE_JAL    7'b1101111
`define OPCODE_JALR   7'b1100111
`define OPCODE_BRANCH 7'b1100011
`define OPCODE_SYSTEM 7'b1110011

// ---------------------------------------------------------------------------
// funct3 (insn[14:12]) values for OP / OP_IMM instructions.
// ADD/SUB and SRL/SRA share a funct3 value; the decoder tells them apart
// using funct7[5].
// ---------------------------------------------------------------------------
`define FUNCT3_ADD_SUB 3'b000
`define FUNCT3_SLL     3'b001
`define FUNCT3_SLT     3'b010
`define FUNCT3_SLTU    3'b011
`define FUNCT3_XOR     3'b100
`define FUNCT3_SRL_SRA 3'b101
`define FUNCT3_OR      3'b110
`define FUNCT3_AND     3'b111

// ---------------------------------------------------------------------------
// funct3 (insn[14:12]) values for BRANCH instructions.
// 3'b010 and 3'b011 have no name here.
// ---------------------------------------------------------------------------
`define FUNCT3_BEQ  3'b000
`define FUNCT3_BNE  3'b001
`define FUNCT3_BLT  3'b100
`define FUNCT3_BGE  3'b101
`define FUNCT3_BLTU 3'b110
`define FUNCT3_BGEU 3'b111

// ---------------------------------------------------------------------------
// Counter-read SYSTEM instructions. Each value is matched against the full
// upper 20 bits of the instruction word, insn[31:12].
// ---------------------------------------------------------------------------
`define SYSTEM_RDCYCLE    20'b11000000000000000010
`define SYSTEM_RDCYCLEH   20'b11001000000000000010
`define SYSTEM_RDTIME     20'b11000000000100000010
`define SYSTEM_RDTIMEH    20'b11001000000100000010
`define SYSTEM_RDINSTRET  20'b11000000001000000010
`define SYSTEM_RDINSTRETH 20'b11001000001000000010
// Purely combinational 32-bit ALU.
//
//   operation : one of the `ALU_* selectors
//   s1, s2    : operands
//   d         : result
//
// Shifts use only the low five bits of s2 as the shift amount. The set-on-
// comparison operations (SEQ/SNE/SLT/SGE/SLTU/SGEU) yield 1 or 0 in d.
// Any selector that is not listed produces 0.
module alu (input      [3:0]  operation,
            input      [31:0] s1,
            input      [31:0] s2,
            output reg [31:0] d
           );

    // Shift distance, taken from the low bits of the second operand.
    wire        [4:0]  shift_amount = s2[4:0];

    // Signed views of the operands for arithmetic shift / signed compare.
    wire signed [31:0] s1_signed = s1;
    wire signed [31:0] s2_signed = s2;

    // Comparison results shared by the set-on-condition operations.
    wire operands_equal = (s1 == s2);
    wire less_signed    = (s1_signed < s2_signed);
    wire less_unsigned  = (s1 < s2);

    always @ * begin
        // Result for selectors that match none of the arms below.
        d = 32'b0;

        case (operation)
            // arithmetic
            `ALU_ADD:  d = s1 + s2;
            `ALU_SUB:  d = s1 - s2;

            // bitwise logic
            `ALU_AND:  d = s1 & s2;
            `ALU_OR:   d = s1 | s2;
            `ALU_XOR:  d = s1 ^ s2;

            // shifts
            `ALU_SLL:  d = s1 << shift_amount;
            `ALU_SRL:  d = s1 >> shift_amount;
            `ALU_SRA:  d = s1_signed >>> shift_amount;

            // comparisons: result is zero-extended to 32 bits
            `ALU_SEQ:  d = {31'b0,  operands_equal};
            `ALU_SNE:  d = {31'b0, !operands_equal};
            `ALU_SLT:  d = {31'b0,  less_signed};
            `ALU_SGE:  d = {31'b0, !less_signed};
            `ALU_SLTU: d = {31'b0,  less_unsigned};
            `ALU_SGEU: d = {31'b0, !less_unsigned};
        endcase
    end

endmodule
// Combinational instruction decoder.
//
// Inputs
//   insn        : instruction word to decode
//   pc          : address of that instruction (used for AUIPC, JAL and
//                 jump_target)
//   cycle       : value returned by RDCYCLE[H] and RDTIME[H]
//   instret     : value returned by RDINSTRET[H]
//
// Outputs
//   rd          : destination register, forced to 0 when the instruction must
//                 not write a register
//   rs1, rs2    : source register fields (always insn[19:15] / insn[24:20])
//   s1_is_imm,
//   s2_is_imm   : when set, the corresponding ALU operand is s*_imm instead of
//                 the register named by rs*
//   s1_imm,
//   s2_imm      : immediate operand values
//   op_alu      : `ALU_* selector for the execute stage
//   is_jump     : instruction may redirect the PC (JAL or a branch)
//   is_branch   : instruction is a conditional branch; bit 0 of the ALU result
//                 is the "taken" flag
//   jump_target : pc + branch offset when is_branch, else pc + JAL offset
//
// Anything this decoder does not recognise -- an opcode with no arm below
// (e.g. `OPCODE_JALR, which is defined but not implemented), a SYSTEM word
// other than the six counter reads, or a branch with an unnamed funct3 --
// is decoded as a no-op: rd is forced to 0 and no jump is signalled. Without
// this, such instructions would execute as "ADD rd, rs1, rs2" with rd taken
// from insn[11:7] and silently overwrite a register.
//
// NOTE(review): funct7 is only inspected via bit 5 (ADD vs SUB, SRL vs SRA);
// other funct7 values under OPCODE_OP are not rejected.
module decoder(input      [31:0] insn,
               input      [31:0] pc,
               input      [63:0] cycle,
               input      [63:0] instret,
               output     [4:0]  rd,
               output reg        s1_is_imm,
               output     [4:0]  rs1,
               output reg [31:0] s1_imm,
               output reg        s2_is_imm,
               output     [4:0]  rs2,
               output reg [31:0] s2_imm,
               output reg [3:0]  op_alu,
               output reg        is_jump,
               output reg        is_branch,
               output     [31:0] jump_target
              );

    // Instruction fields.
    wire [6:0] opcode = insn[ 6: 0];
    wire [2:0] funct3 = insn[14:12];
    wire [6:0] funct7 = insn[31:25];

    // Immediate formats, each assembled to 32 bits.
    wire [31:0] imm12  = {{21{insn[31]}}, insn[30:20]};                                   // sign-extended insn[31:20]
    wire [31:0] imm20  = {insn[31:12], 12'b0};                                            // upper immediate
    wire [31:0] imm12b = {{20{insn[31]}}, insn[7], insn[30:25], insn[11:8], 1'b0};        // branch offset
    wire [31:0] imm20j = {{12{insn[31]}}, insn[19:12], insn[20], insn[30:21], 1'b0};      // JAL offset

    // Set for instructions that must not write a destination register.
    reg rd_write_disable;

    assign rd  = rd_write_disable ? 5'd0 : insn[11:7];
    assign rs2 = insn[24:20];
    assign rs1 = insn[19:15];

    assign jump_target = pc + (is_branch ? imm12b : imm20j);

    always @ * begin
        // Defaults: register-register ADD, writes rd, no jump.
        rd_write_disable = 1'b0;
        s1_imm           = 32'd0;
        s1_is_imm        = 1'b0;
        s2_imm           = 32'd0;
        s2_is_imm        = 1'b0;
        is_jump          = 1'b0;
        is_branch        = 1'b0;
        op_alu           = `ALU_ADD;

        case (opcode)
            `OPCODE_OP: begin
                // All eight funct3 values are covered, so no default is needed.
                case (funct3)
                    `FUNCT3_ADD_SUB: op_alu = funct7[5] ? `ALU_SUB : `ALU_ADD;
                    `FUNCT3_SLL:     op_alu = `ALU_SLL;
                    `FUNCT3_SLT:     op_alu = `ALU_SLT;
                    `FUNCT3_SLTU:    op_alu = `ALU_SLTU;
                    `FUNCT3_XOR:     op_alu = `ALU_XOR;
                    `FUNCT3_SRL_SRA: op_alu = funct7[5] ? `ALU_SRA : `ALU_SRL;
                    `FUNCT3_OR:      op_alu = `ALU_OR;
                    `FUNCT3_AND:     op_alu = `ALU_AND;
                endcase
            end

            `OPCODE_OP_IMM: begin
                s2_imm    = imm12;
                s2_is_imm = 1'b1;
                // All eight funct3 values are covered. There is no immediate
                // subtract, so funct3 000 is always ADD. For shifts the ALU
                // only uses s2[4:0], so the upper immediate bits are ignored.
                case (funct3)
                    `FUNCT3_ADD_SUB: op_alu = `ALU_ADD;
                    `FUNCT3_SLT:     op_alu = `ALU_SLT;
                    `FUNCT3_SLTU:    op_alu = `ALU_SLTU;
                    `FUNCT3_XOR:     op_alu = `ALU_XOR;
                    `FUNCT3_OR:      op_alu = `ALU_OR;
                    `FUNCT3_AND:     op_alu = `ALU_AND;
                    `FUNCT3_SLL:     op_alu = `ALU_SLL;
                    `FUNCT3_SRL_SRA: op_alu = funct7[5] ? `ALU_SRA : `ALU_SRL;
                endcase
            end

            `OPCODE_LUI: begin
                // rd = imm20 + 0
                s1_imm    = imm20;
                s1_is_imm = 1'b1;
                s2_is_imm = 1'b1;
            end

            `OPCODE_AUIPC: begin
                // rd = imm20 + pc
                s1_imm    = imm20;
                s1_is_imm = 1'b1;
                s2_imm    = pc;
                s2_is_imm = 1'b1;
            end

            `OPCODE_JAL: begin
                // rd = pc + 4 (link value); the jump itself goes to jump_target.
                s1_is_imm = 1'b1;
                s1_imm    = pc;
                s2_is_imm = 1'b1;
                s2_imm    = 32'd4;
                is_jump   = 1'b1;
            end

            // `OPCODE_JALR: TODO (currently handled by the default arm => no-op)

            `OPCODE_BRANCH: begin
                // The ALU compares rs1 with rs2; the result is the taken flag.
                is_jump          = 1'b1;
                is_branch        = 1'b1;
                rd_write_disable = 1'b1;
                case (funct3)
                    `FUNCT3_BEQ:  op_alu = `ALU_SEQ;
                    `FUNCT3_BNE:  op_alu = `ALU_SNE;
                    `FUNCT3_BLT:  op_alu = `ALU_SLT;
                    `FUNCT3_BGE:  op_alu = `ALU_SGE;
                    `FUNCT3_BLTU: op_alu = `ALU_SLTU;
                    `FUNCT3_BGEU: op_alu = `ALU_SGEU;
                    default: begin
                        // Unnamed funct3: never branch (rd is already disabled),
                        // instead of branching on bit 0 of rs1 + rs2.
                        is_jump   = 1'b0;
                        is_branch = 1'b0;
                    end
                endcase
            end

            `OPCODE_SYSTEM: begin
                // Counter reads: rd = counter half + 0.
                s1_is_imm = 1'b1;
                s2_is_imm = 1'b1;
                case (insn[31:12])
                    `SYSTEM_RDCYCLE:    s1_imm = cycle[31:0];
                    `SYSTEM_RDCYCLEH:   s1_imm = cycle[63:32];
                    // RDTIME/RDTIMEH return the cycle counter as well.
                    `SYSTEM_RDTIME:     s1_imm = cycle[31:0];
                    `SYSTEM_RDTIMEH:    s1_imm = cycle[63:32];
                    `SYSTEM_RDINSTRET:  s1_imm = instret[31:0];
                    `SYSTEM_RDINSTRETH: s1_imm = instret[63:32];
                    // Any other SYSTEM word: do not write 0 into rd.
                    default:            rd_write_disable = 1'b1;
                endcase
            end

            // Opcode with no arm above: execute as a no-op rather than as
            // an ADD that overwrites the register named by insn[11:7].
            default: rd_write_disable = 1'b1;
        endcase
    end

endmodule
// Four-stage in-order pipeline: FETCH -> DECODE -> EXECUTE -> WRITE.
//
// Ports
//   clk       : clock; all state updates on the rising edge
//   reset     : synchronous reset; clears pc, the three valid bits and both
//               counters (data registers and the register file are NOT reset)
//   ibus_addr : instruction fetch address, always equal to pc
//   ibus_data : instruction word captured into f_insn in the same cycle
//               (assumes the instruction memory answers combinationally for
//               the address on ibus_addr -- TODO confirm against the memory
//               model)
//
// Hazard handling
//   * Data hazards: there is no forwarding path. The decode stage stalls
//     while a valid instruction in the D->E or E->W registers is going to
//     write a register that the instruction being decoded reads.
//   * Control hazards: a jump/branch is resolved when it sits in the E->W
//     registers. If taken, the fetch in progress and the instructions in the
//     F->D and D->E registers are invalidated and pc is redirected.
module ezpipe (input clk,
input reset,
output [31:0] ibus_addr,
input [31:0] ibus_data
// output reg [31:0] dbus_addr,
// output reg [31:0] dbus_data_wr,
// input [31:0] dbus_data_rd,
// input [31:0] dbus_data_ready,
// output reg dbus_rd,
// output reg dbus_wr
);
/* registers and counters */
// Register file holds entries 1..31 only; register 0 is never stored and
// reads of it are forced to 0 where the operands are fetched.
reg [31:0] regs [1:31];
reg [31:0] pc;
// cycle increments on every non-reset clock; instret on every valid
// instruction that reaches the WRITE stage.
reg [63:0] cycle;
reg [63:0] instret;
// There is no counter for RDTIME/RDTIMEH, those instructions just use the cycle register.
/* pipeline registers */
// Each stage boundary carries a *_valid bit; a cleared bit marks a bubble
// whose other fields must be ignored.
// from FETCH to DECODE
reg [31:0] f_insn;
reg [31:0] f_pc;
reg f_valid;
// from DECODE to EXECUTE
reg [31:0] d_s1;
reg [31:0] d_s2;
reg [3:0] d_op_alu;
reg [4:0] d_rd;
reg d_is_jump;
reg d_is_branch;
reg [31:0] d_jump_target;
reg d_valid;
// from EXECUTE to WRITE
reg [4:0] e_rd;
reg [31:0] e_d;
reg e_is_jump;
reg e_is_branch;
reg [31:0] e_jump_target;
reg e_valid;
/* instances */
// Decoder outputs for the instruction currently held in f_insn/f_pc.
wire [4:0] dec_rd;
wire [4:0] dec_rs1;
wire [31:0] dec_s1_imm;
wire dec_s1_is_imm;
wire [4:0] dec_rs2;
wire [31:0] dec_s2_imm;
wire dec_s2_is_imm;
wire [3:0] dec_op_alu;
wire dec_is_jump;
wire dec_is_branch;
wire [31:0] dec_jump_target;
decoder dec(.pc(f_pc),
.insn(f_insn),
.cycle(cycle),
.instret(instret),
.op_alu(dec_op_alu),
.rd(dec_rd),
.rs1(dec_rs1),
.s1_is_imm(dec_s1_is_imm),
.s1_imm(dec_s1_imm),
.rs2(dec_rs2),
.s2_is_imm(dec_s2_is_imm),
.s2_imm(dec_s2_imm),
.is_jump(dec_is_jump),
.is_branch(dec_is_branch),
.jump_target(dec_jump_target)
);
// The ALU operates on the operands latched in the D->E registers.
wire [31:0] alu_d;
alu alu(.s1(d_s1),
.s2(d_s2),
.operation(d_op_alu),
.d(alu_d)
);
assign ibus_addr = pc;
/* the actual pipeline */
// Combinational hazard signals:
//   stall   - the instruction in decode must wait for an older register write
//   jumping - a taken jump/branch is in the E->W registers this cycle
reg jumping;
reg stall;
always @ * begin
// does the decoded instruction depend on a instruction in the d_* or e_* registers?
// A source only counts if it is a real register read: not register 0 and
// not replaced by an immediate. A destination only counts if the older
// instruction is valid and its rd is non-zero.
stall = 0;
if(d_valid && |d_rd) begin
if(|dec_rs1 && !dec_s1_is_imm && dec_rs1==d_rd)
stall = 1;
if(|dec_rs2 && !dec_s2_is_imm && dec_rs2==d_rd)
stall = 1;
end
// The E->W result is written to regs at this clock edge, so it is not yet
// readable by the operand fetch happening at the same edge.
if(e_valid && |e_rd) begin
if(|dec_rs1 && !dec_s1_is_imm && dec_rs1==e_rd)
stall = 1;
if(|dec_rs2 && !dec_s2_is_imm && dec_rs2==e_rd)
stall = 1;
end
// is there a taken branch/jump sitting in the e_* registers?
// For a branch, bit 0 of the ALU comparison result is the taken flag;
// a non-branch jump is always taken.
jumping = 0;
if(e_valid)
if(e_is_jump) begin
if(e_is_branch)
jumping = e_d[0];
else
jumping = 1;
end
end
always @(posedge clk) begin
if(reset) begin
// Only control state is cleared; f_insn, f_pc, d_*, e_* data fields and
// regs keep their previous contents and are masked by the valid bits.
pc <= 0;
f_valid <= 0;
d_valid <= 0;
e_valid <= 0;
cycle <= 0;
instret <= 0;
end else begin
cycle <= cycle + 1;
/* FETCH */
// The word fetched during a taken jump is on the wrong path, so it is
// marked invalid. Note f_valid is updated even while stalled.
f_valid <= !jumping;
if(!stall) begin
f_insn <= ibus_data;
f_pc <= pc;
// Sequential next pc; overridden further down in WRITE when jumping.
pc <= pc + 4;
end else begin
// don't fetch a new instruction when we can't complete the one in the D stage
end
/* DECODE */
if(!stall) begin
// fetch operands
// Each operand is the decoder's immediate, or the register file entry,
// with register 0 reading as zero.
if(dec_s1_is_imm) d_s1 <= dec_s1_imm;
else d_s1 <= |dec_rs1 ? regs[dec_rs1] : 0;
if(dec_s2_is_imm) d_s2 <= dec_s2_imm;
else d_s2 <= |dec_rs2 ? regs[dec_rs2] : 0;
// store decoded instruction
d_rd <= dec_rd;
d_op_alu <= dec_op_alu;
d_jump_target <= dec_jump_target;
d_is_branch <= dec_is_branch;
d_is_jump <= dec_is_jump;
// Squash the instruction if it was fetched behind a taken jump.
d_valid <= f_valid && !jumping;
end else begin
// can't issue this instruction yet; send a bubble down the pipeline
d_valid <= 0;
end
/* EXECUTE */
// This stage never stalls; it advances every cycle.
// store ALU result
e_d <= alu_d;
// send remaining info down the pipeline
e_rd <= d_rd;
e_is_jump <= d_is_jump;
e_is_branch <= d_is_branch;
e_jump_target <= d_jump_target;
// Squash the instruction if it sits behind a taken jump.
e_valid <= d_valid && !jumping;
/* WRITE */
if(e_valid) begin
// This assignment comes after "pc <= pc + 4" in the same block, so it
// takes precedence when both execute: the last nonblocking assignment
// to a register wins. Do not reorder FETCH and WRITE.
if(jumping)
pc <= e_jump_target;
// Register 0 is never written (regs has no entry 0).
if(|e_rd)
regs[e_rd] <= e_d;
instret <= instret + 1;
end
end
end
endmodule
我的主要问题是:
我是否犯了任何菜鸟错误或犯了Verilog风格的重大罪行?
有什么方法可以在不使代码过于复杂的情况下提高最大时钟/总体性能?
在这些值无关紧要的地方显式使用未定义的值(x),是否真的能帮助综合工具生成更少的逻辑?(一个示例是ALU中的默认情况。)#1 楼
我是否犯过任何菜鸟错误或犯了Verilog风格的重大错误?进行合成)。使用ANSI样式标头和
@*
的Verilog-2001语法很干净。我能发现的唯一潜在错误(我没有构建测试平台)是:f_pc、regs、e_* 以及大多数 d_* 寄存器没有在复位时被赋值。在FPGA上,它们通常会被初始化为0,但是如果之后任何时候再次出现reset
,则不会将其复位。通常,带有重置的触发器和没有重置的触发器会分配在单独的Always块中。为了使意外丢失的重置更容易一些,Emacs有一个名为Verilog-mode的插件,可以使用
/*AUTORESET*/
生成重置分配;以及其他扩展功能。 Vim可以将其用于包装脚本; 我建议确保所有数字文字都具有明确的宽度和基数(例如,
`ALU_*
值应以4'd
,cycle <= cycle + 1'b1;
,pc <= pc + 4'd4
开头)。它不会改变任何东西,但是可以减少警告(尤其是在lint工具中)。有什么方法可以提高最大时钟/总体性能,而又不会使代码过于复杂? br />
可以通过时序报告了解瓶颈所在。
如果瓶颈与对多路复用器进行解码有关,请考虑采用单热并行解码。这将需要更多的门,但可以节省时序。
如果瓶颈与繁重的计算有关,则可以考虑将某些逻辑移到更早的阶段;准备好数据,即使它会被忽略。这也将占用更多的门。这也可能会使代码变得比预期的要复杂,但是如果需要,则需要它。
到了某个程度之后收益就会递减,更多的调整反而会适得其反。添加过多的逻辑会使布线更具挑战性,这也会影响时序/性能。而且,如果设计变得过大,它将无法放入FPGA。综合报告应该能为此提供一些线索。
在那些值无关紧要的地方显式使用未定义的值(x),是否真的能帮助综合工具生成更少的逻辑?(一个例子就是ALU中的默认情况。)有时可能会,但是根据我的经验,这样做带来的挑战多于好处。当X在仿真中传播时,它在条件语句中会被评估为false。而硬件中并不存在X,它会被当作1或0,因此在任何条件判断中,硬件都可能走与仿真不同的分支。有一些X传播仿真工具/附加组件/插件可以提供帮助,但它们是要花钱的。
如果测试平台很可靠,则可以使用X-prop替代方法(例如:
d = `ifdef SYNTHESIS 32'dx `else $random(...) `endif ;
)。将其分配给一个已知值通常不会产生负面影响,并且会使调试变得容易一些。简单,将FETCH,DECODE,EXECUTE和WRITE的算法逻辑移动到组合块中。这将分隔当前状态和下一状态值。这有点是个人选择,是您所教的人(以及老师的老师)的观点。 Cliff Cummings的这篇论文(以及其他论文)对我的编码风格和我的许多同事产生了重大影响。
如果FPGA支持,请考虑启用SystemVerilog。使用一个程序包,并用枚举替换宏(宏与较大的项目会发生名称冲突,特别是在使用其他人的代码时)。通过
always_ff
和always_comb
可以更加明确的意图。可以使用结构和联合来简化decoder
的一部分。
评论
您正在考虑使用哪种FPGA?如果有FPGA的话,通常会捆绑有很多很好的Verilog和VHDL示例。 AFAIK还可以绘制这些电路,并在支持Verilog的Quartus II等CAD / CAM程序中编译该图。