py/asmrv32: Make some code sequences smaller.

This commit changes a few code sequences to make wider use of compressed
opcodes where possible.  The sequences in question are the ones that show
up the most in the test suite and require the fewest code changes, namely:
short-offset loads from memory into RET/ARG registers, indirect calls
through the function table, register-based jumps, locals' offset
calculation, reg-is-null jumps, and register comparisons.

These changes neither speed up nor slow down the generated code, but they
reduce generated code size by 15-20% on average.

Signed-off-by: Alessandro Gatti <a.gatti@frob.it>
Alessandro Gatti 2024-06-25 15:45:13 +02:00 committed by Damien George
parent 0e261443be
commit 0600e4f273
2 changed files with 105 additions and 23 deletions
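
The saving comes from the RV32C extension: compressed opcodes are 16 bits
wide instead of 32, but most compressed loads, stores, and branches can only
encode registers x8-x15 (the "C register window") and small, scaled
immediates.  The emitter therefore checks those constraints and falls back
to the full-size encodings when they are not met, which is what the
IS_IN_C_REGISTER_WINDOW/MAP_IN_C_REGISTER_WINDOW macros in the diff below
are for.  A minimal standalone sketch of that selection pattern follows; it
is not the emitter's actual API, and in_c_window plus the printf
placeholders are hypothetical stand-ins for real instruction encoding:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Compressed loads/stores/branches only have 3-bit register fields,
// which can name x8-x15 exclusively.
static bool in_c_window(uint32_t reg) {
    return reg >= 8 && reg <= 15;
}

// Pick C.LW (2 bytes) when both registers and the offset fit the
// compressed form, otherwise fall back to LW (4 bytes).
static void emit_load_word(uint32_t rd, uint32_t rs, int32_t offset) {
    if (in_c_window(rd) && in_c_window(rs)
        && offset >= 0 && offset < 128 && (offset % 4) == 0) {
        printf("c.lw x%u, %d(x%u)  ; 2 bytes\n", (unsigned)rd, (int)offset, (unsigned)rs);
    } else {
        printf("lw x%u, %d(x%u)  ; 4 bytes\n", (unsigned)rd, (int)offset, (unsigned)rs);
    }
}

int main(void) {
    emit_load_word(10, 9, 16);   // a0/s1: both in the window -> c.lw, 2 bytes
    emit_load_word(10, 18, 16);  // s2 is outside the window  -> lw, 4 bytes
    return 0;
}

Each sequence that passes such checks shrinks by two bytes per opcode,
which is where the size reduction quoted above accumulates.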

py/asmrv32.c

@@ -64,9 +64,14 @@ static uint32_t fallback_popcount(uint32_t value) {
#endif
#endif
-#define INTERNAL_TEMPORARY ASM_RV32_REG_T4
+#define INTERNAL_TEMPORARY ASM_RV32_REG_S0
#define AVAILABLE_REGISTERS_COUNT 32
+#define IS_IN_C_REGISTER_WINDOW(register_number) \
+(((register_number) >= ASM_RV32_REG_X8) && ((register_number) <= ASM_RV32_REG_X15))
+#define MAP_IN_C_REGISTER_WINDOW(register_number) \
+((register_number) - ASM_RV32_REG_X8)
#define FIT_UNSIGNED(value, bits) (((value) & ~((1U << (bits)) - 1)) == 0)
#define FIT_SIGNED(value, bits) \
((((value) & ~((1U << ((bits) - 1)) - 1)) == 0) || \
@@ -269,7 +274,7 @@ static void emit_function_epilogue(asm_rv32_t *state, mp_uint_t registers) {
void asm_rv32_entry(asm_rv32_t *state, mp_uint_t locals) {
state->saved_registers_mask |= (1U << REG_FUN_TABLE) | (1U << REG_LOCAL_1) | \
-(1U << REG_LOCAL_2) | (1U << REG_LOCAL_3);
+(1U << REG_LOCAL_2) | (1U << REG_LOCAL_3) | (1U << INTERNAL_TEMPORARY);
state->locals_count = locals;
emit_function_prologue(state, state->saved_registers_mask);
}
@@ -288,6 +293,14 @@ void asm_rv32_emit_call_ind(asm_rv32_t *state, mp_uint_t index) {
mp_uint_t offset = index * ASM_WORD_SIZE;
state->saved_registers_mask |= (1U << ASM_RV32_REG_RA);
+if (IS_IN_C_REGISTER_WINDOW(REG_FUN_TABLE) && IS_IN_C_REGISTER_WINDOW(INTERNAL_TEMPORARY) && FIT_SIGNED(offset, 7)) {
+// c.lw temporary, offset(fun_table)
+// c.jalr temporary
+asm_rv32_opcode_clw(state, MAP_IN_C_REGISTER_WINDOW(INTERNAL_TEMPORARY), MAP_IN_C_REGISTER_WINDOW(REG_FUN_TABLE), offset);
+asm_rv32_opcode_cjalr(state, INTERNAL_TEMPORARY);
+return;
+}
if (FIT_UNSIGNED(offset, 11)) {
// lw temporary, offset(fun_table)
// c.jalr temporary
@@ -343,6 +356,12 @@ void asm_rv32_emit_jump_if_reg_eq(asm_rv32_t *state, mp_uint_t rs1, mp_uint_t rs
void asm_rv32_emit_jump_if_reg_nonzero(asm_rv32_t *state, mp_uint_t rs, mp_uint_t label) {
ptrdiff_t displacement = (ptrdiff_t)(state->base.label_offsets[label] - state->base.code_offset);
+if (FIT_SIGNED(displacement, 9) && IS_IN_C_REGISTER_WINDOW(rs)) {
+// c.bnez rs', displacement
+asm_rv32_opcode_cbnez(state, MAP_IN_C_REGISTER_WINDOW(rs), displacement);
+return;
+}
// The least significant bit is ignored anyway.
if (FIT_SIGNED(displacement, 13)) {
// bne rs, zero, displacement
@@ -350,8 +369,8 @@ void asm_rv32_emit_jump_if_reg_nonzero(asm_rv32_t *state, mp_uint_t rs, mp_uint_
return;
}
-// Compensate for the initial BEQ opcode.
-displacement -= ASM_WORD_SIZE;
+// Compensate for the initial C.BEQZ/BEQ opcode.
+displacement -= IS_IN_C_REGISTER_WINDOW(rs) ? ASM_HALFWORD_SIZE : ASM_WORD_SIZE;
mp_uint_t upper = 0;
mp_uint_t lower = 0;
@@ -359,11 +378,21 @@ void asm_rv32_emit_jump_if_reg_nonzero(asm_rv32_t *state, mp_uint_t rs, mp_uint_
// TODO: Can this clobber REG_TEMP[0:2]?
-// beq rs1, zero, 12 ; PC + 0
-// auipc temporary, HI(displacement) ; PC + 4
-// jalr zero, temporary, LO(displacement) ; PC + 8
-// ... ; PC + 12
-asm_rv32_opcode_beq(state, rs, ASM_RV32_REG_ZERO, 12);
+// if rs1 in C window (the offset always fits):
+// c.beqz rs', 10 ; PC + 0
+// auipc temporary, HI(displacement) ; PC + 2
+// jalr zero, temporary, LO(displacement) ; PC + 6
+// ... ; PC + 10
+// else:
+// beq rs, zero, 12 ; PC + 0
+// auipc temporary, HI(displacement) ; PC + 4
+// jalr zero, temporary, LO(displacement) ; PC + 8
+// ... ; PC + 12
+if (IS_IN_C_REGISTER_WINDOW(rs)) {
+asm_rv32_opcode_cbeqz(state, MAP_IN_C_REGISTER_WINDOW(rs), 10);
+} else {
+asm_rv32_opcode_beq(state, rs, ASM_RV32_REG_ZERO, 12);
+}
asm_rv32_opcode_auipc(state, INTERNAL_TEMPORARY, upper);
asm_rv32_opcode_jalr(state, ASM_RV32_REG_ZERO, INTERNAL_TEMPORARY, lower);
}
@@ -427,7 +456,13 @@ void asm_rv32_emit_mov_reg_local(asm_rv32_t *state, mp_uint_t rd, mp_uint_t loca
void asm_rv32_emit_mov_reg_local_addr(asm_rv32_t *state, mp_uint_t rd, mp_uint_t local) {
mp_uint_t offset = state->locals_stack_offset + (local * ASM_WORD_SIZE);
-if (FIT_SIGNED(offset, 11)) {
+if (FIT_UNSIGNED(offset, 10) && offset != 0 && IS_IN_C_REGISTER_WINDOW(rd)) {
+// c.addi4spn rd', offset
+asm_rv32_opcode_caddi4spn(state, MAP_IN_C_REGISTER_WINDOW(rd), offset);
+return;
+}
+if (FIT_UNSIGNED(offset, 11)) {
// addi rd, sp, offset
asm_rv32_opcode_addi(state, rd, ASM_RV32_REG_SP, offset);
return;
@@ -442,6 +477,12 @@ void asm_rv32_emit_mov_reg_local_addr(asm_rv32_t *state, mp_uint_t rd, mp_uint_t
void asm_rv32_emit_load_reg_reg_offset(asm_rv32_t *state, mp_uint_t rd, mp_uint_t rs, mp_int_t offset) {
mp_int_t scaled_offset = offset * sizeof(ASM_WORD_SIZE);
+if (IS_IN_C_REGISTER_WINDOW(rd) && IS_IN_C_REGISTER_WINDOW(rs) && FIT_SIGNED(offset, 7)) {
+// c.lw rd', offset(rs')
+asm_rv32_opcode_clw(state, MAP_IN_C_REGISTER_WINDOW(rd), MAP_IN_C_REGISTER_WINDOW(rs), scaled_offset);
+return;
+}
if (FIT_SIGNED(scaled_offset, 12)) {
// lw rd, offset(rs)
asm_rv32_opcode_lw(state, rd, rs, scaled_offset);
@@ -554,12 +595,12 @@ void asm_rv32_emit_optimised_xor(asm_rv32_t *state, mp_uint_t rd, mp_uint_t rs)
void asm_rv32_meta_comparison_eq(asm_rv32_t *state, mp_uint_t rs1, mp_uint_t rs2, mp_uint_t rd) {
// c.li rd, 1 ;
-// beq rs1, rs2, 8 ; PC + 0
-// addi rd, zero, 0 ; PC + 4
-// ... ; PC + 8
+// beq rs1, rs2, 6 ; PC + 0
+// c.li rd, 0 ; PC + 4
+// ... ; PC + 6
asm_rv32_opcode_cli(state, rd, 1);
-asm_rv32_opcode_beq(state, rs1, rs2, 8);
-asm_rv32_opcode_addi(state, rd, ASM_RV32_REG_ZERO, 0);
+asm_rv32_opcode_beq(state, rs1, rs2, 6);
+asm_rv32_opcode_cli(state, rd, 0);
}
void asm_rv32_meta_comparison_ne(asm_rv32_t *state, mp_uint_t rs1, mp_uint_t rs2, mp_uint_t rd) {

py/asmrv32.h

@@ -151,10 +151,21 @@ void asm_rv32_end_pass(asm_rv32_t *state);
((op & 0b1111111) | ((rd & 0b11111) << 7) | \
(imm & 0b11111111111111111111000000000000))
+#define RV32_ENCODE_TYPE_CB(op, ft3, rs, imm) \
+((op & 0b11) | ((ft3 & 0b111) << 13) | ((rs & 0b111) << 7) | \
+(((imm) & 0b100000000) << 4) | (((imm) & 0b11000000) >> 1) | \
+(((imm) & 0b100000) >> 3) | (((imm) & 0b11000) << 7) | \
+(((imm) & 0b110) << 2))
#define RV32_ENCODE_TYPE_CI(op, ft3, rd, imm) \
((op & 0b11) | ((ft3 & 0b111) << 13) | ((rd & 0b11111) << 7) | \
(((imm) & 0b100000) << 7) | (((imm) & 0b11111) << 2))
+#define RV32_ENCODE_TYPE_CIW(op, ft3, rd, imm) \
+((op & 0b11) | ((ft3 & 0b111) << 13) | ((rd & 0b111) << 2) | \
+((imm & 0b1111000000) << 1) | ((imm & 0b110000) << 7) | \
+((imm & 0b1000) << 2) | ((imm & 0b100) << 4))
#define RV32_ENCODE_TYPE_CJ(op, ft3, imm) \
((op & 0b11) | ((ft3 & 0b111) << 13) | \
((imm & 0b1110) << 2) | ((imm & 0b1100000000) << 1) | \
@@ -162,12 +173,18 @@ void asm_rv32_end_pass(asm_rv32_t *state);
((imm & 0b10000000) >> 1) | ((imm & 0b1000000) << 1) | \
((imm & 0b100000) >> 3) | ((imm & 0b10000) << 7))
+#define RV32_ENCODE_TYPE_CL(op, ft3, rd, rs, imm) \
+((op & 0b11) | ((ft3 & 0b111) << 13) | ((rd & 0b111) << 2) | \
+((rs & 0b111) << 7) | ((imm & 0b1000000) >> 1) | \
+((imm & 0b111000) << 7) | ((imm & 0b100) << 4))
#define RV32_ENCODE_TYPE_CR(op, ft4, rs1, rs2) \
((op & 0b11) | ((rs2 & 0b11111) << 2) | ((rs1 & 0b11111) << 7) | \
((ft4 & 0b1111) << 12))
#define RV32_ENCODE_TYPE_CSS(op, ft3, rs, imm) \
-((op & 0b11) | ((ft3 & 0b111) << 13) | ((rs & 0b11111) << 2) | ((imm) & 0b111111) << 7)
+((op & 0b11) | ((ft3 & 0b111) << 13) | ((rs & 0b11111) << 2) | \
+((imm) & 0b111111) << 7)
void asm_rv32_emit_word_opcode(asm_rv32_t *state, mp_uint_t opcode);
void asm_rv32_emit_halfword_opcode(asm_rv32_t *state, mp_uint_t opcode);
@@ -220,10 +237,28 @@ static inline void asm_rv32_opcode_caddi(asm_rv32_t *state, mp_uint_t rd, mp_int
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CI(0b01, 0b000, rd, immediate));
}
+// C.ADDI4SPN RD', IMMEDIATE
+static inline void asm_rv32_opcode_caddi4spn(asm_rv32_t *state, mp_uint_t rd, mp_uint_t immediate) {
+// CIW: 000 ........ ... 00
+asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CIW(0b00, 0b000, rd, immediate));
+}
+// C.BEQZ RS', IMMEDIATE
+static inline void asm_rv32_opcode_cbeqz(asm_rv32_t *state, mp_uint_t rs, mp_int_t offset) {
+// CB: 110 ... ... ..... 01
+asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CB(0b01, 0b110, rs, offset));
+}
+// C.BNEZ RS', IMMEDIATE
+static inline void asm_rv32_opcode_cbnez(asm_rv32_t *state, mp_uint_t rs, mp_int_t offset) {
+// CB: 111 ... ... ..... 01
+asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CB(0b01, 0b111, rs, offset));
+}
// C.J OFFSET
-static inline void asm_rv32_opcode_cj(asm_rv32_t *state, mp_uint_t offset) {
+static inline void asm_rv32_opcode_cj(asm_rv32_t *state, mp_int_t offset) {
// CJ: 101 ........... 01
-asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CJ(0b01, 0b001, offset));
+asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CJ(0b01, 0b101, offset));
}
// C.JALR RS
@@ -250,6 +285,12 @@ static inline void asm_rv32_opcode_clui(asm_rv32_t *state, mp_uint_t rd, mp_int_
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CI(0b01, 0b011, rd, immediate >> 12));
}
+// C.LW RD', OFFSET(RS')
+static inline void asm_rv32_opcode_clw(asm_rv32_t *state, mp_uint_t rd, mp_uint_t rs, mp_int_t offset) {
+// CL: 010 ... ... .. ... 00
+asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CL(0b00, 0b010, rd, rs, offset));
+}
// C.LWSP RD, OFFSET
static inline void asm_rv32_opcode_clwsp(asm_rv32_t *state, mp_uint_t rd, mp_uint_t offset) {
// CI: 010 . ..... ..... 10
@@ -383,6 +424,7 @@ static inline void asm_rv32_opcode_xori(asm_rv32_t *state, mp_uint_t rd, mp_uint
}
#define ASM_WORD_SIZE (4)
+#define ASM_HALFWORD_SIZE (2)
#define REG_RET ASM_RV32_REG_A0
#define REG_ARG_1 ASM_RV32_REG_A0
@@ -392,8 +434,7 @@ static inline void asm_rv32_opcode_xori(asm_rv32_t *state, mp_uint_t rd, mp_uint
#define REG_TEMP0 ASM_RV32_REG_T1
#define REG_TEMP1 ASM_RV32_REG_T2
#define REG_TEMP2 ASM_RV32_REG_T3
-// S0 may be used as the frame pointer by the compiler.
-#define REG_FUN_TABLE ASM_RV32_REG_S2
+#define REG_FUN_TABLE ASM_RV32_REG_S1
#define REG_LOCAL_1 ASM_RV32_REG_S3
#define REG_LOCAL_2 ASM_RV32_REG_S4
#define REG_LOCAL_3 ASM_RV32_REG_S5
@@ -432,10 +473,10 @@ void asm_rv32_emit_store_reg_reg_offset(asm_rv32_t *state, mp_uint_t source, mp_
#define ASM_JUMP_IF_REG_EQ(state, rs1, rs2, label) asm_rv32_emit_jump_if_reg_eq(state, rs1, rs2, label)
#define ASM_JUMP_IF_REG_NONZERO(state, rs, label, bool_test) asm_rv32_emit_jump_if_reg_nonzero(state, rs, label)
#define ASM_JUMP_IF_REG_ZERO(state, rs, label, bool_test) asm_rv32_emit_jump_if_reg_eq(state, rs, ASM_RV32_REG_ZERO, label)
-#define ASM_JUMP_REG(state, rs) asm_rv32_opcode_jalr(state, ASM_RV32_REG_ZERO, rs, 0)
+#define ASM_JUMP_REG(state, rs) asm_rv32_opcode_cjr(state, rs)
#define ASM_LOAD16_REG_REG_OFFSET(state, rd, rs, offset) asm_rv32_emit_load16_reg_reg_offset(state, rd, rs, offset)
#define ASM_LOAD16_REG_REG(state, rd, rs) asm_rv32_opcode_lhu(state, rd, rs, 0)
-#define ASM_LOAD32_REG_REG(state, rd, rs) asm_rv32_opcode_lw(state, rd, rs, 0)
+#define ASM_LOAD32_REG_REG(state, rd, rs) ASM_LOAD_REG_REG_OFFSET(state, rd, rs, 0)
#define ASM_LOAD8_REG_REG(state, rd, rs) asm_rv32_opcode_lbu(state, rd, rs, 0)
#define ASM_LOAD_REG_REG_OFFSET(state, rd, rs, offset) asm_rv32_emit_load_reg_reg_offset(state, rd, rs, offset)
#define ASM_LOAD_REG_REG(state, rd, rs) ASM_LOAD32_REG_REG(state, rd, rs)
@@ -452,7 +493,7 @@ void asm_rv32_emit_store_reg_reg_offset(asm_rv32_t *state, mp_uint_t source, mp_
#define ASM_NOT_REG(state, rd) asm_rv32_opcode_xori(state, rd, rd, -1)
#define ASM_OR_REG_REG(state, rd, rs) asm_rv32_opcode_or(state, rd, rd, rs)
#define ASM_STORE16_REG_REG(state, rs1, rs2) asm_rv32_opcode_sh(state, rs1, rs2, 0)
-#define ASM_STORE32_REG_REG(state, rs1, rs2) asm_rv32_opcode_sw(state, rs1, rs2, 0)
+#define ASM_STORE32_REG_REG(state, rs1, rs2) ASM_STORE_REG_REG_OFFSET(state, rs1, rs2, 0)
#define ASM_STORE8_REG_REG(state, rs1, rs2) asm_rv32_opcode_sb(state, rs1, rs2, 0)
#define ASM_STORE_REG_REG_OFFSET(state, rd, rs, offset) asm_rv32_emit_store_reg_reg_offset(state, rd, rs, offset)
#define ASM_STORE_REG_REG(state, rs1, rs2) ASM_STORE32_REG_REG(state, rs1, rs2)