Changes to better support XScale, round 1, back-ported from GCC 3.2.

The GCC ChangeLog does not have a complete description to quote here,
so:

arm.c:

* arm_override_options(): Set arm_is_xscale according to the
  -mcpu=xscale option.  Set arm_constant_limit to 2 if arm_is_xscale.
* arm_adjust_cost(): If arm_is_xscale, account for stalls that can
  occur due to shifted operands.
* arm_gen_load_multiple(): Account for the cost of ldm vs. ldr if
  arm_is_xscale.
* arm_gen_store_multiple(): Likewise for stm vs. str.

arm.h:

* CONSTANT_ALIGNMENT(): Use a constant alignment factor of 2 if
  arm_is_xscale.
* MOVE_RATIO: Set to 4 if arm_is_xscale.

arm.md:

* Add XScale scheduling parameters.
* Define a "shift" attribute (used by arm_adjust_cost()) and give it
  to the appropriate operands on andsi_not_shiftsi_si, *shiftsi3,
  *shiftsi3_compare0, *shiftsi3_compare0_scratch, *notsi_shiftsi,
  *notsi_shiftsi_compare0, *not_shiftsi_compare0_scratch,
  abssi2, *neg_abssi2, extendsidi2, *cmpsi_shiftsi, *cmpsi_shiftsi_swp,
  *cmpsi_neg_shiftsi, *arith_shiftsi, *arith_shiftsi_compare0,
  *arith_shiftsi_compare0_scratch, *sub_shiftsi, *sub_shiftsi_compare0,
  *sub_shiftsi_compare0_scratch, *if_shift_move, *if_move_shift,
  and *if_shift_shift.
This commit is contained in:
thorpej 2002-08-20 23:46:44 +00:00
parent 00975d3872
commit 0414c14b05
3 changed files with 231 additions and 21 deletions

View File

@ -103,6 +103,7 @@ int arm_structure_size_boundary = 32; /* Used to be 8 */
#define FL_THUMB 0x20 /* Thumb aware */
#define FL_LDSCHED 0x40 /* Load scheduling necessary */
#define FL_STRONG 0x80 /* StrongARM */
#define FL_XSCALE 0x100 /* XScale */
/* The bits in this mask specify which instructions we are allowed to generate. */
static int insn_flags = 0;
@ -127,6 +128,9 @@ int arm_ld_sched = 0;
/* Nonzero if this chip is a StrongARM. */
int arm_is_strong = 0;
/* Nonzero if this chip is an XScale. */
int arm_is_xscale = 0;
/* Nonzero if this chip is an ARM6 or an ARM7. */
int arm_is_6_or_7 = 0;
@ -235,7 +239,7 @@ static struct processors all_cores[] =
--thorpej@netbsd.org */
{"arm10tdmi", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_THUMB | FL_LDSCHED },
{"arm1020t", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_THUMB | FL_LDSCHED },
{"xscale", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_LDSCHED | FL_STRONG },
{"xscale", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_LDSCHED | FL_STRONG | FL_XSCALE },
{NULL, 0}
};
@ -523,6 +527,7 @@ arm_override_options ()
/* Initialise boolean versions of the flags, for use in the arm.md file. */
arm_fast_multiply = (insn_flags & FL_FAST_MULT) != 0;
arm_arch4 = (insn_flags & FL_ARCH4) != 0;
arm_is_xscale = (insn_flags & FL_XSCALE) != 0;
arm_ld_sched = (tune_flags & FL_LDSCHED) != 0;
arm_is_strong = (tune_flags & FL_STRONG) != 0;
@ -574,6 +579,9 @@ arm_override_options ()
to load a constant, and the load scheduler may well reduce that to 1. */
if (optimize_size || (tune_flags & FL_LDSCHED))
arm_constant_limit = 1;
if (arm_is_xscale)
arm_constant_limit = 2;
/* If optimizing for size, bump the number of instructions that we
are prepared to conditionally execute (even on a StrongARM).
@ -1867,6 +1875,47 @@ arm_adjust_cost (insn, link, dep, cost)
{
rtx i_pat, d_pat;
/* Some true dependencies can have a higher cost depending
on precisely how certain input operands are used. */
if (arm_is_xscale
&& REG_NOTE_KIND (link) == 0
&& recog_memoized (insn) >= 0
&& recog_memoized (dep) >= 0)
{
int shift_opnum = get_attr_shift (insn);
enum attr_type attr_type = get_attr_type (dep);
/* If nonzero, SHIFT_OPNUM contains the operand number of a shifted
operand for INSN. If we have a shifted input operand and the
instruction we depend on is another ALU instruction, then we may
have to account for an additional stall. */
if (shift_opnum != 0 && attr_type == TYPE_NORMAL)
{
rtx shifted_operand;
int opno;
/* Get the shifted operand. */
extract_insn (insn);
shifted_operand = recog_operand[shift_opnum];
/* Iterate over all the operands in DEP. If we write an operand
that overlaps with SHIFTED_OPERAND, then we have to increase the
cost of this dependency. */
extract_insn (dep);
preprocess_constraints ();
for (opno = 0; opno < recog_n_operands; opno++)
{
/* We can ignore strict inputs. */
if (recog_op_type[opno] == OP_IN)
continue;
if (reg_overlap_mentioned_p (recog_operand[opno],
shifted_operand))
return 2;
}
}
}
/* XXX This is not strictly true for the FPA. */
if (REG_NOTE_KIND(link) == REG_DEP_ANTI
|| REG_NOTE_KIND(link) == REG_DEP_OUTPUT)
@ -3164,6 +3213,58 @@ arm_gen_load_multiple (base_regno, count, from, up, write_back, unchanging_p,
int sign = up ? 1 : -1;
rtx mem;
/* XScale has load-store double instructions, but they have stricter
alignment requirements than load-store multiple, so we can not
use them.
For XScale ldm requires 2 + NREGS cycles to complete and blocks
the pipeline until completion.
NREGS CYCLES
1 3
2 4
3 5
4 6
an ldr instruction takes 1-3 cycles, but does not block the
pipeline.
NREGS CYCLES
1 1-3
2 2-6
3 3-9
4 4-12
Best case ldr will always win. However, the more ldr instructions
we issue, the less likely we are to be able to schedule them well.
Using ldr instructions also increases code size.
As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
for counts of 3 or 4 regs. */
if (arm_is_xscale && count <= 2 && ! optimize_size)
{
rtx seq;
start_sequence ();
for (i = 0; i < count; i++)
{
mem = gen_rtx_MEM (SImode, plus_constant (from, i * 4 * sign));
RTX_UNCHANGING_P (mem) = unchanging_p;
MEM_IN_STRUCT_P (mem) = in_struct_p;
MEM_SCALAR_P (mem) = scalar_p;
emit_move_insn (gen_rtx_REG (SImode, base_regno + i), mem);
}
if (write_back)
emit_move_insn (from, plus_constant (from, count * 4 * sign));
seq = gen_sequence ();
end_sequence ();
return seq;
}
result = gen_rtx_PARALLEL (VOIDmode,
rtvec_alloc (count + (write_back ? 2 : 0)));
if (write_back)
@ -3208,6 +3309,32 @@ arm_gen_store_multiple (base_regno, count, to, up, write_back, unchanging_p,
int sign = up ? 1 : -1;
rtx mem;
/* See arm_gen_load_multiple for discussion of
the pros/cons of ldm/stm usage for XScale. */
if (arm_is_xscale && count <= 2 && ! optimize_size)
{
rtx seq;
start_sequence ();
for (i = 0; i < count; i++)
{
mem = gen_rtx_MEM (SImode, plus_constant (to, i * 4 * sign));
RTX_UNCHANGING_P (mem) = unchanging_p;
MEM_IN_STRUCT_P (mem) = in_struct_p;
MEM_SCALAR_P (mem) = scalar_p;
emit_move_insn (mem, gen_rtx_REG (SImode, base_regno + i));
}
if (write_back)
emit_move_insn (to, plus_constant (to, count * 4 * sign));
seq = gen_sequence ();
end_sequence ();
return seq;
}
result = gen_rtx_PARALLEL (VOIDmode,
rtvec_alloc (count + (write_back ? 2 : 0)));
if (write_back)

View File

@ -477,6 +477,9 @@ extern int arm_ld_sched;
/* Nonzero if this chip is a StrongARM. */
extern int arm_is_strong;
/* Nonzero if this chip is an XScale. */
extern int arm_is_xscale;
/* Nonzero if this chip is an ARM6 or an ARM7. */
extern int arm_is_6_or_7;
@ -614,9 +617,12 @@ extern int arm_is_6_or_7;
#define BIGGEST_ALIGNMENT 32
/* Make strings word-aligned so strcpy from constants will be faster. */
#define CONSTANT_ALIGNMENT(EXP, ALIGN) \
(TREE_CODE (EXP) == STRING_CST \
&& (ALIGN) < BITS_PER_WORD ? BITS_PER_WORD : (ALIGN))
#define CONSTANT_ALIGNMENT_FACTOR (! arm_is_xscale ? 1 : 2)
#define CONSTANT_ALIGNMENT(EXP, ALIGN) \
((TREE_CODE (EXP) == STRING_CST \
&& (ALIGN) < BITS_PER_WORD * CONSTANT_ALIGNMENT_FACTOR) \
? BITS_PER_WORD * CONSTANT_ALIGNMENT_FACTOR : (ALIGN))
/* Every structures size must be a multiple of 32 bits. */
/* This is for compatibility with ARMCC. ARM SDT Reference Manual
@ -1703,6 +1709,9 @@ extern struct rtx_def *legitimize_pic_address ();
in one reasonably fast instruction. */
#define MOVE_MAX 4
#undef MOVE_RATIO
#define MOVE_RATIO (arm_is_xscale ? 4 : 2)
/* Define if operations between registers always perform the operation
on the full register even if a narrower mode is specified. */
#define WORD_REGISTER_OPERATIONS

View File

@ -48,6 +48,11 @@
(define_attr "is_strongarm" "no,yes" (const (symbol_ref "arm_is_strong")))
;; Operand number of an input operand that is shifted. Zero if the
;; given instruction does not shift one of its input operands.
(define_attr "is_xscale" "no,yes" (const (symbol_ref "arm_is_xscale")))
(define_attr "shift" "" (const_int 0))
; Floating Point Unit. If we only have floating point emulation, then there
; is no point in scheduling the floating point insns. (Well, for best
; performance we should try and group them together).
@ -238,12 +243,26 @@
;; Core unit
;;--------------------------------------------------------------------
;; Everything must spend at least one cycle in the core unit
(define_function_unit "core" 1 0 (eq_attr "core_cycles" "single") 1 1)
(define_function_unit "core" 1 0
(and (eq_attr "ldsched" "yes") (eq_attr "type" "store1")) 1 1)
(define_function_unit "core" 1 0
(and (eq_attr "ldsched" "yes") (eq_attr "type" "load")) 2 1)
;; We do not need to conditionalize the define_function_unit immediately
;; above. This one will be ignored for anything other than xscale
;; compiles and for xscale compiles it provides a larger delay
;; and the scheduler will DTRT.
;; FIXME: this test needs to be revamped to not depend on this feature
;; of the scheduler.
(define_function_unit "core" 1 0
(and (and (eq_attr "ldsched" "yes") (eq_attr "type" "load"))
(eq_attr "is_xscale" "yes"))
3 1)
(define_function_unit "core" 1 0
(and (eq_attr "ldsched" "!yes") (eq_attr "type" "load,store1")) 2 2)
@ -275,6 +294,10 @@
(define_function_unit "core" 1 0 (eq_attr "type" "store3") 4 4)
(define_function_unit "core" 1 0 (eq_attr "type" "store4") 5 5)
(define_function_unit "core" 1 0
(and (eq_attr "core_cycles" "multi")
(eq_attr "type" "!mult,load,store1,store2,store3,store4")) 32 32)
;; Note: For DImode insns, there is normally no reason why operands should
;; not be in the same register, what we don't want is for something being
@ -1410,7 +1433,9 @@
(match_operand:SI 3 "arm_rhs_operand" "rM")]))
(match_operand:SI 1 "s_register_operand" "r")))]
""
"bic%?\\t%0, %1, %2%S4")
"bic%?\\t%0, %1, %2%S4"
[(set_attr "shift" "2")]
)
(define_insn "*andsi_notsi_si_compare0"
[(set (reg:CC_NOOV 24)
@ -1783,7 +1808,9 @@
[(match_operand:SI 1 "s_register_operand" "r")
(match_operand:SI 2 "reg_or_int_operand" "rM")]))]
""
"mov%?\\t%0, %1%S3")
"mov%?\\t%0, %1%S3"
[(set_attr "shift" "1")]
)
(define_insn "*shiftsi3_compare0"
[(set (reg:CC_NOOV 24)
@ -1795,7 +1822,10 @@
(match_op_dup 3 [(match_dup 1) (match_dup 2)]))]
""
"mov%?s\\t%0, %1%S3"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "1")
]
)
(define_insn "*shiftsi3_compare0_scratch"
[(set (reg:CC_NOOV 24)
@ -1806,7 +1836,10 @@
(clobber (match_scratch:SI 0 "=r"))]
""
"mov%?s\\t%0, %1%S3"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "1")
]
)
(define_insn "*notsi_shiftsi"
[(set (match_operand:SI 0 "s_register_operand" "=r")
@ -1814,7 +1847,9 @@
[(match_operand:SI 1 "s_register_operand" "r")
(match_operand:SI 2 "arm_rhs_operand" "rM")])))]
""
"mvn%?\\t%0, %1%S3")
"mvn%?\\t%0, %1%S3"
[(set_attr "shift" "1")]
)
(define_insn "*notsi_shiftsi_compare0"
[(set (reg:CC_NOOV 24)
@ -1826,7 +1861,10 @@
(not:SI (match_op_dup 3 [(match_dup 1) (match_dup 2)])))]
""
"mvn%?s\\t%0, %1%S3"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "1")
]
)
(define_insn "*not_shiftsi_compare0_scratch"
[(set (reg:CC_NOOV 24)
@ -1837,7 +1875,10 @@
(clobber (match_scratch:SI 0 "=r"))]
""
"mvn%?s\\t%0, %1%S3"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "1")
]
)
;; Unary arithmetic insns
@ -1900,6 +1941,7 @@
cmp\\t%0, #0\;rsblt\\t%0, %0, #0
eor%?\\t%0, %1, %1, asr #31\;sub%?\\t%0, %0, %1, asr #31"
[(set_attr "conds" "clob,*")
(set_attr "shift" "1")
(set_attr "length" "8")])
(define_insn "*neg_abssi2"
@ -1911,6 +1953,7 @@
cmp\\t%0, #0\;rsbgt\\t%0, %0, #0
eor%?\\t%0, %1, %1, asr #31\;rsb%?\\t%0, %0, %1, asr #31"
[(set_attr "conds" "clob,*")
(set_attr "shift" "1")
(set_attr "length" "8")])
(define_insn "abssf2"
@ -2163,7 +2206,10 @@
output_asm_insn (\"mov%?\\t%Q0, %1\", operands);
return \"mov%?\\t%R0, %Q0, asr #31\";
"
[(set_attr "length" "8")])
[(set_attr "length" "8")
(set_attr "shift" "1")
]
)
(define_expand "zero_extendhisi2"
[(set (match_dup 2) (ashift:SI (match_operand:HI 1 "nonimmediate_operand" "")
@ -3597,7 +3643,10 @@
(match_operand:SI 2 "arm_rhs_operand" "rM")])))]
""
"cmp%?\\t%0, %1%S3"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "1")
]
)
(define_insn "*cmpsi_shiftsi_swp"
[(set (reg:CC_SWP 24)
@ -3607,7 +3656,10 @@
(match_operand:SI 0 "s_register_operand" "r")))]
""
"cmp%?\\t%0, %1%S3"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "1")
]
)
(define_insn "*cmpsi_neg_shiftsi"
[(set (reg:CC 24)
@ -3617,7 +3669,10 @@
(match_operand:SI 2 "arm_rhs_operand" "rM")]))))]
""
"cmn%?\\t%0, %1%S3"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "1")
]
)
(define_insn "*cmpsf_insn"
[(set (reg:CCFP 24)
@ -4467,7 +4522,9 @@
(match_operand:SI 5 "reg_or_int_operand" "rI")])
(match_operand:SI 2 "s_register_operand" "r")]))]
""
"%i1%?\\t%0, %2, %4%S3")
"%i1%?\\t%0, %2, %4%S3"
[(set_attr "shift" "4")]
)
(define_insn "*arith_shiftsi_compare0"
[(set (reg:CC_NOOV 24)
@ -4482,7 +4539,10 @@
(match_dup 2)]))]
""
"%i1%?s\\t%0, %2, %4%S3"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "4")
]
)
(define_insn "*arith_shiftsi_compare0_scratch"
[(set (reg:CC_NOOV 24)
@ -4495,7 +4555,10 @@
(clobber (match_scratch:SI 0 "=r"))]
""
"%i1%?s\\t%0, %2, %4%S3"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "4")
]
)
(define_insn "*sub_shiftsi"
[(set (match_operand:SI 0 "s_register_operand" "=r")
@ -4504,7 +4567,9 @@
[(match_operand:SI 3 "s_register_operand" "r")
(match_operand:SI 4 "reg_or_int_operand" "rM")])))]
""
"sub%?\\t%0, %1, %3%S2")
"sub%?\\t%0, %1, %3%S2"
[(set_attr "shift" "3")]
)
(define_insn "*sub_shiftsi_compare0"
[(set (reg:CC_NOOV 24)
@ -4519,7 +4584,10 @@
(match_dup 4)])))]
""
"sub%?s\\t%0, %1, %3%S2"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "3")
]
)
(define_insn "*sub_shiftsi_compare0_scratch"
[(set (reg:CC_NOOV 24)
@ -4532,7 +4600,10 @@
(clobber (match_scratch:SI 0 "=r"))]
""
"sub%?s\\t%0, %1, %3%S2"
[(set_attr "conds" "set")])
[(set_attr "conds" "set")
(set_attr "shift" "3")
]
)
;; These variants of the above insns can occur if the first operand is the
;; frame pointer and we eliminate that. This is a kludge, but there doesn't
@ -5236,6 +5307,7 @@
mov%D5\\t%0, %1\;mov%d5\\t%0, %2%S4
mvn%D5\\t%0, #%B1\;mov%d5\\t%0, %2%S4"
[(set_attr "conds" "use")
(set_attr "shift" "2")
(set_attr "length" "4,8,8")])
(define_insn "*ifcompare_move_shift"
@ -5269,6 +5341,7 @@
mov%d5\\t%0, %1\;mov%D5\\t%0, %2%S4
mvn%d5\\t%0, #%B1\;mov%D5\\t%0, %2%S4"
[(set_attr "conds" "use")
(set_attr "shift" "2")
(set_attr "length" "4,8,8")])
(define_insn "*ifcompare_shift_shift"
@ -5303,6 +5376,7 @@
""
"mov%d5\\t%0, %1%S6\;mov%D5\\t%0, %3%S7"
[(set_attr "conds" "use")
(set_attr "shift" "1")
(set_attr "length" "8")])
(define_insn "*ifcompare_not_arith"