target-i386: remove helper_lock()

It's been superseded by the atomic helpers.

The use of the atomic helpers provides a significant performance and scalability
improvement. Below is the result of running the atomic_add-test microbenchmark with:
 $ x86_64-linux-user/qemu-x86_64 tests/atomic_add-bench -o 5000000 -r $r -n $n
, where $n is the number of threads and $r is the allowed range for the additions.

The scenarios measured are:
- atomic: implements x86' ADDL with the atomic_add helper (i.e. this patchset)
- cmpxchg: implement x86' ADDL with a TCG loop using the cmpxchg helper
- master: before this patchset

Results sorted in ascending range, i.e. descending degree of contention.
Y axis is Throughput in Mops/s. Tests are run on an AMD machine with 64
Opteron 6376 cores.

                atomic_add-bench: 5000000 ops/thread, [0,1] range

  25 ++---------+----------+---------+----------+----------+----------+---++
     + atomic +-E--+       +         +          +          +          +    |
     |cmpxchg +-H--+                                                       |
  20 +Emaster +-N--+                                                      ++
     ||                                                                    |
     |++                                                                   |
     ||                                                                    |
  15 +++                                                                  ++
     |N|                                                                   |
     |+|                                                                   |
  10 ++|                                                                  ++
     |+|+                                                                  |
     | |    -+E+------        +++  ---+E+------+E+------+E+-----+E+------+E|
     |+E+E+- +++     +E+------+E+--                                        |
   5 ++|+                                                                 ++
     |+N+H+---                                 +++                         |
     ++++N+--+H++----+++   +  +++  --++H+------+H+------+H++----+H+---+--- |
   0 ++---------+-----H----+---H-----+----------+----------+----------+---H+
     0          10         20        30         40         50         60
                                Number of threads

                atomic_add-bench: 5000000 ops/thread, [0,2] range

  25 ++---------+----------+---------+----------+----------+----------+---++
     ++atomic +-E--+       +         +          +          +          +    |
     |cmpxchg +-H--+                                                       |
  20 ++master +-N--+                                                      ++
     |E|                                                                   |
     |++                                                                   |
     ||E                                                                   |
  15 ++|                                                                  ++
     |N||                                                                  |
     |+||                                   ---+E+------+E+-----+E+------+E|
  10 ++| |        ---+E+------+E+-----+E+---                    +++      +++
     ||H+E+--+E+--                                                         |
     |+++++                                                                |
     | ||                                                                  |
   5 ++|+H+--                                  +++                        ++
     |+N+    -                              ---+H+------+H+------          |
     +  +N+--+H++----+H+---+--+H+----++H+---    +          +    +H+---+--+H|
   0 ++---------+----------+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

                atomic_add-bench: 5000000 ops/thread, [0,8] range

  40 ++---------+----------+---------+----------+----------+----------+---++
     ++atomic +-E--+       +         +          +          +          +    |
  35 +cmpxchg +-H--+                                                      ++
     | master +-N--+               ---+E+------+E+------+E+-----+E+------+E|
  30 ++|                   ---+E+--   +++                                 ++
     | |            -+E+---                                                |
  25 ++E        ---- +++                                                  ++
     |+++++ -+E+                                                           |
  20 +E+ E-- +++                                                          ++
     |H|+++                                                                |
     |+|                                       +H+-------                  |
  15 ++H+                                   ---+++      +H+------         ++
     |N++H+--                         +++---                    +H+------++|
  10 ++ +++  -       +++           ---+H+                       +++      +H+
     | |     +H+-----+H+------+H+--                                        |
   5 ++|                      +++                                         ++
     ++N+N+--+N++          +         +          +          +          +    |
   0 ++---------+----------+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

               atomic_add-bench: 5000000 ops/thread, [0,128] range

  160 ++---------+---------+----------+---------+----------+----------+---++
      + atomic +-E--+      +          +         +          +          +    |
  140 +cmpxchg +-H--+                          +++      +++               ++
      | master +-N--+                           E--------E------+E+------++|
  120 ++                                      --|        |      +++       E+
      |                                     -- +++      +++              ++|
  100 ++                                   -                              ++
      |                                +++-                     +++      ++|
   80 ++                              -+E+    -+H+------+H+------H--------++
      |                           ----    ----                  +++       H|
      |            ---+E+-----+E+-  ---+H+                               ++|
   60 ++     +E+---   +++  ---+H+---                                      ++
      |    --+++   ---+H+--                                                |
   40 ++ +E+-+H+---                                                       ++
      |  +H+                                                               |
   20 +EE+                                                                ++
      +N+        +         +          +         +          +          +    |
    0 ++N-N---N--+---------+----------+---------+----------+----------+---++
      0          10        20         30        40         50         60
                                Number of threads

              atomic_add-bench: 5000000 ops/thread, [0,1024] range

  350 ++---------+---------+----------+---------+----------+----------+---++
      + atomic +-E--+      +          +         +          +          +    |
  300 +cmpxchg +-H--+                                                    +++
      | master +-N--+                                           +++       ||
      |                                                 +++      |    ----E|
  250 ++                                                 |   ----E----    ++
      |                                              ----E---    |    ---+H|
  200 ++                                      -+E+---   +++  ---+H+---    ++
      |                                   ----         -+H+--              |
      |                                +E+     +++ ---- +++                |
  150 ++                            ---+++  ---+H+-                       ++
      |                          ---  -+H+--                               |
  100 ++                   ---+E+ ---- +++                                ++
      |      +++   ---+E+-----+H+-                                         |
      |     -+E+------+H+--                                                |
   50 ++ +E+                                                              ++
      +EE+       +         +          +         +          +          +    |
    0 ++N-N---N--+---------+----------+---------+----------+----------+---++
      0          10        20         30        40         50         60
                                Number of threads

  hi-res: http://imgur.com/a/fMRmq

For master I stopped measuring master after 8 threads, because there is little
point in measuring the well-known performance collapse of a contended lock.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-21-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This commit is contained in:
Emilio G. Cota 2016-06-27 15:02:06 -04:00 committed by Richard Henderson
parent ea97ebe89f
commit 37b995f6e7
3 changed files with 0 additions and 50 deletions

View File

@ -1,8 +1,6 @@
DEF_HELPER_FLAGS_4(cc_compute_all, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int)
DEF_HELPER_FLAGS_4(cc_compute_c, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int)
DEF_HELPER_0(lock, void)
DEF_HELPER_0(unlock, void)
DEF_HELPER_3(write_eflags, void, env, tl, i32)
DEF_HELPER_1(read_eflags, tl, env)
DEF_HELPER_2(divb_AL, void, env, tl)

View File

@ -25,39 +25,6 @@
#include "qemu/int128.h"
#include "tcg.h"
/* broken thread support */
#if defined(CONFIG_USER_ONLY)
QemuMutex global_cpu_lock;
void helper_lock(void)
{
qemu_mutex_lock(&global_cpu_lock);
}
void helper_unlock(void)
{
qemu_mutex_unlock(&global_cpu_lock);
}
void helper_lock_init(void)
{
qemu_mutex_init(&global_cpu_lock);
}
#else
void helper_lock(void)
{
}
void helper_unlock(void)
{
}
void helper_lock_init(void)
{
}
#endif
void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0)
{
uintptr_t ra = GETPC();

View File

@ -4536,10 +4536,6 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
s->aflag = aflag;
s->dflag = dflag;
/* lock generation */
if (prefixes & PREFIX_LOCK)
gen_helper_lock();
/* now check op code */
reswitch:
switch(b) {
@ -8211,20 +8207,11 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
default:
goto unknown_op;
}
/* lock generation */
if (s->prefix & PREFIX_LOCK)
gen_helper_unlock();
return s->pc;
illegal_op:
if (s->prefix & PREFIX_LOCK)
gen_helper_unlock();
/* XXX: ensure that no lock was generated */
gen_illegal_opcode(s);
return s->pc;
unknown_op:
if (s->prefix & PREFIX_LOCK)
gen_helper_unlock();
/* XXX: ensure that no lock was generated */
gen_unknown_opcode(env, s);
return s->pc;
}
@ -8316,8 +8303,6 @@ void tcg_x86_init(void)
offsetof(CPUX86State, bnd_regs[i].ub),
bnd_regu_names[i]);
}
helper_lock_init();
}
/* generate intermediate code for basic block 'tb'. */