Commit Graph

1184 Commits

Author SHA1 Message Date
Kirill Smelkov
63193d1794 Optimize vswap()
vswap() is called often enough and shows in profile and it was easy to
hand optimize swapping vtop[-1] and vtop[0] - instead of large (28 bytes
on i386) tmp variable and two memory to memory copies, let's swap areas
by longs through registers with streamlined assembly.

For

    $ ./tcc -B. -bench -DONE_SOURCE -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" -c tcc.c

before:

 # Overhead      Command        Shared Object                                          Symbol
 # ........  ...........  ...................  ..............................................
 #
     15.19%          tcc  tcc                  [.] next_nomacro1
      5.19%          tcc  libc-2.13.so         [.] _int_malloc
      4.57%          tcc  tcc                  [.] next
      3.36%          tcc  tcc                  [.] tok_str_add2
      3.03%          tcc  tcc                  [.] macro_subst_tok
      2.93%          tcc  tcc                  [.] macro_subst
      2.53%          tcc  tcc                  [.] next_nomacro_spc
      2.49%          tcc  tcc                  [.] vswap
      2.36%          tcc  libc-2.13.so         [.] _int_free

       │    ST_FUNC void vswap(void)
       │    {
  1,96 │      push   %edi
  2,65 │      push   %esi
  1,08 │      sub    $0x20,%esp
       │        SValue tmp;
       │
       │        /* cannot let cpu flags if other instruction are generated. Also
       │           avoid leaving VT_JMP anywhere except on the top of the stack
       │           because it would complicate the code generator. */
       │        if (vtop >= vstack) {
  0,98 │      mov    0x8078cac,%eax
       │      cmp    $0x8078d3c,%eax
  1,18 │   ┌──jb     24
       │   │        int v = vtop->r & VT_VALMASK;
  1,08 │   │  mov    0x8(%eax),%edx
  0,78 │   │  and    $0x3f,%edx
       │   │        if (v == VT_CMP || (v & ~1) == VT_JMP)
  0,78 │   │  cmp    $0x33,%edx
  0,69 │   │↓ je     54
  0,59 │   │  and    $0xfffffffe,%edx
  0,49 │   │  cmp    $0x34,%edx
  0,29 │   │↓ je     54
       │   │            gv(RC_INT);
       │   │    }
       │   │    tmp = vtop[0];
  1,08 │24:└─→lea    0x4(%esp),%edi
  0,39 │      mov    $0x7,%ecx
       │      mov    %eax,%esi
 14,41 │      rep    movsl %ds:(%esi),%es:(%edi)
       │        vtop[0] = vtop[-1];
  9,51 │      lea    -0x1c(%eax),%esi
  1,96 │      mov    $0x7,%cl
       │      mov    %eax,%edi
 17,06 │      rep    movsl %ds:(%esi),%es:(%edi)
       │        vtop[-1] = tmp;
 10,20 │      mov    0x8078cac,%edi
  2,35 │      sub    $0x1c,%edi
  0,78 │      lea    0x4(%esp),%esi
       │      mov    $0x7,%cl
 15,20 │      rep    movsl %ds:(%esi),%es:(%edi)
       │    }
  9,90 │      add    $0x20,%esp
  2,25 │      pop    %esi
  1,67 │      pop    %edi
  0,69 │      ret

after:

 # Overhead      Command        Shared Object                                          Symbol
 # ........  ...........  ...................  ..............................................
 #
     15.27%          tcc  tcc                  [.] next_nomacro1
      5.08%          tcc  libc-2.13.so         [.] _int_malloc
      4.57%          tcc  tcc                  [.] next
      3.17%          tcc  tcc                  [.] tok_str_add2
      3.12%          tcc  tcc                  [.] macro_subst
      2.99%          tcc  tcc                  [.] macro_subst_tok
      2.43%          tcc  tcc                  [.] next_nomacro_spc
      2.32%          tcc  libc-2.13.so         [.] _int_free

      . . .

      0.71%          tcc  tcc                  [.] vswap

       │    ST_FUNC void vswap(void)
       │    {
  7,22 │      push   %eax
       │        /* cannot let cpu flags if other instruction are generated. Also
       │           avoid leaving VT_JMP anywhere except on the top of the stack
       │           because it would complicate the code generator. */
       │        if (vtop >= vstack) {
 11,34 │      mov    0x8078cac,%eax
  2,75 │      cmp    $0x8078d3c,%eax
  0,34 │   ┌──jb     20
       │   │        int v = vtop->r & VT_VALMASK;
  0,34 │   │  mov    0x8(%eax),%edx
  8,93 │   │  and    $0x3f,%edx
       │   │        if (v == VT_CMP || (v & ~1) == VT_JMP)
  2,06 │   │  cmp    $0x33,%edx
  2,41 │   │↓ je     74
  2,41 │   │  and    $0xfffffffe,%edx
  0,34 │   │  cmp    $0x34,%edx
  2,41 │   │↓ je     74
       │   │        vtopl[-1*VSIZEL + i] = tmpl;    \
       │   │      } do {} while (0)
       │   │
       │   │    VSWAPL(15); VSWAPL(14); VSWAPL(13); VSWAPL(12);
       │   │    VSWAPL(11); VSWAPL(10); VSWAPL( 9); VSWAPL( 8);
       │   │    VSWAPL( 7); VSWAPL( 6); VSWAPL( 5); VSWAPL( 4);
  2,06 │20:└─→mov    0x18(%eax),%edx
  1,37 │      mov    -0x4(%eax),%ecx
  2,06 │      mov    %ecx,0x18(%eax)
  1,37 │      mov    %edx,-0x4(%eax)
  2,06 │      mov    0x14(%eax),%edx
  2,06 │      mov    -0x8(%eax),%ecx
  2,41 │      mov    %ecx,0x14(%eax)
  3,09 │      mov    %edx,-0x8(%eax)
  3,09 │      mov    0x10(%eax),%edx
  1,72 │      mov    -0xc(%eax),%ecx
  2,75 │      mov    %ecx,0x10(%eax)
  1,72 │      mov    %edx,-0xc(%eax)
       │        VSWAPL( 3); VSWAPL( 2); VSWAPL( 1); VSWAPL( 0);
  2,41 │      mov    0xc(%eax),%edx
  2,41 │      mov    -0x10(%eax),%ecx
  2,41 │      mov    %ecx,0xc(%eax)
  0,69 │      mov    %edx,-0x10(%eax)
  1,72 │      mov    0x8(%eax),%edx
  0,69 │      mov    -0x14(%eax),%ecx
  1,03 │      mov    %ecx,0x8(%eax)
  1,37 │      mov    %edx,-0x14(%eax)
  1,37 │      mov    0x4(%eax),%edx
  0,69 │      mov    -0x18(%eax),%ecx
  3,09 │      mov    %ecx,0x4(%eax)
  2,06 │      mov    %edx,-0x18(%eax)
  1,37 │      mov    (%eax),%edx
  2,41 │      mov    -0x1c(%eax),%ecx
  1,37 │      mov    %ecx,(%eax)
  4,12 │      mov    %edx,-0x1c(%eax)
       │        }
       │
       │    #   undef VSWAPL
       │    #   undef VSIZEL
       │    }
  1,03 │      pop    %eax
  3,44 │      ret

Overal speedup:

    # best of 5 runs
    before: 8268 idents, 47203 lines, 1526763 bytes, 0.148 s, 319217 lines/s, 10.3 MB/s
    after:  8273 idents, 47231 lines, 1527685 bytes, 0.146 s, 324092 lines/s, 10.5 MB/s

Static ASSERT macro taken from CCAN's[1] build_assert[2] which is in
public domain.

[1] http://ccodearchive.net/
[2] http://git.ozlabs.org/?p=ccan;a=blob;f=ccan/build_assert/build_assert.h;h=24e59c44cd930173178ac9b6e101b0af64a879e9;hb=HEAD
2012-12-21 20:46:26 +04:00
Kirill Smelkov
8eb92e6052 Optimize cstr_reset() to only reset string to empty, not call free() and later malloc()
A CString could be reset to empty just setting its .size to 0.

If memory was already allocated, that would be remembered in
.data_allocated and .size_allocated and on consequent string
manipulations that memory will be used without immediate need to call
malloc().

For

    $ ./tcc -B. -bench -DONE_SOURCE -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" -c tcc.c

after the patch malloc/free are called less often:

(tcc is run in loop; perf record -a sleep 10 && perf report)
before:

 # Overhead      Command       Shared Object                                      Symbol
 # ........  ...........  ..................  ..........................................
 #
     13.89%          tcc  tcc                 [.] next_nomacro1
      4.73%          tcc  libc-2.13.so        [.] _int_malloc
      4.39%          tcc  tcc                 [.] next
      2.94%          tcc  tcc                 [.] tok_str_add2
      2.78%          tcc  tcc                 [.] macro_subst_tok
      2.75%          tcc  libc-2.13.so        [.] free
      2.74%          tcc  tcc                 [.] macro_subst
      2.63%          tcc  libc-2.13.so        [.] _int_free
      2.28%          tcc  tcc                 [.] vswap
      2.24%          tcc  tcc                 [.] next_nomacro_spc
      2.06%          tcc  libc-2.13.so        [.] realloc
      2.00%          tcc  libc-2.13.so        [.] malloc
      1.99%          tcc  tcc                 [.] unary
      1.85%          tcc  libc-2.13.so        [.] __i686.get_pc_thunk.bx
      1.76%  kworker/0:1  [kernel.kallsyms]   [k] delay_tsc
      1.70%          tcc  tcc                 [.] next_nomacro
      1.62%          tcc  tcc                 [.] preprocess
      1.41%          tcc  libc-2.13.so        [.] __memcmp_ssse3
      1.38%          tcc  [kernel.kallsyms]   [k] memset
      1.10%          tcc  tcc                 [.] g
      1.06%          tcc  tcc                 [.] parse_btype
      1.05%          tcc  tcc                 [.] sym_push2
      1.04%          tcc  libc-2.13.so        [.] _int_realloc
      1.00%          tcc  libc-2.13.so        [.] malloc_consolidate

after:

 # Overhead      Command       Shared Object                                          Symbol
 # ........  ...........  ..................  ..............................................
 #
     15.26%          tcc  tcc                 [.] next_nomacro1
      5.07%          tcc  libc-2.13.so        [.] _int_malloc
      4.62%          tcc  tcc                 [.] next
      3.22%          tcc  tcc                 [.] tok_str_add2
      3.03%          tcc  tcc                 [.] macro_subst_tok
      3.02%          tcc  tcc                 [.] macro_subst
      2.59%          tcc  tcc                 [.] next_nomacro_spc
      2.44%          tcc  tcc                 [.] vswap
      2.39%          tcc  libc-2.13.so        [.] _int_free
      2.28%          tcc  libc-2.13.so        [.] free
      2.22%          tcc  tcc                 [.] unary
      2.07%          tcc  libc-2.13.so        [.] realloc
      1.97%          tcc  libc-2.13.so        [.] malloc
      1.70%          tcc  tcc                 [.] preprocess
      1.69%          tcc  libc-2.13.so        [.] __i686.get_pc_thunk.bx
      1.68%          tcc  tcc                 [.] next_nomacro
      1.59%          tcc  [kernel.kallsyms]   [k] memset
      1.55%          tcc  libc-2.13.so        [.] __memcmp_ssse3
      1.22%          tcc  tcc                 [.] parse_comment
      1.11%          tcc  tcc                 [.] g
      1.11%          tcc  tcc                 [.] sym_push2
      1.10%          tcc  tcc                 [.] parse_btype
      1.10%          tcc  libc-2.13.so        [.] _int_realloc
      1.06%          tcc  tcc                 [.] vsetc
      0.98%          tcc  libc-2.13.so        [.] malloc_consolidate

and this gains small speedup for tcc:

    # best of 5 runs
    before: 8268 idents, 47191 lines, 1526670 bytes, 0.153 s, 307997 lines/s, 10.0 MB/s
    after:  8268 idents, 47203 lines, 1526763 bytes, 0.148 s, 319217 lines/s, 10.3 MB/s
2012-12-21 20:46:26 +04:00
Akim Demaille
e79281f58e build: fix out-of-tree install
Makefile (install): Fix installation of headers.
Do not try to install twice libtcc.h, once should be enough.
2012-12-21 14:23:28 +01:00
Akim Demaille
7667a8887a build: fix out-of-tree build
* Makefile (TCC-VERSION): Use top_srcdir.
2012-12-21 14:17:23 +01:00
Akim Demaille
8adfb4a419 build: simplify the makefiles
* Makefile: use "else if" to improve readability.
2012-12-21 14:17:16 +01:00
Akim Demaille
017bbbfee1 configure: support absolete out-of-tree builds
configure: handle the case of absolute paths.
Reported by grishka.
2012-12-21 13:57:22 +01:00
Akim Demaille
d7264e0218 configure: style changes
* configure: use more here-documents.
2012-12-21 13:49:15 +01:00
Akim Demaille
ba49862de6 configure: prefer here-documents
* configure: use here-documents to improve readability and
reduce the clutter.
2012-12-21 13:47:00 +01:00
Akim Demaille
9c9ca2032b configure: style changes
* configure (case $targetos): Improve readibility.
(case $cpu): New, to improve readability compare to if + test.
2012-12-21 13:45:22 +01:00
grischka
5ebc6a964d Makefile: revamp "tar" target
- Creates release tarball from *current* git branch
- Includes tcc-doc.html
- converts important windows files files to CRLF
  (requirement for the cmd.exe batch processor, convenience for
   reading the txt in notepad)
2012-12-20 21:29:57 +01:00
grischka
b174399340 win32: build-tcc.bat: get rid of hardcoded VERSION string
Also:
 - put libtcc.def into libtcc dir
 - remove ar references
 - remove libtcc_test from build
2012-12-20 21:20:54 +01:00
Akim Demaille
3f09b90d21 build: fix VPATH builds
* configure (fn_dirname): New.
Use it to ensure the creation of proper symlinks to Makefiles.
(config.mak): Define top_builddir and top_srcdir.
(CPPFLAGS): Be sure to find the headers.
* Makefile, lib/Makefile, tests/Makefile, tests2/Makefile: Adjust
to set VPATH properly.
Fix confusion between top_builddir and top_srcdir.
2012-12-18 10:06:20 +01:00
Roy
d815896d4c bcheck: there is no unistd.h in win32. 2012-12-10 09:51:49 +08:00
Kirill Smelkov
a55ecf6d2c Repair bounds-checking more, this time tcc -b -run tcc.c -run tcc.c -run tcctest.c works
Hello up there. On the list Grischka made a point that we can't recommend using
-b as long as tcc -b tcc.c doesn't produce anything useful. Now it does, so
please don't treat -b mode as second class citizen anymore.

Thanks,
Kirill

* bcheck2:
  tests: Add tests for compile/run tcc.c with `tcc -b` then compile tcc.c again, then run tcctest.c
  lib/bcheck: Fix code typo in __bound_delete_region()
  lib/bcheck: Don't assume heap goes right after bss
  Make tcc work after self-compiling with bounds-check enabled
2012-12-09 19:51:20 +04:00
Kirill Smelkov
031ff872be tests: Add tests for compile/run tcc.c with tcc -b then compile tcc.c again, then run tcctest.c
Just like with test[123] add their test[123]b variants. After previous 3
patchs all test pass here on Debian GNU/Linux on i385 with gcc-4.7 with
or without memory randomization turned on.
2012-12-09 19:43:40 +04:00
Kirill Smelkov
dbeb4faf21 lib/bcheck: Fix code typo in __bound_delete_region()
We were calling get_page() with t2 index which is not correct, since
get_page() operate on t1 indices. The bug is here from day-1, from
60f781c4 (first version of bounds checker) and show as a crash in
__bound_delete_region() at program exit:

    $ ./tcc   -B. -DTCC_TARGET_I386 -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" -b -run -DONE_SOURCE \
      ./tcc.c -B. -DTCC_TARGET_I386 -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\"    -run -DONE_SOURCE \
      ./tcc.c -B. -run tests/tcctest.c

    (lot's of correct output from tcctest)
    Runtime error: dereferencing invalid pointer
    at 0xa7c21cc4 __bound_delete_region()
    by (nil) ???
    Segmentation fault

The fix is simple - last page should be get through t1_end, like it is
done in __bound_new_region().

After this patch, tcc is being able to compile itself with -b, then
compile itself again and run tcctest with correct output. Tests follow.
2012-12-09 19:33:47 +04:00
Kirill Smelkov
efd9d92b7c lib/bcheck: Don't assume heap goes right after bss
At startup __bound_init() wants to mark malloc zone as invalid memory,
so that any access to memory on heap, not allocated through malloc be
invalid. Other pages are initialized as empty regions, access to which
is not treated as invalid by bounds-checking.

The problem is code incorrectly assumed that heap goes right after bss,
and that is not correct for two cases:

    1) if we are running from `tcc -b -run`, program text data and bss
       will be already in malloced memory, possibly in mmaped region
       insead of heap, and marking memory as invalid from _end
       will not cover heap and probably wrongly mark correct regions.

    2) if address space randomization is turned on, again heap does not
       start from _end, and we'll mark as invalid something else instead
       of malloc area.

For example with the following diagnostic patch ...

    diff --git a/tcc.c b/tcc.c
    index 5dd5725..31c46e8 100644
    --- a/tcc.c
    +++ b/tcc.c
    @@ -479,6 +479,8 @@ static int parse_args(TCCState *s, int argc, char **argv)
         return optind;
     }

    +extern int _etext, _edata, _end;
    +
     int main(int argc, char **argv)
     {
         int i;
    @@ -487,6 +489,18 @@ int main(int argc, char **argv)
         int64_t start_time = 0;
         const char *default_file = NULL;

    +    void *brk;
    +
    +    brk = sbrk(0);
    +
    +    fprintf(stderr, "\n>>> TCC\n\n");
    +    fprintf(stderr, "etext:\t%10p\n",  &_etext);
    +    fprintf(stderr, "edata:\t%10p\n",  &_edata);
    +    fprintf(stderr, "end:\t%10p\n",    &_end);
    +    fprintf(stderr, "brk:\t%10p\n",    brk);
    +    fprintf(stderr, "stack:\t%10p\n",  &brk);
    +
    +    fprintf(stderr, "&errno: %p\n", &errno);
         s = tcc_new();

         output_type = TCC_OUTPUT_EXE;

    diff --git a/tccrun.c b/tccrun.c
    index 531f46a..25ed30a 100644
    --- a/tccrun.c
    +++ b/tccrun.c
    @@ -91,6 +91,8 @@ LIBTCCAPI int tcc_run(TCCState *s1, int argc, char **argv)
         int (*prog_main)(int, char **);
         int ret;

    +    fprintf(stderr, "\n\ntcc_run() ...\n\n");
    +
         if (tcc_relocate(s1, TCC_RELOCATE_AUTO) < 0)
             return -1;

    diff --git a/lib/bcheck.c b/lib/bcheck.c
    index ea5b233..8b26a5f 100644
    --- a/lib/bcheck.c
    +++ b/lib/bcheck.c
    @@ -296,6 +326,8 @@ static void mark_invalid(unsigned long addr, unsigned long size)
         start = addr;
         end = addr + size;

    +    fprintf(stderr, "mark_invalid  %10p - %10p\n", (void *)addr, (void *)end);
    +
         t2_start = (start + BOUND_T3_SIZE - 1) >> BOUND_T3_BITS;
         if (end != 0)
             t2_end = end >> BOUND_T3_BITS;

... Look how memory is laid out for `tcc -b -run ...`:

    $ ./tcc -B. -b -DTCC_TARGET_I386 -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\"  -run   \
        -DONE_SOURCE ./tcc.c -B. -c x.c

    >>> TCC

    etext:   0x8065477
    edata:   0x8070220
    end:     0x807a95c
    brk:     0x807b000
    stack:  0xaffff0f0
    &errno: 0xa7e25688

    tcc_run() ...

    mark_invalid  0xfff80000 -      (nil)
    mark_invalid  0xa7c31d98 - 0xafc31d98

    >>> TCC

    etext:  0xa7c22767
    edata:  0xa7c2759c
    end:    0xa7c31d98
    brk:     0x8211000
    stack:  0xafffeff0
    &errno: 0xa7e25688
    Runtime error: dereferencing invalid pointer
    ./tccpp.c:1953: at 0xa7beebdf parse_number() (included from ./libtcc.c, ./tcc.c)
    ./tccpp.c:3003: by 0xa7bf0708 next() (included from ./libtcc.c, ./tcc.c)
    ./tccgen.c:4465: by 0xa7bfe348 block() (included from ./libtcc.c, ./tcc.c)
    ./tccgen.c:4440: by 0xa7bfe212 block() (included from ./libtcc.c, ./tcc.c)
    ./tccgen.c:5529: by 0xa7c01929 gen_function() (included from ./libtcc.c, ./tcc.c)
    ./tccgen.c:5767: by 0xa7c02602 decl0() (included from ./libtcc.c, ./tcc.c)

The second mark_invalid goes right after in-memory-compiled program's
_end, and oops, that's not where malloc zone is (starts from brk), and oops
again, mark_invalid covers e.g. errno. Then compiled tcc is crasshing by
bcheck on errno access:

    1776 static void parse_number(const char *p)
    1777 {
    1778     int b, t, shift, frac_bits, s, exp_val, ch;
         ...
    1951             *q = '\0';
    1952             t = toup(ch);
    1953             errno = 0;

The solution here is to use sbrk(0) as approximation for the program
break start instead of &_end:

    - if we are a separately compiled program, __bound_init() runs early,
      and sbrk(0) should be equal or very near to start_brk (in case other
      constructors malloc something), or

    - if we are running from under `tcc -b -run`, sbrk(0) will return
      start of heap portion which is under this program control, and not
      mark as invalid earlier allocated memory.

With this patch `tcc -b -run tcc.c ...` succeeds compiling above
small-test program (diagnostic patch is still applied too):

    $ ./tcc -B. -b -DTCC_TARGET_I386 -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\"  -run   \
        -DONE_SOURCE ./tcc.c -B. -c x.c

    >>> TCC

    etext:   0x8065477
    edata:   0x8070220
    end:     0x807a95c
    brk:     0x807b000
    stack:  0xaffff0f0
    &errno: 0xa7e25688

    tcc_run() ...

    mark_invalid  0xfff80000 -      (nil)
    mark_invalid   0x8211000 - 0x10211000

    >>> TCC

    etext:  0xa7c22777
    edata:  0xa7c275ac
    end:    0xa7c31da8
    brk:     0x8211000
    stack:  0xafffeff0
    &errno: 0xa7e25688

    (completes ok)

but running `tcc -b -run tcc.c -run tests/tcctest.c` sigsegv's - that's
the plot for the next patch.
2012-12-09 19:05:36 +04:00
Kirill Smelkov
43a11a7ed1 Make tcc work after self-compiling with bounds-check enabled
For vstack Fabrice used the trick to initialize vtop to &vstack[-1], so
that on first push, vtop becomes &vstack[0] and a value is also stored
there - everything works.

Except that when tcc is compiled with bounds-checking enabled, vstack - 1
returns INVALID_POINTER and oops...

Let's workaround it with artificial 1 vstack slot which will not be
used, but only serve as an indicator that pointing to &vstack[-1] is ok.

Now, tcc, after being self-compiled with -b works:

    $ ./tcc -B. -o tccb  -DONE_SOURCE -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" tcc.c  -ldl
    $ cd tests
    $ ../tcc -B.. -run tcctest.c >1
    $ ../tccb -B.. -run tcctest.c >2
    $ diff -u 1 2

and note, tcc's compilation speed is not affected:

    $ ./tcc -B. -bench -DONE_SOURCE -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" -c tcc.c

    before: 8270 idents, 47221 lines, 1527730 bytes, 0.152 s, 309800 lines/s, 10.0 MB/s
    after:  8271 idents, 47221 lines, 1527733 bytes, 0.152 s, 310107 lines/s, 10.0 MB/s

But note, that `tcc -b -run tcc` is still broken - for example it crashes
on
    $ cat x.c
    double get100 () { return 100.0; }

    $ ./tcc -B. -b -DTCC_TARGET_I386 -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\"  -run   \
        -DONE_SOURCE ./tcc.c -B. -c x.c
    Runtime error: dereferencing invalid pointer
    ./tccpp.c:1953: at 0xa7beebdf parse_number() (included from ./libtcc.c, ./tcc.c)
    ./tccpp.c:3003: by 0xa7bf0708 next() (included from ./libtcc.c, ./tcc.c)
    ./tccgen.c:4465: by 0xa7bfe348 block() (included from ./libtcc.c, ./tcc.c)
    ./tccgen.c:4440: by 0xa7bfe212 block() (included from ./libtcc.c, ./tcc.c)
    ./tccgen.c:5529: by 0xa7c01929 gen_function() (included from ./libtcc.c, ./tcc.c)
    ./tccgen.c:5767: by 0xa7c02602 decl0() (included from ./libtcc.c, ./tcc.c)

that's because lib/bcheck.c runtime needs more fixes -- see next
patches.
2012-12-09 18:06:09 +04:00
Thomas Preud'homme
c4a18f47a2 Detect ARM CPU version in configure
Instead of guessing the ARM CPU version to compile for from tcc.h, we
now detect it in configure and output the value in config.h
2012-12-04 11:17:51 +01:00
Thomas Preud'homme
05b02a5581 arm-gen.c: Invalid operator test always false
Invalid operator test is always false in gen_opf for arm (found with
cppcheck). This patch fixes the issue.
2012-11-28 22:26:39 +01:00
Thomas Preud'homme
8d90205fd9 Fix OABI calling convention
OABI calling convention was broken since the addition of the hardfloat
calling convention in commit 7f6095bfec.
This commit fixes the breakage.
2012-11-28 22:26:39 +01:00
Kirill Smelkov
168aed4984 tests: btest should only run on targets supporting bcheck
After 40a54c43 (Repair bounds-checking runtime), and in particular
5d648485 (Now btest pass!) `make test` was broken on ARCH != i386,
because I've changed btest to unconditionally run on all arches.

But bounds-checking itsels is only supported on i386 and oops...

Fix it.

Reported-by: Thomas Preud'homme <robotux@celest.fr>
2012-11-24 12:54:03 +04:00
Kirill Smelkov
4744269494 Update .gitignore
The following files were not ignored (produced by build on i386 with
--enable-cross):

    arm-eabi-tcc
    arm-fpa-ld-tcc
    arm-fpa-tcc
    arm-vfp-tcc
    c67-tcc
    i386-win32-tcc
    lib/i386-win32/
    lib/x86_64-win32/
    x86_64-tcc
    x86_64-win32-tcc
2012-11-22 10:40:02 +04:00
Thomas Preud'homme
6eec931038 Only reference vfpr when available
A line in gfunc_call in arm-gen.c is referencing vfpr unconditionally.
Yet, this function is only available when TCC_ARM_VFP is set. While this
code is only triggered when TCC_ARM_VFP, it fails at compile time. This
commit fix the problem.
2012-11-21 12:21:51 +01:00
Thomas Preud'homme
15a315f4a5 Define TCC_ARM_EABI if using hardfloat ABI
TCC_ARM_EABI should be defined when compiling with hardfloat calling
convention. This commit rework the Makefile to distinguish between
calling convention and multiarch and define TCC_ARM_EABI when hardfloat
calling convention is used. The result is to first guess the calling
convention and then add the multiarch triplet if necessary.
2012-11-20 11:36:13 +01:00
Thomas Preud'homme
e2212738d4 Generate PLT thumb stub only when necessary
Generate PLT thumb stub for an ARM PLT entry only when at least one
Thumb instruction branches to that entry. This is a rewrite of the
previous patch.
2012-11-17 10:01:11 +01:00
Kirill Smelkov
ab24aaeca3 i386: We can change 'lea 0(%ebp),r' to 'mov %ebp,r'
Because that mov is 1 byte shorter, look:

    int *func()
    {
        return __builtin_frame_address(0);
    }

before patch:

00000000 <func>:
   0:   55                      push   %ebp
   1:   89 e5                   mov    %esp,%ebp
   3:   81 ec 00 00 00 00       sub    $0x0,%esp
   9:   8d 45 00                lea    0x0(%ebp),%eax   // <- here
   c:   e9 00 00 00 00          jmp    11 <func+0x11>
  11:   c9                      leave
  12:   c3                      ret

after patch:

00000000 <func>:
   0:   55                      push   %ebp
   1:   89 e5                   mov    %esp,%ebp
   3:   81 ec 00 00 00 00       sub    $0x0,%esp
   9:   89 e8                   mov    %ebp,%eax        // <- here
   b:   e9 00 00 00 00          jmp    10 <func+0x10>
  10:   c9                      leave
  11:   c3                      ret
2012-11-16 10:22:45 +04:00
Kirill Smelkov
b2a02961b4 Add support for __builtin_frame_address(level)
Continuing d6072d37 (Add __builtin_frame_address(0)) implement
__builtin_frame_address for levels greater than zero, in order for
tinycc to be able to compile its own lib/bcheck.c after
cffb7af9 (lib/bcheck: Prevent __bound_local_new / __bound_local_delete
from being miscompiled).

I'm new to the internals, and used the most simple way to do it.
Generated code is not very good for levels >= 2, compare

                gcc                         tcc

    level=0     mov    %ebp,%eax            lea    0x0(%ebp),%eax

    level=1     mov    0x0(%ebp),%eax       mov    0x0(%ebp),%eax

    level=2     mov    0x0(%ebp),%eax       mov    0x0(%ebp),%eax
                mov    (%eax),%eax          mov    %eax,-0x10(%ebp)
                                            mov    -0x10(%ebp),%eax
                                            mov    (%eax),%eax

    level=3     mov    0x0(%ebp),%eax       mov    0x0(%ebp),%eax
                mov    (%eax),%eax          mov    (%eax),%ecx
                mov    (%eax),%eax          mov    (%ecx),%eax

But this is still an improvement and for bcheck we need level=1 for
which the code is good.

For the tests I had to force gcc use -O0 to not inline the functions.
And -fno-omit-frame-pointer just in case.

If someone knows how to improve the generated code - help is
appreciated.

Thanks,
Kirill

Cc: Michael Matz <matz@suse.de>
Cc: Shinichiro Hamaji <shinichiro.hamaji@gmail.com>
2012-11-16 10:22:14 +04:00
Milutin Jovanović
e79c3533ec -Wno-unused-result now added only on gcc >= 4.4
This option does not exist in gcc 4.3 and earlier, and it breaks the build on
systems with older compilers. The makefile has been enhanced to test for the
version and adds it only if a newer compiler is detected.
2012-11-14 17:45:15 -05:00
Kirill Smelkov
40a54c4399 Repair bounds-checking runtime
On this weekend a thought came to me again that tinycc could be used for
scripting. Only its bounds-checking mode turned out to be broken, which
is a pity because bounds-checking is sometimes handy especially for
quickly written scripts.

Since tinycc is one of those small beautiful things, seldomly happening
in our times, I couldn't resist trying to fix it.

Thanks,
Kirill

* bcheck:
  Now btest pass!
  lib/bcheck: Prevent __bound_local_new / __bound_local_delete from being miscompiled
  lib/bcheck: Prevent libc_malloc/libc_free etc from being miscompiled
2012-11-14 10:50:34 +04:00
Kirill Smelkov
5d648485bd Now btest pass!
Thanks to two previous commits now btest tests pass, at least on Linux.

Signed-off-by: Kirill Smelkov <kirr@navytux.spb.ru>
2012-11-13 22:23:01 +04:00
Kirill Smelkov
cffb7af9f9 lib/bcheck: Prevent __bound_local_new / __bound_local_delete from being miscompiled
On i386 and gcc-4.7 I found that __bound_local_new was miscompiled -
look:

    #ifdef __i386__
    /* return the frame pointer of the caller */
    #define GET_CALLER_FP(fp)\
    {\
        unsigned long *fp1;\
        __asm__ __volatile__ ("movl %%ebp,%0" :"=g" (fp1));\
        fp = fp1[0];\
    }
    #endif

    /* called when entering a function to add all the local regions */
    void FASTCALL __bound_local_new(void *p1)
    {
        unsigned long addr, size, fp, *p = p1;
        GET_CALLER_FP(fp);
        for(;;) {
            addr = p[0];
            if (addr == 0)
                break;
            addr += fp;
            size = p[1];
            p += 2;
            __bound_new_region((void *)addr, size);
        }
    }

    __bound_local_new:
    .LFB40:
            .cfi_startproc
            pushl   %esi
            .cfi_def_cfa_offset 8
            .cfi_offset 6, -8
            pushl   %ebx
            .cfi_def_cfa_offset 12
            .cfi_offset 3, -12
            subl    $8, %esp            // NOTE prologue does not touch %ebp
            .cfi_def_cfa_offset 20
    #APP
    # 235 "lib/bcheck.c" 1
            movl %ebp,%edx              // %ebp -> fp1
    # 0 "" 2
    #NO_APP
            movl    (%edx), %esi        // fp1[0] -> fp
            movl    (%eax), %edx
            movl    %eax, %ebx
            testl   %edx, %edx
            je      .L167
            .p2align 2,,3
    .L173:
            movl    4(%ebx), %eax
            addl    $8, %ebx
            movl    %eax, 4(%esp)
            addl    %esi, %edx
            movl    %edx, (%esp)
            call    __bound_new_region
            movl    (%ebx), %edx
            testl   %edx, %edx
            jne     .L173
    .L167:
            addl    $8, %esp
            .cfi_def_cfa_offset 12
            popl    %ebx
            .cfi_restore 3
            .cfi_def_cfa_offset 8
            popl    %esi
            .cfi_restore 6
            .cfi_def_cfa_offset 4
            ret

here GET_CALLER_FP() assumed that its using function setups it's stack
frame, i.e. first save, then set %ebp to stack frame start, and then it
has to do perform two lookups: 1) to get current stack frame through
%ebp, and 2) get caller stack frame through (%ebp).

And here is the problem: gcc decided not to setup %ebp for
__bound_local_new and in such case GET_CALLER_FP actually becomes
GET_CALLER_CALLER_FP and oops, wrong regions are registered in bcheck
tables...

The solution is to stop using hand written assembly and rely on gcc's
__builtin_frame_address(1) to get callers frame stack(*). I think for the
builtin gcc should generate correct code, independent of whether it
decides or not to omit frame pointer in using function - it knows it.

(*) judging by gcc history, __builtin_frame_address was there almost
    from the beginning - at least it is present in 1992 as seen from the
    following commit:

    http://gcc.gnu.org/git/?p=gcc.git;a=commit;h=be07f7bdbac76d87d3006c89855491504d5d6202

    so we can rely on it being supported by all versions of gcc.

In my environment the assembly of __bound_local_new changes as follows:

    diff --git a/bcheck0.s b/bcheck1.s
    index 4c02a5f..ef68918 100644
    --- a/bcheck0.s
    +++ b/bcheck1.s
    @@ -1409,20 +1409,17 @@ __bound_init:
     __bound_local_new:
     .LFB40:
            .cfi_startproc
    -       pushl   %esi
    +       pushl   %ebp                // NOTE prologue saves %ebp ...
            .cfi_def_cfa_offset 8
    -       .cfi_offset 6, -8
    +       .cfi_offset 5, -8
    +       movl    %esp, %ebp          // ... and reset it to local stack frame
    +       .cfi_def_cfa_register 5
    +       pushl   %esi
            pushl   %ebx
    -       .cfi_def_cfa_offset 12
    -       .cfi_offset 3, -12
            subl    $8, %esp
    -       .cfi_def_cfa_offset 20
    -#APP
    -# 235 "lib/bcheck.c" 1
    -       movl %ebp,%edx
    -# 0 "" 2
    -#NO_APP
    -       movl    (%edx), %esi
    +       .cfi_offset 6, -12
    +       .cfi_offset 3, -16
    +       movl    0(%ebp), %esi       // stkframe -> stkframe.parent -> fp
            movl    (%eax), %edx
            movl    %eax, %ebx
            testl   %edx, %edx
    @@ -1440,13 +1437,13 @@ __bound_local_new:
            jne     .L173
     .L167:
            addl    $8, %esp
    -       .cfi_def_cfa_offset 12
            popl    %ebx
            .cfi_restore 3
    -       .cfi_def_cfa_offset 8
            popl    %esi
            .cfi_restore 6
    -       .cfi_def_cfa_offset 4
    +       popl    %ebp
    +       .cfi_restore 5
    +       .cfi_def_cfa 4, 4
            ret
            .cfi_endproc

i.e. now it compiles correctly.

Though I do not have x86_64 to test, my guess is that
__builtin_frame_address(1) should work there too. If not - please revert
only x86_64 part of the patch. Thanks.

Cc: Michael Matz <matz@suse.de>
2012-11-13 22:17:58 +04:00
Kirill Smelkov
646b51833f lib/bcheck: Prevent libc_malloc/libc_free etc from being miscompiled
On i386 and gcc-4.7 I found that libc_malloc was miscompiled - look:

static void *libc_malloc(size_t size)
{
    void *ptr;
    restore_malloc_hooks();     // __malloc_hook = saved_malloc_hook
    ptr = malloc(size);
    install_malloc_hooks();     // saved_malloc_hook = __malloc_hook, __malloc_hook = __bound_malloc
    return ptr;
}

	.type	libc_malloc, @function
libc_malloc:
.LFB56:
	.cfi_startproc
	pushl	%edx
	.cfi_def_cfa_offset 8
	movl	%eax, (%esp)
	call	malloc
	movl	$__bound_malloc, __malloc_hook
	movl	$__bound_free, __free_hook
	movl	$__bound_realloc, __realloc_hook
	movl	$__bound_memalign, __memalign_hook
	popl	%ecx
	.cfi_def_cfa_offset 4
	ret

Here gcc inlined both restore_malloc_hooks() and install_malloc_hooks()
and decided that

    saved_malloc_hook -> __malloc_hook -> saved_malloc_hook

stores are not needed and could be ommitted. Only it did not know
__molloc_hook affects malloc()...

So add compiler barrier to both install and restore hooks functions and
be done with it - the code is now ok:

    diff --git a/bcheck0.s b/bcheck1.s
    index 5f50293..4c02a5f 100644
    --- a/bcheck0.s
    +++ b/bcheck1.s
    @@ -42,8 +42,24 @@ libc_malloc:
            .cfi_startproc
            pushl   %edx
            .cfi_def_cfa_offset 8
    +       movl    saved_malloc_hook, %edx
    +       movl    %edx, __malloc_hook
    +       movl    saved_free_hook, %edx
    +       movl    %edx, __free_hook
    +       movl    saved_realloc_hook, %edx
    +       movl    %edx, __realloc_hook
    +       movl    saved_memalign_hook, %edx
    +       movl    %edx, __memalign_hook
            movl    %eax, (%esp)
            call    malloc
    +       movl    __malloc_hook, %edx
    +       movl    %edx, saved_malloc_hook
    +       movl    __free_hook, %edx
    +       movl    %edx, saved_free_hook
    +       movl    __realloc_hook, %edx
    +       movl    %edx, saved_realloc_hook
    +       movl    __memalign_hook, %edx
    +       movl    %edx, saved_memalign_hook
            movl    $__bound_malloc, __malloc_hook
            movl    $__bound_free, __free_hook
            movl    $__bound_realloc, __realloc_hook

For barrier I use

    __asm__ __volatile__ ("": : : "memory")

which is used as compiler barrier by Linux kernel, and mentioned in gcc
docs and in wikipedia [1].

Without this patch any program compiled with tcc -b crashes in startup
because of infinite recursion in libc_malloc.

[1] http://en.wikipedia.org/wiki/Memory_ordering#Compiler_memory_barrier
2012-11-13 22:17:51 +04:00
Thomas Preud'homme
1af3bca4ea Revert "Generate PLT thumb stub only when necessary"
Revert commit 891dfcdf3f since it assumes
*all* architectures supported by tcc have GOT offsets aligned on 2. A
rework of this commit is being done since without it all PLT entries
grow by 4 bytes.
2012-11-12 23:14:21 +01:00
Thomas Preud'homme
3c986eeae3 Add armv6l to ARM supported processors
Add armv6l to the list of supported ARM architecture (as returned by
uname -m) in ./configure
2012-11-11 20:01:01 +01:00
Thomas Preud'homme
14c99236da Call to veneers in ARM mode
Since commit c6630ef92a, Call to a veneer
when the final symbol to be reached is thumb is made through a blx
instruction. This is a mistake since veneers are ARM instructions and
should thus be called with a simple bl. This commit prevent the bl ->
blx conversion when a veneer is used.
2012-11-09 10:59:06 +01:00
Thomas Preud'homme
061b5799cc Allow source fortification
Source fortification now works correctly : it compiles without warning
except unused result and the resulting tcc is working fine. Hence let's
stop disabling source fortification and hide unused result instead.
2012-11-07 21:15:07 +01:00
Thomas Preud'homme
891dfcdf3f Generate PLT thumb stub only when necessary
Generate PLT thumb stub for an ARM PLT entry only when at least one
Thumb instruction branches to that entry.

Warning: To save space, this commit reuses the bit 0 of entries of
got_offsets array. The GOT offset is thus saved in a 31 bit value.
Make sure to divide by 2 (right shift by 1) an offset before storing it
there and conversely to multiply the value by 2 (left shift by 1) before
using it.
2012-11-07 20:51:33 +01:00
Thomas Preud'homme
e07802e39d Support R_ARM_THM_JUMP24 relocation to plt
Add thumb stubs switching from thumb mode to arm mode to *all* PLT
entries so that R_ARM_THM_JUMP24 relocations to PLT entries can be
satisfied.
2012-11-07 20:48:14 +01:00
Thomas Preud'homme
b0f08ace94 Create a clean target for tests2/Makefile
the absence of a clean target in tests2/Makefile make the clean target
in the main Makefile fails to complete. This commit create such a target
which removes the only file created when tests pass successfully.
2012-11-07 14:56:37 +01:00
Thomas Preud'homme
a7f010ee8a Honour *FLAGS everywhere
Add CPPFLAGS, CFLAGS and LDFLAGS everywhere it's missing.
2012-11-06 15:20:53 +01:00
Hitoshi Mitake
5eb64357b1 forbid invalid comparison of struct
Current tcc permits comparison of structs and comparison between
struct and other typed values.
2012-11-05 22:34:43 +09:00
Roy Tam
943574aba5 pe: fix tcc not linking to user32 and gdi32 2012-11-02 16:59:21 +08:00
Thomas Preud'homme
034dce4f04 Enable arm hardfloat calling convention
Use arm hardfloat calling convention when the system is using it
(detected by searching for hardfloat multiarch directory).
2012-10-28 19:55:23 +01:00
Thomas Preud'homme
fad68c9163 Add support for R_ARM_THM_{JUMP24,CALL} relocs
Add support for relocations R_ARM_THM_JUMP24 and R_ARM_THM_CALL. These
are encountered with gcc when compiling for armv6 or greater with
-mthumb flag and a call (conditional or not) is done.
2012-10-28 19:55:12 +01:00
Thomas Preud'homme
508df168f4 Fix commit 85f6fad3a6
Don't reset nocode_wanted with saved_nocode_wanted if it hasn't been
modified (and hence saved_nocode_wanted is uninitialized).
2012-10-25 20:14:55 +02:00
Thomas Preud'homme
cf95ac399c Error out in case of variable name clash
Error out when two local variable with same name are defined in the same
scope. This fixes bug #15597 in savannah's BTS.
2012-10-25 19:40:50 +02:00
Thomas Preud'homme
85f6fad3a6 Forbid VLA as static variables
Currently, VLA are not forbidden for static variable. This leads to
problems even if for fixed-size array when the size expression uses the
ternary operator (cond ? then-value : else-value) because it is parsed
as a general expression which leads to code generated in this case.

This commit solve the problem by forbidding VLA for static variables.
Although not required for the fix, avoiding code generation when the
expression is constant would be a nice addition though.
2012-10-25 18:07:13 +02:00
Thomas Preud'homme
9966fd4eae Only use blx if available
Introduce ARM version for the target architecture in order to determine
if blx instruction can be used or not. Availability of blx instruction
allows for more scenarii supported in R_ARM_CALL relocation. It should
also be useful when introducing support for the R_ARM_THM_CALL
relocation.
2012-10-16 00:31:56 +02:00
Thomas Preud'homme
c6630ef92a Fix R_ARM_CALL when target fonction is Thumb
With R_ARM_CALL, if target function is to be entered in Thumb mode, the
relocation is supposed to transform bl in blx. This is not the case
actually so this commit is there to fix it.
2012-10-10 00:21:26 +02:00