2016-02-10 02:53:54 +03:00
|
|
|
/*
|
|
|
|
* GENANN - Minimal C Artificial Neural Network
|
|
|
|
*
|
|
|
|
* Copyright (c) 2015, 2016 Lewis Van Winkle
|
|
|
|
*
|
|
|
|
* http://CodePlea.com
|
|
|
|
*
|
|
|
|
* This software is provided 'as-is', without any express or implied
|
|
|
|
* warranty. In no event will the authors be held liable for any damages
|
|
|
|
* arising from the use of this software.
|
|
|
|
*
|
|
|
|
* Permission is granted to anyone to use this software for any purpose,
|
|
|
|
* including commercial applications, and to alter it and redistribute it
|
|
|
|
* freely, subject to the following restrictions:
|
|
|
|
*
|
|
|
|
* 1. The origin of this software must not be misrepresented; you must not
|
|
|
|
* claim that you wrote the original software. If you use this software
|
|
|
|
* in a product, an acknowledgement in the product documentation would be
|
|
|
|
* appreciated but is not required.
|
|
|
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
|
|
|
* misrepresented as being the original software.
|
|
|
|
* 3. This notice may not be removed or altered from any source distribution.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "genann.h"
|
|
|
|
|
|
|
|
#include <assert.h>
|
2017-10-22 00:53:58 +03:00
|
|
|
#include <errno.h>
|
2017-10-22 00:54:44 +03:00
|
|
|
#include <math.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
2016-02-10 02:53:54 +03:00
|
|
|
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
/*
 * Select how activation functions are reached.
 *
 * When the macro genann_act is not defined, genann_act_hidden and
 * genann_act_output expand to the *_indirect wrappers, which call through the
 * function pointers stored on the network. When genann_act is defined, both
 * names expand to genann_act itself, so the call goes to that one function
 * directly.
 */
#ifndef genann_act
|
|
|
|
#define genann_act_hidden genann_act_hidden_indirect
|
|
|
|
#define genann_act_output genann_act_output_indirect
|
|
|
|
#else
|
|
|
|
#define genann_act_hidden genann_act
|
|
|
|
#define genann_act_output genann_act
|
|
|
|
#endif
|
|
|
|
|
2016-02-10 02:53:54 +03:00
|
|
|
/* Number of entries in the precomputed sigmoid lookup table (lookup[]). */
#define LOOKUP_SIZE 4096
|
|
|
|
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
double genann_act_hidden_indirect(const struct genann *ann, double a) {
|
|
|
|
return ann->activation_hidden(ann, a);
|
|
|
|
}
|
|
|
|
|
|
|
|
double genann_act_output_indirect(const struct genann *ann, double a) {
|
|
|
|
return ann->activation_output(ann, a);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Lower bound of the input range covered by the sigmoid lookup table. */
const double sigmoid_dom_min = -15.0;
|
|
|
|
/* Upper bound of the input range covered by the sigmoid lookup table. */
const double sigmoid_dom_max = 15.0;
|
|
|
|
/*
 * Table entries per unit of input; set by genann_init_sigmoid_lookup() to
 * LOOKUP_SIZE / (sigmoid_dom_max - sigmoid_dom_min) and used as a multiplier
 * when turning an input into a table index.
 */
double interval;
|
|
|
|
/* Precomputed sigmoid samples, filled by genann_init_sigmoid_lookup(). */
double lookup[LOOKUP_SIZE];
|
|
|
|
|
|
|
|
/*
 * Branch-prediction hints built on the compiler's __builtin_expect.
 * The !! collapses any non-zero x to 1 so it can be compared with the
 * expected value.
 */
#define likely(x) __builtin_expect(!!(x), 1)
|
|
|
|
#define unlikely(x) __builtin_expect(!!(x), 0)
|
|
|
|
/* Marks a declaration (e.g. a parameter) as intentionally unused. */
#define __unused __attribute__((unused))
|
|
|
|
|
|
|
|
/*
 * Logistic sigmoid: 1 / (1 + e^-a).
 *
 * ann: unused; present so the signature matches the other activation
 *      functions.
 * a:   input value.
 *
 * Inputs below -45 return exactly 0 and inputs above 45 return exactly 1,
 * without calling exp(). Everything else (including NaN, for which both range
 * checks are false) goes through the exp()-based formula.
 */
double inline genann_act_sigmoid(const genann *ann __unused, double a) {
    if (a < -45.0) {
        return 0;
    } else if (a > 45.0) {
        return 1;
    } else {
        return 1.0 / (1 + exp(-a));
    }
}
|
|
|
|
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
void genann_init_sigmoid_lookup(const genann *ann) {
|
|
|
|
const double f = (sigmoid_dom_max - sigmoid_dom_min) / LOOKUP_SIZE;
|
2016-02-10 02:53:54 +03:00
|
|
|
int i;
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
|
|
|
|
interval = LOOKUP_SIZE / (sigmoid_dom_max - sigmoid_dom_min);
|
2016-02-10 02:53:54 +03:00
|
|
|
for (i = 0; i < LOOKUP_SIZE; ++i) {
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
lookup[i] = genann_act_sigmoid(ann, sigmoid_dom_min + f * i);
|
2016-02-10 02:53:54 +03:00
|
|
|
}
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
}
|
2016-02-10 02:53:54 +03:00
|
|
|
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
double inline genann_act_sigmoid_cached(const genann *ann __unused, double a) {
|
genann: Use reciprocal interval value to strength reduce divide to multiply
This gives a reduction of roughly 2.5 million instructions in the execution
trace of example4.
genann_act_sigmoid_cached() previously divided by interval to calculate the
lookup index. Divide is an expensive operation, so instead use the reciprocal of
the existing interval calculation to reduce the divide to a multiply.
Building with the following configuration:
```
$ head /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 61
model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz
stepping : 4
microcode : 0x25
cpu MHz : 2593.871
cache size : 4096 KB
physical id : 0
$ cat /etc/os-release
NAME="Ubuntu"
VERSION="17.10 (Artful Aardvark)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 17.10"
VERSION_ID="17.10"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=artful
UBUNTU_CODENAME=artful
$ cc --version
gcc (Ubuntu 7.2.0-8ubuntu3) 7.2.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
```
on my Lenovo X1 Carbon Gen 3 machine sees the following:
```
$ make CFLAGS="-g -O3 -march=native -DNDEBUG"
cc -g -O3 -march=native -DNDEBUG -c -o test.o test.c
cc -g -O3 -march=native -DNDEBUG -c -o genann.o genann.c
cc -g -O3 -march=native -DNDEBUG -c -o example1.o example1.c
cc -g -O3 -march=native -DNDEBUG -c -o example2.o example2.c
cc -g -O3 -march=native -DNDEBUG -c -o example3.o example3.c
cc -g -O3 -march=native -DNDEBUG -c -o example4.o example4.c
cc -g -O3 -march=native -DNDEBUG -c -o strings.o strings.c
cc test.o genann.o -lm -o test
cc example1.o genann.o -lm -o example1
cc example4.o genann.o -lm -o example4
cc example3.o genann.o -lm -o example3
cc example2.o genann.o -lm -o example2
cc strings.o genann.o -lm -o strings
$ for i in `seq 0 10`; do ./example4 > /dev/null; done; sudo perf stat record ./example4
GENANN example 4.
Train an ANN on the IRIS dataset using backpropagation.
Loading 150 data points from example/iris.data
Training for 5000 loops over data.
147/150 correct (98.0%).
Performance counter stats for './example4':
101.369081 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.779 K/sec
320,197,883 cycles # 3.159 GHz
1,121,174,423 instructions # 3.50 insn per cycle
223,257,752 branches # 2202.425 M/sec
62,680 branch-misses # 0.03% of all branches
0.101595114 seconds time elapsed
```
Prior to the change, we see something like:
```
$ make CFLAGS="-g -O3 -march=native"
cc -g -O3 -march=native -c -o test.o test.c
cc -g -O3 -march=native -c -o genann.o genann.c
cc -g -O3 -march=native -c -o example1.o example1.c
cc -g -O3 -march=native -c -o example2.o example2.c
cc -g -O3 -march=native -c -o example3.o example3.c
cc -g -O3 -march=native -c -o example4.o example4.c
cc -g -O3 -march=native -c -o strings.o strings.c
cc test.o genann.o -lm -o test
cc example1.o genann.o -lm -o example1
cc example3.o genann.o -lm -o example3
cc example4.o genann.o -lm -o example4
cc strings.o genann.o -lm -o strings
cc example2.o genann.o -lm -o example2
$ for i in `seq 0 10`; do ./example4 > /dev/null; done; sudo perf stat record ./example4
GENANN example 4.
Train an ANN on the IRIS dataset using backpropagation.
Loading 150 data points from example/iris.data
Training for 5000 loops over data.
147/150 correct (98.0%).
Performance counter stats for './example4':
104.644198 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.755 K/sec
330,340,554 cycles # 3.157 GHz
1,123,669,767 instructions # 3.40 insn per cycle
215,441,809 branches # 2058.803 M/sec
62,406 branch-misses # 0.03% of all branches
0.104891323 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-17 02:59:40 +03:00
|
|
|
assert(!isnan(a));
|
|
|
|
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
if (a < sigmoid_dom_min) return lookup[0];
|
|
|
|
if (a >= sigmoid_dom_max) return lookup[LOOKUP_SIZE - 1];
|
genann: Use reciprocal interval value to strength reduce divide to multiply
This gives a reduction of roughly 2.5 million instructions in the execution
trace of example4.
genann_act_sigmoid_cached() previously divided by interval to calculate the
lookup index. Divide is an expensive operation, so instead use the reciprocal of
the existing interval calculation to reduce the divide to a multiply.
Building with the following configuration:
```
$ head /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 61
model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz
stepping : 4
microcode : 0x25
cpu MHz : 2593.871
cache size : 4096 KB
physical id : 0
$ cat /etc/os-release
NAME="Ubuntu"
VERSION="17.10 (Artful Aardvark)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 17.10"
VERSION_ID="17.10"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=artful
UBUNTU_CODENAME=artful
$ cc --version
gcc (Ubuntu 7.2.0-8ubuntu3) 7.2.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
```
on my Lenovo X1 Carbon Gen 3 machine sees the following:
```
$ make CFLAGS="-g -O3 -march=native -DNDEBUG"
cc -g -O3 -march=native -DNDEBUG -c -o test.o test.c
cc -g -O3 -march=native -DNDEBUG -c -o genann.o genann.c
cc -g -O3 -march=native -DNDEBUG -c -o example1.o example1.c
cc -g -O3 -march=native -DNDEBUG -c -o example2.o example2.c
cc -g -O3 -march=native -DNDEBUG -c -o example3.o example3.c
cc -g -O3 -march=native -DNDEBUG -c -o example4.o example4.c
cc -g -O3 -march=native -DNDEBUG -c -o strings.o strings.c
cc test.o genann.o -lm -o test
cc example1.o genann.o -lm -o example1
cc example4.o genann.o -lm -o example4
cc example3.o genann.o -lm -o example3
cc example2.o genann.o -lm -o example2
cc strings.o genann.o -lm -o strings
$ for i in `seq 0 10`; do ./example4 > /dev/null; done; sudo perf stat record ./example4
GENANN example 4.
Train an ANN on the IRIS dataset using backpropagation.
Loading 150 data points from example/iris.data
Training for 5000 loops over data.
147/150 correct (98.0%).
Performance counter stats for './example4':
101.369081 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.779 K/sec
320,197,883 cycles # 3.159 GHz
1,121,174,423 instructions # 3.50 insn per cycle
223,257,752 branches # 2202.425 M/sec
62,680 branch-misses # 0.03% of all branches
0.101595114 seconds time elapsed
```
Prior to the change, we see something like:
```
$ make CFLAGS="-g -O3 -march=native"
cc -g -O3 -march=native -c -o test.o test.c
cc -g -O3 -march=native -c -o genann.o genann.c
cc -g -O3 -march=native -c -o example1.o example1.c
cc -g -O3 -march=native -c -o example2.o example2.c
cc -g -O3 -march=native -c -o example3.o example3.c
cc -g -O3 -march=native -c -o example4.o example4.c
cc -g -O3 -march=native -c -o strings.o strings.c
cc test.o genann.o -lm -o test
cc example1.o genann.o -lm -o example1
cc example3.o genann.o -lm -o example3
cc example4.o genann.o -lm -o example4
cc strings.o genann.o -lm -o strings
cc example2.o genann.o -lm -o example2
$ for i in `seq 0 10`; do ./example4 > /dev/null; done; sudo perf stat record ./example4
GENANN example 4.
Train an ANN on the IRIS dataset using backpropagation.
Loading 150 data points from example/iris.data
Training for 5000 loops over data.
147/150 correct (98.0%).
Performance counter stats for './example4':
104.644198 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.755 K/sec
330,340,554 cycles # 3.157 GHz
1,123,669,767 instructions # 3.40 insn per cycle
215,441,809 branches # 2058.803 M/sec
62,406 branch-misses # 0.03% of all branches
0.104891323 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-17 02:59:40 +03:00
|
|
|
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
size_t j = (size_t)((a-sigmoid_dom_min)*interval+0.5);
|
genann: Use reciprocal interval value to strength reduce divide to multiply
This gives a reduction of roughly 2.5 million instructions in the execution
trace of example4.
genann_act_sigmoid_cached() previously divided by interval to calculate the
lookup index. Divide is an expensive operation, so instead use the reciprocal of
the existing interval calculation to reduce the divide to a multiply.
Building with the following configuration:
```
$ head /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 61
model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz
stepping : 4
microcode : 0x25
cpu MHz : 2593.871
cache size : 4096 KB
physical id : 0
$ cat /etc/os-release
NAME="Ubuntu"
VERSION="17.10 (Artful Aardvark)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 17.10"
VERSION_ID="17.10"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=artful
UBUNTU_CODENAME=artful
$ cc --version
gcc (Ubuntu 7.2.0-8ubuntu3) 7.2.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
```
on my Lenovo X1 Carbon Gen 3 machine sees the following:
```
$ make CFLAGS="-g -O3 -march=native -DNDEBUG"
cc -g -O3 -march=native -DNDEBUG -c -o test.o test.c
cc -g -O3 -march=native -DNDEBUG -c -o genann.o genann.c
cc -g -O3 -march=native -DNDEBUG -c -o example1.o example1.c
cc -g -O3 -march=native -DNDEBUG -c -o example2.o example2.c
cc -g -O3 -march=native -DNDEBUG -c -o example3.o example3.c
cc -g -O3 -march=native -DNDEBUG -c -o example4.o example4.c
cc -g -O3 -march=native -DNDEBUG -c -o strings.o strings.c
cc test.o genann.o -lm -o test
cc example1.o genann.o -lm -o example1
cc example4.o genann.o -lm -o example4
cc example3.o genann.o -lm -o example3
cc example2.o genann.o -lm -o example2
cc strings.o genann.o -lm -o strings
$ for i in `seq 0 10`; do ./example4 > /dev/null; done; sudo perf stat record ./example4
GENANN example 4.
Train an ANN on the IRIS dataset using backpropagation.
Loading 150 data points from example/iris.data
Training for 5000 loops over data.
147/150 correct (98.0%).
Performance counter stats for './example4':
101.369081 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.779 K/sec
320,197,883 cycles # 3.159 GHz
1,121,174,423 instructions # 3.50 insn per cycle
223,257,752 branches # 2202.425 M/sec
62,680 branch-misses # 0.03% of all branches
0.101595114 seconds time elapsed
```
Prior to the change, we see something like:
```
$ make CFLAGS="-g -O3 -march=native"
cc -g -O3 -march=native -c -o test.o test.c
cc -g -O3 -march=native -c -o genann.o genann.c
cc -g -O3 -march=native -c -o example1.o example1.c
cc -g -O3 -march=native -c -o example2.o example2.c
cc -g -O3 -march=native -c -o example3.o example3.c
cc -g -O3 -march=native -c -o example4.o example4.c
cc -g -O3 -march=native -c -o strings.o strings.c
cc test.o genann.o -lm -o test
cc example1.o genann.o -lm -o example1
cc example3.o genann.o -lm -o example3
cc example4.o genann.o -lm -o example4
cc strings.o genann.o -lm -o strings
cc example2.o genann.o -lm -o example2
$ for i in `seq 0 10`; do ./example4 > /dev/null; done; sudo perf stat record ./example4
GENANN example 4.
Train an ANN on the IRIS dataset using backpropagation.
Loading 150 data points from example/iris.data
Training for 5000 loops over data.
147/150 correct (98.0%).
Performance counter stats for './example4':
104.644198 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.755 K/sec
330,340,554 cycles # 3.157 GHz
1,123,669,767 instructions # 3.40 insn per cycle
215,441,809 branches # 2058.803 M/sec
62,406 branch-misses # 0.03% of all branches
0.104891323 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-17 02:59:40 +03:00
|
|
|
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
/* Because floating point... */
|
|
|
|
if (unlikely(j >= LOOKUP_SIZE)) return lookup[LOOKUP_SIZE - 1];
|
genann: Use reciprocal interval value to strength reduce divide to multiply
This gives a reduction of roughly 2.5 million instructions in the execution
trace of example4.
genann_act_sigmoid_cached() previously divided by interval to calculate the
lookup index. Divide is an expensive operation, so instead use the reciprocal of
the existing interval calculation to reduce the divide to a multiply.
Building with the following configuration:
```
$ head /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 61
model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz
stepping : 4
microcode : 0x25
cpu MHz : 2593.871
cache size : 4096 KB
physical id : 0
$ cat /etc/os-release
NAME="Ubuntu"
VERSION="17.10 (Artful Aardvark)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 17.10"
VERSION_ID="17.10"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=artful
UBUNTU_CODENAME=artful
$ cc --version
gcc (Ubuntu 7.2.0-8ubuntu3) 7.2.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
```
on my Lenovo X1 Carbon Gen 3 machine sees the following:
```
$ make CFLAGS="-g -O3 -march=native -DNDEBUG"
cc -g -O3 -march=native -DNDEBUG -c -o test.o test.c
cc -g -O3 -march=native -DNDEBUG -c -o genann.o genann.c
cc -g -O3 -march=native -DNDEBUG -c -o example1.o example1.c
cc -g -O3 -march=native -DNDEBUG -c -o example2.o example2.c
cc -g -O3 -march=native -DNDEBUG -c -o example3.o example3.c
cc -g -O3 -march=native -DNDEBUG -c -o example4.o example4.c
cc -g -O3 -march=native -DNDEBUG -c -o strings.o strings.c
cc test.o genann.o -lm -o test
cc example1.o genann.o -lm -o example1
cc example4.o genann.o -lm -o example4
cc example3.o genann.o -lm -o example3
cc example2.o genann.o -lm -o example2
cc strings.o genann.o -lm -o strings
$ for i in `seq 0 10`; do ./example4 > /dev/null; done; sudo perf stat record ./example4
GENANN example 4.
Train an ANN on the IRIS dataset using backpropagation.
Loading 150 data points from example/iris.data
Training for 5000 loops over data.
147/150 correct (98.0%).
Performance counter stats for './example4':
101.369081 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.779 K/sec
320,197,883 cycles # 3.159 GHz
1,121,174,423 instructions # 3.50 insn per cycle
223,257,752 branches # 2202.425 M/sec
62,680 branch-misses # 0.03% of all branches
0.101595114 seconds time elapsed
```
Prior to the change, we see something like:
```
$ make CFLAGS="-g -O3 -march=native"
cc -g -O3 -march=native -c -o test.o test.c
cc -g -O3 -march=native -c -o genann.o genann.c
cc -g -O3 -march=native -c -o example1.o example1.c
cc -g -O3 -march=native -c -o example2.o example2.c
cc -g -O3 -march=native -c -o example3.o example3.c
cc -g -O3 -march=native -c -o example4.o example4.c
cc -g -O3 -march=native -c -o strings.o strings.c
cc test.o genann.o -lm -o test
cc example1.o genann.o -lm -o example1
cc example3.o genann.o -lm -o example3
cc example4.o genann.o -lm -o example4
cc strings.o genann.o -lm -o strings
cc example2.o genann.o -lm -o example2
$ for i in `seq 0 10`; do ./example4 > /dev/null; done; sudo perf stat record ./example4
GENANN example 4.
Train an ANN on the IRIS dataset using backpropagation.
Loading 150 data points from example/iris.data
Training for 5000 loops over data.
147/150 correct (98.0%).
Performance counter stats for './example4':
104.644198 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.755 K/sec
330,340,554 cycles # 3.157 GHz
1,123,669,767 instructions # 3.40 insn per cycle
215,441,809 branches # 2058.803 M/sec
62,406 branch-misses # 0.03% of all branches
0.104891323 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-17 02:59:40 +03:00
|
|
|
|
|
|
|
return lookup[j];
|
2016-02-10 02:53:54 +03:00
|
|
|
}
|
|
|
|
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
/* Linear (identity) activation: the input value is handed back unchanged.
 * The network pointer is accepted for signature compatibility with the other
 * activation functions and is never read. */
double inline genann_act_linear(const struct genann *ann __unused, double a) {
    return a;
}
|
|
|
|
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
/* Step activation: yields 1.0 when the input is strictly greater than zero
 * and 0.0 otherwise. The network pointer is never read. */
double inline genann_act_threshold(const struct genann *ann __unused, double a) {
    if (a > 0) {
        return 1.0;
    }
    return 0.0;
}
|
2016-05-20 00:55:44 +03:00
|
|
|
|
2016-02-11 23:38:42 +03:00
|
|
|
genann *genann_init(int inputs, int hidden_layers, int hidden, int outputs) {
|
2016-02-10 02:53:54 +03:00
|
|
|
if (hidden_layers < 0) return 0;
|
|
|
|
if (inputs < 1) return 0;
|
|
|
|
if (outputs < 1) return 0;
|
|
|
|
if (hidden_layers > 0 && hidden < 1) return 0;
|
|
|
|
|
|
|
|
|
|
|
|
const int hidden_weights = hidden_layers ? (inputs+1) * hidden + (hidden_layers-1) * (hidden+1) * hidden : 0;
|
|
|
|
const int output_weights = (hidden_layers ? (hidden+1) : (inputs+1)) * outputs;
|
|
|
|
const int total_weights = (hidden_weights + output_weights);
|
|
|
|
|
|
|
|
const int total_neurons = (inputs + hidden * hidden_layers + outputs);
|
|
|
|
|
|
|
|
/* Allocate extra size for weights, outputs, and deltas. */
|
2016-02-11 23:38:42 +03:00
|
|
|
const int size = sizeof(genann) + sizeof(double) * (total_weights + total_neurons + (total_neurons - inputs));
|
|
|
|
genann *ret = malloc(size);
|
2016-02-10 02:53:54 +03:00
|
|
|
if (!ret) return 0;
|
|
|
|
|
|
|
|
ret->inputs = inputs;
|
|
|
|
ret->hidden_layers = hidden_layers;
|
|
|
|
ret->hidden = hidden;
|
|
|
|
ret->outputs = outputs;
|
|
|
|
|
|
|
|
ret->total_weights = total_weights;
|
|
|
|
ret->total_neurons = total_neurons;
|
|
|
|
|
|
|
|
/* Set pointers. */
|
2016-02-11 23:38:42 +03:00
|
|
|
ret->weight = (double*)((char*)ret + sizeof(genann));
|
2016-02-10 02:53:54 +03:00
|
|
|
ret->output = ret->weight + ret->total_weights;
|
|
|
|
ret->delta = ret->output + ret->total_neurons;
|
|
|
|
|
|
|
|
genann_randomize(ret);
|
|
|
|
|
|
|
|
ret->activation_hidden = genann_act_sigmoid_cached;
|
|
|
|
ret->activation_output = genann_act_sigmoid_cached;
|
|
|
|
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
genann_init_sigmoid_lookup(ret);
|
|
|
|
|
2016-02-10 02:53:54 +03:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2016-02-11 23:38:42 +03:00
|
|
|
genann *genann_read(FILE *in) {
|
2016-02-10 02:53:54 +03:00
|
|
|
int inputs, hidden_layers, hidden, outputs;
|
2017-10-22 00:53:58 +03:00
|
|
|
int rc;
|
|
|
|
|
|
|
|
errno = 0;
|
|
|
|
rc = fscanf(in, "%d %d %d %d", &inputs, &hidden_layers, &hidden, &outputs);
|
|
|
|
if (rc < 4 || errno != 0) {
|
|
|
|
perror("fscanf");
|
|
|
|
return NULL;
|
|
|
|
}
|
2016-02-10 02:53:54 +03:00
|
|
|
|
2016-02-11 23:38:42 +03:00
|
|
|
genann *ann = genann_init(inputs, hidden_layers, hidden, outputs);
|
2016-02-10 02:53:54 +03:00
|
|
|
|
|
|
|
int i;
|
|
|
|
for (i = 0; i < ann->total_weights; ++i) {
|
2017-10-22 00:53:58 +03:00
|
|
|
errno = 0;
|
|
|
|
rc = fscanf(in, " %le", ann->weight + i);
|
|
|
|
if (rc < 1 || errno != 0) {
|
|
|
|
perror("fscanf");
|
|
|
|
genann_free(ann);
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
2016-02-10 02:53:54 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return ann;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2016-02-11 23:38:42 +03:00
|
|
|
/* Returns a newly allocated duplicate of ann, or NULL if the allocation
 * fails. The whole block (struct plus weight, output and delta arrays) is
 * copied, then the copy's internal pointers are re-aimed at its own arrays. */
genann *genann_copy(genann const *ann) {
    /* Keep the byte count in size_t: an int could truncate it for large
     * networks, under-allocating and under-copying. */
    const size_t size = sizeof(genann)
        + sizeof(double) * ((size_t)ann->total_weights
                            + (size_t)ann->total_neurons
                            + (size_t)(ann->total_neurons - ann->inputs));
    genann *ret = malloc(size);
    if (!ret) return 0;

    memcpy(ret, ann, size);

    /* Set pointers: the memcpy above copied addresses that still point into
     * the source network's block. */
    ret->weight = (double*)((char*)ret + sizeof(genann));
    ret->output = ret->weight + ret->total_weights;
    ret->delta = ret->output + ret->total_neurons;

    return ret;
}
|
|
|
|
|
|
|
|
|
2016-02-11 23:38:42 +03:00
|
|
|
/* Overwrites every weight of the network with GENANN_RANDOM() shifted down
 * by 0.5. */
void genann_randomize(genann *ann) {
    const int count = ann->total_weights;
    int idx = 0;

    while (idx < count) {
        const double sample = GENANN_RANDOM();
        /* Sets weights from -0.5 to 0.5. */
        ann->weight[idx] = sample - 0.5;
        ++idx;
    }
}
|
|
|
|
|
|
|
|
|
2016-02-11 23:38:42 +03:00
|
|
|
/* Releases a network. */
void genann_free(genann *ann) {
    /* One free() is enough: the weight, output and delta pointers all refer
     * into the same buffer as the struct itself. */
    free(ann);
}
|
|
|
|
|
|
|
|
|
2016-02-11 23:38:42 +03:00
|
|
|
/* Feeds `inputs` forward through the network.
 *
 * ann->inputs values are read from `inputs`. Every neuron's activation is
 * written into ann->output, and the returned pointer refers to the first
 * output-layer value inside that array. */
double const *genann_run(genann const *ann, double const *inputs) {
    double const *weight = ann->weight;         /* Next weight to consume. */
    double *out = ann->output + ann->inputs;    /* Next activation slot to fill. */
    double const *prev = ann->output;           /* Start of the layer feeding the current one. */

    /* The input values are stored at the front of the output scratch area so
     * the first layer reads its inputs exactly like every later layer does. */
    memcpy(ann->output, inputs, sizeof(double) * ann->inputs);

    int layer, neuron, src;

    /* No hidden layers: the outputs are computed straight from the inputs. */
    if (!ann->hidden_layers) {
        double *ret = out;
        for (neuron = 0; neuron < ann->outputs; ++neuron) {
            /* The first weight of each neuron is applied to a constant -1.0. */
            double sum = *weight++ * -1.0;
            for (src = 0; src < ann->inputs; ++src) {
                sum += *weight++ * prev[src];
            }
            *out++ = genann_act_output(ann, sum);
        }

        return ret;
    }

    /* First hidden layer, fed by the inputs. */
    for (neuron = 0; neuron < ann->hidden; ++neuron) {
        double sum = *weight++ * -1.0;
        for (src = 0; src < ann->inputs; ++src) {
            sum += *weight++ * prev[src];
        }
        *out++ = genann_act_hidden(ann, sum);
    }

    prev += ann->inputs;

    /* Remaining hidden layers, each fed by the hidden layer before it. */
    for (layer = 1; layer < ann->hidden_layers; ++layer) {
        for (neuron = 0; neuron < ann->hidden; ++neuron) {
            double sum = *weight++ * -1.0;
            for (src = 0; src < ann->hidden; ++src) {
                sum += *weight++ * prev[src];
            }
            *out++ = genann_act_hidden(ann, sum);
        }

        prev += ann->hidden;
    }

    double const *ret = out;

    /* Output layer, fed by the last hidden layer. */
    for (neuron = 0; neuron < ann->outputs; ++neuron) {
        double sum = *weight++ * -1.0;
        for (src = 0; src < ann->hidden; ++src) {
            sum += *weight++ * prev[src];
        }
        *out++ = genann_act_output(ann, sum);
    }

    /* Sanity check that we used all weights and wrote all outputs. */
    assert(weight - ann->weight == ann->total_weights);
    assert(out - ann->output == ann->total_neurons);

    return ret;
}
|
|
|
|
|
|
|
|
|
2016-02-11 23:38:42 +03:00
|
|
|
void genann_train(genann const *ann, double const *inputs, double const *desired_outputs, double learning_rate) {
|
2016-02-10 02:53:54 +03:00
|
|
|
/* To begin with, we must run the network forward. */
|
|
|
|
genann_run(ann, inputs);
|
|
|
|
|
|
|
|
int h, j, k;
|
|
|
|
|
|
|
|
/* First set the output layer deltas. */
|
|
|
|
{
|
|
|
|
double const *o = ann->output + ann->inputs + ann->hidden * ann->hidden_layers; /* First output. */
|
|
|
|
double *d = ann->delta + ann->hidden * ann->hidden_layers; /* First delta. */
|
|
|
|
double const *t = desired_outputs; /* First desired output. */
|
|
|
|
|
|
|
|
|
|
|
|
/* Set output layer deltas. */
|
genann: Optionally resolve activation functions at link time
Shave around 94 million instructions and 10 million branches off of execution
trace of example4 if the sigmoid activation function is resolved at link-time.
Before (`make`):
```
Performance counter stats for './example4':
98.988806 task-clock (msec) # 0.998 CPUs utilized
1 context-switches # 0.010 K/sec
0 cpu-migrations # 0.000 K/sec
79 page-faults # 0.798 K/sec
312,298,260 cycles # 3.155 GHz
1,094,183,752 instructions # 3.50 insn per cycle
212,007,732 branches # 2141.734 M/sec
62,774 branch-misses # 0.03% of all branches
0.099228100 seconds time elapsed
```
After:
`make`:
```
Performance counter stats for './example4':
97.335180 task-clock (msec) # 0.998 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
82 page-faults # 0.842 K/sec
306,722,357 cycles # 3.151 GHz
1,065,669,644 instructions # 3.47 insn per cycle
214,256,601 branches # 2201.225 M/sec
60,154 branch-misses # 0.03% of all branches
0.097577079 seconds time elapsed
```
`make sigmoid`:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 05:38:30 +03:00
|
|
|
if (genann_act_output == genann_act_linear ||
|
|
|
|
ann->activation_output == genann_act_linear) {
|
2016-05-21 01:54:46 +03:00
|
|
|
for (j = 0; j < ann->outputs; ++j) {
|
|
|
|
*d++ = *t++ - *o++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (j = 0; j < ann->outputs; ++j) {
|
|
|
|
*d++ = (*t - *o) * *o * (1.0 - *o);
|
|
|
|
++o; ++t;
|
|
|
|
}
|
2016-02-10 02:53:54 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Set hidden layer deltas, start on last layer and work backwards. */
|
|
|
|
/* Note that loop is skipped in the case of hidden_layers == 0. */
|
|
|
|
for (h = ann->hidden_layers - 1; h >= 0; --h) {
|
|
|
|
|
|
|
|
/* Find first output and delta in this layer. */
|
|
|
|
double const *o = ann->output + ann->inputs + (h * ann->hidden);
|
|
|
|
double *d = ann->delta + (h * ann->hidden);
|
|
|
|
|
|
|
|
/* Find first delta in following layer (which may be hidden or output). */
|
|
|
|
double const * const dd = ann->delta + ((h+1) * ann->hidden);
|
|
|
|
|
|
|
|
/* Find first weight in following layer (which may be hidden or output). */
|
|
|
|
double const * const ww = ann->weight + ((ann->inputs+1) * ann->hidden) + ((ann->hidden+1) * ann->hidden * (h));
|
|
|
|
|
|
|
|
for (j = 0; j < ann->hidden; ++j) {
|
|
|
|
|
|
|
|
double delta = 0;
|
|
|
|
|
|
|
|
for (k = 0; k < (h == ann->hidden_layers-1 ? ann->outputs : ann->hidden); ++k) {
|
|
|
|
const double forward_delta = dd[k];
|
|
|
|
const int windex = k * (ann->hidden + 1) + (j + 1);
|
|
|
|
const double forward_weight = ww[windex];
|
|
|
|
delta += forward_delta * forward_weight;
|
|
|
|
}
|
|
|
|
|
|
|
|
*d = *o * (1.0-*o) * delta;
|
|
|
|
++d; ++o;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Train the outputs. */
|
|
|
|
{
|
|
|
|
/* Find first output delta. */
|
|
|
|
double const *d = ann->delta + ann->hidden * ann->hidden_layers; /* First output delta. */
|
|
|
|
|
|
|
|
/* Find first weight to first output delta. */
|
|
|
|
double *w = ann->weight + (ann->hidden_layers
|
|
|
|
? ((ann->inputs+1) * ann->hidden + (ann->hidden+1) * ann->hidden * (ann->hidden_layers-1))
|
|
|
|
: (0));
|
|
|
|
|
|
|
|
/* Find first output in previous layer. */
|
|
|
|
double const * const i = ann->output + (ann->hidden_layers
|
|
|
|
? (ann->inputs + (ann->hidden) * (ann->hidden_layers-1))
|
|
|
|
: 0);
|
|
|
|
|
2016-05-21 01:54:46 +03:00
|
|
|
/* Set output layer weights. */
|
2016-02-10 02:53:54 +03:00
|
|
|
for (j = 0; j < ann->outputs; ++j) {
|
genann: Remove branching from back-propagation inner-loop
This saves approximately 80 million instructions and 44 million branches in the
trace of example4, shaving off around 8ms:
Before:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
After:
```
Performance counter stats for './example4':
84.473035 task-clock (msec) # 0.997 CPUs utilized
3 context-switches # 0.036 K/sec
0 cpu-migrations # 0.000 K/sec
81 page-faults # 0.959 K/sec
265,472,170 cycles # 3.143 GHz
919,372,488 instructions # 3.46 insn per cycle
158,754,885 branches # 1879.356 M/sec
65,337 branch-misses # 0.04% of all branches
0.084755458 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 10:09:25 +03:00
|
|
|
*w++ += *d * learning_rate * -1.0;
|
|
|
|
for (k = 1; k < (ann->hidden_layers ? ann->hidden : ann->inputs) + 1; ++k) {
|
|
|
|
*w++ += *d * learning_rate * i[k-1];
|
2016-02-10 02:53:54 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
++d;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(w - ann->weight == ann->total_weights);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Train the hidden layers. */
|
|
|
|
for (h = ann->hidden_layers - 1; h >= 0; --h) {
|
|
|
|
|
|
|
|
/* Find first delta in this layer. */
|
|
|
|
double const *d = ann->delta + (h * ann->hidden);
|
|
|
|
|
|
|
|
/* Find first input to this layer. */
|
|
|
|
double const *i = ann->output + (h
|
|
|
|
? (ann->inputs + ann->hidden * (h-1))
|
|
|
|
: 0);
|
|
|
|
|
|
|
|
/* Find first weight to this layer. */
|
|
|
|
double *w = ann->weight + (h
|
|
|
|
? ((ann->inputs+1) * ann->hidden + (ann->hidden+1) * (ann->hidden) * (h-1))
|
|
|
|
: 0);
|
|
|
|
|
|
|
|
|
|
|
|
for (j = 0; j < ann->hidden; ++j) {
|
genann: Remove branching from back-propagation inner-loop
This saves approximately 80 million instructions and 44 million branches in the
trace of example4, shaving off around 8ms:
Before:
```
Performance counter stats for './example4':
92.629610 task-clock (msec) # 0.997 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
78 page-faults # 0.842 K/sec
291,863,801 cycles # 3.151 GHz
1,000,931,204 instructions # 3.43 insn per cycle
202,465,800 branches # 2185.757 M/sec
50,949 branch-misses # 0.03% of all branches
0.092889789 seconds time elapsed
```
After:
```
Performance counter stats for './example4':
84.473035 task-clock (msec) # 0.997 CPUs utilized
3 context-switches # 0.036 K/sec
0 cpu-migrations # 0.000 K/sec
81 page-faults # 0.959 K/sec
265,472,170 cycles # 3.143 GHz
919,372,488 instructions # 3.46 insn per cycle
158,754,885 branches # 1879.356 M/sec
65,337 branch-misses # 0.04% of all branches
0.084755458 seconds time elapsed
```
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
2017-12-18 10:09:25 +03:00
|
|
|
*w++ += *d * learning_rate * -1.0;
|
|
|
|
for (k = 1; k < (h == 0 ? ann->inputs : ann->hidden) + 1; ++k) {
|
|
|
|
*w++ += *d * learning_rate * i[k-1];
|
2016-02-10 02:53:54 +03:00
|
|
|
}
|
|
|
|
++d;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2016-02-11 23:38:42 +03:00
|
|
|
/*
 * Write a textual dump of the network `ann` to the stream `out`.
 *
 * The output is a single run of space-separated fields:
 *   - four integers: inputs, hidden_layers, hidden, outputs;
 *   - then each of the `total_weights` entries of `ann->weight`, in
 *     storage order, formatted with "%.20e".
 *
 * No trailing newline is emitted, and the results of the fprintf calls
 * are not inspected.
 */
void genann_write(genann const *ann, FILE *out) {
    double const *w = ann->weight;
    int remaining = ann->total_weights;

    /* Header: the four topology fields. */
    fprintf(out, "%d %d %d %d", ann->inputs, ann->hidden_layers, ann->hidden, ann->outputs);

    /* Body: one field per weight, each preceded by a single space. */
    while (remaining > 0) {
        fprintf(out, " %.20e", *w);
        ++w;
        --remaining;
    }
}
|
|
|
|
|
|
|
|
|