forked from carlosedp/riscv-bringup
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhoult-primes.c
115 lines (111 loc) · 6.07 KB
/
hoult-primes.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
// Copyright 2016-2020 Bruce hoult [email protected]
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
// of the Software, and to permit persons to whom the Software is furnished to do
// so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// Program to count primes. I wanted something that could run in 16 KB but took enough
// time to measure on a modern x86 and is not susceptible to optimizer tricks.
// Code size is for just the countPrimes() function with gcc -O.
//
// Original x86&ARM data 2016, received user contributions 2019&2020 from eevblog members.
//
// SZ = 1000 -> 3713160 primes, all primes up to 7919^2 = 62710561
// 2.735 sec i7 8650U @ 4.2 GHz 242 bytes 11.5 billion clocks
// 2.795 sec Mac Mini M1 @ 3.2 GHz 212 bytes 8.9 billion clocks
// 2.810 sec Mac Mini M1 arm64 Ubuntu in VM 280 bytes 9.0 billion clocks
// 2.872 sec i7 6700K @ 4.2 GHz 240 bytes 12.1 billion clocks
// 2.925 sec Mac Mini M1 @ 3.2 GHz Rosetta 208 bytes 9.4 billion clocks
// 3.448 sec Ryzen 5 4500U @ 4.0 GHz WSL2 242 bytes 13.8 billion clocks
// 3.505 sec Xeon Plat 8151 @ 4.0 GHz (AWS z1d) 244 bytes 14.0 billion clocks
// 3.515 sec Threadripper 2990WX @ 4.2 GHz 242 bytes 14.8 billion clocks
// 3.836 sec i7 4700MQ @ 3.4 GHz 258 bytes 13.0 billion clocks
// 3.972 sec i7 8650U @ 4.2 GHz webasm 277 bytes 16.7 billion clocks
// 4.868 sec i7 3770 @ 3.9 GHz 240 bytes 19.0 billion clocks
// 6.377 sec AWS C6g graviton2 A64 @ 2.5 GHz 276 bytes 15.9 billion clocks
// 6.757 sec M1 Mini, qemu-riscv64 in UbuntuVM 216 bytes 23.0 billion clocks
// 8.538 sec NXP LX2160A A72 @ 2 GHz 260 bytes 17.1 billion clocks
// 9.692 sec RISC-V Fedora in qemu in VM on M1 208 bytes 31.0 billion clocks
// 9.740 sec i7 6700K qemu-riscv32 178 bytes 40.9 billion clocks
// 10.046 sec i7 8650U @ 4.2 GHz qemu-riscv32 190 bytes 42.2 billion clocks
// 11.190 sec Pi4 Cortex A72 @ 1.5 GHz T32 232 bytes 16.8 billion clocks
// 11.445 sec Odroid XU4 A15 @ 2 GHz 204 bytes 22.9 billion clocks
// 12.115 sec Pi4 Cortex A72 @ 1.5 GHz A64 300 bytes 18.2 billion clocks
// 12.605 sec Pi4 Cortex A72 @ 1.5 GHz A32 300 bytes 18.9 billion clocks
// 13.721 sec RISC-V Fedora in qemu on 2990wx 208 bytes 57.6 billion clocks
// 17.394 sec RISCV U74 @1500 MHz (est vs FPGA) 228 bytes 26.1 billion clocks
// 19.500 sec Odroid C2 A53 @ 1.536 GHz A64 276 bytes 30.0 billion clocks
// 23.940 sec Odroid C2 A53 @ 1.536 GHz T32 204 bytes 36.8 billion clocks
// 24.636 sec i7 6700K qemu-arm 204 bytes 103.5 billion clocks
// 25.060 sec i7 6700K qemu-aarch64 276 bytes 105.3 billion clocks
// 27.196 sec Teensy 4.0 Cortex M7 @ 960 MHz 228 bytes 26.1 billion clocks
// 27.480 sec HiFive Unl RISCV U54 @ 1.45 GHz 228 bytes 39.8 billion clocks
// 30.420 sec Pi3 Cortex A53 @ 1.2 GHz T32 204 bytes 36.5 billion clocks
// 39.840 sec HiFive Unl RISCV U54 @ 1.0 GHz 228 bytes 39.8 billion clocks
// 43.516 sec Teensy 4.0 Cortex M7 @ 600 MHz 228 bytes 26.1 billion clocks
// 47.910 sec Pi2 Cortex A7 @ 900 MHz T32 204 bytes 42.1 billion clocks
// 48.206 sec Zynq-7010 Cortex A9 @ 650MHz 248 bytes 31.3 billion clocks
// 112.163 sec HiFive1 RISCV E31 @ 320 MHz 178 bytes 35.9 billion clocks
// 260.907 sec RISCV U74 @ 100 MHz FPGA 228 bytes 26.1 billion clocks
// 261.068 sec esp32/Arduino @ 240 MHz ??? bytes 62.7 billion clocks
// 294.749 sec chipKIT Pro MZ pic32 @ 200 MHz ??? bytes 58.9 billion clocks
// 306.988 sec esp8266 @ 160 MHz ??? bytes 49.1 billion clocks
// 309.251 sec BlackPill Cortex M4F @ 168 MHz 228 bytes 52.0 billion clocks
// 927.547 sec BluePill Cortex M3 @ 72 MHz 228 bytes 66.8 billion clocks
// 13449.513 sec AVR ATmega2560 @ 20 MHz 318 bytes 269.0 billion clocks
#include <stdio.h>
#include <time.h>
#include <stdint.h>
#define SZ 1000
int32_t primes[SZ], sieve[SZ];
int nSieve = 0;
int32_t countPrimes(){
primes[0] = 2; sieve[0] = 4; ++nSieve;
int32_t nPrimes = 1, trial = 3, sqr=2;
while (1){
while (sqr*sqr <= trial) ++sqr;
--sqr;
for (int i=0; i<nSieve; ++i){
if (primes[i] > sqr) goto found_prime;
while (sieve[i] < trial) sieve[i] += primes[i];
if (sieve[i] == trial) goto try_next;
}
break;
found_prime:
if (nSieve < SZ){
primes[nSieve] = trial;
sieve[nSieve] = trial*trial;
++nSieve;
// printf("Saved %d: %d\n", nSieve, trial);
}
++nPrimes;
try_next:
trial+=1;
}
return nPrimes;
}
int main(){
printf("Starting run\n");
clock_t start = clock();
int res = countPrimes();
int ms = (clock() - start) / (CLOCKS_PER_SEC / 1000.0) + 0.5;
// Size calculation does not work if opt >1 or if compiler or linker
// otherwise reorders functions in the binary.
int codeSz = (char*)main - (char*)countPrimes;
printf("%d primes found in %d ms\n", res, ms);
printf("%d bytes of code in countPrimes()\n", codeSz);
return 0;
}