Skip to content

Commit

Permalink
for v2.1.0 release
Browse files Browse the repository at this point in the history
  • Loading branch information
s-kurihara123 committed Dec 15, 2021
1 parent 0a53eec commit 3048642
Show file tree
Hide file tree
Showing 282 changed files with 16,485 additions and 3,788 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
*_log\.*
*.npy
*.npz
*.png
*.pickle
*.egg-info
/build/
/dist/
Expand All @@ -13,3 +15,5 @@
generated
MANIFEST.in
__pycache__
htmlcov
.coverage
1 change: 1 addition & 0 deletions MANIFEST.in.sdist
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ prune nlcpy.egg-info
include Makefile
include make.inc
include MANIFEST.in.sdist
include MANIFEST.in.wheel
recursive-include nlcpy *
recursive-include scripts *
3 changes: 2 additions & 1 deletion MANIFEST.in.wheel
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
global-exclude *.o *.c *.h *.cpp *.pyx *.pxd *.pxi *.m4 *.master *.master2
global-exclude *.o *.c *.h *.cpp *.pyx *.pxi *.m4 *.master *.master2
include nlcpy/include/*.h
include nlcpy/veo/*.pxd
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include make.inc
.PHONY: all nlcpy_ve_common nlcpy_ve_no_fast_math nlcpy_ve_fast_math FORCE

JOBS:=$(shell grep -c ^processor /proc/cpuinfo 2>/dev/null)
JOBS:=$(shell if [ $(JOBS) -le 16 ]; then echo "8"; else echo "16"; fi)
JOBS:=$(shell if [ $(JOBS) -le 16 ]; then echo "8"; elif [ $(JOBS) -le 32 ]; then echo "16"; else echo "32"; fi)

all: make.dep nlcpy_ve_common nlcpy_ve_no_fast_math nlcpy_ve_fast_math
cp $(SRCDIR)/*.h $(BASEDIR)/nlcpy/include/
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
Before the installation, the following components are required to be installed on your x86 Node of SX-Aurora TSUBASA.

- [NEC SDK](https://www.hpc.nec/documents/guide/pdfs/InstallationGuide_E.pdf)
- required NEC C/C++ compiler version: >= 3.2.1
- required NEC C/C++ compiler version: >= 3.3.1
- required NLC version: >= 2.3.0

- [Alternative VE Offloading (AVEO)](https://www.hpc.nec/documents/veos/en/aveo/index.html)
Expand All @@ -35,7 +35,7 @@ Before the installation, the following components are required to be installed o
- required version: 3.6, 3.7, or 3.8

- [NumPy](https://www.numpy.org/)
- required version: v1.17, v1.18, v1.19, or v1.20
- required version: >= v1.17

## Install from wheel

Expand All @@ -59,7 +59,7 @@ You can install NLCPy by executing either of following commands.
$ pip install <path_to_wheel>
```
The shared objects for Vector Engine, which are included in the wheel package, are compiled and tested by using NEC C/C++ Version 3.2.1 and NumPy v1.19.2.
The shared objects for Vector Engine, which are included in the wheel package, are compiled and tested by using NEC C/C++ Version 3.3.1 and NumPy v1.19.2.
## Install from source (with building)
Expand Down
218 changes: 198 additions & 20 deletions bench/Haversine/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,86 @@
import json
import pandas as pd
import os
import string
import sys

# lat1, lon1, lat2, lon2

_test_source1_c = r'''
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
void base_clock_(int *base) {
char buf[256];
if (ve_get_ve_info("clock_base", buf, sizeof(buf)) < 0)
{
*base = -1;
return;
}
*base = atoi(buf);
return;
}
void stm_(int64_t *val) {
void *vehva = (void *)0x1000;
__asm__ __volatile__ ("lhm.l %0,0(%1)":"=r"(*val):"r"(vehva));
return;
}
float calc_harv(${dtype} lat1, ${dtype} lon1, ${dtype} *lat2, ${dtype} *lon2,
int64_t num, ${dtype} *ans) {
int ibase;
int64_t ts, te;
base_clock_(&ibase);
stm_(&ts);
${dtype} miles_constant = 3959.0;
${dtype} dlat, dlon;
${dtype} a0, a1, a, c;
lat1 = lat1 * M_PI / 180.0;
lon1 = lon1 * M_PI / 180.0;
#ifdef FLOAT32
#pragma omp parallel for
for (int64_t i=0; i<num; i++){
lat2[i] = lat2[i] * M_PI / 180.0;
lon2[i] = lon2[i] * M_PI / 180.0;
dlat = lat2[i] - lat1;
dlon = lon2[i] - lon1;
a0 = sinf(dlat/2.0);
a1 = sinf(dlon/2.0);
a = a0*a0 + cosf(lat1) * cosf(lat2[i]) * a1*a1;
c = 2.0 * asinf(sqrtf(a));
ans[i] = miles_constant * c;
}
#else
#pragma omp parallel for
for (int64_t i=0; i<num; i++){
lat2[i] = lat2[i] * M_PI / 180.0;
lon2[i] = lon2[i] * M_PI / 180.0;
dlat = lat2[i] - lat1;
dlon = lon2[i] - lon1;
a0 = sin(dlat/2.0);
a1 = sin(dlon/2.0);
a = a0*a0 + cos(lat1) * cos(lat2[i]) * a1*a1;
c = 2.0 * asin(sqrt(a));
ans[i] = miles_constant * c;
}
#endif
stm_(&te);
return (float)(te - ts) / ibase * 1e-6;
}
'''

# Read in the data
df = pd.read_csv('data', encoding='cp1252')
Expand All @@ -80,22 +160,90 @@
SIZE_NAME = 'result/size.dat'
T_NP_NAME = 'result/time_numpy.dat'
T_VP_NAME = 'result/time_nlcpy.dat'
T_J_NAME = 'result/time_jit.dat'
T_JE_NAME = 'result/time_jit_e.dat'

# calculation dtype
# calculation dtype np.float32 or np.float64
DT = np.float64

# vp.request.set_offload_timing_onthefly()

gentime = {np:0, vp:0, "nlcpy-jit":0}
intime = {"pre":0, "exec(VE+VH)":0, "exec(VE)":0}
runtime = {np:0, vp:0, "nlcpy-jit":0}
exetime = {"nlcpy-jit":0}
result = {np:None, vp:None, "nlcpy-jit":None}

def _pytype2ctype(dtype):
if dtype == np.float64:
return "double"
elif dtype == np.float32:
return "float"
else:
raise TypeError


def use_nlcpy_jit_haversine(lat1, lon1, lat2, lon2, size):
    """Compute Haversine distances with the JIT-compiled VE kernel.

    Compiles ``_test_source1_c`` for the module-level dtype ``DT`` through
    ``vp.jit.CustomVELibrary``, runs its ``calc_harv`` function and records
    timings in the module-level ``intime`` dict:

    * ``intime["pre"]``         -- building the library and looking up the
      kernel function.
    * ``intime["exec(VE+VH)"]`` -- wall-clock time of the kernel call.
    * ``intime["exec(VE)"]``    -- the value returned by the kernel itself.

    Args:
        lat1, lon1: scalar coordinates, passed to the kernel by value.
        lat2, lon2: arrays exposing ``ve_adr`` (presumably nlcpy ndarrays of
            dtype ``DT`` -- confirm against the caller).  The kernel converts
            them to radians in place.
        size: number of elements, passed to the kernel as ``num``.

    Returns:
        The array of ``size`` elements created with ``vp.zeros`` that the
        kernel fills with the distances.

    Raises:
        TypeError: from ``_pytype2ctype`` when ``DT`` is not a supported
            dtype.
        ValueError: when ``DT`` is neither ``np.float32`` nor ``np.float64``.
    """
    from nlcpy.ve_types import float32, float64, int64, uint64

    vp.request.flush()
    start = time.time()

    source = string.Template(_test_source1_c).substitute(
        dtype=_pytype2ctype(DT)
    )
    # FLOAT32 switches the kernel to the single-precision math functions.
    extra_cflags = ('-DFLOAT32',) if DT is np.float32 else ()
    mod = vp.jit.CustomVELibrary(
        code=source,
        compiler='/opt/nec/ve/bin/ncc',
        cflags=vp.jit.get_default_cflags(openmp=True) + extra_cflags,
        ldflags=vp.jit.get_default_ldflags(openmp=True),
        # ftrace=True,
    )

    if DT is np.float32:
        scalar_type = float32
    elif DT is np.float64:
        scalar_type = float64
    else:
        raise ValueError(
            "unsupported DT {!r}: expected numpy.float32 or numpy.float64"
            .format(DT)
        )
    # Array arguments are passed as raw VE addresses, hence uint64.
    args_type = (scalar_type, scalar_type, uint64, uint64, int64, uint64)

    kern = mod.get_function(
        'calc_harv',
        args_type=args_type,
        ret_type=float32
    )

    end = time.time()
    intime["pre"] = end - start

    mi = vp.zeros(size, dtype=DT)
    # Flush pending requests so they are not counted in the kernel timing.
    vp.request.flush()
    start = time.time()
    ve_elapsed = kern(lat1, lon1, lat2.ve_adr, lon2.ve_adr, size, mi.ve_adr,
                      callback=None, sync=True)
    end = time.time()

    intime["exec(VE+VH)"] = end - start
    intime["exec(VE)"] = ve_elapsed

    return mi


# Haversine definition
def haversine(lat1, lon1, lat2, lon2, xp):
miles_constant = 3959.0

lat1 = lat1 * pi / 180.0
lon1 = lon1 * pi / 180.0
lat2 = lat2 * pi / 180.0
lon2 = lon2 * pi / 180.0

dlat = lat2 - lat1
dlon = lon2 - lon1

a0 = xp.sin(dlat/2.0)
a1 = xp.sin(dlon/2.0)
a = a0*a0 + xp.cos(lat1) * xp.cos(lat2) * a1*a1
Expand All @@ -104,6 +252,7 @@ def haversine(lat1, lon1, lat2, lon2, xp):
return mi



def write_to_file(arr, name):
with open(name, 'a') as f:
arr.tofile(f)
Expand All @@ -113,8 +262,13 @@ def gen_data(lat, lon, scale=10, xp=np):
'''
Generates the array replicated X times.
'''

if xp == "nlcpy-jit":
xp = vp

if xp is vp:
vp.request.flush()

start = time.time()
new_lat = xp.arange(scale*len(lat),dtype=DT).reshape(scale, len(lat))
new_lon = xp.arange(scale*len(lon),dtype=DT).reshape(scale, len(lon))
Expand All @@ -130,7 +284,6 @@ def gen_data(lat, lon, scale=10, xp=np):


def compare(R1, R2):
R2 = R2.get() # convert to numpy.ndarray
assert R1.dtype == R2.dtype, 'dtypes must match!'
np.testing.assert_allclose(R1, R2, rtol=1e-4)

Expand All @@ -140,56 +293,78 @@ def print_args(args):
print('params: ', str(d))


def run_haversine(scale=10, use_numpy=True, use_nlcpy=False):
def run_haversine(scale=10, use_numpy=True, use_nlcpy=False, use_nlcpy_jit=False):
orig_lat = df['latitude'].values
orig_lon = df['longitude'].values
size = orig_lat.size * scale

print(" size = ", size)
print(' dtype: ', DT.__name__)
print(' size: ', size)

use_liblist = []
if use_numpy:
use_liblist.append(np)
if use_nlcpy_jit:
use_liblist.append("nlcpy-jit")
if use_nlcpy:
use_liblist.append(vp)
if use_numpy:
use_liblist.append(np)

runtime = {np:0, vp:0}
result = {np:None, vp:None}
for xp in use_liblist:
if xp is np:
print(" numpy...", end="", flush=True)
print(" numpy.......", end="", flush=True)
elif xp is vp:
print(" nlcpy.......", end="", flush=True)
else:
print(" nlcpy...", end="", flush=True)
print(" nlcpy-jit...", end="", flush=True)

lat, lon, gen_time = gen_data(orig_lat, orig_lon, scale=scale, xp=xp)
gentime[xp] = gen_time

if xp is vp:
vp.request.flush()
#vp.start_profiling()
start = time.time()

dist = haversine(40.671, -73.985, lat, lon, xp)
if xp == "nlcpy-jit" :
dist = use_nlcpy_jit_haversine(40.671, -73.985, lat, lon, size)
else:
dist = haversine(40.671, -73.985, lat, lon, xp)

if xp is vp:
vp.request.flush()
#vp.stop_profiling()
#vp.print_run_stats()
end = time.time()
print("done", flush=True)
print("done")
print(" generating data:", gen_time)
runtime[xp] = end - start
result[xp] = dist
result[xp] = np.asarray(dist)
print(" caluculation :", runtime[xp])

if use_numpy and use_nlcpy:
if xp == "nlcpy-jit":
print(" -> pre : {:.17f}".format(intime["pre"]))
print(" -> exec(VE+VH): {:.17f}".format(intime["exec(VE+VH)"]))
print(" -> exec(VE) : {:.17f}".format(intime["exec(VE)"]))
print(" -> other : {:.17f}".format(runtime["nlcpy-jit"] -
intime["pre"] -
intime["exec(VE+VH)"]))

if use_numpy & use_nlcpy & use_nlcpy_jit:
compare(result[np], result[vp])
compare(result[np], result['nlcpy-jit'])
elif use_numpy & use_nlcpy:
compare(result[np], result[vp])
elif use_numpy & use_nlcpy_jit:
compare(result[np], result['nlcpy-jit'])
elif use_nlcpy & use_nlcpy_jit:
compare(result[vp], result['nlcpy-jit'])

# write result to file
write_to_file(np.array(size, dtype='f8'), SIZE_NAME)
if use_numpy:
write_to_file(np.array(runtime[np], dtype='f8'), T_NP_NAME)
if use_nlcpy:
write_to_file(np.array(runtime[vp], dtype='f8'), T_VP_NAME)

if use_nlcpy_jit:
write_to_file(np.array(runtime["nlcpy-jit"], dtype='f8'), T_J_NAME)
write_to_file(np.array(intime["exec(VE+VH)"], dtype='f8'), T_JE_NAME)

if __name__ == "__main__":
parser = argparse.ArgumentParser()
Expand All @@ -199,9 +374,12 @@ def run_haversine(scale=10, use_numpy=True, use_nlcpy=False):
help="use numpy or not in this run")
parser.add_argument('--nlcpy', action='store_true',
help="use nlcpy or not in this run")
parser.add_argument('--nlcpy-jit', action='store_true',
help="use jit compiler")

args = parser.parse_args()

arg = sys.argv
print("arg:{}".format(arg))
print_args(args)

run_haversine(args.scale, args.numpy, args.nlcpy)
run_haversine(args.scale, args.numpy, args.nlcpy, args.nlcpy_jit)
Loading

0 comments on commit 3048642

Please sign in to comment.