diff --git a/README.md b/README.md index f7f7f4c..8632e31 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ Before the installation, the following components are required to be installed on your x86 Node of SX-Aurora TSUBASA. - [NEC SDK](https://www.hpc.nec/documents/guide/pdfs/InstallationGuide_E.pdf) - - required NEC C/C++ compiler version: >= 3.1.1 - - required NLC version: >= 2.2.0 + - required NEC C/C++ compiler version: >= 3.2.1 + - required NLC version: >= 2.3.0 - [Alternative VE Offloading (AVEO)](https://www.hpc.nec/documents/veos/en/aveo/index.html) diff --git a/doc/source/installation.rst b/doc/source/installation.rst index 8b28068..632a77b 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -6,29 +6,29 @@ Installation Guide This page describes installation of NLCPy. .. attention:: - - Since April 2021, NLCPy has been provided as a software of NEC SDK (NEC Software - Development Kit for Vector Engine). If NEC SDK on your machine has been properly - installed or updated after that, NLCPy is available by using ``/usr/bin/python3`` - command, and the installation described in this page is not needed. - However, when you use another Python command such as ``/usr/local/bin/python3`` - and `python3` in a virtual environment, please install NLCPy from a wheel + - Since April 2021, NLCPy has been provided as a software of NEC SDK (NEC Software + Development Kit for Vector Engine). If NEC SDK on your machine has been properly + installed or updated after that, NLCPy is available by using ``/usr/bin/python3`` + command, and the installation described in this page is not needed. + However, when you use another Python command such as ``/usr/local/bin/python3`` + and `python3` in a virtual environment, please install NLCPy from a wheel package or source files described in this page. .. 
seealso:: `SX-Aurora TSUBASA Installation Guide `_ - - The libraries of NLCPy are located in the following directory after NEC SDK + - The libraries of NLCPy are located in the following directory after NEC SDK is installed or updated:: /opt/nec/ve/nlcpy/X.X.X/lib/python36/nlcpy Here, X.X.X denotes the version of NLCPy. - If you install or update NEC SDK, the directory of the latest version of + If you install or update NEC SDK, the directory of the latest version of NLCPy is added in Python module search path. When you use a specific version of NLCPy, the evironmental variable **PYTHONPATH** must be set as follows:: - + $ export PYTHONPATH=/opt/nec/ve/nlcpy/X.X.X/lib/python36/ Requirements @@ -39,8 +39,8 @@ installed on your x86 Node of SX-Aurora TSUBASA. * | `NEC SDK `_ - - required NEC C/C++ compiler version: >= 3.1.1 - - required NLC version: >= 2.2.0 + - required NEC C/C++ compiler version: >= 3.2.1 + - required NLC version: >= 2.3.0 * | `Alternative VE Offloading (AVEO) `_ @@ -93,7 +93,7 @@ You can install NLCPy by executing either of following commands. $ pip install -The shared objects for Vector Engine, which are included in the wheel package, are compiled and tested by using NEC C/C++ Version 3.1.1 and NumPy v1.17.4. +The shared objects for Vector Engine, which are included in the wheel package, are compiled and tested by using NEC C/C++ Version 3.2.1 and NumPy v1.17.4. 
Install from source (with building) diff --git a/doc/source/locale/ja/LC_MESSAGES/installation.mo b/doc/source/locale/ja/LC_MESSAGES/installation.mo index feed2da..a1d2f05 100644 Binary files a/doc/source/locale/ja/LC_MESSAGES/installation.mo and b/doc/source/locale/ja/LC_MESSAGES/installation.mo differ diff --git a/doc/source/locale/ja/LC_MESSAGES/installation.po b/doc/source/locale/ja/LC_MESSAGES/installation.po index df4bfc1..14961a9 100644 --- a/doc/source/locale/ja/LC_MESSAGES/installation.po +++ b/doc/source/locale/ja/LC_MESSAGES/installation.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: nlcpy 1.0.0b1\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2021-02-12 11:33+0900\n" +"POT-Creation-Date: 2021-05-19 15:45+0900\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -84,12 +84,12 @@ msgid "" msgstr "" #: ../../source/installation.rst:42 -msgid "required NEC C/C++ compiler version: >= 3.1.1" -msgstr "必要なNEC C/C++ コンパイラバージョン: >= 3.1.1" +msgid "required NEC C/C++ compiler version: >= 3.2.1" +msgstr "必要なNEC C/C++ コンパイラバージョン: >= 3.2.1" #: ../../source/installation.rst:43 -msgid "required NLC version: >= 2.2.0" -msgstr "必要なNLCバージョン: >= 2.2.0" +msgid "required NLC version: >= 2.3.0" +msgstr "必要なNLCバージョン: >= 2.3.0" #: ../../source/installation.rst msgid "" @@ -99,11 +99,11 @@ msgstr "" "`Alternative VE Offloading (AVEO) " "`_" -#: ../../source/installation.rst +#: ../../source/installation.rst:49 msgid "" -"The following installation of AVEO is not necessary if VEOS 2.5.1 or later is installed on your machine." -msgstr "" -"VEOS 2.5.1以降がマシンにインストールされていれば、以下のAVEOパッケージのインストールは不要です。" +"The following installation of AVEO is not necessary if VEOS 2.5.1 or " +"later is installed on your machine." 
+msgstr "VEOS 2.5.1以降がマシンにインストールされていれば、以下のAVEOパッケージのインストールは不要です。" #: ../../source/installation.rst msgid "" @@ -169,17 +169,17 @@ msgstr "`GitHub `_ からwheelパッケー msgid "Put the wheel package to your any directory." msgstr "wheelパッケージを任意のディレクトリに配置" -#: ../../source/installation.rst:89 +#: ../../source/installation.rst:90 msgid "Install the local wheel package via pip command." msgstr "pipコマンドを使用してローカルのwheelパッケージをインストール" #: ../../source/installation.rst:96 msgid "" "The shared objects for Vector Engine, which are included in the wheel " -"package, are compiled and tested by using NEC C/C++ Version 3.1.1 and " +"package, are compiled and tested by using NEC C/C++ Version 3.2.1 and " "NumPy v1.17.4." msgstr "" -"wheelパッケージに含まれているVector Engine用の共有オブジェクトは、NEC C/C++ バージョン3.1.1とNumPy " +"wheelパッケージに含まれているVector Engine用の共有オブジェクトは、NEC C/C++ バージョン3.2.1とNumPy " "v1.17.4を使用してコンパイルおよびテストしています。" #: ../../source/installation.rst:100 diff --git a/doc/source/locale/ja/LC_MESSAGES/reference/constants.po b/doc/source/locale/ja/LC_MESSAGES/reference/constants.po index 72d9f50..7469784 100644 --- a/doc/source/locale/ja/LC_MESSAGES/reference/constants.po +++ b/doc/source/locale/ja/LC_MESSAGES/reference/constants.po @@ -3,7 +3,6 @@ # This file is distributed under the same license as the nlcpy package. # FIRST AUTHOR , 2021. 
# -#, fuzzy msgid "" msgstr "" "Project-Id-Version: nlcpy 1.0.0\n" diff --git a/doc/source/locale/ja/LC_MESSAGES/reference/creation.mo b/doc/source/locale/ja/LC_MESSAGES/reference/creation.mo index ba7c15d..08fe9e0 100644 Binary files a/doc/source/locale/ja/LC_MESSAGES/reference/creation.mo and b/doc/source/locale/ja/LC_MESSAGES/reference/creation.mo differ diff --git a/doc/source/locale/ja/LC_MESSAGES/reference/creation.po b/doc/source/locale/ja/LC_MESSAGES/reference/creation.po index c3ee844..b165267 100644 --- a/doc/source/locale/ja/LC_MESSAGES/reference/creation.po +++ b/doc/source/locale/ja/LC_MESSAGES/reference/creation.po @@ -3,12 +3,11 @@ # This file is distributed under the same license as the nlcpy package. # FIRST AUTHOR , 2020. # -#, fuzzy msgid "" msgstr "" "Project-Id-Version: nlcpy 1.0.0b1\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2021-02-05 14:32+0900\n" +"POT-Creation-Date: 2021-05-19 15:45+0900\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -112,7 +111,7 @@ msgid "Returns a full array with the same shape and type as a given array." 
msgstr "" #: ../../source/reference/creation.rst:26 -msgid "From Exixting Data" +msgid "From Existing Data" msgstr "" #: ../../source/reference/creation.rst:39::1 @@ -214,3 +213,6 @@ msgstr "" #~ msgid "Returns a new array of given shape and type, filled with *fill_value.*" #~ msgstr "" +#~ msgid "From Exixting Data" +#~ msgstr "" + diff --git a/doc/source/locale/ja/LC_MESSAGES/reference/mathematical.mo b/doc/source/locale/ja/LC_MESSAGES/reference/mathematical.mo index ba7c15d..08fe9e0 100644 Binary files a/doc/source/locale/ja/LC_MESSAGES/reference/mathematical.mo and b/doc/source/locale/ja/LC_MESSAGES/reference/mathematical.mo differ diff --git a/doc/source/locale/ja/LC_MESSAGES/reference/mathematical.po b/doc/source/locale/ja/LC_MESSAGES/reference/mathematical.po index 9b0c517..457f2ca 100644 --- a/doc/source/locale/ja/LC_MESSAGES/reference/mathematical.po +++ b/doc/source/locale/ja/LC_MESSAGES/reference/mathematical.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: nlcpy 1.0.0b1\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2021-02-05 14:32+0900\n" +"POT-Creation-Date: 2021-05-19 15:45+0900\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -208,7 +208,7 @@ msgid ":obj:`nlcpy.prod `" msgstr "" #: ../../source/reference/mathematical.rst:59::1 -msgid "Return the product of array elements over a given axis." +msgid "Returns the product of array elements over a given axis." msgstr "" #: ../../source/reference/mathematical.rst:59::1 @@ -555,3 +555,6 @@ msgstr "" msgid ":obj:`nlcpy.fmin `" msgstr "" +#~ msgid "Return the product of array elements over a given axis." 
+#~ msgstr "" + diff --git a/doc/source/locale/ja/LC_MESSAGES/reference/random.mo b/doc/source/locale/ja/LC_MESSAGES/reference/random.mo index 581fe0c..08fe9e0 100644 Binary files a/doc/source/locale/ja/LC_MESSAGES/reference/random.mo and b/doc/source/locale/ja/LC_MESSAGES/reference/random.mo differ diff --git a/doc/source/locale/ja/LC_MESSAGES/reference/random.po b/doc/source/locale/ja/LC_MESSAGES/reference/random.po index 4782979..057fd8d 100644 --- a/doc/source/locale/ja/LC_MESSAGES/reference/random.po +++ b/doc/source/locale/ja/LC_MESSAGES/reference/random.po @@ -8,14 +8,14 @@ msgid "" msgstr "" "Project-Id-Version: nlcpy 1.0.0b1\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2020-11-18 17:24+0900\n" +"POT-Creation-Date: 2021-05-19 15:45+0900\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.7.0\n" +"Generated-By: Babel 2.9.0\n" #: ../../source/reference/random.rst:4 msgid "Random Sampling" @@ -295,7 +295,7 @@ msgid "" msgstr "" #: ../../source/reference/random.rst:119::1 -msgid "Returns a ndarray representing the internal state of the generator." +msgid "Returns an ndarray representing the internal state of the generator." msgstr "" #: ../../source/reference/random.rst:119::1 @@ -315,7 +315,7 @@ msgid "" msgstr "" #: ../../source/reference/random.rst:119::1 -msgid "Sets the internal state of the generator from a ndarray." +msgid "Sets the internal state of the generator from an ndarray." msgstr "" #: ../../source/reference/random.rst:137::1 @@ -478,3 +478,9 @@ msgstr "" msgid ":obj:`nlcpy.random.RandomState.weibull `" msgstr "" +#~ msgid "Returns a ndarray representing the internal state of the generator." +#~ msgstr "" + +#~ msgid "Sets the internal state of the generator from a ndarray." 
+#~ msgstr "" + diff --git a/doc/source/locale/ja/LC_MESSAGES/reference/statistics.mo b/doc/source/locale/ja/LC_MESSAGES/reference/statistics.mo index ba7c15d..08fe9e0 100644 Binary files a/doc/source/locale/ja/LC_MESSAGES/reference/statistics.mo and b/doc/source/locale/ja/LC_MESSAGES/reference/statistics.mo differ diff --git a/doc/source/locale/ja/LC_MESSAGES/reference/statistics.po b/doc/source/locale/ja/LC_MESSAGES/reference/statistics.po index c6567e2..0583dc3 100644 --- a/doc/source/locale/ja/LC_MESSAGES/reference/statistics.po +++ b/doc/source/locale/ja/LC_MESSAGES/reference/statistics.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: nlcpy 1.0.0b1\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2021-02-05 14:32+0900\n" +"POT-Creation-Date: 2021-05-19 15:45+0900\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -192,7 +192,7 @@ msgid ":obj:`nlcpy.correlate `" msgstr "" #: ../../source/reference/statistics.rst:50::1 -msgid "Cross-correlation of two 1-dimensional sequences.." +msgid "Cross-correlation of two 1-dimensional sequences." msgstr "" #: ../../source/reference/statistics.rst:50::1 @@ -252,7 +252,9 @@ msgid ":obj:`nlcpy.histogram_bin_edges `" msgstr "" #: ../../source/reference/statistics.rst:63::1 -msgid "Function to calculate only the edges of the bins used by the" +msgid "" +"Function to calculate only the edges of the bins used by " +":func:`histogram` function." msgstr "" #: ../../source/reference/statistics.rst:63::1 @@ -265,3 +267,9 @@ msgid "" "belongs." msgstr "" +#~ msgid "Cross-correlation of two 1-dimensional sequences.." 
+#~ msgstr "" + +#~ msgid "Function to calculate only the edges of the bins used by the" +#~ msgstr "" + diff --git a/doc/source/locale/ja/LC_MESSAGES/release_notes/v1.0.1.mo b/doc/source/locale/ja/LC_MESSAGES/release_notes/v1.0.1.mo new file mode 100644 index 0000000..1706217 Binary files /dev/null and b/doc/source/locale/ja/LC_MESSAGES/release_notes/v1.0.1.mo differ diff --git a/doc/source/locale/ja/LC_MESSAGES/release_notes/v1.0.1.po b/doc/source/locale/ja/LC_MESSAGES/release_notes/v1.0.1.po new file mode 100644 index 0000000..cf45d81 --- /dev/null +++ b/doc/source/locale/ja/LC_MESSAGES/release_notes/v1.0.1.po @@ -0,0 +1,47 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2020-2021, NEC Corporation +# This file is distributed under the same license as the nlcpy package. +# FIRST AUTHOR , 2021. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: nlcpy 1.0.1\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2021-05-19 16:06+0900\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.9.0\n" + +#: ../../source/release_notes/v1.0.1.rst:3 +msgid "What's new in Version 1.0.1 (May 31, 2021)" +msgstr "バージョン1.0.1の更新事項(2021年5月31日)" + +#: ../../source/release_notes/v1.0.1.rst:8 +msgid "Problem Fixes" +msgstr "不具合修正" + +#: ../../source/release_notes/v1.0.1.rst:10 +msgid "" +"Fixed abnormal termination that could occur when a Python script using " +"NLCPy is executed by a batch scheduler." +msgstr "バッチ環境で実行に失敗する不具合を修正" + +#: ../../source/release_notes/v1.0.1.rst:11 +msgid "" +"Fixed a problem that stalls when a multi-dimensional :class:`ndarray` is " +"specified as a parameter to :func:`sort` ." 
+msgstr ":func:`sort` に多次元の :class:`ndarray` が指定された時にストールする不具合を修正" + +#: ../../source/release_notes/v1.0.1.rst:15 +msgid "Function Enhancements" +msgstr "機能強化" + +#: ../../source/release_notes/v1.0.1.rst:17 +msgid "Improved a part of the memory allocation logic during the lazy evaluation." +msgstr "遅延評価時のメモリ確保のロジックの一部を改善" + diff --git a/doc/source/release_notes/index.rst b/doc/source/release_notes/index.rst index 2a8acd8..8b78ded 100644 --- a/doc/source/release_notes/index.rst +++ b/doc/source/release_notes/index.rst @@ -9,5 +9,6 @@ This is the list of the revision history of NLCPy between each releases. .. toctree:: :maxdepth: 1 + v1.0.1 v1.0.0 v1.0.0b2 diff --git a/doc/source/release_notes/v1.0.1.rst b/doc/source/release_notes/v1.0.1.rst new file mode 100644 index 0000000..9078d20 --- /dev/null +++ b/doc/source/release_notes/v1.0.1.rst @@ -0,0 +1,17 @@ +============================================ +What's new in Version 1.0.1 (May 31, 2021) +============================================ + +.. currentmodule:: nlcpy + +Problem Fixes +------------- + +* Fixed abnormal termination that could occur when a Python script using NLCPy is executed by a batch scheduler. +* Fixed a problem that stalls when a multi-dimensional :class:`ndarray` is specified as a parameter to :func:`sort` . + + +Function Enhancements +--------------------- + +* Improved a part of the memory allocation logic during the lazy evaluation. 
diff --git a/nlcpy/_version.py b/nlcpy/_version.py index 1f356cc..cd7ca49 100644 --- a/nlcpy/_version.py +++ b/nlcpy/_version.py @@ -1 +1 @@ -__version__ = '1.0.0' +__version__ = '1.0.1' diff --git a/nlcpy/core/internal.pyx b/nlcpy/core/internal.pyx index 5fe6291..aa1c741 100644 --- a/nlcpy/core/internal.pyx +++ b/nlcpy/core/internal.pyx @@ -107,7 +107,7 @@ cpdef inline bint get_c_contiguity( vector[Py_ssize_t]& strides, Py_ssize_t itemsize): cdef vector[Py_ssize_t] r_shape, r_strides - cpdef Py_ssize_t ndim + cdef Py_ssize_t ndim ndim = strides.size() if ndim == 0 or (ndim == 1 and strides[0] == itemsize): return True @@ -216,8 +216,8 @@ cpdef inline Py_ssize_t _extract_slice_element(x) except? 0: cpdef slice complete_slice(slice slc, Py_ssize_t dim): - cpdef Py_ssize_t start=0, stop=0, step=0 - cpdef bint start_none, stop_none + cdef Py_ssize_t start=0, stop=0, step=0 + cdef bint start_none, stop_none if slc.step is None: step = 1 else: diff --git a/nlcpy/linalg/norm.py b/nlcpy/linalg/norm.py index 1f78074..1362800 100644 --- a/nlcpy/linalg/norm.py +++ b/nlcpy/linalg/norm.py @@ -69,9 +69,11 @@ def _lange(x, norm, axis): if x.size == 0: shape = [x.shape[i] for i in set(range(x.ndim)) - set(axis)] return nlcpy.zeros(shape, dtype=dtype) - if norm in (None, 'fro'): - norm = 'F' - elif norm == nlcpy.inf: + if norm in (None, 'fro', 'f'): + if x.dtype.kind == 'c': + x = abs(x) + return nlcpy.sqrt(nlcpy.sum(x * x, axis=axis)) + if norm == nlcpy.inf: norm = 'I' else: norm = '1' diff --git a/nlcpy/prof/prof.py b/nlcpy/prof/prof.py index 62a5bd9..db1fce0 100644 --- a/nlcpy/prof/prof.py +++ b/nlcpy/prof/prof.py @@ -30,6 +30,7 @@ # import time +import functools # profiling status NOT_PROFILING = 0 @@ -170,10 +171,13 @@ def stop_profiling(): def profile_alloc_mem(func): + @functools.wraps(func) def wrap_func(*args, **kwargs): if _prof.status != UNDER_PROFILING: return func(*args, **kwargs) pre_wait_result = _prof.t_wait_result + pre_write_mem = _prof.t_write_mem + pre_free_mem 
= _prof.t_free_mem s = time.time() res = func(*args, **kwargs) e = time.time() @@ -181,11 +185,16 @@ def wrap_func(*args, **kwargs): _prof.t_alloc_mem += (e - s) if pre_wait_result != _prof.t_wait_result: _prof.t_alloc_mem -= (_prof.t_wait_result - pre_wait_result) + if pre_write_mem != _prof.t_write_mem: + _prof.t_alloc_mem -= (_prof.t_write_mem - pre_write_mem) + if pre_free_mem != _prof.t_free_mem: + _prof.t_alloc_mem -= (_prof.t_free_mem - pre_free_mem) return res return wrap_func def profile_free_mem(func): + @functools.wraps(func) def wrap_func(*args, **kwargs): if _prof.status != UNDER_PROFILING: return func(*args, **kwargs) @@ -199,6 +208,7 @@ def wrap_func(*args, **kwargs): def profile_write_mem(func): + @functools.wraps(func) def wrap_func(*args, **kwargs): if _prof.status != UNDER_PROFILING: return func(*args, **kwargs) @@ -212,6 +222,7 @@ def wrap_func(*args, **kwargs): def profile_read_mem(func): + @functools.wraps(func) def wrap_func(*args, **kwargs): if _prof.status != UNDER_PROFILING: return func(*args, **kwargs) @@ -225,6 +236,7 @@ def wrap_func(*args, **kwargs): def profile_wait_result(func): + @functools.wraps(func) def wrap_func(*args, **kwargs): if _prof.status != UNDER_PROFILING: return func(*args, **kwargs) diff --git a/nlcpy/request/ve_kernel.pxd b/nlcpy/request/ve_kernel.pxd index 576cbc5..0223a2f 100644 --- a/nlcpy/request/ve_kernel.pxd +++ b/nlcpy/request/ve_kernel.pxd @@ -294,7 +294,7 @@ cdef extern from "../ve_kernel/ve_funcnum.h": VE_FUNC_SHUFFLE -cpdef dict funcNumList +cdef dict funcNumList cdef extern from "../ve_kernel/ve_functype.h": @@ -316,7 +316,7 @@ cdef extern from "../ve_kernel/ve_functype.h": MATH_OP RANDOM_OP -cpdef dict funcTypeList +cdef dict funcTypeList cdef extern from "../ve_kernel/ve_array.h": diff --git a/nlcpy/request/ve_kernel.pyx b/nlcpy/request/ve_kernel.pyx index 04aec62..7cd9340 100644 --- a/nlcpy/request/ve_kernel.pyx +++ b/nlcpy/request/ve_kernel.pyx @@ -32,7 +32,7 @@ from nlcpy.request.ve_kernel cimport * 
-cpdef dict funcNumList = { +cdef dict funcNumList = { # binary functionss "nlcpy_add": ve_funcnum.VE_FUNC_ADD, "nlcpy_subtract": ve_funcnum.VE_FUNC_SUBTRACT, @@ -292,7 +292,7 @@ cpdef dict funcNumList = { } -cpdef dict funcTypeList = { +cdef dict funcTypeList = { "binary_op": ve_functype.BINARY_OP, "unary_op": ve_functype.UNARY_OP, "indexing_op": ve_functype.INDEXING_OP, diff --git a/nlcpy/ve_kernel/nlcpy_argsort.c b/nlcpy/ve_kernel/nlcpy_argsort.c index e2f817e..6de47ae 100644 --- a/nlcpy/ve_kernel/nlcpy_argsort.c +++ b/nlcpy/ve_kernel/nlcpy_argsort.c @@ -3,10 +3,10 @@ # * The source code in this file is developed independently by NEC Corporation. # # # NLCPy License # -# +# # Copyright (c) 2020-2021 NEC Corporation # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright notice, @@ -17,7 +17,7 @@ # * Neither NEC Corporation nor the names of its contributors may be # used to endorse or promote products derived from this software # without specific prior written permission. 
-# +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -34,6 +34,7 @@ #include "nlcpy.h" + /**************************** * * @OPERATOR_NAME@ @@ -59,8 +60,8 @@ uint64_t nlcpy_argsort_bool(ve_array *val, ve_array *idx, int32_t *psw) #endif /* _OPENMP */ { pval[0] = 0; -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -137,7 +138,7 @@ uint64_t nlcpy_argsort_bool(ve_array *val, ve_array *idx, int32_t *psw) /* set strides */ asl_err = asl_sort_set_input_key_long_stride(sort, ival0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ival = cnt * val->strides[n_outer] / val->itemsize; iidx = cnt * idx->strides[n_outer] / idx->itemsize; @@ -159,12 +160,30 @@ uint64_t nlcpy_argsort_bool(ve_array *val, ve_array *idx, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; } else { return (uint64_t)NLCPY_ERROR_NDIM; } + +#ifdef _OPENMP + const int nt = omp_get_max_threads(); +#else + const int nt = 1; +#endif /* _OPENMP */ +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} + retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } @@ -188,8 +207,8 @@ uint64_t nlcpy_argsort_i32(ve_array *val, ve_array *idx, int32_t *psw) #endif /* _OPENMP */ { pval[0] = 0; -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -266,7 +285,7 @@ uint64_t nlcpy_argsort_i32(ve_array *val, ve_array *idx, int32_t *psw) /* set strides */ asl_err = asl_sort_set_input_key_long_stride(sort, ival0); if (asl_err != ASL_ERROR_OK) 
return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ival = cnt * val->strides[n_outer] / val->itemsize; iidx = cnt * idx->strides[n_outer] / idx->itemsize; @@ -288,12 +307,30 @@ uint64_t nlcpy_argsort_i32(ve_array *val, ve_array *idx, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; } else { return (uint64_t)NLCPY_ERROR_NDIM; } + +#ifdef _OPENMP + const int nt = omp_get_max_threads(); +#else + const int nt = 1; +#endif /* _OPENMP */ +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} + retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } @@ -317,8 +354,8 @@ uint64_t nlcpy_argsort_i64(ve_array *val, ve_array *idx, int32_t *psw) #endif /* _OPENMP */ { pval[0] = 0; -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -395,7 +432,7 @@ uint64_t nlcpy_argsort_i64(ve_array *val, ve_array *idx, int32_t *psw) /* set strides */ asl_err = asl_sort_set_input_key_long_stride(sort, ival0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ival = cnt * val->strides[n_outer] / val->itemsize; iidx = cnt * idx->strides[n_outer] / idx->itemsize; @@ -417,12 +454,30 @@ uint64_t nlcpy_argsort_i64(ve_array *val, ve_array *idx, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; } else { return (uint64_t)NLCPY_ERROR_NDIM; } + +#ifdef _OPENMP + const int nt = omp_get_max_threads(); +#else + const int nt = 1; +#endif /* _OPENMP */ +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = 
asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} + retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } @@ -446,8 +501,8 @@ uint64_t nlcpy_argsort_u32(ve_array *val, ve_array *idx, int32_t *psw) #endif /* _OPENMP */ { pval[0] = 0; -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -524,7 +579,7 @@ uint64_t nlcpy_argsort_u32(ve_array *val, ve_array *idx, int32_t *psw) /* set strides */ asl_err = asl_sort_set_input_key_long_stride(sort, ival0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ival = cnt * val->strides[n_outer] / val->itemsize; iidx = cnt * idx->strides[n_outer] / idx->itemsize; @@ -546,12 +601,30 @@ uint64_t nlcpy_argsort_u32(ve_array *val, ve_array *idx, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; } else { return (uint64_t)NLCPY_ERROR_NDIM; } + +#ifdef _OPENMP + const int nt = omp_get_max_threads(); +#else + const int nt = 1; +#endif /* _OPENMP */ +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} + retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } @@ -575,8 +648,8 @@ uint64_t nlcpy_argsort_u64(ve_array *val, ve_array *idx, int32_t *psw) #endif /* _OPENMP */ { pval[0] = 0; -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -653,7 +726,7 @@ uint64_t nlcpy_argsort_u64(ve_array *val, ve_array *idx, int32_t *psw) /* set strides */ asl_err = asl_sort_set_input_key_long_stride(sort, ival0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ival = cnt * val->strides[n_outer] / val->itemsize; iidx = cnt * idx->strides[n_outer] / 
idx->itemsize; @@ -675,12 +748,30 @@ uint64_t nlcpy_argsort_u64(ve_array *val, ve_array *idx, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; } else { return (uint64_t)NLCPY_ERROR_NDIM; } + +#ifdef _OPENMP + const int nt = omp_get_max_threads(); +#else + const int nt = 1; +#endif /* _OPENMP */ +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} + retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } @@ -704,8 +795,8 @@ uint64_t nlcpy_argsort_f32(ve_array *val, ve_array *idx, int32_t *psw) #endif /* _OPENMP */ { pval[0] = 0; -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -782,7 +873,7 @@ uint64_t nlcpy_argsort_f32(ve_array *val, ve_array *idx, int32_t *psw) /* set strides */ asl_err = asl_sort_set_input_key_long_stride(sort, ival0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ival = cnt * val->strides[n_outer] / val->itemsize; iidx = cnt * idx->strides[n_outer] / idx->itemsize; @@ -804,12 +895,30 @@ uint64_t nlcpy_argsort_f32(ve_array *val, ve_array *idx, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; } else { return (uint64_t)NLCPY_ERROR_NDIM; } + +#ifdef _OPENMP + const int nt = omp_get_max_threads(); +#else + const int nt = 1; +#endif /* _OPENMP */ +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} + retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } @@ -833,8 +942,8 @@ 
uint64_t nlcpy_argsort_f64(ve_array *val, ve_array *idx, int32_t *psw) #endif /* _OPENMP */ { pval[0] = 0; -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -911,7 +1020,7 @@ uint64_t nlcpy_argsort_f64(ve_array *val, ve_array *idx, int32_t *psw) /* set strides */ asl_err = asl_sort_set_input_key_long_stride(sort, ival0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ival = cnt * val->strides[n_outer] / val->itemsize; iidx = cnt * idx->strides[n_outer] / idx->itemsize; @@ -933,12 +1042,30 @@ uint64_t nlcpy_argsort_f64(ve_array *val, ve_array *idx, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; } else { return (uint64_t)NLCPY_ERROR_NDIM; } + +#ifdef _OPENMP + const int nt = omp_get_max_threads(); +#else + const int nt = 1; +#endif /* _OPENMP */ +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} + retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } @@ -950,7 +1077,7 @@ uint64_t nlcpy_argsort(ve_arguments *args, int32_t *psw) ve_array *val = &(args->unary.x); ve_array *idx = &(args->unary.z); uint64_t err = NLCPY_ERROR_OK; - + switch (val->dtype) { case ve_bool: err = nlcpy_argsort_bool (val, idx, psw); break; case ve_i32: err = nlcpy_argsort_i32 (val, idx, psw); break; diff --git a/nlcpy/ve_kernel/nlcpy_argsort.c.m4 b/nlcpy/ve_kernel/nlcpy_argsort.c.m4 index fd1fec4..a039d0a 100644 --- a/nlcpy/ve_kernel/nlcpy_argsort.c.m4 +++ b/nlcpy/ve_kernel/nlcpy_argsort.c.m4 @@ -3,10 +3,10 @@ # * The source code in this file is developed independently by NEC Corporation. # # # NLCPy License # -# +# # Copyright (c) 2020-2021 NEC Corporation # All rights reserved. 
-# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright notice, @@ -17,7 +17,7 @@ # * Neither NEC Corporation nor the names of its contributors may be # used to endorse or promote products derived from this software # without specific prior written permission. -# +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -60,8 +60,8 @@ uint64_t nlcpy_argsort_$1(ve_array *val, ve_array *idx, int32_t *psw) #endif /* _OPENMP */ { pval[0] = 0; -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -138,7 +138,7 @@ uint64_t nlcpy_argsort_$1(ve_array *val, ve_array *idx, int32_t *psw) /* set strides */ asl_err = asl_sort_set_input_key_long_stride(sort, ival0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ival = cnt * val->strides[n_outer] / val->itemsize; iidx = cnt * idx->strides[n_outer] / idx->itemsize; @@ -160,12 +160,30 @@ uint64_t nlcpy_argsort_$1(ve_array *val, ve_array *idx, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; } else { return (uint64_t)NLCPY_ERROR_NDIM; } + +#ifdef _OPENMP + const int nt = omp_get_max_threads(); +#else + const int nt = 1; +#endif /* _OPENMP */ +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} + retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } @@ -185,7 +203,7 @@ uint64_t nlcpy_argsort(ve_arguments *args, 
int32_t *psw) ve_array *val = &(args->unary.x); ve_array *idx = &(args->unary.z); uint64_t err = NLCPY_ERROR_OK; - + switch (val->dtype) { case ve_bool: err = nlcpy_argsort_bool (val, idx, psw); break; case ve_i32: err = nlcpy_argsort_i32 (val, idx, psw); break; diff --git a/nlcpy/ve_kernel/nlcpy_sort.c b/nlcpy/ve_kernel/nlcpy_sort.c index 9cfd0bf..f869dbc 100644 --- a/nlcpy/ve_kernel/nlcpy_sort.c +++ b/nlcpy/ve_kernel/nlcpy_sort.c @@ -3,10 +3,10 @@ # * The source code in this file is developed independently by NEC Corporation. # # # NLCPy License # -# +# # Copyright (c) 2020-2021 NEC Corporation # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright notice, @@ -17,7 +17,7 @@ # * Neither NEC Corporation nor the names of its contributors may be # used to endorse or promote products derived from this software # without specific prior written permission. 
-# +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -58,8 +58,8 @@ uint64_t nlcpy_sort_bool(ve_array *x, int32_t *psw) #endif /* _OPENMP */ { /* noting to do */ -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -137,7 +137,7 @@ uint64_t nlcpy_sort_bool(ve_array *x, int32_t *psw) if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; asl_err = asl_sort_set_output_key_long_stride(sort, ix0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ix = cnt * x->strides[n_outer] / x->itemsize; for (;;) { @@ -156,6 +156,9 @@ uint64_t nlcpy_sort_bool(ve_array *x, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; @@ -163,19 +166,26 @@ uint64_t nlcpy_sort_bool(ve_array *x, int32_t *psw) return (uint64_t)NLCPY_ERROR_NDIM; } - /* restore thread count */ #ifdef _OPENMP const int nt = omp_get_max_threads(); #else const int nt = 1; #endif /* _OPENMP */ - asl_library_set_thread_count(nt); +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } + uint64_t nlcpy_sort_i32(ve_array *x, int32_t *psw) { asl_error_t asl_err; @@ -192,8 +202,8 @@ uint64_t nlcpy_sort_i32(ve_array *x, int32_t *psw) #endif /* _OPENMP */ { /* noting to do */ -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -271,7 +281,7 @@ uint64_t nlcpy_sort_i32(ve_array *x, int32_t *psw) if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; asl_err = asl_sort_set_output_key_long_stride(sort, 
ix0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ix = cnt * x->strides[n_outer] / x->itemsize; for (;;) { @@ -290,6 +300,9 @@ uint64_t nlcpy_sort_i32(ve_array *x, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; @@ -297,19 +310,26 @@ uint64_t nlcpy_sort_i32(ve_array *x, int32_t *psw) return (uint64_t)NLCPY_ERROR_NDIM; } - /* restore thread count */ #ifdef _OPENMP const int nt = omp_get_max_threads(); #else const int nt = 1; #endif /* _OPENMP */ - asl_library_set_thread_count(nt); +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } + uint64_t nlcpy_sort_i64(ve_array *x, int32_t *psw) { asl_error_t asl_err; @@ -326,8 +346,8 @@ uint64_t nlcpy_sort_i64(ve_array *x, int32_t *psw) #endif /* _OPENMP */ { /* noting to do */ -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -405,7 +425,7 @@ uint64_t nlcpy_sort_i64(ve_array *x, int32_t *psw) if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; asl_err = asl_sort_set_output_key_long_stride(sort, ix0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ix = cnt * x->strides[n_outer] / x->itemsize; for (;;) { @@ -424,6 +444,9 @@ uint64_t nlcpy_sort_i64(ve_array *x, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; @@ -431,19 +454,26 @@ uint64_t nlcpy_sort_i64(ve_array *x, int32_t *psw) return (uint64_t)NLCPY_ERROR_NDIM; } - /* restore thread count */ #ifdef _OPENMP const int nt = 
omp_get_max_threads(); #else const int nt = 1; #endif /* _OPENMP */ - asl_library_set_thread_count(nt); +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } + uint64_t nlcpy_sort_u32(ve_array *x, int32_t *psw) { asl_error_t asl_err; @@ -460,8 +490,8 @@ uint64_t nlcpy_sort_u32(ve_array *x, int32_t *psw) #endif /* _OPENMP */ { /* noting to do */ -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -539,7 +569,7 @@ uint64_t nlcpy_sort_u32(ve_array *x, int32_t *psw) if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; asl_err = asl_sort_set_output_key_long_stride(sort, ix0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ix = cnt * x->strides[n_outer] / x->itemsize; for (;;) { @@ -558,6 +588,9 @@ uint64_t nlcpy_sort_u32(ve_array *x, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; @@ -565,19 +598,26 @@ uint64_t nlcpy_sort_u32(ve_array *x, int32_t *psw) return (uint64_t)NLCPY_ERROR_NDIM; } - /* restore thread count */ #ifdef _OPENMP const int nt = omp_get_max_threads(); #else const int nt = 1; #endif /* _OPENMP */ - asl_library_set_thread_count(nt); +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } + uint64_t nlcpy_sort_u64(ve_array *x, int32_t *psw) { asl_error_t asl_err; @@ -594,8 +634,8 @@ uint64_t nlcpy_sort_u64(ve_array *x, int32_t *psw) #endif /* _OPENMP */ { /* noting to do */ -} /* omp single */ - +} /* omp single */ + ///////// // 
1-d // ///////// @@ -673,7 +713,7 @@ uint64_t nlcpy_sort_u64(ve_array *x, int32_t *psw) if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; asl_err = asl_sort_set_output_key_long_stride(sort, ix0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ix = cnt * x->strides[n_outer] / x->itemsize; for (;;) { @@ -692,6 +732,9 @@ uint64_t nlcpy_sort_u64(ve_array *x, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; @@ -699,19 +742,26 @@ uint64_t nlcpy_sort_u64(ve_array *x, int32_t *psw) return (uint64_t)NLCPY_ERROR_NDIM; } - /* restore thread count */ #ifdef _OPENMP const int nt = omp_get_max_threads(); #else const int nt = 1; #endif /* _OPENMP */ - asl_library_set_thread_count(nt); +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } + uint64_t nlcpy_sort_f32(ve_array *x, int32_t *psw) { asl_error_t asl_err; @@ -728,8 +778,8 @@ uint64_t nlcpy_sort_f32(ve_array *x, int32_t *psw) #endif /* _OPENMP */ { /* noting to do */ -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -807,7 +857,7 @@ uint64_t nlcpy_sort_f32(ve_array *x, int32_t *psw) if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; asl_err = asl_sort_set_output_key_long_stride(sort, ix0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ix = cnt * x->strides[n_outer] / x->itemsize; for (;;) { @@ -826,6 +876,9 @@ uint64_t nlcpy_sort_f32(ve_array *x, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return 
NLCPY_ERROR_ASL; @@ -833,19 +886,26 @@ uint64_t nlcpy_sort_f32(ve_array *x, int32_t *psw) return (uint64_t)NLCPY_ERROR_NDIM; } - /* restore thread count */ #ifdef _OPENMP const int nt = omp_get_max_threads(); #else const int nt = 1; #endif /* _OPENMP */ - asl_library_set_thread_count(nt); +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } + uint64_t nlcpy_sort_f64(ve_array *x, int32_t *psw) { asl_error_t asl_err; @@ -862,8 +922,8 @@ uint64_t nlcpy_sort_f64(ve_array *x, int32_t *psw) #endif /* _OPENMP */ { /* noting to do */ -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -941,7 +1001,7 @@ uint64_t nlcpy_sort_f64(ve_array *x, int32_t *psw) if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; asl_err = asl_sort_set_output_key_long_stride(sort, ix0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ix = cnt * x->strides[n_outer] / x->itemsize; for (;;) { @@ -960,6 +1020,9 @@ uint64_t nlcpy_sort_f64(ve_array *x, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; @@ -967,13 +1030,19 @@ uint64_t nlcpy_sort_f64(ve_array *x, int32_t *psw) return (uint64_t)NLCPY_ERROR_NDIM; } - /* restore thread count */ #ifdef _OPENMP const int nt = omp_get_max_threads(); #else const int nt = 1; #endif /* _OPENMP */ - asl_library_set_thread_count(nt); +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; @@ -981,11 +1050,12 @@ uint64_t nlcpy_sort_f64(ve_array *x, 
int32_t *psw) + uint64_t nlcpy_sort(ve_arguments *args, int32_t *psw) { ve_array *x = &(args->single.x); uint64_t err = NLCPY_ERROR_OK; - + switch (x->dtype) { case ve_bool: err = nlcpy_sort_bool (x, psw); break; case ve_i32: err = nlcpy_sort_i32 (x, psw); break; diff --git a/nlcpy/ve_kernel/nlcpy_sort.c.m4 b/nlcpy/ve_kernel/nlcpy_sort.c.m4 index 6e53ee3..1912605 100644 --- a/nlcpy/ve_kernel/nlcpy_sort.c.m4 +++ b/nlcpy/ve_kernel/nlcpy_sort.c.m4 @@ -3,10 +3,10 @@ # * The source code in this file is developed independently by NEC Corporation. # # # NLCPy License # -# +# # Copyright (c) 2020-2021 NEC Corporation # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright notice, @@ -17,7 +17,7 @@ # * Neither NEC Corporation nor the names of its contributors may be # used to endorse or promote products derived from this software # without specific prior written permission. 
-# +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -58,8 +58,8 @@ uint64_t nlcpy_sort_$1(ve_array *x, int32_t *psw) #endif /* _OPENMP */ { /* noting to do */ -} /* omp single */ - +} /* omp single */ + ///////// // 1-d // ///////// @@ -137,7 +137,7 @@ uint64_t nlcpy_sort_$1(ve_array *x, int32_t *psw) if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; asl_err = asl_sort_set_output_key_long_stride(sort, ix0); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; - + for (int64_t cnt = cnt_s; cnt < cnt_e; cnt++) { ix = cnt * x->strides[n_outer] / x->itemsize; for (;;) { @@ -156,6 +156,9 @@ uint64_t nlcpy_sort_$1(ve_array *x, int32_t *psw) if (k < 1) break; } } +#ifdef _OPENMP +#pragma omp barrier +#endif /* _OPENMP */ /* destroy sorter */ asl_err = asl_sort_destroy(sort); if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; @@ -163,18 +166,25 @@ uint64_t nlcpy_sort_$1(ve_array *x, int32_t *psw) return (uint64_t)NLCPY_ERROR_NDIM; } - /* restore thread count */ #ifdef _OPENMP const int nt = omp_get_max_threads(); #else const int nt = 1; #endif /* _OPENMP */ - asl_library_set_thread_count(nt); +#ifdef _OPENMP +#pragma omp single +#endif /* _OPENMP */ +{ + /* restore thread count */ + asl_err = asl_library_set_thread_count(nt); + if (asl_err != ASL_ERROR_OK) return NLCPY_ERROR_ASL; +} retrieve_fpe_flags(psw); return (uint64_t)NLCPY_ERROR_OK; } + @-->)dnl macro_asl_sort(bool,int32_t,i32)dnl macro_asl_sort(i32,int32_t,i32)dnl @@ -189,7 +199,7 @@ uint64_t nlcpy_sort(ve_arguments *args, int32_t *psw) { ve_array *x = &(args->single.x); uint64_t err = NLCPY_ERROR_OK; - + switch (x->dtype) { case ve_bool: err = nlcpy_sort_bool (x, psw); break; case ve_i32: err = nlcpy_sort_i32 (x, psw); break; diff --git a/nlcpy/veo/_veo.pyx b/nlcpy/veo/_veo.pyx index eec53a2..f144944 100644 --- 
a/nlcpy/veo/_veo.pyx +++ b/nlcpy/veo/_veo.pyx @@ -518,7 +518,9 @@ cdef class VeoProc(object): cdef uint64_t addr if size > pool_threashold: if veo_alloc_mem(self.proc_handle, &addr, size): - raise MemoryError("Out of memory on VE") + nlcpy.request.flush() + if veo_alloc_mem(self.proc_handle, &addr, size): + raise MemoryError("Out of memory on VE") else: v = VeoAlloc() addr = v.pool.reserve(size) @@ -648,9 +650,7 @@ _here = _path._here class VeoAlloc(metaclass=Singleton): def __init__(self): set_proc_init_hook(register._register_ve_kernel) - node_num = os.environ.get('VE_NODE_NUMBER', '0') - node_num = int(node_num) - self.proc = VeoProc(node_num) + self.proc = VeoProc(-1) self.lib = None fast_math = os.environ.get('VE_NLCPY_FAST_MATH', 'no') diff --git a/rpm/Makefile b/rpm/Makefile index 29c9f8d..1ac3d56 100644 --- a/rpm/Makefile +++ b/rpm/Makefile @@ -8,7 +8,7 @@ BASEDIR = .. PLAT = el8 ARCH = x86_64 VERSION_PYTHON = python3.6 -VERSION_NLCPY = 1.0.0 +VERSION_NLCPY = 1.0.1 VERSION_RPM = 1 RELEASE = 1 VERSION_NUMPY = 1.17.4 @@ -24,7 +24,7 @@ SOURCES = $(RPMDIR)/SOURCES $(eval REQUIRES_PYTHON = $(shell echo $(VERSION_PYTHON) | sed -e s"/\.//")) PRODUCT = nec-$(REQUIRES_PYTHON)-nlcpy-ve-$(VERSION_NLCPY) -REQUIRES_NUMPY = nec-$(REQUIRES_PYTHON)-numpy-$(VERSION_NUMPY) +REQUIRES_NUMPY = nec-$(REQUIRES_PYTHON)-numpy-$(VERSION_NUMPY)-$(VERSION_NLCPY) SRC_NLCPY = nec-$(REQUIRES_PYTHON)-nlcpy-ve-$(VERSION_NLCPY)-$(VERSION_RPM)-$(RELEASE) RPM_NLCPY = $(SRC_NLCPY).$(PLAT).$(ARCH).rpm diff --git a/rpm/SPECS/nec-python3-nlcpy-ve b/rpm/SPECS/nec-python3-nlcpy-ve index 4f08ea5..f87c291 100644 --- a/rpm/SPECS/nec-python3-nlcpy-ve +++ b/rpm/SPECS/nec-python3-nlcpy-ve @@ -19,7 +19,7 @@ Source0: %{product}-%{rpm_version}-%{rpm_release}.tar.gz # BuildRoot: %(mktemp -ud %{_tmppath}/%{product}-XXXXXX) -Requires: %{requires_numpy}, nec-asl-ve-2.2.0 >= 2.2, nec-blas-ve-2.2.0 >= 2.3, nec-lapack-ve-2.2.0 >= 2.1 +Requires: %{requires_numpy} AutoReqProv: no %description diff --git 
a/tests/pytest/fft_tests/test_fft2_fftn.py b/tests/pytest/fft_tests/test_fft2_fftn.py index e858abb..0e52c61 100644 --- a/tests/pytest/fft_tests/test_fft2_fftn.py +++ b/tests/pytest/fft_tests/test_fft2_fftn.py @@ -3,10 +3,13 @@ import pytest # NOQA import numpy as np - +from numpy.testing import assert_allclose import nlcpy # NOQA from nlcpy import testing +signed_int_types = [np.int32, np.int64] +unsigned_int_types = [np.uint32, np.uint64] +int_types = signed_int_types + unsigned_int_types global enable_nd_planning enable_nd_planning = True @@ -50,6 +53,12 @@ def test_func(self, *args, **kw): return decorator +def _numpy_fftn_correct_dtype(xp, a): + if xp == np and a.dtype in int_types + [np.bool]: + a = xp.asarray(a, dtype=np.float64) + return a + + def _size_last_transform_axis(shape, s, axes): if s is not None: if s[-1] is not None: @@ -72,8 +81,6 @@ def _size_last_transform_axis(shape, s, axes): {'shape': (2, 3, 4), 's': None, 'axes': (-1, -2, -3), 'norm': None}, {'shape': (2, 3, 4), 's': None, 'axes': (0, 1), 'norm': None}, {'shape': (2, 3, 4), 's': None, 'axes': None, 'norm': 'ortho'}, - {'shape': (2, 3, 4), 's': (2, 3), 'axes': (0, 1, 2), 'norm': 'ortho'}, - {'shape': (2, 3, 4), 's': (2, 3), 'axes': None, 'norm': 'ortho'}, {'shape': (2, 3, 4, 5), 's': None, 'axes': None, 'norm': None}, ) @testing.with_requires('numpy>=1.10.0') @@ -89,6 +96,7 @@ def test_fft2(self, xp, dtype, order, enable_nd): assert enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) a = xp.asarray(a, order=order) + a = _numpy_fftn_correct_dtype(xp, a) out = xp.fft.fft2(a, s=self.s, norm=self.norm) if xp == np and dtype in [np.float16, np.float32, np.complex64]: @@ -106,6 +114,7 @@ def test_ifft2(self, xp, dtype, order, enable_nd): assert enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) a = xp.asarray(a, order=order) + a = _numpy_fftn_correct_dtype(xp, a) out = xp.fft.ifft2(a, s=self.s, norm=self.norm) if xp == np and dtype in 
[np.float16, np.float32, np.complex64]: @@ -115,23 +124,45 @@ def test_ifft2(self, xp, dtype, order, enable_nd): class TestFft2DInvalidParam(object): @pytest.mark.parametrize('a', (1, 1 + 2j, - ["aaa"], [[1, 2], [3, "4"]], [], - ("aaa",), ((1, 2), (3, "4")), (), - ([1, 2], [3, "4"]), [(1, 2), (3, "4")], - [[1, 2], (3, "4")], ((1, 2), [3, "4"]),)) + ["aaa"], [], + ("aaa",), (), + )) def test_fft2_param_array(self, a): with pytest.raises(ValueError): nlcpy.fft.fft2(a) - @pytest.mark.parametrize('a', (1, 1 + 2j, - [[1, 2], [3, "4"]], [], - ((1, 2), (3, "4")), (), + @pytest.mark.parametrize('a', ( + [[1, 2], [3, "4"]], + ((1, 2), (3, "4")), ([1, 2], [3, "4"]), [(1, 2), (3, "4")], [[1, 2], (3, "4")], ((1, 2), [3, "4"]),)) + def test_fft2_param_array_U21(self, a): + if np.__version__ < np.lib.NumpyVersion('1.19.0'): + with pytest.raises(ValueError): + nlcpy.fft.fft2(a) + else: + assert_allclose(nlcpy.fft.fft2(a), np.fft.fft2(a)) + + @pytest.mark.parametrize('a', (1, 1 + 2j, + ["aaa"], [], + ("aaa",), (), + )) def test_ifft2_param_array(self, a): with pytest.raises(ValueError): nlcpy.fft.ifft2(a) + @pytest.mark.parametrize('a', ( + [[1, 2], [3, "4"]], + ((1, 2), (3, "4")), + ([1, 2], [3, "4"]), [(1, 2), (3, "4")], + [[1, 2], (3, "4")], ((1, 2), [3, "4"]),)) + def test_ifft2_param_array_U21(self, a): + if np.__version__ < np.lib.NumpyVersion('1.19.0'): + with pytest.raises(ValueError): + nlcpy.fft.ifft2(a) + else: + assert_allclose(nlcpy.fft.ifft2(a), np.fft.ifft2(a)) + @pytest.mark.parametrize('param', ( ([[1, 2, 3], [4, 5, 6]], (-1, -3)), ([[1, 2, 3], [4, 5, 6]], (0, 2)), @@ -172,6 +203,28 @@ def test_ifft2_param_s_ValueError(self, s): with pytest.raises(ValueError): nlcpy.fft.ifft2([[1, 2, 3], [4, 5, 6]], s=s) + @pytest.mark.parametrize('norm', (None, 'ortho')) + @pytest.mark.parametrize('param', ( + ((2, 3), (0, 1, 2)), + ((2,), (0, 1, 2)), + ((2,), (0, 1)) + )) + def test_fft2_invalid_axes_s(self, param, norm): + a = nlcpy.arange(24).reshape(2, 3, 4) + with 
pytest.raises(ValueError): + nlcpy.fft.fft2(a, s=param[0], axes=param[1], norm=norm) + + @pytest.mark.parametrize('norm', (None, 'ortho')) + @pytest.mark.parametrize('param', ( + ((2, 3), (0, 1, 2)), + ((2,), (0, 1, 2)), + ((2,), (0, 1)) + )) + def test_ifft2_invalid_axes_s(self, param, norm): + a = nlcpy.arange(24).reshape(2, 3, 4) + with pytest.raises(ValueError): + nlcpy.fft.ifft2(a, s=param[0], axes=param[1], norm=norm) + @testing.parameterize( {'shape': (3, 4), 's': None, 'axes': None, 'norm': None}, @@ -198,8 +251,6 @@ def test_ifft2_param_s_ValueError(self, s): {'shape': (2, 3, 4), 's': None, 'axes': (-1, -3, -2), 'norm': 'ortho'}, {'shape': (2, 3, 4), 's': None, 'axes': (0, 1), 'norm': None}, {'shape': (2, 3, 4), 's': None, 'axes': (0, 1), 'norm': 'ortho'}, - {'shape': (2, 3, 4), 's': (2, 3), 'axes': (0, 1, 2), 'norm': None}, - {'shape': (2, 3, 4), 's': (2, 3), 'axes': (0, 1, 2), 'norm': 'ortho'}, {'shape': (2, 3, 4, 5), 's': None, 'axes': None, 'norm': None}, {'shape': (2, 3, 4, 5, 6), 's': None, 'axes': (0, 1, 2, 3), 'norm': None}, {'shape': (2, 3, 4, 5, 6), 's': None, 'axes': (3, 2, 1, 0), 'norm': None}, @@ -221,6 +272,7 @@ def test_fftn(self, xp, dtype, order, enable_nd): assert enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) a = xp.asarray(a, order=order) + a = _numpy_fftn_correct_dtype(xp, a) out = xp.fft.fftn(a, s=self.s, axes=self.axes, norm=self.norm) if xp == np and dtype in [np.float16, np.float32, np.complex64]: @@ -238,6 +290,7 @@ def test_ifftn(self, xp, dtype, order, enable_nd): assert enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) a = xp.asarray(a, order=order) + a = _numpy_fftn_correct_dtype(xp, a) out = xp.fft.ifftn(a, s=self.s, axes=self.axes, norm=self.norm) if xp == np and dtype in [np.float16, np.float32, np.complex64]: diff --git a/tests/pytest/linalg_tests/test_solve.py b/tests/pytest/linalg_tests/test_solve.py index 879acc1..a022c1d 100644 --- 
a/tests/pytest/linalg_tests/test_solve.py +++ b/tests/pytest/linalg_tests/test_solve.py @@ -279,11 +279,10 @@ def test_lstsq_incompatible_dim(self, xp): return xp.linalg.lstsq(a, b) def test_lstsq_not_converge(self): - try: - nlcpy.linalg.lstsq(nlcpy.ones([200, 200]), nlcpy.zeros([200, 1])) - raise ValueError - except nlcpy.linalg.LinAlgError: - pass + a = nlcpy.ones([200, 200]) + a[0, 0] = nlcpy.nan + with self.assertRaises(nlcpy.linalg.LinAlgError): + nlcpy.linalg.lstsq(a, nlcpy.zeros([200, 1])) @testing.parameterize(*(