From 26309472f3ac3e730ab2d2932f945d3fd5dd790f Mon Sep 17 00:00:00 2001 From: lbugnon Date: Fri, 17 Jan 2020 18:16:27 -0300 Subject: [PATCH] first release --- LICENSE | 19 + README.md | 29 + deesom/__init__.py | 1 + deesom/deesom.py | 304 ++++++++ deesom/demo.ipynb | 379 ++++++++++ requirements.txt | 3 + setup.py | 22 + sompy/__init__.py | 1 + sompy/codebook.py | 173 +++++ sompy/decorators.py | 21 + sompy/neighborhood.py | 48 ++ sompy/normalization.py | 74 ++ sompy/sompy.py | 712 ++++++++++++++++++ sompy/visualization/__init__.py | 1 + .../__pycache__/__init__.cpython-38.pyc | Bin 0 -> 254 bytes .../__pycache__/dotmap.cpython-38.pyc | Bin 0 -> 2081 bytes .../__pycache__/view.cpython-38.pyc | Bin 0 -> 2416 bytes sompy/visualization/bmuhits.py | 54 ++ sompy/visualization/dotmap.py | 68 ++ sompy/visualization/histogram.py | 59 ++ sompy/visualization/hitmap.py | 37 + sompy/visualization/mapview.py | 187 +++++ sompy/visualization/umatrix.py | 95 +++ sompy/visualization/view.py | 54 ++ 24 files changed, 2341 insertions(+) create mode 100644 LICENSE create mode 100644 README.md create mode 100644 deesom/__init__.py create mode 100644 deesom/deesom.py create mode 100644 deesom/demo.ipynb create mode 100644 requirements.txt create mode 100644 setup.py create mode 100644 sompy/__init__.py create mode 100644 sompy/codebook.py create mode 100644 sompy/decorators.py create mode 100644 sompy/neighborhood.py create mode 100644 sompy/normalization.py create mode 100644 sompy/sompy.py create mode 100755 sompy/visualization/__init__.py create mode 100644 sompy/visualization/__pycache__/__init__.cpython-38.pyc create mode 100644 sompy/visualization/__pycache__/dotmap.cpython-38.pyc create mode 100644 sompy/visualization/__pycache__/view.cpython-38.pyc create mode 100644 sompy/visualization/bmuhits.py create mode 100644 sompy/visualization/dotmap.py create mode 100644 sompy/visualization/histogram.py create mode 100644 sompy/visualization/hitmap.py create mode 100644 sompy/visualization/mapview.py create mode 100644 sompy/visualization/umatrix.py create mode 100644 sompy/visualization/view.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2bdaca2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2018 The Python Packaging Authority + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..090f2c4 --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +DeeSOM +-------------------------------------------------------------------------------- +Self-organized map based classifier, developed to deal with large and highly imbalanced data. + +sinc(i) - http://sinc.unl.edu.ar + +The methods automatically build several layers of SOM. Data is clustered and samples that +are not likely to be positive class member are discarded at each level. + +The elastic-deepSOM (elasticSOM) is a deep architecture of SOM layers where the map +size is automatically set in each layer according to the data filtered in each previous +map. The ensemble-elasticSOM (eeSOM) uses several SOMs in ensemble layers to +face the high imbalance challenges. These new models are particularly suited +to handle problems where there is a labeled class of interest (positive +class) that is significantly under-represented with respect to a higher number +of unlabeled data. + +This code can be used, modified or distributed for academic purposes under GNU +GPL. Please feel free to contact with any issue, comment or suggestion. + +This code was used in: + +"Deep neural architectures for highly imbalanced data in bioinformatics" +L. A. Bugnon, C. Yones, D. H. Milone and G. Stegmayer*, IEEE Transactions on Neural Networks and Learning Systems, + Special Issue on Recent Advances in Theory, Methodology and Applications of Imbalanced Learning (in press). + +## Instalation + +## Running the demo. \ No newline at end of file diff --git a/deesom/__init__.py b/deesom/__init__.py new file mode 100644 index 0000000..bfd0d8b --- /dev/null +++ b/deesom/__init__.py @@ -0,0 +1 @@ +from deesom.deesom import DeeSOM \ No newline at end of file diff --git a/deesom/deesom.py b/deesom/deesom.py new file mode 100644 index 0000000..247a84d --- /dev/null +++ b/deesom/deesom.py @@ -0,0 +1,304 @@ +# ========================================================================= +# sinc(i) - http://fich.unl.edu.ar/sinc/ +# Leandro Bugnon, Cristian Yones, Diego Milone and Georgina Stegmayer +# lbugnon@sinc.unl.edu.ar +# ========================================================================= +import numpy as np +from sompy import SOMFactory +import time +from sklearn.base import BaseEstimator + + +class _SOM(): + def __init__(self, train_data, train_labels, n_jobs, train_len, max_train_len, max_map_size, nsize, elastic_factor, + visualization_neighbour, verbosity, mapsize=None): + """ + Train a SOM with a different labeling scheme. A one-positive-sample scheme is used: if one positive sample is + contained in a SOM unit, the unit will be a positive one. + :param train_data: numpy array NxM + :param train_labels: numpy array N + :param mapsize: + """ + self.verbosity = verbosity + self.visualization_neighbour = visualization_neighbour + self.n_jobs = n_jobs + + if mapsize is None: + n = max(2, int(min(max_map_size, int(np.sqrt(5 * train_data.shape[0] ** 0.54321) * nsize) * + elastic_factor))) + mapsize = [n, n] + self.mapsize = mapsize + self.som = SOMFactory.build(train_data, mapsize, normalization=None) + self.som.train(n_job=n_jobs, train_len_factor=train_len, verbose=None, maxtrainlen=max_train_len) + self.som_labels = self.set_neighbour(mapsize[0], train_data, train_labels) + + + def set_neighbour(self, n, data, labels): + """ + Extend the positive units labels to a neighbourhood (positive region). + :param som: sompy.SOM model + :param n: map size + :param data: data to find best matching units + :param labels: labels for data. + :return: new labels for data + """ + visualization_neighbour = self.visualization_neighbour + + idx_mi = np.where(labels == 1)[0] + # Find best matching units for positives + bmus1 = self.som.find_bmu(data[idx_mi], self.n_jobs).astype(int)[0] + + som_labels = np.zeros((n * n,), dtype=np.bool) + som_labels[bmus1] = 1 + # Positive labels are spread trough a neighbourhood of width visualization_neighbour + if visualization_neighbour is None: + # distance in units + visualization_neighbour = max(0, np.floor(np.log(n) / np.log(7)) - + np.floor(np.exp(-abs(len(idx_mi) / 1e3 - 6) + 0.4))) + # gaussian distance + visualization_neighbour **= 2 + if visualization_neighbour > 0: + dist_matrix = self.som._distance_matrix + pos_units = np.where(som_labels == 1)[0] + for l in pos_units: + som_labels[dist_matrix[l, :] <= visualization_neighbour] = 1 + + return som_labels + + def predict(self, data): + """ + Use a SOM to label new data + :param data: data to label + :return: new labels for data + """ + bmus = self.som.find_bmu(data, 4) + labels = self.som_labels[bmus[0].astype(int)] + return labels + + +class _SOMLayer(): + def __init__(self, train_data, train_labels, idxtrn, n_jobs, train_len, max_train_len, max_map_size, nsize, + elastic_factor, visualization_neighbour, verbosity, h): + n_in = len(idxtrn) + starth_time = time.time() + + self.som = _SOM(train_data[idxtrn, :], train_labels[idxtrn], n_jobs, train_len, max_train_len, max_map_size, + nsize, elastic_factor, visualization_neighbour, verbosity) + resh = self.som.predict(train_data[idxtrn, :]) + + # Update candidates for the next layer + idxtrn = idxtrn[np.where(resh == 1)[0]] + n_out = len(idxtrn) + + if verbosity: + print("Layer=%03d \t layer_size=%03d \t n_inputs=%06d \t n_outputs=%06d \t (layer_time=%0.1f min)" % + (h, self.som.mapsize[0], n_in, n_out, (time.time() - starth_time) / 60)) + self.out_index = idxtrn + + def predict(self, test_data): + return self.som.predict(test_data) + +class _SOMensembleLayer(): + def __init__(self, train_data, train_labels, idxtrn, n_jobs, train_len, max_train_len, max_map_size, nsize, + elastic_factor, target_imbalance, visualization_neighbour, verbosity, h=0): + """ + Train SOMs in parallel. Positive cases can be presented to all layers, and unknown cases are split en equal parts. + Output is the OR operation for all layers. + """ + + self.soms = [] + + starth_time = time.time() + posind = np.where(train_labels[idxtrn] == 1)[0] + negind = np.where(train_labels[idxtrn] == 0)[0] + NP = len(posind) + NN = len(negind) + idxtrnh0 = [] + + # Define ensemble size to achieve imbalance target_imbalance. + negatives_wanted = NP * target_imbalance + Nhp = max(int(np.round(NN / negatives_wanted)), 2) + + idxtrn_new = [] + resh = np.zeros(train_labels[idxtrn].shape, dtype=bool) + for h0 in range(Nhp): + negindh0 = np.s_[int(h0 * NN / Nhp):int((h0 + 1) * NN / Nhp)] + posindh0 = range(len(posind)) + idxtrn_part = idxtrn[np.append(posind[posindh0], negind[negindh0])] + + trdat = train_data[idxtrn_part, :] + trlab = train_labels[idxtrn_part] + starth0_time = time.time() + som = _SOM(trdat, trlab, n_jobs, train_len, max_train_len, max_map_size, nsize, elastic_factor, + visualization_neighbour, verbosity) + self.soms.append(som) + + resh0 = som.predict(trdat) + n_out0 = sum(resh0) + idxtrn_part = idxtrn_part[np.where(resh0 == 1)[0]] + if len(idxtrn_new) < 1: + idxtrn_new = idxtrn_part + else: + idxtrn_new = np.concatenate((idxtrn_new, idxtrn_part)) + + if verbosity: + print("ensemble_member=%d_%03d \t layer_size=%03d \t n_inputs=%06d \t n_outputs=%06d \t (layer_time=%0.1f min)" % + (h, h0, som.mapsize[0], trdat.shape[0], n_out0, (time.time() - starth_time) / 60)) + + n_in = len(idxtrn) + idxtrn = np.array(list(set(idxtrn_new))) + n_out = len(idxtrn) + + if verbosity: + print("ensemble_layer=%d, n_inputs=%06d, n_outputs=%06d, (htime=%0.1f min)" % + (h, n_in, n_out, (time.time() - starth_time) / 60)) + + self.out_index = idxtrn + + def predict(self, test_data): + """ + """ + resh = [] + for h0 in range(len(self.soms)): + resh0 = self.soms[h0].predict(test_data) + if len(resh) == 0: + resh = resh0 + else: + resh[np.where(resh0 == 1)[0]] == 1 + return resh + + +class DeeSOM(BaseEstimator): + """DeeSOM is a library for highly-umbalanced class recognition based on self-organizing maps (SOM). """ + + def __init__(self, verbosity=False, max_map_size=150, max_number_layers=100, visualization_neighbour=None, n_jobs=4, + train_len_factor=1, map_size_factor=1, max_train_len=200, elastic_threshold=0.97, + positive_th=1.02, target_imbalance=1500, elastic=True): + """ + Classifier based on several layers of self-organizing maps (SOMs). ElasticSOM and Ensemble-elasticSOM are + implemented, as described in: + + "Deep neural architectures for highly imbalanced data in bioinformatics, L. A. Bugnon, C. Yones, D. H. Milone, + G. Stegmayer, IEEE Transactions on Neural Networks and Learning Systems (2019)". + + Several hyperparameters are provided in the interface to easily explore in research. However, the main + hyperparameters are: visualization_neighbour and target_imb + + :param verbosity: If verbosity>0, it shows evolution of training via standard output + ;param elastic: True set elastic behaviour. + :param max_map_size_esmapsize: Maximum number of layers. + :param max_number_layers: Maximum size of each layer. + :param visualization_neighbour: This controls the size of the cluster that selects the samples at each layer + that pass to the next layer. If None, it is automatically determined. + :param n_jobs: Number of jobs (multithread). + :param train_len_factor: Multiplying factor in (0,1] applied to number of training steps + :param map_size_factor: Multiplying factor applied to the map size. + :param max_train_len: Maximum number of iterations per layer. + :param elastic_threshold: Elastic algorithm is triggered if data reduction < (1-elastic_threshold)*100% + :param positive_th: Stop training when the number of candidates is positive_th * training_positives + :param target_imbalance: If set None, no ensemble layers are created (as in the elasticSOM model). Otherwise, + ensemble layers are built to approximate the data imbalance of each layer to target_imbalance (as in the eeSOM + model). If input imbalance is less than target_imbalance, the model behaviour is the same as if elasticSOM. + """ + + self.max_map_size = max_map_size + self.visualization_neighbour = visualization_neighbour + self.n_jobs = n_jobs + self.train_len_factor = train_len_factor + self.max_train_len = max_train_len + self.map_size_factor = map_size_factor + self.max_number_layers = max_number_layers + self.elastic_threshold = elastic_threshold + self.positive_th = positive_th + self.target_imbalance = target_imbalance + self.elastic = True + self.verbosity = verbosity + + # Initial state: TODO check if this is ok with the good practices + self.elastic_factor = 1 + self.layers = [] + self.layers_labels = [] + + + + def fit(self, train_data, train_labels): + """ + Train a deep-SOM model, using several layers of SOMs. + :param train_data: numpy.array NxM + :param train_labels: numpy.array N + :return: + """ + start_time = time.time() + n_data = train_data.shape[0] + idxtrn = np.array(range(n_data)) + n_out = n_data + + n_posh = len(np.where(train_labels == 1)[0]) + h = 0 + if self.verbosity: + print("\nStart training: n_samples=%d, n_positives=%d" % (n_data, n_posh)) + self.data_proba = np.zeros(train_labels.shape) + while (h < self.max_number_layers) and (n_out >= self.positive_th * n_posh): + n_in = len(idxtrn) + n_pos = len(np.where(train_labels[idxtrn] == 1)[0]) + n_neg = n_in - n_pos + + # imbalance-driven ensemble layer + if self.target_imbalance > 0 and n_neg / n_pos >= 2 * self.target_imbalance: + layer = _SOMensembleLayer(train_data, train_labels, idxtrn, self.n_jobs, self.train_len_factor, + self.max_train_len, self.max_map_size, self.map_size_factor, self.elastic_factor, + self.target_imbalance, self.visualization_neighbour, self.verbosity, h) + else: + layer = _SOMLayer(train_data, train_labels, idxtrn, self.n_jobs, self.train_len_factor, self.max_train_len, + self.max_map_size, self.map_size_factor, self.elastic_factor, + self.visualization_neighbour, self.verbosity, h) + + idxtrn = layer.out_index + self.data_proba[idxtrn] += 1 # candidates ranking + self.layers.append(layer) + + n_out = len(idxtrn) + # Define elastic behaviour + self.get_deesom_height(n_out, n_in) + h += 1 + + self.elastic_factor = 1 + if self.verbosity: + print("(Total time=%0.1f min)" % ((time.time() - start_time) / 60)) + return + + + def predict_proba(self, test_data=None): + """ + :param test_data: + :return: + """ + if test_data is None: + return self.data_proba/np.max(self.data_proba) + + # TODO when test_data is not none + + def get_deesom_height(self, n_out, n_in): + """ Calculate next SOM layer height using the elastic algorithm""" + if self.elastic and n_out > self.elastic_threshold * n_in: + self.elastic_factor *= 1.2 + + def predict(self, test_data): + """ + Implement deepsom testing per level. + + """ + n_data = test_data.shape[0] + idxtst = np.array(range(n_data)) + slabsh = np.zeros(n_data, dtype=np.int) + h = 0 + while h < len(self.layers) and len(idxtst) > 0: + + resh = self.layers[h].predict(test_data[idxtst, :]) + + idxtst = idxtst[np.where(resh == 1)[0]] + slabsh[idxtst] += 1 + # slab[i]==n means sample i was discarded in layer n (thus it was considered as positive up to layer n-1) + h += 1 + return slabsh + diff --git a/deesom/demo.ipynb b/deesom/demo.ipynb new file mode 100644 index 0000000..ed474af --- /dev/null +++ b/deesom/demo.ipynb @@ -0,0 +1,379 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DeeSOM: Getting started\n", + "This is a brief demo where a deeSOM model is trained and tested with real genome-wide data. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/leandro/.local/lib/python3.7/site-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", + " warnings.warn(msg, category=FutureWarning)\n", + "Loaded backend module://ipykernel.pylab.backend_inline version unknown.\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "from shutil import unpack_archive\n", + "from random import shuffle, seed\n", + "from scipy.stats import zscore\n", + "\n", + "import pandas as pd\n", + "from sklearn.metrics import precision_recall_curve\n", + "from deesom import DeeSOM\n", + "\n", + "# Reproducibility\n", + "seed(1)\n", + "np.random.seed(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def load_test_data(fname):\n", + " \"\"\"\n", + " Load dataset and generate train/test partitions.\n", + " :param fname: Dataset to load in format .csv. Has M features and labels under the name \"CLASS\".\n", + " :return: train_data: dataset with a 20% of positives labeled as 0 (unlabeled). test_labels contains the real labels\n", + " to test how many true-positives are finally retrieved.\n", + " \"\"\"\n", + " train_data = pd.read_csv(fname)\n", + " train_labels = train_data.CLASS.values\n", + " \n", + " test_labels = train_labels.copy()\n", + " train_data = train_data.drop(columns=[\"sequence_names\", \"CLASS\"]).values\n", + "\n", + " pos_ind = np.where(train_labels == 1)[0]\n", + " shuffle(pos_ind)\n", + "\n", + " P = len(pos_ind)\n", + " train_labels[pos_ind[:int(P*.20)]] = 0\n", + "\n", + " # Feature Normalization \n", + " train_data = zscore(train_data, axis=0)\n", + " \n", + " return train_data, train_labels, test_labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download data\n", + "We will use the features of more than 1.3M secuences than could be micro-RNAs. Some of them are tagged as _positive_ (1), and the others are _unknown_ (0). More details can be found in:\n", + "\n", + ">Genome-wide hairpins datasets of animals and plants for novel miRNA prediction, L. A. Bugnon, C. Yones, J. Raad, D. H. Milone, G. Stegmayer, Data in Brief (in press), 2019\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data is composed of 10000 samples, where 304 are well-known micro-RNAs (positives). 60 positive samples are marked as unknown for testing\n" + ] + } + ], + "source": [ + "#!wget -O \"ath.zip\" https://sourceforge.net/projects/sourcesinc/files/mirdata/features/ath.zip/download \n", + "#unpack_archive('ath.zip', 'data/')\n", + "\n", + "# Generate train and test partitions, simplily removing a 20% of positives. \n", + "train_data, train_labels, test_labels = load_test_data(\"data/ath.csv\")\n", + "\n", + "print(\"Data is composed of %d samples, where %d are well-known micro-RNAs (positives). %d positive samples are marked as unknown for testing\" % \n", + " (len(train_labels), sum(test_labels), sum(test_labels)-sum(train_labels)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training the model\n", + "DeeSOM uses the scikit-learn API, thus can be combined in any processing pipeline0. Hyperparameters are described in the documentation, but defaults values will do for many cases. After creating an instance, the model is trained with the method fit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Start training: n_samples=10000, n_positives=244\n", + "Layer=000 \t layer_size=027 \t n_inputs=010000 \t n_outputs=003942 \t (layer_time=0.0 min)\n", + "Layer=001 \t layer_size=021 \t n_inputs=003942 \t n_outputs=002540 \t (layer_time=0.0 min)\n", + "Layer=002 \t layer_size=018 \t n_inputs=002540 \t n_outputs=001946 \t (layer_time=0.0 min)\n", + "Layer=003 \t layer_size=017 \t n_inputs=001946 \t n_outputs=001638 \t (layer_time=0.0 min)\n", + "Layer=004 \t layer_size=016 \t n_inputs=001638 \t n_outputs=001512 \t (layer_time=0.0 min)\n", + "Layer=005 \t layer_size=016 \t n_inputs=001512 \t n_outputs=001433 \t (layer_time=0.0 min)\n", + "Layer=006 \t layer_size=016 \t n_inputs=001433 \t n_outputs=001369 \t (layer_time=0.0 min)\n", + "Layer=007 \t layer_size=015 \t n_inputs=001369 \t n_outputs=001349 \t (layer_time=0.0 min)\n", + "Layer=008 \t layer_size=018 \t n_inputs=001349 \t n_outputs=001230 \t (layer_time=0.0 min)\n", + "Layer=009 \t layer_size=018 \t n_inputs=001230 \t n_outputs=001092 \t (layer_time=0.0 min)\n", + "Layer=010 \t layer_size=016 \t n_inputs=001092 \t n_outputs=001050 \t (layer_time=0.0 min)\n", + "Layer=011 \t layer_size=016 \t n_inputs=001050 \t n_outputs=001031 \t (layer_time=0.0 min)\n", + "Layer=012 \t layer_size=020 \t n_inputs=001031 \t n_outputs=000920 \t (layer_time=0.1 min)\n", + "Layer=013 \t layer_size=020 \t n_inputs=000920 \t n_outputs=000806 \t (layer_time=0.1 min)\n", + "Layer=014 \t layer_size=018 \t n_inputs=000806 \t n_outputs=000768 \t (layer_time=0.1 min)\n", + "Layer=015 \t layer_size=018 \t n_inputs=000768 \t n_outputs=000732 \t (layer_time=0.1 min)\n", + "Layer=016 \t layer_size=018 \t n_inputs=000732 \t n_outputs=000717 \t (layer_time=0.1 min)\n", + "Layer=017 \t layer_size=022 \t n_inputs=000717 \t n_outputs=000651 \t (layer_time=0.1 min)\n", + "Layer=018 \t layer_size=020 \t n_inputs=000651 \t n_outputs=000614 \t (layer_time=0.1 min)\n", + "Layer=019 \t layer_size=020 \t n_inputs=000614 \t n_outputs=000583 \t (layer_time=0.1 min)\n", + "Layer=020 \t layer_size=020 \t n_inputs=000583 \t n_outputs=000567 \t (layer_time=0.1 min)\n", + "Layer=021 \t layer_size=024 \t n_inputs=000567 \t n_outputs=000521 \t (layer_time=0.2 min)\n", + "Layer=022 \t layer_size=024 \t n_inputs=000521 \t n_outputs=000487 \t (layer_time=0.2 min)\n", + "Layer=023 \t layer_size=024 \t n_inputs=000487 \t n_outputs=000463 \t (layer_time=0.2 min)\n", + "Layer=024 \t layer_size=022 \t n_inputs=000463 \t n_outputs=000447 \t (layer_time=0.2 min)\n", + "Layer=025 \t layer_size=022 \t n_inputs=000447 \t n_outputs=000433 \t (layer_time=0.2 min)\n", + "Layer=026 \t layer_size=022 \t n_inputs=000433 \t n_outputs=000424 \t (layer_time=0.2 min)\n", + "Layer=027 \t layer_size=027 \t n_inputs=000424 \t n_outputs=000390 \t (layer_time=0.3 min)\n", + "Layer=028 \t layer_size=027 \t n_inputs=000390 \t n_outputs=000362 \t (layer_time=0.3 min)\n", + "Layer=029 \t layer_size=027 \t n_inputs=000362 \t n_outputs=000351 \t (layer_time=0.3 min)\n", + "Layer=030 \t layer_size=024 \t n_inputs=000351 \t n_outputs=000342 \t (layer_time=0.2 min)\n", + "Layer=031 \t layer_size=029 \t n_inputs=000342 \t n_outputs=000311 \t (layer_time=0.4 min)\n", + "Layer=032 \t layer_size=029 \t n_inputs=000311 \t n_outputs=000304 \t (layer_time=0.4 min)\n", + "Layer=033 \t layer_size=035 \t n_inputs=000304 \t n_outputs=000281 \t (layer_time=0.7 min)\n", + "Layer=034 \t layer_size=035 \t n_inputs=000281 \t n_outputs=000271 \t (layer_time=0.8 min)\n", + "Layer=035 \t layer_size=035 \t n_inputs=000271 \t n_outputs=000265 \t (layer_time=0.8 min)\n", + "Layer=036 \t layer_size=042 \t n_inputs=000265 \t n_outputs=000254 \t (layer_time=1.3 min)\n", + "Layer=037 \t layer_size=042 \t n_inputs=000254 \t n_outputs=000250 \t (layer_time=1.3 min)\n" + ] + } + ], + "source": [ + "# Init\n", + "deesom = DeeSOM(verbosity=True)\n", + "# Train\n", + "deesom.fit(train_data, train_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/leandro/.local/lib/python3.8/site-packages/IPython/core/pylabtools.py:128: UserWarning: Creating legend with loc=\"best\" can be slow with large amounts of data.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAD4CAYAAADsKpHdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAgAElEQVR4nO3dfXxU1b3v8c+P8BBABIVoUVSCRQQSSCAgHIqlVhG1xdoee+TUtmirbRVrLz1UvD7Ueuur9ui5KufgA8drsdgqoMeWKq3Uilat8mgggsizGLQaqIBAAiTzu3/MTpiEPEzCZM+e8H2/XvPKzNore/9mTWb/svbaey9zd0RERNqlOwAREYkGJQQREQGUEEREJKCEICIigBKCiIgE2qdrw7169fK+ffuma/MiIhlpxYoVO9w9pzXWnbaE0LdvX5YvX56uzYuIZCQze6+11q1DRiIiAighiIhIQAlBRESANI4h1OfQoUOUlpZSUVGR7lCkFWRnZ9OnTx86dOiQ7lBEpB6RSgilpaV069aNvn37YmbpDkdSyN3ZuXMnpaWl5ObmpjscEalHk4eMzOwxM/vYzN5uYLmZ2Qwz22hmq81sWEuDqaiooGfPnkoGbZCZ0bNnT/X+gMuevYz8x/O5/HeXpzuUtHl09aPkP57P7JLZ6Q6l2d7Y/gZDfz2UJR8sabLuup3rGP3b0Sz5YAlffvbL5M/OO/x46Ey4o3sIEScvmTGE2cCERpZfBPQPHtcCDx1NQEoGbZc+27iNezYCsG73ujRHkj4PvPUAAP+x8j/SHEnz/dtf/42Yx5j6ytQm605/dTp7D+1l6itT2bp7S+2FnTvHf0YoKTR5yMjd/2pmfRupcinwa4/fR/tNM+thZr3d/cMUxSjSJuQ/nt9gWcm3S8IOJy0yuQ3qxr7n4J4GY6+vLvX8Q5Sfezq4E5V3noqzjE4F3k94XRqUHcHMrjWz5Wa2vKysLAWbFskcnz3+s/WWn9397JAjSZ8bC2+st/zHw34cciTNN+v8WXTO6lyrrHNWZx694NEj6s7/0nxO6XpK4yt0jz/Ky1MZ5lEJ9bRTd5/l7kXuXpST0ypXXkdKZWVlukOQCHn2smfrLZ//lfkhR5I+3x3y3XrLJ+dPDjeQFhh96mjaZ9U+qNIhqwPnnHLOEXXP7nk22e2zaxdWJ4DqR6Dkox2tEm9LpCIhbAdOS3jdJyjLSPv27eOSSy5h6NCh5OXlMXfuXJYtW8Y//dM/MXToUEaOHMmnn35KRUUFV111Ffn5+RQWFrJ48WIAZs+ezcSJEznvvPP44he/CMA999zDiBEjGDJkCD/96U/T+fYkIjpax3SHkHZjTxmb7hCa7UDlAY7veDxTh03l+I7HU1HZ8EkSnx78lDO7n0lez7wQIzw6qTjtdAEwxcyeAs4Bdqdi/OBnf1jD2g/2HHVwiQadcjw//fLgRuv86U9/4pRTTuH5558HYPfu3RQWFjJ37lxGjBjBnj176Ny5Mw888ABmRklJCevWrWP8+PGsX78egJUrV7J69WpOPPFEFi1axIYNG1i6dCnuzsSJE/nrX//Kueeem9L3Jpkh6sfJw5DJbbDimytqnl+Vf1WjdV/6+ktHlMXu6I45YFAzonDH7tQFeJSSOe30SeANYICZlZrZd8zs+2b2/aDKQmAzsBH4b+C6Vos2BPn5+fz5z3/mpptu4tVXX2Xbtm307t2bESNGAHD88cfTvn17XnvtNa688koAzj77bM4444yahHDBBRdw4oknArBo0SIWLVpEYWEhw4YNY926dWzYsCE9b05E0qps6kfkHvgtT15UEk8EEUoGkNxZRpOaWO7A9SmLKNDUf/Kt5ayzzmLlypUsXLiQW2+9lfPOO6/Z6+jatWvNc3fn5ptv5nvf+14qwxSRDJQwdBBJupdRHR988AFdunThyiuvZNq0aSxZsoQPP/yQZcuWAfDpp59SWVnJ2LFj+c1vfgPA+vXr2bZtGwMGDDhifRdeeCGPPfYYe/fuBWD79u18/PHH4b0hEYmcqF6SE6lbV0RBSUkJ06ZNo127dnTo0IGHHnoId+eGG26gvLyczp078+KLL3Ldddfxgx/8gPz8fNq3b8/s2bPp1KnTEesbP34877zzDqNHjwbguOOO44knnuCkk04K+62JSJo50e4imKepD1NUVOR1J8h55513GDhwYFrikXDoM5Zj2Ye7yxn9i5e4+6v5XDHy9Batw8xWuHtRikMDdMhIRCQ0GkMQEZFaojqGoIQgIhKSiHcQlBBERMJmRLOLoIQgIhKSdJ3EkywlBBGRsEWzg6CEICISloh3EDI8IayeB/flwR094j9Xzzuq1e3atYsHH3yw2b938cUXs2vXrmb/3uTJk3n66acbrTNu3DjqXq/RmJdffpkvfelLzY5FRMIT0Q5CBieE1fPgDz+E3e8DHv/5hx8eVVJoKCE0Na/BwoUL6dGjR4u3KyISBZmbEP5yJxyqM9PQofJ4eQtNnz6dTZs2UVBQwIgRIxg7diwTJ05k0KBBAHzlK19h+PDhDB48mFmzZtX8Xt++fdmxYwdbt25l4MCBXHPNNQwePJjx48dTnuRsSHfeeScjRowgLy+Pa6+9ttbg05w5cygoKCAvL4+lS5cC8Xkbrr76akaOHElhYSG///3vj1jnK6+8QkFBAQUFBRQWFvLpp5+2uG1EJHWiOr945iaE3aXNK0/C3XffzZlnnklxcTH33HMPK1eu5IEHHqi5rfVjjz3GihUrWL58OTNmzGDnzp1HrGPDhg1cf/31rFmzhh49evDMM88kte0pU6awbNky3n77bcrLy3nuuedqlu3fv5/i4mIefPBBrr76agDuuusuzjvvPJYuXcrixYuZNm0a+/btq7XOe++9l5kzZ1JcXMyrr75K5861p/8TkXBpDKG1dO/TvPIWGDlyJLm5uTWvZ8yYwdChQxk1ahTvv/9+vfMa5ObmUlBQAMDw4cPZunVrUttavHgx55xzDvn5+bz00kusWbOmZtmkSfE7kJ977rns2bOHXbt2sWjRIu6++24KCgoYN24cFRUVbNu2rdY6x4wZw9SpU5kxYwa7du2ifXvdy1AkCqLZP8jkhPDF26FDnf94O3SOl6dI4rwGL7/8Mi+++CJvvPEGq1atorCwkIqKI6fPS7zjaVZWVlLzKldUVHDdddfx9NNPU1JSwjXXXFNr3XW7l2aGu/PMM89QXFxMcXEx27ZtO+KmcdOnT+fRRx+lvLycMWPGsG7duqTfu4ikXtTvdpq5CWHI1+HLM6D7aYDFf355Rry8hbp169bgcfbdu3dzwgkn0KVLF9atW8ebb77Z4u3UVb3z79WrF3v37j3izKO5c+cC8Nprr9G9e3e6d+/OhRdeyH/+53/WjDW89dZbR6x306ZN5Ofnc9NNNzFixAglBJGIiOgQQobPhzDk60eVAOrq2bMnY8aMIS8vj86dO3PyySfXLJswYQIPP/wwAwcOZMCAAYwaNSpl2+3RowfXXHMNeXl5fOYzn6mZrrNadnY2hYWFHDp0iMceewyA2267jR/96EcMGTKEWCxGbm5urXEHgPvvv5/FixfTrl07Bg8ezEUXXZSymEWk+aI+hqD5ECRU+ozlWLZ1xz7G3fsy9/3LUC4rbNl4p+ZDEBFpAyLeQcjwQ0YZ4vrrr+f111+vVXbjjTdy1VVXpSkiEUmnqN7tVAkhBDNnzkx3CCISAbrbqYiI1BLVs4yUEEREQhLt/oESgoiIBJQQRERCEvEhhMxPCGX7y5j8p8nsKN9x1OsKez6ERMXFxSxcuLBFv5tM3Fu3biUvL6/ROi2ZS6G58zWIiO522moeXv0wKz9ayUOrHjrqdYU1H0Lfvn2PKGvthCAiURDtLkLGJoThTwwn//F85r07D8eZ9+488h/PZ/gTw1u8znTNh3Dw4EFuv/125s6dS0FBAXPnzm1wvoM1a9YwcuRICgoKGDJkCBs2bKgV97Rp05rc3tatWxk7dizDhg1j2LBh/O1vf6tZtmfPHi655BIGDBjA97//fWKxGACLFi1i9OjRDBs2jMsvv5y9e/fWWmdVVRWTJ08mLy+P/Px87rvvvqYbXOQYU33IKJr9A+LnxabjMXz4cK9r7dq1R5Q15ON9H/tPXvmJF80p8rzZeV40p8hveuUmL9tflvQ66tqyZYsPHjzY3d0XL17sXbp08c2bN9cs37lzp7u779+/3wcPHuw7duxwd/czzjjDy8rKfMuWLZ6VleVvvfWWu7tffvnlPmfOnCO2c8YZZxxR9qtf/cqvv/76mtc333xzze9+8skn3r9/f9+7d69PmTLFn3jiCXd3P3DggO/fv79W3Mm8t3379nl5ebm7u69fv96rP4vFixd7p06dfNOmTV5ZWennn3++z58/38vKynzs2LG+d+9ed3e/++67/Wc/+5m7u3/+85/3ZcuW+fLly/3888+v2d4nn3xSbxzN+YxF2pr1f9/jZ9z0nP9h1fYWrwNY7q20X87YC9NyuuTQtUNXDlQdoGNWRw5UHaBrx6706twrZduobz6EZ599FqBmPoSePXvW+p2G5kO46667mD9/PgAffPBBTZ0xY8bUe+HaokWLWLBgAffeey9AzXwHo0eP5q677qK0tJSvfvWr9O/fv9nv69ChQ0yZMoXi4mKysrJqJgCqfs/9+vUD4vMwvPbaa2RnZ7N27VrGjBkDxHs0o0ePrrXOfv36sXnzZm644QYuueQSxo8f3+y4RNq6aB8wSvJKZTObADwAZAGPuvvddZafDjwO9AjqTHf3lh0Qb4Z/VPyDrw/4OpefdTnz189PycByoobmQ+jSpUvNpDR11Z0PofqQ0S233MItt9wCxA8xFRcXN7ptD+Y7GDBgQK3ygQMHcs455/D8889z8cUX88gjj9TswJN13333cfLJJ7Nq1SpisRjZ2dk1yxqae+GCCy7gySefbHCdJ5xwAqtWreKFF17g4YcfZt68eTV3ZhWR2qJ664omxxDMLAuYCVwEDAImmdmgOtVuBea5eyFwBRDKCOf9X7ifW0fdyoATB3DrqFu5/wv3H9X60jUfQn3bbmi+g82bN9OvXz9++MMfcumll7J69epG427ovfTu3Zt27doxZ84cqqqqapYtXbqULVu2EIvFmDt3Lp/73OcYNWoUr7/+Ohs3bgTi8zkn9ioAduzYQSwW42tf+xo///nPWblyZYvbQqStagunnY4ENrr7Znc/CDwFXFqnjgPHB8+7Ax+kLsTwJM6HUHdwdsKECVRWVjJw4ECmT5+e0vkQAL7whS+wdu3amkHl2267jUOHDjFkyBAGDx7MbbfdBsC8efPIy8ujoKCAt99+m29961uNxl2f6667jscff5yhQ4eybt26Wj2hESNGMGXKFAYOHEhubi6XXXYZOTk5zJ49m0mTJjFkyBBGjx59xGQ727dvZ9y4cRQUFHDllVfyi1/8IqXtI9KWRPSs06bnQzCzfwYmuPt3g9ffBM5x9ykJdXoDi4ATgK7A+e6+op51XQtcC3D66acPf++992ot173y2z59xnIsW/f3PUy4/1Ue/MYwLs7v3aJ1ZMJ8CJOA2e7eB7gYmGNmR6zb3We5e5G7F+Xk5KRo0yIimSWiHYSkBpW3A6clvO4TlCX6DjABwN3fMLNsoBfwcSqClOSVlJTwzW9+s1ZZp06dWLJkSZoiEpFqUR9DSCYhLAP6m1ku8URwBfCvdepsA74IzDazgUA2UNaSgNw9spd1Z4L8/Pwmz2BKl6YOT4ocK6K6i2vykJG7VwJTgBeAd4ifTbTGzO40s4lBtR8D15jZKuBJYLK34NufnZ3Nzp07teNog9ydnTt31jrFVeRYE/VdW1LXIQTXFCysU3Z7wvO1wJijDaZPnz6UlpZSVtaizoVEXHZ2Nn36tGxicZG2JZpdhEhdqdyhQ4daVwaLiLQlHvFrlTP25nYiIpkqY8cQREQkNQ5Uxu8eXBWLZk9BCUFEJCQds+K73INBYogaJQQRkZBUn2XUtVOkhm9rKCGIiISkelA5okMISggiImGpmTEtohlBCUFEJGRKCCIix7honlt0mBKCiEhIqm/Lk7EzpomISIpFMx8oIYiIhEWHjEREBEg4yyi9YTRICUFEJDTBGEJETzNSQhARCVk004ESgohIaKI+QY4SgohISKrzQUSPGCkhiIiETdchiIgc43TISEREgIQrlaPZQVBCEBEJS80YQlqjaJgSgohI2CKaEZQQRERCojEEEREBEmdMi2YXQQlBRCQsmjFNREQSRTQfKCGIiIQl4kMISggiImGpuf11RI8ZKSGIiIQsovlACUFEJCwe8YNGSSUEM5tgZu+a2UYzm95Ana+b2VozW2Nmv01tmCIimS/qM6a1b6qCmWUBM4ELgFJgmZktcPe1CXX6AzcDY9z9EzM7qbUCFhHJVG3h9tcjgY3uvtndDwJPAZfWqXMNMNPdPwFw949TG6aISFsSzYyQTEI4FXg/4XVpUJboLOAsM3vdzN40swn1rcjMrjWz5Wa2vKysrGURi4hkKI/4vStSNajcHugPjAMmAf9tZj3qVnL3We5e5O5FOTk5Kdq0iEhmaAuHjLYDpyW87hOUJSoFFrj7IXffAqwnniBERKSOiOaDpBLCMqC/meWaWUfgCmBBnTq/I947wMx6ET+EtDmFcYqIZL5oHzFqOiG4eyUwBXgBeAeY5+5rzOxOM5sYVHsB2Glma4HFwDR339laQYuIZKKau51G9JhRk6edArj7QmBhnbLbE547MDV4iIhIPaJ+HYKuVBYRCVlEOwhKCCIiYYn4WadKCCIiYak57TSiB42UEEREQqZDRiIix7hDVbF0h9AoJQQRkZDsPVAJQFUsmoMJSggiIiHp3CELgC4ds9IcSf2UEEREQnL4XkbRHERQQhARCUn13U4jmg+UEEREwqIrlUVEBIj+vYyUEEREQqIegoiIAIcTQjv1EEREjm0xDSqLiAhEfn4cJQQRkdBUjyGohyAicmzTWUYiIgIkDiqnN46GKCGIiIQkVnPaaTQzghKCiEhIDh8ySnMgDVBCEBEJiS5MExERIOG004hmBCUEEZGwBF0EXaksInKMi+mQkYiIQOJ8CNFMCUoIIiIhqZkxLa1RNEwJQUQkJK5bV4iICCT2EKKZEZQQRERCUjOGENE9b0TDEhFpe0o/KQcgK6LHjJQQRERCcmLXjgBkd8hKcyT1SyohmNkEM3vXzDaa2fRG6n3NzNzMilIXoohI2xCruTAtzYE0oMmEYGZZwEzgImAQMMnMBtVTrxtwI7Ak1UGKiLQFNRemZfAho5HARnff7O4HgaeAS+up93+AXwIVKYxPRKTNcPfI9g4guYRwKvB+wuvSoKyGmQ0DTnP35xtbkZlda2bLzWx5WVlZs4MVEclkMffI3scIUjCobGbtgP8L/Lipuu4+y92L3L0oJyfnaDctIpJRYh7dG9tBcglhO3Bawus+QVm1bkAe8LKZbQVGAQs0sCwiUlvMPbJXKUNyCWEZ0N/Mcs2sI3AFsKB6obvvdvde7t7X3fsCbwIT3X15q0QsIpKhPNN7CO5eCUwBXgDeAea5+xozu9PMJrZ2gCIibUUsFu1B5fbJVHL3hcDCOmW3N1B33NGHJSLS9rSFMQQREUmBtjCGICIiKeDutIvwMSMlBBGRkByojNXMiRBFSggiIiH5aE8FFYeq0h1Gg5QQRERCckLXjnTMiu5uN7qRiYi0Me7Qo2uHdIfRICUEEZGQVMU8spPjgBKCiEhoqnSWkYiIQPxKZfUQRESEmDtZ6iGIiEhVLLqzpYESgohIaOI9hHRH0bAIhyYi0rboLCMREQFg48d7dchIRESgV7dO7Nh7IN1hNEgJQUQkLO589qTj0h1Fg5QQRERCoglyREQEiJ9lFOHLEJQQRETCEnNdhyAiIgQzpkU3HyghiIiEJX7IKLoZQQlBRCQkGlQWEREg3kOIcD5QQhARCYurhyAiIqDTTkVEJKBBZRERASCm+RBERARg+65yHE93GA1SQhARCUmv4zqya/+hdIfRICUEEZGQxBxO7dE53WE0KKmEYGYTzOxdM9toZtPrWT7VzNaa2Woz+4uZnZH6UEVEMltlVYysCJ9m1GRCMLMsYCZwETAImGRmg+pUewsocvchwNPAv6c6UBGRTFcVc9pnckIARgIb3X2zux8EngIuTazg7ovdfX/w8k2gT2rDFBHJfJUxJysrsxPCqcD7Ca9Lg7KGfAf4Y30LzOxaM1tuZsvLysqSj1JEpA1oCz2EpJnZlUARcE99y919lrsXuXtRTk5OKjctIhJpsZhTGXOM6CaE9knU2Q6clvC6T1BWi5mdD9wCfN7dozuLtIhIGhyojAU/q9IcScOS6SEsA/qbWa6ZdQSuABYkVjCzQuARYKK7f5z6MEVEMlt1Ijglk087dfdKYArwAvAOMM/d15jZnWY2Mah2D3AcMN/Mis1sQQOrExE5Ju09UAlAp/ZZaY6kYckcMsLdFwIL65TdnvD8/BTHJSLSpny0pwKAKtetK0REjmmxIA/k9uya3kAaoYQgIhKCyqp4RsjoK5VFROToVQVdhPYZfmGaiIgcpcpY/LRT9RBERI5xNT0EJQQRkWNb2afx63XVQxAROcZVn2wa5esQlBBEREIQC64/OD47qcu/0kIJQUQkBNVjCDpkJCJyjDtUVT2oHN3dbnQjExFpQ6qqTzvVdQgiIse2Sp12KiIiAO/tiM8yrIQgInKM69wxfrqpBpVFRI5xBypj5HTrhJkSgojIMe0f+w7QMSvau9xoRyci0ka8sWlnzcVpURXdS+ZERNqQ4zq15/SeXdIdRqPUQxARCcHBKqdfznHpDqNRSggiIq1s1fu72LH3AJ3aR3uXG+3oRETagEVr/w7A58/KSXMkjVNCEBFpZQcOxejaMYtxA05KdyiN0qCyiEgrqYo5Jdt38/4n++nUIbrzIFRTQhARaSV/fPtDpvz2LQD65XRNczRNU0IQEWkln+w7CMBD3xhGfp/uaY6maUoIIiKt5EBl/JbXn+vfi27ZHdIcTdOUEEREmuH+F9fz6oYdSdX9++4KINrzKCdSQhARaYanV5RysDLGWSd3a7Jubq+ufHHgSXSI8KQ4iZQQRESa4UBljPMHnswvvpqf7lBSTglBRFrd29t31xw+yXT7DlRG/orjllJCEJFWdaCyiq8++DcOVsXSHUrK9DquY7pDaBVJJQQzmwA8AGQBj7r73XWWdwJ+DQwHdgL/4u5bUxsqcEd38k/uBZ07p3zVItJ6OvWHTukOIoUe2QaPzD7KlZSXU/LRDrhjdypCSokm+z1mlgXMBC4CBgGTzGxQnWrfAT5x988C9wG/THWg3BGcw6tkICJtQfW+7I7oXJ+QTA9hJLDR3TcDmNlTwKXA2oQ6lwJ3BM+fBv7LzMw9dbNB5Pc9DSI89ZyISHPl554O7pSkO5BAMiMjpwLvJ7wuDcrqrePulcBuoGfdFZnZtWa23MyWl5WVNS/S8nJwjz9ERDJd9f6svDzdkdQIdVDZ3WcBswCKioqatWcv+WhHvJcQX1HKYxMRSYeSj5K7yC0MyfQQtgOnJbzuE5TVW8fM2gPdiQ8ui4hIhkimh7AM6G9mucR3/FcA/1qnzgLg28AbwD8DL6Vy/ACAO3ZTEqHBFxGRlIjQWUZNJgR3rzSzKcALxE87fczd15jZncByd18A/D9gjpltBP5BPGmkXoQaTkSkrUlqDMHdFwIL65TdnvC8Arg8taGJiEiY2ub11yIi0mxKCCIiAighiIhIQAlBREQAsFSfHZr0hs3KgPda+Ou9gOhczZEcxRwOxRwOxRyO+mI+w91zWmNjaUsIR8PMlrt7UbrjaA7FHA7FHA7FHI6wY9YhIxERAZQQREQkkKkJYVa6A2gBxRwOxRwOxRyOUGPOyDEEERFJvUztIYiISIopIYiISJy7Z9QDmAC8C2wEpoewvdOAxcSnDF0D3BiUnwj8GdgQ/DwhKDdgRhDfamBYwrq+HdTfAHw7oXw4UBL8zgwOH8qrdxvNiD0LeAt4LnidCywJtjMX6BiUdwpebwyW901Yx81B+bvAhU19Dg1tI8l4exCfgnUd8A4wOurtDPyv4O/ibeBJIDtq7Qw8BnwMvJ1QlrZ2bWwbTcR8T/C3sRp4FuiR6vZryWfUWMwJy34MONArSu18RJzN3UGm80F8B7cJ6Ad0BFYBg1p5m72rGxLoBqwHBgH/Xv0HBkwHfhk8vxj4Y/BhjAKWJHxom4OfJwTPqz+4pUFdC373oqC83m00I/apwG85nBDmAVcEzx8GfhA8vw54OHh+BTA3eD4oaONOwRdoU/AZNPg5NLSNJON9HPhu8Lwj8QQR2XYmPnXsFqBzwnufHLV2Bs4FhlF755q2dm1oG0nEPB5oHzz/ZcL6UtZ+zf2Mmoo5KD+N+PQB73E4IUSinY9o99bakbbGg/h/jC8kvL4ZuDnkGH4PXED8v4TeQVlv4N3g+SPApIT67wbLJwGPJJQ/EpT1BtYllNfUa2gbScbZB/gLcB7wXPBHsSPhC1XTlsEf6+jgefugntVt3+p6DX0OjW0jiXi7E9+5Wp3yyLYzh+cSPzFot+eAC6PYzkBfau9c09auDW2jqZjrLLsM+E19+4Gjab/mfkbJxEy81zsU2MrhhBCZdk58ZNoYQvUXsFppUBYKM+sLFBLvSp7s7h8Gi/4OnBw8byjGxspL6ymnkW0k437gJ0AseN0T2OXulfVspya2YPnuoH5z30tj22hKLlAG/MrM3jKzR82sKxFuZ3ffDtwLbAM+JN5uK4h2O1dLZ7um4nt8NfH/flsScyq/C40ys0uB7e6+qs6iSLZzpiWEtDGz44BngB+5+57EZR5Pv96a22/ONszsS8DH7r6iNWNKsfbEu9sPuXshsI9497dGBNv5BOBS4snsFKAr8WPWGSVq7doUM7sFqAR+k4r1tRYz6wL8b+D2puqmytG2c6YlhO3Ej8dV6xOUtSoz60A8GfzG3f8nKP7IzHoHy3sTH0xqLMbGyvvUU97YNpoyBphoZluBp4gfNnoA6GFm1bPkJW6nJrZgeXdgZwvey85GttGUUqDU3ZcEr58mniCi3M7nA1vcveMubE0AAAHQSURBVMzdDwH/Q7zto9zO1dLZri3+HpvZZOBLwDeCnV9LYm6s/Zr7GTXmTOL/LKwKvot9gJVm9pkWxBxOOydzrDQqD+L/RW4OGrl6kGhwK2/TgF8D99cpv4faAzn/Hjy/hNoDOUuD8hOJHyM/IXhsAU4MltUdLLq4sW00M/5xHB5Unk/tgbTrgufXU3sgbV7wfDC1B9I2Ex+oa/BzaGgbScb6KjAgeH5H8P4j287AOcTPMOoSrPNx4IYotjNHjiGkrV0b2kYSMU8gfrZfTp16KWu/5n5GTcVcZ9lWDo8hRKada8XY0h1luh7ER87XEx/lvyWE7X2OeBdsNVAcPC4mflzxL8RP9Xox4UMzYGYQXwlQlLCuq4mfArYRuCqhvIj4aYubgP/i8Olk9W6jmfGP43BC6Bf8UW0MvhCdgvLs4PXGYHm/hN+/JYjrXYKzGhr7HBraRpKxFgDLg7b+XfCFiHQ7Az8jfirk28Ac4juMSLUz8dNhPwQOEe+JfSed7drYNpqIeSPxY+LV38OHU91+LfmMGou5zvKt1D7tNO3tXPehW1eIiAiQeWMIIiLSSpQQREQEUEIQEZGAEoKIiABKCCIiElBCEBERQAlBREQC/x+Na0G7uh/XzQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# In a transductive approach, data is considered as one-class partially labeled. This is, samples can be either\n", + "# positive or unknown. predict_proba gives the score for all samples, labeled and unlabeled.\n", + "score = deesom.predict_proba()\n", + "\n", + "ind = np.argsort(score)\n", + "\n", + "H = max(score)\n", + "plt.plot(score[ind], label=\"score\")\n", + "plt.plot(train_labels[ind]*H, 'o', label=\"train_labels\")\n", + "plt.plot(test_labels[ind]*H, '*', label=\"train+test_labels\")\n", + "plt.legend()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aca se podría plotear el tsne o algo asi para ver donde caen los unlabeled con scores mas altos" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [Hasta aca lo que iria en el TP]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prediction on unseen data" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# Metrics can be drawn from comparing the score and labels. As dataset is highly imbalanced, precision-recall curve is\n", + "# used.\n", + "from sklearn.svm import OneClassSVM\n", + "\n", + "ocsvm = OneClassSVM(kernel=\"linear\")\n", + "ocsvm.fit(train_data[train_labels == 1])\n", + "score_svm = ocsvm.predict(train_data)\n", + "\n", + "# Remove training positives\n", + "ind = train_labels == 0\n", + "test_labels = test_labels[ind]\n", + "score = score[ind]\n", + "score_svm = score_svm[ind]\n", + "\n", + "pre, rec, _ = precision_recall_curve(test_labels, score)\n", + "presvm, recsvm, _ = precision_recall_curve(test_labels, score_svm)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD4CAYAAADvsV2wAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAgAElEQVR4nO3deVhV1f7H8fc6h1FGZVAEpxxRnBFnM3EshzQrrUwb1LLh3vo13m6DZYPWrW5lKZmZaampddVKU9NMcwIrFXFOE0VBEGQe1++PjaQminImON/X8/DA2ezhe3b2YZ+1115Laa0RQghR/ZnsXYAQQgjbkMAXQggnIYEvhBBOQgJfCCGchAS+EEI4CRd7F3A5gYGBumHDhvYuQwghqpS4uLjTWuugi5c7ZOArpYYAQ5o0aUJsbKy9yxFCiCpFKXX0UssdsklHa71caz3Bz8/P3qUIIUS14ZCBL4QQwvIk8IUQwkk4ZBu+EEKcr7CwkMTERPLy8uxdikPx8PAgLCwMV1fXCq3vkIF//k1bIYRITEzEx8eHhg0bopSydzkOQWtNamoqiYmJNGrUqELbOGSTjty0FUKcLy8vj4CAAAn78yilCAgIuKpPPQ4Z+EIIcTEJ+7+72nNSLQN/5e6TfLzhsL3LEEIIh1ItA39Nwinm/HLE3mUIIZzUihUraN++PW3btqVly5bMnDmz7HcxMTG0aNGCFi1aEBUVxcaNG8t+17t3b+rXr8/585TcfPPNeHt7W6Quh7xpK4QQVVVhYSETJkxg27ZthIWFkZ+fz5EjRwDjD8HMmTPZuHEjgYGB7Nixg5tvvplt27ZRp04dAPz9/dm0aRM9evQgPT2dpKQki9VWLa/whRDC0ubNm0dUVBTt2rVj4sSJFBcX88MPP9C1a1c6dOjArbfeSlZWFpmZmRQVFREQEACAu7s7zZs3B2Dq1Km8+eabBAYGAtChQwfGjh3L9OnTy44zatQoFixYAMDSpUsZMWKExd6DQ17hS7dMIUR5Ji+PZ8+JsxbdZ8u6vrw4pFW5v09ISGDhwoVs2rQJV1dXJk2axPz585k1axZr1qzBy8uLqVOn8vbbb/PCCy8wdOhQGjRoQHR0NIMHD2b06NGYTCbi4+Pp2LHjBfuOjIzks88+K3sdHR3N+PHjKS4uZsGCBcTExPDKK69Y5H06ZOBrrZcDyyMjI8fbuxYhhFi7di1xcXF06tQJgNzcXLZt28aRI0fo3r07AAUFBXTt2hWAWbNmsWvXLtasWcNbb73F6tWrmTNnToWOZTab6dGjBwsWLCA3NxdLjhjskIEvhBDludyVuLVorRk7diyvv/562bLly5fzxRdf8OWXX15ym9atW9O6dWvGjBlDo0aNmDNnDi1btiQuLo4+ffqUrRcXF0erVhe+p1GjRjF8+HBeeukli74PacMXQogriI6OZvHixSQnJwOQlpZGmzZt2LRpEwcPHgQgOzub/fv3k5WVxfr168u2/e2332jQoAEATz31FE8//TSpqallv5szZw6TJk264Hg9e/bk2WefZfTo0RZ9H3KFL4QQV9CyZUumTJlC//79KSkpwdXVlenTpzNnzhxGjx5Nfn4+AFOmTCEkJIRp06YxceJEPD098fLyKmvOGTp0KMePH6dbt24opfDx8WHevHmEhIRccDylFE888YTF34c6v7+no4mMjNTXMgHKE1/9zuZDqWx6ps+VVxZCOLyEhATCw8PtXYZDutS5UUrFaa0jL15XmnSEEMJJSOALIYSTkMAXQggn4ZCBr5QaopSKycjIsHcpQghRbThk4Mt4+EIIYXkOGfhCCCEsTwJfCCGchAS+EEI4CQl8IYSogLfffpuIiAgiIiJ49913AZg7dy5t2rShbdu2jBkzBoCvvvqKiIgI2rZtS69evQDo0qUL8fHxZfvq3bs3sbGxvPTSS4wdO5aePXvSoEEDli5dylNPPUXr1q0ZOHAghYWFFn0PMrSCEKJq+f4ZOLnLsvus0xoGvVHur+Pi4vj000/ZunUrWms6d+5Mp06dmDJlCr/88guBgYGkpaUB8PLLL7Nq1SpCQ0NJT08H4Pbbb2fRokVMnjyZpKQkkpKSiIyMZMWKFRw6dIh169axZ88eunbtypIlS5g2bRrDhw/n22+/5eabb7bY25QrfCGEuIKNGzcyfPhwvLy88Pb2ZsSIEcTGxnLrrbeWTWZSq1YtALp37864ceP4+OOPKS4uBuC2225j8eLFACxatIiRI0eW7XvQoEG4urrSunVriouLGThwIGCMtnlupixLkSt8IUTVcpkrcUcwY8YMtm7dyrfffkvHjh2Ji4sjNDSUgIAAdu7cycKFC5kxY0bZ+u7u7gCYTCZcXV1RSpW9LioqsmhtcoUvhBBX0LNnT7755htycnLIzs7m66+/JjIykq+++qpsqONzTTqHDh2ic+fOvPzyywQFBXHs2DHAaNaZNm0aGRkZtGnTxi7vQ67whRDiCjp06MC4ceOIiooC4P7776d79+4899xzXH/99ZjNZtq3b8+cOXN48sknOXDgAFproqOjadu2LQAjR47kH//4B88//7zd3ocMjyyEcHgyPHL5HHJ4ZKXUdUqpT5RSi211TCGEEH+pUOArpWYrpZKVUrsvWj5QKbVPKXVQKfXM5fahtT6stb6vMsUKIYS4dhVtw58DfADMPbdAKWUGpgP9gERgu1JqGWAGXr9o+3u11smVrlYI4bS01mU9WIThapvkKxT4WusNSqmGFy2OAg5qrQ8DKKUWAMO01q8Dg6+qivMopSYAEwDq169/rbsRQlQjHh4epKamEhAQIKFfSmtNamoqHh4eFd6mMr10QoFj571OBDqXt7JSKgB4FWivlHq29A/D32itY4AYMG7aVqI+IUQ1ERYWRmJiIikpKfYuxaF4eHgQFhZW4fVt1i1Ta50KPFCRdZVSQ4AhTZo0sW5RQogqwdXVlUaNGtm7jCqvMr10jgP1znsdVrqs0mQCFCGEsLzKBP52oKlSqpFSyg0YBSyzTFnW4cjPHAghhLVVqElHKfUl0BsIVEolAi9qrT9RSj0MrMLomTNbax1/md1UmCWadLTWHEvLIfZoGnFHzxB3NJ0jp7P56oGuRITKJwchhPOpaC+d0eUs/w74zqIVGftdDiyPjIwcf637OJGRR89p6wDwdnehUaAXuYXFHE/PlcAXQjilajmWzrB2dQFoW8+fjvVr0ryOD3tPnuWm9zbauTIhhLAfhwz8yjbp9GwaRM+mQZYtSgghqjiHHB5ZeukIIYTlOWTgCyGEsDyHDHyl1BClVExGRoa9SxFCiGrDIQNfmnSEEMLyHPKmrSPKyi8iKT2XwmKNh6sJD1cz7i7Gdw9XM2aTDOgkhHBsThP4CiOQ31y1j3V7kwkP8SU8xJcWIT54uJg5dTaP4+m5JGXkciI9jxPpuZxIzyUpw1iemXf5yYRdTKo0/E24u5ip4WZm8tBWdGsSaIu3J4QQV+SQgW+NwdOa1vbmoRsa8+uf6ayKP8mC7X8N9KkUXDzqQs0artT19ySsZg06N6pFiL8nIX4euLuYyCssIa+wmPwi43teYQn5Rcb3vKJi8gqKWfrrcWKPnpHAF0I4DIcMfEs8aXsxV7OJJwe0OLd/Tp3NJyHpLHuSzlJQVEKovyd1/T0J8fegrp8nnm7maz5WcYlm6a8WGUdOCCEsxiED39qUUtTx86COnwc3tAi2dzlCCGETDtlLRwghhOVJ4AshhJNwyMCXB6+EEMLyHDLw5cErIYSwPIcMfCGEEJYngS+EEE5CAl8IIZyEBL4QQjgJCXwhhHASDhn40i1TCCEszyEDX7plCiGE5TnlWDq2kpVfxPH0XEpKNCVaU1z2nbKfL7e8WW0favt62PttCCGqCQl8K3ExKWI2HCZmw+Fr3keX62qxYEJXC1YlhHBmEvhWYDYpZo/rROKZXMwmMCmF2WR8nfvZpP5abjIpzOrC30/8PJbs/GJ7vxUhRDUigW8lvZoFVWr7dvX8Sc0usFA1QgjhoDdthRBCWJ4EvgM7fiaXRduPkZ4jV/pCiMqTJh0HdUvHMA6lZPPUkp3862tFj6aB3NQ6hP4t6+BXw9Xe5QkhqiClL5692wGcN4n5+AMHDti7HLvRWrP7+FlW7DrBtzuTSDyTi5vZxLB2dZnQ6zqa1vaxd4lCCAeklIrTWkf+bbkjBv45kZGROjY21t5lOAStNTsTM1gcl8hXccfIKywhukUwE3pdR1SjWiil7F2iEMJBSOBXI2nZBczdfIS5m4+Sll1A23r+vDikJR3q17R3aUIIB1Be4MtN2yqolpcb/+zbjE1P9+GVYa34/Vg6szf+Ye+yhBAOTgK/CvN0MzOma0OaBHvjwB/UhBAOQnrpiDIlJZrcwmKyC4rIyS/9XlBMdv6F3yNCfenYoJa9yxVCXCUJ/GriaFo2czb9QUFxCQVFxld+cQn5hSUXLCsoKiG3sJicgiKy8o3v2aXfcwoqNpRDizo+rPxnLyu/IyGEpUngVwOB3m5sOZzG7uN7Llju5mLCvfTLzWzCzcX48nRzwcvNTKi/G17uZmqUvq7hftF3N5e/fu9uxsvNhcnL4/njdLad3qkQojIk8KuBz+6N4kx2YVmgu5lNuJqVVbpquruaLb5PIYRtSOBXA+4uZur4SRALIS5PAl9ctdNZBTy7dBens/JJzconNbuAjNxC/DxdCfZxJ9jHgyAfd4J83Amr6UnjIG8aB3nj6SZ/lISwJwl8cVUaBXjx/a4kVu85SaC3OwHebrSr5Y+vhyvpuYUkn80j4eRZNuzPJzO/6IJtQ/09aRzsTdswPx7r2wyTSZ4OFsKWbBr4SqmbgZsAX+ATrfUPtjy+qLwnBjTn8X4VC+ucgiKOpeVyKCWLQ8lZHEzJ4vdj6WzYn8LAiDq0DPGVISGEsKEKD62glJoNDAaStdYR5y0fCPwXMAOztNZvVGBfNYG3tNb3XW49GVqh+lm5O4kH5u0AwMvNTIi/J3X9Panr50Gwjzs1vdyo5eVGgJe78d3bjZo13HBzkWcEhaio8oZWuJor/DnAB8Dc83ZqBqYD/YBEYLtSahlG+L9+0fb3aq2TS3/+d+l2ojrb9jF41oSIW6D0Sj46vDYzx3TkWFoOJ9LzOJGey4mMXPacOEtqdn65Twz7eLgQUPrHIMjHnScHNKdJsIwWKsTVqHDga603KKUaXrQ4CjiotT4MoJRaAAzTWr+O8WngAsr4/P4G8L3WeseljqOUmgBMAKhfv35FyxOOpqQEdi2GY1tg+ywY+AbUbYer2cSAVnUuuUlxiSYjt5C07HxSswpIyy4gNdv4fu7nlMw8VsWfIrJBLQl8Ia5SZdvwQ4Fj571OBDpfZv1HgL6An1KqidZ6xsUraK1jgBgwmnQqWZ+wF5MJ7vkOfp0Ha1+GmN7QYQz0eQG8Lz3fr9mkqFV6Fd8k+NK7zcovIuLFVdarW4hqzKYNo1rr97TWHbXWD1wq7M9RSg1RSsVkZGTYsjxhaSYzdBwLj8RB14fgty/g/Q7wywdQJNM2CmFrlQ3840C9816HlS6rFK31cq31BD8/v8ruSjgCT38Y8CpM2gL1OsMPz8FH3eDAantXJoRTqWzgbweaKqUaKaXcgFHAssqXJaqlwKZw12K44ytAw/yRMP9WOH3Q3pUJ4RQqHPhKqS+BzUBzpVSiUuo+rXUR8DCwCkgAFmmt4ytblDTpVHPN+sODm6H/FPhzC3zYGVY9B3ny31sIa5IpDoV9ZSUbN3V/nQdegRD9ArS702j/v4TcgmJavrgSV5OJprW9CQ/xNb7q+BAR5oevh6uN34AQjkfmtBWO7cSv8P0zRjfOkLYwcCo06HrJVdftTWbL4VT2JJ1l78lMUjLzAXAzm4gOD2ZEhzB6Nw/C1SwPawnnVKUCXyk1BBjSpEmT8QcOHLB3OcJWtIbdS2D1C3D2OESMhH6TwS/sspudzsonIeks6/el8L/fjnM6q4AALzeGtK3LyI5hRITKzX/hXKpU4J8jV/hOqiAbNr4Lv7wHKOjxGHR/FFw9r7hpYXEJG/ansGRHImv2JFNQXMLCCV3ofF2A9esWwkGUF/jymVc4Hjcv6PMcPLQNmg2A9a/BB1EQ/zVXmq3d1WwiOrw2H97ZkYUTuwBwJkf6/AsBDhr40ktHAFCzAdz2GYxdAR6+8NU4mDMYTu6q0OYeMjuXEBdwyMCXB6/EBRr1hIkbYPA7kLwHZvaC5f+E7NQKbe7ArZZC2JRMgCKqBpMZIu+FVsNh/VTYFgPxS6H3s9DpfjD/vTumS+mY/Q/O30GjQC9C/DwI8fOkrr8H/VrWpk2Yv63fhRB2JTdtRdWUvBdWPgOH10Fgcxj4OjSJvmCVkhLNothjHEzOIikjj6SMXJIy8jh5No+u1wXwxfgudipeCOuyxHj4NnNet0x7lyIcVXALGPM17F8JK5+FeSOg2SBjzJ6AxgCYTIpRUX8fYntUzGaKShz3QkcIa5E2fFF1KQXNB8FDW6HvZDjyM0zvbPTjz8+0d3VCOByHDHwhroqLO/T4pzEMc5vbYNN/4f2O8Ot8YyKWS0jNyufnAykcTM4ip6DokusIUd1IG76ofo7HwfdPQ+J2qNsBBk2Dep3Kfv34ot9YuuPCUbxr1nA15tYtnV+37Gd/T0L9PQnyccdcgYnbhXAE8qStcC4lJbDrK1jzImQmQZvboe9L4FuXkhLNiYzcC+bUPZH+1+vj6blk5l141e9iUtT29SDU3+jl06quH/f2aCR/BIRDqlKBL2PpCIvJz4KNbxuzbJlcoOfj0PVhcPW47GaZeYUkZeRxPP3cH4ML/yAknslldFR9XhsegVIS+sKxVKnAP0eu8IXFpP0Bq5+HhOXg38DozdNisHHj9xpMXbmXj9Yf4qEbGvPkgBYWLlaIypGxdIRzq9UIbp8Hd//PGKtn4V0wdyicurb5ep4a0JxRneoxfd0hPtn4h4WLFcI6JPCFc7muN0z8GW58C5J2wowe8O0TkJN2VbtRSvHq8NYMbFWHV1bsYemORKuUK4QlSeAL52N2gajx8OivxrAMsbPhvfawNQaKK95F02xSvDuqHd0aB/Dk4p2sTThlxaKFqDwJfOG8atSCG9+EBzZCSBv4/knjiv/w+grvwsPVTMzdkbQM8WXS/B1s++PqPikIYUsS+ELUbgl3L4Pb50NhDswdBgvuNG70VoC3uwtz7ulEaE1P7puznd3HZVhv4ZgcMvBlPHxhc0pB+GBj0pXoF+DQOpgeBWsmG107ryDA251593XG19OVsbO3cSjlytsIYWsOGfgylo6wG1cP6Pl/8EgstBph9OF/vyP8vqDcYRrOqevvyef3RaEUjJm1lePpuTYqWoiKccjAF8LufOvCiJlw3xrj568nwuz+kBh32c2uC/Lms3ujyMwvYsysraRk5tuoYCGuTAJfiMup1wnuXws3fwTpf8KsPvD1g5B5stxNWtX149NxnTiRkcvds7eRkVtow4KFKJ8EvhBXYjJBuzuM0Ti7/xN2LzaaeTa+A0WXvoKPbFiLmWMiOZicyX1ztpNbUGzjooX4Owl8ISrK3Qf6TYZJW6BRL1jzkjH+/t7vLjlx7vXNgvjvqPbs+PMME+fFUVB0+XsAQlibBL4QVyugMYz+Eu5aCmY3WDAaPh9uTLt4kRtbh/DGiDZs2J/CYwt/o1hm2hJ2JIEvxLVqEg0PboKBU+HEDviomzEOf+6ZC1a7rVM9/n1TON/uSuJfS3fhyAMWiupNAl+IyjC7QpcH4JFfoeNY2BYD73WA7Z9AyV/t9vf3vI5H+zRhYewxXvsuQUJf2IVDBr48eCWqHK8AGPwOTNwAwS3h28dhZi/44+eyVR7r14xx3Rry8c9/MH3dQTsWK5yVQwa+PHglqqw6rWHcCrj1M8g7C58NhkV3w5mjKKV4YXBLRrQP5a0f9vPtziR7VyucjEMGvhBVmlLQ6mZ4eBvc8G84sNoYpuHHVzEV5TBtZBvahPkxeXk8WfkygbqwHQl8IazF1ROufxIejoXwIbBhGrwfiUv8El4Z2oqUrHz+u2a/vasUTkQCXwhr8wuFW2bBvavAOwiW3k/b1bfzeKscZm86wr6TmfauUDgJCXwhbKV+Fxi/HoZ+AGmHefjgeN50+5j/LP1Zeu0Im5DAF8KWTCboMAYeiUN1e5hhagP/OXUv8YunQFGBvasT1ZwEvhD24OEH/aegJm1hr3sEEfFvUTy9M+xfZe/KRDUmgS+EHZmCmuJ+92LGFT7FmZwi+OI2mDcSUuRmrrA8CXwh7KxNmD+hkUPpnvkqp7q+AMe2wkddYeW/IDfd3uWJakQCXwgH8OSA5tTw8ODhP7qiH4mDdnfClg+NYZjj5lwwTIMQ10oCXwgH4F/DjWcGtWD7kTN8vb8Ahr4HE3+CwGaw/B8Qcz0c/cXeZYoqzmaBr5QKV0rNUEotVko9aKvjClFV3NqxHu3r+/PadwnGLFkhbeGe72DkbMg5A58Ogq/ugfRj9i5VVFEVCnyl1GylVLJSavdFywcqpfYppQ4qpZ653D601gla6weA24Du116yENWTyaR4ZVgEadkFvLO69KatUhBxCzy8Ha5/BvZ9Bx90gvVvQEGOfQsWVU5Fr/DnAAPPX6CUMgPTgUFAS2C0UqqlUqq1UmrFRV/BpdsMBb4FvrPYOxCiGokI9eOuLg2Yu/kI8SfOGy3WrQbc8KwxTEPzQbD+dSP4dy+55GxbQlxKhQJfa70BSLtocRRwUGt9WGtdACwAhmmtd2mtB1/0lVy6n2Va60HAneUdSyk1QSkVq5SKTUlJubZ3JUQV9n/9mlOzhhsv/C+ekotnyPKvB7d+Cvd8DzVqwuJ74dMbIel3+xQrqpTKtOGHAuc3JiaWLrskpVRvpdR7SqmZXOYKX2sdo7WO1FpHBgUFVaI8IaomvxquPDOoBXFHz7BkR+KlV2rQDSb8BEP+C6f3wczrYdmjkH3atsWKKsVmN2211uu11o9qrSdqradfbl2ZAEU4u1s6hNGxQU3e+H4vGTmFl17JZIaO4+CRHdBlEvw235hta/N0KC5nG+HUKhP4x4F6570OK11WaTIBinB2JpPi5WGtOJNTwH9W77v8yp7+MPA1eHAz1OsEq/5lzK97YI1tihVVRmUCfzvQVCnVSCnlBowCllmmLCFEq7p+3N21IfO2HGX38Qp82g1qBncuhjsWGQ9qzb8F5t8Gp2U6RWGoaLfML4HNQHOlVKJS6j6tdRHwMLAKSAAWaa3jLVGUNOkIYXisXzNqebnz7292//0G7qUoBc0GwKQt0O8V42GtD7vAD/82plwUTk058jjckZGROjY21t5lCGFXS3ck8vii35l6S2tu71T/6jbOSoa1k+HX+eAVCNEvGsM2mOQh++pMKRWntY68eLn8VxfCwQ1vH0qnhsYN3PScqxwz3zsYhk2H8T9Cretg2cPw8Q3w51brFCscmkMGvjTpCPEXpRQvD4vgbF4Rb666wg3c8oR2MKZYHDHLuOqf3R+W3A8ZFulnIaoIhwx86aUjxIXCQ3wZ27UhX2z7k52J1zhkslLQ5lZ4JBZ6PQl7lsEHkfDTm1CYa9mChUNyyMAXQvzdP/s1JdDbnecregO3PG5e0Offxvg8TfvBuikwPQr2/E+GaajmHDLwpUlHiL/z9XDluRvD+T0xgwXbLTBiZs0GcNtcGLsc3Hxg0d3w2RA4ufvK24oqySEDX5p0hLi0Ye3qEtWoFtNW7SUt20KTnjfqBRM3wE1vw6l4mNkTVjwO2amW2b9wGA4Z+EKIS1PKGEI5M6+IpxbvJK/QQjNhmV2g033wSBxETTBm2Xq/PWyZIcM0VCMS+EJUMc3r+PD8TeGsSTjF3bO3lT/WzrWoUQsGTYUHN0Hd9rDyaZjRAw79aLljCLtxyMCXNnwhLm9c90a8N7o9v/2ZzsgZv3A83cK9bILDYcw3MOpLKMqHz4fDl6Mh7bBljyNsSp60FaIK23wolQmfx+LpambOPVG0rOtr+YMU5RsTqm94C4oLjJE5ez0B7j6WP5awCHnSVohqqGvjABY/0A2zSXHbzM1sPGCF8fBd3KHHY0b7fsRI2PQuvN8RfvsCSkosfzxhNRL4QlRxzev4sHRSN8JqejLu0218/Ws5k6ZUlk8dGP4R3P8j+NWDbx6ET/pConwKryok8IWoBkL8PFn0QFc6NazFYwt/58P1B7Fac21YR7hvNQyfaQzNMCsalk6Es0nWOZ6wGIcMfLlpK8TV8/VwZc69nRjati7TVu7j+f/tprgyT+RejskEbUcZzTw9Hof4pUYzz8//gcI86xxTVJrctBWimikp0UxdtZeZPx2mX8vavDeqPZ5uZuseNO0PY8z9vSugZkPo/yq0uMkYv0fYnNy0FcJJmEyKZweFM3loK9YknOKOWVss91RueWo1glHzja6cLp6w8E6YOwxO7bHuccVVkcAXopoa260hH93ZgT0nznLLR7/wZ2qO9Q/a+AZ4YCPc+BYk/W48tPXdk5CTZv1jiyuSwBeiGhsYEcL8+ztzJqeAER9tuvahla+G2QWixsOjv0LkvbB9FrzfAbZ9DMVF1j++KJcEvhDVXGTDWix+oBsermZGxWxh3b5k2xy4Ri246S3jir92BHz3hDEw2+GfbHN88TcS+EI4gSbB3iyd1I1GgV7c/1ksC7f/abuD125lDMF8+zwoyIa5Q2HhXXDmiO1qEICDBr50yxTC8oJ9PFg4sSvdmwTy9JJdvLN6v/X66l9MKQgfAg9tgz7Pw8G18EEUrH0Z8rNsU4OQbplCOJvC4hKeXbqLxXGJ3B5ZjynDI3A12/ja7+wJWPMS7FwIPiHQdzK0uU26cVqIdMsUQgDgajbx5sg2PNqnCQtjjzF+bizZ+Ta+mepbF0bEGE/s+oTA1xPgk/5wPM62dTgZCXwhnJBSisf7N+e14a3ZsD+FUTFbSMnMt30h9aLg/rUw7EOjTf/jPvDNJMg8ZftanIAEvhBO7I7O9fn47kgOJmcx4qNNHE6xQ3u6yQTt7zSGaej+D9i5yBimYeO7xtDMwmIk8IVwctHhtflyQhdy8ou55aNfiDt6xj6FePhCv5fhoa3QqCeseRE+7AL7vjdmPKMAAAz1SURBVAcHvtdYlUjgCyFoV8+fJQ92w8/TlTs+3sIP8SftV0xAYxj9Jdy1BEyu8OUomDcCkvfar6ZqQgJfCAFAw0AvljzYjRYhvjwwL47Ptxy1b0FN+hpz6w58AxLj4KNu8P0zkGunTyDVgAS+EKJMgLc7C8Z3oU+LYJ7/ZjdTV+61XV/9SzG7QpcH4dEd0HEsbJtptO/HzoaSYvvVVUU5ZODLg1dC2I+nm5kZd3Xkjs71+Wj9IR5f9DsFRXaeytArEAa/AxN+gqBwWPEYzLwejmy0b11VjDx4JYS4JK01H64/xJur9tGjSSAf3dUBHw9Xe5dl3MDd8w388DxkHIOWN0P/V8C/vr0rcxjy4JUQ4qoopXjohib859a2bDmcyq0zNnPqrAPMZqUUtBoOD2+HG56D/avgg06w7jVjrB5RLgl8IcRl3dIxjNnjOnEsLYfh0zdx4FSmvUsyuHrC9U/BI7HQYjD8NNUI/l2LpRtnOSTwhRBX1KtZEIse6EphieaWj35h6+FUe5f0F78wGPkJ3LPSaOtfch/MHggnfrN3ZQ5HAl8IUSGt6vrx9aRuBPm4M+aTbXy7M8neJV2oQVcYvw6Gvg9phyCmNyx7BLJS7F2Zw5CbtkKIq5KeU8D4ubFsP3KGtmF+RIfXJjo8mJYhvihHGe0yLwN+mgZbZ4BrDaPpJ2oiuLjZuzKbKO+mrQS+EOKq5RUW8+mmI/yw5yS/HUtHa6jr51EW/l0bB+DuYrZ3mXD6AKz6Fxz4AQKawIDXoVl/e1dldRL4QgirSMnMZ93eZNYknOLnA6fJLSymhpuZnk0DiQ6vTZ8WwQR6u9u3yP0/wKpnIfUgNO0PA16DwKb2rcmKJPCFEFaXV1jM5sOprE04xdqEZJIy8lAK2tfzJzq8Nn3Da9Ostrd9mn6KCmBbjNGbpzAHOj9gNPV4+Nm+FiuTwBdC2JTWmvgTZ1mbkMzavafYmWg8OR9W05O+peEf1agWbi427juSlQI/vgw7PocaARD9ArS/C0wO0ARlIRL4Qgi7OnU2zwj/hFNsPHia/KISfNxd6NUsiOjwYG5oHkxNLxveVD3xG6x8Bv7cDCFtYeBUo6dPNeAQga+U8gJ+Al7SWq+40voS+EJUT7kFxWw6eJq1e42mn+TMfEwKOjaoWdb00zjIy/pNP1rD7iWw+gU4exwibjHG5PcLs+5xraxSga+Umg0MBpK11hHnLR8I/BcwA7O01m9cYT8vA1nAHgl8IQRASYlm94kM1uw5xZqEZPYknQWgYUCNsl4/nRrWsu5E6wU5sOm/sOldQEGPx6DbI+BWw3rHtKLKBn4vjKCeey7wlVJmYD/QD0gEtgOjMcL/9Yt2cS/QFggAPIDTEvhCiEs5kZ7L2r1G088vB1MpKC7B18OF65sH0zc8mN7NgvGrYaVB3NL/NK72478Gv3rG1X6r4cb4PVVIpZt0lFINgRXnBX5XjKaZAaWvnwXQWl8c9ue2fxXwAloCucBwrfXfxlxVSk0AJgDUr1+/49Gjdp6EQQhhN9n5Rfx84DRrE06xbl8yp7MKMJsUnRrWpG94baLDa9Mo0MvyBz6y0Zhs5dQuaNDdmIQlpI3lj2Ml1gj8kcBArfX9pa/HAJ211g9fYT/jkCt8IcRVKinR/JaYXtblc+9JYxC364K8ynr9dKjvj4ulmn5KimHHXPjxFWOWrQ5joc+/jfF6HJzDBH4FjzUEGNKkSZPxBw4cqOzuhBDV0LG0HCP89yaz5XAqhcUa/xqu3NA8mOjwYHo1C8LXEuP3554xhmnYFgOuXtD7GYgab8zG5aDs3qRzLeQKXwhREZl5hfx84DRr9hhNP2dyCnExKTpfV6vs6r9erUregE3ZZ3TjPPQjBDaDga8b8+46IGsEvgvGTdto4DjGTds7tNbxFqpZAl8IcdWKSzQ7/jzDmtKmn4PJWQA0q+1d2uUzmHb1amI2XcONWK2NCVdWPQtph6HZIBjwKgQ0tvC7qJzK9tL5EugNBAKngBe11p8opW4E3sXomTNba/2qhYqVJh0hhEUcOZ1dFv7bj6RRVKIJ8HKjd/Ng+rUMpmfTILzcXa5up0X5xkicP70JRXnGROu9ngQPX+u8iavkEA9eXS25whdCWFJGbiE/7U8xev3sTeZsXhFuZhNdGgfQNzyY6PDahPp7VnyHmadg7cvw2zzwCoa+L0LbO8Bk36lGJPCFEOI8hcUlxB45U3bj94/Txny44SG+ZeHfJtQPU0Wafo7HGd04E7dB3fYwaBrUi7LyOyhflQp8adIRQtjaoZQs1iYYT/vGHkmjREOQjzt9Snv99GgaSA23yzT9aA27vjIe3MpMgta3Qb/J4FvXdm+iVJUK/HPkCl8IYQ9nsgtYvz+ZNQnJbNiXQmZ+Ee4uJro3CSQ6PJjoFrWp4+dx6Y3zs2DjO/DL+8YInD0fh66PgGs561uBBL4QQlyDgqISth9JY/WeU6zde4pjabkARIT6Et3C6PIZEXqJ6R3PHIEfnoeEZeBfH/q/CuFDbDJMQ5UKfGnSEUI4Iq01B5Kzynr97PjzDFpDHV8P+oQbY/10axyIh+t5Y+sf/glWPgvJ8dCwJwyaCrVbWbXOKhX458gVvhDCkaVm5bNuXwpr9pzi5wMpZBcU4+lqpnuTQPqGB9MnPJhgHw8oLoIdc+DHKcYE65H3wg3PQY1aVqlLAl8IIawov6iYLYfTysb6OZ5uNP20redP3xZGr59w/yLU+jdg+yxw9zFCP/JeMF/lcwBXIIEvhBA2orUmISnT6PWzN5nfj6UDEOrvSZ8WwQytm0HHhKmY/vgJgsKNYRoa32Cx41epwJc2fCFEdZKcmce6vUavn40HTpNbWIyXm4mH6x5gzNmZeOckQovB0P8VqHVdpY9XpQL/HLnCF0JUN3mFxfxy6DRrSuf3TT+byb0uK3nU9RvcKCa93URqDXgaVYlhGiTwhRDCwWitiT9xljUJp9gRv4dhp2dxi/lnTquanB70MS2i+l3TfssLfMveKRBCCFFhSikiQv2ICPWDvs04mTGQlVtWE/brOwTXD7f48STwhRDCQdTx82DggCEwYIhV9m/fId3KoZQaopSKycjIsHcpQghRbThk4Gutl2utJ/j5+dm7FCGEqDYcMvCFEEJYngS+EEI4CQl8IYRwEhL4QgjhJCTwhRDCSThk4Eu3TCGEsDyHHlpBKZUCHLV3HTYSCJy2dxF2JudAzgHIOYDKn4MGWuugixc6dOA7E6VU7KXGvnAmcg7kHICcA7DeOXDIJh0hhBCWJ4EvhBBOQgLfccTYuwAHIOdAzgHIOQArnQNpwxdCCCchV/hCCOEkJPCFEMJJSODbkFJqoFJqn1LqoFLqmUv8/nGl1B6l1E6l1FqlVAN71GltVzoP5613i1JKK6WqXRe9ipwDpdRtpf8e4pVSX9i6RmurwP8P9ZVS65RSv5b+P3GjPeq0JqXUbKVUslJqdzm/V0qp90rP0U6lVIdKHVBrLV82+ALMwCHgOsAN+B1oedE6NwA1Sn9+EFho77rtcR5K1/MBNgBbgEh7122HfwtNgV+BmqWvg+1dtx3OQQzwYOnPLYEj9q7bCuehF9AB2F3O728EvgcU0AXYWpnjyRW+7UQBB7XWh7XWBcACYNj5K2it12mtc0pfbgHCbFyjLVzxPJR6BZgK5NmyOBupyDkYD0zXWp8B0Fon27hGa6vIOdCAb+nPfsAJG9ZnE1rrDUDaZVYZBszVhi2Av1Iq5FqPJ4FvO6HAsfNeJ5YuK899GH/Zq5srnofSj631tNbf2rIwG6rIv4VmQDOl1Cal1Bal1ECbVWcbFTkHLwF3KaUSge+AR2xTmkO52ty4LJnE3AEppe4CIoHr7V2LrSmlTMDbwDg7l2JvLhjNOr0xPultUEq11lqn27Uq2xoNzNFa/0cp1RX4XCkVobUusXdhVZVc4dvOcaDeea/DSpddQCnVF3gOGKq1zrdRbbZ0pfPgA0QA65VSRzDaLZdVsxu3Ffm3kAgs01oXaq3/APZj/AGoLipyDu4DFgForTcDHhiDijmTCuVGRUng2852oKlSqpFSyg0YBSw7fwWlVHtgJkbYV7c223Muex601hla60CtdUOtdUOMexlDtdax9inXKq74bwH4BuPqHqVUIEYTz2FbFmllFTkHfwLRAEqpcIzAT7Fplfa3DLi7tLdOFyBDa510rTuTJh0b0VoXKaUeBlZh9FCYrbWOV0q9DMRqrZcBbwLewFdKKYA/tdZD7Va0FVTwPFRrFTwHq4D+Sqk9QDHwpNY61X5VW1YFz8H/AR8rpR7DuIE7Tpd2XakulFJfYvxhDyy9V/Ei4AqgtZ6Bce/iRuAgkAPcU6njVbPzJ4QQohzSpCOEEE5CAl8IIZyEBL4QQjgJCXwhhHASEvhCCOEkJPCFEMJJSOALIYST+H9EzU5JkOfOLAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure()\n", + "plt.plot(rec[:-2], pre[:-2], '+-', label=\"eeSOM\")\n", + "plt.plot(recsvm[:-1], presvm[:-1], '+-', label=\"ocsvm\")\n", + "plt.legend()\n", + "plt.yscale(\"log\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Predict (inductive)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mscore\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0melsom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# TODO tal vez se puede probar con algun dato artificial, por ahora ya estaria solo con esto.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/elsom_package/elsom/elsom.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, test_data)\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mh\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayers\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midxtst\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 296\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 297\u001b[0;31m \u001b[0mresh\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayers\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mh\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midxtst\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 298\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0midxtst\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0midxtst\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwhere\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresh\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/elsom_package/elsom/elsom.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, test_data)\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mresh\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mh0\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msoms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 163\u001b[0;31m \u001b[0mresh0\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msoms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mh0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 164\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresh\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0mresh\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresh0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/elsom_package/elsom/elsom.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnew\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 73\u001b[0m \"\"\"\n\u001b[0;32m---> 74\u001b[0;31m \u001b[0mbmus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_bmu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 75\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msom_labels\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mbmus\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/elsom_package/sompy/decorators.py\u001b[0m in \u001b[0;36mwrapped_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrapped_f\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mt0\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0mts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mround\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mt0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/elsom_package/sompy/sompy.py\u001b[0m in \u001b[0;36mfind_bmu\u001b[0;34m(self, input_matrix, njb, nth)\u001b[0m\n\u001b[1;32m 386\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 387\u001b[0m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0minput_matrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mrow_chunk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mcol_chunk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnjb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 388\u001b[0;31m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpool\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mchk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mchunk_bmu_finder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcodebook\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatrix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 389\u001b[0m \u001b[0mpool\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 390\u001b[0m \u001b[0mpool\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.8/multiprocessing/pool.py\u001b[0m in \u001b[0;36mmap\u001b[0;34m(self, func, iterable, chunksize)\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ma\u001b[0m \u001b[0mlist\u001b[0m \u001b[0mthat\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mreturned\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 363\u001b[0m '''\n\u001b[0;32m--> 364\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_map_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmapstar\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 365\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 366\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstarmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.8/multiprocessing/pool.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 760\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 761\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 762\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 763\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.8/multiprocessing/pool.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 757\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 758\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 759\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_event\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 760\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 761\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.8/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_flag\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 557\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 558\u001b[0;31m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cond\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 559\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 560\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.8/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 301\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 302\u001b[0;31m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 303\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 304\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# TODO ver si se puede armar lago artificial para probar el predict\n", + "# Predict (inductive)\n", + "# score = deesom.predict(train_data)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0c2e69b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +numpy +scikit-learn +pandas diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..1f1fe6f --- /dev/null +++ b/setup.py @@ -0,0 +1,22 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="deeSOM", + version="0.1", + author="L.A. Bugnon, C. Yones, D. Milone, G. Stegmayer", + author_email="lbugnon@sinc.unl.edu.ar", + description="Deep ensemble-elastic self-organized map (deesom): a SOM based classifier to deal with large and highly imbalanced data.", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/lbugnon/deesom", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', +) diff --git a/sompy/__init__.py b/sompy/__init__.py new file mode 100644 index 0000000..06b77cd --- /dev/null +++ b/sompy/__init__.py @@ -0,0 +1 @@ +from .sompy import SOMFactory diff --git a/sompy/codebook.py b/sompy/codebook.py new file mode 100644 index 0000000..b5e16c4 --- /dev/null +++ b/sompy/codebook.py @@ -0,0 +1,173 @@ +import numpy as np + +from sklearn.decomposition import PCA +#from sklearn.decomposition import RandomizedPCA# (randomizedpca is deprecated) +from .decorators import timeit + + +class InvalidNodeIndexError(Exception): + pass + + +class InvalidMapsizeError(Exception): + pass + + +class Codebook(object): + + def __init__(self, mapsize, lattice='rect'): + self.lattice = lattice + + if 2 == len(mapsize): + _size = [1, np.max(mapsize)] if 1 == np.min(mapsize) else mapsize + + elif 1 == len(mapsize): + _size = [1, mapsize[0]] + print('input was considered as the numbers of nodes') + print('map size is [{dlen},{dlen}]'.format(dlen=int(mapsize[0]/2))) + else: + raise InvalidMapsizeError( + "Mapsize is expected to be a 2 element list or a single int") + + self.mapsize = _size + self.nnodes = mapsize[0]*mapsize[1] + self.matrix = np.asarray(self.mapsize) + self.initialized = False + + @timeit() + def random_initialization(self, data): + """ + :param data: data to use for the initialization + :returns: initialized matrix with same dimension as input data + """ + mn = np.tile(np.min(data, axis=0), (self.nnodes, 1)) + mx = np.tile(np.max(data, axis=0), (self.nnodes, 1)) + self.matrix = mn + (mx-mn)*(np.random.rand(self.nnodes, data.shape[1])) + self.initialized = True + + @timeit() + def pca_linear_initialization(self, data): + """ + We initialize the map, just by using the first two first eigen vals and + eigenvectors + Further, we create a linear combination of them in the new map by + giving values from -1 to 1 in each + + X = UsigmaWT + XTX = Wsigma^2WT + T = XW = Usigma + + // Transformed by W EigenVector, can be calculated by multiplication + // PC matrix by eigenval too + // Further, we can get lower ranks by using just few of the eigen + // vevtors + + T(2) = U(2)sigma(2) = XW(2) ---> 2 is the number of selected + eigenvectors + + (*) Note that 'X' is the covariance matrix of original data + + :param data: data to use for the initialization + :returns: initialized matrix with same dimension as input data + """ + cols = self.mapsize[1] + coord = None + pca_components = None + + if np.min(self.mapsize) > 1: + coord = np.zeros((self.nnodes, 2)) + pca_components = 2 + + for i in range(0, self.nnodes): + coord[i, 0] = int(i / cols) # x + coord[i, 1] = int(i % cols) # y + + elif np.min(self.mapsize) == 1: + coord = np.zeros((self.nnodes, 1)) + pca_components = 1 + + for i in range(0, self.nnodes): + coord[i, 0] = int(i % cols) # y + + mx = np.max(coord, axis=0) + mn = np.min(coord, axis=0) + coord = (coord - mn)/(mx-mn) + coord = (coord - .5)*2 + me = np.mean(data, 0) + data = (data - me) + tmp_matrix = np.tile(me, (self.nnodes, 1)) + + # Randomized PCA is scalable + #pca = RandomizedPCA(n_components=pca_components) # RandomizedPCA is deprecated. + pca = PCA(n_components=pca_components, svd_solver='randomized') + + pca.fit(data) + eigvec = pca.components_ + eigval = pca.explained_variance_ + norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec)) + eigvec = ((eigvec.T/norms)*eigval).T + + for j in range(self.nnodes): + for i in range(eigvec.shape[0]): + tmp_matrix[j, :] = tmp_matrix[j, :] + coord[j, i]*eigvec[i, :] + + self.matrix = np.around(tmp_matrix, decimals=6) + self.initialized = True + + def grid_dist(self, node_ind): + """ + Calculates grid distance based on the lattice type. + + :param node_ind: number between 0 and number of nodes-1. Depending on + the map size, starting from top left + :returns: matrix representing the distance matrix + """ + if self.lattice == 'rect': + return self._rect_dist(node_ind) + + elif self.lattice == 'hexa': + return self._hexa_dist(node_ind) + + def _hexa_dist(self, node_ind): + raise NotImplementedError() + + def _rect_dist(self, node_ind): + """ + Calculates the distance of the specified node to the other nodes in the + matrix, generating a distance matrix + + Ej. The distance matrix for the node_ind=5, that corresponds to + the_coord (1,1) + array([[2, 1, 2, 5], + [1, 0, 1, 4], + [2, 1, 2, 5], + [5, 4, 5, 8]]) + + :param node_ind: number between 0 and number of nodes-1. Depending on + the map size, starting from top left + :returns: matrix representing the distance matrix + """ + rows = self.mapsize[0] + cols = self.mapsize[1] + dist = None + + # bmu should be an integer between 0 to no_nodes + if 0 <= node_ind <= (rows*cols): + node_col = int(node_ind % cols) + node_row = int(node_ind / cols) + else: + raise InvalidNodeIndexError( + "Node index '%s' is invalid" % node_ind) + + if rows > 0 and cols > 0: + r = np.arange(0, rows, 1)[:, np.newaxis] + c = np.arange(0, cols, 1) + dist2 = (r-node_row)**2 + (c-node_col)**2 + + dist = dist2.ravel() + else: + raise InvalidMapsizeError( + "One or both of the map dimensions are invalid. " + "Cols '%s', Rows '%s'".format(cols=cols, rows=rows)) + + return dist diff --git a/sompy/decorators.py b/sompy/decorators.py new file mode 100644 index 0000000..a3ae0fc --- /dev/null +++ b/sompy/decorators.py @@ -0,0 +1,21 @@ +import logging +from functools import wraps +from time import time + + +def timeit(log_level=logging.INFO, alternative_title=None): + def wrap(f): + @wraps(f) # keeps the f.__name__ outside the wrapper + def wrapped_f(*args, **kwargs): + t0 = time() + result = f(*args, **kwargs) + ts = round(time() - t0, 3) + + title = alternative_title or f.__name__ + logging.getLogger().log( + log_level, " %s took: %f seconds" % (title, ts)) + + return result + + return wrapped_f + return wrap diff --git a/sompy/neighborhood.py b/sompy/neighborhood.py new file mode 100644 index 0000000..a4d2c9c --- /dev/null +++ b/sompy/neighborhood.py @@ -0,0 +1,48 @@ +import numpy as np +import inspect +import sys + +small = .000000000001 + + +class NeighborhoodFactory(object): + + @staticmethod + def build(neighborhood_func): + for name, obj in inspect.getmembers(sys.modules[__name__]): + if inspect.isclass(obj): + if hasattr(obj, 'name') and neighborhood_func == obj.name: + return obj() + else: + raise Exception( + "Unsupported neighborhood function '%s'" % neighborhood_func) + + +class GaussianNeighborhood(object): + + name = 'gaussian' + + @staticmethod + def calculate(distance_matrix, radius, dim): + return np.exp(-1.0*distance_matrix/(2.0*radius**2)).reshape(dim, dim) + + def __call__(self, *args, **kwargs): + return self.calculate(*args) + + +class BubbleNeighborhood(object): + + name = 'bubble' + + @staticmethod + def calculate(distance_matrix, radius, dim): + def l(a, b): + c = np.zeros(b.shape) + c[a-b >= 0] = 1 + return c + + return l(radius, + np.sqrt(distance_matrix.flatten())).reshape(dim, dim) + small + + def __call__(self, *args, **kwargs): + return self.calculate(*args) diff --git a/sompy/normalization.py b/sompy/normalization.py new file mode 100644 index 0000000..b641b94 --- /dev/null +++ b/sompy/normalization.py @@ -0,0 +1,74 @@ +import numpy as np +import sys +import inspect + + +class NormalizatorFactory(object): + + @staticmethod + def build(type_name): + for name, obj in inspect.getmembers(sys.modules[__name__]): + if inspect.isclass(obj): + if hasattr(obj, 'name') and type_name == obj.name: + return obj() + else: + raise Exception("Unknown normalization type '%s'" % type_name) + + +class Normalizator(object): + + def normalize(self, data): + raise NotImplementedError() + + def normalize_by(self, raw_data, data): + raise NotImplementedError() + + def denormalize_by(self, raw_data, data): + raise NotImplementedError() + + +class VarianceNormalizator(Normalizator): + + name = 'var' + + def _mean_and_standard_dev(self, data): + return np.mean(data, axis=0), np.std(data, axis=0) + + def normalize(self, data): + me, st = self._mean_and_standard_dev(data) + st[st == 0] = 1 # prevent: when sd = 0, normalized result = NaN + return (data-me)/st + + def normalize_by(self, raw_data, data): + me, st = self._mean_and_standard_dev(raw_data) + st[st == 0] = 1 # prevent: when sd = 0, normalized result = NaN + return (data-me)/st + + def denormalize_by(self, data_by, n_vect): + me, st = self._mean_and_standard_dev(data_by) + return n_vect * st + me + + +class RangeNormalizator(Normalizator): + + name = 'range' + + +class LogNormalizator(Normalizator): + + name = 'log' + + +class LogisticNormalizator(Normalizator): + + name = 'logistic' + + +class HistDNormalizator(Normalizator): + + name = 'histd' + + +class HistCNormalizator(Normalizator): + + name = 'histc' diff --git a/sompy/sompy.py b/sompy/sompy.py new file mode 100644 index 0000000..306982c --- /dev/null +++ b/sompy/sompy.py @@ -0,0 +1,712 @@ +# -*- coding: utf-8 -*- + +# Author: Vahid Moosavi (sevamoo@gmail.com) +# Chair For Computer Aided Architectural Design, ETH Zurich +# Future Cities Lab +# www.vahidmoosavi.com + +# Contributor: Sebastian Packmann (sebastian.packmann@gmail.com) + + +import tempfile +import os +import itertools +import logging + +import numpy as np + +from time import time +from multiprocessing.dummy import Pool +from multiprocessing import cpu_count +from scipy.sparse import csr_matrix +from sklearn import neighbors +from sklearn.externals.joblib import Parallel, delayed, load, dump +import sys + +from .decorators import timeit +from .codebook import Codebook +from .neighborhood import NeighborhoodFactory +from .normalization import NormalizatorFactory + + + +class ComponentNamesError(Exception): + pass + + +class LabelsError(Exception): + pass + + +class SOMFactory(object): + + @staticmethod + def build(data, + mapsize=None, + mask=None, + mapshape='planar', + lattice='rect', + normalization='var', + initialization='pca', + neighborhood='gaussian', + training='batch', + name='sompy', + component_names=None): + """ + :param data: data to be clustered, represented as a matrix of n rows, + as inputs and m cols as input features + :param neighborhood: neighborhood object calculator. Options are: + - gaussian + - bubble + - manhattan (not implemented yet) + - cut_gaussian (not implemented yet) + - epanechicov (not implemented yet) + + :param normalization: normalizer object calculator. Options are: + - var + + :param mapsize: tuple/list defining the dimensions of the som. + If single number is provided is considered as the number of nodes. + :param mask: mask + :param mapshape: shape of the som. Options are: + - planar + - toroid (not implemented yet) + - cylinder (not implemented yet) + + :param lattice: type of lattice. Options are: + - rect + - hexa (not implemented yet) + + :param initialization: method to be used for initialization of the som. + Options are: + - pca + - random + + :param name: name used to identify the som + :param training: Training mode (seq, batch) + """ + if normalization: + normalizer = NormalizatorFactory.build(normalization) + else: + normalizer = None + neighborhood_calculator = NeighborhoodFactory.build(neighborhood) + return SOM(data, neighborhood_calculator, normalizer, mapsize, mask, + mapshape, lattice, initialization, training, name, component_names) + + +class SOM(object): + + def __init__(self, + data, + neighborhood, + normalizer=None, + mapsize=None, + mask=None, + mapshape='planar', + lattice='rect', + initialization='pca', + training='batch', + name='sompy', + component_names=None): + """ + Self Organizing Map + + :param data: data to be clustered, represented as a matrix of n rows, + as inputs and m cols as input features + :param neighborhood: neighborhood object calculator. + :param normalizer: normalizer object calculator. + :param mapsize: tuple/list defining the dimensions of the som. If + single number is provided is considered as the number of nodes. + :param mask: mask + :param mapshape: shape of the som. + :param lattice: type of lattice. + :param initialization: method to be used for initialization of the som. + :param name: name used to identify the som + :param training: Training mode (seq, batch) + """ + self._data = normalizer.normalize(data) if normalizer else data + self._normalizer = normalizer + self._dim = data.shape[1] + self._dlen = data.shape[0] + self._dlabel = None + self._bmu = None + + self.name = name + self.data_raw = data + self.neighborhood = neighborhood + self.mapshape = mapshape + self.initialization = initialization + self.mask = mask or np.ones([1, self._dim]) + mapsize = self.calculate_map_size(lattice) if not mapsize else mapsize + self.codebook = Codebook(mapsize, lattice) + self.training = training + self._component_names = self.build_component_names() if component_names is None else [component_names] + self._distance_matrix = self.calculate_map_dist() + + @property + def component_names(self): + return self._component_names + + @component_names.setter + def component_names(self, compnames): + if self._dim == len(compnames): + self._component_names = np.asarray(compnames)[np.newaxis, :] + else: + raise ComponentNamesError('Component names should have the same ' + 'size as the data dimension/features') + + def build_component_names(self): + cc = ['Variable-' + str(i+1) for i in range(0, self._dim)] + return np.asarray(cc)[np.newaxis, :] + + @property + def data_labels(self): + return self._dlabel + + @data_labels.setter + def data_labels(self, labels): + """ + Set labels of the training data, it should be in the format of a list + of strings + """ + if labels.shape == (1, self._dlen): + label = labels.T + elif labels.shape == (self._dlen, 1): + label = labels + elif labels.shape == (self._dlen,): + label = labels[:, np.newaxis] + else: + raise LabelsError('wrong label format') + + self._dlabel = label + + def build_data_labels(self): + cc = ['dlabel-' + str(i) for i in range(0, self._dlen)] + return np.asarray(cc)[:, np.newaxis] + + def calculate_map_dist(self): + """ + Calculates the grid distance, which will be used during the training + steps. It supports only planar grids for the moment + """ + nnodes = self.codebook.nnodes + + distance_matrix = np.zeros((nnodes, nnodes)) + for i in range(nnodes): + distance_matrix[i] = self.codebook.grid_dist(i).reshape(1, nnodes) + return distance_matrix + + @timeit() + def train(self, + n_job=1, + shared_memory=False, + verbose='info', + train_rough_len=None, + train_rough_radiusin=None, + train_rough_radiusfin=None, + train_finetune_len=None, + train_finetune_radiusin=None, + train_finetune_radiusfin=None, + train_len_factor=1, + maxtrainlen=np.Inf): + """ + Trains the som + + :param n_job: number of jobs to use to parallelize the traning + :param shared_memory: flag to active shared memory + :param verbose: verbosity, could be 'debug', 'info' or None + :param train_len_factor: Factor that multiply default training lenghts (similar to "training" parameter in the matlab version). (lbugnon) + """ + logging.root.setLevel( + getattr(logging, verbose.upper()) if verbose else logging.ERROR) + + logging.info(" Training...") + logging.debug(( + "--------------------------------------------------------------\n" + " details: \n" + " > data len is {data_len} and data dimension is {data_dim}\n" + " > map size is {mpsz0},{mpsz1}\n" + " > array size in log10 scale is {array_size}\n" + " > number of jobs in parallel: {n_job}\n" + " -------------------------------------------------------------\n") + .format(data_len=self._dlen, + data_dim=self._dim, + mpsz0=self.codebook.mapsize[0], + mpsz1=self.codebook.mapsize[1], + array_size=np.log10( + self._dlen * self.codebook.nnodes * self._dim), + n_job=n_job)) + + if self.initialization == 'random': + self.codebook.random_initialization(self._data) + + elif self.initialization == 'pca': + self.codebook.pca_linear_initialization(self._data) + + self.rough_train(njob=n_job, shared_memory=shared_memory, trainlen=train_rough_len, + radiusin=train_rough_radiusin, radiusfin=train_rough_radiusfin,trainlen_factor=train_len_factor,maxtrainlen=maxtrainlen) + self.finetune_train(njob=n_job, shared_memory=shared_memory, trainlen=train_finetune_len, + radiusin=train_finetune_radiusin, radiusfin=train_finetune_radiusfin,trainlen_factor=train_len_factor,maxtrainlen=maxtrainlen) + logging.debug( + " --------------------------------------------------------------") + logging.info(" Final quantization error: %f" % np.mean(self._bmu[1])) + + def _calculate_ms_and_mpd(self): + mn = np.min(self.codebook.mapsize) + max_s = max(self.codebook.mapsize[0], self.codebook.mapsize[1]) + + if mn == 1: + mpd = float(self.codebook.nnodes*10)/float(self._dlen) + else: + mpd = float(self.codebook.nnodes)/float(self._dlen) + ms = max_s/2.0 if mn == 1 else max_s + + return ms, mpd + + def rough_train(self, njob=1, shared_memory=False, trainlen=None, radiusin=None, radiusfin=None,trainlen_factor=1,maxtrainlen=np.Inf): + logging.info(" Rough training...") + + ms, mpd = self._calculate_ms_and_mpd() + #lbugnon: add maxtrainlen + trainlen = min(int(np.ceil(30*mpd)),maxtrainlen) if not trainlen else trainlen + #lbugnon: add trainlen_factor + trainlen=int(trainlen*trainlen_factor) + + if self.initialization == 'random': + radiusin = max(1, np.ceil(ms/3.)) if not radiusin else radiusin + radiusfin = max(1, radiusin/6.) if not radiusfin else radiusfin + + elif self.initialization == 'pca': + radiusin = max(1, np.ceil(ms/8.)) if not radiusin else radiusin + radiusfin = max(1, radiusin/4.) if not radiusfin else radiusfin + + self._batchtrain(trainlen, radiusin, radiusfin, njob, shared_memory) + + def finetune_train(self, njob=1, shared_memory=False, trainlen=None, radiusin=None, radiusfin=None,trainlen_factor=1,maxtrainlen=np.Inf): + logging.info(" Finetune training...") + + ms, mpd = self._calculate_ms_and_mpd() + + #lbugnon: add maxtrainlen + if self.initialization == 'random': + trainlen = min(int(np.ceil(50*mpd)),maxtrainlen) if not trainlen else trainlen + radiusin = max(1, ms/12.) if not radiusin else radiusin # from radius fin in rough training + radiusfin = max(1, radiusin/25.) if not radiusfin else radiusfin + + elif self.initialization == 'pca': + trainlen = min(int(np.ceil(40*mpd)),maxtrainlen) if not trainlen else trainlen + radiusin = max(1, np.ceil(ms/8.)/4) if not radiusin else radiusin + radiusfin = 1 if not radiusfin else radiusfin # max(1, ms/128) + + + + #lbugnon: add trainlen_factor + trainlen=int(trainlen_factor*trainlen) + + + self._batchtrain(trainlen, radiusin, radiusfin, njob, shared_memory) + + def _batchtrain(self, trainlen, radiusin, radiusfin, njob=1, + shared_memory=False): + radius = np.linspace(radiusin, radiusfin, trainlen) + + if shared_memory: + data = self._data + data_folder = tempfile.mkdtemp() + data_name = os.path.join(data_folder, 'data') + dump(data, data_name) + data = load(data_name, mmap_mode='r') + + else: + data = self._data + + bmu = None + + # X2 is part of euclidean distance (x-y)^2 = x^2 +y^2 - 2xy that we use + # for each data row in bmu finding. + # Since it is a fixed value we can skip it during bmu finding for each + # data point, but later we need it calculate quantification error + fixed_euclidean_x2 = np.einsum('ij,ij->i', data, data) + + logging.info(" radius_ini: %f , radius_final: %f, trainlen: %d\n" % + (radiusin, radiusfin, trainlen)) + + for i in range(trainlen): + t1 = time() + neighborhood = self.neighborhood.calculate( + self._distance_matrix, radius[i], self.codebook.nnodes) + bmu = self.find_bmu(data, njb=njob) + self.codebook.matrix = self.update_codebook_voronoi(data, bmu, + neighborhood) + + #lbugnon: ojo! aca el bmy[1] a veces da negativo, y despues de eso se rompe...hay algo raro ahi + qerror = (i + 1, round(time() - t1, 3), + np.mean(np.sqrt(bmu[1] + fixed_euclidean_x2))) #lbugnon: ojo aca me tiró un warning, revisar (commit sinc: 965666d3d4d93bcf48e8cef6ea2c41a018c1cb83 ) + #lbugnon + #ipdb.set_trace() + # + logging.info( + " epoch: %d ---> elapsed time: %f, quantization error: %f\n" % + qerror) + if np.any(np.isnan(qerror)): + logging.info("nan quantization error, exit train\n") + + #sys.exit("quantization error=nan, exit train") + + bmu[1] = np.sqrt(bmu[1] + fixed_euclidean_x2) + self._bmu = bmu + + @timeit(logging.DEBUG) + def find_bmu(self, input_matrix, njb=1, nth=1): + """ + Finds the best matching unit (bmu) for each input data from the input + matrix. It does all at once parallelizing the calculation instead of + going through each input and running it against the codebook. + + :param input_matrix: numpy matrix representing inputs as rows and + features/dimension as cols + :param njb: number of jobs to parallelize the search + :returns: the best matching unit for each input + """ + dlen = input_matrix.shape[0] + y2 = np.einsum('ij,ij->i', self.codebook.matrix, self.codebook.matrix) + if njb == -1: + njb = cpu_count() + + pool = Pool(njb) + chunk_bmu_finder = _chunk_based_bmu_find + + def row_chunk(part): + return part * dlen // njb + + def col_chunk(part): + return min((part+1)*dlen // njb, dlen) + + chunks = [input_matrix[row_chunk(i):col_chunk(i)] for i in range(njb)] + b = pool.map(lambda chk: chunk_bmu_finder(chk, self.codebook.matrix, y2, nth=nth), chunks) + pool.close() + pool.join() + bmu = np.asarray(list(itertools.chain(*b))).T + del b + return bmu + + @timeit(logging.DEBUG) + def update_codebook_voronoi(self, training_data, bmu, neighborhood): + """ + Updates the weights of each node in the codebook that belongs to the + bmu's neighborhood. + + First finds the Voronoi set of each node. It needs to calculate a + smaller matrix. + Super fast comparing to classic batch training algorithm, it is based + on the implemented algorithm in som toolbox for Matlab by Helsinky + University. + + :param training_data: input matrix with input vectors as rows and + vector features as cols + :param bmu: best matching unit for each input data. Has shape of + (2, dlen) where first row has bmu indexes + :param neighborhood: matrix representing the neighborhood of each bmu + + :returns: An updated codebook that incorporates the learnings from the + input data + """ + row = bmu[0].astype(int) + col = np.arange(self._dlen) + val = np.tile(1, self._dlen) + P = csr_matrix((val, (row, col)), shape=(self.codebook.nnodes, + self._dlen)) + S = P.dot(training_data) + + # neighborhood has nnodes*nnodes and S has nnodes*dim + # ---> Nominator has nnodes*dim + nom = neighborhood.T.dot(S) + nV = P.sum(axis=1).reshape(1, self.codebook.nnodes) + denom = nV.dot(neighborhood.T).reshape(self.codebook.nnodes, 1) + new_codebook = np.divide(nom, denom) + + return np.around(new_codebook, decimals=6) + + def project_data(self, data): + """ + Projects a data set to a trained SOM. It is based on nearest + neighborhood search module of scikitlearn, but it is not that fast. + """ + clf = neighbors.KNeighborsClassifier(n_neighbors=1) + labels = np.arange(0, self.codebook.matrix.shape[0]) + clf.fit(self.codebook.matrix, labels) + + # The codebook values are all normalized + # we can normalize the input data based on mean and std of + # original data + if self._normalizer is not None: + data = self._normalizer.normalize_by(self.data_raw, data) + + return clf.predict(data) + + def predict_by(self, data, target, k=5, wt='distance'): + # here it is assumed that target is the last column in the codebook + # and data has dim-1 columns + dim = self.codebook.matrix.shape[1] + ind = np.arange(0, dim) + indX = ind[ind != target] + x = self.codebook.matrix[:, indX] + y = self.codebook.matrix[:, target] + n_neighbors = k + clf = neighbors.KNeighborsRegressor(n_neighbors, weights=wt) + clf.fit(x, y) + + # The codebook values are all normalized + # we can normalize the input data based on mean and std of + # original data + dimdata = data.shape[1] + + if dimdata == dim: + data[:, target] = 0 + data = self._normalizer.normalize_by(self.data_raw, data) + data = data[:, indX] + + elif dimdata == dim-1: + data = self._normalizer.normalize_by(self.data_raw[:, indX], data) + + predicted_values = clf.predict(data) + predicted_values = self._normalizer.denormalize_by( + self.data_raw[:, target], predicted_values) + return predicted_values + + def predict(self, x_test, k=5, wt='distance'): + """ + Similar to SKlearn we assume that we have X_tr, Y_tr and X_test. Here + it is assumed that target is the last column in the codebook and data + has dim-1 columns + + :param x_test: input vector + :param k: number of neighbors to use + :param wt: method to use for the weights + (more detail in KNeighborsRegressor docs) + :returns: predicted values for the input data + """ + target = self.data_raw.shape[1]-1 + x_train = self.codebook.matrix[:, :target] + y_train = self.codebook.matrix[:, target] + clf = neighbors.KNeighborsRegressor(k, weights=wt) + clf.fit(x_train, y_train) + + # The codebook values are all normalized + # we can normalize the input data based on mean and std of + # original data + x_test = self._normalizer.normalize_by( + self.data_raw[:, :target], x_test) + predicted_values = clf.predict(x_test) + + return self._normalizer.denormalize_by( + self.data_raw[:, target], predicted_values) + + def find_k_nodes(self, data, k=5): + from sklearn.neighbors import NearestNeighbors + # we find the k most similar nodes to the input vector + neighbor = NearestNeighbors(n_neighbors=k) + neighbor.fit(self.codebook.matrix) + + # The codebook values are all normalized + # we can normalize the input data based on mean and std of + # original data + return neighbor.kneighbors( + self._normalizer.normalize_by(self.data_raw, data)) + + def bmu_ind_to_xy(self, bmu_ind): + """ + Translates a best matching unit index to the corresponding + matrix x,y coordinates. + + :param bmu_ind: node index of the best matching unit + (number of node from top left node) + :returns: corresponding (x,y) coordinate + """ + rows = self.codebook.mapsize[0] + cols = self.codebook.mapsize[1] + + # bmu should be an integer between 0 to no_nodes + out = np.zeros((bmu_ind.shape[0], 3)) + out[:, 2] = bmu_ind + out[:, 0] = rows-1-bmu_ind / cols + out[:, 0] = bmu_ind / cols + out[:, 1] = bmu_ind % cols + + return out.astype(int) + + def cluster(self, n_clusters=8): + import sklearn.cluster as clust + + if self._normalizer is not None: + cl_labels = clust.KMeans(n_clusters=n_clusters).fit_predict( + self._normalizer.denormalize_by(self.data_raw, + self.codebook.matrix)) + else: + cl_labels = clust.KMeans(n_clusters=n_clusters).fit_predict( + self.codebook.matrix) + + self.cluster_labels = cl_labels + return cl_labels + + def predict_probability(self, data, target, k=5): + """ + Predicts probability of the input data to be target + + :param data: data to predict, it is assumed that 'target' is the last + column in the codebook, so data hould have dim-1 columns + :param target: target to predict probability + :param k: k parameter on KNeighborsRegressor + :returns: probability of data been target + """ + dim = self.codebook.matrix.shape[1] + ind = np.arange(0, dim) + indx = ind[ind != target] + x = self.codebook.matrix[:, indx] + y = self.codebook.matrix[:, target] + + clf = neighbors.KNeighborsRegressor(k, weights='distance') + clf.fit(x, y) + + # The codebook values are all normalized + # we can normalize the input data based on mean and std of + # original data + dimdata = data.shape[1] + + if dimdata == dim: + data[:, target] = 0 + data = self._normalizer.normalize_by(self.data_raw, data) + data = data[:, indx] + + elif dimdata == dim-1: + data = self._normalizer.normalize_by(self.data_raw[:, indx], data) + + weights, ind = clf.kneighbors(data, n_neighbors=k, + return_distance=True) + weights = 1./weights + sum_ = np.sum(weights, axis=1) + weights = weights/sum_[:, np.newaxis] + labels = np.sign(self.codebook.matrix[ind, target]) + labels[labels >= 0] = 1 + + # for positives + pos_prob = labels.copy() + pos_prob[pos_prob < 0] = 0 + pos_prob *= weights + pos_prob = np.sum(pos_prob, axis=1)[:, np.newaxis] + + # for negatives + neg_prob = labels.copy() + neg_prob[neg_prob > 0] = 0 + neg_prob = neg_prob * weights * -1 + neg_prob = np.sum(neg_prob, axis=1)[:, np.newaxis] + + return np.concatenate((pos_prob, neg_prob), axis=1) + + def node_activation(self, data, target=None, wt='distance'): + weights, ind = None, None + + if not target: + clf = neighbors.KNeighborsClassifier( + n_neighbors=self.codebook.nnodes) + labels = np.arange(0, self.codebook.matrix.shape[0]) + clf.fit(self.codebook.matrix, labels) + + # The codebook values are all normalized + # we can normalize the input data based on mean and std of + # original data + data = self._normalizer.normalize_by(self.data_raw, data) + weights, ind = clf.kneighbors(data) + + # Softmax function + weights = 1./weights + + return weights, ind + + def calculate_topographic_error(self): + bmus1 = self.find_bmu(self.data_raw, njb=1, nth=1) + bmus2 = self.find_bmu(self.data_raw, njb=1, nth=2) + bmus_gap = np.abs((self.bmu_ind_to_xy(np.array(bmus1[0]))[:, 0:2] - self.bmu_ind_to_xy(np.array(bmus2[0]))[:, 0:2]).sum(axis=1)) + return np.mean(bmus_gap != 1) + + def calculate_map_size(self, lattice): + """ + Calculates the optimal map size given a dataset using eigenvalues and eigenvectors. Matlab ported + :lattice: 'rect' or 'hex' + :return: map sizes + """ + D = self.data_raw.copy() + dlen = D.shape[0] + dim = D.shape[1] + munits = np.ceil(5 * (dlen ** 0.5)) + A = np.ndarray(shape=[dim, dim]) + np.Inf + + for i in range(dim): + D[:, i] = D[:, i] - np.mean(D[np.isfinite(D[:, i]), i]) + + for i in range(dim): + for j in range(dim): + c = D[:, i] * D[:, j] + c = c[np.isfinite(c)] + A[i, j] = sum(c) / len(c) + A[j, i] = A[i, j] + + VS = np.linalg.eig(A) + eigval = sorted(np.linalg.eig(A)[0]) + if eigval[-1] == 0 or eigval[-2] * munits < eigval[-1]: + ratio = 1 + else: + ratio = np.sqrt(eigval[-1] / eigval[-2]) + + if lattice == "rect": + size1 = min(munits, round(np.sqrt(munits / ratio))) + else: + size1 = min(munits, round(np.sqrt(munits / ratio*np.sqrt(0.75)))) + + size2 = round(munits / size1) + + return [int(size1), int(size2)] + + +# Since joblib.delayed uses Pickle, this method needs to be a top level +# method in order to be pickled +# Joblib is working on adding support for cloudpickle or dill which will allow +# class methods to be pickled +# when that that comes out we can move this to SOM class +def _chunk_based_bmu_find(input_matrix, codebook, y2, nth=1): + """ + Finds the corresponding bmus to the input matrix. + + :param input_matrix: a matrix of input data, representing input vector as + rows, and vectors features/dimention as cols + when parallelizing the search, the input_matrix can be + a sub matrix from the bigger matrix + :param codebook: matrix of weights to be used for the bmu search + :param y2: + """ + dlen = input_matrix.shape[0] + nnodes = codebook.shape[0] + bmu = np.empty((dlen, 2)) + + # It seems that small batches for large dlen is really faster: + # that is because of ddata in loops and n_jobs. for large data it slows + # down due to memory needs in parallel + blen = min(50, dlen) + i0 = 0 + + while i0+1 <= dlen: + low = i0 + high = min(dlen, i0+blen) + i0 = i0+blen + ddata = input_matrix[low:high+1] + d = np.dot(codebook, ddata.T) + d *= -2 + d += y2.reshape(nnodes, 1) + bmu[low:high+1, 0] = np.argpartition(d, nth, axis=0)[nth-1] + bmu[low:high+1, 1] = np.partition(d, nth, axis=0)[nth-1] + del ddata + + return bmu + + + diff --git a/sompy/visualization/__init__.py b/sompy/visualization/__init__.py new file mode 100755 index 0000000..3d7dfea --- /dev/null +++ b/sompy/visualization/__init__.py @@ -0,0 +1 @@ +from . import dotmap, histogram, hitmap, mapview, umatrix diff --git a/sompy/visualization/__pycache__/__init__.cpython-38.pyc b/sompy/visualization/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af6d599f20003155a0be3091f9ad3594becec97f GIT binary patch literal 254 zcmYk0J#ND=4232CB`uJlH^}NGIzWmbC+HGn(NIB%1e>U68M2%>35FT&7;?$6kOh{q#7b6J%bGJ&99Uca za9V5Y)EFO8kLjU@Df%vGOid4eqXo^+VBgbXMjZouytGYXAs-55MwL-#G=wh}mbuTk zZAI_V3TO3Tf*00J9@VH%Z@ROB`tc&pFwNS*r;gwUp%e@dmD-G#w0uTyW@mT#$6qqL E1MCq%i~s-t literal 0 HcmV?d00001 diff --git a/sompy/visualization/__pycache__/dotmap.cpython-38.pyc b/sompy/visualization/__pycache__/dotmap.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..319248a48d21432190f6501ae2b4b661796810ed GIT binary patch literal 2081 zcma)-OK%)S5PE)8-m>)n+_AwU@oWd3HEBb~wSxB4_LV~JZJ8^`N=uy{Hb=OqCzOMPa-;W5C2iMPi zy$RZ{Xq+tqjLT5lw?RlEX+cUFQ%WS0ykI4dh0{gh#r{W zvGvDUJ{oHqG%9OSl??(b+N`)~xy(x|hIP@#D^;DSdY0r>f_vj^=s2}49a}nkNYo0% z{_~yfab0HHMV3}l)!SLo)@9P9!@YErZG+y-w{Pd|G%fOjROfZIEo)t-%~mtF-Mq?m zax))IRTe_576TI6qia;aa~__93r{w5>x}!*L4)p4R-xjjZgc+vMH3U;p;yRPzb_(9 z519!~$3(cqbQ)bvf{`g*1h9$l1DAfm>D@q+siCAW5ExLAQ%29lc<_)54qD==3EbESvj zN&7PXllIPt9KkgJk7%jg3LONUB?Q2>NcZzrA$G?x2uruD*r}^*5L!C4;&z$tqs%Ky zTkDsp+RK#fWO9_jY1ay9fP=DLS~TNy^l0`JK*e{T)IZA(D=fgc1YVO{HRn@_137P8D`op?bvbCaY4_XPCy z89Lz$0Ba&9-j{H<7lEONKGeW?N8s@oM|O`PGKQGXBq`QXQp}Myhs-L8&eBDR; z|HN8)!nJadG&x+sECefF#f&IB_hA+e$7m-GKb2n;A50T<%wW5Y@v6H#rQSjf`5b+- z{{7Wo4t~CLlDZyV8nD#J z#I*TAX8XJ46tb2~be-(aZLd-FBpd3)0q3>jw8_*O&f!fAEcCnqs`E-)4z$HRjhQkj z(s@1A@#+caZIa5#wAF*|l3oruF7C|6`EZ;-3RTFNqf_CJL%o7xfZ}Bouc1JU{{?D0 zu4h}&(V`&qIpzbbFCwrk1R#6#0u2cx>~CLeILmjI;NjD-Vw?eYTf4I}@NJkRRa$0A zVxuG}YdI}Y?U s`G+m?U4=jH<%xOK6n+Eh1#G}LXV(6B02{u$oZNf(2XmwJV3{HR2V)%m0{{R3 literal 0 HcmV?d00001 diff --git a/sompy/visualization/__pycache__/view.cpython-38.pyc b/sompy/visualization/__pycache__/view.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36e5a8d8c00c82cdfd39671bd18ec443691269fb GIT binary patch literal 2416 zcmbVNNpB-X7_I8v+f25J*cX9-Mj~Mu;DS&j#IQ)1LjngMXmz^EaeLZsQ{9;)R-Z_u zIe|EHKr=`GrMhxJ`3oEv-d7zbb|ylCvg-9$Ro(UN&);=AO@{H??&=S|&)6TdSR4)( zJDA100Ko*$*ogNm&c0y6680GrwzN;}o`ba`T&!K`VC@QTmxca0zG5NQ-sEJGfSx={@BqbNR33%D33_odK{QXZF4kxu8QwAMkAXJ&KO(oP|>zCyH_ z3JAzzMQKoEuWA!?2yOwi6RQKDz_MxyX4N*i5n@*Qv!JjR|hTY(+e=JdwK#22ji5^&j=>knC< zo91l%OLK0B+hXO6_k6J`*3hoOLul_c(6V=Q>zlYFCZJf?9vg?B>RRfF+oYd(r(`Ai z_S70$=wAzc>!4pf^l8u!s>bhD)%=BXuoF7gzO2R2(Qa`#kxIRaD@?4k6ZO*rMfajo z8&Q;GaZyCkRVA9&ULd`cC}yp33*}~U>u<}dr(;hy7JRt5PpCSnN==1QKUeG#dyKeW zS;vvrB+Cm)Ce3)=AXmw&+aijD%%bQu+%PYi^=0Mee5qbZHznNYCNC;BMAw!45m;}? z9-ByA^lNAPms4f%p&iNP(E)a3gN9W3XGF%&tX&DCHRQoLxOAG5MR89S&%Ud zo>)(&D~l=pe>AA5b*}3oS|S_f6}+qm?022~?0f2cy5|D`Quq?dFB7~%U`X#Rl1y%w z=}&hqnmUmg8fU?j*XavW%tQwiqJma|HD|+HKG@I+{H9|FqZ@j 0: + txt = "" + ax.annotate(txt, (cents[i, 1] + 0.5, cents[-(i+1), 0] + 0.5), va="center", ha="center", size=fontsize) + + def show(self, som, anotate=True, onlyzeros=False, labelsize=7, cmap="jet", logaritmic = False): + (self.width, self.height, indtoshow, no_row_in_plot, no_col_in_plot, + axis_num) = self._calculate_figure_params(som, 1, 1) + + self.prepare() + ax = plt.gca() + counts = Counter(som._bmu[0]) + counts = [counts.get(x, 0) for x in range(som.codebook.mapsize[0] * som.codebook.mapsize[1])] + mp = np.array(counts).reshape(som.codebook.mapsize[0], + som.codebook.mapsize[1]) + + if not logaritmic: + norm = matplotlib.colors.Normalize( + vmin=0, + vmax=np.max(mp.flatten()), + clip=True) + else: + norm = matplotlib.colors.LogNorm( + vmin=1, + vmax=np.max(mp.flatten())) + + msz = som.codebook.mapsize + + cents = som.bmu_ind_to_xy(np.arange(0, msz[0] * msz[1])) + + if anotate: + self._set_labels(cents, ax, counts, onlyzeros, labelsize) + + + pl = plt.pcolor(mp[::-1], norm=norm, cmap=cmap) + + plt.axis([0, som.codebook.mapsize[1], 0, som.codebook.mapsize[0]]) + ax.set_yticklabels([]) + ax.set_xticklabels([]) + plt.colorbar(pl) + + plt.show() diff --git a/sompy/visualization/dotmap.py b/sompy/visualization/dotmap.py new file mode 100644 index 0000000..7ba10ff --- /dev/null +++ b/sompy/visualization/dotmap.py @@ -0,0 +1,68 @@ +from .view import MatplotView +from matplotlib import pyplot as plt +import numpy as np + + +class DotMapView(MatplotView): + + def init_figure(self, dim, cols): + no_row_in_plot = dim/cols + 1 + no_col_in_plot = dim if no_row_in_plot <= 1 else cols + h = .1 + w = .1 + self.width = no_col_in_plot*2.5*(1+w) + self.height = no_row_in_plot*2.5*(1+h) + self.prepare() + + def plot(self, data, coords, msz0, msz1, colormap, dlen, dim, rows, cols): + for i in range(dim): + plt.subplot(rows, cols, i+1) + + # This uses the colors uniquely for each record, while in normal + # views, it is based on the values within each dimensions. This is + # important when we are dealing with time series. Where we don't + # want to normalize colors within each time period, rather we like + # to see the patterns of each data records in time. + mn = np.min(data[:, :], axis=1) + mx = np.max(data[:, :], axis=1) + + for j in range(dlen): + plt.scatter(coords[j, 1], + msz0-1-coords[j, 0], + c=data[j, i], + vmax=mx[j], vmin=mn[j], + s=90, + marker='.', + edgecolor='None', + cmap=colormap, + alpha=1) + + eps = .0075 + plt.xlim(0-eps, msz1-1+eps) + plt.ylim(0-eps, msz0-1+eps) + plt.xticks([]) + plt.yticks([]) + + def show(self, som, which_dim='all', colormap=None, cols=None): + plt.cm.get_cmap(colormap) if colormap else plt.cm.get_cmap('RdYlBu_r') + + data = som.data_raw + msz0, msz1 = som.codebook.mapsize + coords = som.bmu_ind_to_xy(som.project_data(data))[:, :2] + cols = cols if cols else 8 # 8 is arbitrary + rows = data.shape[1]/cols+1 + + if which_dim == 'all': + dim = data.shape[0] + self.init_figure(dim, cols) + self.plot(data, coords, msz0, msz1, colormap, data.shape[0], + data.shape[1], rows, cols) + + else: + dim = 1 if type(which_dim) is int else len(which_dim) + self.init_figure(dim, cols) + self.plot(data, coords, msz0, msz1, colormap, data.shape[0], + len(which_dim), rows, cols) + + plt.tight_layout() + plt.subplots_adjust(hspace=.16, wspace=.05) diff --git a/sompy/visualization/histogram.py b/sompy/visualization/histogram.py new file mode 100644 index 0000000..cc71996 --- /dev/null +++ b/sompy/visualization/histogram.py @@ -0,0 +1,59 @@ +from .view import MatplotView +from matplotlib import pyplot as plt +from matplotlib import cm +from matplotlib.colors import LogNorm +import numpy as np + + +class Hist2d(MatplotView): + + def _fill_hist(self, x, y, mapsize, data_coords, what='train'): + x = np.arange(.5, mapsize[1]+.5, 1) + y = np.arange(.5, mapsize[0]+.5, 1) + X, Y = np.meshgrid(x, y) + + if what == 'train': + a = plt.hist2d(x, y, bins=(mapsize[1], mapsize[0]), alpha=.0, + cmap=cm.jet) + area = a[0].T * 12 + plt.scatter(data_coords[:, 1], mapsize[0] - .5 - data_coords[:, 0], + s=area.flatten(), alpha=.9, c='None', marker='o', + cmap='jet', linewidths=3, edgecolor='r') + + else: + a = plt.hist2d(x, y, bins=(mapsize[1], mapsize[0]), alpha=.0, + cmap=cm.jet, norm=LogNorm()) + area = a[0].T*50 + plt.scatter(data_coords[:, 1] + .5, + mapsize[0] - .5 - data_coords[:, 0], + s=area, alpha=0.9, c='None', marker='o', cmap='jet', + linewidths=3, edgecolor='r') + plt.scatter(data_coords[:, 1]+.5, mapsize[0]-.5-data_coords[:, 0], + s=area, alpha=0.2, c='b', marker='o', cmap='jet', + linewidths=3, edgecolor='r') + + plt.xlim(0, mapsize[1]) + plt.ylim(0, mapsize[0]) + + def show(self, som, data=None): + # First Step: show the hitmap of all the training data + coord = som.bmu_ind_to_xy(som.project_data(som.data_raw)) + + self.prepare() + + ax = self._fig.add_subplot(111) + ax.xaxis.set_ticks([i for i in range(0, som.codebook.mapsize[1])]) + ax.yaxis.set_ticks([i for i in range(0, som.codebook.mapsize[0])]) + ax.xaxis.set_ticklabels([]) + ax.yaxis.set_ticklabels([]) + ax.grid(True, linestyle='-', linewidth=.5) + + self._fill_hist(coord[:, 1], coord[:, 0], som.codebook.mapsize, + som.bmu_ind_to_xy(np.arange(som.codebook.nnodes))) + + if data: + coord_d = som.bmu_ind_to_xy(som.project_data(data)) + self._fill_hist(coord[:, 1], coord[:, 0], som.codebook.mapsize, + coord_d, 'data') + + plt.show() diff --git a/sompy/visualization/hitmap.py b/sompy/visualization/hitmap.py new file mode 100644 index 0000000..96cf92b --- /dev/null +++ b/sompy/visualization/hitmap.py @@ -0,0 +1,37 @@ +from .view import MatplotView +from matplotlib import pyplot as plt +import numpy as np +import ipdb + + +class HitMapView(MatplotView): + + def _set_labels(self, cents, ax, labels): + for i, txt in enumerate(labels): + ax.annotate(txt, (cents[i, 1], cents[i, 0]), size=10, va="center") + + def show(self, som, data=None): + + try: + codebook = getattr(som, 'cluster_labels') + except: + codebook = som.cluster() + + # codebook = getattr(som, 'cluster_labels', som.cluster()) + msz = som.codebook.mapsize + self.prepare() + ax = self._fig.add_subplot(111) + + if len(data)>1: + proj = som.project_data(data) + cents = som.bmu_ind_to_xy(proj) + self._set_labels(cents, ax, codebook[proj]) + + else: + cents = som.bmu_ind_to_xy(np.arange(0, msz[0]*msz[1])) + self._set_labels(cents, ax, codebook) + + plt.imshow(codebook.reshape(msz[0], msz[1])[::], alpha=.5) + plt.show() + + return cents diff --git a/sompy/visualization/mapview.py b/sompy/visualization/mapview.py new file mode 100644 index 0000000..dd6d635 --- /dev/null +++ b/sompy/visualization/mapview.py @@ -0,0 +1,187 @@ +import matplotlib +from .view import MatplotView +from matplotlib import pyplot as plt +import numpy as np +import ipdb + + +class MapView(MatplotView): + + def _calculate_figure_params(self, som, which_dim, col_sz): + + # add this to avoid error when normalization is not used + if som._normalizer is not None: + codebook = som._normalizer.denormalize_by(som.data_raw, som.codebook.matrix) + else: + codebook = som.codebook.matrix + + indtoshow, sV, sH = None, None, None + + if which_dim == 'all': + dim = som._dim + row_sz = np.ceil(float(dim) / col_sz) + msz_row, msz_col = som.codebook.mapsize + ratio_hitmap = msz_row / float(msz_col) + ratio_fig = row_sz / float(col_sz) + indtoshow = np.arange(0, dim).T + sH, sV = 16, 16*ratio_fig*ratio_hitmap + + elif type(which_dim) == int: + dim = 1 + msz_row, msz_col = som.codebook.mapsize + ratio_hitmap = msz_row / float(msz_col) + indtoshow = np.zeros(1) + indtoshow[0] = int(which_dim) + sH, sV = 16, 16 * ratio_hitmap + + elif type(which_dim) == list: + max_dim = codebook.shape[1] + dim = len(which_dim) + row_sz = np.ceil(float(dim) / col_sz) + msz_row, msz_col = som.codebook.mapsize + ratio_hitmap = msz_row / float(msz_col) + ratio_fig = row_sz / float(col_sz) + indtoshow = np.asarray(which_dim).T + sH, sV = 16, 16*ratio_fig*ratio_hitmap + + no_row_in_plot = dim / col_sz + 1 # 6 is arbitrarily selected + if no_row_in_plot <= 1: + no_col_in_plot = dim + else: + no_col_in_plot = col_sz + + axis_num = 0 + + width = sH + height = sV + + return (width, height, indtoshow, no_row_in_plot, no_col_in_plot, + axis_num) + + +class View2D(MapView): + + def show(self, som, what='codebook', which_dim='all', cmap=None, + col_sz=None, desnormalize=False): + (self.width, self.height, indtoshow, no_row_in_plot, no_col_in_plot, + axis_num) = self._calculate_figure_params(som, which_dim, col_sz) + self.prepare() + + if not desnormalize: + codebook = som.codebook.matrix + else: + codebook = som._normalizer.denormalize_by(som.data_raw, som.codebook.matrix) + + if which_dim == 'all': + names = som._component_names[0] + elif type(which_dim) == int: + names = [som._component_names[0][which_dim]] + elif type(which_dim) == list: + names = som._component_names[0][which_dim] + + + while axis_num < len(indtoshow): + axis_num += 1 + ax = plt.subplot(no_row_in_plot, no_col_in_plot, axis_num) + ind = int(indtoshow[axis_num-1]) + + min_color_scale = np.mean(codebook[:, ind].flatten()) - 1 * np.std(codebook[:, ind].flatten()) + max_color_scale = np.mean(codebook[:, ind].flatten()) + 1 * np.std(codebook[:, ind].flatten()) + min_color_scale = min_color_scale if min_color_scale >= min(codebook[:, ind].flatten()) else \ + min(codebook[:, ind].flatten()) + max_color_scale = max_color_scale if max_color_scale <= max(codebook[:, ind].flatten()) else \ + max(codebook[:, ind].flatten()) + norm = matplotlib.colors.Normalize(vmin=min_color_scale, vmax=max_color_scale, clip=True) + + mp = codebook[:, ind].reshape(som.codebook.mapsize[0], + som.codebook.mapsize[1]) + pl = plt.pcolor(mp[::-1], norm=norm) + plt.axis([0, som.codebook.mapsize[1], 0, som.codebook.mapsize[0]]) + plt.title(names[axis_num - 1]) + ax.set_yticklabels([]) + ax.set_xticklabels([]) + plt.colorbar(pl) + + #plt.show() + + +class View2DPacked(MapView): + + def _set_axis(self, ax, msz0, msz1): + plt.axis([0, msz0, 0, msz1]) + plt.axis('off') + ax.axis('off') + + def show(self, som, what='codebook', which_dim='all', cmap=None, + col_sz=None): + if col_sz is None: + col_sz = 6 + (self.width, self.height, indtoshow, no_row_in_plot, no_col_in_plot, + axis_num) = self._calculate_figure_params(som, which_dim, col_sz) + codebook = som.codebook.matrix + + cmap = cmap or plt.cm.get_cmap('RdYlBu_r') + msz0, msz1 = som.codebook.mapsize + compname = som.component_names + if what == 'codebook': + h = .1 + w = .1 + self.width = no_col_in_plot*2.5*(1+w) + self.height = no_row_in_plot*2.5*(1+h) + self.prepare() + + while axis_num < len(indtoshow): + axis_num += 1 + ax = self._fig.add_subplot(no_row_in_plot, no_col_in_plot, + axis_num) + ax.axis('off') + ind = int(indtoshow[axis_num-1]) + mp = codebook[:, ind].reshape(msz0, msz1) + plt.imshow(mp[::-1], norm=None, cmap=cmap) + self._set_axis(ax, msz0, msz1) + + if self.show_text is True: + plt.title(compname[0][ind]) + font = {'size': self.text_size} + plt.rc('font', **font) + if what == 'cluster': + try: + codebook = getattr(som, 'cluster_labels') + except: + codebook = som.cluster() + + h = .2 + w = .001 + self.width = msz0/2 + self.height = msz1/2 + self.prepare() + + ax = self._fig.add_subplot(1, 1, 1) + mp = codebook[:].reshape(msz0, msz1) + plt.imshow(mp[::-1], cmap=cmap) + + self._set_axis(ax, msz0, msz1) + + plt.subplots_adjust(hspace=h, wspace=w) + + plt.show() + + +class View1D(MapView): + + def show(self, som, what='codebook', which_dim='all', cmap=None, + col_sz=None): + (self.width, self.height, indtoshow, no_row_in_plot, no_col_in_plot, + axis_num) = self._calculate_figure_params(som, which_dim, col_sz) + self.prepare() + + codebook = som.codebook.matrix + + while axis_num < len(indtoshow): + axis_num += 1 + plt.subplot(no_row_in_plot, no_col_in_plot, axis_num) + ind = int(indtoshow[axis_num-1]) + mp = codebook[:, ind] + plt.plot(mp, '-k', linewidth=0.8) + + #plt.show() diff --git a/sompy/visualization/umatrix.py b/sompy/visualization/umatrix.py new file mode 100644 index 0000000..afa8ae8 --- /dev/null +++ b/sompy/visualization/umatrix.py @@ -0,0 +1,95 @@ +from .view import MatplotView +from matplotlib import pyplot as plt +from pylab import imshow, contour +from math import sqrt +import numpy as np +import scipy + + +class UMatrixView(MatplotView): + + def build_u_matrix(self, som, distance=1, row_normalized=False): + UD2 = som.calculate_map_dist() + Umatrix = np.zeros((som.codebook.nnodes, 1)) + codebook = som.codebook.matrix + if row_normalized: + vector = som._normalizer.normalize_by(codebook.T, codebook.T, + method='var').T + else: + vector = codebook + + for i in range(som.codebook.nnodes): + codebook_i = vector[i][np.newaxis, :] + neighborbor_ind = UD2[i][0:] <= distance + neighborbor_codebooks = vector[neighborbor_ind] + Umatrix[i] = scipy.spatial.distance_matrix( + codebook_i, neighborbor_codebooks).mean() + + return Umatrix.reshape(som.codebook.mapsize) + + def show(self, som, distance2=1, row_normalized=False, show_data=True, + contooor=True, blob=False, labels=False): + umat = self.build_u_matrix(som, distance=distance2, + row_normalized=row_normalized) + msz = som.codebook.mapsize + proj = som.project_data(som.data_raw) + coord = som.bmu_ind_to_xy(proj) + + fig, ax = plt.subplots(1, 1) + imshow(umat, cmap=plt.cm.get_cmap('RdYlBu_r'), alpha=1) + + if contooor: + mn = np.min(umat.flatten()) + mx = np.max(umat.flatten()) + std = np.std(umat.flatten()) + md = np.median(umat.flatten()) + mx = md + 0*std + contour(umat, np.linspace(mn, mx, 15), linewidths=0.7, + cmap=plt.cm.get_cmap('Blues')) + + if show_data: + plt.scatter(coord[:, 1], coord[:, 0], s=2, alpha=1., c='Gray', + marker='o', cmap='jet', linewidths=3, edgecolor='Gray') + plt.axis('off') + + if labels: + if labels is True: + labels = som.build_data_labels() + for label, x, y in zip(labels, coord[:, 1], coord[:, 0]): + plt.annotate(str(label), xy=(x, y), + horizontalalignment='center', + verticalalignment='center') + + ratio = float(msz[0])/(msz[0]+msz[1]) + fig.set_size_inches((1-ratio)*15, ratio*15) + plt.tight_layout() + plt.subplots_adjust(hspace=.00, wspace=.000) + sel_points = list() + + if blob: + from skimage.color import rgb2gray + from skimage.feature import blob_log + + image = 1 / umat + rgb2gray(image) + + # 'Laplacian of Gaussian' + blobs = blob_log(image, max_sigma=5, num_sigma=4, threshold=.152) + blobs[:, 2] = blobs[:, 2] * sqrt(2) + imshow(umat, cmap=plt.cm.get_cmap('RdYlBu_r'), alpha=1) + sel_points = list() + + for blob in blobs: + row, col, r = blob + c = plt.Circle((col, row), r, color='red', linewidth=2, + fill=False) + ax.add_patch(c) + dist = scipy.spatial.distance_matrix( + coord[:, :2], np.array([row, col])[np.newaxis, :]) + sel_point = dist <= r + plt.plot(coord[:, 1][sel_point[:, 0]], + coord[:, 0][sel_point[:, 0]], '.r') + sel_points.append(sel_point[:, 0]) + + plt.show() + return sel_points, umat diff --git a/sompy/visualization/view.py b/sompy/visualization/view.py new file mode 100644 index 0000000..c618a23 --- /dev/null +++ b/sompy/visualization/view.py @@ -0,0 +1,54 @@ +from matplotlib import pyplot as plt + + +class View(object): + def __init__(self, width, height, title, show_axis=True, packed=True, + text_size=2.8, show_text=True, col_size=6, *args, **kwargs): + self.width = width + self.height = height + self.title = title + self.show_axis = show_axis + self.packed = packed + self.text_size = text_size + self.show_text = show_text + self.col_size = col_size + + def prepare(self, *args, **kwargs): + raise NotImplementedError() + + def save(self, filename): + raise NotImplementedError() + + def show(self, *args, **kwrags): + raise NotImplementedError() + + +class MatplotView(View): + + def __init__(self, width, height, title, show_axis=True, packed=True, + text_size=2.8, show_text=True, col_size=6, *args, **kwargs): + super(MatplotView, self).__init__(width, height, title, show_axis, + packed, text_size, show_text, + col_size, *args, **kwargs) + self._fig = None + + def __del__(self): + self._close_fig() + + def _close_fig(self): + if self._fig: + plt.close(self._fig) + + def prepare(self, *args, **kwargs): + self._close_fig() + self._fig = plt.figure(figsize=(self.width, self.height)) + plt.title(self.title) + plt.axis('off') + plt.rc('font', **{'size': self.text_size}) + + def save(self, filename, transparent=False, bbox_inches='tight', dpi=400): + self._fig.savefig(filename, transparent=transparent, dpi=dpi, + bbox_inches=bbox_inches) + + def show(self, *args, **kwrags): + raise NotImplementedError()