
DOC: clean notebooks
glouppe committed Apr 22, 2016
1 parent e47c705 commit 323d1ed
Showing 6 changed files with 83 additions and 225 deletions.
19 changes: 7 additions & 12 deletions carl/__init__.py
@@ -10,18 +10,13 @@
[![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.47798.svg)](http://dx.doi.org/10.5281/zenodo.47798)
## Documentation
* Illustrative examples complementing the online reference
can be found under the
[`examples/`](https://github.com/diana-hep/carl/tree/master/examples)
directory.
* Extended details regarding likelihood-free inference with calibrated
classifiers can be found in the companion paper _"Approximating Likelihood
Ratios with Calibrated Discriminative Classifiers", Kyle Cranmer, Juan Pavez,
Gilles Louppe._
[http://arxiv.org/abs/1506.02169](http://arxiv.org/abs/1506.02169)
## Likelihood-free inference with classifiers
Extended details regarding likelihood-free inference with calibrated
classifiers can be found in the companion paper _"Approximating Likelihood
Ratios with Calibrated Discriminative Classifiers", Kyle Cranmer, Juan Pavez,
Gilles Louppe._
[http://arxiv.org/abs/1506.02169](http://arxiv.org/abs/1506.02169)
## Installation
8 changes: 6 additions & 2 deletions carl/learning/calibration.py
@@ -32,7 +32,7 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
probabilities for each of the folds are then averaged for prediction.
"""

def __init__(self, base_estimator, method="histogram", cv=1):
def __init__(self, base_estimator, method="histogram", bins="auto", cv=1):
"""Constructor.
Parameters
@@ -47,6 +47,9 @@ def __init__(self, base_estimator, method="histogram", cv=1):
`"histogram"`, `"kde"`, `"isotonic"`, `"interpolated-isotonic"` and
`"sigmoid"`.
* `bins` [int, default="auto"]:
The number of bins, if `method` is `"histogram"`.
* `cv` [integer, cross-validation generator, iterable or `"prefit"`]:
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
@@ -61,6 +64,7 @@ def __init__(self, base_estimator, method="histogram", cv=1):
"""
self.base_estimator = base_estimator
self.method = method
self.bins = bins
self.cv = cv

def fit(self, X, y):
@@ -93,7 +97,7 @@ def fit(self, X, y):

# Calibrator
if self.method == "histogram":
base_calibrator = HistogramCalibrator()
base_calibrator = HistogramCalibrator(bins=self.bins)
elif self.method == "kde":
base_calibrator = KernelDensityCalibrator()
elif self.method == "isotonic":
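The new `bins` argument is simply forwarded to `HistogramCalibrator` when `method="histogram"`. A minimal usage sketch, assuming `CalibratedClassifierCV` is importable from `carl.learning` as in the notebook below, with a synthetic dataset made up for illustration:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from carl.learning import CalibratedClassifierCV

# Hypothetical toy data: two overlapping Gaussian classes.
rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, size=(500, 2)),
               rng.normal(1, 1, size=(500, 2))])
y = np.concatenate([np.zeros(500), np.ones(500)]).astype(int)

# `bins` is passed through to HistogramCalibrator; the default "auto"
# leaves the choice of the number of bins to the calibrator.
clf = CalibratedClassifierCV(base_estimator=LogisticRegression(),
                             method="histogram", bins=50, cv=3)
clf.fit(X, y)
probas = clf.predict_proba(X)  # calibrated class probabilities
```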
4 changes: 2 additions & 2 deletions ci/templates/html.mako
@@ -361,6 +361,8 @@
%>
<div id="sidebar">
<ul id="index">
<li class="set"><h3><a href="${ root_url }">Index</a></h3></li>
% if len(variables) > 0:
<li class="set"><h3><a href="#header-variables">Module variables</a></h3>
${show_column_list(map(lambda v: link(v.refname), variables))}
@@ -412,8 +414,6 @@
</ul>
</li>
% endif
<li class="set"><h3><a href="${ root_url }">Index</a></h3></li>
</ul>
</div>
</%def>
123 changes: 33 additions & 90 deletions examples/Diagnostics for approximate likelihood ratios.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Diagnostics for approximate likelihood ratios\n",
"# Diagnostics for approximate likelihood ratios\n",
"\n",
"Kyle Cranmer, Juan Pavez, Gilles Louppe, March 2016.\n",
"\n",
@@ -23,23 +23,14 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Couldn't import dot_parser, loading of dot files will not be possible.\n"
]
}
],
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"#plt.set_cmap(\"viridis\")\n",
"\n",
"import numpy as np\n",
"import theano\n",
@@ -53,7 +44,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Create model and generate artificial dataset"
"## Create model and generate artificial data"
]
},
{
@@ -71,8 +62,6 @@
"from carl.distributions import LinearTransform\n",
"from sklearn.datasets import make_sparse_spd_matrix\n",
"\n",
"import pdb\n",
"\n",
"# Parameters\n",
"true_A = 1.\n",
"A = theano.shared(true_A, name=\"A\")\n",
@@ -100,6 +89,7 @@
" Exponential(inverse_scale=3.0),\n",
" Exponential(inverse_scale=0.5)]), R))\n",
"p1 = p1s[0]\n",
"\n",
"# Draw data\n",
"X_true = p0.rvs(500, random_state=314) "
]
@@ -108,7 +98,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Known likelihood setup"
"## Known likelihood setup"
]
},
{
@@ -129,7 +119,9 @@
"source": [
"# Minimize the exact LR\n",
"from scipy.optimize import minimize\n",
"\n",
"p1 = p1s[2]\n",
"\n",
"def nll_exact(theta, X):\n",
" A.set_value(theta[0])\n",
" return (p0.nll(X) - p1.nll(X)).sum()\n",
@@ -188,7 +180,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Likelihood-free setup\n",
"## Likelihood-free setup\n",
"Here we create the data to train a parametrized classifier"
]
},
@@ -205,36 +197,25 @@
"\n",
"bounds = [(-3, 3), (-3, 3)]\n",
"\n",
"clf_parameters = [(1000,100000),(1000000,500),(1000000,100000)]\n",
"clf_parameters = [(1000, 100000), (1000000, 500), (1000000, 100000)]\n",
"X = [0]*3*3\n",
"y = [0]*3*3\n",
"\n",
"\n",
"\n",
"for k,(param,p1) in enumerate(product(clf_parameters,p1s)):\n",
" X[k], y[k] = make_parameterized_classification(\n",
" p0, p1,\n",
" param[0], \n",
" [(A, np.linspace(bounds[0][0],bounds[0][1], num=30))],\n",
" random_state=0)\n"
" random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/juanpavez/Library/Python/2.7/lib/python/site-packages/sklearn/cross_validation.py:43: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
" \"This module will be removed in 0.20.\", DeprecationWarning)\n"
]
}
],
"outputs": [],
"source": [
"# Train parameterized classifier\n",
"from carl.learning import as_classifier\n",
@@ -243,9 +224,10 @@
"from sklearn.neural_network import MLPRegressor\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"\n",
"clfs = []\n",
"for k,_ in enumerate(product(clf_parameters,p1s)):\n",
"\n",
"for k, _ in enumerate(product(clf_parameters,p1s)):\n",
" clfs.append(ParameterizedClassifier(\n",
" make_pipeline(StandardScaler(), \n",
" as_classifier(MLPRegressor(learning_rate=\"adaptive\", \n",
@@ -279,9 +261,9 @@
" return wrapper\n",
"\n",
"def objective(theta, random_state=0, n_samples=100000, clf=clfs[0],p1=p1s[0]): \n",
" \n",
" # Set parameter values \n",
" A.set_value(theta[0])\n",
" \n",
" # Fit ratio\n",
" ratio = ClassifierRatio(CalibratedClassifierCV(\n",
" base_estimator=clf, \n",
@@ -313,40 +295,17 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"*Optimization completed:\n",
" -Maximum number of iterations reached.\n",
"*Optimization completed:\n",
" -Maximum number of iterations reached.\n",
"*Optimization completed:\n",
" -Maximum number of iterations reached.\n",
"*Optimization completed:\n",
" -Maximum number of iterations reached.\n",
"*Optimization completed:\n",
" -Maximum number of iterations reached.\n",
"*Optimization completed:\n",
" -Maximum number of iterations reached.\n",
"*Optimization completed:\n",
" -Maximum number of iterations reached.\n",
"*Optimization completed:\n",
" -Maximum number of iterations reached.\n",
"*Optimization completed:\n",
" -Maximum number of iterations reached.\n"
]
}
],
"outputs": [],
"source": [
"from GPyOpt.methods import BayesianOptimization\n",
"\n",
"solvers = []\n",
"for k,(param,p1) in enumerate(product(clf_parameters,p1s)):\n",
"\n",
"for k, (param, p1) in enumerate(product(clf_parameters,p1s)):\n",
" clf = clfs[k]\n",
" n_samples = param[1]\n",
" bounds = [(-3, 3)]\n",
@@ -379,7 +338,8 @@
],
"source": [
"approx_MLEs = []\n",
"for k,_ in enumerate(product(clf_parameters,p1s)):\n",
"\n",
"for k, _ in enumerate(product(clf_parameters,p1s)):\n",
" solver = solvers[k]\n",
" approx_MLE = solver.x_opt\n",
" approx_MLEs.append(approx_MLE)\n",
@@ -459,7 +419,7 @@
"rs = []\n",
"solver = solvers[0]\n",
"\n",
"for k,_ in enumerate(product(clf_parameters,p1s)):\n",
"for k, _ in enumerate(product(clf_parameters,p1s)):\n",
" def gp_objective(theta):\n",
" theta = theta.reshape(1, -1)\n",
" return solvers[k].model.predict(theta)[0][0]\n",
@@ -496,27 +456,18 @@
" nll_gp, var_gp = solvers[k].model.predict(As.reshape(-1, 1))\n",
" nll_gp = 2. * (nll_gp - rs[k].fun) * len(X_true)\n",
" gp_ratios.append(nll_gp)\n",
" \n",
" # STD\n",
" std_gp = np.sqrt(4*var_gp*len(X_true)*len(X_true))\n",
" std_gp[np.isnan(std_gp)] = 0.\n",
" gp_std.append(std_gp)\n",
" \n",
" # 95% CI\n",
" q1_gp, q2_gp = solvers[k].model.predict_quantiles(As.reshape(-1, 1))\n",
" q1_gp = 2. * (q1_gp - rs[k].fun) * len(X_true)\n",
" q2_gp = 2. * (q2_gp - rs[k].fun) * len(X_true)\n",
" gp_q1.append(q1_gp)\n",
" gp_q2.append(q2_gp)\n",
"\n",
" #nll_approx = np.zeros(n_points)\n",
"\n",
" #approx = [objective([a]) for a in np.linspace(*bounds[0], n_points)]\n",
" #approx = [objective([a],n_samples=n_samples,clf=clf,p1=p1) for a \n",
" # in np.linspace(bounds[0][0],bounds[0][1], n_points)]\n",
"\n",
" #approx = np.array(approx)\n",
" #approx = 2. * (approx - approx.min()) * len(X_true)\n",
" #nll_approx = approx\n",
" #approx_ratios.append(nll_approx)\n"
" gp_q2.append(q2_gp)"
]
},
{
@@ -566,7 +517,8 @@
],
"source": [
"bounds = [(true_A - 0.30, true_A + 0.30)]\n",
"for k,_ in enumerate(clf_parameters):\n",
"\n",
"for k, _ in enumerate(clf_parameters):\n",
" fig = plt.figure()\n",
" ax = fig.add_subplot(1,1,1)\n",
" ax.plot(As, nll, label=\"Exact\")\n",
@@ -624,9 +576,11 @@
"outputs": [],
"source": [
"from sklearn.metrics import roc_curve, auc\n",
"def makeROC(predictions,targetdata):\n",
"\n",
"def makeROC(predictions ,targetdata):\n",
" fpr, tpr, _ = roc_curve(targetdata.ravel(),predictions.ravel())\n",
" roc_auc = auc(fpr, tpr)\n",
" \n",
" return fpr,tpr,roc_auc"
]
},
@@ -669,8 +623,6 @@
}
],
"source": [
"#fig = plt.figure(figsize=(15,15))\n",
"\n",
"# I obtain data from r*p1 by resampling data from p1 using r as weights\n",
"def weight_data(x0,x1,weights):\n",
" x1_len = x1.shape[0]\n",
@@ -767,15 +719,6 @@
"trained, well-calibrated case is almost identical to the exact likelihood\n",
"ratio, confirming the quality of the approximation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {

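For reference, the calibrated-ratio objective that this notebook minimizes follows the pattern sketched below. This is a sketch, not the notebook verbatim: the `ClassifierRatio` fit/predict signatures are assumptions based on carl's other examples, and `A`, `p0`, `p1`, `clf`, and `X_true` refer to the notebook's own variables.

```python
import numpy as np
from carl.ratios import ClassifierRatio
from carl.learning import CalibratedClassifierCV

def objective(theta, clf, p0, p1, n_samples=100000):
    # Move the parameterized numerator density to the candidate theta.
    A.set_value(theta[0])

    # Calibrate the already-trained classifier and wrap it
    # as a density-ratio model.
    ratio = ClassifierRatio(CalibratedClassifierCV(
        base_estimator=clf, method="histogram", cv="prefit"))
    ratio.fit(numerator=p0, denominator=p1, n_samples=n_samples)

    # Approximate the negative log-likelihood ratio on the observed data.
    r = ratio.predict(X_true, log=True)
    return -np.mean(r[np.isfinite(r)])
```

Minimizing this objective, in the notebook with GPyOpt's `BayesianOptimization`, yields the approximate MLEs (`solver.x_opt`) that the diagnostics compare against the exact ones.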