Updated Week 4 - Training Models

dTmC0945 · Oct 21, 2024 · ca316de · ca316de
1 parent efd8a50
commit ca316de
Show file tree

Hide file tree

Showing 40 changed files with 32,053 additions and 203 deletions.
diff --git a/Lecture-Slides/Machine-Learning-Lecture/MachineLearningIISlide.pdf b/Lecture-Slides/Machine-Learning-Lecture/MachineLearningIISlide.pdf
diff --git a/Lecture-Slides/Machine-Learning-Lecture/ipynb/Classification.ipynb b/Lecture-Slides/Machine-Learning-Lecture/ipynb/Classification.ipynb
@@ -0,0 +1 @@
+{"cells":[{"cell_type":"markdown","id":"8244638f-81cd-4e4c-94dd-5e4a1f245993","metadata":{},"source":"Code for Machine Learning and Data Science II\n=============================================\n\n"},{"cell_type":"markdown","id":"4c0065e3-9790-4748-95e0-7f153dabea7b","metadata":{},"source":["These are the code snippets used in the Classification\npart of Machine Learning and Data Science II.\n\n"]},{"cell_type":"markdown","id":"88803470-b332-4009-b876-f2d9d2741fcd","metadata":{},"source":["### Introduction\n\n"]},{"cell_type":"markdown","id":"91407057-4815-43eb-ab06-469ea6f33de6","metadata":{},"source":["#### Preamble\n\n"]},{"cell_type":"code","execution_count":1,"id":"d2289792-84e7-4f0e-8128-720de76f53e7","metadata":{},"outputs":[],"source":["# import for custom color cycling\nfrom cycler import cycler \n\n# define custom default colors for generating image with\n# consistent color cycling\nnew_colors = ['#326199',\n              '#4fb1a1',\n              '#fcc055',\n              '#eb8d50',\n              '#df6e5b',\n              '#9a031e',\n              '#984ea3']\n\n\n# define a function to store matplotlib style information\ndef plot_settings(style, size=None):\n   \n   # define the styles for generating web compatible images.\n   if style == \"web\":\n      style = {\n         'axes.edgecolor': '1e1e2e',\n         'axes.facecolor': '363a4f',\n         'axes.axisbelow' : True,\n         'axes.labelcolor' : 'cad3f5',\n         'axes.grid': True,\n         'axes.grid.which': 'both',\n         'axes.spines.left': False,\n         'axes.spines.right': False,\n         'axes.spines.top': False,\n         'axes.spines.bottom': False,\n         'axes.prop_cycle': cycler(color=new_colors),\n\n         'grid.color': '5b6078',\n         'grid.linewidth': '1.2',\n         \n         'xtick.color': 'cad3f5',\n         'xtick.major.bottom': True,\n         'xtick.labelsize': 10,\n         'xtick.minor.bottom': True,\n         'xtick.minor.bottom': True,\n         
'xtick.minor.visible': True,\n         'xtick.minor.width': 0.5,\n         \n         'ytick.color': 'cad3f5',\n         'ytick.major.left': True,\n         'ytick.minor.left': False,\n         'ytick.minor.visible': True,\n         'ytick.labelsize': 10,\n\n         'savefig.facecolor': '363a4f',\n\n         'text.color': 'cad3f5',\n         \n         'lines.linewidth': 4,\n\n         'font.size': 16,\n         \n         'legend.fancybox' : False,\n         'legend.facecolor' : '6c7086',\n         \n         'figure.facecolor': '838ba7',\n      }\n\n      # Define the font dictionary to store label formatting\n      font = {'color':  '#cad3f5',\n              'weight': 'normal',\n              'size': 16,\n              }   \n\n   elif style == \"slide\":\n      style = {\n         'axes.edgecolor': 'f0f0f0',\n         'axes.facecolor': 'fafafa',\n         'axes.axisbelow' : True,\n         'axes.labelcolor' : '1e1e1e',\n         'axes.grid': True,\n         'axes.grid.which': 'both',\n         'axes.spines.left': False,\n         'axes.spines.right': False,\n         'axes.spines.top': False,\n         'axes.spines.bottom': False,\n         'axes.prop_cycle': cycler(color=new_colors),\n\n         'grid.color': 'f0f0f0',\n         'grid.linewidth': '1.2',\n         \n         'xtick.color': '1e1e1e',\n         'xtick.major.bottom': True,\n         'xtick.labelsize': 10,\n         'xtick.minor.bottom': True,\n         'xtick.minor.bottom': True,\n         'xtick.minor.visible': True,\n         'xtick.minor.width': 0.5,\n         \n         'ytick.color': '1e1e1e',\n         'ytick.major.left': True,\n         'ytick.minor.left': False,\n         'ytick.labelsize': 10,\n\n         'savefig.facecolor': 'fafafa',\n\n         'text.color': '1e1e1e',\n         \n         'lines.linewidth': 4,\n\n         'font.size': 16,\n         \n         'legend.fancybox' : False,\n         'legend.facecolor' : '6c7086',\n         \n         'figure.facecolor': 'fafafa',\n      
}\n\n      # Define the font dictionary to store label formatting\n      font = {'color':  '#1e1e1e',\n              'weight': 'normal',\n              'size': 16,\n              }   \n\n   # Apply style sheet for use in matplotlib\n   plt.rcParams.update(style)\n   \n   # Define figure size based on the number of figures\n   if size == 1:\n      plt.figure(figsize = (10, 6))\n   elif size == 2:\n      plt.figure(figsize = (12, 5))\n   elif size == 3:\n      plt.figure(figsize = (12, 8))\n   elif size == None:\n      return 0\n\ndef grid_settings(style):\n\n   if style == \"web\":\n      plt.grid(which='minor', color='#5b6078', linestyle=':', linewidth=0.5)\n      plt.grid(which='major', color='#5b6078', linestyle=':', linewidth=0.8)\n      \n   elif style == \"slide\": \n      plt.grid(which='minor', color='#c8c8c8', linestyle=':', linewidth=0.5)\n      plt.grid(which='major', color='#c8c8c8', linestyle=':', linewidth=0.8)\n\n   plt.minorticks_on()"]},{"cell_type":"code","execution_count":1,"id":"aaf3a5c5-6b43-415e-b65c-d43edcafbaf0","metadata":{},"outputs":[],"source":["from pathlib import Path\n\n# Define paths to store images\nIMAGES_PATH = Path() / \"images\" / \"Classification\"\n# Check if path exists\nIMAGES_PATH.mkdir(parents=True, exist_ok=True)\n\ndef store_fig(fig_id,\n              tight_layout=True,\n              fig_extension=\"png\",\n              resolution=300,\n              style=None,\n              close=None):\n    \n    if tight_layout:\n        plt.tight_layout()\n        \n    if style == \"web\":\n        plt.grid(which='minor', color='#5b6078', linestyle=':', linewidth=0.5)\n        plt.grid(which='major', color='#5b6078', linestyle=':', linewidth=0.8)\n        fig_extension = \"png\"\n        \n    elif style == 'slide':\n        grid_settings(style = \"slide\")\n        fig_extension = \"pdf\"\n\n    path = IMAGES_PATH / f\"{fig_id}.{fig_extension}\"\n    plt.savefig(path, format=fig_extension, dpi=resolution)\n\n    if close:\n      
  plt.close()"]},{"cell_type":"markdown","id":"1ffdf469-52c4-4031-bb91-21d3d0638bb5","metadata":{},"source":["### MNIST\n\n"]},{"cell_type":"markdown","id":"6069d10f-3754-492e-9135-743a480221f4","metadata":{},"source":["#### Download Initial Data\n\n"]},{"cell_type":"code","execution_count":1,"id":"b340554f-0e45-499f-96f7-ed14f961c362","metadata":{},"outputs":[],"source":["from sklearn.datasets import fetch_openml\n\nmnist = fetch_openml('mnist_784', as_frame=False)"]},{"cell_type":"markdown","id":"257629f6-aad4-4bdc-bf1b-bfdbe9737c23","metadata":{},"source":["For more info on `sklearn.datasets.fetch_openml`, click [here](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_openml.html).\nHowever, for our application only two parameters are used.\n\n\n| PARAMETER|DESCRIPTION|\n|---|---|\n| name|String identifier of the dataset. Note that OpenML can have multiple datasets with the same name.|\n| `as_frame`|If True, the data is a pandas DataFrame including columns with appropriate dtypes. If False, the data and target will be NumPy arrays and the data will only contain numerical values.|\n\n"]},{"cell_type":"code","execution_count":1,"id":"62d476cd-256b-4c4a-bb8d-f4162f8a9338","metadata":{},"outputs":[],"source":["print(mnist.keys())"]},{"cell_type":"code","execution_count":1,"id":"8e0ed918-6a91-434e-934a-ddde4cd1c90d","metadata":{},"outputs":[],"source":["X, y = mnist.data, mnist.target\nprint(X)"]},{"cell_type":"code","execution_count":1,"id":"c8fe76e1-d799-4f80-be94-fc3c1f56e059","metadata":{},"outputs":[],"source":["print(X.shape)"]},{"cell_type":"code","execution_count":1,"id":"cf184e41-784b-4b3f-aaa1-b3676d96f8f6","metadata":{},"outputs":[],"source":["X[0]"]},{"cell_type":"code","execution_count":1,"id":"c953eaf0-a4fb-4e4d-9a08-7a3ed347f4ba","metadata":{},"outputs":[],"source":["import matplotlib.pyplot as plt\n\ndef plot_digit(image_data):\n    image = image_data.reshape(28, 28)\n    plt.imshow(image, cmap=\"binary\")\n    
plt.axis(\"off\")\n\nsome_digit = X[0]\nplot_digit(some_digit)\nstore_fig(\"some-digits-plot\",\n          close = True)"]},{"cell_type":"code","execution_count":1,"id":"ed02c08c-f11f-4467-b338-668f6573d679","metadata":{},"outputs":[],"source":["plt.figure(figsize=(9, 9))\nfor idx, image_data in enumerate(X[:100]):\n    plt.subplot(10, 10, idx + 1)\n    plot_digit(image_data)\nplt.subplots_adjust(wspace=0, hspace=0)\n\nstore_fig(\"more-digits-plot\",\n          close = True)"]},{"cell_type":"code","execution_count":1,"id":"fbd33566-2c34-4313-8b32-65c613e4d91e","metadata":{},"outputs":[],"source":["X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]"]},{"cell_type":"markdown","id":"79512997-3dc2-4e80-891d-c7881acd1b9a","metadata":{},"source":["#### Training a Binary Classifier\n\n"]},{"cell_type":"code","execution_count":1,"id":"3b43a050-c799-4dba-9ea0-f1835ad49160","metadata":{},"outputs":[],"source":["y_train_5 = (y_train == '5')  # True for all 5s, False for all other digits\ny_test_5 = (y_test == '5')"]},{"cell_type":"code","execution_count":1,"id":"110ea320-f39a-48e4-8cd8-c0edc0212c15","metadata":{},"outputs":[],"source":["from sklearn.linear_model import SGDClassifier\n\nsgd_clf = SGDClassifier(random_state=42)\nsgd_clf.fit(X_train, y_train_5)"]},{"cell_type":"code","execution_count":1,"id":"d6096bab-7a53-413c-a736-18ba6655d252","metadata":{},"outputs":[],"source":["sgd_clf.predict([some_digit])"]},{"cell_type":"markdown","id":"4c5a47a2-7ea9-452c-99e7-92c1b98463ee","metadata":{},"source":["### Performance Measures\n\n"]},{"cell_type":"markdown","id":"60516fb8-23f6-40bc-a243-9f53e949d977","metadata":{},"source":["#### Measuring Accuracy Using Cross-Validation\n\n"]},{"cell_type":"code","execution_count":1,"id":"54aeb5ee-cfab-43f1-9757-3b5c59deeb78","metadata":{},"outputs":[],"source":["from sklearn.model_selection import cross_val_score\n\ncross_val_score(sgd_clf, X = X_train,y =  y_train_5, cv=3, 
scoring=\"accuracy\")"]},{"cell_type":"markdown","id":"ba1c54c3-e99c-4ee7-a049-4b94f0316291","metadata":{},"source":["\n| PARAMETER|DESCRIPTION|\n|---|---|\n| estimator|The object to use to fit the data.|\n| X|The data to fit. Can be for example a list, or an array.|\n| y|The target variable to try to predict in the case of supervised learning.|\n| cv|Determines the cross-validation splitting strategy.|\n| scoring|A str (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y) which should return only a single value.|\n\n"]},{"cell_type":"code","execution_count":1,"id":"c146e975-e6ff-442f-b253-4fc656ab45a1","metadata":{},"outputs":[],"source":["from sklearn.model_selection import StratifiedKFold\nfrom sklearn.base import clone\n\nskfolds = StratifiedKFold(n_splits=3)  # add shuffle=True if the dataset is not\n                                       # already shuffled\nfor train_index, test_index in skfolds.split(X_train, y_train_5):\n    clone_clf = clone(sgd_clf)\n    X_train_folds = X_train[train_index]\n    y_train_folds = y_train_5[train_index]\n    X_test_fold = X_train[test_index]\n    y_test_fold = y_train_5[test_index]\n\n    clone_clf.fit(X_train_folds, y_train_folds)\n    y_pred = clone_clf.predict(X_test_fold)\n    n_correct = sum(y_pred == y_test_fold)\n    print(n_correct / len(y_pred))"]},{"cell_type":"code","execution_count":1,"id":"4c31fe34-a821-4168-94d3-f20e2e707fbc","metadata":{},"outputs":[],"source":["from sklearn.dummy import DummyClassifier\n\ndummy_clf = DummyClassifier()\ndummy_clf.fit(X_train, y_train_5)\nprint(any(dummy_clf.predict(X_train)))"]},{"cell_type":"code","execution_count":1,"id":"c6eb0611-19a8-428a-b986-d0c073460c68","metadata":{},"outputs":[],"source":["cross_val_score(dummy_clf, X_train, y_train_5, cv=3, scoring=\"accuracy\")"]},{"cell_type":"markdown","id":"3247c12a-4c99-4f2d-a12d-c3db2ecb9653","metadata":{},"source":["#### Confusion 
Matrix\n\n"]},{"cell_type":"code","execution_count":1,"id":"23487611-1853-49c2-b3a3-04cb9230b670","metadata":{},"outputs":[],"source":["from sklearn.model_selection import cross_val_predict\n\ny_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)"]},{"cell_type":"code","execution_count":1,"id":"43ac5ac3-ff8c-4e42-9283-0dcce724e43b","metadata":{},"outputs":[],"source":["from sklearn.metrics import confusion_matrix\n\ncm = confusion_matrix(y_train_5, y_train_pred)\ncm"]},{"cell_type":"code","execution_count":1,"id":"1e991d3b-ea43-4a01-9a32-f6a2ebfb53e7","metadata":{},"outputs":[],"source":["y_train_perfect_predictions = y_train_5  # pretend we reached perfection\nconfusion_matrix(y_train_5, y_train_perfect_predictions)"]},{"cell_type":"markdown","id":"03f0967a-6d7a-47f4-961d-280640e2fcea","metadata":{},"source":["#### Precision and Recall\n\n"]},{"cell_type":"code","execution_count":1,"id":"94cd7195-92d0-4df9-a83a-8bd1637e10c2","metadata":{},"outputs":[],"source":["from sklearn.metrics import precision_score, recall_score\n\nprecision_score(y_train_5, y_train_pred)  # == 3530 / (687 + 3530)"]}],"metadata":{"org":null,"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.5.2"}},"nbformat":4,"nbformat_minor":5}
diff --git a/Lecture-Slides/Machine-Learning-Lecture/ipynb/End-to-End-ML-Project.ipynb b/Lecture-Slides/Machine-Learning-Lecture/ipynb/End-to-End-ML-Project.ipynb
diff --git a/Lecture-Slides/Machine-Learning-Lecture/ipynb/Training-Models.ipynb b/Lecture-Slides/Machine-Learning-Lecture/ipynb/Training-Models.ipynb
diff --git a/data/housing.tgz → ...arning-Lecture/ipynb/datasets/housing.tgz b/data/housing.tgz → ...arning-Lecture/ipynb/datasets/housing.tgz
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"cells":[{"cell_type":"markdown","id":"8244638f-81cd-4e4c-94dd-5e4a1f245993","metadata":{},"source":"Code for Machine Learning and Data Science II\n=============================================\n\n"},{"cell_type":"markdown","id":"4c0065e3-9790-4748-95e0-7f153dabea7b","metadata":{},"source":["These are the code snippets used in the Classification\npart of Machine Learning and Data Science II.\n\n"]},{"cell_type":"markdown","id":"88803470-b332-4009-b876-f2d9d2741fcd","metadata":{},"source":["### Introduction\n\n"]},{"cell_type":"markdown","id":"91407057-4815-43eb-ab06-469ea6f33de6","metadata":{},"source":["#### Preamble\n\n"]},{"cell_type":"code","execution_count":1,"id":"d2289792-84e7-4f0e-8128-720de76f53e7","metadata":{},"outputs":[],"source":["# import for custom color cycling\nfrom cycler import cycler \n\n# define custom default colors for generating image with\n# consistent color cycling\nnew_colors = ['#326199',\n '#4fb1a1',\n '#fcc055',\n '#eb8d50',\n '#df6e5b',\n '#9a031e',\n '#984ea3']\n\n\n# define a function to store matplotlib style information\ndef plot_settings(style, size=None):\n \n # define the styles for generating web compatible images.\n if style == \"web\":\n style = {\n 'axes.edgecolor': '1e1e2e',\n 'axes.facecolor': '363a4f',\n 'axes.axisbelow' : True,\n 'axes.labelcolor' : 'cad3f5',\n 'axes.grid': True,\n 'axes.grid.which': 'both',\n 'axes.spines.left': False,\n 'axes.spines.right': False,\n 'axes.spines.top': False,\n 'axes.spines.bottom': False,\n 'axes.prop_cycle': cycler(color=new_colors),\n\n 'grid.color': '5b6078',\n 'grid.linewidth': '1.2',\n \n 'xtick.color': 'cad3f5',\n 'xtick.major.bottom': True,\n 'xtick.labelsize': 10,\n 'xtick.minor.bottom': True,\n 'xtick.minor.bottom': True,\n 'xtick.minor.visible': True,\n 'xtick.minor.width': 0.5,\n \n 'ytick.color': 'cad3f5',\n 'ytick.major.left': True,\n 'ytick.minor.left': False,\n 'ytick.minor.visible': True,\n 'ytick.labelsize': 10,\n\n 'savefig.facecolor': '363a4f',\n\n 'text.color': 
'cad3f5',\n \n 'lines.linewidth': 4,\n\n 'font.size': 16,\n \n 'legend.fancybox' : False,\n 'legend.facecolor' : '6c7086',\n \n 'figure.facecolor': '838ba7',\n }\n\n # Define the font dictionary to store label formatting\n font = {'color': '#cad3f5',\n 'weight': 'normal',\n 'size': 16,\n } \n\n elif style == \"slide\":\n style = {\n 'axes.edgecolor': 'f0f0f0',\n 'axes.facecolor': 'fafafa',\n 'axes.axisbelow' : True,\n 'axes.labelcolor' : '1e1e1e',\n 'axes.grid': True,\n 'axes.grid.which': 'both',\n 'axes.spines.left': False,\n 'axes.spines.right': False,\n 'axes.spines.top': False,\n 'axes.spines.bottom': False,\n 'axes.prop_cycle': cycler(color=new_colors),\n\n 'grid.color': 'f0f0f0',\n 'grid.linewidth': '1.2',\n \n 'xtick.color': '1e1e1e',\n 'xtick.major.bottom': True,\n 'xtick.labelsize': 10,\n 'xtick.minor.bottom': True,\n 'xtick.minor.bottom': True,\n 'xtick.minor.visible': True,\n 'xtick.minor.width': 0.5,\n \n 'ytick.color': '1e1e1e',\n 'ytick.major.left': True,\n 'ytick.minor.left': False,\n 'ytick.labelsize': 10,\n\n 'savefig.facecolor': 'fafafa',\n\n 'text.color': '1e1e1e',\n \n 'lines.linewidth': 4,\n\n 'font.size': 16,\n \n 'legend.fancybox' : False,\n 'legend.facecolor' : '6c7086',\n \n 'figure.facecolor': 'fafafa',\n }\n\n # Define the font dictionary to store label formatting\n font = {'color': '#1e1e1e',\n 'weight': 'normal',\n 'size': 16,\n } \n\n # Apply style sheet for use in matplotlib\n plt.rcParams.update(style)\n \n # Define figure size based on the number of figures\n if size == 1:\n plt.figure(figsize = (10, 6))\n elif size == 2:\n plt.figure(figsize = (12, 5))\n elif size == 3:\n plt.figure(figsize = (12, 8))\n elif size == None:\n return 0\n\ndef grid_settings(style):\n\n if style == \"web\":\n plt.grid(which='minor', color='#5b6078', linestyle=':', linewidth=0.5)\n plt.grid(which='major', color='#5b6078', linestyle=':', linewidth=0.8)\n \n elif style == \"slide\": \n plt.grid(which='minor', color='#c8c8c8', linestyle=':', 
linewidth=0.5)\n plt.grid(which='major', color='#c8c8c8', linestyle=':', linewidth=0.8)\n\n plt.minorticks_on()"]},{"cell_type":"code","execution_count":1,"id":"aaf3a5c5-6b43-415e-b65c-d43edcafbaf0","metadata":{},"outputs":[],"source":["from pathlib import Path\n\n# Define paths to store images\nIMAGES_PATH = Path() / \"images\" / \"Classification\"\n# Check if path exists\nIMAGES_PATH.mkdir(parents=True, exist_ok=True)\n\ndef store_fig(fig_id,\n tight_layout=True,\n fig_extension=\"png\",\n resolution=300,\n style=None,\n close=None):\n \n if tight_layout:\n plt.tight_layout()\n \n if style == \"web\":\n plt.grid(which='minor', color='#5b6078', linestyle=':', linewidth=0.5)\n plt.grid(which='major', color='#5b6078', linestyle=':', linewidth=0.8)\n fig_extension = \"png\"\n \n elif style == 'slide':\n grid_settings(style = \"slide\")\n fig_extension = \"pdf\"\n\n path = IMAGES_PATH / f\"{fig_id}.{fig_extension}\"\n plt.savefig(path, format=fig_extension, dpi=resolution)\n\n if close:\n plt.close()"]},{"cell_type":"markdown","id":"1ffdf469-52c4-4031-bb91-21d3d0638bb5","metadata":{},"source":["### MNIST\n\n"]},{"cell_type":"markdown","id":"6069d10f-3754-492e-9135-743a480221f4","metadata":{},"source":["#### Download Initial Data\n\n"]},{"cell_type":"code","execution_count":1,"id":"b340554f-0e45-499f-96f7-ed14f961c362","metadata":{},"outputs":[],"source":["from sklearn.datasets import fetch_openml\n\nmnist = fetch_openml('mnist_784', as_frame=False)"]},{"cell_type":"markdown","id":"257629f6-aad4-4bdc-bf1b-bfdbe9737c23","metadata":{},"source":["For more info on `sklearn.datasets.fetch_openml`, click [here](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_openml.html).\nHowever, for our application only two parameters are used.\n\n\n\| PARAMETER\|DESCRIPTION\|\n\|---\|---\|\n\| name\|String identifier of the dataset. 
Note that OpenML can have multiple datasets with the same name.\|\n\| `as_frame`\|If True, the data is a pandas DataFrame including columns with appropriate dtypes. If False, the data and target will be NumPy arrays and the data will only contain numerical values.\|\n\n"]},{"cell_type":"code","execution_count":1,"id":"62d476cd-256b-4c4a-bb8d-f4162f8a9338","metadata":{},"outputs":[],"source":["print(mnist.keys())"]},{"cell_type":"code","execution_count":1,"id":"8e0ed918-6a91-434e-934a-ddde4cd1c90d","metadata":{},"outputs":[],"source":["X, y = mnist.data, mnist.target\nprint(X)"]},{"cell_type":"code","execution_count":1,"id":"c8fe76e1-d799-4f80-be94-fc3c1f56e059","metadata":{},"outputs":[],"source":["print(X.shape)"]},{"cell_type":"code","execution_count":1,"id":"cf184e41-784b-4b3f-aaa1-b3676d96f8f6","metadata":{},"outputs":[],"source":["X[0]"]},{"cell_type":"code","execution_count":1,"id":"c953eaf0-a4fb-4e4d-9a08-7a3ed347f4ba","metadata":{},"outputs":[],"source":["import matplotlib.pyplot as plt\n\ndef plot_digit(image_data):\n image = image_data.reshape(28, 28)\n plt.imshow(image, cmap=\"binary\")\n plt.axis(\"off\")\n\nsome_digit = X[0]\nplot_digit(some_digit)\nstore_fig(\"some-digits-plot\",\n close = True)"]},{"cell_type":"code","execution_count":1,"id":"ed02c08c-f11f-4467-b338-668f6573d679","metadata":{},"outputs":[],"source":["plt.figure(figsize=(9, 9))\nfor idx, image_data in enumerate(X[:100]):\n plt.subplot(10, 10, idx + 1)\n plot_digit(image_data)\nplt.subplots_adjust(wspace=0, hspace=0)\n\nstore_fig(\"more-digits-plot\",\n close = True)"]},{"cell_type":"code","execution_count":1,"id":"fbd33566-2c34-4313-8b32-65c613e4d91e","metadata":{},"outputs":[],"source":["X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]"]},{"cell_type":"markdown","id":"79512997-3dc2-4e80-891d-c7881acd1b9a","metadata":{},"source":["#### Training a Binary 
Classifier\n\n"]},{"cell_type":"code","execution_count":1,"id":"3b43a050-c799-4dba-9ea0-f1835ad49160","metadata":{},"outputs":[],"source":["y_train_5 = (y_train == '5') # True for all 5s, False for all other digits\ny_test_5 = (y_test == '5')"]},{"cell_type":"code","execution_count":1,"id":"110ea320-f39a-48e4-8cd8-c0edc0212c15","metadata":{},"outputs":[],"source":["from sklearn.linear_model import SGDClassifier\n\nsgd_clf = SGDClassifier(random_state=42)\nsgd_clf.fit(X_train, y_train_5)"]},{"cell_type":"code","execution_count":1,"id":"d6096bab-7a53-413c-a736-18ba6655d252","metadata":{},"outputs":[],"source":["sgd_clf.predict([some_digit])"]},{"cell_type":"markdown","id":"4c5a47a2-7ea9-452c-99e7-92c1b98463ee","metadata":{},"source":["### Performance Measures\n\n"]},{"cell_type":"markdown","id":"60516fb8-23f6-40bc-a243-9f53e949d977","metadata":{},"source":["#### Measuring Accuracy Using Cross-Validation\n\n"]},{"cell_type":"code","execution_count":1,"id":"54aeb5ee-cfab-43f1-9757-3b5c59deeb78","metadata":{},"outputs":[],"source":["from sklearn.model_selection import cross_val_score\n\ncross_val_score(sgd_clf, X = X_train,y = y_train_5, cv=3, scoring=\"accuracy\")"]},{"cell_type":"markdown","id":"ba1c54c3-e99c-4ee7-a049-4b94f0316291","metadata":{},"source":["\n\| PARAMETER\|DESCRIPTION\|\n\|---\|---\|\n\| estimator\|The object to use to fit the data.\|\n\| X\|The data to fit. 
Can be for example a list, or an array.\|\n\| y\|The target variable to try to predict in the case of supervised learning.\|\n\| cv\|Determines the cross-validation splitting strategy.\|\n\| scoring\|A str (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y) which should return only a single value.\|\n\n"]},{"cell_type":"code","execution_count":1,"id":"c146e975-e6ff-442f-b253-4fc656ab45a1","metadata":{},"outputs":[],"source":["from sklearn.model_selection import StratifiedKFold\nfrom sklearn.base import clone\n\nskfolds = StratifiedKFold(n_splits=3) # add shuffle=True if the dataset is not\n # already shuffled\nfor train_index, test_index in skfolds.split(X_train, y_train_5):\n clone_clf = clone(sgd_clf)\n X_train_folds = X_train[train_index]\n y_train_folds = y_train_5[train_index]\n X_test_fold = X_train[test_index]\n y_test_fold = y_train_5[test_index]\n\n clone_clf.fit(X_train_folds, y_train_folds)\n y_pred = clone_clf.predict(X_test_fold)\n n_correct = sum(y_pred == y_test_fold)\n print(n_correct / len(y_pred))"]},{"cell_type":"code","execution_count":1,"id":"4c31fe34-a821-4168-94d3-f20e2e707fbc","metadata":{},"outputs":[],"source":["from sklearn.dummy import DummyClassifier\n\ndummy_clf = DummyClassifier()\ndummy_clf.fit(X_train, y_train_5)\nprint(any(dummy_clf.predict(X_train)))"]},{"cell_type":"code","execution_count":1,"id":"c6eb0611-19a8-428a-b986-d0c073460c68","metadata":{},"outputs":[],"source":["cross_val_score(dummy_clf, X_train, y_train_5, cv=3, scoring=\"accuracy\")"]},{"cell_type":"markdown","id":"3247c12a-4c99-4f2d-a12d-c3db2ecb9653","metadata":{},"source":["#### Confusion Matrix\n\n"]},{"cell_type":"code","execution_count":1,"id":"23487611-1853-49c2-b3a3-04cb9230b670","metadata":{},"outputs":[],"source":["from sklearn.model_selection import cross_val_predict\n\ny_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, 
cv=3)"]},{"cell_type":"code","execution_count":1,"id":"43ac5ac3-ff8c-4e42-9283-0dcce724e43b","metadata":{},"outputs":[],"source":["from sklearn.metrics import confusion_matrix\n\ncm = confusion_matrix(y_train_5, y_train_pred)\ncm"]},{"cell_type":"code","execution_count":1,"id":"1e991d3b-ea43-4a01-9a32-f6a2ebfb53e7","metadata":{},"outputs":[],"source":["y_train_perfect_predictions = y_train_5 # pretend we reached perfection\nconfusion_matrix(y_train_5, y_train_perfect_predictions)"]},{"cell_type":"markdown","id":"03f0967a-6d7a-47f4-961d-280640e2fcea","metadata":{},"source":["#### Precision and Recall\n\n"]},{"cell_type":"code","execution_count":1,"id":"94cd7195-92d0-4df9-a83a-8bd1637e10c2","metadata":{},"outputs":[],"source":["from sklearn.metrics import precision_score, recall_score\n\nprecision_score(y_train_5, y_train_pred) # == 3530 / (687 + 3530)"]}],"metadata":{"org":null,"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.5.2"}},"nbformat":4,"nbformat_minor":5}