diff --git a/app.py b/app.py index c48f6d9..9d6559a 100644 --- a/app.py +++ b/app.py @@ -1,9 +1,64 @@ +import pickle +from tensorflow.keras.models import load_model +from tensorflow.keras.preprocessing import sequence + from flask import Flask, render_template, request +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer + app = Flask(__name__) + +model = None +tokenizer = None +loaded = False # Flag to ensure loading happens only once + +def load_keras_model(): + global model + try: + model = load_model('models/uci_sentimentanalysis.h5') + print("Model loaded successfully.") + except Exception as e: + print(f"Error loading model: {e}") + +def load_tokenizer(): + global tokenizer + try: + with open('models/tokenizer.pickle', 'rb') as handle: + tokenizer = pickle.load(handle) + print("Tokenizer loaded successfully.") + except Exception as e: + print(f"Error loading tokenizer: {e}") + +# Befor_first_request was not working with the keras version i have isntalled +@app.before_request +def before_request(): + global loaded + if not loaded: + load_keras_model() + load_tokenizer() + loaded = True # Set the flag to True to prevent reloading + +def sentiment_analysis(input_text): + if tokenizer is None or model is None: + return "Model or tokenizer not loaded." + + user_sequences = tokenizer.texts_to_sequences([input_text]) + user_sequences_matrix = sequence.pad_sequences(user_sequences, maxlen=1225) + prediction = model.predict(user_sequences_matrix) + + return round(float(prediction[0][0]), 2) + @app.route("/", methods=["GET", "POST"]) def index(): - # TODO: Write the code that calls the sentiment analysis functions here. - # hint: use request.method == "POST" - return render_template('form.html') + sentiment = {} + if request.method == "POST": + text = request.form.get("user_text") # Get user input + if text: + analyzer = SentimentIntensityAnalyzer() + sentiment = analyzer.polarity_scores(text) # VADER analysis + sentiment["custom model positive"] = sentiment_analysis(text) # Custom model analysis + + return render_template('form.html', sentiment=sentiment) + if __name__ == "__main__": - app.run() + app.run(debug=True) + diff --git a/models/tokenizer.pickle b/models/tokenizer.pickle new file mode 100644 index 0000000..bcc876a Binary files /dev/null and b/models/tokenizer.pickle differ diff --git a/models/uci_sentimentanalysis.h5 b/models/uci_sentimentanalysis.h5 new file mode 100644 index 0000000..9d39ff4 Binary files /dev/null and b/models/uci_sentimentanalysis.h5 differ diff --git a/notebooks/US_02.ipynb b/notebooks/US_02.ipynb new file mode 100644 index 0000000..9fd26f0 --- /dev/null +++ b/notebooks/US_02.ipynb @@ -0,0 +1,577 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wDR9jCIVpck3", + "outputId": "f2d5cb78-b1cc-49b0-ed4f-8ca8f5d349c6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "100 84188 0 84188 0 0 143k 0 --:--:-- --:--:-- --:--:-- 143k\n", + "Archive: uci-labelled-sentences.zip\n", + " creating: sentiment labelled sentences/\n", + " inflating: sentiment labelled sentences/.DS_Store \n", + " 
creating: __MACOSX/\n", + " creating: __MACOSX/sentiment labelled sentences/\n", + " inflating: __MACOSX/sentiment labelled sentences/._.DS_Store \n", + " inflating: sentiment labelled sentences/amazon_cells_labelled.txt \n", + " inflating: sentiment labelled sentences/imdb_labelled.txt \n", + " inflating: __MACOSX/sentiment labelled sentences/._imdb_labelled.txt \n", + " inflating: sentiment labelled sentences/readme.txt \n", + " inflating: __MACOSX/sentiment labelled sentences/._readme.txt \n", + " inflating: sentiment labelled sentences/yelp_labelled.txt \n", + " inflating: __MACOSX/._sentiment labelled sentences \n" + ] + } + ], + "source": [ + "# download dataset from the UCI website\n", + "!curl -o uci-labelled-sentences.zip https://archive.ics.uci.edu/static/public/331/sentiment+labelled+sentences.zip\n", + "\n", + "# unzip dataset in Colab\n", + "!unzip uci-labelled-sentences.zip" + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import pickle\n", + "from sklearn.model_selection import train_test_split\n", + "from tensorflow.keras.preprocessing.text import Tokenizer\n", + "from keras.preprocessing.sequence import pad_sequences\n", + "from keras.models import Sequential\n", + "from keras.layers import Dense, Embedding, LSTM\n", + "from keras.callbacks import EarlyStopping" + ], + "metadata": { + "id": "hl25OnZ7pmAz" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# download dataset from the UCI website\n", + "!curl -o uci-labelled-sentences.zip https://archive.ics.uci.edu/static/public/331/sentiment+labelled+sentences.zip\n", + "\n", + "# unzip dataset in Colab\n", + "!unzip uci-labelled-sentences.zip" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ObzFyA7diey3", + "outputId": "283c3183-a803-4731-eebf-d0b7e9ba817f" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 84188 0 84188 0 0 646k 0 --:--:-- --:--:-- --:--:-- 647k\n", + "Archive: uci-labelled-sentences.zip\n", + " creating: sentiment labelled sentences/\n", + " inflating: sentiment labelled sentences/.DS_Store \n", + " creating: __MACOSX/\n", + " creating: __MACOSX/sentiment labelled sentences/\n", + " inflating: __MACOSX/sentiment labelled sentences/._.DS_Store \n", + " inflating: sentiment labelled sentences/amazon_cells_labelled.txt \n", + " inflating: sentiment labelled sentences/imdb_labelled.txt \n", + " inflating: __MACOSX/sentiment labelled sentences/._imdb_labelled.txt \n", + " inflating: sentiment labelled sentences/readme.txt \n", + " inflating: __MACOSX/sentiment labelled sentences/._readme.txt \n", + " inflating: sentiment labelled sentences/yelp_labelled.txt \n", + " inflating: __MACOSX/._sentiment labelled sentences \n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_list = []\n", + "\n", + "# Yelp\n", + "df_yelp = pd.read_csv('sentiment labelled sentences/yelp_labelled.txt', names=['sentence', 'label'], sep='\\t')\n", + "df_yelp['source'] = 'yelp'\n", + "df_list.append(df_yelp)\n", + "\n", + "# Amazon\n", + "df_amazon = pd.read_csv('sentiment labelled sentences/amazon_cells_labelled.txt', names=['sentence', 'label'], sep='\\t')\n", + "df_amazon['source'] = 'amazon'\n", + "df_list.append(df_amazon)\n", + "\n", 
+ "# IMDB\n", + "df_imdb = pd.read_csv('sentiment labelled sentences/imdb_labelled.txt', names=['sentence', 'label'], sep='\\t')\n", + "df_imdb['source'] = 'imdb'\n", + "df_list.append(df_imdb)\n", + "\n", + "# Concatenate the dataframes\n", + "df = pd.concat(df_list)\n", + "\n", + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "ym148xwIild2", + "outputId": "271905f5-4fa5-4f25-c7d6-6ea16746bc6a" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " sentence label source\n", + "0 Wow... Loved this place. 1 yelp\n", + "1 Crust is not good. 0 yelp\n", + "2 Not tasty and the texture was just nasty. 0 yelp\n", + "3 Stopped by during the late May bank holiday of... 1 yelp\n", + "4 The selection on the menu was great and so wer... 1 yelp" + ], + "text/html": [ + "\n", + "
(Colab interactive HTML table for df.head() removed; the same rows appear in the text/plain output above)
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 2748,\n \"fields\": [\n {\n \"column\": \"sentence\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2731,\n \"samples\": [\n \"Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!\",\n \"It was too predictable, even for a chick flick. \",\n \"The bose noise cancelling is amazing, which is very important for a NYC commuter.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"source\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"yelp\",\n \"amazon\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "source": [ + "max_features = 2000\n", + "tokenizer = Tokenizer(num_words=max_features, split=' ')\n", + "tokenizer.fit_on_texts(df['sentence'].values)\n", + "X = tokenizer.texts_to_sequences(df['sentence'].values)\n", + "X = pad_sequences(X)\n", + "y = df['label'].values" + ], + "metadata": { + "id": "F_qjw9IVi2Bs" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.12)\n" + ], + "metadata": { + "id": "q60NFtcri5BJ" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def create_model():\n", + " model = Sequential()\n", + " model.add(Embedding(max_features, 64, input_length=X.shape[1]))\n", + " model.add(LSTM(16))\n", + " model.add(Dense(1, activation='sigmoid'))\n", + " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", + " return model\n", + "\n", + "model = create_model()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "V2GCbbHSi8K_", + "outputId": "522a67d2-288b-471d-aef6-fcde5c882d32" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/embedding.py:90: UserWarning: Argument `input_length` is deprecated. 
Just remove it.\n", + " warnings.warn(\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.fit(X_train, y_train, epochs=6, batch_size=16, validation_data=(X_test, y_test), callbacks = [EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=2, verbose=1)])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PePtblNTi-9l", + "outputId": "5782e609-6492-4709-d160-288eb1ae3c60" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1/6\n", + "\u001b[1m152/152\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m77s\u001b[0m 488ms/step - accuracy: 0.5971 - loss: 0.6729 - val_accuracy: 0.7606 - val_loss: 0.5231\n", + "Epoch 2/6\n", + "\u001b[1m152/152\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m75s\u001b[0m 491ms/step - accuracy: 0.8629 - loss: 0.3859 - val_accuracy: 0.8061 - val_loss: 0.4332\n", + "Epoch 3/6\n", + "\u001b[1m152/152\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m79s\u001b[0m 470ms/step - accuracy: 0.9335 - loss: 0.2057 - val_accuracy: 0.8121 - val_loss: 0.4459\n", + "Epoch 4/6\n", + "\u001b[1m152/152\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m72s\u001b[0m 472ms/step - accuracy: 0.9587 - loss: 0.1486 - val_accuracy: 0.8333 - val_loss: 0.4730\n", + "Epoch 5/6\n", + "\u001b[1m152/152\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m82s\u001b[0m 472ms/step - accuracy: 0.9692 - loss: 0.1114 - val_accuracy: 0.8121 - val_loss: 0.5696\n", + "Epoch 6/6\n", + "\u001b[1m152/152\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m82s\u001b[0m 472ms/step - accuracy: 0.9772 - loss: 0.0743 - val_accuracy: 0.8333 - val_loss: 0.5894\n", + "Epoch 6: early stopping\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.save(\"uci_sentimentanalysis.h5\")\n", + "\n", + "with open('tokenizer.pickle', 'wb') as handle:\n", + " pickle.dump(tokenizer, handle, protocol=pickle.DEFAULT_PROTOCOL)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VaMeW7CYjBPj", + "outputId": "013e87ee-8f9c-4fb3-bcab-93679a999334" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. 
\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a55b9cf..04b3e6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,151 @@ -Flask==2.2.5 -Flask-SQLAlchemy==2.5.1 -numpy==1.23.5 -keras==2.13.1 -sqlalchemy==1.4.20 -psycopg2-binary==2.9.1 -pytest==7.4.2 -scikit-learn==1.2.2 -tensorflow==2.13.0 +absl-py==2.1.0 +anyio==4.6.0 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +astunparse==1.6.3 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.1.0 +blinker==1.8.2 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.3.2 +click==8.1.7 +colorama==0.4.6 +comm==0.2.2 +contourpy==1.3.0 +cycler==0.12.1 +debugpy==1.8.6 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.1.0 +fastjsonschema==2.20.0 +filelock==3.13.1 +Flask==3.0.3 +flatbuffers==24.3.25 +fonttools==4.54.1 +fqdn==1.5.1 +fsspec==2024.2.0 +gast==0.6.0 +gitdb==4.0.11 +GitPython==3.1.43 +google-pasta==0.2.0 +grpcio==1.67.1 +h11==0.14.0 +h5py==3.12.1 +httpcore==1.0.5 +httpx==0.27.2 +idna==3.10 +ipykernel==6.29.5 +ipython==8.27.0 +isoduration==20.11.0 +itsdangerous==2.2.0 +jedi==0.19.1 +Jinja2==3.1.4 +joblib==1.4.2 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter-server-mathjax==0.2.6 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_git==0.50.1 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +keras==3.6.0 +kiwisolver==1.4.7 +libclang==18.1.1 +Markdown==3.7 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.9.2 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.0.2 +ml-dtypes==0.4.1 +mpmath==1.3.0 +namex==0.0.8 +nbclient==0.10.0 +nbconvert==7.16.4 +nbdime==4.0.2 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.2.1 +notebook_shim==0.2.4 +numpy==2.0.2 +opt_einsum==3.4.0 +optree==0.13.0 +overrides==7.7.0 +packaging==24.1 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +pillow==10.4.0 +platformdirs==4.3.6 +plotly==5.24.1 +prometheus_client==0.21.0 +prompt_toolkit==3.0.48 +protobuf==5.28.3 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.18.0 +pyparsing==3.1.4 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +pytz==2024.2 +PyYAML==6.0.2 +pyzmq==26.2.0 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rich==13.9.4 +rpds-py==0.20.0 +scikit-learn==1.5.2 +scipy==1.14.1 +seaborn==0.13.2 +Send2Trash==1.8.3 +setuptools==75.1.0 +six==1.16.0 +smmap==5.0.1 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +sympy==1.12 +tenacity==9.0.0 +tensorboard==2.18.0 +tensorboard-data-server==0.7.2 +tensorflow==2.18.0 +termcolor==2.5.0 +terminado==0.18.1 +threadpoolctl==3.5.0 +tinycss2==1.3.0 +torch==2.4.1 +tornado==6.4.1 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20240906 +typing_extensions==4.9.0 +tzdata==2024.2 +uri-template==1.3.0 +urllib3==2.2.3 vaderSentiment==3.3.2 -gunicorn==19.3.0 \ No newline at end of file +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +Werkzeug==3.1.2 +wheel==0.45.0 +wrapt==1.16.0 diff --git a/templates/form.html b/templates/form.html index f1890b6..34ce15b 100644 --- a/templates/form.html +++ b/templates/form.html @@ -1,12 +1,38 @@ + + + + + Sentiment Analysis -
-<html>
-  <head>
-    <title>Sentiment Analysis</title>
-  </head>
-  <body>
-    <form method="POST">
-      <textarea name="user_text"></textarea>
-      <input type="submit">
-    </form>
-  </body>
-</html>
+</head>
+<body>
+    <h1>SentimentScope Analyzer</h1>
+    <form method="POST">
+        <textarea name="user_text"></textarea>
+        <input type="submit">
+    </form>
+    {% if sentiment %}
+        <h2>Sentiment Analysis Results:</h2>
+        <p>Positive: {{ sentiment['pos'] * 100 }}%</p>
+        <p>Neutral: {{ sentiment['neu'] * 100 }}%</p>
+        <p>Negative: {{ sentiment['neg'] * 100 }}%</p>
+        <p>Compound: {{ sentiment['compound'] * 100 }}%</p>
+        <p>Custom Keras model: {{ sentiment['custom model positive'] }}</p>
+    {% endif %}
+</body>
+</html>
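
A minimal smoke-test sketch for the new POST flow on `/`, assuming `app.py` above is importable as `app` and the files under `models/` are present. The test name and the sample input are illustrative only, and pytest itself is not pinned in the updated requirements.txt:

```python
# Hypothetical smoke test for the new "/" route; not part of this diff.
# Assumes app.py is importable as `app` and models/ contains the .h5 model
# and tokenizer.pickle so the before_request hook can load them.
from app import app


def test_index_post_renders_sentiment():
    client = app.test_client()  # Flask's built-in test client
    response = client.post("/", data={"user_text": "I loved this place!"})
    assert response.status_code == 200
    # Labels rendered by templates/form.html when `sentiment` is populated
    assert b"Positive:" in response.data
    assert b"Custom Keras model:" in response.data
```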