diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/.ipynb_checkpoints/NN-checkpoint.ipynb b/.ipynb_checkpoints/NN-checkpoint.ipynb deleted file mode 100644 index b286466..0000000 --- a/.ipynb_checkpoints/NN-checkpoint.ipynb +++ /dev/null @@ -1,1193 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "import string \n", - "import unicodedata\n", - "import sys\n", - "import collections\n", - "import random\n", - "import math\n", - "import os\n", - "from collections import Counter\n", - "from ast import literal_eval\n", - "import regex as re\n", - "import pickle\n", - "from functools import reduce\n", - "from datetime import datetime \n", - "\n", - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "\n", - "import xgboost as xgb\n", - "import lightgbm as lgbm\n", - "from IPython.display import display\n", - "\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.naive_bayes import BernoulliNB\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "\n", - "from nltk.corpus import stopwords\n", - "from nltk import word_tokenize\n", - "from nltk.stem import PorterStemmer\n", - "from nltk.stem.wordnet import WordNetLemmatizer\n", - "\n", - "import tensorflow as tf\n", - "from keras.models import Sequential, Model, load_model\n", - "from keras.layers import ( Dense, Conv1D, Activation, MaxPool1D, \n", - " Embedding, Flatten, Reshape, concatenate, \n", - " Input, Dropout, LSTM, AveragePooling1D, Masking )\n", - "from keras import optimizers\n", - "from keras import backend as K\n", - "from keras.callbacks import ModelCheckpoint, EarlyStopping\n", - "import h5py as h5py\n", - "\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DifferencedatenextClosenextDayprevCloseprevDaytext
0-1.302011-06-08376.552011-06-09377.852011-06-07Airtel commences 3G services in J&K
15.352011-06-13379.302011-06-14373.952011-06-10Airtel dances to African tune, sees more 3G li...
23.802011-06-16380.552011-06-17376.752011-06-15TCIL may approach company law board against Bh...
320.202011-06-17389.852011-06-20369.652011-06-16Malkani bullish on Bharti Airtel
420.202011-06-17389.852011-06-20369.652011-06-16Hold Bharti Airtel: Angel Broking
\n", - "
" - ], - "text/plain": [ - " Difference date nextClose nextDay prevClose prevDay \\\n", - "0 -1.30 2011-06-08 376.55 2011-06-09 377.85 2011-06-07 \n", - "1 5.35 2011-06-13 379.30 2011-06-14 373.95 2011-06-10 \n", - "2 3.80 2011-06-16 380.55 2011-06-17 376.75 2011-06-15 \n", - "3 20.20 2011-06-17 389.85 2011-06-20 369.65 2011-06-16 \n", - "4 20.20 2011-06-17 389.85 2011-06-20 369.65 2011-06-16 \n", - "\n", - " text \n", - "0 Airtel commences 3G services in J&K \n", - "1 Airtel dances to African tune, sees more 3G li... \n", - "2 TCIL may approach company law board against Bh... \n", - "3 Malkani bullish on Bharti Airtel \n", - "4 Hold Bharti Airtel: Angel Broking " - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.DataFrame()\n", - "\n", - "for filename in os.listdir(os.path.join(os.getcwd(), 'datasets')):\n", - " if filename[-3:] == 'csv':\n", - " df = df.append(pd.read_csv(os.path.join(os.getcwd(), 'datasets', filename), sep='|'), ignore_index=True) \n", - " \n", - "# display(df.head())\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DifferencenextCloseprevClose
count30045.00000030045.00000030045.000000
mean-2.8320621089.1469761091.979038
std91.934622775.223777778.694248
min-2648.650000162.050000155.900000
25%-12.100000405.950000405.850000
50%0.950000938.000000934.250000
75%14.9000001436.4000001445.050000
max303.0500004365.9000004359.850000
\n", - "
" - ], - "text/plain": [ - " Difference nextClose prevClose\n", - "count 30045.000000 30045.000000 30045.000000\n", - "mean -2.832062 1089.146976 1091.979038\n", - "std 91.934622 775.223777 778.694248\n", - "min -2648.650000 162.050000 155.900000\n", - "25% -12.100000 405.950000 405.850000\n", - "50% 0.950000 938.000000 934.250000\n", - "75% 14.900000 1436.400000 1445.050000\n", - "max 303.050000 4365.900000 4359.850000" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [], - "source": [ - "stops = stopwords.words('english')\n", - "porter = PorterStemmer()\n", - "lemma = WordNetLemmatizer()\n", - "\n", - "tbl = dict.fromkeys(i for i in range(sys.maxunicode)\n", - " if unicodedata.category(chr(i)).startswith('P') or i == 36 or i == ord('`'))\n", - "\n", - "\n", - "def remove_punctuation(text):\n", - " '''\n", - " From https://stackoverflow.com/questions/11066400/remove-punctuation-from-unicode-formatted-strings\n", - "\n", - " '''\n", - " return text.translate(tbl)\n", - "\n", - "\n", - "def remove_stopwords(text, ret_format='str'):\n", - " tokens = filter(lambda x: x not in stops, map(porter.stem, word_tokenize(text)))\n", - " if ret_format == 'list':\n", - " return list(tokens)\n", - " elif ret_format == 'str':\n", - " return ' '.join(tokens)\n", - " else:\n", - " raise Exception('Invalid format')\n", - "\n", - "\n", - "def restore_arr(a):\n", - " '''\n", - " Converts strings to python list\n", - " \n", - " params:\n", - " a: String -> Input string to be converted to array\n", - " return:\n", - " list\n", - " \n", - " Usage with pandas:\n", - " train_mod = pd.read_csv('modified_train.csv', converters={'description_norm': restore_arr})\n", - " '''\n", - " return [x.replace(\"'\", \"\") for x in a[:-1][1:].split(', ')]\n", - " \n", - " \n", - "\n", - "def restore_int_arr(a):\n", - " return [int(x.replace(\"'\", \"\")) for x in a[:-1][1:].split(', ')]\n", - " \n", - "\n", - "def restore_float_arr(a):\n", - " ret = [float(x.replace(\"'\", \"\")) for x in a[:-1][1:].split(', ')]\n", - " if len(ret) == 1:\n", - " return ret[0]\n", - " else:\n", - " return ret\n", - " \n", - "def lemmatize(a):\n", - " return [lemma.lemmatize(x) for x in a.split()]\n", - "\n", - "def remove_numbers(a):\n", - " ans = []\n", - " for s in a.split():\n", - " try:\n", - " g = int(s)\n", - " except ValueError:\n", - " ans.append(s)\n", - " \n", - " return ' '.join(ans)" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [], - "source": [ - "df['text_norm'] = df['text']\\\n", - " .apply(str.lower)\\\n", - " .apply(remove_punctuation)\\\n", - " .apply(remove_numbers)\\\n", - " .apply(remove_stopwords)\\\n", - " .apply(lambda x: x.split())" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [], - "source": [ - "df['date'] = df['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "df['target'] = (df['Difference'] > 0).astype('int')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DifferencedatenextClosenextDayprevCloseprevDaytexttext_normtarget
2122-1.22017-01-02304.452017-01-03305.652016-12-30GSM telcos gain over 10mn subscribers in Nov; ...[gsm, telco, gain, 10mn, subscrib, nov, idea, ...0
21232.92017-01-03313.902017-01-04311.002017-01-02Tulsian's take on Bharat Financial, oil & gas ...[tulsian, take, bharat, financi, oil, ga, co, ...1
21242.92017-01-03313.902017-01-04311.002017-01-02Tariff War:Airtel offers 3GB free 4G mthly dat...[tariff, warairtel, offer, 3gb, free, 4g, mthl...1
21252.92017-01-03313.902017-01-04311.002017-01-02Airtel in discussions to buy Telenor's Indian ...[airtel, discuss, buy, telenor, indian, busi]1
21262.92017-01-03313.902017-01-04311.002017-01-02Maximum call drop recorded on Aircel network i...[maximum, call, drop, record, aircel, network,...1
\n", - "
" - ], - "text/plain": [ - " Difference date nextClose nextDay prevClose prevDay \\\n", - "2122 -1.2 2017-01-02 304.45 2017-01-03 305.65 2016-12-30 \n", - "2123 2.9 2017-01-03 313.90 2017-01-04 311.00 2017-01-02 \n", - "2124 2.9 2017-01-03 313.90 2017-01-04 311.00 2017-01-02 \n", - "2125 2.9 2017-01-03 313.90 2017-01-04 311.00 2017-01-02 \n", - "2126 2.9 2017-01-03 313.90 2017-01-04 311.00 2017-01-02 \n", - "\n", - " text \\\n", - "2122 GSM telcos gain over 10mn subscribers in Nov; ... \n", - "2123 Tulsian's take on Bharat Financial, oil & gas ... \n", - "2124 Tariff War:Airtel offers 3GB free 4G mthly dat... \n", - "2125 Airtel in discussions to buy Telenor's Indian ... \n", - "2126 Maximum call drop recorded on Aircel network i... \n", - "\n", - " text_norm target \n", - "2122 [gsm, telco, gain, 10mn, subscrib, nov, idea, ... 0 \n", - "2123 [tulsian, take, bharat, financi, oil, ga, co, ... 1 \n", - "2124 [tariff, warairtel, offer, 3gb, free, 4g, mthl... 1 \n", - "2125 [airtel, discuss, buy, telenor, indian, busi] 1 \n", - "2126 [maximum, call, drop, record, aircel, network,... 1 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DifferencedatenextClosenextDayprevCloseprevDaytexttext_normtarget
0-1.302011-06-08376.552011-06-09377.852011-06-07Airtel commences 3G services in J&K[airtel, commenc, 3g, servic, jk]0
15.352011-06-13379.302011-06-14373.952011-06-10Airtel dances to African tune, sees more 3G li...[airtel, danc, african, tune, see, 3g, licens]1
23.802011-06-16380.552011-06-17376.752011-06-15TCIL may approach company law board against Bh...[tcil, may, approach, compani, law, board, bha...1
320.202011-06-17389.852011-06-20369.652011-06-16Malkani bullish on Bharti Airtel[malkani, bullish, bharti, airtel]1
420.202011-06-17389.852011-06-20369.652011-06-16Hold Bharti Airtel: Angel Broking[hold, bharti, airtel, angel, broke]1
\n", - "
" - ], - "text/plain": [ - " Difference date nextClose nextDay prevClose prevDay \\\n", - "0 -1.30 2011-06-08 376.55 2011-06-09 377.85 2011-06-07 \n", - "1 5.35 2011-06-13 379.30 2011-06-14 373.95 2011-06-10 \n", - "2 3.80 2011-06-16 380.55 2011-06-17 376.75 2011-06-15 \n", - "3 20.20 2011-06-17 389.85 2011-06-20 369.65 2011-06-16 \n", - "4 20.20 2011-06-17 389.85 2011-06-20 369.65 2011-06-16 \n", - "\n", - " text \\\n", - "0 Airtel commences 3G services in J&K \n", - "1 Airtel dances to African tune, sees more 3G li... \n", - "2 TCIL may approach company law board against Bh... \n", - "3 Malkani bullish on Bharti Airtel \n", - "4 Hold Bharti Airtel: Angel Broking \n", - "\n", - " text_norm target \n", - "0 [airtel, commenc, 3g, servic, jk] 0 \n", - "1 [airtel, danc, african, tune, see, 3g, licens] 1 \n", - "2 [tcil, may, approach, compani, law, board, bha... 1 \n", - "3 [malkani, bullish, bharti, airtel] 1 \n", - "4 [hold, bharti, airtel, angel, broke] 1 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DifferencenextCloseprevClosetarget
count2700.0000002700.0000002700.0000002700.000000
mean2.4968701181.9122781179.4154070.578519
std77.984853877.235840877.6239150.493888
min-1519.450000251.100000251.1000000.000000
25%-7.312500451.250000449.9500000.000000
50%3.250000920.625000919.7000001.000000
75%19.8500001693.9000001698.7000001.000000
max209.0500003412.9500003412.9500001.000000
\n", - "
" - ], - "text/plain": [ - " Difference nextClose prevClose target\n", - "count 2700.000000 2700.000000 2700.000000 2700.000000\n", - "mean 2.496870 1181.912278 1179.415407 0.578519\n", - "std 77.984853 877.235840 877.623915 0.493888\n", - "min -1519.450000 251.100000 251.100000 0.000000\n", - "25% -7.312500 451.250000 449.950000 0.000000\n", - "50% 3.250000 920.625000 919.700000 1.000000\n", - "75% 19.850000 1693.900000 1698.700000 1.000000\n", - "max 209.050000 3412.950000 3412.950000 1.000000" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DifferencedatenextClosenextDayprevCloseprevDaytexttext_normtarget
18943-0.952018-03-14311.852018-03-15312.802018-03-13Buy ICICI Bank, State Bank of India, Adani Ent...[buy, icici, bank, state, bank, india, adani, ...0
18944-0.952018-03-14311.852018-03-15312.802018-03-13News Highlights: Facebook bans far-right group...[news, highlight, facebook, ban, farright, gro...0
18945-0.952018-03-14311.852018-03-15312.802018-03-13Malaysia based IHH eyes YES Bank's Fortis stake[malaysia, base, ihh, eye, ye, bank, forti, st...0
18946-0.952018-03-14311.852018-03-15312.802018-03-13Nifty likely to remain rangebound between 10,3...[nifti, like, remain, rangebound, ye, bank, gi...0
18947-5.952018-03-15312.902018-03-16318.852018-03-14Buy Yes Bank, target Rs 408; bank well positio...[buy, ye, bank, target, rs, bank, well, posit,...0
\n", - "
" - ], - "text/plain": [ - " Difference date nextClose nextDay prevClose prevDay \\\n", - "18943 -0.95 2018-03-14 311.85 2018-03-15 312.80 2018-03-13 \n", - "18944 -0.95 2018-03-14 311.85 2018-03-15 312.80 2018-03-13 \n", - "18945 -0.95 2018-03-14 311.85 2018-03-15 312.80 2018-03-13 \n", - "18946 -0.95 2018-03-14 311.85 2018-03-15 312.80 2018-03-13 \n", - "18947 -5.95 2018-03-15 312.90 2018-03-16 318.85 2018-03-14 \n", - "\n", - " text \\\n", - "18943 Buy ICICI Bank, State Bank of India, Adani Ent... \n", - "18944 News Highlights: Facebook bans far-right group... \n", - "18945 Malaysia based IHH eyes YES Bank's Fortis stake \n", - "18946 Nifty likely to remain rangebound between 10,3... \n", - "18947 Buy Yes Bank, target Rs 408; bank well positio... \n", - "\n", - " text_norm target \n", - "18943 [buy, icici, bank, state, bank, india, adani, ... 0 \n", - "18944 [news, highlight, facebook, ban, farright, gro... 0 \n", - "18945 [malaysia, base, ihh, eye, ye, bank, forti, st... 0 \n", - "18946 [nifti, like, remain, rangebound, ye, bank, gi... 0 \n", - "18947 [buy, ye, bank, target, rs, bank, well, posit,... 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "last_date = datetime.strptime('2016 Dec 31', '%Y %b %d')\n", - "\n", - "# test = df[last_date < df['date']]\n", - "# train = df[last_date >= df['date']]\n", - "train = df\n", - "\n", - "display(test.head())\n", - "display(train.head())\n", - "display(test.describe())\n", - "display(test.tail())" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [], - "source": [ - "params = {\n", - " 'vocab_size': 2000,\n", - " 'PAD': 0,\n", - " 'UNK': 1,\n", - " 'maxlen': df['text_norm'].map(len).max(),\n", - " 'kernels': (3, 3, 3,),\n", - " 'num_filters': (2, 2, 2),\n", - " 'hidden_dims': 64,\n", - " 'batch_size': 32,\n", - " 'embedding_size': 64,\n", - " 'pool_size': 2,\n", - " 'threshold': 0.5,\n", - " 'LSTM_units': 64\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'10 most common words'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[('bank', 9174),\n", - " ('buy', 6843),\n", - " ('rs', 6040),\n", - " ('say', 2813),\n", - " ('sukhani', 2716),\n", - " ('tata', 2467),\n", - " ('infosi', 2400),\n", - " ('sell', 2399),\n", - " ('sbi', 2207),\n", - " ('icici', 2187)]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'Number of unique tokens'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "8521" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vocabulary = dict()\n", - "counter = Counter()\n", - "\n", - "for sent in train['text_norm']:\n", - " counter.update(sent)\n", - " \n", - "display(\"10 most common words\")\n", - "display(counter.most_common(10)) \n", - "display(\"Number of unique tokens\")\n", - "display(len(counter.keys()))" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Most common words + (PAD and UNK) [['PAD', 299096], ['UNK', 14734], ('bank', 9174), ('buy', 6843), ('rs', 6040), ('say', 2813), ('sukhani', 2716), ('tata', 2467)]\n", - "Dataset \n", - " 0 [20, 1, 729, 113, 730, 0, 0, 0, 0, 0, 0, 0, 0,...\n", - "1 [20, 1, 1, 1, 41, 729, 1, 0, 0, 0, 0, 0, 0, 0,...\n", - "2 [1, 12, 1404, 181, 1405, 160, 24, 20, 0, 0, 0,...\n", - "3 [1406, 128, 24, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0,...\n", - "4 [33, 24, 20, 131, 162, 0, 0, 0, 0, 0, 0, 0, 0,...\n", - "Name: description_vectors, dtype: object\n" - ] - } - ], - "source": [ - "unk_count = 0\n", - "pad_count = 0\n", - "\n", - "def convert_to_dict(arr, maxlen, dictionary, pad=True):\n", - " global unk_count, pad_count\n", - " \n", - " if pad:\n", - "# ret = np.full(shape=(maxlen, ), fill_value=params['PAD'])\n", - " ret = [params['PAD'] for x in range(maxlen)]\n", - " else:\n", - " ret = np.full(shape=(len(arr) + 1, ), fill_value=params['PAD'])\n", - " \n", - " for i, word in enumerate(arr):\n", - " if word in dictionary:\n", - " ret[i] = dictionary[word]\n", - " else:\n", - " ret[i] = params['UNK']\n", - " unk_count += 1\n", - " \n", - " pad_count += maxlen - len(arr)\n", - " return ret\n", - " \n", - "\n", - "def create_dataset(cnt, train):\n", - " global pad_count, unk_count\n", - " \n", - " count = [['PAD', -2], ['UNK', -1]]\n", - " count.extend(cnt.most_common(params['vocab_size'] - 2))\n", - " dictionary = dict()\n", - " for word, _ in count:\n", - " dictionary[word] = len(dictionary)\n", - " \n", - " reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n", - " series = train['text_norm'].apply(convert_to_dict, args=(params['maxlen'], dictionary))\n", - " count[0][-1] = pad_count\n", - " count[1][-1] = unk_count\n", - " return series, dictionary, reverse_dictionary, count\n", - "\n", - "train['description_vectors'], word_dict, word_rev_dict, count = create_dataset(counter, train)\n", - "print('Most common words + (PAD and UNK) ', count[:8])\n", - "print('Dataset','\\n', train['description_vectors'][:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(27040, 17) (3005, 17) (27040,)\n" - ] - } - ], - "source": [ - "desc = np.array([np.array(x) for x in train['description_vectors']])\n", - "\n", - "x_train, x_test, y_train, y_test = train_test_split(\n", - " desc, \n", - " train['Difference'], \n", - " test_size=0.1, \n", - " random_state=4)\n", - "\n", - "print(x_train.shape, x_test.shape, y_train.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": {}, - "outputs": [], - "source": [ - "convolutions = list()\n", - "\n", - "input_shape = (params['maxlen'], )\n", - "conv_input = Input(shape=input_shape)\n", - "\n", - "embed = Embedding(params['vocab_size'],\n", - " params['embedding_size'],\n", - " input_length=params['maxlen'],\n", - " name='Embedding'\n", - " )(conv_input)\n", - "\n", - "\n", - "embed = Dropout(0.5, name='Dropout_1')(embed)\n", - "\n", - "for i, size in enumerate(params['kernels']):\n", - " conv = Conv1D(filters=params['num_filters'][i], kernel_size=size, padding='same', activation='relu')(embed)\n", - " conv = MaxPool1D(pool_size=params['pool_size'])(conv)\n", - " conv = Flatten()(conv)\n", - " convolutions.append(conv)\n", - " \n", - "out = concatenate(convolutions)\n", - "out = Dropout(0.4, name='Dropout_2')(out)\n", - "out = Dense(params['hidden_dims'], activation='relu', name='Dense_hidden')(out)\n", - "out = Dense(1, activation='softmax', name='Dense_final')(out)\n", - "\n", - "model = Model(inputs=conv_input, outputs=out)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train on 27040 samples, validate on 3005 samples\n", - "Epoch 1/3\n", - "27040/27040 [==============================] - 11s - loss: 59.6068 - acc: 9.2456e-04 - val_loss: 74.4581 - val_acc: 3.3278e-04\n", - "Epoch 2/3\n", - "27040/27040 [==============================] - 10s - loss: 59.6068 - acc: 9.2456e-04 - val_loss: 74.4581 - val_acc: 3.3278e-04\n", - "Epoch 3/3\n", - "10656/27040 [==========>...................] - ETA: 5s - loss: 40.8227 - acc: 0.0012" - ] - } - ], - "source": [ - "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", - "model.fit(x_train, y_train, \n", - " batch_size=params['batch_size'], \n", - " validation_data=(x_test, y_test),\n", - " verbose=1, epochs=3, callbacks=[ModelCheckpoint('model_cnn_v1.keras', save_best_only=True)])" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'gensim'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mWord2Vec\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'gensim'" - ] - } - ], - "source": [ - "from gensim.models import Word2Vec" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", - "count_vect = TfidfVectorizer()\n", - "X = count_vect.fit_transform(train['text'])\n", - "X.toarray()\n", - "print (type(X))\n", - "\n", - "x_train, x_test, y_train, y_test = train_test_split(\n", - " X, \n", - " train['target'], \n", - " test_size=0.1, \n", - " random_state=4)" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0]\ttrain-error:0.462833\tvalid-error:0.478536\n", - "Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.\n", - "\n", - "Will train until valid-error hasn't improved in 50 rounds.\n", - "[20]\ttrain-error:0.421709\tvalid-error:0.47188\n", - "[40]\ttrain-error:0.40233\tvalid-error:0.462562\n", - "[60]\ttrain-error:0.38824\tvalid-error:0.463561\n", - "[80]\ttrain-error:0.376331\tvalid-error:0.464226\n", - "[100]\ttrain-error:0.370932\tvalid-error:0.464892\n", - "Stopping. Best iteration:\n", - "[69]\ttrain-error:0.383617\tvalid-error:0.461231\n", - "\n" - ] - } - ], - "source": [ - "d_train = xgb.DMatrix(x_train, label=y_train)\n", - "d_valid = xgb.DMatrix(x_test, label=y_test)\n", - "\n", - "xgb_params = {\n", - " 'eta': 0.12,\n", - " 'objective': 'binary:logistic',\n", - " 'eval_metric': 'error',\n", - " 'max-depth': 6,\n", - " 'gamma': 5,\n", - " 'subsample': 0.76,\n", - " 'colsample_bytree': 0.8\n", - "}\n", - "\n", - "watchlist = [(d_train, 'train'), (d_valid, 'valid')]\n", - "\n", - "xgb_model = xgb.train(xgb_params, d_train, 500, watchlist, verbose_eval=20, early_stopping_rounds=50)" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [], - "source": [ - "p = xgb_model.predict(d_train)\n", - "mask = p > 0.5\n", - "\n", - "p[mask] = 1\n", - "p[np.logical_not(mask)] = 0" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Lengths must match to compare", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mp\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(self, other, axis)\u001b[0m\n\u001b[1;32m 820\u001b[0m if (not lib.isscalar(lib.item_from_zerodim(other)) and\n\u001b[1;32m 821\u001b[0m len(self) != len(other)):\n\u001b[0;32m--> 822\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Lengths must match to compare'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 823\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 824\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mABCPeriodIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: Lengths must match to compare" - ] - } - ], - "source": [ - "p[p == y_train].shape[0]/p.shape[0]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/.ipynb_checkpoints/merge-checkpoint.ipynb b/.ipynb_checkpoints/merge-checkpoint.ipynb deleted file mode 100644 index f92e362..0000000 --- a/.ipynb_checkpoints/merge-checkpoint.ipynb +++ /dev/null @@ -1,181 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from collections import defaultdict\n", - "from datetime import datetime\n", - "from operator import itemgetter\n", - "import csv\n", - "import os\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'BPCL', 'INFY', 'LT', 'SBIN', 'M&M'}\n" - ] - } - ], - "source": [ - "dpath = os.path.join(os.getcwd(), 'datasets', 'EXTRA')\n", - "l = os.listdir(dpath)\n", - "names = set([x[24:-7] for x in l])\n", - "print(names)\n", - "d = {}\n", - "\n", - "for x in names:\n", - " d[x] = pd.DataFrame()\n", - " \n", - "for x in l:\n", - " df = pd.read_csv(os.path.join(dpath, x))\n", - " d[x[24:-7]] = d[x[24:-7]].append(df, ignore_index=True)\n", - "\n", - "for x in names:\n", - " d[x].to_csv(os.path.join(os.getcwd(), 'datasets/NSE', '{}.csv'.format(x)), index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "for f in names:\n", - " nseName = os.path.join(os.getcwd(), 'datasets/NSE', '{}.csv'.format(f))\n", - " mcName = os.path.join(os.getcwd(), 'datasets/MC', 'MC{}.csv'.format(f))\n", - " nse = pd.read_csv(nseName)\n", - " mc = pd.read_csv(mcName, sep='|')\n", - "\n", - " # print(nse)\n", - " # print(mc.head())\n", - " nse['Date'] = nse['Date'].apply(lambda x: datetime.strptime(x, '%d-%b-%Y'))\n", - " mc['date'] = mc['date'].apply(lambda x: x.strip()).apply(lambda x: datetime.strptime(x, '%d %b %Y'))\n", - " mc = mc.sort_values(by='date')\n", - " # print(nse.head())\n", - " # print(mc.head())\n", - " \n", - " mydick = defaultdict(list)\n", - " Date = [[row['Date'], row['Close Price']] for i, row in nse.iterrows()]\n", - "\n", - " for date, text in zip(mc['date'], mc['text']):\n", - " mydick['date'].append(date)\n", - " mydick['text'].append(text)\n", - " myDate = Date[:]\n", - " myDate.append([date, 0])\n", - " myDate.sort(key=itemgetter(0))\n", - "\n", - " ind = myDate.index([date, 0])\n", - " try:\n", - " prevDay = myDate[ind - 1]\n", - " except IndexError:\n", - " prevDay = myDate[ind + 1]\n", - " try:\n", - " nextDay = myDate[ind + 1]\n", - " except IndexError:\n", - " nextDay = myDate[ind - 1]\n", - " try:\n", - " if prevDay[0] == date:\n", - " prevDay = myDate[ind - 2]\n", - " if nextDay[0] == date:\n", - " nextDay = myDate[ind + 2]\n", - " except IndexError:\n", - " pass\n", - " mydick['prevDay'].append(prevDay[0])\n", - " mydick['prevClose'].append(prevDay[1])\n", - " mydick['nextDay'].append(nextDay[0])\n", - " mydick['nextClose'].append(nextDay[1])\n", - " mydick['Difference'].append(nextDay[1] - prevDay[1])\n", - " \n", - " \n", - " df = pd.DataFrame(mydick)\n", - " df.to_csv(os.path.join(os.getcwd(),'datasets/Merge/MERGE' + f + '.csv'), index=False, sep='|')\n", - "\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "mydick = defaultdict(list)\n", - "Date = [[row['Date'], row['Close Price']] for i, row in nse.iterrows()]\n", - "\n", - "for date, text in zip(mc['date'], mc['text']):\n", - " mydick['date'].append(date)\n", - " mydick['text'].append(text)\n", - " myDate = Date[:]\n", - " myDate.append([date, 0])\n", - " myDate.sort(key=itemgetter(0))\n", - "\n", - " ind = myDate.index([date, 0])\n", - " try:\n", - " prevDay = myDate[ind - 1]\n", - " except IndexError:\n", - " prevDay = myDate[ind + 1]\n", - " try:\n", - " nextDay = myDate[ind + 1]\n", - " except IndexError:\n", - " nextDay = myDate[ind - 1]\n", - " try:\n", - " if prevDay[0] == date:\n", - " prevDay = myDate[ind - 2]\n", - " if nextDay[0] == date:\n", - " nextDay = myDate[ind + 2]\n", - " except IndexError:\n", - " pass\n", - " mydick['prevDay'].append(prevDay[0])\n", - " mydick['prevClose'].append(prevDay[1])\n", - " mydick['nextDay'].append(nextDay[0])\n", - " mydick['nextClose'].append(nextDay[1])\n", - " mydick['Difference'].append(nextDay[1] - prevDay[1])\n", - "\n", - "# print(mydick['Difference'])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(mydick)\n", - "df.to_csv('Merge/MERGEAIRTEL.csv', index=False, sep='|')\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}