From 6f1101989199775a6567838d73abc1158b561214 Mon Sep 17 00:00:00 2001
From: shubham76
Date: Tue, 27 Mar 2018 00:36:46 +0530
Subject: [PATCH] Added Twitter analysis files

---
 Business News Analysis/requirements.txt    |   2 -
 Business News Analysis/stocknews/ReadME.md |   1 +
 .../stocknews/spiders/urlGenerator.py      |  24 ++++
 .../twt_sentiment_analyser.py              | 128 +++++++++++++++++++
 Twitter Analysis                           |   1 +
 requirements.txt                           |   5 +
 6 files changed, 159 insertions(+), 2 deletions(-)
 delete mode 100644 Business News Analysis/requirements.txt
 create mode 100644 Business News Analysis/stocknews/stocknews/spiders/urlGenerator.py
 create mode 100644 Business News Analysis/twt_sentiment_analyser.py
 create mode 160000 Twitter Analysis
 create mode 100644 requirements.txt

diff --git a/Business News Analysis/requirements.txt b/Business News Analysis/requirements.txt
deleted file mode 100644
index f040444..0000000
--- a/Business News Analysis/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-scrapy
-pandas
diff --git a/Business News Analysis/stocknews/ReadME.md b/Business News Analysis/stocknews/ReadME.md
index 6b27a19..7ae532a 100644
--- a/Business News Analysis/stocknews/ReadME.md
+++ b/Business News Analysis/stocknews/ReadME.md
@@ -7,4 +7,5 @@ To run on local machine:
 `scrapy crawl livemint_spider`
 
 To generate a CSV file:
+
 `scrapy crawl livemint_spider -o file.csv -t csv`
\ No newline at end of file
diff --git a/Business News Analysis/stocknews/stocknews/spiders/urlGenerator.py b/Business News Analysis/stocknews/stocknews/spiders/urlGenerator.py
new file mode 100644
index 0000000..cbbc0b6
--- /dev/null
+++ b/Business News Analysis/stocknews/stocknews/spiders/urlGenerator.py
@@ -0,0 +1,24 @@
+import scrapy
+import pandas as pd
+import re
+
+from stocknews.items import StocknewsItem
+
+class urlGenerator(scrapy.Spider):
+
+    # spider name
+    name = "urlGenerator"
+    # allowed domains
+    allowed_domains = ["livemint.com"]
+    # start URLs come from a previously scraped CSV of livemint article links
+    start_urls = []
+    file_name = '../../livemint_data_3.csv'
+    df = pd.read_csv(file_name, encoding='iso-8859-1')
+    start_urls = df['href'].tolist()
+
+    base_url = "https://www.livemint.com/Query/lZy3FU0kP9Cso5deYypuDI/people.html?facet=subSection&page="
+
+    def parse(self, response):
+        print(self.start_urls)
+
+
\ No newline at end of file
diff --git a/Business News Analysis/twt_sentiment_analyser.py b/Business News Analysis/twt_sentiment_analyser.py
new file mode 100644
index 0000000..396fe10
--- /dev/null
+++ b/Business News Analysis/twt_sentiment_analyser.py
@@ -0,0 +1,128 @@
+import re
+import tweepy
+from tweepy import OAuthHandler
+from textblob import TextBlob
+
+class TwitterClient(object):
+    '''
+    Generic Twitter class for sentiment analysis.
+    '''
+    def __init__(self):
+        '''
+        Class constructor or initialization method.
+        '''
+        # keys and tokens from the Twitter Dev Console (fill in your own)
+        consumer_key = 'XXXXXXXXXXXXXXXXXXXXXXXXX'
+        consumer_secret = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
+        access_token = 'XXXXXXXXX-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
+        access_token_secret = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
+
+        # attempt authentication
+        try:
+            # create OAuthHandler object
+            self.auth = OAuthHandler(consumer_key, consumer_secret)
+            # set access token and secret
+            self.auth.set_access_token(access_token, access_token_secret)
+            # create tweepy API object to fetch tweets
+            self.api = tweepy.API(self.auth)
+        except:
+            print("Error: Authentication Failed")
+
+    def clean_tweet(self, tweet):
+        '''
+        Utility function to clean tweet text by removing links, special
+        characters, and mentions using simple regex statements.
+        '''
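+        # the regex below strips @mentions, URL-like tokens, and any
+        # character that is not a letter, digit, space, or tab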
+        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
+
+    def get_tweet_sentiment(self, tweet):
+        '''
+        Utility function to classify the sentiment of a tweet
+        using TextBlob's sentiment method.
+        '''
+        # create TextBlob object of passed tweet text
+        analysis = TextBlob(self.clean_tweet(tweet))
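+        # TextBlob polarity is a float in [-1.0, 1.0]; strictly positive
+        # scores count as positive, exactly zero as neutral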
+        if analysis.sentiment.polarity > 0:
+            return 'positive'
+        elif analysis.sentiment.polarity == 0:
+            return 'neutral'
+        else:
+            return 'negative'
+
+    def get_tweets(self, query, count = 10):
+        '''
+        Main function to fetch tweets and parse them.
+        '''
+        # empty list to store parsed tweets
+        tweets = []
+
+        try:
+            # call twitter api to fetch tweets
+            fetched_tweets = self.api.search(q = query, count = count)
+
+            # parsing tweets one by one
+            for tweet in fetched_tweets:
+                # empty dictionary to store required params of a tweet
+                parsed_tweet = {}
+
+                # saving text of tweet
+                parsed_tweet['text'] = tweet.text
+                # saving sentiment of tweet
+                parsed_tweet['sentiment'] = self.get_tweet_sentiment(tweet.text)
+
+                # appending parsed tweet to tweets list
+                if tweet.retweet_count > 0:
+                    # if tweet has retweets, ensure that it is appended only once
+                    if parsed_tweet not in tweets:
+                        tweets.append(parsed_tweet)
+                else:
+                    tweets.append(parsed_tweet)
+
+            # return parsed tweets
+            return tweets
+
+        except tweepy.TweepError as e:
+            # print the error and return whatever was collected so far,
+            # so that callers never receive None
+            print("Error : " + str(e))
+            return tweets
+
+def main():
+    # creating object of TwitterClient Class
+    api = TwitterClient()
+    # calling function to get tweets
+    tweets = api.get_tweets(query = 'TCS Tata', count = 1000)
+
+    # stop early if nothing was fetched, to avoid dividing by zero below
+    if not tweets:
+        return
+
+    # picking positive tweets from tweets
+    ptweets = [tweet for tweet in tweets if tweet['sentiment'] == 'positive']
+    # percentage of positive tweets
+    positive_percentage = 100 * len(ptweets) / len(tweets)
+    print("Positive tweets percentage: %d" % positive_percentage)
+
+    # picking negative tweets from tweets
+    ntweets = [tweet for tweet in tweets if tweet['sentiment'] == 'negative']
+    # percentage of negative tweets
+    negative_percentage = 100 * len(ntweets) / len(tweets)
+    print("Negative tweets percentage: %d" % negative_percentage)
+
+    # printing first 10 positive tweets
+    print("\n\nPositive tweets:")
+    for tweet in ptweets[:10]:
+        print(tweet['text'])
+
+    # printing first 10 negative tweets
+    print("\n\nNegative tweets:")
+    for tweet in ntweets[:10]:
+        print(tweet['text'])
+
+if __name__ == "__main__":
+    # calling main function
+    main()
\ No newline at end of file
diff --git a/Twitter Analysis b/Twitter Analysis
new file mode 160000
index 0000000..b9d5a1d
--- /dev/null
+++ b/Twitter Analysis
@@ -0,0 +1 @@
+Subproject commit b9d5a1d21256832ceba797ab8b8b421e8572618a
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3823a32
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+scrapy
+pandas
+tweepy
+textblob
+pyquery