How to separate text from twitter streaming JSON responses and run analysis on text with python?

3.8k views Asked by At

I am trying to use the Twitter API to run sentiment analysis on tweet text. My problem is that I don't understand how to separate the text from each tweet and run the sentiment polarity analysis provided by the TextBlob library on it. Furthermore, I would like this to pull back only English tweets. The output is in JSON.

Here is the code to produce the tweets based on keywords (in this case "euro", "dollar", "loonie") and my lame attempt at storing the text and using the result in a variable:

from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
import re
import pandas as pd
import matplotlib.pyplot as plt


# User credentials needed to access the Twitter API ("xxxx" values are placeholders).
access_token = "xxxx"
access_token_secret = "xxxx"
consumer_key = "xxxx"
consumer_secret = "xxxx"


# A basic listener that just prints every received message to stdout.
class StdOutListener(StreamListener):

    # Called with each raw message from the stream; prints it unchanged.
    # NOTE(review): returning True presumably tells tweepy to keep the stream
    # open -- confirm against the tweepy StreamListener documentation.
    def on_data(self, data):
        print data
        return True

    # Called with an error status; prints it and returns nothing.
    def on_error(self, status):
        print status


# Script entry point: authenticate, start the filtered stream, then attempt to
# parse the collected tweets and compute a sentiment polarity.
if __name__ == '__main__':

    # Handle Twitter authentication and the connection to the Twitter Streaming API.
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)

    # Filter the Twitter stream to capture data by the keywords: 'euro', 'dollar', 'loonie'.
    stream.filter(track=['euro', 'dollar', 'loonie', ] )

    # NOTE(review): this binds the `stream.filter` method object itself (it is
    # not called here), so `tweets_data_path` does not hold a file path.
    tweets_data_path = stream.filter

    tweets_data = []
    # NOTE(review): open() receives the method object assigned above rather
    # than a path to saved tweets -- confirm where the streamed JSON is stored.
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            # Each line is parsed as one JSON document and collected.
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            # Any failure while parsing a line is silently skipped.
            continue
print len(tweets_data)

# NOTE(review): `tweets` and `TextBlob` are not defined or imported in the
# visible code -- confirm they exist elsewhere. Also, TextBlob is handed the
# whole mapped collection of texts here, not the text of a single tweet.
tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)
wiki = TextBlob(tweets['text'])
r = wiki.sentiment.polarity

print r

This is what the output looks like:

{"created_at":"Sun Jun 14 23:43:31 +0000 2015","id":610231121016524801,"id_str":"610231121016524801","text":"RT @amirulimannn: RM6 diperlukan utk tukar kpd 1Pound.\nRM3 diperlukan utk tukar kpd 1S'pore Dollar.\n\nGraf matawang jatuh. Tak sedih ke? htt\u2026","source":"\u003ca href=\"http://twitter.com/download/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":42642877,"id_str":"42642877","name":"Wny","screen_name":"waaannnyyy","location":"Dirgahayu Darul Makmur","url":null,"description":"Aku serba tiada, aku kekurangan.","protected":false,"verified":false,"followers_count":320,"friends_count":239,"listed_count":1,"favourites_count":4344,"statuses_count":34408,"created_at":"Tue May 26 15:10:28 +0000 2009","utc_offset":28800,"time_zone":"Kuala Lumpur","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/433201191825047553/PM76m-v2.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/433201191825047553/PM76m-v2.jpeg","profile_background_tile":true,"profile_link_color":"DD2E44","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/609402965795835904/mm6jjRRO_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/609402965795835904/mm6jjRRO_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/42642877/1415486321","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"c
reated_at":"Sat Jun 13 03:33:29 +0000 2015","id":609564219495706624,"id_str":"609564219495706624","text":"RM6 diperlukan utk tukar kpd 1Pound.\nRM3 diperlukan utk tukar kpd 1S'pore Dollar.\n\nGraf matawang jatuh. Tak sedih ke? http://t.co/dum4skb6uK","source":"\u003ca href=\"http://twitter.com/download/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":481856658,"id_str":"481856658","name":"seorang iman","screen_name":"amirulimannn","location":"+06MY","url":"http://instagram.com/amirulimannn","description":"I wanna drown myself in a bottle of her perfume","protected":false,"verified":false,"followers_count":723,"friends_count":834,"listed_count":2,"favourites_count":4810,"statuses_count":50981,"created_at":"Fri Feb 03 07:49:55 +0000 2012","utc_offset":28800,"time_zone":"Kuala Lumpur","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"AD0A20","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/378800000139426816/61DHBnYy.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/378800000139426816/61DHBnYy.jpeg","profile_background_tile":false,"profile_link_color":"E36009","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"24210E","profile_text_color":"89B5A2","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/592744790283911169/dW7S73WA_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/592744790283911169/dW7S73WA_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/481856658/1428379855","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"con
tributors":null,"retweet_count":1321,"favorite_count":229,"entities":{"hashtags":[],"trends":[],"urls":[],"user_mentions":[],"symbols":[],"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[118,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[118,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"in"},"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"trends":[],"urls":[],"user_mentions":[{"screen_name":"amirulimannn","name":"seorang 
iman","id":481856658,"id_str":"481856658","indices":[3,16]}],"symbols":[],"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[139,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}},"source_status_id":609564219495706624,"source_status_id_str":"609564219495706624"}]},"extended_entities":{"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[139,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}},"source_status_id":609564219495706624,"source_status_id_str":"609564219495706624"}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"in","timestamp_ms":"1434325411453"}

1

There is 1 answer

5
Leb On BEST ANSWER
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json

# User credentials needed to access the Twitter API (left blank here).
access_token = ''
access_token_secret = ''
consumer_key = ''
consumer_secret = ''


# This is a basic listener that prints only the text of received tweets to stdout.
class StdOutListener(StreamListener):
    """Stream listener that extracts and prints the ``text`` of each tweet."""

    def on_data(self, data):
        """Print the text of one raw streaming message, if it has any.

        Args:
            data: One raw JSON message received from the stream.

        Returns:
            True in every case, whether or not anything was printed.
        """
        json_load = json.loads(data)
        # Not every streaming message is a tweet: notices such as rate-limit
        # or deletion messages carry no 'text' key, so indexing with
        # json_load['text'] would raise KeyError. Skip those messages.
        texts = json_load.get('text')
        if texts is None:
            return True
        # Encode to UTF-8 bytes so printing does not fail with a charmap
        # error on consoles that cannot represent every character.
        coded = texts.encode('utf-8')
        # On Python 3, str() of a bytes object looks like "b'...'"; the slice
        # below drops the leading "b'" and the trailing quote.
        s = str(coded)
        print(s[2:-1])
        return True

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)

# Authenticate with Twitter and attach the listener to a new stream.
listener = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, listener)

# Filter the stream by the tracked keywords, restricted to languages=['en'].
keywords = ['euro', 'dollar', 'loonie']
stream.filter(track=keywords, languages=['en'])

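For your original question about the JSON: you can load the data as it streams using json.loads(). The rest of the code (encoding to UTF-8 and converting to str) is there so you don't get a charmap error when you're extracting the data from Twitter into Python. The reason for s[2:-1] is to get rid of the extra characters left over from encoding to UTF-8: in Python 3, str() of the encoded bytes looks like b'...', and the slice drops the leading b' and the trailing quote.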

For English-only tweets, you can also filter directly from the stream using languages=['en'].

I'm not familiar with the TextBlob library, but you can store the text in several ways; for example, just write your information to a file, and when you run TextBlob, read directly from that file. You can replace print(s[2:-1]) with the following, or add it alongside:

# Append the tweet text to text.csv. The original snippet assigned `myfile`
# but wrote to `myFile`, which raises NameError; one name is used throughout.
# The `with` block closes the file automatically, even if a write fails.
with open('text.csv', 'a') as myfile:
    myfile.write(s[2:-1])
    myfile.write('\n')  # adds a line between tweets

You can read it using file = open('text.csv', 'r') to do your sentiment analysis. Don't forget to add file.close() anytime you open a file.