This phase of the project contains the following tasks, which need to be done programmatically.
#Importing basic packages needed to get Data
import pandas as pd
import requests
import os
import tweepy
import json
1. Download the data manually and read it in to check
# Load the manually downloaded archive CSV into a DataFrame and preview
# its first five rows to confirm it was read correctly.
archive_csv = 'data/twitter-archive-enhanced.csv'
archive = pd.read_csv(archive_csv)
archive.head(n=5)
2. Programmatically download data from a URL
# URL of the image-predictions file (provided by Udacity).  It is defined
# first because the local file name below is derived from it.
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
folderName = 'data'
fName = url.split('/')[-1]  # last path segment: 'image-predictions.tsv'

# Create the target folder; exist_ok avoids failing when it already exists.
os.makedirs(folderName, exist_ok=True)

# Fetch the data.  Raise on an HTTP error status so that an error page is
# never written to disk in place of the real file.
r = requests.get(url)
r.raise_for_status()

# Write the raw response bytes to data/<fName>.
with open(os.path.join(folderName, fName), mode='wb') as file:
    file.write(r.content)

# Read the downloaded file back (tab-separated) to check that it worked,
# using the same computed path that was just written.
img = pd.read_csv(os.path.join(folderName, fName), sep='\t')
img.head()
3. Downloading Twitter API Data For the Required Values
# Extract the tweet IDs from the archive DataFrame; these are the tweets
# that will be requested from the Twitter API.
tweet_id = archive['tweet_id']

# Twitter auth data is read from environment variables so that secrets never
# live in the source file.  A missing variable raises KeyError naming it.
# NOTE(review): any credentials that were previously stored in this file
# should be treated as compromised and regenerated.
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

# Tweepy auth: build the OAuth handler and an API client that waits when the
# rate limit is reached.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# List used to capture the positions of tweets whose Twitter API request
# raised an error.
api_error = []
Writing a loop to download additional data from the Twitter API and save them as their respective text files to be used in the further steps.
Folder Name Being saved to : tweets/
File name: tweet_[ID OF TWEET].txt
# Request every tweet in tweet_id from the Twitter API and append its JSON to
# data/tweet_json.txt.  Each record is followed by ',' and a newline, so the
# file is a comma-separated sequence of JSON objects (one per line) rather
# than a single valid JSON document.
#
# Only the API call sits inside the try block: a problem with the output file
# (e.g. a missing data/ folder) now fails immediately instead of being
# recorded as an API error for every tweet.
fileName = 'data/tweet_json.txt'
counter = 0  # stays 0 if tweet_id is empty; otherwise the 1-based position
with open(fileName, 'a') as outfile:
    for counter, t in enumerate(tweet_id, start=1):
        try:
            tweet = api.get_status(id=t, tweet_mode='extended')
        except Exception as exc:
            # Deliberately broad: the tweepy exception class name differs
            # between library versions.  Report the reason, log the 1-based
            # position in api_error, and carry on with the next tweet.
            print(str(counter) + " ERROR ERROR ERROR: " + str(exc))
            api_error.append(counter)
            continue
        json.dump(tweet._json, outfile)
        outfile.write(',' + '\n')
        print(counter)

# Errors for ID's, They have been not added to the DataFrame.
# (Hard-coded positions noted from an earlier run; api_error holds the
# positions collected by the loop above.)
errors = [2056, 1993, 1945, 1865, 1836, 1616, 1310]
api_error
# Loading in the JSON file.
# The download loop writes one JSON object per line, each followed by a
# trailing comma, which json.loads() cannot parse as a whole.  Try the file as
# a single JSON document first (covers a file that was turned into a valid
# array by hand) and fall back to parsing it line by line.
with open('data/tweet_json.txt') as f:
    raw = f.read().strip()
try:
    data = json.loads(raw)
except json.JSONDecodeError:
    data = [
        json.loads(line.rstrip().rstrip(','))  # drop the ',' separator
        for line in raw.splitlines()
        if line.strip()
    ]
data[0]
#Using this Dumped Data (first Instance Only) to parse the JSON using the following tool : http://json.parser.online.fr/ \
#and understand the structure of the JSON.
# TEST CODE - used only to check the first downloaded tweet.
# Runs every extraction query against data[0] and prints the results.
first_tweet = data[0]
Tid = first_tweet['id_str']
full_text = first_tweet['full_text']
retweet_count = first_tweet['retweet_count']
fav_count = first_tweet['favorite_count']
# NOTE: this rebinds the module-level name `url` used by the download step.
url = first_tweet["extended_entities"]["media"][0]["url"]

# The rating is read around the first '/' in the text: the two characters
# before it are the numerator and the two after it are the denominator.
index = full_text.index('/')
numerator = int(full_text[index-2:index])
denominator = int(full_text[index+1:index+3])

# Pet name: last space-separated word of the text before the first '.'.
name = full_text.split('.')[0].split(" ")[-1]

# Dog stage: first matching keyword in the tweet text.  This checks
# full_text; `val` is not defined at this point in the file.
if 'doggo' in full_text:
    dog = 'doggo'
elif 'pupper' in full_text:
    dog = 'pupper'
elif 'puppo' in full_text:
    dog = 'puppo'
elif 'floofer' in full_text:
    dog = 'floofer'
else:
    dog = None

print(Tid)
print(full_text)
print(retweet_count)
print(fav_count)
print(url)
print(index)
print(numerator)
print(denominator)
print(name)

# Another check for parsing the JSON (it has quite a complicated schema).
data[0]["extended_entities"]["media"][0]["url"]
Here we are extracting data which will be used to solve various data quality issues.
These are the dog name, numerator, denominator and the type of dog (doggo, floofer, etc.).
# Extract the required fields from each tweet's JSON and collect them into a
# new DataFrame.
# NOTE: the resulting DataFrame is bound to the name `api`, replacing the
# tweepy API client that previously held that name.

# Dog-stage keywords in priority order; the first one found in the text wins.
stage_keywords = ('doggo', 'pupper', 'puppo', 'floofer')

df_list = []
for tweet in data:
    id_text = tweet['id_str']
    text = tweet['full_text']
    retweets = tweet['retweet_count']
    favorites = tweet['favorite_count']
    # Position of the first '/', taken to be the rating separator
    # (raises ValueError when the text has no '/').
    slash_at = text.index('/')
    stage = next((word for word in stage_keywords if word in text), None)
    record = {
        'tweet_id': int(id_text),
        'full_text': text,
        'retweet_count': int(retweets),
        'fav_count': int(favorites),
        # Two characters on either side of the slash, kept as strings.
        'numerator': text[slash_at - 2:slash_at],  # [Q#1]
        'denominator': text[slash_at + 1:slash_at + 3],  # [Q#3]
        # Last space-separated word before the first '.'.
        'pet_name': text.partition('.')[0].rpartition(' ')[2],  # [Q#2]
        'dog': stage,  # [T#1]
    }
    df_list.append(record)

api = pd.DataFrame(data=df_list)

# Checking our newly created DataFrame.
api.head(50)
# Put the columns in a more readable order: identifiers and text first, then
# engagement counts, then the fields parsed out of the text.
column_order = [
    'tweet_id', 'full_text', 'fav_count', 'retweet_count',
    'pet_name', 'dog', 'numerator', 'denominator',
]
api = api[column_order]
api.head()

# Persist the data extracted from the Twitter API as a CSV file.
api.to_csv('data/twitter_archive_api.csv')
# Sample test block for the extraction loop used above, run against the
# already-built `api` DataFrame instead of the raw JSON.
df_api = []
# Walk ids and texts together so each record gets the id of its own tweet
# (assigning api["tweet_id"] inside the loop stored the whole column in
# every row).
for Tid, val in zip(api['tweet_id'], api['full_text']):
    # Numerator and denominator: two characters on either side of the
    # first '/'.
    index = val.index('/')
    rating_numerator = val[index-2:index]
    rating_denominator = val[index+1:index+3]
    # Pet name: last space-separated word before the first '.'.
    name = val.split('.')[0].split(" ")[-1]
    # Dog stage: first matching keyword, or None when there is none.
    if 'doggo' in val:
        dog = 'doggo'
    elif 'pupper' in val:
        dog = 'pupper'
    elif 'puppo' in val:
        dog = 'puppo'
    elif 'floofer' in val:
        dog = 'floofer'
    else:
        dog = None
    df_api.append({
        'tweet_id': Tid,
        'name': name,
        'rating_numerator': rating_numerator,
        'rating_denominator': rating_denominator,
        'dog': dog,
    })
df_api_pd = pd.DataFrame(data=df_api)
df_api_pd.head(50)
# Sample code for working out the numerator, denominator and pet name from
# the current value of `val`, printed one per line.
index = val.index('/')
print(
    val[index - 2:index],                       # numerator
    val[index + 1:index + 3],                   # denominator
    val.partition('.')[0].rpartition(' ')[2],   # pet name
    sep='\n',
)
# Show the value of `val`, which holds full_text from the Twitter API data.
val
# Re-initialise val with a different data value (one without a dog name) and
# re-run the dog-stage check on it.
val = api["full_text"][12]
val
str(val)
if 'doggo' in val:
    dog = 'doggo'
elif 'pupper' in val:
    dog = 'pupper'
elif 'puppo' in val:
    dog = 'puppo'
elif 'floof' in val:
    # `'floofer' or 'floof' in val` was always truthy because the non-empty
    # literal 'floofer' is truthy on its own.  Testing for 'floof' alone
    # covers both spellings, since 'floofer' contains 'floof'.
    dog = 'floofer'
else:
    # Use None rather than the string "None", matching the other
    # stage-detection blocks in this file.
    dog = None