This phase of the project contains the following tasks, which need to be done programmatically.
#Importing basic packages needed to get Data
import pandas as pd
import requests
import os
import tweepy
import json
1. Download Data Manually and Read in to check
archive = pd.read_csv('data/twitter-archive-enhanced.csv')
archive.head()
2. Programmatically download data from a URL
# Here we have the URL provided by UDACITY
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
#fetching Data and saving to disk.
r = requests.get(url)
folderName = 'data'
fName = url.split('/')[-1]
#Creating a folder named 'data' if it doesn't exist
if not os.path.exists(folderName):
    os.makedirs(folderName)
#Writing data to file
with open(os.path.join(folderName, fName), mode='wb') as file:
    file.write(r.content)
#Reading in Downloaded Data to check if working.
img = pd.read_csv('data/image-predictions.tsv', sep='\t')
img.head()
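One hedged addition worth making here: checking the HTTP status before writing, so a failed download does not silently save an error page to disk. A minimal sketch using the same requests call:
#Sketch: raise_for_status() fails fast on a 4xx/5xx response before we write to disk
r = requests.get(url)
r.raise_for_status()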
3. Downloading Twitter API Data For the Required Values
#Extracting tweet IDs from the archive DataFrame.
tweet_id = archive['tweet_id']
#Twitter auth data (credentials removed before submission; fill in your own keys)
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
#Tweepy auth
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
#List created to capture the loop indices of errors raised while using the Twitter API
api_error = []
Writing a loop to download additional data from the Twitter API and append each tweet's JSON to a single text file to be used in the further steps.
Folder being saved to: data/
File name: tweet_json.txt
#Using a try/except block here to access the Twitter API; errors are logged in the api_error variable for later use.
counter = 0
fileName = 'data/tweet_json.txt'
for t in tweet_id:
    try:
        counter = counter + 1
        tweet = api.get_status(id=t, tweet_mode='extended')
        with open(fileName, 'a') as outfile:
            #One JSON object per line so the file can be parsed line by line later
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        print(str(counter) + " Success")
    except Exception:
        print(str(counter) + " ERROR ERROR ERROR")
        api_error.append(counter)
#Loop indices that errored; these tweets have not been added to the data file.
errors = [2056,1993,1945,1865,1836,1616,1310]
api_error
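If needed, the logged counters can be mapped back to the failing tweet IDs. A small sketch, assuming the 1-based counter semantics of the loop above (failed_ids is a hypothetical name):
#Sketch: translate the 1-based loop counters into the tweet IDs that failed
failed_ids = [tweet_id.iloc[i - 1] for i in api_error]
failed_ids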
#Loading in the JSON file (one tweet object per line)
with open('data/tweet_json.txt') as f:
    data = [json.loads(line) for line in f]
data[0]
#Using this dumped data (first instance only) to parse the JSON with the following tool: http://json.parser.online.fr/
#and understand the structure of the JSON.
#TEST CODE - used to check only the first record
#Checking queries on the first data value and printing them to the console.
Tid = data[0]['id_str']
full_text = data[0]['full_text']
retweet_count = data[0]['retweet_count']
fav_count = data[0]['favorite_count']
url = data[0]["extended_entities"]["media"][0]["url"]
index = data[0]['full_text'].index('/')
numerator = int(data[0]['full_text'][index-2:index])
denominator = int(data[0]['full_text'][index+1:index+3])
name = (data[0]['full_text'].split('.')[0].split(" ")[-1])
val = data[0]['full_text']
if 'doggo' in val:
    dog = 'doggo'
elif 'pupper' in val:
    dog = 'pupper'
elif 'puppo' in val:
    dog = 'puppo'
elif 'floofer' in val:
    dog = 'floofer'
else:
    dog = None
print(Tid)
print(full_text)
print(retweet_count)
print(fav_count)
print(url)
print(index)
print(numerator)
print(denominator)
print(name)
#Another check for parsing the JSON (it has quite a complicated schema).
data[0]["extended_entities"]["media"][0]["url"]
Here we extract the data that will be used to solve various data quality issues.
These are dog name, numerator, denominator and the dog stage (doggo, floofer, etc.)
#Extracting required fields from the JSON and making a new DataFrame
df_list = []
for val in data:
    Tid = val['id_str']
    full_text = val['full_text']
    retweet_count = val['retweet_count']
    fav_count = val['favorite_count']
    index = val['full_text'].index('/')
    numerator = full_text[index-2:index]
    denominator = full_text[index+1:index+3]
    name = full_text.split('.')[0].split(" ")[-1]
    if 'doggo' in full_text:
        dog = 'doggo'
    elif 'pupper' in full_text:
        dog = 'pupper'
    elif 'puppo' in full_text:
        dog = 'puppo'
    elif 'floofer' in full_text:
        dog = 'floofer'
    else:
        dog = None
    df_list.append({'tweet_id': int(Tid),
                    'full_text': full_text,
                    'retweet_count': int(retweet_count),
                    'fav_count': int(fav_count),
                    'numerator': numerator,     #[Q#1]
                    'denominator': denominator, #[Q#3]
                    'pet_name': name,           #[Q#2]
                    'dog': dog                  #[T#1]
                    })
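Note that the index-based slicing above breaks on ratings with decimal numerators such as 13.5/10, which is why manual cleaning is needed later. A more robust sketch using a regular expression (shown only for comparison, not the approach used in this notebook):
import re
#Sketch: extract "N/M" ratings, tolerating decimals in the numerator
RATING_RE = re.compile(r'(\d+(?:\.\d+)?)\s*/\s*(\d+)')
def extract_rating(text):
    m = RATING_RE.search(text)
    return (float(m.group(1)), int(m.group(2))) if m else (None, None)
extract_rating("13.5/10 would pet")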
#Note: this reuses the name `api`, shadowing the tweepy handle defined above
api = pd.DataFrame(data=df_list)
#Checking our newly created DataFrame.
api.head()
#Rearranging the DF columns to make more sense when read.
api = api[['tweet_id', 'full_text', 'fav_count','retweet_count', 'pet_name', 'dog', 'numerator', 'denominator']]
api.head()
#Saving Twitter Data extracted from API as CSV
#api.to_csv('data/twitter_archive_api.csv')
#Sample test block for the loop used above.
df_api = []
for Tid, val in zip(api['tweet_id'], api['full_text']):
    #Code for finding numerator and denominator
    index = val.index('/')
    rating_numerator = val[index-2:index]
    rating_denominator = val[index+1:index+3]
    name = val.split('.')[0].split(" ")[-1]
    if 'doggo' in val:
        dog = 'doggo'
    elif 'pupper' in val:
        dog = 'pupper'
    elif 'puppo' in val:
        dog = 'puppo'
    elif 'floofer' in val:
        dog = 'floofer'
    else:
        dog = None
    df_api.append({
        'tweet_id': Tid,
        'name': name,
        'rating_numerator': rating_numerator,
        'rating_denominator': rating_denominator,
        'dog': dog
    })
df_api_pd = pd.DataFrame(data=df_api)
df_api_pd.head()
#Sample code used for Calculating Numerator, Denominator and Pet Name
index = val.index('/')
print(val[index-2:index])
print(val[index+1:index+3])
print(val.split('.')[0].split(" ")[-1])
#Checking the value of variable val which contains full_text from the twitter api
val
#Re-initializing val with a different data value (without a dog name)
val = api["full_text"][12]
val
str(val)
if 'doggo' in val:
    dog = 'doggo'
elif 'pupper' in val:
    dog = 'pupper'
elif 'puppo' in val:
    dog = 'puppo'
elif 'floofer' in val or 'floof' in val:
    dog = 'floofer'
else:
    dog = None
#Importing basic packages needed to get Data
import pandas as pd
import requests
import os
import tweepy
import json
# Assigning agreed upon variable names
archive = pd.read_csv('data/twitter-archive-enhanced.csv')
img = pd.read_csv('data/image-predictions.tsv', sep='\t')
api = pd.read_csv('data/twitter_archive_api.csv')
archive.head()
val = archive.text[12]
val.split('.')[0].split(" ")[-1]
# print(val.split('/')[1])
val
index = val.index('/')
print(val[index-2:index])
print(val[index+1:index+3])
#Code for finding numerator and denominator
index = val.index('/')
rating_numerator = val[index-2:index]
rating_denominator = val[index+1:index+3]
# index = val.index('This is')
# index
list(archive.columns.values)
archive.count()
img.head(20)
img.sample(50)
list(img.columns.values)
api.head(30)
#Listing all columns before dropping
list(api.columns.values)
#Dropping the first column, which contains just row numbers
api.drop(columns=['Unnamed: 0'], inplace=True)
#Listing all columns after dropping
list(api.columns.values)
api.head(20)
Programmatic Assessment
archive.info()
archive.describe()
api.info()
img.info()
api.head()
# Checking for duplicate column names across the data sets
all_columns = pd.Series(list(archive) + list(api) + list(img))
all_columns[all_columns.duplicated()]
Detect and document at least eight (8) quality issues and two (2) tidiness issues in your wrangle_act.ipynb Jupyter Notebook.
archive.duplicated(['tweet_id']).sum()
img.duplicated(['tweet_id']).sum()
#Importing Basic Packages
import pandas as pd
import numpy as np
import math
#Assigning agreed upon variable names (Original Data)
img = pd.read_csv('data/image-predictions.tsv', sep='\t')
api = pd.read_csv('data/twitter_archive_api.csv')
archive = pd.read_csv('data/twitter-archive-enhanced.csv')
#Creating Backups and Working on the *_clean Data.
img_clean = img.copy()
api_clean = api.copy()
archive_clean = archive.copy()
archive
Quality Issues
1. archive: The numerator needs to be recalculated as told to us; it can be taken from the api table's column (which needs to be cleaned).
2. archive: Dog names are incorrect, need to re-extract.
3. archive: Dog ratings are incorrect, need to re-extract.
4. archive: Remove retweeted data.
5. img: The columns p1, p2, p3 have underscores separating their names. We should add white spaces.
6. img: If p1_dog, p2_dog and p3_dog are all False, the tweet is invalid (no dog recognized by the neural network) and must be removed.
7. api: Remove stray count column (Unnamed: 0).

Tidiness Issues
1. archive and api: Combine doggo, pupper, etc. into one column.
2. api: Merge into the archive dataset.
archive: The numerator column needs to be recalculated as mentioned
archive: Dog names are incorrect, need to re-extract
archive: Dog ratings are incorrect, need to re-extract
(The Define / Code / Test steps for these three issues appear further below: they are resolved by merging in the re-extracted api columns and dropping the originals.)
archive: Remove retweeted data
Define
Looking at the column names, 'retweeted_status_id' stands out as de facto proof that a tweet is a retweet. We therefore keep only the rows where the retweeted_status_id column is NaN.
Code
archive_clean = archive_clean[archive_clean.retweeted_status_id.isnull()].copy()
archive_clean.head()
Test
archive_clean[archive_clean.retweeted_status_id.notnull()]
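A stricter programmatic check, as a sketch:
#Sketch: assert that no retweets survived the filter
assert archive_clean.retweeted_status_id.isnull().all()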
img: The columns p1, p2, p3 have underscores separating their names
Define
img.head(1)
Code
#Replacing underscores with spaces using str.replace()
img_clean.p1 = img_clean.p1.str.replace('_',' ')
img_clean.p2 = img_clean.p2.str.replace('_',' ')
img_clean.p3 = img_clean.p3.str.replace('_',' ')
Test
#Checking if the above solution worked.
img_clean.head()
p1, p2 & p3 Columns have whiteSpaces. - Success
img: If p1_dog, p2_dog and p3_dog are all False, the row must be removed
Define
If p1_dog, p2_dog & p3_dog are all False, no dog was recognized by the neural network, so the row cannot be used and must be removed.
Code
#looking at the original DF
img.head()
#First attempt, kept for reference; the logic is incorrect (it would drop rows where all three predictions are True):
#img_clean = img_clean[~(img_clean.p1_dog & img_clean.p2_dog & img_clean.p3_dog)]
#Row counts before filtering
img.count()
#Keeping only rows where at least one prediction identified a dog
img_clean = img_clean[~((img_clean.p1_dog == False) & (img_clean.p2_dog == False) & (img_clean.p3_dog == False))]
img_clean.count()
Test
img_clean[(img_clean.p1_dog == False) & (img_clean.p2_dog == False) & (img_clean.p3_dog == False)]
api: Remove stray count column 'Unnamed: 0'
Define
When the API CSV was imported, a stray index column appeared; we must drop it, as it will cause issues later while merging.
api.head(0)
Code
#Dropping Stray Column
api_clean.drop(columns=['Unnamed: 0'], inplace=True)
Test
api_clean.head()
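For reference, the stray column can also be avoided at write time rather than dropped afterwards. A sketch of the alternative (left commented out like the other save calls):
#Sketch: saving with index=False prevents the 'Unnamed: 0' column on re-import
#api.to_csv('data/twitter_archive_api.csv', index=False)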
api: Merge into the archive dataset
Define
As per the final project specification, we need to merge these two tables to eliminate redundant data. These tables are archive_clean and api_clean.
Code
#Merge the two tables (get rid of `api` by merging it into the `archive` dataset)
#Saving new DF as var name 'final'
final = pd.merge(archive_clean,api_clean,how='right',on='tweet_id')
Test
print(final.columns)
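Two quick sanity checks on the merge, as a sketch: with how='right', every api_clean row is retained and archive-only tweets drop out (this assumes tweet_id is unique in both tables, which the duplicate checks above support).
#Sketch: a right merge keeps exactly the api_clean rows
assert len(final) == len(api_clean)
assert final.tweet_id.isin(api_clean.tweet_id).all()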
archive and api: Combine doggo, pupper, etc. into one column
Define
Since the dog stage was already extracted into the single dog column in the api table, we can replace the columns floofer, pupper, puppo and doggo with that single dog column.
Code
#Solving Tidiness Issue #1
#1. `archive` and `api`: Combine doggo, pupper etc into one column
final.drop(columns=['floofer','pupper','puppo','doggo'], inplace=True)
Test
final.head()
Define
The name [Q#2], rating_numerator and rating_denominator [Q#1] [Q#3], and text columns were re-extracted from the API data as pet_name, numerator, denominator and full_text, so the original columns can be dropped.
Code
final.drop(columns=['name','rating_numerator','rating_denominator','text'], inplace=True)
#[Q#2] Solved.
#[Q#1], [Q#3] Solved.
Cleaning the numerator column
Define
The extraction code produced some errors; we shall clean these manually.
#We need to check which values should not be there
final.numerator.value_counts()
Code
#Making a list of which rows have errors, using value_counts() for reference.
bad_numerators = ['.9', '.5', '\r\n9', 'ry', '.8', ';2', '(8', 'st', '\r\n5', '-5', ' w']
numerator_error = [final[final.numerator == v]['tweet_id'] for v in bad_numerators]
numerator_error
Here we start to manually clean the data.
Steps: for each bad value, look up the offending rows, inspect their full_text, then overwrite the numerator with the correct value.
final[final.numerator == '.9'].full_text
final.loc[[847,1192,1430],'numerator'] = 9
final[final.numerator == '.5']
final.loc[42].full_text
final.loc[[42],'numerator'] = 13.5
final.loc[1509].full_text
final.loc[[1509],'numerator'] = 9.5
final[final.numerator == '\r\n9']
final.loc[1492].full_text
final.loc[[1492,2082],'numerator'] = 9
final[final.numerator == 'ry']
final.loc[2329].full_text
final.loc[[2329],'numerator'] = 11
final[final.numerator == '.8']
final.loc[1255].full_text
final.loc[[1255],'numerator'] = 8
final[final.numerator == '.9']
final[final.numerator == ';2']
final.loc[2066].full_text
final.loc[[2066],'numerator'] = 2
final[final.numerator == '(8']
final.loc[1473].full_text
final.loc[[1473],'numerator'] = 8
final[final.numerator == 'st']
final.loc[1635].full_text
final.loc[[1635],'numerator'] = 12
final[final.numerator == '\r\n5']
final.loc[1912].full_text
final.loc[[1912],'numerator'] = 5
final[final.numerator == '-5']
final[final.numerator == ' w']
final.loc[1069].full_text
final.loc[[1069],'numerator'] = 3
Test
print(final.numerator.value_counts())
Cleaning the denominator column as well
Define
The extraction code produced some errors here too; we need to eliminate the string values manually.
Code
final.denominator.value_counts()
final[final.denominator == 'pe']
final.loc[2329].full_text
final.loc[[2329],'denominator'] = 10
final[final.denominator == 'ou']
final.loc[1069].full_text
final.loc[[1069],'denominator'] = 10
final[final.denominator == 'sw']
final.loc[1635].full_text
final.loc[[1635],'denominator'] = 10
Test
final.denominator.value_counts()
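With the stray strings fixed, both rating columns can now be cast to numeric dtypes. A hedged sketch (the notebook otherwise relies on the CSV round-trip to coerce the types):
#Sketch: convert the cleaned rating columns from object to numeric
final['numerator'] = pd.to_numeric(final['numerator'])
final['denominator'] = pd.to_numeric(final['denominator'])
final[['numerator', 'denominator']].dtypes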
Test
final.columns
RUN LAST AND UNCOMMENT
#Save Files to CSV
#final.to_csv('data/final/twitter_archive_master.csv')
#img_clean.to_csv('data/final/image_predictions.csv')
#print("Saved Successfully")
#Import Statements
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
#Importing final data found in the data/final folder
archive = pd.read_csv('data/final/twitter_archive_master.csv',index_col=0)
img = pd.read_csv('data/final/image_predictions.csv', index_col=0)
print("Import Successful")
Analyzing fav_count from archive
fav_mean = archive.fav_count.mean()
fav_median = archive.fav_count.median()
fav_max = archive.fav_count.max()
fav_sum = archive.fav_count.sum()
archive.fav_count.count()
print("Mean Favourite Value is : {}".format(fav_mean))
print("Median Favourite Value is : {}".format(fav_median))
print("Max Favourite Value for an tweet is : {}".format(fav_max))
print("Total Favourite secured for All Tweets : {}".format(fav_sum))
fav_plt = archive.fav_count.hist(alpha=0.8,figsize=(8,8))
plt.xlabel("Fav Counts");
plt.ylabel("Count of Tweets");
plt.title("Favorited Tweets");
plt.savefig('Docs/Viz/1.png');
Analyzing retweet_count from archive
#Sorting retweet counts in descending order for a quick look
retweets_sorted = np.sort(archive.retweet_count)[::-1]
retweets_sorted
retweet_mean = archive.retweet_count.mean()
retweet_median = archive.retweet_count.median()
retweet_max = archive.retweet_count.max()
retweet_sum = archive.retweet_count.sum()
archive.retweet_count.hist(alpha=0.8,figsize=(8,8),color = "green")
plt.xlabel("Retweet Counts");
plt.ylabel("Count of Tweets");
plt.title("Re-Tweeted Tweets");
plt.savefig('Docs/Viz/2.png');
print("Mean Retweets Value is : {}".format(retweet_mean))
print("Median Retweets Value is : {}".format(retweet_median))
print("Max Retweets Value for an tweet is : {}".format(retweet_max))
print("Total Retweets secured for All Tweets : {}".format(retweet_sum))
Analyzing dog stages from archive
pie = archive.dog.value_counts()
pie.plot(kind="pie");
plt.savefig('Docs/Viz/3.png');
dog_val = archive.dog.value_counts()
name_sum = dog_val.sum()
pupper_per = (dog_val['pupper'] / name_sum) * 100
doggo_per = (dog_val['doggo'] / name_sum) * 100
puppo_per = (dog_val['puppo'] / name_sum) * 100
floofer_per = (dog_val['floofer'] / name_sum) * 100
print("The Percentile Value of Pupper to all dogs is {}%".format(pupper_per))
print("The Percentile Value of Doggo to all dogs is {}%".format(doggo_per))
print("The Percentile Value of Puppo to all dogs is {}%".format(puppo_per))
print("The Percentile Value of Floofer to all dogs is {}%".format(floofer_per))
dog_val.sum()
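The same shares can be computed directly with value_counts(normalize=True); a sketch:
#Sketch: normalize=True yields the same percentages in one call
print((archive.dog.value_counts(normalize=True) * 100).round(2))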
Analyzing numerators & denominators from archive
num = archive.numerator
dom = archive.denominator
num_mean = num.mean()
num_median = num.median()
num_max = archive.numerator.max()
dom_mean = dom.mean()
dom_median = dom.median()
print("The mean value of all the numerator of the ratings given is : {}".format(num_mean))
print("The median value of all the numerator of the ratings given is : {}".format(num_median))
print("The mean value of all the denominators of the ratings given is : {}".format(dom_mean))
print("The median value of all the denominators of the ratings given is : {}".format(dom_median))
print("".format())
num_plt = num.plot(figsize=(10,10), kind='hist', color="#ff9960");
plt.ylabel("Tweets")
plt.xlabel("Numerator Values")
plt.title("Numerator Histogram");
# num_plt.axes.get_yaxis().set_visible(False)
plt.savefig('Docs/Viz/4.png');
print("The Maximum Rating Numerator given is {}".format(num_max))
time_dt = pd.to_datetime(archive.timestamp).dt.date
time_hr = pd.to_datetime(archive.timestamp).dt.hour
time_yr = pd.to_datetime(archive.timestamp).dt.year
time_hr.value_counts()
time_hr.plot(figsize=(8,6), kind='hist');
plt.title("Hourly Posting Graph");
plt.savefig('Docs/Viz/5.png');
time_dt = time_dt.value_counts()
time_dt.plot(figsize=(15,10));
plt.title("Daily Posting Graph");
plt.savefig('Docs/Viz/6.png');
time_dt.value_counts()
time_yr.value_counts().plot(kind='bar');
plt.title("Yearly Graph Figure");
plt.savefig('Docs/Viz/7.png');
Analyzing the img DataSet
#Viewing the dataset
img.head()
Images per tweet in img
#Calculating the mean and median number of images uploaded per post
img_uploaded_mean = img.img_num.mean()
img_uploaded_median = img.img_num.median()
print("The Average amount of pictures uploaded per tweet is : {}".format(img_uploaded_mean))
print("The Median amount of the uploaded photos is : {}".format(img_uploaded_median))
Neural Network Analysis: p1_conf, p2_conf & p3_conf
# Calculating the mean values of the prediction confidence variables
Cp1_mean = img.p1_conf.mean()
Cp2_mean = img.p2_conf.mean()
Cp3_mean = img.p3_conf.mean()
print("Calculating the Efficiency of the Neural Network on the diffrent stages p1,p2,p3")
print("The Average Efficiency of Stage one P1 {} ".format(Cp1_mean))
print("The Average Efficiency of Stage one P2 {} ".format(Cp2_mean))
print("The Average Efficiency of Stage one P3 {} ".format(Cp3_mean))
#Using value_counts to get True/False counts for each prediction stage
prediction_p1 = img.p1_dog.value_counts()
prediction_p2 = img.p2_dog.value_counts()
prediction_p3 = img.p3_dog.value_counts()
#Finding the percentage of dog predictions for each stage
p1_per = prediction_p1[1]/ (prediction_p1[0]+prediction_p1[1])*100
p2_per = prediction_p2[1]/ (prediction_p2[0]+prediction_p2[1])*100
p3_per = prediction_p3[1]/ (prediction_p3[0]+prediction_p3[1])*100
#Printing the percentages found above.
print("P1 stage success hit rate is {} %".format(p1_per))
print("P2 stage success hit rate is {} %".format(p2_per))
print("P3 stage success hit rate is {} %".format(p3_per))
#Analyzing which dogs are the most popular through the different stages of the neural network.
d_p1 = img.p1.value_counts()
d_p2 = img.p2.value_counts()
d_p3 = img.p3.value_counts()
#Dumping the data to read and analyze
print("Finding out the most popular Dogs for Each Stage \n")
#print("P1")
print("The Top Popular Dogs for Stage P1 Are :\n{} \n".format(d_p1.head()))
#print("P2")
print("The Top Popular Dogs for Stage P2 Are :\n{} \n".format(d_p2.head()))
#print("P3")
print("The Top Popular Dogs for Stage P3 Are :\n{} \n".format(d_p3.head()))
# Merging all the data into one Series so I can get a better picture for joint analysis
all_dogs = pd.concat([d_p1, d_p2, d_p3])
d_all = all_dogs.groupby(all_dogs.index).aggregate(sum)
d_all = d_all.sort_values(ascending=False)
print("The Top 10 Dogs overall through all the stages in our DataSet are \n\n{}\n".format(d_all.head(10)))
Thank You for Reading :)