#Importing Basic Packages
import pandas as pd
import numpy as np
import math
#Assigning agreed upon variable names (Original Data)
# img: neural-network image predictions (tab-separated file),
# api: extra tweet data gathered via the Twitter API,
# archive: the enhanced WeRateDogs tweet archive.
img = pd.read_csv('data/image-predictions.tsv', sep='\t')
api = pd.read_csv('data/twitter_archive_api.csv')
archive = pd.read_csv('data/twitter-archive-enhanced.csv')
#Creating Backups and Working on the *_clean Data.
# The raw frames stay untouched; all cleaning happens on the *_clean copies.
img_clean = img.copy()
api_clean = api.copy()
archive_clean = archive.copy()
# Display the archive frame for an initial visual assessment.
archive
Quality issues:
- archive: the numerator needs to be recalculated as instructed — it can be taken from the api data (which needs to be cleaned).
- archive: dog names are incorrect; need to re-extract.
- archive: dog ratings are incorrect; need to re-extract.
- archive: remove retweeted data.
- img: the columns p1, p2, p3 have underscores separating words in their names; we should replace them with spaces.
- img: if p1_dog, p2_dog and p3_dog are all False, the tweet is invalid (it cannot be processed by the neural network) and must be removed.
- api: remove the stray count column ('Unnamed: 0').

Tidiness issues:
- archive and api: combine doggo, pupper, etc. into one column.
- api: merge into the archive dataset.
archive — The numerator column needs to be recalculated as mentioned.
(Define / Code / Test: handled in the cleaning sections below.)
archive — Dog names are incorrect; need to re-extract.
(Define / Code / Test: handled in the cleaning sections below.)
archive — Dog ratings are incorrect; need to re-extract.
(Define / Code / Test: handled in the cleaning sections below.)
archive — Remove retweeted data.
Define: looking at the column names, 'retweeted_status_id' stands out as the de facto proof that a tweet is a retweet. We therefore keep only the rows where the 'retweeted_status_id' column is null (NaN).
Code:
# Keep only original tweets: a non-null retweeted_status_id marks a retweet,
# so retain the rows where it is null.
# Fix: filter the *_clean copy (the original filtered the raw `archive`,
# silently discarding the backup/working-copy split) and take an explicit
# .copy() so later assignments do not trigger SettingWithCopyWarning on a
# slice view.
archive_clean = archive_clean[archive_clean.retweeted_status_id.isnull()].copy()
archive_clean.head()
Test
# Test: should return an empty frame — no rows with a retweet id may remain.
archive_clean[archive_clean.retweeted_status_id.notnull()]
img — The columns p1, p2, p3 have underscores separating words in their names.
Define:
# Peek at one row to see the current p1/p2/p3 values (underscore-separated).
img.head(1)
Code
# Replace the underscores in the p1/p2/p3 prediction-name columns with spaces.
for prediction_col in ('p1', 'p2', 'p3'):
    img_clean[prediction_col] = img_clean[prediction_col].str.replace('_', ' ')
Test
#Checking if the above solution worked.
# Test: p1/p2/p3 should now show spaces instead of underscores.
img_clean.head()
Test result: the p1, p2 & p3 columns now contain whitespace instead of underscores — success.

img — Remove rows where p1_dog, p2_dog and p3_dog are all False.
Define: if p1_dog, p2_dog and p3_dog are all FALSE, the image is not a dog according to the neural network and cannot be used, so the row must be removed.
Code:
#looking at the original DF
img.head()
# Row count before filtering, for comparison with the cleaned frame below.
img.count()
# Keep only the rows where at least one of the three predictions is a dog;
# when p1_dog, p2_dog and p3_dog are all False the image is not a dog and the
# row is dropped. Uses the boolean columns directly instead of the old
# `== False` chain (equivalent for boolean dtype, and idiomatic pandas);
# the commented-out earlier attempt was removed as dead code.
img_clean = img_clean[img_clean.p1_dog | img_clean.p2_dog | img_clean.p3_dog]
img_clean.count()
Test
# Test: no rows should remain where all three dog flags are False
# (De Morgan form of the original `== False` conjunction).
img_clean[~(img_clean.p1_dog | img_clean.p2_dog | img_clean.p3_dog)]
api — Remove the stray count column ('Unnamed: 0').
Define: when the API CSV was imported there was a stray index column; we must drop it, as it will cause issues later while merging.
# head(0) shows just the column names — confirms the stray column exists.
api.head(0)
Code
#Dropping Stray Column
# 'Unnamed: 0' is the leftover index written when the API CSV was saved.
# errors='ignore' makes this cell safe to re-run once the column is gone
# (the original raised KeyError on a second execution).
api_clean.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')
Test
# Test: 'Unnamed: 0' should no longer appear among the columns.
api_clean.head()
api — Merge into the archive dataset.
Define: per the final project specification we need to merge these two tables to eliminate redundant data. The tables are joined on tweet_id.
Code:
#Merge into two Tables (get rid of `api` merge into `archive` dataset)
#Saving new DF as var name 'final'
# Right join on tweet_id: every row that has API data is kept.
final = archive_clean.merge(api_clean, how='right', on='tweet_id')
Test
# Test: list the merged frame's columns.
print(final.columns)
archive and api — Combine doggo, pupper, etc. into one column.
Define: since the dog stage is already captured by the single `dog` column in the api table, we can replace the columns floofer, pupper, puppo and doggo with that one column.
Code:
# Tidiness issue #1 — archive and api: combine doggo, pupper etc. into one
# column. Per the notes above, the api table's single dog-stage column makes
# these four separate stage columns redundant, so they are dropped.
stage_columns = ['floofer', 'pupper', 'puppo', 'doggo']
final.drop(columns=stage_columns, inplace=True)
Test
# Test: the four stage columns should be gone.
final.head()
Define: drop the archive-side columns superseded by the api data — name [Q#2], rating_numerator and rating_denominator [Q#1] [Q#3], and text.
Code:
# Drop the archive-side columns that the api data supersedes (the ratings
# and names will be re-extracted from the api text — see [Q#1]-[Q#3]).
redundant_columns = ['name', 'rating_numerator', 'rating_denominator', 'text']
final.drop(columns=redundant_columns, inplace=True)
#[Q#2] Solved.
#[Q#1], [Q#3] Solved,
Recalculate the numerator column.
Define: the extraction code produced some errors, so we shall clean these manually.
#We Need to check what values should be not there
# value_counts surfaces the malformed, non-numeric strings that the
# extraction left in `numerator` (e.g. '.9', 'ry', '\r\n9').
final.numerator.value_counts()
Code
#Making a list of which rows have errors - using value_counts() for reference.
# Every malformed value observed in numerator.value_counts(). The original
# code appended '.9' twice, duplicating those tweet_ids in the result;
# listing each value once fixes that and removes the repetitive appends.
bad_numerators = ['.9', '.5', '\r\n9', 'ry', '.8', ';2',
                  '(8', 'st', '\r\n5', '-5', ' w']
numerator_error = [final[final.numerator == value]['tweet_id']
                   for value in bad_numerators]
numerator_error
Here we start to manually clean the data.
Steps: for each malformed value, inspect the affected rows' full_text and assign the rating actually stated in the tweet.
# Manual cleaning: for each malformed numerator value, look up the affected
# rows, read the tweet's full_text, then write back the correct rating.
# The row labels (847, 42, ...) are the frame's current index values.
final[final.numerator == '.9'].full_text
final.loc[[847,1192,1430],'numerator'] = 9
# '.5' rows carry genuine decimal ratings, so floats are written back.
# NOTE(review): this makes the numerator column mixed int/float/str (object
# dtype) until a final conversion — confirm downstream code handles that.
final[final.numerator == '.5']
final.loc[42].full_text
final.loc[[42],'numerator'] = 13.5
final.loc[1509].full_text
final.loc[[1509],'numerator'] = 9.5
final[final.numerator == '\r\n9']
final.loc[1492].full_text
final.loc[[1492,2082],'numerator'] = 9
final[final.numerator == 'ry']
final.loc[2329].full_text
final.loc[[2329],'numerator'] = 11
final[final.numerator == '.8']
final.loc[1255].full_text
final.loc[[1255],'numerator'] = 8
# Re-check '.9' — should be empty after the fix above.
final[final.numerator == '.9']
final[final.numerator == ';2']
final.loc[2066].full_text
final.loc[[2066],'numerator'] = 2
final[final.numerator == '(8']
final.loc[1473].full_text
final.loc[[1473],'numerator'] = 8
final[final.numerator == 'st']
final.loc[1635].full_text
final.loc[[1635],'numerator'] = 12
final[final.numerator == '\r\n5']
final.loc[1912].full_text
final.loc[[1912],'numerator'] = 5
# NOTE(review): '-5' is only inspected, never reassigned — presumably the
# query returned no rows at this point; confirm before relying on it.
final[final.numerator == '-5']
final[final.numerator == ' w']
final.loc[1069].full_text
final.loc[[1069],'numerator'] = 3
Test
# Test: re-check the distribution; the malformed values above should be gone.
print(final.numerator.value_counts())
Clean the denominator column as well.
Define: the extraction code produced some errors — we need to eliminate the string values manually.
Code:
# Inspect the denominator distribution to find the non-numeric leftovers.
final.denominator.value_counts()
# 'pe', 'ou' and 'sw' appear to be text fragments mis-extracted as
# denominators; each affected tweet's full_text is inspected and the value
# reset to the standard /10.
final[final.denominator == 'pe']
final.loc[2329].full_text
final.loc[[2329],'denominator'] = 10
final[final.denominator == 'ou']
final.loc[1069].full_text
final.loc[[1069],'denominator'] = 10
final[final.denominator == 'sw']
final.loc[1635].full_text
final.loc[[1635],'denominator'] = 10
Test
# Test: only numeric denominators should remain.
final.denominator.value_counts()
Test
# Final sanity check on the merged, cleaned frame's columns.
final.columns
RUN LAST AND UNCOMMENT
#Save Files to CSV
# Deliberately commented out — uncomment and run last, after all cleaning.
# NOTE(review): consider passing index=False, otherwise to_csv writes a
# stray index column (the same 'Unnamed: 0' problem cleaned out of the api
# CSV above) — confirm before saving.
#final.to_csv('data/final/twitter_archive_master.csv')
#img_clean.to_csv('data/final/image_predictions.csv')
#print("Saved Successfully")