Twitter Data Cleaning using Python

Photo by MORAN on Unsplash
import pandas as pd
import html
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# If the NLTK resources are not installed yet, download them once:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
pd.set_option('display.max_colwidth', None)
data = pd.read_csv('your_sample.csv')
data.head()
new_data = data.drop_duplicates('Tweet Content', keep='first')  # drop duplicate tweets, keep the first occurrence, and store the result in a new variable
new_data.head()
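drop_duplicates with a single column name as the subset keeps only the first row for each distinct value in that column. A tiny, made-up illustration:

import pandas as pd

demo = pd.DataFrame({'Tweet Content': ['good day', 'good day', 'not bad']})
print(demo.drop_duplicates('Tweet Content', keep='first'))
# the repeated 'good day' row is dropped; only rows 0 and 2 remain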
new_data.to_csv(r'your_new_sample.csv', index=False)
new_sample = pd.read_csv('your_new_sample.csv')
new_sample.head()
tweets = new_sample['Tweet Content']
tweets.head()
for i in range(len(tweets)):
    x = tweets[i].replace("\n", " ")  # cleaning newline "\n" from the tweets
    tweets[i] = html.unescape(x)      # decode HTML entities such as &amp; back into plain characters
tweets.head()
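Tweets pulled from the API or from exports often carry HTML-escaped entities such as &amp; or &gt;. html.unescape from the standard library turns them back into the characters they stand for; a quick, made-up example:

import html

raw = "Fish &amp; chips &gt; sushi\nfor lunch"
print(html.unescape(raw.replace("\n", " ")))  # -> Fish & chips > sushi for lunch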
for i in range(len(tweets)):
    # remove @mentions, punctuation, the # symbol, and URLs
    tweets[i] = re.sub(r"(@[A-Za-z0-9_]+)|[^\w\s]|#|http\S+", "", tweets[i])
tweets.head()
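This single pattern handles four kinds of noise at once: @mentions, characters that are neither word characters nor whitespace (punctuation and emoji), the # symbol, and URLs. Run on a made-up tweet it behaves roughly like this:

import re

sample = "@user Loving the new release! #python https://example.com :)"
print(re.sub(r"(@[A-Za-z0-9_]+)|[^\w\s]|#|http\S+", "", sample))
# -> ' Loving the new release python  '  (the leftover spaces are harmless once the text is tokenized)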
tweets_to_token = tweets
sw = stopwords.words('english')  # you can adjust the language as you desire
sw.remove('not')  # exclude 'not' from the stop-word corpus, since removing it from the text would change the context of the text
for i in range(len(tweets_to_token)):
    tweets_to_token[i] = word_tokenize(tweets_to_token[i])
for i in range(len(tweets_to_token)):
    tweets_to_token[i] = [word for word in tweets_to_token[i] if word not in sw]
tweets_to_token
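For reuse, the same steps can be folded into a single helper and applied to the whole column with .apply. This is only a sketch of one way to package the walkthrough above; the name clean_tweet and the set-based stop-word lookup are my own choices, not part of the original code:

import html
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

SW = set(stopwords.words('english')) - {'not'}  # keep 'not', as above

def clean_tweet(text):
    """Turn one raw tweet into a list of cleaned tokens."""
    text = html.unescape(text.replace("\n", " "))                    # newlines and HTML entities
    text = re.sub(r"(@[A-Za-z0-9_]+)|[^\w\s]|#|http\S+", "", text)   # mentions, punctuation, #, URLs
    return [w for w in word_tokenize(text) if w not in SW]           # tokenize and drop stop words

# new_sample['tokens'] = new_sample['Tweet Content'].apply(clean_tweet)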
