# Load the tweets; this script reads the `df` object (with a `text` column)
# after sourcing the collection script.
source("collect_data_twitter.R")

# Some tweets
head(df$text)
# [1] "@rdryan33 @WingedWheels I'd be ordering a Big Mac! Haha"
# [2] "RT @Savito_BossDon: Only Thing I Need For Thanksgiving Is A Big Ass Plate Of Sweet Potatoes And Baked Mac And Cheese With A Few Slabs Of Tu???"
# [3] "Never mess with a mans Big Mac (Vine by @sport_fun_facts) https://t.co/6ThanEvBkG"

### We are interested in the text - Let's clean it!

# We first convert the encoding of the text from latin1 to ASCII.
# iconv() is vectorized, so no sapply() wrapper is needed; calling it directly
# also avoids sapply() attaching every tweet as the name of its own element.
# Characters that cannot be represented in ASCII are dropped (sub = "").
df$text <- iconv(as.character(df$text), "latin1", "ASCII", sub = "")

#' Clean raw tweet text
#'
#' Removes URLs, the retweet marker, punctuation (hashtag signs are kept),
#' and digits, then normalizes whitespace.
#'
#' @param tx A character vector of tweets. The function is vectorized, so it
#'   accepts a single string or a whole column.
#' @return A character vector the same length as `tx` with the cleaned text.
#'   `NA` inputs stay `NA`.
clean.text <- function(tx) {
  # Drop every token starting with "htt" up to the next whitespace. This
  # removes complete URLs of any length as well as URLs cut off by tweet
  # truncation, without eating the words that follow a short URL.
  tx <- gsub("\\bhtt\\S*", " ", tx, ignore.case = TRUE, perl = TRUE)

  # Drop the retweet marker only when it is a whole word, so that the letters
  # "rt" inside ordinary words ("party", "apart") are left alone.
  tx <- gsub("\\bRT\\b", " ", tx, ignore.case = TRUE, perl = TRUE)

  # Replace all punctuation except "#" with a space. The negated class reads:
  # "anything that is neither '#' nor a non-punctuation character".
  # "@" is punctuation, so mentions lose their "@" here as well.
  tx <- gsub("[^#[:^punct:]]", " ", tx, perl = TRUE)

  # Replace digits with a space.
  tx <- gsub("[[:digit:]]", " ", tx)

  # Collapse any run of whitespace (spaces, tabs, newlines) to a single space.
  tx <- gsub("\\s+", " ", tx)

  # Strip leading and trailing whitespace entirely.
  trimws(tx)
}

# Kept as a list (one element per tweet) so code that consumes `clean_tweets`
# sees the same container type as before.
clean_tweets <- lapply(df$text, clean.text)
head(clean_tweets)