# install required packages
install.packages("tm")
install.packages("SnowballC")
install.packages("wordcloud")
install.packages("e1071")
install.packages("gmodels")

# read the SMS data into the sms_raw data frame
sms_raw <- read.csv("sms_spam.csv.txt", stringsAsFactors = FALSE)

# examine the structure of the SMS data
str(sms_raw)

# convert spam/ham to a factor (categorical variable)
sms_raw$type <- factor(sms_raw$type)

# examine the type variable more carefully
str(sms_raw$type)
table(sms_raw$type)

# build a corpus (body of documents) using the text mining (tm) package
library(tm)
sms_corpus <- VCorpus(VectorSource(sms_raw$text))

# examine the SMS corpus
print(sms_corpus)
inspect(sms_corpus[1:2])
lapply(sms_corpus[1:2], as.character)

# clean up the corpus using tm_map()
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))

# show the difference between sms_corpus and sms_corpus_clean
as.character(sms_corpus[[1]])
as.character(sms_corpus_clean[[1]])

sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers)             # remove numbers
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords())  # remove stop words
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation)         # remove punctuation
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)              # reduce words to their stems
sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace)           # eliminate extra whitespace

# examine the final clean corpus
lapply(sms_corpus[1:3], as.character)
lapply(sms_corpus_clean[1:3], as.character)

# create a document-term sparse matrix
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)

# see the result
sms_dtm

# create training and test datasets
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test  <- sms_dtm[4170:5559, ]

# also save the labels
sms_train_labels <- sms_raw[1:4169, ]$type
sms_test_labels  <- sms_raw[4170:5559, ]$type

# check that the proportion of spam is similar in both sets
prop.table(table(sms_train_labels))
prop.table(table(sms_test_labels))

# subset the raw data into spam and ham groups
spam <- subset(sms_raw, type == "spam")
ham  <- subset(sms_raw, type == "ham")

# word cloud visualization
library(wordcloud)
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))

# save frequently appearing terms (5+ messages) to a character vector
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)

# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test  <- sms_dtm_test[ , sms_freq_words]

# see what is in a DTM
inspect(sms_dtm_freq_train)

# convert counts to "Yes"/"No" indicators (Naive Bayes works with categorical features)
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}

# apply convert_counts() to the columns of the train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test  <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)

# Training a model on the data ----
library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)

# Evaluating model performance ----
sms_test_pred <- predict(sms_classifier, sms_test)

library(gmodels)
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))

# Improving model performance with Laplace smoothing ----
sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
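
# (Addition, not part of the original walkthrough) A minimal sketch of a
# one-number accuracy summary for comparing the two models, assuming the
# objects created above (sms_test_pred, sms_test_pred2, sms_test_labels)
# are in the workspace. The mean of the elementwise agreement between
# predictions and labels is the proportion of test messages classified
# correctly; the CrossTable() output above remains the detailed view.
mean(sms_test_pred == sms_test_labels)   # accuracy without smoothing
mean(sms_test_pred2 == sms_test_labels)  # accuracy with laplace = 1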