### Unigram bag_words
bag_words <- function(character_vector) {
  ### Create a TF-IDF weighted document term matrix from a character vector.
  ###
  ### character_vector: character vector of texts; it is handed to
  ###   VectorSource() to build the corpus.
  ### Returns the result of DocumentTermMatrix() on the cleaned corpus.

  # NOTE(review): the original comment here read "removes all tags such as
  # ..." but the tag names were lost. The only substitution performed by this
  # function is replacing every ';' with a space -- confirm whether a
  # tag-stripping step is missing.
  character_vector <- gsub(';', ' ', character_vector)

  corpus <- VCorpus(VectorSource(character_vector))
  corpus <- tm_map(corpus, stripWhitespace) # strip white space
  corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove stop words
  corpus <- tm_map(corpus, stemDocument) # stemming

  # Last expression is the call itself (not an assignment) so the matrix is
  # returned visibly.
  DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
}
### Convert a triplet-style sparse matrix into a Matrix::sparseMatrix.
###
### simple_triplet_matrix_sparse: object exposing `$i`, `$j`, `$v`, `$nrow`
###   and `$ncol` (row indices, column indices, values, dimensions), and for
###   which dimnames() is defined (NULL is fine).
### Returns the column-compressed sparse matrix built by sparseMatrix(),
### carrying over the dimensions and dimnames of the input.
as_sparseMatrix <- function(simple_triplet_matrix_sparse) {
  values <- simple_triplet_matrix_sparse$v
  # Going through as.character() is only needed to recover the labels of a
  # factor. Applying it to doubles would round them to 15 significant digits,
  # so numeric values are converted directly.
  if (is.factor(values)) {
    values <- as.character(values)
  }

  # `giveCsparse = TRUE` is intentionally not passed: it equals the default
  # (column-compressed output) and the argument is deprecated in newer Matrix
  # releases in favour of `repr`.
  sparseMatrix(
    i = as.numeric(simple_triplet_matrix_sparse$i),
    j = as.numeric(simple_triplet_matrix_sparse$j),
    x = as.numeric(values),
    dims = c(
      simple_triplet_matrix_sparse$nrow,
      simple_triplet_matrix_sparse$ncol
    ),
    dimnames = dimnames(simple_triplet_matrix_sparse)
  )
}