# GloVe word embeddings for product descriptions (text2vec) ----
#
# Reads product descriptions, builds a pruned vocabulary and a term
# co-occurrence matrix, fits GloVe word vectors, and inspects cosine
# similarities between terms.
#
# Lines starting with `#>` are output recorded in the original console
# session; they are kept for reference only.

library(text2vec)
library(tm) # used for removePunctuation()

# Load and clean the text ----

product.dat <- read.csv(
  "C:/DAR 2017/product_descriptions.csv",
  stringsAsFactors = FALSE
)

# The second column is used as the text to embed.
txt <- product.dat[, 2]
txt <- removePunctuation(txt)
txt <- tolower(txt)

# Vocabulary ----

# `txt` is already lower-cased above, so no extra preprocessor is passed.
it <- itoken(txt, tokenizer = word_tokenizer, n_chunks = 10)

vocab <- create_vocabulary(it)
dim(vocab)
#> [1] 364758      3

vocab <- prune_vocabulary(
  vocab,
  term_count_min = 10,
  doc_proportion_max = 0.8,
  doc_proportion_min = 0.001,
  vocab_term_max = 20000
)
dim(vocab)
#> [1] 6563    3

vocab[1:3, ]
#> Number of docs: 124428
#> 0 stopwords:  ...
#> ngram_min = 1; ngram_max = 1
#> Vocabulary:
#>    term term_count doc_count
#> 1: with     250258     98674
#> 2:   of     229820     91535
#> 3:   is     192208     87733

# Co-occurrence matrix and GloVe fit ----

vectorizer <- vocab_vectorizer(vocab)
tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L)

glove <- GlobalVectors$new(rank = 50, x_max = 10)
wv_main <- glove$fit_transform(tcm, n_iter = 50, convergence_tol = 0.01)
wv_context <- glove$components

# Final embedding = main vectors + transposed context vectors.
word_vectors <- wv_main + t(wv_context)
word_vectors[1:3, 1:5]
#>            [,1]       [,2]        [,3]       [,4]        [,5]
#> with  0.1572956 -0.4986556 -0.30388024 -0.5208205 -0.49814761
#> of    0.5462567 -0.3089656 -0.40487972  0.3400864 -0.16796595
#> is   -0.3926085 -0.9640168  0.08781935  0.3989814 -0.08923554

# Nearest neighbours of single terms ----

# `drop = FALSE` keeps the selected row as a 1-row matrix for sim2().
screw <- word_vectors["screw", , drop = FALSE]
cos_sim <- sim2(x = word_vectors, y = screw, method = "cosine", norm = "l2")
head(sort(cos_sim[, 1], decreasing = TRUE), 5)
#>     screw    screws      head       hex     bolts
#> 1.0000000 0.7952427 0.6618135 0.6532096 0.6130304

design <- word_vectors["design", , drop = FALSE]
cos_sim <- sim2(x = word_vectors, y = design, method = "cosine", norm = "l2")
head(sort(cos_sim[, 1], decreasing = TRUE), 5)
#>    design        an    unique       its     style
#> 1.0000000 0.7495426 0.7422682 0.7408201 0.7407434

# Full term-by-term similarity matrix ----

cos_sim <- sim2(x = word_vectors, method = "cosine", norm = "l2")
dim(cos_sim)
#> [1] 6563 6563

cos_sim[1:5, 1:5]
#>           with        of        is        or      your
#> with 1.0000000 0.7087212 0.7530961 0.5669205 0.5603469
#> of   0.7087212 1.0000000 0.6033791 0.4730575 0.5919557
#> is   0.7530961 0.6033791 1.0000000 0.5426918 0.4613141
#> or   0.5669205 0.4730575 0.5426918 1.0000000 0.6112975
#> your 0.5603469 0.5919557 0.4613141 0.6112975 1.0000000

cos_sim["of", "is"]
#> [1] 0.6033791

max(cos_sim["of", c("is", "with")])
#> [1] 0.7087212

# Out-of-vocabulary terms ----

# But beware! Indexing the matrix with a term that is not in the pruned
# vocabulary fails. In the original session:
#   cos_sim["hahaha", "is"]
#   #> Error in cos_sim["hahaha", "is"] : subscript out of bounds

# Look up the similarity of two single terms in a named similarity matrix.
#
# sim:    numeric matrix with term names as row and column names.
# term_a: row term (length-1 character).
# term_b: column term (length-1 character).
# Returns the matrix entry, or NA_real_ when either term is not present,
# instead of raising "subscript out of bounds".
lookup_similarity <- function(sim, term_a, term_b) {
  known <- term_a %in% rownames(sim) && term_b %in% colnames(sim)
  if (!known) {
    return(NA_real_)
  }
  sim[term_a, term_b]
}

lookup_similarity(cos_sim, "hahaha", "is") # NA: "hahaha" is not a known term
lookup_similarity(cos_sim, "of", "is")     # same value as cos_sim["of", "is"]