# lees de query_product tabel in

> query_product.dat <- read.csv("D:/Home Depot/query_product.csv",stringsAsFactors=FALSE)

# bekijk structuur data

> str(query_product.dat)
'data.frame':   74067 obs. of  5 variables:
 $ id           : int  2 3 9 16 17 18 20 21 23 27 ...
 $ product_uid  : int  100001 100001 100002 100005 100005 100006 100006 100006 100007 100009 ...
 $ product_title: chr  "Simpson Strong-Tie 12-Gauge Angle" "Simpson Strong-Tie 12-Gauge Angle" "BEHR Premium Textured DeckOver 1-gal. #SC-141 Tugboat Wood and Concrete Coating" "Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included)" ...
 $ search_term  : chr  "angle bracket" "l bracket" "deck over" "rain shower head" ...
 $ relevance    : num  3 2.5 3 2.33 2.67 3 2.67 3 2.67 3 ...

# beschrijvende statistieken van "relevantie"

> summary(query_product.dat$relevance)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.000   2.330   2.382   3.000   3.000 

> summary(as.factor(query_product.dat$relevance))
    1  1.25  1.33   1.5  1.67  1.75     2  2.25  2.33   2.5  2.67  2.75     3 
 2105     4  3006     5  6780     9 11730    11 16060    19 15202    11 19125 

# laad de library "tau" voor de "textcnt" functie

> library(tau)

# bekijk de productnaam van het eerste query-product paar

> query_product.dat[1,3]
[1] "Simpson Strong-Tie 12-Gauge Angle"

# tel de woorden in de productnaam
# bij verstek wordt gesplitst op spaties, leestekens, en cijfers, 
# en alles wordt naar lower case omgezet

> textcnt(query_product.dat[1,3],method="string",n=1L)
  angle   gauge simpson  strong     tie 
      1       1       1       1       1 

# zelfde als hierboven, maar splits nu alleen op spaties

> textcnt(query_product.dat[1,3],method="string",split="[[:space:]]+",n=1L)
  12-gauge      angle    simpson strong-tie 
         1          1          1          1 

# splits op spaties en leestekens

> textcnt(query_product.dat[1,3],method="string",split="[[:space:][:punct:]]+",n=1L)
     12   angle   gauge simpson  strong     tie 
      1       1       1       1       1       1 

> wc <- textcnt(query_product.dat[1,3],method="string",n=1L)

# welke woorden komen voor in de productnaam?

> names(wc)
[1] "angle"   "gauge"   "simpson" "strong"  "tie"    



# Flag, per query/document pair, whether every query term occurs in the
# document.
#
# Both texts are tokenised with textcnt(method = "string", n = 1L) from the
# "tau" package, leaving all other textcnt settings at their defaults. A pair
# scores 1 when the number of distinct query terms equals the number of terms
# shared with the document, and 0 otherwise.
#
# Args:
#   queries: vector of search queries.
#   docs:    vector of document texts (e.g. product titles), paired
#            element-wise with `queries`; must have the same length.
#
# Returns:
#   A numeric vector of 0/1 indicators, one per pair (numeric(0) when the
#   input is empty).
all.queryterms <- function(queries, docs) {
  if (length(queries) != length(docs)) {
    stop("'queries' and 'docs' must have the same length", call. = FALSE)
  }
  # seq_along() rather than 1:n, so that empty input yields an empty result
  # instead of iterating over c(1, 0).
  vapply(seq_along(queries), function(i) {
    query_counts <- textcnt(queries[i], method = "string", n = 1L)
    doc_counts <- textcnt(docs[i], method = "string", n = 1L)
    shared_terms <- intersect(names(query_counts), names(doc_counts))
    # Every query term is matched exactly when no term is lost in the
    # intersection.
    as.numeric(length(query_counts) == length(shared_terms))
  }, numeric(1))
}

# bereken deze feature op de query_product tabel

> allterms <- all.queryterms(query_product.dat$search_term,query_product.dat$product_title)

# in ongeveer 24% van de query-product paren komen alle zoektermen voor in de productnaam

> summary(allterms)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.2433  0.0000  1.0000 

# maak een data frame met de zojuist berekende feature en de relevantie-score.

> qp.dat <- data.frame(relevance=query_product.dat$relevance,allterms=allterms)

# trek een random sample ter grootte van 50000 uit de getallen 1:74067
# dit zijn de rij-nummers van de training set

> tr.index <- sample(74067,50000)

# schat een lineair regressiemodel op de trainingset

> qp.lm <- lm(relevance~allterms,data=qp.dat[tr.index,])

# bekijk het model; heeft de coefficient van "allterms" het verwachte teken?

> summary(qp.lm)

Call:
lm(formula = relevance ~ allterms, data = qp.dat[tr.index, ])

Residuals:
     Min       1Q   Median       3Q      Max 
-1.61913 -0.30683  0.02317  0.38087  0.69317 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 2.306835   0.002656  868.61   <2e-16 ***
allterms    0.312298   0.005389   57.95   <2e-16 ***
---
Signif. codes:  0 "***" 0.001 "**" 0.01 "*" 0.05 "." 0.1 " " 1

Residual standard error: 0.5167 on 49998 degrees of freedom
Multiple R-squared:  0.06294,   Adjusted R-squared:  0.06292 
F-statistic:  3358 on 1 and 49998 DF,  p-value: < 2.2e-16