This is the first piece to pull in the data and measure the relationships and write it back out to your database. The supporting pieces will be coming later on building the database and query to return the recommended products to use. Also coming up is some website scraping in Python to top selling products from others and how to include them to boost your recommendations.
#Author: Todd Palecek (toddpalecek@gmail.com)
#Description: Determine Product Affinity from a Market Basket
# using Apriori to feed a SQL Recommendation Engine
library(RODBC)
library(stats)
library(reshape2)
library(arules)
library(arulesViz)
library(config)
#Definitions:
#Support: The fraction of which our item set occurs in our dataset.
#Confidence: probability that a rule is correct for a new transaction with items on the left.
#Lift: The ratio by which by the confidence of a rule exceeds the expected confidence.
#Note: if the lift is 1 it indicates that the items on the left and right are independent.
#variable for report name and key to write to SQL
var.RptType <- 'MktBskt-60'
var.Link <- 'DC1-DWDB1.Silvermine.dbo.DimProduct.SKU'
#memory cleanup - may not be needed but can't hurt
gc()
#check and increase memory limit
memory.limit()
memory.limit(size = 26000) #if lower than currently set will not decrease memory available
#import from SQL
cn2 <- odbcDriverConnect(connection="Driver={SQL Server Native Client 11.0};server=YourServer;database=DataWarehouse;trusted_connection=yes;")
cnWrite <- odbcDriverConnect(connection="Driver={SQL Server Native Client 11.0};server=YourServer;database=Grok;trusted_connection=yes;case=nochange")
#Import data to R from SQL Data Warehouse
#To include text in SQL statement need to format as \'text\'
#If using your data warehouse as source date is likely an integer
#Only including orders that have multiple items to limit data imported to only useful data
dataFetch2 <-sqlQuery(cn2, 'with OrderItems (SKU, OrderID,BrandKey,OrderMethodKey)
as
(SELECT distinct P.SKU, D.OrderID, D.BrandKey,D.OrderMethodKey
FROM [Silvermine].[ssas].[FactDemand] D
join [Silvermine].[ssas].[DimProduct] P on D.ProductKey = P.ProductKey
where D.OrderReceivedDateKey between (SELECT YEAR(GETDATE()-60) * 10000 + MONTH(GETDATE()-60) * 100 + DAY(GETDATE()-60))
and (SELECT YEAR(GETDATE()-1) * 10000 + MONTH(GETDATE()-1) * 100 + DAY(GETDATE()-1))
)
select O.OrderID, O.SKU, O.BrandKey, O.OrderMethodKey
from OrderItems O
where O.BrandKey in (1) and O.OrderID in (select OrderID from OrderItems group by OrderID having COUNT(SKU) > 1)', rows_at_time=1000)
#close connection after not needed to free resources
close(cn2)
#memory cleanup - may not be needed but can't hurt
gc()
#copy to new dataframe for modifications
dataFetch3 <- dataFetch2
#extract unique values of SKUs and Order Methods to loop through later
sku_list <- unique(dataFetch3$BrandKey)
ordermethod_list <- unique(dataFetch3$OrderMethodKey)
#create blank data frame to populated relationships into
rulestoDBALL <- data.frame()
#loop through each SKU, Brand, and Order Method to build relationships
for (i in 1:length(sku_list)){
dataFetch2 <- dataFetch3[which(dataFetch3$BrandKey==sku_list[i]),]
dataFetch2$BrandKey <- NULL
for (j in 1:length(ordermethod_list)){
dataFetch4 <- dataFetch2[which(dataFetch2$OrderMethodKey==ordermethod_list[j]),]
dataFetch4$OrderMethodKey <- NULL
t <- dcast(dataFetch4, OrderID ~ SKU, drop = FALSE) #added drop = FALSE becasue of errors
t[is.na(t)] <- 0
gc()
t1 <- as(t[,-1]>0,"transactions")
#Get the rules, maxlen sets number of items in set - default is 10 - the more returned the
#more resource intenstive analysis will be. maxlen = 2 returns the top related products that
#we will recommend and speeds processing of data
rules <- apriori(t1, parameter = list(supp = 0.00001, conf = 0.05,maxlen=2))
#Show only 2 digits
options(digits=2)
#convert rules to data frame in prep to write to DB
rulestoDB <- as(rules, "data.frame");
rulestoDB$brandkey <- (sku_list[i])
rulestoDB$Link <- var.Link
rulestoDB$OrderMethod <- (ordermethod_list[j])
rulestoDBALL <- rbind(rulestoDBALL,rulestoDB)
}
}
#optional: review the rules created
inspect(rules[1:5])
#variable for report name and key to write to SQL
rulestoDBALL$AnalysisType <- print(var.RptType)
rulestoDBALL$AnalysisDate <- as.Date(Sys.Date())
rulestoDBALL$ReportType <- print(var.RptType)
rulestoDBALL$ReportDate <- as.Date(Sys.Date())
#Write data to SQL table. Further analysis is done within SQL.
sqlSave(cnWrite,rulestoDBALL,tablename = "stgR_MarketBasket",rownames=FALSE,append = TRUE, fast = FALSE )
#further reading
#http://www.salemmarafi.com/code/market-basket-analysis-with-r/
#Supporting SQL Tables
# USE [Grok]
# GO
#
# SET ANSI_NULLS ON
# GO
#
# SET QUOTED_IDENTIFIER ON
# GO
#
# CREATE TABLE [dbo].[stgR_MarketBasket](
# [rules] [varchar](250) NULL,
# [support] [decimal](8, 7) NULL,
# [confidence] [decimal](5, 4) NULL,
# [lift] [int] NULL,
# [brandkey] [int] NULL,
# [Link] [varchar](250) NULL,
# [OrderMethod] [int] NULL,
# [AnalysisType] [varchar](50) NULL,
# [AnalysisDate] [date] NULL,
# [ReportType] [varchar](50) NULL,
# [ReportDate] [date] NULL
# ) ON [PRIMARY]
# GO