---
title: "Advertising Response Measurement - Final Report"
author: "Team K - Zichen (Zoe) Huang, Jiaying (Claire) Wu, Chen (Cici) Chen"
date: "5/8/2019"
output:
  prettydoc::html_pretty:
    theme: cayman
    highlight: github
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, eval = TRUE, comment="", warning = FALSE, message = FALSE, tidy.opts=list(width.cutoff=55), tidy = TRUE)
set.seed(123) # for reproducibility
options(scipen=999) # suppress scientific notation
```
```{r libraries, echo=FALSE,message=F}
library(prettydoc)
library(data.table)
library(Hmisc)
library(scales)
library(DT)
library(lubridate)
```
```{r functions, echo=FALSE}
# Fit a linear or logistic regression and return a rounded coefficient summary.
fit.model <- function(dt, outcome.name, input.names, model.type, digits = 3){
  the.formula <- reduce.formula(dt = dt, outcome.name = outcome.name, input.names = input.names)
  if(model.type == "logistic"){
    mod <- glm(formula = the.formula, family = "binomial", data = dt)
    mod.summary <- logistic.regression.summary(glm.mod = mod, digits = digits)
  }
  if(model.type == "linear"){
    mod <- lm(formula = the.formula, data = dt)
    mod.summary <- linear.regression.summary(lm.mod = mod, digits = digits)
  }
  mod.summary.rounded <- mod.summary[, lapply(X = .SD, FUN = "round.numerics", digits = digits)]
  return(mod.summary.rounded)
}

# Summarize a fitted logistic regression with odds ratios and their 95% confidence intervals.
logistic.regression.summary <- function(glm.mod, digits = 3, alpha = 0.05){
  require(data.table)
  glm.coefs <- as.data.table(summary(glm.mod)$coefficients, keep.rownames = TRUE)
  setnames(x = glm.coefs, old = "rn", new = "Variable")
  z <- qnorm(p = 1 - alpha/2, mean = 0, sd = 1)
  glm.coefs[, Odds.Ratio := exp(Estimate)]
  glm.coefs[, OR.Lower.95 := exp(Estimate - z * `Std. Error`)]
  glm.coefs[, OR.Upper.95 := exp(Estimate + z * `Std. Error`)]
  return(glm.coefs[])
}

# Summarize a fitted linear regression with 95% confidence intervals for the coefficients.
linear.regression.summary <- function(lm.mod, digits = 3, alpha = 0.05){
  lm.coefs <- as.data.table(summary(lm.mod)$coefficients, keep.rownames = TRUE)
  setnames(x = lm.coefs, old = "rn", new = "Variable")
  z <- qnorm(p = 1 - alpha/2, mean = 0, sd = 1)
  lm.coefs[, Coef.Lower.95 := Estimate - z * `Std. Error`]
  lm.coefs[, Coef.Upper.95 := Estimate + z * `Std. Error`]
  return(lm.coefs)
}

# Build a regression formula from an outcome name and input names, optionally
# expanding regular-expression patterns against the data's column names.
create.formula <- function(outcome.name, input.names, input.patterns = NA, all.data.names = NA, return.as = "character"){
  variable.names.from.patterns <- c()
  if(!is.na(input.patterns[1]) & !is.na(all.data.names[1])){
    pattern <- paste(input.patterns, collapse = "|")
    variable.names.from.patterns <- all.data.names[grep(pattern = pattern, x = all.data.names)]
  }
  all.input.names <- unique(c(input.names, variable.names.from.patterns))
  all.input.names <- all.input.names[all.input.names != outcome.name]
  if(!is.na(all.data.names[1])){
    all.input.names <- all.input.names[all.input.names %in% all.data.names]
  }
  input.names.delineated <- sprintf("`%s`", all.input.names)
  the.formula <- sprintf("`%s` ~ %s", outcome.name, paste(input.names.delineated, collapse = "+"))
  if(return.as == "formula"){
    return(as.formula(the.formula))
  }
  return(the.formula)
}

# Drop inputs that are constant within some outcome category (which would make
# the regression rank-deficient), then build the formula with create.formula.
reduce.formula <- function(dt, outcome.name, input.names, input.patterns = NA, max.input.categories = 20, max.outcome.categories.to.search = 4, return.as = "formula"){
  require(data.table)
  dt <- setDT(dt)
  if(!(outcome.name %in% names(dt))){
    return("Error: outcome.name is not in names(dt).")
  }
  pattern.names <- list()
  if(!is.na(input.patterns[1])){
    for(i in 1:length(input.patterns)){
      pattern.names[[i]] <- names(dt)[grep(pattern = input.patterns[i], x = names(dt))]
    }
  }
  all.input.names <- c(input.names, as.character(pattern.names))
  num.outcome.categories <- dt[!is.na(get(outcome.name)), length(unique(get(outcome.name)))]
  if(num.outcome.categories <= max.outcome.categories.to.search){
    num.unique.tab <- dt[, lapply(X = .SD, FUN = function(x){return(length(unique(x[!is.na(x)])))}), .SDcols = input.names, by = outcome.name]
    min.categories.tab <- num.unique.tab[, lapply(X = .SD, FUN = "min"), .SDcols = input.names]
    reduced.inputs <- names(min.categories.tab)[min.categories.tab >= 2]
  }
  if(num.outcome.categories > max.outcome.categories.to.search){
    reduced.inputs <- all.input.names
  }
  the.formula <- create.formula(outcome.name = outcome.name, input.names = reduced.inputs, all.data.names = names(dt), input.patterns = NA, return.as = return.as)
  return(the.formula)
}

# Round numeric vectors to the requested precision; leave other types unchanged.
round.numerics <- function(x, digits = 0, nearest = 1){
  if(is.numeric(x)){
    return(nearest * round(x = x/nearest, digits = digits))
  }
  return(x)
}
```
```{r read_data, echo=FALSE, eval=TRUE, results='hide'}
customer <- fread(input = "../Data/customer.csv")
impressions <- fread(input = "../Data/impressions.csv")
transactions <- fread(input = "../Data/transactions.csv")
```
```{r constant, echo=FALSE}
num.lines <- 10
```
# Introduction
The ultimate goal of any marketing activity is to increase sales, whether in the short term or the long term. Ideally, each campaign or marketing channel should be evaluated on its incremental profit, the additional sales produced with advertising over what would have been sold without it, relative to its cost. While marketing efforts can seem intangible because it is hard to establish causal relationships between marketing events and transactions, advanced statistical methods can shed light on the tangible impact of marketing on business growth and guide the direction of future practice.
By analyzing the correlations between marketing activities and transactions, the weaknesses and strengths of various channels, and the seasonality of consumer response, the marketing team can evaluate cost efficiency, optimize the allocation of marketing spending, and strategically plan marketing events to improve the effectiveness of marketing efforts.
Among diverse marketing activities, advertising is a major means of branding and informing. Both online (emails, social media, display, etc.) and offline (direct mail) advertisements play an indispensable role in marketing. Because placing ads is expensive in time, resources, and budget, one of the questions most frequently asked by management teams is: how does my advertising work? With this question in mind, our team leveraged four methods to tackle the advertising effectiveness measurement problem: last-click attribution analysis, holdout testing (experimentation), marketing mix models, and model-based attribution analysis. The main goal of this project is to evaluate the **effectiveness of different advertising channels** from these four perspectives. Along the way, we weigh the advantages and disadvantages of each approach to build a more comprehensive understanding of mix modeling methods for continued study.
# Sources of Data
The full dataset can be found in the submission files. Because datasets that link advertisement responses to transaction records are highly sensitive, real open data is hard to come by. The dataset we used for this project is a synthetic one simulated by Elea McDonnell Feit, Professor of Marketing at Drexel University, organized from three perspectives: customers, impressions, and transactions. The data generation method can be found here: https://github.com/eleafeit/ad_response_tutorial/blob/master/R%20code/AdResponseDataGeneration.R. The raw digital advertising dataset describes 10,000 customers and potential customers of a retailer, which uses four advertising channels - display ads, social media ads, email ads, and direct mail ads - for its marketing promotion.
# Examination of the Data
We will be working with a simulated data set on advertising responses. The data are stored in the following three files: **customer.csv**, **impressions.csv**, and **transactions.csv**. The first row of each file contains the column names, and each subsequent row is one observation. Here is a selection of `r num.lines` lines from each data file:
## Inspect customer file
**customer.csv**: Information about the users, with some fields from their profiles. Each row represents a customer (10,000 rows in total); the columns give basic information about each customer, including an id number, whether the customer made a purchase prior to the observation period, and whether the customer is eligible to receive emails or direct mail.
```{r show_header_customer, echo=FALSE, comment=""}
datatable(data = customer[1:num.lines,])
```
- **past.purchase**: in the original dataset, the type of *past.purchase* is *int*, but this variable records whether the customer has made a purchase prior to the observation period so we decided to convert it into *factor*.
- **email**: in the original dataset, the type of *email* is *int*, but this variable records whether the customer is eligible to receive emails so we decided to convert it into *factor*.
- **direct**: in the original dataset, the type of *direct* is *int*, but this variable records whether the customer is eligible to receive direct mails so we decided to convert it into *factor*.
```{r preprocessing_customer, echo=FALSE, comment=""}
customer[,past.purchase := as.factor(past.purchase)]
customer[,email := as.factor(email)]
customer[,direct := as.factor(direct)]
```
## Inspect impression file
**impressions.csv**: Information about the advertising impressions delivered to each customer. Each row is an exposure of a marketing communication to a specific customer (501,336 rows); the columns describe the customer's impression of an advertisement, including the customer's id number, the date of the impression, the channel of the ad exposure, and whether the customer clicked on the ad.
```{r show_header_impression , echo=FALSE, comment=""}
datatable(data = impressions[1:num.lines,])
```
- **date**: in the original dataset, the type of *date* is *chr*, and we decided to convert it into *date*.
- **channel**: in the original dataset, the type of *channel* is *chr*, and we decided to convert it into *factor*.
- **click**: in the original dataset, the type of *click* is *int*, but this variable records whether the customer clicked on the ad so we decided to convert it into *factor*.
```{r preprocessing_impressions, echo=FALSE, comment=""}
impressions[,date := as.Date(date)]
impressions[,channel := as.factor(channel)]
impressions[,click := as.factor(click)]
```
## Inspect transactions file
**transactions.csv**: Information about the customers' transaction history over time. Each row is a transaction made by a customer; the columns record the basic information of a transaction, including the customer id, the date of the transaction, the channel of the last ad impression the customer saw before the transaction, and the channel of the last ad the customer clicked before the transaction.
```{r show_header_transactions , echo=FALSE, comment=""}
datatable(data = transactions[1:num.lines,])
```
- **V1**: in the original dataset, *V1* only represents the row number, so it is safe for us to remove this variable.
- **date**: in the original dataset, the type of *date* is *chr*, and we decided to convert it into *date*.
- **last.touch**: in the original dataset, the type of *last.touch* is *chr*, and we decided to convert it into *factor*.
- **last.click**: in the original dataset, the type of *last.click* is *chr*, and we decided to convert it into *factor*.
```{r preprocessing_transactions, echo=FALSE, comment=""}
transactions[,V1 := NULL]
transactions[,date := as.Date(date)]
transactions[,last.touch := as.factor(last.touch)]
transactions[,last.click := as.factor(last.click)]
```
# Method 1: Last Touch Analysis
**Attribution rules with last-touch analysis**
### 1. Investigation
Based on last-touch attribution, we find the last ad the user saw (or, under last-click attribution, clicked on) prior to the conversion, so that sales can be attributed to each channel. In this case, information about both the last touch and the last click is stored in the transactions file. A quick crosstab on the transactions table gives the number of transactions attributed to each channel by last touch.
```{r attribution.rules.table, echo=F}
last.touch.tab <- xtabs(~last.touch, data = transactions)
```
```{r attribution.rules.table_show,echo=F}
last.touch.tab
```
```{r attribution.rules.barplot, echo=FALSE}
tab <- transactions[,.N,by = last.touch]
barplot <- barplot(height = tab[,N], space=0.01, las = 1, main = "Last Touch Attribution", ylab = "Transactions", xlab = "channel", ylim = c(0, 1.2*max(tab[,N], na.rm = TRUE)), col = "dodgerblue")
text(barplot, par("usr")[3], labels = tab[,last.touch], srt = 45, adj = c(1.1,1.1), xpd = TRUE)
space_val = 0
text(x = -0.4 + 1:length(tab[,last.touch]) * (1+space_val), y = tab[,N], labels = tab[,N], pos = 3)
```
### 2. Results and Interpretation
As seen from the results, the incremental sales attributed to social media ads are 6,596.
Please refer to the dashboard for last-touch results on subgroups of transactions; it shows, when some channels are unavailable, which of the remaining channels is the best choice.
We can also check whether the incremental sales for the different ads change once we take time into consideration; for example, we can subset the transactions by date and then crosstab.
```{r attribution.rules.table2, echo=F}
last.touch.tab1 <- xtabs(~last.touch, data = transactions[date>=as.Date('2017-02-01') & date<=as.Date('2017-02-28'),])
```
```{r attribution.rules.table2_show, echo=F}
last.touch.tab1
```
```{r attribution.rules.barplot2, echo=F}
tab1 <- transactions[date>=as.Date('2017-02-01') & date<=as.Date('2017-02-28'),.N,by = last.touch]
barplot1 <- barplot(height = tab1[,N], space=0.01, las = 1, main = "Last Touch Attribution", ylab = "Transactions", xlab = "channel", ylim = c(0, 1.2*max(tab1[,N], na.rm = TRUE)), col = "dodgerblue")
text(barplot1, par("usr")[3], labels = tab1[,last.touch], srt = 45, adj = c(1.1,1.1), xpd = TRUE)
space_val = 0
text(x = -0.4 + 1:length(tab1[,last.touch]) * (1+space_val), y = tab1[,N], labels = tab1[,N], pos = 3)
```
As seen from the results, far fewer February transactions are attributed to social media ads, probably because the social media campaign ended at the end of January. Please refer to the dashboard to see how the attribution differs across time periods.
### 3. Assumptions
One assumption here is that we ignore all the customers who did not make an actual transaction. We also count all sales as incremental, meaning that consumers who saw the ads would not have bought had they not seen them.
# Method 2: Holdout Test Analysis
### 1. Investigation
Holdout testing, also called a randomized controlled trial, is an experiment that randomly selects customers into a control group that is not exposed to an ad; comparing the people who received the ads with those who didn't tells us whether the ads are effective.
Emails were sent on 2017-01-03, 2017-01-17, 2017-01-24, 2017-01-31, 2017-02-07, 2017-02-14, and 2017-02-21, each including a holdout group formed by randomly selecting customers for a control group not exposed to the email ad.
Here, we picked 2017-01-31 and analyzed the result of the holdout test over the first 10 days.
```{r holdout_test_ttable, echo=F}
test.date <- as.Date("2017-01-31")
test.id <- impressions[date == test.date & channel == "email",.(id = unique(id))]
test.id[,group := "test group"]
hold.out.id <- impressions[date == test.date & channel == "email.holdout",.(id = unique(id))]
hold.out.id[,group := "control group"]
total.tab <- rbind(test.id,hold.out.id)
total.tab[,group := as.factor(group)]
duration <- 10
trans.id <- transactions[date>=test.date & date<(test.date+duration),id]
total.tab[,converted := id %in% trans.id]
setnames(total.tab, old = "converted", new = "consumed")
ttable <- xtabs(~group + consumed, data = total.tab)
ttable
mosaicplot(~group + consumed, data = total.tab,
main = paste("Holdout test on", test.date))
```
### 2. Results and Interpretation
From the table and the plot, it is clear that the proportion of people who made a purchase is higher in the test group (who received the email ads) than in the control group (who didn't receive any email ads). Below is a proportion test with more detail on the difference in conversion rate between the two groups.
```{r holdout_test_proptest, echo=F}
# Compare conversion rates between the control and test groups
proptest <- prop.test(x = ttable[,"TRUE"], n = xtabs(~group, data = total.tab))
# prop.test reports the confidence interval for (control - test), so flip the
# sign and the order to express it as (test - control)
diff.conv <- c(diff = proptest$estimate[2] - proptest$estimate[1], ci = rev(-proptest$conf.int))
```
```{r holdout_test_proptest_show, echo=F}
proptest
diff.conv
```
As this example shows, the test group had a 45.89% conversion rate in the first 10 days after the email was sent, while the holdout group had a 40.1% conversion rate, so the email on `r test.date` produced incremental sales. The incremental increase in conversion rate is between +2.60% and +8.98% (95% confidence interval). Please refer to the dashboard to see how much advertising increases sales for different experiment dates and window periods. One inference from the dashboard is that ad response is often greatest just after exposure and then falls off over time.
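As a minimal sketch (reusing the objects computed in the chunk above, not code taken from the dashboard itself), the same comparison can be repeated for several window lengths; the durations chosen here are illustrative.
```{r holdout_window_sketch, eval=FALSE}
# Recompute the test-vs-control conversion lift for several window lengths
# after the test date
for(duration in c(3, 7, 10, 14)){
  trans.id <- transactions[date >= test.date & date < (test.date + duration), id]
  total.tab[, consumed := id %in% trans.id]
  rates <- prop.table(xtabs(~group + consumed, data = total.tab), margin = 1)[, "TRUE"]
  cat(duration, "days: lift =", round(rates["test group"] - rates["control group"], 4), "\n")
}
```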
### 3. Assumptions
By randomly selecting the customers for the control group, we assume that the two groups are the same on average: both the treatment and control groups are similar in their propensity to transact and in their response to ads, a property known as probabilistic equivalence.
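For context, here is a minimal sketch of how such a holdout split could be created; the 10% holdout share and the variable names are hypothetical, not taken from the data generation code.
```{r holdout_assignment_sketch, eval=FALSE}
# Hypothetical holdout assignment: before a send, randomly reserve 10% of
# email-eligible customers as a control group that receives no email
eligible.ids <- customer[email == "1", id]
holdout.ids <- sample(x = eligible.ids, size = round(0.1 * length(eligible.ids)))
send.ids <- setdiff(eligible.ids, holdout.ids)
```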
# Method 3: Marketing Mix Modeling
Based on our datasets, the responses are usually categorical or logical, but before going deep into logistic regression, we first run linear regressions to find the correlation between total sales and advertising impressions on the same day/week/month.
### 1. Investigation
We did some research on marketing mix models before starting to build our own.
A "marketing mix model" (MMM) is a regression relating advertising spending or total impressions to some response such as sales (transactions). Marketing mix modeling is a statistical analysis of marketing time series data that estimates the impact of various marketing strategies on sales and predicts the impact of future combinations of marketing tactics. It is often used to optimize the advertising mix and promotional tactics with respect to sales revenue or profit.
To investigate the relationships between total sales (transactions) and the impression factors (the four advertising channels: display ads, social media ads, email ads, and direct mail ads) within a specific time period, we implemented linear regression models for marketing mix modeling.
A simple marketing mix model for our case could be represented as follows:
$$sales_t = \beta_0 + \beta_1display_t + \beta_2social_t + \beta_3email_t + \beta_4direct_t + \epsilon_t$$
The $\beta$'s represent the unknown relationships between advertising and sales. For instance, $\beta_1$ is the increase in sales gained for each additional display impression, holding the other impression factors fixed.
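As a minimal sketch (the report itself fits its models through the `fit.model()` helper defined earlier), this regression is an ordinary least squares fit and could be written directly with `lm()`, using the `mdata` frame assembled in the data preparation chunk below:
```{r mmm_lm_sketch, eval=FALSE}
# Plain-lm version of the simple marketing mix model above; `mdata` holds
# one row per day with daily sales and impression counts per channel
mmm <- lm(Sales ~ Direct + Display + Email + Social, data = mdata)
summary(mmm)  # the coefficients estimate the per-impression lift in sales
```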
We have built five different linear models to see whether there are some thoughtful results which may help us understand the relationship between sales and advertising spending better.
* Model 1: Basic regression including email.holdout
* Model 2: Add in a day of week variable
* Model 3: Taking the advertising effect into consideration
* Model 4: Interactions terms ($Email \times Social$)
* Model 5: Interactions terms ($Direct \times Social$)
When fitting models, it is more convenient to put the transactions and impressions data together in one data frame, with all the necessary variables side by side, so we did some data preparation before modeling.
Let's take a look at our summarized data, the exploration plots, and the corresponding correlation matrix.
```{r MMM_Data.Prep, echo=F}
# Data preparation: summarize transactions and impressions by date
trans.by.day <- xtabs(~date, transactions)
trans.by.click <- xtabs(~last.click, data = transactions)
data.channel <- xtabs(~date + channel, impressions)
# Align daily sales with the daily impression counts (the two tables are
# offset by one row because the impressions file starts one day earlier)
mdata <- data.frame(cbind(Sales = trans.by.day[1:57], data.channel[2:58,]))
# Day of week
mdata$dayofweek <- weekdays(as.Date(rownames(mdata)))
# Ad effect (adstock) variables: stock_t = impressions_t + rate * stock_(t-1)
mdata$Email.ad.effect <- as.numeric(stats::filter(x = mdata$email, filter = 0.5, method = "recursive"))
mdata$Display.ad.effect <- as.numeric(stats::filter(x = mdata$display, filter = 0.3, method = "recursive"))
mdata$Direct.ad.effect <- as.numeric(stats::filter(x = mdata$direct, filter = 0.75, method = "recursive"))
mdata$Social.ad.effect <- as.numeric(stats::filter(x = mdata$social, filter = 0.3, method = "recursive"))
# Interaction terms
mdata$inter <- mdata$Email.ad.effect * mdata$Social.ad.effect
mdata$inter2 <- mdata$Direct.ad.effect * mdata$Social.ad.effect
# Readable column names
colnames(mdata) <- c("Sales", "Direct", "Display", "Email", "Email Holdout", "Social",
                     "Day of Week",
                     "Email ad.effect", "Display ad.effect", "Direct ad.effect", "Social ad.effect",
                     "Email&Social", "Direct&Social")
```
* Summarized data
```{r MMM_Data,echo=F}
datatable(head(mdata,10))
```
* Exploration plots
```{r MMM_Plot1, fig.height=4, fig.width=14, echo=F}
# Daily impressions versus transactions
par(mfrow = c(1, 4))
plot(x = mdata$Display, y = mdata$Sales, xlab = "Display",
     ylab = "Transactions", main = "Display vs. Sales", col = "dodgerblue")
plot(x = mdata$Social, y = mdata$Sales, xlab = "Social",
     ylab = "Transactions", main = "Social vs. Sales", col = "dodgerblue")
plot(x = mdata$Direct, y = mdata$Sales, xlab = "Direct",
     ylab = "Transactions", main = "Direct vs. Sales", col = "dodgerblue")
plot(x = mdata$Email, y = mdata$Sales, xlab = "Email",
     ylab = "Transactions", main = "Email vs. Sales", col = "dodgerblue")
```
```{r MMM_Plot2, fig.align='center',echo=F}
# Transactions by date
plot(trans.by.day, col="dodgerblue", ylim=c(0,700), ylab="Transactions", main="Transactions by Date", xlab="Date")
```
Those plots give us a general idea of the relationship between the impression factors and sales; the Display channel tends to show a different trend from the others, which will be explored further in the model building below.
* Correlation matrix
```{r MMM_corr, echo=FALSE}
datatable(round.numerics(cor(mdata[,c(1:6)]),3))
```
Based on the correlation matrix, Email, Email Holdout, and Social appear to have a large impact on Sales, with positive correlations of 0.529, 0.533, and 0.395 respectively, while the other two factors seem to have only a slight relationship.
### 2. Results and Interpretation
Using the two datasets `impressions` and `transactions`, we have daily observations of sales and advertising activity for the different impression channels, which gives us the data to estimate the unknown parameters of the model, as shown below.
**Model 1: Basic regression including email.holdout relating transactions to impressions**
The model 1 can be written as
$$Sales_t=\beta_0 + \beta_1Direct_t+\beta_2Display_t+\beta_3 Email_t+\beta_4 Email.holdout_t+\beta_5Social_t+\epsilon_t$$
```{r MMM_Model1, echo=FALSE}
model.1<-fit.model(dt=mdata[,c(1:6)], "Sales",
input.names=c("Direct","Display","Email","Email Holdout","Social"),
model.type="linear", digits = 3)
datatable(model.1)
```
```{r MMM_Model1_reportshows, echo=FALSE}
M1.beta<-model.1$Estimate
```
* $Sales_t=`r M1.beta[1]`+`r M1.beta[2]`Direct_t`r M1.beta[3]`Display_t`r M1.beta[4]`Email_t+`r M1.beta[5]`Email.holdout_t+`r M1.beta[6]`Social_t$
* From the output above, there is only one statistically significant effect, social impressions: we gain `r M1.beta[6]` additional transactions for each social impression.
* The estimated effects of Email and Display are negative, but they are not statistically significant.
**Model 2: Add the Day of Week variable**
Besides the advertising impressions, we introduced a new variable into the linear regression model, the day of the week, which enables us to inspect the relationship between sales and the marketing channels across the week rather than only within the same day.
The Model 2 can be written as
$$Sales_t=\beta_0 + \beta_1Direct_t+\beta_2Display_t+\beta_3Email_t+\beta_4Social_t+\textstyle\sum_{i}\beta_{i}\,DayOfWeek_{i,t}+\epsilon_t$$
where $i$ indexes the weekday indicator variables.
```{r MMM_Model2, echo=F}
model.2<-fit.model(dt=mdata, "Sales",
input.names=c("Direct","Display","Email","Social","Day of Week"),
model.type="linear", digits = 4)
datatable(model.2)
```
```{r MMM_Model2_reportshows, echo=F}
M2.beta<-model.2$Estimate
```
Adding the new variable changed the coefficients of our model. Notice also that Friday does not appear among the coefficients: the day-of-week variable enters the model as a factor whose levels are ordered alphabetically, so Friday is absorbed as the reference (baseline) level, and the other weekday coefficients are measured relative to it. A releveling sketch follows this model's discussion.
In Model 2, we found a significant association between direct impressions and transactions, as well as between social impressions and transactions.
For instance, direct mail yields `r M2.beta[2]*1000` additional transactions per thousand direct impressions, and social yields `r M2.beta[5]*1000` additional transactions per thousand social impressions.
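As a small aside (not part of the models fitted above), the baseline day can be changed by releveling the factor before fitting, if comparisons against a different reference day are wanted:
```{r dayofweek_relevel_sketch, eval=FALSE}
# Hypothetical: make Monday the reference level instead of Friday, so the
# other day-of-week coefficients are interpreted relative to Monday
mdata$`Day of Week` <- relevel(factor(mdata$`Day of Week`), ref = "Monday")
```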
**Model 3: Taking the advertising effect into consideration**
So far, Models 1 and 2 assume that an impression at time t affects the number of transactions at time t only, and that those impressions have no effect on sales on other days.
However, this is not what really happens in marketing: the results of the email holdout test, for example, suggest that the effect of an email impression lasts about three days. Intuitively, an advertisement has its biggest effect right after it is shown to users, and the effect wears off over time.
The following plot shows what the advertising effect looks like; an exponential decay function is usually used to describe how the effect of an advertisement drops off.
```{r MMM_ad.effect, echo=FALSE, fig.align='center', fig.width=5, fig.height=5}
# Illustrate the decay of an impression's effect with an exponential curve
x <- seq(0, 5, 0.01)
plot(x, 10*exp(-x), type = "l", main = "Advertising effect",
     xlab = "Time", ylab = "Impression effect", col = "red", xlim = c(0,5), ylim = c(0,10))
```
Additionally, marketers have defined a term, advertising adstock, to describe the prolonged or lagged effect of advertising on consumer purchase behavior; it is also known as advertising carry-over. Adstock is an important component of marketing mix models. An adstock variable is created by computing the exponential decay of each day's impressions and summing the remaining stock from impressions on previous days.
To keep this report easy to understand, we simply call this phenomenon the advertising effect, and Model 3 takes it into consideration; the small sketch below shows the recursion behind the ad effect variables.
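As a minimal illustration (the impression values below are made up), `stats::filter()` with `method = "recursive"` computes exactly the adstock recursion $stock_t = impressions_t + rate \times stock_{t-1}$ used in the data preparation chunk:
```{r adstock_sketch, eval=FALSE}
# Toy example of the adstock recursion, here with rate = 0.5: each day keeps
# half of the previous day's stock and adds the new impressions
impressions.by.day <- c(100, 0, 0, 50, 0)
adstock <- as.numeric(stats::filter(x = impressions.by.day, filter = 0.5, method = "recursive"))
adstock  # 100.00, 50.00, 25.00, 62.50, 31.25
```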
Using R, we transformed the original impression factors into new ad effect variables for each of the four channels; their plots are shown below:
```{r MMM_Model3_plots,fig.height=4, fig.width=14, echo=F}
# Plot the ad effects over time
par(mfrow=c(1,4))
plot(mdata$`Email ad.effect`, type="l", xlab="Time", ylab="Email ad effect", col="dodgerblue")
plot(mdata$`Display ad.effect`, type="l", xlab="Time", ylab="Display ad effect", col="dodgerblue")
plot(mdata$`Direct ad.effect`, type="l", xlab="Time", ylab="Direct ad effect", col="dodgerblue")
plot(mdata$`Social ad.effect`, type="l", xlab="Time", ylab="Social ad effect", col="dodgerblue")
```
Model 3 can be written as
$$Sales_t=\beta_0 + \beta_1Direct.ad.effect_t+\beta_2Display.ad.effect_t+\beta_3Email.ad.effect_t+\beta_4Social.ad.effect_t+\epsilon_t$$
To make the results more precise, we remove the first few observations to allow the advertising effect to "warm up".
```{r MMM_Model3, echo=F}
model.3<-fit.model(dt=mdata[10:nrow(mdata),c(1,8:11)], "Sales",
input.names=c("Direct ad.effect","Display ad.effect","Email ad.effect","Social ad.effect"),
model.type="linear", digits = 3)
datatable(model.3)
```
```{r MMM_Model3_reportsshows,echo=FALSE}
m3.beta<-model.3$Estimate
```
* Positive effects for all forms of advertising.
* Direct and Email appear to have a similar influence on sales, at about `r m3.beta[2]` and `r m3.beta[4]` additional transactions per unit of ad effect, respectively.
* All effects are statistically significant except for Display, which still has a high standard error after adjusting for the advertising effect, indicating that we do not have a precise estimate of its effect. This happens because daily display impressions are nearly the same every day, leaving little variation to identify the effect.
**Model 4: Interactions ($Email \times Social$)**
So far we have built three models; what else should we consider? Interactions occur when having two advertising channels or impressions active at the same time produces an extra effect.
We model this by adding an interaction term to our basic model, taking $Email \times Social$ as an example:
$$Sales_t=\beta_0 + \beta_1Direct.ad.effect_t+\beta_2Display.ad.effect_t+\beta_3Email.ad.effect_t+\beta_4Social.ad.effect_t+\beta_5(Email \times Social)_t+\epsilon_t$$
```{r MMM_Model4, echo=F}
model.4<-fit.model(dt=mdata[10:nrow(mdata),c(1,8:12)], "Sales",
input.names=c("Direct ad.effect","Display ad.effect","Email ad.effect","Social ad.effect",
"Email&Social"),
model.type="linear", digits = 3)
datatable(model.4)
```
There is no significant interaction effect between Email and Social at the 0.05 significance level, as its p-value is larger than 0.05.
**Model 5: Interactions ($Direct \times Social$)**
Similarly, Model 5 adds an interaction factor that considers the interaction between the Direct and Social impression factors.
```{r MMM_Model5, echo=FALSE}
model.5<-fit.model(dt=mdata[10:nrow(mdata),c(1,8:11,13)], "Sales",
input.names=c("Email ad.effect","Display ad.effect","Direct ad.effect","Social ad.effect",
"Direct&Social"),
model.type="linear", digits = 3)
datatable(model.5)
```
There is no significant interaction effect between Direct and Social at the 0.05 significance level, as its p-value is larger than 0.05.
Other combinations are possible in our case; the reporting engine presents all of them so you can choose whichever interaction terms you want.
### 3. Assumptions
In our case, we assumed a decay rate for each channel when creating the advertising effect variables: a small rate for display and social (0.3), a larger rate for email (0.5), and the largest for direct mail (0.75). This is intuitive, since people normally forget display and social ads quickly, retain email impressions somewhat longer, and keep direct mail around the longest.
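As a quick worked example of what these assumed rates imply (illustrative, not part of the fitted models): a decay rate $\lambda$ leaves $\lambda^k$ of an impression's effect after $k$ days, so the half-life is $\log(0.5)/\log(\lambda)$.
```{r decay_half_life_sketch, eval=FALSE}
# Half-life implied by each assumed decay rate: solve rate^k = 0.5 for k
rates <- c(display.social = 0.3, email = 0.5, direct = 0.75)
half.life <- log(0.5) / log(rates)
round(half.life, 2)  # ~0.58, 1.00, and 2.41 days, respectively
```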
We also assume there are no holiday effects in our modeling period. Take holiday advertising as an example: the causality is actually reversed, because retailers buy more ads when they know demand will be higher than usual, so sales influence advertising impressions. Since we want a result for an ordinary period, it is reasonable to assume that our simulated data does not include a holiday factor.
# Method 4: Model-based Attribution Analysis
```{r method_4_data_prep, echo=FALSE}
adatal <- as.data.frame(xtabs(~ id + date + channel, data=impressions), stringsAsFactors=FALSE)
adatal$id <- as.integer(adatal$id)
adatal$date <- as.Date(adatal$date)
adatal$channel <- as.factor(adatal$channel)
dimnames(adatal)[[2]][4] <- "impr"
# Add the records for the users with no impressions
pop <- unique(customer$id)
no.impress.ids <- pop[!(pop %in% unique(impressions$id))]
dates <- sort(unique(impressions$date))
channels <- unique(impressions$channel)
no.impress.obs <- data.frame(id=rep(no.impress.ids, each=length(dates)*length(channels)),
date=rep(rep(dates, each=length(channels)), length(no.impress.ids)),
channel=rep(channels, length(no.impress.ids)*length(dates)),
impr=rep(0, length(dates)*length(no.impress.ids)*length(channels)),
stringsAsFactors=FALSE)
no.impress.obs$channel <- as.factor(no.impress.obs$channel)
adatal <- rbind(adatal, no.impress.obs)
# Convert from long format table to wide format table
adata <- reshape(adatal, direction="wide", v.names="impr", idvar=c("id", "date"),
timevar="channel", new.row.names=NULL)
# Add the transactions column
atrans <- as.data.frame(xtabs(~ id + date, data=transactions), stringsAsFactors=FALSE)
atrans$id <- as.integer(atrans$id)
atrans$date <- as.Date(atrans$date)
dimnames(atrans)[[2]][3] <- "trans"
adata <- merge(adata, atrans, by=c("id", "date"), all=TRUE)
adata$trans[is.na(adata$trans)] <- 0 # Fill in zeros for transactions
# Final tidy up
adata <- adata[adata$date!="2016-12-31" & adata$date != "2017-02-28" & adata$date != "2017-02-27",]
adata <- merge(adata, customer, by=c("id"))
dimnames(adata)[[2]][3:11] <- c("direct", "display", "email", "email.holdout", "social", "trans", "past.purchase", "has.email", "has.direct")
rm(adatal, atrans)
```
### 1. Investigation
Unlike the MMM method, model-based attribution analysis works at the **user level**, connecting each customer's transactions to their prior advertising impressions. This gives a sense of how advertising impressions affect an individual user's purchasing behavior. To investigate this relationship, we first used visualization to get a rough but quick idea.
```{r method_4_plots, fig.align='center', fig.width=16, echo=FALSE}
par(mfrow=c(1,4))
plot(aggregate(trans~direct, data=adata, FUN=mean), type="h", ylim=c(0,0.15),
xlab="Impressions on Day", main="Direct Mail", col="dodgerblue")
plot(aggregate(trans~email, data=adata, FUN=mean), type="h", ylim=c(0,0.15),
xlab="Impressions on Day", main="Email", col="dodgerblue")
plot(aggregate(trans~display, data=adata, FUN=mean), type="h", ylim=c(0,0.15),
xlab="Impressions on Day", main="Display", col="dodgerblue")
plot(aggregate(trans~social, data=adata, FUN=mean), type="h", ylim=c(0,0.15),
xlab="Impressions on Day", main="Social Media", col="dodgerblue")
```
The four plots above show the purchasing tendency in a straightforward way. First, users of the online retailer convert more on days when they receive **emails** or **direct mail**; to be more specific, the email and direct mail channels are more likely to bring transactions to the store (email perhaps even more so than direct mail).
Additionally, transactions tend to increase with more Display and Social Media advertisements, but too many ads in a day (around 10) on these two platforms is associated with fewer transactions. As the Display and Social Media plots show fluctuations, however, we implemented model-based analysis to further verify these findings.
### 2. Results and Interpretation
By fitting a logistic regression with the transaction indicator as the dependent variable and the advertising channels, plus whether the user has purchased before, as the independent variables, we obtained the model summary below:
```{r fit_in_logit_model, echo=FALSE}
# Named model.attribution so it does not overwrite model.4 from the MMM section
model.attribution <- fit.model(dt = adata, "trans", input.names = c("direct", "display", "email", "social", "past.purchase"), model.type = "logistic", digits = 3)
datatable(model.attribution)
```
From the statistical significance perspective, all four channels and the binary past purchase record have significant effects on the online retail transactions.
From the odds ratio perspective, **email** has the highest positive impact on transactions: each email impression increases the odds of a transaction by about 114%. The second highest channel is **direct mail**: each direct mail piece increases the odds of a transaction by about 51%. Another notable fact is that customers who have purchased before are far more likely to transact again, with roughly 160% higher odds.
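Because logistic regression coefficients act on the odds rather than directly on the probability, here is a short worked example; the baseline probability below is an assumed value for illustration, not an estimate from the data.
```{r odds_ratio_example, eval=FALSE}
# Translate an odds ratio of ~2.14 (the ~114% odds increase reported for
# email above) into a probability change, given an assumed baseline
or.email <- 2.14
p0 <- 0.05                  # assumed baseline transaction probability
odds0 <- p0 / (1 - p0)      # baseline odds
odds1 <- odds0 * or.email   # odds after one email impression
p1 <- odds1 / (1 + odds1)   # back to a probability
round(c(before = p0, after = p1), 3)  # ~0.050 -> ~0.101
```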
Relating these findings to future marketing strategy, we would recommend that the retailer encourage users with more than one purchase on record to register for memberships, and design the email sending and tracking system specifically for those super users. In this way, the regular customer group is matched with the most effective advertising channel, which would likely maximize the efficiency of the marketing efforts.
### 3. Assumptions
In this analysis, we assume that direct mail, email, social media, and display are the only four channels the retailer used for its marketing promotion, so that effectiveness is compared only among these four. Furthermore, we assume no special events (holidays, big discounts, etc.) happened during the observation period, so the measured impact of the advertisements is not influenced by other factors.
# Limitations and Uncertainties
For these four methods, each of them has its own advantages and disadvantages:
|Method |Advantages |Disadvantages|
|------------------------|-------------------------------|-------------------------------|
|Attribution rules |Easy to compute and understand |Ignores customers who never transact|
|Holdout testing |Simple analysis |Requires planning in advance|
|Marketing mix modeling |Radically reduces data size |Sensitive to model assumptions|
|Model-based attribution |More flexible |Requires a large dataset|
Limitations and uncertainties of this case come mainly from four aspects:
Firstly, this dataset is not real but was simulated for training purposes only, so the results may not reflect reality well.
Secondly, the case is limited to the marketing channels provided in the original dataset, so we cannot account for the impact of other channels, such as friends' recommendations. Also, in our analysis we count all sales as incremental, assuming that customers who saw the ads would not have transacted had they not seen them.
Thirdly, each holdout test was conducted on a Tuesday, so it is hard to tell whether the results would differ if the test were conducted on other days of the week.
Lastly, the dataset is not large, and after combining the files the informative data is rather limited.
# Areas of Future Investigation
Using last-click attribution, experiments (holdout testing), marketing mix models, and attribution models, we can identify the reasons behind increasing or decreasing sales, as discussed above.
However, there is still room for additional contributions when applying this work in the future. Correlation is complicated, and people can spend a long time disentangling it. It will also be interesting to focus on the trends within different channels beyond the impression factors mentioned above.
We may be able to explore more of the connections among different channels. With the rapid development of social media and the internet, the traditional way of advertising has changed a great deal. What we studied in this project is just a small projection of what really happens every single day in real life, yet the simple methods used here remain useful in many areas of today's world.
Moreover, breaking down each advertising channel would give a more explicit and detailed view of effectiveness across promotion platforms. For example, with data aggregated from individual social media channels - Facebook, Instagram, Snapchat, LinkedIn, etc. - we could identify the advertising platform with the highest cost efficiency, or ARPPU (average revenue per paying user), and the time slots with the highest response rate.
Additionally, adding back the users' profile data would greatly increase our ability to learn user response behaviors at a granular level. Customer information such as age, gender, education level, and personal income would support customer segmentation before the analyses, distilling more insights about advertisement response from different user groups.
There are many more potential areas of future investigation, such as marketing optimization. Marketing optimization services and tools can improve efficiency and profitability by pinpointing the level of marketing activity required to maximize sales and profit while minimizing costs. The methods we used here generalize well to such services in practice, helping retailers plan budgets, manage multiple channels, create pricing strategies, and produce optimal revenue for a single brand.
With these methods, marketing data can be quantified, and we can understand how every dollar spent on advertising is actually working for the business.
# References
* Marketing mix modeling. (2019, April 24). Retrieved from https://en.wikipedia.org/wiki/Marketing_mix_modeling
* Lohse, A. (n.d.). TV Effectiveness Analysis is Incomplete Without Ad-stock Consideration. Retrieved from https://www.martechadvisor.com/articles/ads/tv-effectiveness-analysis-is-incomplete-without-adstock-consideration/
* Marketing Optimization. (n.d.). Retrieved from https://analyticpartners.com/marketing-optimization/