Den Revenues Analysis1.Rmd

---
title: "R Notebook"
output: html_notebook
---

#importing dataset and getting initial EDA of dataset
```{r}
# Read the workbook "DEN 2012 - Jun 17.xlsx" into a data.table and take a
# first look at its rows and per-column summaries.
library(readxl)
library(data.table)

den <- data.table(read_excel("DEN 2012 - Jun 17.xlsx"))
head(den)
summary(den)

# `Cannibas?` ('hype' / 'no hype') was once recoded to 1 / 0. The character
# values did not impact the model, so the dummy-variable step stays disabled.
# den$`Cannibas?`[den$`Cannibas?` == 'hype'] = 1
# den$`Cannibas?`[den$`Cannibas?` == 'no hype'] = 0
```
#selecting columns by non-airline revenues (response variable) and `Month` & `Year`
```{r}
library(dplyr)

# Keep the date column plus every column from Concession through Ground.
den1 <- den %>%
  select("Month and Year", Concession:Ground)
head(den1)
```
#slice `Month` and `Year` column using tidyr
#observe sum totals of Non-airline revenues using janitor
```{r}
library(tidyr)

# Revenue columns only: split the date string into three parts, drop `Day`.
separate_time <- separate(den1, "Month and Year", c('Day', 'Month', 'Year'))
den2 <- separate_time %>%
  select(-Day)

# Same split applied to every variable in the dataset.
separate_time1 <- separate(den, "Month and Year", c('Day', 'Month', 'Year'))
den7 <- separate_time1 %>%
  select(-Day)
den7

library(janitor)
# Add a totals column across the revenue columns (read later as `Total`).
den3 <- adorn_totals(den2, "col")
den3
```
#summarize average non-airline revenue by `Month` and `Year` using dplyr
```{r}
# Mean (and row count) of each revenue stream, grouped by Month.
avg.month <- den2 %>%
  group_by(Month) %>%
  summarise_each(funs(mean, n()), Concession, Parking, "Rental Car", Ground)
avg.month

# Row-wise sum over columns 2:5 (presumably the four per-stream means; confirm
# against the printed avg.month above).
sum.month.avgs <- apply(avg.month[, 2:5], 1, sum)
sum.month.avgs

# df.month: Month alongside the summed averages of the response components.
df.month <- data.frame(cbind(avg.month["Month"], sum.month.avgs))
df.month

# Transposed copy of df.month for use in the executive summary.
View(t(df.month))

# Mean (and row count) of each revenue stream, grouped by Year.
avg.year <- den2 %>%
  group_by(Year) %>%
  summarise_each(funs(mean, n()), Concession, Parking, "Rental Car", Ground)
avg.year

sum.year.avgs <- apply(avg.year[, 2:5], 1, sum)
sum.year.avgs

# df.year: Year alongside the summed averages of the response components.
df.year <- data.frame(cbind(avg.year["Year"], sum.year.avgs))
df.year
```
#Visualizations: Non-airline revenues by `Month` and `Year` with ggplot2
```{r}
library(ggplot2)

# --- Revenue against Month: summed averages first, then each stream ---------
revplot.total <- ggplot(df.month, aes(x = Month, y = sum.month.avgs)) +
  geom_point(size = 3, colour = "green") +
  ggtitle("Total Avg Non-airline Revenue by Month")
revplot.total + theme_dark()

revplot1 <- ggplot(den2, aes(x = Month, y = Concession)) +
  geom_point(size = 2, colour = "red") +
  ggtitle("Concession revenue by Month")
revplot1 + theme_dark()

revplot2 <- ggplot(den2, aes(x = Month, y = Parking)) +
  geom_point(size = 2, colour = "orange") +
  ggtitle("Parking revenue by Month")
revplot2 + theme_dark()

revplot3 <- ggplot(den2, aes(x = Month, y = `Rental Car`)) +
  geom_point(size = 2, colour = "yellow") +
  ggtitle("Rental Car revenue by Month")
revplot3 + theme_dark()

revplot4 <- ggplot(den2, aes(x = Month, y = Ground)) +
  geom_point(size = 2, colour = "black") +
  ggtitle("Ground revenue by Month")
revplot4 + theme_dark()

# --- Revenue against Year ----------------------------------------------------
# Author's observation: all non-airline revenue dependent variables have a
# positive linear relationship with `Year`.
revplot.total.1 <- ggplot(df.year, aes(x = Year, y = sum.year.avgs)) +
  geom_point(size = 3, colour = "blue") +
  ggtitle("Total Avg Non-airline Revenue by Year")
revplot.total.1 + theme_dark()

revplot5 <- ggplot(den2, aes(x = Year, y = Concession)) +
  geom_point(size = 2, colour = "purple") +
  ggtitle("Concession revenue by Year")
revplot5 + theme_dark()

revplot6 <- ggplot(den2, aes(x = Year, y = Parking)) +
  geom_point(size = 2, colour = "dark green") +
  ggtitle("Parking revenue by Year")
revplot6 + theme_dark()

revplot7 <- ggplot(den2, aes(x = Year, y = `Rental Car`)) +
  geom_point(size = 2, colour = "dark blue") +
  ggtitle("Rental Car revenue by Year")
revplot7 + theme_dark()

revplot8 <- ggplot(den2, aes(x = Year, y = Ground)) +
  geom_point(size = 2, colour = "dark red") +
  ggtitle("Ground revenue by Year")
revplot8 + theme_dark()

# Stack the four revenue columns into one value column for a quick look.
library(reshape2)
df.plot <- data.frame(
  den2$Concession,
  den2$Parking,
  den2$`Rental Car`,
  den2$Ground
)
df.plot.table <- data.table(df.plot)
df.m.total <- melt(df.plot.table)
df.m.total

# Layered plots showing the four non-airline revenue streams against `Month`
# and `Year`. Columns are named bare inside aes() so ggplot2 looks them up in
# the `data` it was given (den2) instead of reaching around it with `den2$...`.
ggplot(den2) +
  geom_point(aes(Month, Parking, colour = "Parking")) +
  geom_point(aes(Month, `Rental Car`, colour = "Rental Car")) +
  geom_point(aes(Month, Ground, colour = "Ground")) +
  geom_point(aes(Month, Concession, colour = "Concession")) +
  labs(x = "Months", y = "Revenue", title = "Non-airline Revenue by Month") +
  theme_dark()

ggplot(den2) +
  geom_point(aes(Year, Parking, colour = "Parking")) +
  geom_point(aes(Year, `Rental Car`, colour = "Rental Car")) +
  geom_point(aes(Year, Ground, colour = "Ground")) +
  geom_point(aes(Year, Concession, colour = "Concession")) +
  labs(x = "Years", y = "Revenue", title = "Non-airline Revenue by Year") +
  theme_dark()

# Per-Year sum (and row count) of the `Total` column from den3.
rev.year <- den3 %>%
  group_by(Year) %>%
  summarise_each(funs(sum, n()), Total)
rev.year

# Total summed non-airline revenue by Year, with a smoothed trend line.
ggplot(rev.year, aes(x = Year, y = sum)) +
  geom_smooth() +
  geom_point(size = 3, color = "green") +
  labs(
    x = "Years 2012 - 2017",
    y = "Revenue",
    title = "Total Sum of Non-airline Revenue by Year",
    caption = "*2017 revenues only accrued Jan - June"
  ) +
  theme_dark()
```
#trimmed top 3 rows from dataframe to avoid incorrect results from NA values
#adding observation column to try and predict model over time
#adding totalrev column for response variables
```{r}
# Rows 4 through 66 of den7 (the first three rows are dropped).
den4 <- den7[4:66, ]
# Running observation index, one per remaining row.
den4$ObsNumber <- seq_len(nrow(den4))
# Response: the four non-airline revenue streams added together.
den4$TotalRev <- den4$Concession +
  den4$Parking +
  den4$`Rental Car` +
  den4$Ground
```

#Model 1: Total sum dependent variables + IVs: (Month, Destination, Cannibas, UMCSENTLag3)
```{r}
library(car)

# Model 1: log(TotalRev) on Month, Destination, `Cannibas?` and UMCSENTLag3.
# Original note: log transformation on the dependent variable due to using a
# lagged explanatory variable.
#
# The formula names bare columns and passes `data = den4`. With `den4$...`
# terms, predict(mod1, newdata) cannot find those terms in `newdata`, falls
# back to the full den4 columns and returns every fitted value instead of
# predictions for the requested rows.
mod1 <- lm(
  log(TotalRev) ~ Month + Destination + `Cannibas?` + UMCSENTLag3,
  data = den4
)
summary(mod1)

# Residuals against the fitted values (log scale), as the title and x label
# say; the observed response was plotted on the x-axis before.
plot(fitted(mod1), mod1$residuals, xlab = 'pred', ylab = 'res', main = 'Residuals vs Predicted')
abline(h = 0, col = 'red')
qqnorm(mod1$residuals)
qqline(mod1$residuals, col = 2)
hist(mod1$residuals)
AIC(mod1)
BIC(mod1)
vif(mod1)
durbinWatsonTest(mod1)
anova(mod1)
#adjusted r2: 0.8555
#AIC: -185.4749 is by far the lowest = more parsimonious model
#BIC: -151.1847
#DW test: p=0.36 Do not reject H0 and no autocorrelation

#RMSE from summary: the square root of the residual deviance per degree of freedom
#977992

#inverse transformation of log base e on RMSE
library(stats)
sigma(mod1)

# Log of the response kept as its own column.
den4$TotalRevlog <- log(den4$TotalRev)

# Root-mean-squared error on the original (un-logged) scale for a model that
# was fitted on a natural-log response.
#
# model: a fitted lm()/glm() object whose response is log(y).
# Returns: sqrt(mean((y - y_hat)^2)) after exponentiating both the observed
#   and the fitted values.
rmset <- function(model) {
  # Read the response from the model's own frame so it always lines up with
  # predict(model) (same rows, same NA handling) instead of relying on a
  # column of the global den4.
  log_obs <- model.response(model.frame(model))
  log_fit <- predict(model)

  # exp() is the inverse of log(). The former "- 1" on both terms cancelled
  # in the difference and belongs to log1p(), which the models do not use.
  sqrt(mean((exp(log_obs) - exp(log_fit))^2))
}
# Back-transformed RMSE of Model 1.
rmset(mod1)

# den4$ObsNumber was tried as an extra predictor and then removed: it did not
# increase the accuracy of the model in this case.
```
#creating predictor variable datasets based upon independent variables
#Predictor Variables used across all models: `Month`,`Destination`,`Cannibas?`,`UMCSENTLag3`,`UMCSENTLag2`,`Origin + Destin`,`UMCSENTLag1`
```{r}
# Predictor columns for rows 60:63 of den4, one table per model.
den.predict1 <- den4[60:63, c("Month", "Destination", "Cannibas?", "UMCSENTLag3")]
den.predict2 <- den4[60:63, c("Deplaned", "UMCSENTLag2", "Month")]
den.predict3 <- den4[60:63, c("UMCSENTLag2", "Cannibas?", "Month")]
den.predict4 <- den4[60:63, c("Origin + Destin", "UMCSENTLag3", "Cannibas?", "Month")]
den.predict5 <- den4[60:63, c("Origin + Destin", "UMCSENTLag1", "Month")]
```

#Model 1: running prediction for `Month` values: March - June 2017
```{r}
# Predict on the log scale, then exponentiate because mod1 was fitted on
# log(TotalRev).
model1.predict <- predict(mod1, den.predict1)
exp(model1.predict)
#26901433 25923248 26707321 27711106
```
#Model 1: K-fold cross validation (K = 5)
```{r}
library(boot)
set.seed(4)
# Same specification as Model 1, fitted with glm() so boot::cv.glm() can use it.
# Bare column names + `data = den4` are required here: cv.glm() refits the
# call on each training fold by swapping `data`, and `den4$...` terms would
# resolve to the full table on every refit and every prediction.
m1 <- glm(
  log(TotalRev) ~ Month + Destination + `Cannibas?` + UMCSENTLag3,
  data = den4
)
summary(m1)
cv_result1 <- cv.glm(den4, m1, K = 5)
cv_result1$delta
# adjusted delta recorded from the earlier `den4$...` run: 0.03278247
# (re-run this chunk to refresh the value)
```
#Model 2: Concession DV + IVs: (`Deplaned`, `UMCSENTLag2`, `Month`)
```{r}
# Model 2: Concession on Deplaned, UMCSENTLag2 and Month.
# Bare column names + `data = den4` so that predict(mod2, newdata) evaluates
# the predictors in `newdata`; `den4$...` terms always resolve to the full
# den4 columns and `newdata` is ignored.
mod2 <- lm(Concession ~ Deplaned + UMCSENTLag2 + Month, data = den4)
summary(mod2)

# Residuals against the fitted values, as the title and x label say; the
# observed response was plotted on the x-axis before.
plot(fitted(mod2), mod2$residuals, xlab = 'pred', ylab = 'res', main = 'Residuals vs Predicted')
abline(h = 0, col = 'red')
qqnorm(mod2$residuals)
qqline(mod2$residuals, col = 2)
hist(mod2$residuals)
AIC(mod2)
BIC(mod2)
durbinWatsonTest(mod2)
anova(mod2)
vif(mod2)
#adjusted r2: 0.7309
#AIC: 1800.509
#BIC: 1832.656
#RMSE: 347,500
#DW test: p=0.836 Do not reject H0 and no autocorrelation
```
#Model 2: running prediction for `Month` values: March - June 2017
```{r}
# Model 2 predictions for the den.predict2 rows.
model2.predict <- predict(mod2, den.predict2)
model2.predict
#5340259 5207826 5684507 5695202
```
#Model 2: K-fold cross validation (K = 5)
```{r}
library(boot)
set.seed(4)
# Same specification as Model 2, fitted with glm() so boot::cv.glm() can use it.
# Bare column names + `data = den4` are required here: cv.glm() refits the
# call on each training fold by swapping `data`, and `den4$...` terms would
# resolve to the full table on every refit and every prediction.
m2 <- glm(Concession ~ Deplaned + UMCSENTLag2 + Month, data = den4)
summary(m2)
cv_result2 <- cv.glm(den4, m2, K = 5)
cv_result2$delta
# adjusted delta recorded from the earlier `den4$...` run: 849635678249
# (re-run this chunk to refresh the value)
```
#Model 3: Parking DV + IVs: (`UMCSENTLag2`, `Cannibas`, `Month`)
```{r}
# Model 3: Parking on UMCSENTLag2, `Cannibas?` and Month.
# Bare column names + `data = den4` so that predict(mod3, newdata) evaluates
# the predictors in `newdata`; `den4$...` terms always resolve to the full
# den4 columns and `newdata` is ignored.
mod3 <- lm(Parking ~ UMCSENTLag2 + `Cannibas?` + Month, data = den4)
summary(mod3)

# Residuals against the fitted values, as the title and x label say; the
# observed response was plotted on the x-axis before.
plot(fitted(mod3), mod3$residuals, xlab = 'pred', ylab = 'res', main = 'Residuals vs Predicted')
abline(h = 0, col = 'red')
qqnorm(mod3$residuals)
qqline(mod3$residuals, col = 2)
hist(mod3$residuals)
AIC(mod3)
BIC(mod3)
durbinWatsonTest(mod3)
anova(mod3)
vif(mod3)
#adjusted r2: 0.6241
#AIC: 1926.021
#BIC: 1958.168
#RMSE: 940,800
#DW test: p=0.182 Do not reject H0 and no autocorrelation 

#Tried combination of Destination/Origin +Dest/Enplaned/Transfer/Lg1-3/Month/Year
```
#Model 3: running prediction for `Month` values: March - June 2017
```{r}
# Model 3 predictions for the den.predict3 rows.
model3.predict <- predict(mod3, den.predict3)
model3.predict
#15364870 15087553 15389519 15384025  
```
#Model 3: K-fold cross validation (K = 5)
```{r}
library(boot)
set.seed(4)
# Cross-validates Model 3, whose response is Parking. The response here was
# Concession, so a different model than mod3 was being validated.
# Bare column names + `data = den4` are also required: cv.glm() refits the
# call on each training fold by swapping `data`, and `den4$...` terms would
# resolve to the full table on every refit and every prediction.
m3 <- glm(Parking ~ UMCSENTLag2 + `Cannibas?` + Month, data = den4)
summary(m3)
cv_result3 <- cv.glm(den4, m3, K = 5)
cv_result3$delta
# adjusted delta recorded from the earlier run (Concession response,
# `den4$...` terms): 764367218313 -- re-run this chunk to refresh the value
```
#Model 4: Rental Car DV + IVs: (`Origin + Destin`, `UMCSENTLag3`, `Cannibas?`, `Month`)
```{r}
# Model 4: log(`Rental Car`) on `Origin + Destin`, UMCSENTLag3, `Cannibas?`
# and Month.
# Bare column names + `data = den4` so that predict(mod4, newdata) evaluates
# the predictors in `newdata`; `den4$...` terms always resolve to the full
# den4 columns and `newdata` is ignored.
mod4 <- lm(
  log(`Rental Car`) ~ `Origin + Destin` + UMCSENTLag3 + `Cannibas?` + Month,
  data = den4
)
summary(mod4)

# Residuals against the fitted values (log scale), as the title and x label
# say; the observed response was plotted on the x-axis before.
plot(fitted(mod4), mod4$residuals, xlab = 'pred', ylab = 'res', main = 'Residuals vs Predicted')
abline(h = 0, col = 'red')
qqnorm(mod4$residuals)
qqline(mod4$residuals, col = 2)
hist(mod4$residuals)
AIC(mod4)
BIC(mod4)
durbinWatsonTest(mod4)
anova(mod4)
vif(mod4)
#adjusted r2: 0.8447 
#AIC: -90.41809
#BIC: -56.12794
#RMSE: 426724.8 
#DW test: p=0.706 Do not reject H0 and no autocorrelation 

#inverse transformation of log base e on RMSE
sigma(mod4)
# Log of the response kept as its own column.
den4$`Rental Car Log` <- log(den4$`Rental Car`)


# Root-mean-squared error on the original (un-logged) scale for a model that
# was fitted on a natural-log response.
#
# model: a fitted lm()/glm() object whose response is log(y).
# Returns: sqrt(mean((y - y_hat)^2)) after exponentiating both the observed
#   and the fitted values.
rmset <- function(model) {
  # Read the response from the model's own frame so it always lines up with
  # predict(model) (same rows, same NA handling) instead of relying on a
  # column of the global den4.
  log_obs <- model.response(model.frame(model))
  log_fit <- predict(model)

  # exp() is the inverse of log(). The former "- 1" on both terms cancelled
  # in the difference and belongs to log1p(), which the models do not use.
  sqrt(mean((exp(log_obs) - exp(log_fit))^2))
}
# Back-transformed RMSE of Model 4.
rmset(mod4)

# Tried combination of Destination/Origin +Dest/Enplaned/Transfer/Lg1-3/Month/Year
```
#Model 4: running prediction for `Month` values: March - June 2017
```{r}
# Predict on the log scale, then exponentiate because mod4 was fitted on
# log(`Rental Car`).
model4.predict <- predict(mod4, den.predict4)
exp(model4.predict)
#5423134 4223400 4347512 5322921
```
#Model 4: K-fold cross validation (K = 5)
```{r}
library(boot)
set.seed(4)
# Cross-validates Model 4, which is fitted on log(`Rental Car`). The response
# here was the raw `Rental Car`, so a different model than mod4 was being
# validated. The delta is therefore on the log scale, like Model 1's.
# Bare column names + `data = den4` are also required: cv.glm() refits the
# call on each training fold by swapping `data`, and `den4$...` terms would
# resolve to the full table on every refit and every prediction.
m4 <- glm(
  log(`Rental Car`) ~ `Origin + Destin` + UMCSENTLag3 + `Cannibas?` + Month,
  data = den4
)
summary(m4)
cv_result4 <- cv.glm(den4, m4, K = 5)
cv_result4$delta
# adjusted delta recorded from the earlier run (raw `Rental Car` response,
# `den4$...` terms): 2.235139e+12 -- re-run this chunk to refresh the value
```
#Model 5: Ground DV + IVs: (`Origin + Destin`, `UMCSENTLag1`, `Month`)
```{r}
# Model 5: Ground on `Origin + Destin`, UMCSENTLag1 and Month.
# Bare column names + `data = den4` so that predict(mod5, newdata) evaluates
# the predictors in `newdata`; `den4$...` terms always resolve to the full
# den4 columns and `newdata` is ignored.
mod5 <- lm(Ground ~ `Origin + Destin` + UMCSENTLag1 + Month, data = den4)
summary(mod5)

# Residuals against the fitted values, as the title and x label say; the
# observed response was plotted on the x-axis before.
plot(fitted(mod5), mod5$residuals, xlab = 'pred', ylab = 'res', main = 'Residuals vs Predicted')
abline(h = 0, col = 'red')
qqnorm(mod5$residuals)
qqline(mod5$residuals, col = 2)
hist(mod5$residuals)
AIC(mod5)
BIC(mod5)
durbinWatsonTest(mod5)
anova(mod5)
vif(mod5)
#adjusted r2: 0.8105
#AIC: 1628.353
#BIC: 1660.5
#RMSE: 88,620
#DW test: p=0.492 Do not reject H0 and no autocorrelation 

#Tried combination of Destination/Origin +Dest/Enplaned/Transfer/Lg1-3/Time/Month/Year
```