-
Notifications
You must be signed in to change notification settings - Fork 0
/
PML_final-Project.Rmd
103 lines (85 loc) · 2.87 KB
/
PML_final-Project.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
---
output:
html_document: default
pdf_document: default
---
```{r echo=TRUE}
## Sreya Dhar
## Practical Machine Learning: Final Project

# Load libraries
library(caret)
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
library(e1071)
library(randomForest)
library(readr)
library(party)    # Alternative decision tree algorithm
library(partykit) # Convert rpart object to BinaryTree

# Set the seed for reproducibility
set.seed(100)

# URLs of the files in case you want to read them online
train.url <-
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test.url <-
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

# Read the csv files (saved on the hard drive).
# Use TRUE rather than T: T is an ordinary variable and can be reassigned.
train_orig <- read.csv("C:/File E/Regresion analysis/pml-training.csv", header = TRUE)
test_orig <- read.csv("C:/File E/Regresion analysis/pml-testing.csv", header = TRUE)

# Drop the first 7 columns (row id, user name, timestamps, window info):
# they carry no predictive signal for the exercise class.
train_clean <- train_orig[, 8:ncol(train_orig)]
test_clean <- test_orig[, 8:ncol(test_orig)]

# Drop columns containing NAs
train_clean <- train_clean[, colSums(is.na(train_clean)) == 0]
test_clean <- test_clean[, colSums(is.na(test_clean)) == 0]

# Check for near-zero-variance predictors and drop them only when present.
# (The original test `zero.var.ind >= 0` was always TRUE; `> 0` subsets
# only when at least one near-zero-variance column actually exists.)
nzv <- nearZeroVar(train_clean, saveMetrics = TRUE)
if (sum(nzv$nzv) > 0) {
  train_clean <- train_clean[, nzv$nzv == FALSE]
}

# Ensure the outcome is a factor: since R 4.0 read.csv() no longer
# converts strings to factors, and caret/randomForest require a factor
# outcome for classification.
train_clean$classe <- as.factor(train_clean$classe)

# Partition the data into a training set (80%) and a validation set (20%)
train_part <- createDataPartition(train_clean$classe, p = 0.80, list = FALSE)
train_final <- train_clean[train_part, ]
validate_final <- train_clean[-train_part, ]

# 5-fold cross-validation for the Random Forest model
controlPara <- trainControl(method = "cv", 5)

# Fit a Random Forest (300 trees) on the training set
mod_rf <- train(classe ~ ., data = train_final, method = "rf",
                trControl = controlPara, ntree = 300)

# Model summary
mod_rf
```
```{r}
# Predict on the held-out validation set
pred_rf <- predict(mod_rf, validate_final)

# Confusion matrix on the validation set.
# confusionMatrix(data, reference) expects the PREDICTIONS first and the
# true labels second; the original call had them swapped, which transposes
# the per-class sensitivity/specificity statistics. The reference is
# coerced to factor because read.csv() (R >= 4.0) leaves it as character.
confusionMatrix(pred_rf, as.factor(validate_final$classe))
```
```{r}
# Validation-set performance: postResample() returns named Accuracy and
# Kappa values for the predictions against the true labels.
accuracy <- postResample(pred_rf, validate_final$classe)

# Report the overall accuracy (first named element of the result)
accu_out <- accuracy["Accuracy"]
accu_out
```
```{r}
# Overall out-of-sample error: 1 - accuracy on the held-out validation set.
# Arguments follow confusionMatrix(data = predictions, reference = truth);
# the original call had them swapped. The overall accuracy itself is
# symmetric in the two arguments, but the corrected order matches the
# documented API and the per-class statistics elsewhere in this report.
OSE <- 1 - as.numeric(
  confusionMatrix(pred_rf, as.factor(validate_final$classe))$overall[1]
)
OSE
```
```{r}
# Apply the trained model to the 20 graded test cases, dropping the final
# column of the cleaned test set (an identifier, not a predictor).
pred_test <- predict(mod_rf, test_clean[, -ncol(test_clean)])
pred_test
```
```{r}
# Build a classification tree: the outcome `classe` is categorical and
# method = "class" fits a classification (not regression) tree, so the
# plot title is corrected accordingly.
mytree <- rpart(classe ~ ., data = train_final, method = "class")

# Plot the tree. fancyRpartPlot() draws with base graphics, so the title
# must be added with a separate title() call; the original chained it
# with `+`, which only avoided an error because both calls happen to
# return NULL invisibly.
fancyRpartPlot(mytree, caption = 'https://github.com/sreyadhar')
title('Classification Tree Visualization', adj = 0.0)
```