Skip to content

Latest commit

 

History

History
148 lines (130 loc) · 7.67 KB

Readme.md

File metadata and controls

148 lines (130 loc) · 7.67 KB

---
title: "Course Project 1 - Reproducible Research"
author: "Ebenezer Bediam"
date: "December 06, 2019"
output: html_document
---

## Assignment Instructions

1. Code for reading in the dataset and/or processing the data
2. Histogram of the total number of steps taken each day
3. Mean and median number of steps taken each day
4. Time series plot of the average number of steps taken
5. The 5-minute interval that, on average, contains the maximum number of steps
6. Code to describe and show a strategy for imputing missing data
7. Histogram of the total number of steps taken each day after missing values are imputed
8. Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends
9. All of the R code needed to reproduce the results (numbers, plots, etc.) in the report

## Step 1

## Code for reading in the dataset and/or processing the data

# Read the activity data.
# The file is looked up relative to the current working directory rather than
# through a machine-specific absolute path, so the analysis can be reproduced
# on any computer: run it from the folder that contains "activity.csv".
data_file <- "activity.csv"
if (!file.exists(data_file)) {
  stop(
    "Cannot find '", data_file, "' in ", getwd(),
    "; run this analysis from the project directory.",
    call. = FALSE
  )
}
activity <- read.csv(data_file)

# Exploring the basics of this data

# Quick look at the raw table: size, column names, first rows, column types.
dim(activity)
names(activity)
head(activity)
str(activity)

# Share of rows whose step count is missing (NA count divided by row count).
sum(is.na(activity$steps)) / nrow(activity)

# Parse the date column into Date values with lubridate, then count how many
# distinct days the data set contains.
library(lubridate)
activity$date <- ymd(activity$date)
length(unique(activity$date))

## Step 2
## Histogram of the total number of steps taken each day

library(ggplot2)

# Total steps per day. NA readings are dropped by na.rm = TRUE, so a day whose
# readings are all missing gets a total of 0.
Q2 <- data.frame(tapply(activity$steps, activity$date, sum, na.rm = TRUE))
Q2$date <- rownames(Q2)
rownames(Q2) <- NULL
names(Q2)[[1]] <- "Total Steps"

# Total Steps by date bar chart.
# The column name contains a space, so it must be quoted with backticks; inside
# aes() columns are referenced by name (not Q2$...) so ggplot looks them up in
# the data argument. The plot is built once and reused for the file and screen.
total_steps_bar <- ggplot(Q2, aes(x = date, y = `Total Steps`)) +
  geom_bar(stat = "identity") +
  ylab("Total Steps") +
  xlab("Date") +
  ggtitle("Total Steps by date")

# print() is explicit because ggplot objects are only drawn automatically at
# the interactive prompt; without it a sourced script writes an empty PNG.
png("plot1.png")
print(total_steps_bar)
dev.off()
print(total_steps_bar)

# Histogram of total steps (ggplot() + geom_histogram() replaces the
# deprecated qplot(); the default of 30 bins is unchanged).
total_steps_hist <- ggplot(Q2, aes(x = `Total Steps`)) +
  geom_histogram() +
  xlab("Total Steps") +
  ylab("Counts") +
  ggtitle("Total Steps Histogram")

print(total_steps_hist)
png("plot1.1.png")
print(total_steps_hist)
dev.off()

## Step 3
## Mean and median number of steps taken each day

library(dplyr)

# Mean number of steps per day, rounded to two decimals. Days whose readings
# are all missing produce NaN because every value is removed by na.rm = TRUE.
Q3 <- data.frame(
  round(tapply(activity$steps, activity$date, mean, na.rm = TRUE), 2)
)
Q3$date <- rownames(Q3)
rownames(Q3) <- NULL
names(Q3)[[1]] <- "Mean Steps"

# Median number of steps per day. na.rm = TRUE mirrors the mean above so both
# statistics treat missing readings the same way.
temp <- activity %>%
  select(date, steps) %>%
  group_by(date) %>%
  summarise(`Median Steps` = median(steps, na.rm = TRUE))

# Both tables hold one row per date in ascending date order, so the median
# column can be attached by position. Column names containing a space need
# backticks.
Q3$median <- temp$`Median Steps`
Q3 <- Q3 %>% select(date, `Mean Steps`, median)

## Step 4
## Time series plot of the average number of steps taken

# Copy the daily summary and turn the date strings into Date values so that
# scale_x_date() can be used on the x axis.
Q4 <- Q3
Q4$date <- as.Date(Q4$date, format = "%Y-%m-%d")

# Mean steps per day. Columns are referenced by name inside aes(), with
# backticks around the name that contains a space. The plot is built once and
# reused for the screen and the PNG file.
mean_steps_plot <- ggplot(Q4, aes(x = date, y = `Mean Steps`)) +
  geom_bar(stat = "identity") +
  scale_x_date() +
  ylab("Mean Steps Every day") +
  xlab("Date") +
  ggtitle("Mean Steps by Date")

# print() is explicit so the plot is also drawn when the script is sourced.
print(mean_steps_plot)
png("plot4.png")
print(mean_steps_plot)
dev.off()

## Step 5
## The 5-minute interval that, on average, contains the maximum number of steps

# This is assuming that the words "on average" means averaging steps by date
# and interval.
# The interval identifier is treated as a category from here on.
activity$interval <- factor(activity$interval)

# Mean steps for every date/interval pair, then the largest of those values
# within each interval. The formula interface drops rows with missing steps.
Q5 <- aggregate(steps ~ date + interval, data = activity, FUN = "mean")
Q5 <- aggregate(steps ~ interval, data = Q5, FUN = "max")

# Q5 lists one value per interval but does not single out an interval. To
# answer the question directly, average each interval across all days and
# report the row with the largest average.
interval_means <- aggregate(steps ~ interval, data = activity, FUN = mean)
max_avg_interval <- interval_means[which.max(interval_means$steps), ]
max_avg_interval

## Step 6
## Code to describe and show a strategy for imputing missing data
# There are multiple strategies to deal with missing value imputations.
# The common strategies include:

# - Constant value imputations
# - Regression model value imputations
# - Mean/mode value substitutions
#
# For the purpose of simplicity, in this question, I will use the mean/mode
# value substitution strategy to impute missing values. That is, using the
# mean values to substitute out the missing values in the original data set.
#
# Before doing any sort of imputation, it is helpful to understand what the
# distributions of missing values by date and interval are.

# Flag every reading that is missing, then count the flags for each
# date/interval combination.
Q6 <- activity
Q6[["Missing"]] <- is.na(Q6[["steps"]])
Q6 <- aggregate(Missing ~ date + interval, data = Q6, FUN = "sum")

# Missing readings per date: the tapply() names (the dates) arrive as row
# names, so they are moved into a proper Date column.
Q6.1 <- data.frame(tapply(Q6[["Missing"]], Q6[["date"]], sum))
Q6.1[["date"]] <- as.Date(rownames(Q6.1), format = "%Y-%m-%d")
names(Q6.1)[1] <- "Missing"
rownames(Q6.1) <- NULL

# Missing readings per 5-minute interval: the tapply() names (the interval
# labels) arrive as row names, so they are moved into an "Interval" column.
Q6.2 <- data.frame(tapply(Q6[["Missing"]], Q6[["interval"]], sum))
Q6.2[["Interval"]] <- rownames(Q6.2)
names(Q6.2)[1] <- "Missing"
rownames(Q6.2) <- NULL

# Show both missing-value distributions side by side. par() returns the
# previous settings; they are restored afterwards so later base-graphics plots
# are not forced into the two-panel layout.
old_par <- par(mfrow = c(1, 2))
plot(
  x = Q6.1$date, y = Q6.1$Missing,
  main = "Missing Value Distribution by Date"
)
plot(
  x = Q6.2$Interval, y = Q6.2$Missing,
  main = "Missing Value Distribution by Interval"
)
par(old_par)

# Number of rows recorded for each date.
table(activity$date)

# By this point, the plots show that the missing values have a very distinct
# pattern. For every interval, there are consistently 8 missing values. For
# the dates that have missing values, there are consistently 288 of them. In
# total, there are 8 dates that have missing values. We don't exactly know the
# cause of these missing values, but there's a pattern. For that matter, we
# can see that the mean value imputation is appropriate.

We can see that every date has 288 data points. It means that the 8 dates have no data points at all whatsoever. We can refine the analysis by looking at these missing values depending on their weekday and interval parameters to match with the average.

# Dates that have missing values
library(lubridate)
Q6.3 <- Q6.1 %>%
  select(date, Missing) %>%
  arrange(desc(Missing))
Q6.3 <- Q6.3[Q6.3$Missing != 0, ]
Q6.3$Weekday <- wday(Q6.3$date, label = TRUE)

# Tag every reading with its day of the week.
Q6.4 <- activity
Q6.4$weekday <- wday(Q6.4$date, label = TRUE)

# Finding the mean of steps for every weekday and every interval
Q6.5 <- aggregate(
  steps ~ interval + weekday,
  data = Q6.4, FUN = "mean", na.rm = TRUE
)

# Merge the pre-imputation table Q6.4 with the average table Q6.5. Both tables
# have a "steps" column, so merge() names them steps.x (observed) and steps.y
# (weekday/interval mean).
Q6.6 <- merge(
  x = Q6.4, y = Q6.5,
  by = c("interval", "weekday"), all.x = TRUE
)

# Conditionally replacing the steps.x NA values with the values from steps.y.
# Done in one vectorised step and by column name instead of a row-by-row loop
# over column positions; as.numeric() keeps the result a double column.
Q6.6$Steps.Updated <- ifelse(
  is.na(Q6.6$steps.x),
  Q6.6$steps.y,
  as.numeric(Q6.6$steps.x)
)
#Now simplify the im