# Load the activity data set.
# The path is relative on purpose: a hard-coded setwd() to one user's
# directory breaks the script on every other machine. Run the script with the
# project directory (the one containing activity.csv) as the working directory.
data <- read.csv("activity.csv")
- Total number of steps taken per day
# Sum the first column within each value of the second column (steps per
# day, per the surrounding text), dropping missing values from the sums.
dailySteps <- tapply(
  X = data[[1]],
  INDEX = data[[2]],
  FUN = sum,
  na.rm = TRUE
)
- Histogram of the total number of steps taken each day
# Distribution of the per-day totals computed above.
hist(x = dailySteps)
- The mean of the total number of steps taken per day is 9354, and the median is 10395.
- Time series plot of the 5-minute interval and the average number of steps taken, averaged across all days
# Average the first column within each value of the third column (mean steps
# per 5-minute interval), ignoring missing values.
intSteps <- tapply(
  X = data[[1]],
  INDEX = data[[3]],
  FUN = mean,
  na.rm = TRUE
)

# The interval identifiers are stored as the names of intSteps; convert them
# to numbers so they can be used as x coordinates of the line plot.
plot(
  x = as.numeric(names(intSteps)),
  y = intSteps,
  type = "l",
  xlab = "5-Minute Interval",
  ylab = "# of Steps"
)
- Interval 835 contains the maximum number of steps.
- The total number of rows with NAs is 2304.
- Fill missing data with interval mean
# Copy the data and replace every missing value in the first column (steps)
# with the mean of the matching interval (third column). intSteps is named by
# interval, so the replacement is a single vectorised lookup by name instead
# of a row-by-row loop over the data frame.
filled <- data
missingSteps <- is.na(filled[[1]])
intervalKey <- as.character(filled[[3]][missingSteps])
# as.vector() drops the array dimnames so plain numbers are stored.
filled[[1]][missingSteps] <- as.vector(intSteps[intervalKey])
# Recompute the per-day totals on the filled-in data and plot their
# distribution. No na.rm is passed here, unlike the first per-day sum.
dailySteps2 <- tapply(
  X = filled[[1]],
  INDEX = filled[[2]],
  FUN = sum
)
hist(x = dailySteps2)
The mean of the total number of steps taken per day is 10766, and the median is also 10766.
The frequencies in the middle of the histogram increased because the missing values are now replaced with interval averages. The mean and median are slightly higher, but not by enough to distort the analysis.
- Convert to POSIXct first, then apply `weekdays()`.
# Label every observation with its day name and a weekday/weekend flag.
dayTime <- as.POSIXct(filled[, 2])
filled$weekday <- as.factor(weekdays(dayTime))

# weekdays() returns day names in the current locale, so comparing them with
# "Saturday"/"Sunday" only works in an English locale. The wday field of
# POSIXlt is a locale-independent day number (0 = Sunday, 6 = Saturday).
isWeekend <- as.POSIXlt(dayTime)$wday %in% c(0, 6)
filled$weekDE <- as.factor(ifelse(isWeekend, "weekend", "weekday"))
- Group the data using `data.table`.
library(data.table)
filled <- data.table(filled)

# Mean number of steps for every interval within each day type. Only `steps`
# is averaged: the other non-grouping columns (the date and the day name) are
# not numeric, so taking their mean is not meaningful.
weekMean <- filled[, .(steps = mean(steps)), by = "interval,weekDE"]
Create a line chart.
library(lattice)

# One line panel per level of weekDE, stacked in a single column of two rows,
# showing the averaged steps against the interval.
xyplot(
  steps ~ interval | weekDE,
  data = weekMean,
  type = "l",
  xlab = "Interval",
  ylab = "Number of steps",
  layout = c(1, 2)
)