-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_analysis.R
More file actions
106 lines (87 loc) · 4.02 KB
/
run_analysis.R
File metadata and controls
106 lines (87 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Tidies up the raw data in "UCI HAR Dataset" folder:
# - Merges training and test data sets
# - keeps only columns that measure a mean or standard deviation
# - adds columns for subject and activity
# - groups by subject and activity and calculates mean of each column in each group
# - outputs tidy data to "tidy-data.txt"
# Top-level directory name of the raw data files
dataRootDir <- "UCI HAR Dataset"

# Sets the working directory to the directory that contains this script.
#
# The script location is looked up in two places:
#   1. the `ofile` variable of the first frame on the call stack, which is
#      where source() keeps the path of the file it is evaluating;
#   2. the "--file=" command line argument, which is present when the script
#      is started with Rscript (where no `ofile` exists).
#
# Stops with an error if neither source yields a path, instead of failing
# inside dirname() with an unrelated message.
# Returns (invisibly) the previous working directory, as setwd() does.
setWorkingDir <- function () {
  scriptPath <- sys.frame(1)$ofile

  if (!is.character(scriptPath)) {
    scriptPath <- NULL
    fileArgs <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
    if (length(fileArgs) > 0) {
      scriptPath <- sub("^--file=", "", fileArgs[1])
      # the R front end encodes spaces in this argument as "~+~"
      scriptPath <- gsub("~+~", " ", scriptPath, fixed = TRUE)
    }
  }

  if (is.null(scriptPath)) {
    stop("Cannot determine the location of this script; ",
         "please set the working directory to the folder that contains ",
         "the raw data directory",
         call. = FALSE)
  }

  setwd(dirname(scriptPath))
}
# Fall back to the script's own directory when the raw data directory is not
# present in the current working directory. dir.exists() is used so that only
# an actual directory (not a regular file of the same name) counts as found.
if (!dir.exists(dataRootDir)) {
  setWorkingDir()
}
# Load dplyr, installing it first if it is not available.
# requireNamespace() only checks whether the package can be loaded (it does
# not attach it and, with quietly = TRUE, does not warn when it is missing);
# library() below then attaches it and fails loudly if the installation did
# not succeed.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  print("Installing dplyr package")
  # A non-interactive session cannot prompt for a CRAN mirror, so replace the
  # "@CRAN@" placeholder with a concrete mirror in that case. Any other
  # repository setting is passed through unchanged.
  packageRepos <- getOption("repos")
  if (!interactive() && identical(unname(packageRepos["CRAN"]), "@CRAN@")) {
    packageRepos["CRAN"] <- "https://cloud.r-project.org"
  }
  install.packages("dplyr", repos = packageRepos)
}
library(dplyr)
# Load the names of all columns of the main data files
# (they are stored in the second column of features.txt)
features <- read.table(file.path(dataRootDir, "features.txt"))
columnNames <- features[[2]]

# Only columns whose name contains "-mean()" or "-std()" are of interest
columnPattern <- "-mean\\(\\)|-std\\(\\)"

# The original column names contain duplicates. Therefore the data is read
# without column names, the relevant columns are picked by index, and only
# then do the remaining columns get their proper names.
relevantColumnIndices <- which(grepl(columnPattern, columnNames))
relevantColumnNames <- columnNames[relevantColumnIndices]

# Turn the raw names into names that can be used as variable names in R.
# First drop the "()" ...
relevantColumnNames <- sub("\\(\\)", "", relevantColumnNames)

# ... then apply the remaining substitutions in order:
#   "-"        becomes "_"
#   "^t"/"^f"  single letter prefixes become their long form
#   "BodyBody" duplicated "Body" terms are collapsed
nameReplacements <- c(
  "-" = "_",
  "^t" = "time",
  "^f" = "frequency",
  "BodyBody" = "Body"
)
for (namePattern in names(nameReplacements)) {
  relevantColumnNames <- gsub(
    namePattern,
    nameReplacements[[namePattern]],
    relevantColumnNames
  )
}
# Lookup table with one row per activity: its numeric code and its label
activityLabelsFile <- file.path(dataRootDir, "activity_labels.txt")
activityLabels <- read.table(
  activityLabelsFile,
  col.names = c("code", "activityLabel")
)
# Reads either the training data (name == "train") or the test data
# (name == "test") and returns the tidied (but not yet grouped) data.
#
# Arguments:
#   name - name of the data set; it selects the sub directory of dataRootDir
#          and the suffix of the files read from it
#          (X_<name>.txt, y_<name>.txt and subject_<name>.txt)
#
# Returns a tibble with the columns "subject" and "activity" followed by the
# relevant measurement columns, named by relevantColumnNames.
#
# Reads the globals dataRootDir, relevantColumnIndices, relevantColumnNames
# and activityLabels. Stops if the three input files do not describe the same
# number of observations.
readDataSet <- function (name) {
  dataSetDir <- file.path(dataRootDir, name)

  # read in the main data file
  # (read.table() is kept because, per the original author, data.table's
  # fread() fails on this data set)
  measurements <- read.table(file.path(dataSetDir, paste0("X_", name, ".txt")))

  # keep only the relevant columns (selected by position, since the raw names
  # are not unique) and assign the cleaned-up names
  measurements <- measurements[, relevantColumnIndices, drop = FALSE]
  colnames(measurements) <- relevantColumnNames

  # for each observation this file contains the activity code
  activityCodes <- read.table(
    file.path(dataSetDir, paste0("y_", name, ".txt")),
    col.names = "code"
  )$code
  # translate each code into its label; the order of observations is kept and
  # an unknown code yields NA
  activityLabelsColumn <-
    activityLabels$activityLabel[match(activityCodes, activityLabels$code)]

  # read in the subject number of each observation
  subjectsColumn <- read.table(
    file.path(dataSetDir, paste0("subject_", name, ".txt")),
    col.names = "subject"
  )$subject

  # all three files must line up row by row
  stopifnot(
    length(subjectsColumn) == nrow(measurements),
    length(activityLabelsColumn) == nrow(measurements)
  )

  # put subject and activity in front of the measurement columns
  as_tibble(cbind(
    data.frame(subject = subjectsColumn, activity = activityLabelsColumn),
    measurements
  ))
}
# Load both parts of the raw data
print("Loading training data")
trainData <- readDataSet("train")
print("Loading test data")
testData <- readDataSet("test")

# Merge training and test data sets
allData <- bind_rows(trainData, testData)

# Group by the subject and activity columns and calculate the mean value of
# all other columns. summarise_all() takes the place of the deprecated
# summarise_each(funs(mean)); ungroup() drops the grouping that summarise()
# would otherwise leave on the result.
tidyData <-
  allData %>%
  group_by(subject, activity) %>%
  summarise_all(mean) %>%
  ungroup()

print("Writing tidy data to tidy-data.txt")
# row.names is spelled out in full rather than relying on partial matching
write.table(tidyData, "tidy-data.txt", row.names = FALSE)