# run_analysis.R (eekboom/coursera-getdata, master branch on GitHub)

# Tidies up the raw data in "UCI HAR Dataset" folder:
# - Merges training and test data sets
# - keeps only columns that measure a mean or standard deviation
# - adds columns for subject and activity
# - groups by subject and activity and calculates mean of each column in each group
# - outputs tidy data to "tidy-data.txt"

# Name of the top level directory of the raw data files.
# Every input path in this script is built by prefixing this name, so the
# directory is expected to be found in the current working directory.
dataRootDir <- "UCI HAR Dataset"

# Sets the working directory to the directory this script itself is in.
#
# The script location is looked up in two places:
# - `ofile` in frame 1, which is set when the script is run through source()
# - the "--file=" command line argument, which is set when the script is
#   started with Rscript
# Stops with an explanatory error if neither is available (e.g. when the code
# was pasted into an interactive console).
# Returns (invisibly) the previous working directory, as setwd() does.
setWorkingDir <- function () {
    scriptPath <- sys.frame(1)$ofile
    if(is.null(scriptPath)) {
        # not sourced: look for the script path among the command line arguments
        fileArgs <- grep("^--file=", commandArgs(trailingOnly=FALSE), value=TRUE)
        if(length(fileArgs) > 0) {
            scriptPath <- sub("^--file=", "", fileArgs[1])
        }
    }
    if(is.null(scriptPath)) {
        stop("Cannot determine the location of this script; ",
             "set the working directory to the folder that contains the raw data",
             call.=FALSE)
    }
    setwd(dirname(scriptPath))
}

# If main data directory not found in current working directory,
# then fall back to the directory of this script
if(!(dataRootDir %in% dir())) {
    setWorkingDir()
}

# Load dplyr, install first if needed.
# requireNamespace() only probes whether the package is available; attaching
# is left to the single library() call below, which stops with an error
# (instead of returning FALSE like require()) if the installation failed.
if(!requireNamespace("dplyr", quietly=TRUE)) {
    print("Installing dplyr package")
    install.packages("dplyr")
}
library(dplyr)


# Load the feature list; its second column holds the names of all columns
# of the main data files
features <- read.table(paste0(dataRootDir, "/features.txt"))
columnNames <- features[[2]]
# Only measurements of a mean or a standard deviation are of interest,
# i.e. names that contain "-mean()" or "-std()"
columnPattern <- "-mean\\(\\)|-std\\(\\)"
# The original column names contain duplicates. Therefore the data is read
# without column names, the wanted columns are picked by position, and only
# the columns that remain get their proper names afterwards.
relevantColumnIndices <- which(grepl(columnPattern, columnNames))
relevantColumnNames <- columnNames[relevantColumnIndices]

# Rewrite the raw feature names so that they could be used as variable
# names in R
relevantColumnNames <- local({
    # strip the "()" from each name
    cleaned <- sub("\\(\\)", "", relevantColumnNames)

    # pattern -> replacement, applied to every match in the order listed:
    # "-" becomes "_", the single letter prefix is expanded to its long form,
    # and the duplicated "Body" term is collapsed
    substitutions <- c(
        "-"        = "_",
        "^t"       = "time",
        "^f"       = "frequency",
        "BodyBody" = "Body"
    )
    for(pattern in names(substitutions)) {
        cleaned <- gsub(pattern, substitutions[[pattern]], cleaned)
    }
    cleaned
})

# Lookup table that maps every activity code to its label
activityLabelsFile <- paste0(dataRootDir, "/activity_labels.txt")
activityLabels <- read.table(
    file = activityLabelsFile,
    col.names = c("code", "activityLabel")
)

# Reads either the training data (name == "train") or the test data
# (name == "test") and returns the tidied (but yet ungrouped) data.
#
# Uses these script level variables:
# - dataRootDir:           top level directory of the raw data
# - relevantColumnIndices: positions of the measurement columns to keep
# - relevantColumnNames:   cleaned names for exactly those columns
# - activityLabels:        table with columns "code" and "activityLabel"
#
# Returns a tibble with the columns "subject" and "activity" first, followed
# by the selected measurement columns, one row per observation.
# Stops if the three input files do not describe the same number of
# observations.
readDataSet <- function (name) {
    dataSetDir <- paste0(dataRootDir, "/", name, "/")

    # read in main data file
    # unfortunately data.table's fread() function fails on this data set, so use the slower read.table()
    measurements <- read.table(paste0(dataSetDir, "X_", name, ".txt"))

    # keep only relevant columns; they are picked by position because the
    # original column names are not unique. drop=FALSE keeps a data frame
    # even if a single column (or none) is selected.
    measurements <- measurements[, relevantColumnIndices, drop=FALSE]
    # and assign names
    colnames(measurements) <- relevantColumnNames

    # for each observation this vector contains the activity code
    activityCodes <- read.table(paste0(dataSetDir, "y_", name, ".txt"), col.names="code")$code
    # look up the label of each code; match() yields exactly one label per
    # observation, in the order of the observations (NA for unknown codes)
    activityLabelsColumn <- activityLabels$activityLabel[match(activityCodes, activityLabels$code)]

    # read in subject numbers
    subjectsColumn <- read.table(paste0(dataSetDir, "subject_", name, ".txt"), col.names="subject")$subject

    # guard against silently recycled or misaligned columns
    stopifnot(
        length(subjectsColumn) == nrow(measurements),
        length(activityLabelsColumn) == nrow(measurements)
    )

    # put subject and activity in front of the measurements; building the
    # table in its final order needs no column index arithmetic, which would
    # break when no measurement column is selected
    data <- data.frame(
        subject=subjectsColumn,
        activity=activityLabelsColumn,
        measurements,
        check.names=FALSE,
        stringsAsFactors=FALSE
    )

    # wrap the data in a dplyr table to make manipulation easier
    as_tibble(data)
}

print("Loading training data")
trainData <- readDataSet("train")
print("Loading test data")
testData <- readDataSet("test")

# merge training and test data set
allData <- rbind(trainData, testData)

# group by subject and activity columns and calculate mean value for all other
# columns; summarise_all() takes the place of the deprecated
# summarise_each(funs(...)) and summarises every non-grouping column
tidyData <-
    allData %>%
    group_by(subject, activity) %>%
    summarise_all(mean)

print("Writing tidy data to tidy-data.txt")
# row.names is spelled out in full instead of relying on partial argument matching
write.table(tidyData, "tidy-data.txt", row.names=FALSE)