Skip to content

Commit 9bb1979

Browse files
committed
indicate source for CP stations and dates file
1 parent f413584 commit 9bb1979

File tree

1 file changed

+142
-140
lines changed

1 file changed

+142
-140
lines changed

R/align_dates.R

Lines changed: 142 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -1,140 +1,142 @@
1-
#' Aligns Date and Time Data Based on Official Station Records
2-
#'
3-
#' This function compares date and time data from a user-provided dataframe (`df`) against an official record stored in a CSV file. It checks if the date-time for each station in the input dataframe matches the expected values from the official record, based on station, year, and season. The function attempts to correct mismatches where possible and reports any discrepancies. It updates the `date_time` column in the input dataframe where necessary and provides messages about the status of each match.
4-
#'
5-
#' @param df A dataframe containing station data with a `date_time` column. The dataframe must include a `station` column to match against the official record.
6-
#'
7-
#' @return A dataframe with the same structure as the input dataframe (`df`), but with updated `date_time` values where mismatches are corrected. The columns `year_date_time` and `season_date_time` are removed from the output.
8-
#'
9-
#' @details
10-
#' The function performs the following steps:
11-
#' 1. Loads an official record CSV containing expected station data.
12-
#' 2. Adds `year_date_time` and `season_date_time` columns to the input dataframe (`df`) and the official record using the `lubridate` package and the `infer_season` function.
13-
#' 3. Compares the `date_time` column from `df` to the official record, checking for matches by station, year, and season.
14-
#' 4. For each row, it outputs a message describing the match status (e.g., no match, partial match, or full match).
15-
#' 5. Updates the `date_time` column in the input dataframe if a mismatch is found and a valid correction is possible.
16-
#' 6. Returns the updated dataframe with corrected date-time values.
17-
#'
18-
#' @import httr
19-
#' @import lubridate
20-
#' @import dplyr
21-
#'
22-
#' @examples
23-
#' # Assuming 'df' is a dataframe with a 'station' and 'date_time' column
24-
#' aligned_df <- align_dates(df)
25-
#'
26-
#' @export
27-
align_dates <- function(df) {
28-
29-
30-
# Internal function to check data matches
31-
check_data_matches <- function(input_data, official_record) {
32-
check_matches <- function(row) {
33-
station_matches <- official_record[official_record$station == row$station, ]
34-
result_list <- list(original_date_time = row$date_time, date_time_changed = row$date_time,
35-
expected_date_time = NA, match_status = "No station match", result = FALSE)
36-
37-
if (nrow(station_matches) > 0) {
38-
year_matches <- station_matches[station_matches$year_date_time == row$year_date_time, ]
39-
40-
if (nrow(year_matches) > 0) {
41-
season_matches <- year_matches[year_matches$season_date_time == row$season_date_time, ]
42-
43-
if (nrow(season_matches) > 0) {
44-
correct_dates <- season_matches$date_time
45-
46-
if (length(correct_dates) > 1) {
47-
if (row$date_time %in% correct_dates) {
48-
result_list$match_status <- "Full match found"
49-
result_list$expected_date_time <- row$date_time
50-
result_list$date_time_changed <- row$date_time
51-
} else {
52-
result_list$match_status <- paste("Multiple dates found in CP data, none match:", toString(correct_dates))
53-
result_list$expected_date_time <- NA
54-
}
55-
} else {
56-
result_list$expected_date_time <- correct_dates
57-
if (row$date_time != correct_dates) {
58-
result_list$date_time_changed <- correct_dates # Update if mismatch
59-
result_list$match_status <- paste("Date/time updated")
60-
} else {
61-
result_list$match_status <- "Full match found"
62-
}
63-
}
64-
result_list$result <- result_list$date_time_changed == result_list$expected_date_time
65-
} else {
66-
result_list$match_status <- "Year match but no season match"
67-
}
68-
} else {
69-
result_list$match_status <- "Station match but no year match"
70-
}
71-
}
72-
73-
return(result_list)
74-
}
75-
76-
results <- rowwise(input_data) %>%
77-
mutate(match_details = list(check_matches(cur_data())),
78-
original_date_time = match_details$original_date_time,
79-
date_time_changed = match_details$date_time_changed,
80-
expected_date_time = match_details$expected_date_time,
81-
match_status = match_details$match_status,
82-
result = match_details$result) %>%
83-
ungroup() %>%
84-
select(station, season_date_time, original_date_time, date_time_changed, expected_date_time, year_date_time, match_status, result)
85-
86-
return(results)
87-
}
88-
89-
# Load official record
90-
official_record <- read.csv("https://utexas.box.com/shared/static/9hcctqqilisc0t61wbbdiziig8ok8rg8.csv")
91-
92-
# Add year and season columns
93-
df$year_date_time <- lubridate::year(lubridate::ymd_hms(df$date_time))
94-
df$season_date_time <- infer_season(data = df, date_col = "date_time")
95-
official_record$season_date_time <- infer_season(data = official_record, date_col = "date_time")
96-
official_record$year_date_time <- lubridate::year(lubridate::ymd_hms(official_record$date_time))
97-
98-
# Use check_data_matches to compare input data with official record
99-
results_data <- check_data_matches(df, official_record)
100-
101-
# Track already-printed messages to avoid duplicates
102-
printed_messages <- c()
103-
104-
# Update the date_time column in the input dataframe and display mismatch messages
105-
for (i in seq_len(nrow(results_data))) {
106-
row <- results_data[i, ]
107-
108-
# Generate the message
109-
message_text <- NULL
110-
if (row$match_status == "No station match") {
111-
message_text <- paste("No Station match:", row$station)
112-
} else if (row$match_status == "Station match but no year match") {
113-
message_text <- paste("Station match but no year match:", row$station,
114-
"Original year:", df$year_date_time[i])
115-
} else if (row$match_status == "Year match but no season match") {
116-
message_text <- paste("Year match but no season match:", row$station,
117-
"Original season:", df$season_date_time[i])
118-
} else if (grepl("Multiple dates found in CP data", row$match_status)) {
119-
message_text <- paste("Multiple dates found for station", row$station,
120-
"in CP data:", row$match_status)
121-
}
122-
123-
# Print the message only if it hasn't been printed before
124-
if (!is.null(message_text) && !(message_text %in% printed_messages)) {
125-
message(message_text)
126-
printed_messages <- c(printed_messages, message_text) # Add to the printed set
127-
}
128-
129-
# Update the date_time if needed
130-
if (!is.na(row$expected_date_time) && row$original_date_time != row$date_time_changed) {
131-
df$date_time[df$station == row$station & df$date_time == row$original_date_time] <- row$date_time_changed
132-
}
133-
}
134-
135-
# Remove columns
136-
df <- df %>% dplyr::select(-year_date_time, -season_date_time)
137-
138-
# Return the updated dataframe
139-
return(df)
140-
}
1+
#' Aligns Date and Time Data Based on Official Station Records
2+
#'
3+
#' This function compares date and time data from a user-provided dataframe (`df`) against an official record stored in a CSV file. It checks if the date-time for each station in the input dataframe matches the expected values from the official record, based on station, year, and season. The function attempts to correct mismatches where possible and reports any discrepancies. It updates the `date_time` column in the input dataframe where necessary and provides messages about the status of each match.
4+
#'
5+
#' @param df A dataframe containing station data with a `date_time` column. The dataframe must include a `station` column to match against the official record.
6+
#'
7+
#' @return A dataframe with the same structure as the input dataframe (`df`), but with updated `date_time` values where mismatches are corrected. The columns `year_date_time` and `season_date_time` are removed from the output.
8+
#'
9+
#' @details
10+
#' The function performs the following steps:
11+
#' 1. Loads an official record CSV containing expected station data.
12+
#' 2. Adds `year_date_time` and `season_date_time` columns to the input dataframe (`df`) and the official record using the `lubridate` package and the `infer_season` function.
13+
#' 3. Compares the `date_time` column from `df` to the official record, checking for matches by station, year, and season.
14+
#' 4. For each row, it outputs a message describing the match status (e.g., no match, partial match, or full match).
15+
#' 5. Updates the `date_time` column in the input dataframe if a mismatch is found and a valid correction is possible.
16+
#' 6. Returns the updated dataframe with corrected date-time values.
17+
#'
18+
#' @import httr
19+
#' @import lubridate
20+
#' @import dplyr
21+
#'
22+
#' @examples
23+
#' # Assuming 'df' is a dataframe with a 'station' and 'date_time' column
24+
#' aligned_df <- align_dates(df)
25+
#'
26+
#' @export
27+
align_dates <- function(df) {
28+
29+
30+
# Internal function to check data matches
31+
check_data_matches <- function(input_data, official_record) {
32+
check_matches <- function(row) {
33+
station_matches <- official_record[official_record$station == row$station, ]
34+
result_list <- list(original_date_time = row$date_time, date_time_changed = row$date_time,
35+
expected_date_time = NA, match_status = "No station match", result = FALSE)
36+
37+
if (nrow(station_matches) > 0) {
38+
year_matches <- station_matches[station_matches$year_date_time == row$year_date_time, ]
39+
40+
if (nrow(year_matches) > 0) {
41+
season_matches <- year_matches[year_matches$season_date_time == row$season_date_time, ]
42+
43+
if (nrow(season_matches) > 0) {
44+
correct_dates <- season_matches$date_time
45+
46+
if (length(correct_dates) > 1) {
47+
if (row$date_time %in% correct_dates) {
48+
result_list$match_status <- "Full match found"
49+
result_list$expected_date_time <- row$date_time
50+
result_list$date_time_changed <- row$date_time
51+
} else {
52+
result_list$match_status <- paste("Multiple dates found in CP data, none match:", toString(correct_dates))
53+
result_list$expected_date_time <- NA
54+
}
55+
} else {
56+
result_list$expected_date_time <- correct_dates
57+
if (row$date_time != correct_dates) {
58+
result_list$date_time_changed <- correct_dates # Update if mismatch
59+
result_list$match_status <- paste("Date/time updated")
60+
} else {
61+
result_list$match_status <- "Full match found"
62+
}
63+
}
64+
result_list$result <- result_list$date_time_changed == result_list$expected_date_time
65+
} else {
66+
result_list$match_status <- "Year match but no season match"
67+
}
68+
} else {
69+
result_list$match_status <- "Station match but no year match"
70+
}
71+
}
72+
73+
return(result_list)
74+
}
75+
76+
results <- rowwise(input_data) %>%
77+
mutate(match_details = list(check_matches(cur_data())),
78+
original_date_time = match_details$original_date_time,
79+
date_time_changed = match_details$date_time_changed,
80+
expected_date_time = match_details$expected_date_time,
81+
match_status = match_details$match_status,
82+
result = match_details$result) %>%
83+
ungroup() %>%
84+
select(station, season_date_time, original_date_time, date_time_changed, expected_date_time, year_date_time, match_status, result)
85+
86+
return(results)
87+
}
88+
89+
# Load official record
90+
# This is from Box\Beaufort LTER\Core Program\Internal Data and Sample Sharing\Core Program Dat\CP_stations_and_dates.csv
91+
# The CSV file is managed by the Core Program team, currently Kaylie Plumb (2025-01-09).
92+
official_record <- read.csv("https://utexas.box.com/shared/static/9hcctqqilisc0t61wbbdiziig8ok8rg8.csv")
93+
94+
# Add year and season columns
95+
df$year_date_time <- lubridate::year(lubridate::ymd_hms(df$date_time))
96+
df$season_date_time <- infer_season(data = df, date_col = "date_time")
97+
official_record$season_date_time <- infer_season(data = official_record, date_col = "date_time")
98+
official_record$year_date_time <- lubridate::year(lubridate::ymd_hms(official_record$date_time))
99+
100+
# Use check_data_matches to compare input data with official record
101+
results_data <- check_data_matches(df, official_record)
102+
103+
# Track already-printed messages to avoid duplicates
104+
printed_messages <- c()
105+
106+
# Update the date_time column in the input dataframe and display mismatch messages
107+
for (i in seq_len(nrow(results_data))) {
108+
row <- results_data[i, ]
109+
110+
# Generate the message
111+
message_text <- NULL
112+
if (row$match_status == "No station match") {
113+
message_text <- paste("No Station match:", row$station)
114+
} else if (row$match_status == "Station match but no year match") {
115+
message_text <- paste("Station match but no year match:", row$station,
116+
"Original year:", df$year_date_time[i])
117+
} else if (row$match_status == "Year match but no season match") {
118+
message_text <- paste("Year match but no season match:", row$station,
119+
"Original season:", df$season_date_time[i])
120+
} else if (grepl("Multiple dates found in CP data", row$match_status)) {
121+
message_text <- paste("Multiple dates found for station", row$station,
122+
"in CP data:", row$match_status)
123+
}
124+
125+
# Print the message only if it hasn't been printed before
126+
if (!is.null(message_text) && !(message_text %in% printed_messages)) {
127+
message(message_text)
128+
printed_messages <- c(printed_messages, message_text) # Add to the printed set
129+
}
130+
131+
# Update the date_time if needed
132+
if (!is.na(row$expected_date_time) && row$original_date_time != row$date_time_changed) {
133+
df$date_time[df$station == row$station & df$date_time == row$original_date_time] <- row$date_time_changed
134+
}
135+
}
136+
137+
# Remove columns
138+
df <- df %>% dplyr::select(-year_date_time, -season_date_time)
139+
140+
# Return the updated dataframe
141+
return(df)
142+
}

0 commit comments

Comments
 (0)