Skip to content

Reading fathom csv benchmarks #28

@mhpob

Description

@mhpob

File from actel issue #164

# Local file name for the example export attached to actel issue #164.
fathom_csv <- "NexTrak-R1.801032.2024-10-25.093307.csv"

# Fetch the example export into the working directory under that name.
download.file(
  url = "https://github.com/user-attachments/files/18086120/NexTrak-R1.801032.2024-10-25.093307.csv",
  destfile = fathom_csv
)

data.table version:

#' Read one record type from a Fathom CSV export with data.table
#'
#' The operating system's line-search tool is used to pull out only the rows
#' of interest before parsing, so `data.table::fread()` never has to parse
#' the other record types in the file.
#'
#' @param fathom_csv Path to the Fathom CSV file. It is shell-quoted before
#'   being placed in the command, so paths containing spaces are handled.
#' @param data_type Record-type tag expected in the first field of each row
#'   (default `"DET"`). Column names are taken from the row whose first field
#'   is `<data_type>_DESC`. The tag is inserted into the search pattern
#'   as-is, so it should not contain regex or shell metacharacters.
#' @return A `data.table` of the rows whose first field is `data_type`,
#'   named after the fields of the matching `_DESC` row.
read_fathom_dt <- function(fathom_csv, data_type = "DET") {
  on_windows <- .Platform$OS.type == "windows"

  # Build a shell command that prints only the lines whose first field is
  # exactly `record`. Anchoring on "<record>," keeps the `_DESC` header row
  # (and any line that merely contains the tag somewhere) out of the data.
  filter_cmd <- function(record) {
    if (on_windows) {
      # grep is not available on a default Windows install; FINDSTR is.
      # /B anchors the match at the start of the line and /C: makes the
      # search string literal.
      paste0('FINDSTR /B /C:"', record, '," ', shQuote(fathom_csv))
    } else {
      paste("grep", shQuote(paste0("^", record, ",")), shQuote(fathom_csv))
    }
  }

  # The "<data_type>_DESC" row carries the field names for this record type.
  col_names <- as.character(
    data.table::fread(
      cmd = filter_cmd(paste0(data_type, "_DESC")),
      header = FALSE
    )
  )

  data.table::fread(
    cmd = filter_cmd(data_type),
    header = FALSE,
    col.names = col_names
  )
}

readr version:

#' Read one record type from a Fathom CSV export with readr
#'
#' The file is read in chunks and only the rows whose first field equals
#' `data_type` are kept from each chunk.
#'
#' @param fathom_csv Path to the Fathom CSV file.
#' @param data_type Record-type tag expected in the first field of each row
#'   (default `"DET"`). Column names are taken from the first line matching
#'   `<data_type>_DESC`.
#' @return A tibble of the matching rows, with column types inferred by
#'   `readr::type_convert()`.
read_fathom_readr <- function(fathom_csv, data_type = "DET") {
  # grep is not available on a default Windows install; FINDSTR is.
  search_fun <- if (.Platform$OS.type == "windows") "FINDSTR" else "grep"

  # Cheating here: the header row is still located by shelling out. This
  # could/should be switched to a pure-R scan. system2() does not quote its
  # arguments, so the path is quoted explicitly to survive spaces.
  header_line <- system2(
    search_fun,
    c(paste0(data_type, "_DESC"), shQuote(fathom_csv)),
    stdout = TRUE
  )
  if (length(header_line) == 0L) {
    stop(
      "No '", data_type, "_DESC' row found in ", fathom_csv,
      call. = FALSE
    )
  }
  # Only the first matching line is used, so a stray second match cannot
  # inflate the number of column names.
  header <- unlist(strsplit(header_line[1L], ",", fixed = TRUE))

  # Chunk callback: keep the rows whose first field is the requested tag.
  # `%in%` never yields NA, so a missing first field cannot leak into the
  # row index.
  select_detections <- function(x, pos) {
    x[x[[1L]] %in% data_type, , drop = FALSE]
  }

  # Rows are filtered only after each chunk has been parsed, so any type
  # guessing at this stage would be driven by whichever record types appear
  # first in the file. Read everything as character and let type_convert()
  # infer types from the retained rows alone.
  readr::read_csv_chunked(
    fathom_csv,
    readr::DataFrameCallback$new(select_detections),
    col_names = header,
    col_types = readr::cols(.default = readr::col_character())
  ) |>
    readr::type_convert()
}

Drag race:

# Time both readers on the same local file, 20 evaluations of each.
microbenchmark::microbenchmark(
  readr = read_fathom_readr(fathom_csv),
  dt = read_fathom_dt(fathom_csv),
  times = 20L
)

Unit: milliseconds
  expr       min        lq      mean    median        uq       max neval cld
 readr 2311.1762 2474.3908 2670.6241 2616.3300 2717.2186 4093.1096    20  a 
    dt  299.9092  309.8798  320.0476  313.5691  328.8836  361.5658    20   b

Note that the data.table version can read directly from HTTP when not on Windows, if desired. The readr version, as coded, cannot.

# Call both readers with the file's URL instead of a local path.
remote_csv <- "https://github.com/user-attachments/files/18086120/NexTrak-R1.801032.2024-10-25.093307.csv"
read_fathom_readr(remote_csv)
read_fathom_dt(remote_csv)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions