Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/build-docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ sudo tlmgr --verify-repo=none update --self
sudo tlmgr --verify-repo=none install inconsolata helvetic rsfs

# install dependencies
Rscript -e "install.packages(c('assertthat', 'curl', 'data.table', 'futile.logger', 'jsonlite', 'knitr', 'markdown', 'pkgdown', 'purrr', 'roxygen2', 'stringr'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
Rscript -e "install.packages(c('assertthat', 'curl', 'data.table', 'jsonlite', 'knitr', 'markdown', 'pkgdown', 'purrr', 'roxygen2', 'stringr'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"

cp NEWS.md ./r-pkg/
cp README.md ./r-pkg/
Expand Down
2 changes: 1 addition & 1 deletion .ci/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ sudo apt-get install \
tidy \
qpdf

Rscript -e "install.packages(c('covr', 'curl', 'data.table', 'futile.logger', 'jsonlite', 'knitr', 'lintr', 'markdown', 'purrr', 'stringr', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
Rscript -e "install.packages(c('covr', 'curl', 'data.table', 'jsonlite', 'knitr', 'lintr', 'markdown', 'purrr', 'stringr', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
cp test-data/* r-pkg/inst/testdata/
1 change: 1 addition & 0 deletions .github/workflows/build-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
- uses: actions/checkout@v6
with:
fetch-depth: 0
persist-credentials: false
- name: set up R
uses: r-lib/actions/setup-r@v2
with:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ jobs:
uses: actions/checkout@v6
with:
fetch-depth: 0
persist-credentials: false
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another tiny change I wanted to slip in here since we're touching the repo for the first time in months.

This is a small security patch that avoids persisting the temporary git credentials created by actions/checkout into the local git config, which makes it harder for any injected code in later workflow steps to misuse those credentials.

- uses: pre-commit/[email protected]
- name: set up R
uses: r-lib/actions/setup-r@v2
Expand Down
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ exclude: |
)$
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v6.0.0
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just ran pre-commit autoupdate to pull in new hook versions... might as well while we're touching the repo 🤷🏻

hooks:
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/maxwinterstein/shfmt-py
rev: v3.11.0.2
rev: v3.12.0.1
hooks:
- id: shfmt
args: ["--indent=4", "--space-redirects", "--write"]
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: v0.10.0.1
rev: v0.11.0.1
hooks:
- id: shellcheck
args: ["--exclude=SC2002"]
Expand Down
3 changes: 3 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,9 @@ Look at the source code of `setup_local.sh` for a list of the valid arguments.
# Start up Elasticsearch on localhost:9200 and seed it with data
./setup_local.sh 8.17.2

# enable the integration tests
export NOT_CRAN=true

# Run tests
make test

Expand Down
3 changes: 1 addition & 2 deletions r-pkg/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ Depends:
Imports:
curl,
data.table,
futile.logger,
jsonlite,
purrr,
stats,
Expand All @@ -36,6 +35,6 @@ Suggests:
License: BSD_3_clause + file LICENSE
URL: https://github.com/uptake/uptasticsearch
BugReports: https://github.com/uptake/uptasticsearch/issues
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
VignetteBuilder: knitr
Encoding: UTF-8
4 changes: 0 additions & 4 deletions r-pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@ importFrom(data.table,setcolorder)
importFrom(data.table,setkeyv)
importFrom(data.table,setnames)
importFrom(data.table,uniqueN)
importFrom(futile.logger,flog.debug)
importFrom(futile.logger,flog.fatal)
importFrom(futile.logger,flog.info)
importFrom(futile.logger,flog.warn)
importFrom(jsonlite,fromJSON)
importFrom(parallel,clusterMap)
importFrom(parallel,detectCores)
Expand Down
37 changes: 26 additions & 11 deletions r-pkg/R/es_search.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#' as it has pulled this many hits. Default is \code{Inf}, meaning that
#' all possible hits will be pulled.
#' @param size Number of records per page of results.
#' See \href{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-from-size}{Elasticsearch docs} for more.
#' See \href{https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-scroll}{Elasticsearch docs} for more.
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

R CMD check was raising this NOTE:

Found the following (possibly) invalid URLs:
  URL: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-from-size (moved to https://www.elastic.co/guide/en/elasticsearch/reference/8.18/search-request-body.html#request-body-search-from-size)
    From: man/es_search.Rd
    Status: 200
    Message: OK
  URL: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-scroll (moved to https://www.elastic.co/guide/en/elasticsearch/reference/8.18/search-request-body.html#request-body-search-scroll)
    From: man/es_search.Rd
    Status: 200
    Message: OK

#' Note that this will be reset to 0 if you submit a \code{query_body} with
#' an "aggs" request in it. Also see \code{max_hits}.
#' @param query_body String with a valid Elasticsearch query. Default is an empty query.
Expand All @@ -17,7 +17,7 @@
#' The scroll context will be refreshed every time you ask Elasticsearch
#' for another record, so this parameter should just be the amount of
#' time you expect to pass between requests. See the
#' \href{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-scroll}{Elasticsearch scroll/pagination docs}
#' \href{https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-scroll}{Elasticsearch scroll/pagination docs}
#' for more information.
#' @param n_cores Number of cores to distribute fetching and processing over.
#' @param break_on_duplicates Boolean, defaults to TRUE. \code{es_search} uses the size of the
Expand Down Expand Up @@ -84,6 +84,7 @@ es_search <- function(es_host
, break_on_duplicates = TRUE
, ignore_scroll_restriction = FALSE
, intermediates_dir = getwd()
, verbose = FALSE
) {

# Check if this is an aggs or straight-up search query
Expand Down Expand Up @@ -135,6 +136,7 @@ es_search <- function(es_host
, es_index = es_index
, trailing_args = "size=0"
, query_body = query_body
, verbose = verbose
)

return(chomp_aggs(aggs_json = result))
Expand All @@ -151,7 +153,8 @@ es_search <- function(es_host
, n_cores = n_cores
, break_on_duplicates = break_on_duplicates
, ignore_scroll_restriction = ignore_scroll_restriction
, intermediates_dir = intermediates_dir))
, intermediates_dir = intermediates_dir
, verbose = verbose))
}

# nolint start
Expand All @@ -162,7 +165,7 @@ es_search <- function(es_host
# [param] es_host A string identifying an Elasticsearch host. This should be of the form
# [transfer_protocol][hostname]:[port]. For example, 'http://myindex.thing.com:9200'.
# [param] es_index The name of an Elasticsearch index to be queried.
# [param] size Number of records per page of results. See \href{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-from-size}{Elasticsearch docs} for more
# [param] size Number of records per page of results. See \href{https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-scroll}{Elasticsearch docs} for more
# [param] query_body String with a valid Elasticsearch query to be passed to \code{\link[elastic]{Search}}.
# Default is an empty query.
# [param] scroll How long should the scroll context be held open? This should be a
Expand Down Expand Up @@ -190,6 +193,7 @@ es_search <- function(es_host
# value longer than an hour, set \code{ignore_scroll_restriction}
# to \code{TRUE}.
# [param] intermediates_dir passed through from es_search. See es_search docs.
# [param] verbose TRUE to print DEBUG-level logs.
# [examples]
# \dontrun{
#
Expand Down Expand Up @@ -224,6 +228,7 @@ es_search <- function(es_host
, break_on_duplicates
, ignore_scroll_restriction
, intermediates_dir
, verbose
) {

# Check es_host
Expand All @@ -238,7 +243,7 @@ es_search <- function(es_host
"\n\nIf you understand the costs and would like to make requests ",
"with a longer-lived context, re-run this function with ",
"ignore_scroll_restriction = TRUE.\n",
"\nPlease see https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-scroll ", # nolint[line_length]
"\nPlease see https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-scroll ", # nolint[line_length]
"for more information.")
.log_fatal(msg)
}
Expand Down Expand Up @@ -284,12 +289,13 @@ es_search <- function(es_host
, es_index = es_index
, trailing_args = paste0("size=", size, "&scroll=", scroll)
, query_body = query_body
, verbose = verbose
)

# Parse to JSON to get total number of documents matching the query
firstResult <- jsonlite::fromJSON(firstResultJSON, simplifyVector = FALSE)

major_version <- .get_es_version(es_host)
major_version <- .get_es_version(es_host, verbose = verbose)
if (as.integer(major_version) > 6) {
hits_to_pull <- min(firstResult[["hits"]][["total"]][["value"]], max_hits)
} else {
Expand Down Expand Up @@ -458,6 +464,7 @@ es_search <- function(es_host
# hits_to_pull - Total hits to be pulled (documents matching user's query).
# Or, in the case where max_hits < number of matching docs,
# max_hits.
# verbose - TRUE to print DEBUG-level logs.
#' @importFrom jsonlite fromJSON
.keep_on_pullin <- function(scroll_id
, out_path
Expand All @@ -466,11 +473,12 @@ es_search <- function(es_host
, scroll
, hits_pulled
, hits_to_pull
, verbose
) {

# Note that the old scrolling strategy was deprecated in Elasticsearch 5.x and
# officially dropped in Elasticsearch 6.x. Need to grab the correct method here
major_version <- .get_es_version(es_host)
major_version <- .get_es_version(es_host, verbose = verbose)
scrolling_request <- switch(
major_version
, "1" = .legacy_scroll_request
Expand All @@ -487,6 +495,7 @@ es_search <- function(es_host
es_host = es_host
, scroll = scroll
, scroll_id = scroll_id
, verbose = verbose
)
.stop_for_status(result)
resultJSON <- .content(result, as = "text")
Expand Down Expand Up @@ -531,7 +540,7 @@ es_search <- function(es_host
# [name] .new_scroll_request
# [description] Make a scrolling request and return the result
# [references] https://www.elastic.co/guide/en/elasticsearch/reference/6.7/search-request-scroll.html
.new_scroll_request <- function(es_host, scroll, scroll_id) {
.new_scroll_request <- function(es_host, scroll, scroll_id, verbose) {

# Set up scroll_url
scroll_url <- paste0(es_host, "/_search/scroll") # nolint[absolute_path,non_portable_path]
Expand All @@ -541,14 +550,15 @@ es_search <- function(es_host
verb = "POST"
, url = scroll_url
, body = sprintf('{"scroll": "%s", "scroll_id": "%s"}', scroll, scroll_id)
, verbose = verbose
)
return(result)
}

# [title] Make a scroll request with the strategy supported by Elasticsearch 1.x and Elasticsearch 2.x
# [name] .legacy_scroll_request
# [description] Make a scrolling request and return the result
.legacy_scroll_request <- function(es_host, scroll, scroll_id) {
.legacy_scroll_request <- function(es_host, scroll, scroll_id, verbose) {

# Set up scroll_url
scroll_url <- paste0(es_host, "/_search/scroll?scroll=", scroll)
Expand All @@ -558,6 +568,7 @@ es_search <- function(es_host
verb = "POST"
, url = scroll_url
, body = scroll_id
, verbose = verbose
)
return(result)
}
Expand Down Expand Up @@ -594,7 +605,7 @@ es_search <- function(es_host
portPattern <- ":[0-9]+$"
if (! grepl(portPattern, es_host) == 1) {
msg <- paste0("No port found in es_host! es_host should be a string of the"
, "form [transfer_protocol][hostname]:[port]). for "
, " form [transfer_protocol][hostname]:[port]. For "
, "example: 'http://myindex.mysite.com:9200'")
.log_fatal(msg)
}
Expand Down Expand Up @@ -625,14 +636,15 @@ es_search <- function(es_host
# version of Elasticsearch.
# [param] es_host A string identifying an Elasticsearch host. This should be of the form
# [transfer_protocol][hostname]:[port]. For example, 'http://myindex.thing.com:9200'.
.get_es_version <- function(es_host) {
.get_es_version <- function(es_host, verbose) {

# Hit the cluster root to get metadata
.log_info("Checking Elasticsearch version...")
result <- .request(
verb = "GET"
, url = es_host
, body = NULL
, verbose = verbose
)
.stop_for_status(result)

Expand Down Expand Up @@ -669,6 +681,7 @@ es_search <- function(es_host
# For example, to limit the size of the returned results, you might pass
# "size=0". This can be a single string or a character vector of params, e.g.
# \code{c('size=0', 'scroll=5m')}
# [param] verbose TRUE to print DEBUG-level logs.
# [param] query_body A JSON string with valid Elasticsearch DSL
# [examples]
# \dontrun{
Expand Down Expand Up @@ -701,6 +714,7 @@ es_search <- function(es_host
, es_index
, trailing_args = NULL
, query_body
, verbose
) {

# Input checking
Expand All @@ -717,6 +731,7 @@ es_search <- function(es_host
verb = "POST"
, url = reqURL
, body = query_body
, verbose = verbose
)
.stop_for_status(result)
result <- .content(result, as = "text")
Expand Down
9 changes: 7 additions & 2 deletions r-pkg/R/get_fields.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
#' }
get_fields <- function(es_host
, es_indices = "_all"
, verbose = FALSE
) {

# Input checking
Expand All @@ -76,6 +77,7 @@ get_fields <- function(es_host

major_version <- .get_es_version(
es_host = es_host
, verbose = verbose
)

# The use of "_all" to indicate "all indices" was removed in Elasticsearch 7.
Expand All @@ -91,6 +93,7 @@ get_fields <- function(es_host
verb = "GET"
, url = sprintf("%s/_cat/indices?format=json", es_url)
, body = NULL
, verbose = verbose
)
indexDT <- data.table::as.data.table(
jsonlite::fromJSON(
Expand All @@ -112,6 +115,7 @@ get_fields <- function(es_host
verb = "GET"
, url = es_url
, body = NULL
, verbose = verbose
)
.stop_for_status(result)
resultContent <- .content(result, as = "parsed")
Expand Down Expand Up @@ -142,7 +146,7 @@ get_fields <- function(es_host
}

##################### get aliases for index names #########################
rawAliasDT <- .get_aliases(es_host = es_host)
rawAliasDT <- .get_aliases(es_host = es_host, verbose = verbose)
if (!is.null(rawAliasDT)) {

.log_info("Replacing index names with aliases")
Expand Down Expand Up @@ -221,7 +225,7 @@ get_fields <- function(es_host
# [es_host] A string identifying an Elasticsearch host.
#' @importFrom data.table as.data.table
#' @importFrom jsonlite fromJSON
.get_aliases <- function(es_host) {
.get_aliases <- function(es_host, verbose) {

# construct the url to the alias endpoint
url <- paste0(es_host, "/_cat/aliases") # nolint[absolute_path, non_portable_path]
Expand All @@ -231,6 +235,7 @@ get_fields <- function(es_host
verb = "GET"
, url = url
, body = NULL
, verbose = verbose
)
.stop_for_status(result)
resultContent <- .content(result, as = "text")
Expand Down
Loading