From 09128f28adf043cd6bf69a77086a866ec7383849 Mon Sep 17 00:00:00 2001 From: Paul Wessel Date: Sun, 17 Jan 2021 18:37:42 -1000 Subject: [PATCH 1/5] Experiment with refactoring of grdsample Extract a separate gmtlib_grd_sample function in grdsample.c that does most of the heavy lifting and is called by GMT_grdsample. --- src/grdsample.c | 214 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 144 insertions(+), 70 deletions(-) diff --git a/src/grdsample.c b/src/grdsample.c index 79e2141770e..d86596fbfc2 100644 --- a/src/grdsample.c +++ b/src/grdsample.c @@ -173,20 +173,154 @@ static int parse (struct GMT_CTRL *GMT, struct GRDSAMPLE_CTRL *Ctrl, struct GMT_ #define bailout(code) {gmt_M_free_options (mode); return (code);} #define Return(code) {Free_Ctrl (GMT, Ctrl); gmt_end_module (GMT, GMT_cpy); bailout (code);} -EXTERN_MSC int GMT_grdsample (void *V_API, int mode, void *args) { - - int error = 0, row, col; - unsigned int registration; +EXTERN_MSC int gmtlib_grd_sample (void *V_API, struct GMT_GRID *Gin, double *wesn, double *incs, int registration, int mode, struct GMT_GRID **G) { + /* 2D equidistant sampling of input grid In. + * + * err = gmtlib_grdsample (API, Gin, wesn, incs, registration, mode, G) + * + * Input arguments: + * Gin: The input GMT grid + * wesn: Array with xmin, xmax, ymin, ymax for output, or NULL if same as input + * incs: Array with xinc and yinc, If yinc == 0 we set it to xinc + * reg: Registration (pixel or gridline) + * mode: Currently unused + * Output arguments + * G: The resulting grid structure with allocated memory + * err: The function return code, nonzero if a problem + */ + int shift = 0, row, col; uint64_t ij; + struct GMTAPI_CTRL *API; + struct GMT_CTRL *GMT; + struct GMT_GRID *Gout = NULL; + struct GMT_GRID_HEADER_HIDDEN *HH = NULL; + double lat, wesn_o[4], inc[2], *lon = NULL; + gmt_M_unused (mode); - char format[GMT_BUFSIZ]; + /* Sanity check on input arguments */ + if (V_API == NULL) return (GMT_NOT_A_SESSION); + if (Gin == NULL) return (GMT_PTR_IS_NULL); + if (G == NULL) return (GMT_PTR_IS_NULL); + if ((registration == GMT_GRID_NODE_REG || registration == GMT_GRID_PIXEL_REG)) return (GMT_VALUE_NOT_SET); + API = gmt_get_api_ptr (V_API); + GMT = API->GMT; + + gmt_M_memcpy (wesn_o, (wesn ? wesn : Gin->header->wesn), 4, double); /* wesn_o is the region we want for the output [Same as input] */ + gmt_M_memcpy (inc, (incs ? incs : Gin->header->inc), 2, double); /* Either a new increment is given or we use the one from the input grid */ + + if (wesn) { /* Gave a specific region */ + bool wrap_360_i = (gmt_M_is_geographic (GMT, GMT_IN) && gmt_M_360_range (Gin->header->wesn[XLO], Gin->header->wesn[XHI])); + bool wrap_360_o = (gmt_M_is_geographic (GMT, GMT_OUT) && gmt_M_360_range (wesn_o[XLO], wesn_o[XHI])); + + /* Adjust wesn_o used to CREATE the output grid: It cannot exceed the input grid bounds */ + while (wesn_o[YLO] < Gin->header->wesn[YLO]) wesn_o[YLO] += inc[GMT_Y]; /* Now on or inside boundary */ + while (wesn_o[YHI] > Gin->header->wesn[YHI]) wesn_o[YHI] -= inc[GMT_Y]; /* Now on or inside boundary */ + if (gmt_M_is_geographic (GMT, GMT_IN)) { /* Must carefully check the longitude overlap */ + if (Gin->header->wesn[XHI] < wesn_o[XLO]) shift += 360; + else if (Gin->header->wesn[XLO] > wesn_o[XHI]) shift -= 360; + else if (wrap_360_i && wesn_o[XHI] > Gin->header->wesn[XHI]) shift += 360; + else if (wrap_360_i && wesn_o[XLO] < Gin->header->wesn[XLO]) shift -= 360; + if (shift) { /* Must modify header */ + Gin->header->wesn[XLO] += shift, Gin->header->wesn[XHI] += shift; + GMT_Report (API, GMT_MSG_DEBUG, "Input grid region needed a %d longitude adjustment to fit final grid region\n", shift); + } + } + if (!wrap_360_o && !wrap_360_i) { /* Can only shrink wesn_o if there is no 360-wrapping going on */ + while (wesn_o[XLO] < Gin->header->wesn[XLO]) wesn_o[XLO] += inc[GMT_X]; /* Now on or inside boundary */ + while (wesn_o[XHI] > Gin->header->wesn[XHI]) wesn_o[XHI] -= inc[GMT_X]; /* Now on or inside boundary */ + } + } + + /* Here, wesn_o is the region we wish to use when creating the output grid */ + + if ((Gout = GMT_Create_Data (API, GMT_IS_GRID, GMT_IS_SURFACE, GMT_CONTAINER_AND_DATA, NULL, wesn_o, inc, \ + registration, GMT_NOTSET, NULL)) == NULL) return (API->error); + + if (gmt_M_is_verbose (GMT, GMT_MSG_INFORMATION)) { + char format[GMT_BUFSIZ] = {""}; + sprintf (format, "Input grid (%s/%s/%s/%s) n_columns = %%d n_rows = %%d dx = %s dy = %s registration = %%d\n", + GMT->current.setting.format_float_out, GMT->current.setting.format_float_out, GMT->current.setting.format_float_out, + GMT->current.setting.format_float_out, GMT->current.setting.format_float_out, GMT->current.setting.format_float_out); + + GMT_Report (API, GMT_MSG_INFORMATION, format, Gin->header->wesn[XLO], Gin->header->wesn[XHI], + Gin->header->wesn[YLO], Gin->header->wesn[YHI], Gin->header->n_columns, Gin->header->n_rows, + Gin->header->inc[GMT_X], Gin->header->inc[GMT_Y], Gin->header->registration); + + memcpy (&format, "Output", 6); + + GMT_Report (API, GMT_MSG_INFORMATION, format, Gout->header->wesn[XLO], Gout->header->wesn[XHI], + Gout->header->wesn[YLO], Gout->header->wesn[YHI], Gout->header->n_columns, Gout->header->n_rows, + Gout->header->inc[GMT_X], Gout->header->inc[GMT_Y], Gout->header->registration); + } + + if (Gout->header->inc[GMT_X] > Gin->header->inc[GMT_X]) + GMT_Report (API, GMT_MSG_WARNING, "Output sampling interval in x exceeds input interval and may lead to aliasing.\n"); + if (Gout->header->inc[GMT_Y] > Gin->header->inc[GMT_Y]) + GMT_Report (API, GMT_MSG_WARNING, "Output sampling interval in y exceeds input interval and may lead to aliasing.\n"); + + /* Precalculate longitudes from the output grid layout */ - double *lon = NULL, lat, wesn_i[4], wesn_o[4], inc[2]; + HH = gmt_get_H_hidden (Gin->header); + lon = gmt_M_memory (GMT, NULL, Gout->header->n_columns, double); + for (col = 0; col < (int)Gout->header->n_columns; col++) { + lon[col] = gmt_M_grd_col_to_x (GMT, col, Gout->header); + if (!HH->nxp) + /* Nothing */; + else if (lon[col] > Gin->header->wesn[XHI]) + lon[col] -= Gin->header->inc[GMT_X] * HH->nxp; + else if (lon[col] < Gin->header->wesn[XLO]) + lon[col] += Gin->header->inc[GMT_X] * HH->nxp; + } + + /* Loop over input point and estimate output values */ + + Gout->header->z_min = FLT_MAX; Gout->header->z_max = -FLT_MAX; /* Min/max for out */ + +#ifdef _OPENMP +#pragma omp parallel for private(row,col,lat,ij) shared(GMT,Gin,Gout,lon) +#endif + for (row = 0; row < (int)Gout->header->n_rows; row++) { + lat = gmt_M_grd_row_to_y (GMT, row, Gout->header); + if (!HH->nyp) + /* Nothing */; + else if (lat > Gin->header->wesn[YHI]) + lat -= Gin->header->inc[GMT_Y] * HH->nyp; + else if (lat < Gin->header->wesn[YLO]) + lat += Gin->header->inc[GMT_Y] * HH->nyp; + for (col = 0; col < (int)Gout->header->n_columns; col++) { + ij = gmt_M_ijp (Gout->header, row, col); + Gout->data[ij] = (gmt_grdfloat)gmt_bcr_get_z (GMT, Gin, lon[col], lat); + if (Gout->data[ij] < Gout->header->z_min) Gout->header->z_min = Gout->data[ij]; + if (Gout->data[ij] > Gout->header->z_max) Gout->header->z_max = Gout->data[ij]; + } + } + gmt_M_free (GMT, lon); + + if (!GMT->common.n.truncate && (Gout->header->z_min < Gin->header->z_min || Gout->header->z_max > Gin->header->z_max)) { /* Report and possibly truncate output to input extrama */ + GMT_Report (GMT->parent, GMT_MSG_INFORMATION, "Output grid extrema [%g/%g] exceeds extrema of input grid [%g/%g]; to clip output use -n...+c""\n", + Gout->header->z_min, Gout->header->z_max, Gin->header->z_min, Gin->header->z_max); + } + + if (shift) { /* Must restore input header wesn */ + Gin->header->wesn[XLO] -= shift, Gin->header->wesn[XHI] -= shift; + } + *G = Gout; /* Pass out the new grid */ + + API->error = GMT_NOERROR; + + return (GMT_NOERROR); +} + +EXTERN_MSC int GMT_grdsample (void *V_API, int mode, void *args) { + + int error = 0; + unsigned int registration; + + double wesn_i[4], wesn_o[4], inc[2]; struct GRDSAMPLE_CTRL *Ctrl = NULL; struct GMT_GRID *Gin = NULL, *Gout = NULL; - struct GMT_GRID_HEADER_HIDDEN *HH = NULL; struct GMT_CTRL *GMT = NULL, *GMT_cpy = NULL; struct GMT_OPTION *options = NULL; struct GMTAPI_CTRL *API = gmt_get_api_ptr (V_API); /* Cast from void to GMTAPI_CTRL pointer */ @@ -300,74 +434,14 @@ EXTERN_MSC int GMT_grdsample (void *V_API, int mode, void *args) { /* Here, wesn_i is compatible with the INPUT grid so we can read the subset from which we will resample. * Here, wesn_o is the region we wish to use when creating the output grid */ - if ((Gout = GMT_Create_Data (API, GMT_IS_GRID, GMT_IS_SURFACE, GMT_CONTAINER_AND_DATA, NULL, wesn_o, inc, \ - registration, GMT_NOTSET, NULL)) == NULL) Return (API->error); - - sprintf (format, "Input grid (%s/%s/%s/%s) n_columns = %%d n_rows = %%d dx = %s dy = %s registration = %%d\n", - GMT->current.setting.format_float_out, GMT->current.setting.format_float_out, GMT->current.setting.format_float_out, - GMT->current.setting.format_float_out, GMT->current.setting.format_float_out, GMT->current.setting.format_float_out); - - GMT_Report (API, GMT_MSG_INFORMATION, format, Gin->header->wesn[XLO], Gin->header->wesn[XHI], - Gin->header->wesn[YLO], Gin->header->wesn[YHI], Gin->header->n_columns, Gin->header->n_rows, - Gin->header->inc[GMT_X], Gin->header->inc[GMT_Y], Gin->header->registration); - - memcpy (&format, "Output", 6); - - GMT_Report (API, GMT_MSG_INFORMATION, format, Gout->header->wesn[XLO], Gout->header->wesn[XHI], - Gout->header->wesn[YLO], Gout->header->wesn[YHI], Gout->header->n_columns, Gout->header->n_rows, - Gout->header->inc[GMT_X], Gout->header->inc[GMT_Y], Gout->header->registration); - if (GMT_Read_Data (API, GMT_IS_GRID, GMT_IS_FILE, GMT_IS_SURFACE, GMT_DATA_ONLY, wesn_i, Ctrl->In.file, Gin) == NULL) { /* Get subset */ Return (API->error); } - if (Gout->header->inc[GMT_X] > Gin->header->inc[GMT_X]) - GMT_Report (API, GMT_MSG_WARNING, "Output sampling interval in x exceeds input interval and may lead to aliasing.\n"); - if (Gout->header->inc[GMT_Y] > Gin->header->inc[GMT_Y]) - GMT_Report (API, GMT_MSG_WARNING, "Output sampling interval in y exceeds input interval and may lead to aliasing.\n"); - - /* Precalculate longitudes from the output grid layout */ + /* Now do the resampling of the grid, getting Gout back */ - HH = gmt_get_H_hidden (Gin->header); - lon = gmt_M_memory (GMT, NULL, Gout->header->n_columns, double); - for (col = 0; col < (int)Gout->header->n_columns; col++) { - lon[col] = gmt_M_grd_col_to_x (GMT, col, Gout->header); - if (!HH->nxp) - /* Nothing */; - else if (lon[col] > Gin->header->wesn[XHI]) - lon[col] -= Gin->header->inc[GMT_X] * HH->nxp; - else if (lon[col] < Gin->header->wesn[XLO]) - lon[col] += Gin->header->inc[GMT_X] * HH->nxp; - } - - /* Loop over input point and estimate output values */ - - Gout->header->z_min = FLT_MAX; Gout->header->z_max = -FLT_MAX; /* Min/max for out */ - -#ifdef _OPENMP -#pragma omp parallel for private(row,col,lat,ij) shared(GMT,Gin,Gout,lon) -#endif - for (row = 0; row < (int)Gout->header->n_rows; row++) { - lat = gmt_M_grd_row_to_y (GMT, row, Gout->header); - if (!HH->nyp) - /* Nothing */; - else if (lat > Gin->header->wesn[YHI]) - lat -= Gin->header->inc[GMT_Y] * HH->nyp; - else if (lat < Gin->header->wesn[YLO]) - lat += Gin->header->inc[GMT_Y] * HH->nyp; - for (col = 0; col < (int)Gout->header->n_columns; col++) { - ij = gmt_M_ijp (Gout->header, row, col); - Gout->data[ij] = (gmt_grdfloat)gmt_bcr_get_z (GMT, Gin, lon[col], lat); - if (Gout->data[ij] < Gout->header->z_min) Gout->header->z_min = Gout->data[ij]; - if (Gout->data[ij] > Gout->header->z_max) Gout->header->z_max = Gout->data[ij]; - } - } - gmt_M_free (GMT, lon); - - if (!GMT->common.n.truncate && (Gout->header->z_min < Gin->header->z_min || Gout->header->z_max > Gin->header->z_max)) { /* Report and possibly truncate output to input extrama */ - GMT_Report (GMT->parent, GMT_MSG_INFORMATION, "Output grid extrema [%g/%g] exceeds extrema of input grid [%g/%g]; to clip output use -n...+c""\n", - Gout->header->z_min, Gout->header->z_max, Gin->header->z_min, Gin->header->z_max); - } + if (gmtlib_grd_sample (API, Gin, wesn_o, inc, registration, 0, &Gout)) + Return (API->error); gmt_set_pad (GMT, API->pad); /* Reset to session default pad before output */ From e087bcf201fcd87a6362ceb2f0eb2a343ac007a6 Mon Sep 17 00:00:00 2001 From: Paul Wessel Date: Sun, 17 Jan 2021 19:59:48 -1000 Subject: [PATCH 2/5] Update grdsample.c --- src/grdsample.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/grdsample.c b/src/grdsample.c index d86596fbfc2..19c29eafbe1 100644 --- a/src/grdsample.c +++ b/src/grdsample.c @@ -202,7 +202,7 @@ EXTERN_MSC int gmtlib_grd_sample (void *V_API, struct GMT_GRID *Gin, double *wes if (V_API == NULL) return (GMT_NOT_A_SESSION); if (Gin == NULL) return (GMT_PTR_IS_NULL); if (G == NULL) return (GMT_PTR_IS_NULL); - if ((registration == GMT_GRID_NODE_REG || registration == GMT_GRID_PIXEL_REG)) return (GMT_VALUE_NOT_SET); + if (!(registration == GMT_GRID_NODE_REG || registration == GMT_GRID_PIXEL_REG)) return (GMT_VALUE_NOT_SET); API = gmt_get_api_ptr (V_API); GMT = API->GMT; From 9ad2d7959b1574a824d4ae92c93287f3a9c47a7d Mon Sep 17 00:00:00 2001 From: Paul Wessel Date: Mon, 18 Jan 2021 09:40:26 -1000 Subject: [PATCH 3/5] Update grdsample.c --- src/grdsample.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/grdsample.c b/src/grdsample.c index 19c29eafbe1..e66bb812fa4 100644 --- a/src/grdsample.c +++ b/src/grdsample.c @@ -179,10 +179,11 @@ EXTERN_MSC int gmtlib_grd_sample (void *V_API, struct GMT_GRID *Gin, double *wes * err = gmtlib_grdsample (API, Gin, wesn, incs, registration, mode, G) * * Input arguments: + * API: Pointer to the GMT API structure * Gin: The input GMT grid * wesn: Array with xmin, xmax, ymin, ymax for output, or NULL if same as input - * incs: Array with xinc and yinc, If yinc == 0 we set it to xinc - * reg: Registration (pixel or gridline) + * incs: Array with xinc and yinc, If yinc == 0 we set it to xinc, NULL means same as input grid + * reg: Registration (pixel or gridline). GMT_NOTSET means same as input grid * mode: Currently unused * Output arguments * G: The resulting grid structure with allocated memory @@ -202,14 +203,18 @@ EXTERN_MSC int gmtlib_grd_sample (void *V_API, struct GMT_GRID *Gin, double *wes if (V_API == NULL) return (GMT_NOT_A_SESSION); if (Gin == NULL) return (GMT_PTR_IS_NULL); if (G == NULL) return (GMT_PTR_IS_NULL); - if (!(registration == GMT_GRID_NODE_REG || registration == GMT_GRID_PIXEL_REG)) return (GMT_VALUE_NOT_SET); + if (!(registration == GMT_NOTSET || registration == GMT_GRID_NODE_REG || registration == GMT_GRID_PIXEL_REG)) return (GMT_VALUE_NOT_SET); + if (incs && gmt_M_is_zero (incs[GMT_X])) return (GMT_VALUE_NOT_SET); + + /* Get API and GMT pointers */ API = gmt_get_api_ptr (V_API); GMT = API->GMT; gmt_M_memcpy (wesn_o, (wesn ? wesn : Gin->header->wesn), 4, double); /* wesn_o is the region we want for the output [Same as input] */ gmt_M_memcpy (inc, (incs ? incs : Gin->header->inc), 2, double); /* Either a new increment is given or we use the one from the input grid */ - - if (wesn) { /* Gave a specific region */ + if (gmt_M_is_zero (inc[GMT_Y])) inc[GMT_Y] = inc[GMT_X]; + if (registration == GMT_NOTSET) registration = Gin->header->registration; + if (wesn) { /* Gave a specific region */ bool wrap_360_i = (gmt_M_is_geographic (GMT, GMT_IN) && gmt_M_360_range (Gin->header->wesn[XLO], Gin->header->wesn[XHI])); bool wrap_360_o = (gmt_M_is_geographic (GMT, GMT_OUT) && gmt_M_360_range (wesn_o[XLO], wesn_o[XHI])); From 73c477479b0ae5b17e7778b560408b49490efa21 Mon Sep 17 00:00:00 2001 From: Paul Wessel Date: Tue, 19 Jan 2021 10:35:42 -1000 Subject: [PATCH 4/5] updates --- doc/rst/source/devdocs.rst | 4 +- doc/rst/source/devdocs/refactoring.rst | 276 +++++++++++++++++++++++++ 2 files changed, 279 insertions(+), 1 deletion(-) create mode 100644 doc/rst/source/devdocs/refactoring.rst diff --git a/doc/rst/source/devdocs.rst b/doc/rst/source/devdocs.rst index 3ca9194d7f4..22f30122909 100644 --- a/doc/rst/source/devdocs.rst +++ b/doc/rst/source/devdocs.rst @@ -5,10 +5,12 @@ Developer Documentation This section contains low-level documentation for how some aspects of GMT have been implemented or are in the planning-stages for design and implementation. Users with ideas for simplifications and general improvements are encouraged to open an issue and -share their thoughts with us. +share their thoughts with us. Descriptions in this document may change with no advance +notice! .. toctree:: :maxdepth: 1 :numbered: devdocs/long_options + devdocs/refactoring diff --git a/doc/rst/source/devdocs/refactoring.rst b/doc/rst/source/devdocs/refactoring.rst new file mode 100644 index 00000000000..aa9379a403f --- /dev/null +++ b/doc/rst/source/devdocs/refactoring.rst @@ -0,0 +1,276 @@ +.. _Refactoring: + +Proposed Refactoring Implementation +=================================== + +These are the rambling thoughts of Paul Wessel on the C refactoring. I start with explaining +how the present API came into being and why, what the problems are, and what the +challenges will be in doing a refactoring, including what that might mean. + +What is the problem? +-------------------- + +The GMT API as of GMT 6.x is an unusual beast as far as APIs go. Let me explain. +When GMT 5 was released we had repackaged all the GMT programs (e.g., pscoast.c, surface.c) +into API modules. Those source files were still there, but instead of having a main function +required for all C programs, they instead defined module functions called GMT_pscoast, +GMT_surface, etc. These all had the same argument list, e.g. + +:: + + int GMT_surface (void *API, int mode, void *args) + +where all the arguments necessary for surface to run were passed via the *args* pointer. +This module, as all others, would have the following common work flow: + +#. Create a linked list of GMT options from the two last arguments +#. Parse any GMT common options given to the module +#. Parse the module-specific options given to the module +#. Read the input data (if required) +#. Perform the calculation +#. Write the output (data or plotting) + +Since there no longer is a separate **surface** program, for command-line work we rely on the +single program gmt.c which does the following: + +#. Creates a new GMT session and return API pointer to internal structure +#. Calls the selected module via GMT_Call_Module and pass the arguments +#. Destroys the GMT session + +The only price users had to pay when moving from GMT 4 to 5 was to get used to running modules as argument to +the **gmt** program, e.g. + +:: + + gmt surface -R0/5/0/6 -I5m my_table.txt -Gmy_grid.grd + +and if that was just too hard to get used to we added aliases and functions for shells +as well as symbolic links to gmt in the name of all the modules, so if you compiled GMT +yourself you could pretend to be in GMT 4 forever and run + +:: + + surface -R0/5/0/6 -I5m my_table.txt -Gmy_grid.grd + +However, some modules are built slightly differently as far as i/o goes. They instead follow this model: + +1. Create a linked list of GMT options from the two last arguments +2. Parse any GMT common options given to the module +3. Parse the module-specific options given to the module +4. Loop over input records until EOF + + 4.1. Update the main calculation + +5. Finalize calculation +6. Write the output (data or plotting) + +Lets call this method **B** while the first one we introduced is method **A**. We will return +to the question of why we have two methods in the first place. + +Thus, all the work with the API basically resulted in no change as far as users go. So +that begs two questions: + +#. Why did we develop an API if it made no difference to users? +#. Why did we use the above design? + +The answer to the first is this: We wanted to make it possible for a wide range of +users and developers to take advantage of GMT's capabilities from different environments, +such as + +#. Custom C, C++, and even FORTRAN programs +#. Passing data to and from GMT modules in MATLAB +#. Passing data to and from GMT modules in Python +#. Passing data to and from GMT modules in ??? + + +There had been people who had used GMT from MATLAB and Python before, but +apart from a basic read/write module for grids in MATLAB, any real work would require +writing data to a temp file, use a system call to run a GMT program, and then +read the result back into MATLAB or Python. The API was designed to avoid the +system calls and instead call GMT API functions directly, avoiding temp files. + +However, there was a problem. The original GMT programs were all stand-along +programs: As discussed above, they would deal with option parsing, the i/o, and the calculations or plotting. +In contrast, MATLAB or Python users may already have data in memory and want +that to be used in the modules *without* temporary files. We faced a big decision +and not knowing all the ramifications picked the simplest one that would most +quickly get us to a working system: We simply converted the main function of +the C programs to a callable function. That meant all the steps that a module does listed +above now happens inside an API function: parsing of arguments, i/o of data, and +calculations. An immediate dilemma then was how do you pass data residing in memory into +a module that expects to read from files? The solution that was selected was +what we now call *virtual files*. These are special filenames that contain meta-data +about memory locations and are passed as regular filenames. However, deep in the +under-belly of the GMT i/o layer we know what these filenames mean and instead of +reading a file (e.g., a grid) into a memory structure for a grid we instead +simply pass the memory pointer to the grid we registered as a virtual file directly. So no +reading, just passing memory, and the module is completely unaware of this. +Because of this scheme we were able to convert all ~150 modules +to an API. The downside came later when we started working on the GMT/MATLAB wrapper +first and later the GMT/Python wrapper (PyGMT). Unhappy with MATLAB in general, +Joaquim then got excited about the new language Julia, and building on the experience +with GMT/MATLAB he quickly developed a wrapper around the GMT C API to provide a +modern keyword/value interface to GMT from Julia. + +What are the refactoring goals? +------------------------------- + +The goal of the C refactoring is to produce a second-generation API that +separates the i/o part from the calculation part. Specifically, it would provide +access to a new set of functions that expect input and parameters among the function +arguments and that return output back via other arguments. For now, we will +focus on modules that do data processing and not plotting, since the latter tend +to be used for adding map layers and there are additional complications here (for instance, +we have no particular interest in receiving an incomplete layer of *PostScript* code). +So, the goal is simple to state. However, there are complications with this scheme: + +1. All modules require a combination of common GMT settings as well as module-specific + settings. These can range from a few to many. How complicated do we want the + function interface to be? Internally, each module loops over the command-line + options (step 1 for each model) and assigns a myriad of internal variables that + are used in the execution of the algorithm. If there is no parsing of any command + line, how are these set? +2. The input is expected to be passed via function arguments. However, remember + methods **A** and **B**. In case of **B** we never have the input data in memory + as it is processed record by record from a file. This was done for two reasons: + + 2.1. The algorithm lends itself to this (e.g., sum up values, plot a single point) + so it was naturally implemented that way -- there was no reason to actually hold + all the data in memory at the same time. + + 2.2. Some of the processing tools are expected + to handle very large amounts of data records, beyond what most computers would be + able to hold in memory. That, of course, is an ever-moving target, but the concrete + example we cite is trying to compute spatial averages on a 15x15 arc-second grid + for all the multi-beam bathymetry data ever collected, pushing a billion records. + This may require over 100 Gb of RAM, which in 2021 is still too large. How can we + handle these special cases? + + 2.3. There may not be a simple way to add OpenMP to speed up calculations even if we + had the data in memory. + +Of course, in most cases the amount of data is manageable with method **A** and we +may eventually convert the few model **B** modules to follow model **A**. Perhaps it +is possible to let a few modules have both models implemented just in case. + +How will we pass all those module settings? +------------------------------------------- + +Because the GMT API is written in C we are limited to what that language has to offer. +I think there are four ways to pass module options into a new function that can +be called from various environments: + +#. Have a long list of possible variables that may be set. In that case the number + of variables is fixed even if not all are set. It may look like something like this:: + + gmtlib_surface (API, region, increment, registration, tension, input, output); + + where region is an array of *xmin, xmax, ymin, ymax* and increment holds the *xinc* and + *yinc* increments, etc., etc. However, :doc:`/surface` has many more settings that a user may + wish to access, so the number of arguments will quickly grow. Having a function with + 10-20 arguments is not friendly. Furthermore, as soon as it is released, the developers + decide to add yet another argument and now we must break the interface in the next release. +#. Another scheme relies on collecting all the possible options into a structure and then + just pass the pointer to the structure to the function. That way, we can still add new + settings as new members of the structure without breaking the function interface. We + do something like this when GMT communicates with GDAL. +#. The C language allows for a function with a variable-length argument list. This makes it possible to implement a + rudimentary keyword, value system where each string keyword is followed by a void pointer + to the argument. Internal parsing will use the keyword to know what its argument should + be and obtain the value. This avoids the breaking of the interface since optional arguments + do not have to be listed. It may look something like this:: + + gmtlib_surface (API, input, output, "region", wesn, "increment", incs, NULL); + + where we have changed the location of the input and output arguments so that the variable + list of keyword-value pairs can end with a NULL pointer. +#. Finally, we could imagine some combination of the single structure pointer and the + NULL-terminated list of keyword/argument pairs. This is the one I am leaning towards. + + +But first, in considering these four schemes I think we can exclude the first: it is simply unwieldy +given the many options each module may offer. The second scheme (structure) seems at first +promising: In fact, all modules already have such a structure that we use when we parse the +options internally. For instance, the surface module has *struct SURFACE_CTRL* with sub-structures +for each of the module-specific settings (tension, iteration count, over-relaxation, etc.). +While currently a private structure only used by the module, one can imagine making it an +external structure so that users could directly create a copy and fill in the values for +the sub-structures (options) they wish. Alas, this is not a simple procedure. The regular +parser functions (one which there is one for each module) often do considerable work in translating a user +text-string option into internal parameters. It is often not possible to simply enter the +value of the resulting parameter directly because intermediate steps are required that +calculates the final parameter. Hence, this scheme seems too low-level to be a flexible +alternative to the current high-level (but limited) options strings, although we should look +at this in detail before making that final determination. With that caveat, this leaves us with +the last two schemes. I think either would allow workable solutions. However, to ensure we +are able to document the functions, maintain the current command-line option parsers, and +allow for the long-option expansion discussed previously, I suggest the best solution may +be a combination: We create a keyword-argument basic structure and wrap an array of these +under a new GMT dictionary structure. It is this single pointer that we pass in for the +settings, but it is not filled out like the low-level structure above but populated from +a keyword-argument interface. + +Here is a sequence of commands that demonstrate how this might work. I am using C +syntax here but I anticipate simple wrappers around these if other languages need some +variations on the theme:: + + dict = gmt_create_dictionary (API); /* Create a new and empty GMT dictionary */ + gmt_put_option (API, dict, "region", GMT_DOUBLE, 4, wesn); /* Pass wesn array */ + gmt_put_modifier (API, dict, "region", "selection", GMT_CHAR, 0, "diagonal"); /* Pass modifier +r or +selection=diagonal */ + gmt_put_option (API, dict, "increment", GMT_DOUBLE, 2, incs); /* Pass incs array */ + gmt_put_option (API, dict, "registration", GMT_INT, 1, GMT_GRID_PIXEL_REG); + gmt_put_option (API, dict, "tension", GMT_DOUBLE, 1, tension); + +We can now call the module:: + + gmtlib_surface (API, dict, input, output); + +Internally, we simply use the dictionary to create a linked list of GMT options and let the +standard module parser translate the dictionary settings to the internal structure parameters. +By using the long-options format for options and modifiers we simply use that as the documentation +for both the command-line options/modifiers and the dictionary settings, and we can then easily +produce one from the other. + +How will we pass input and output? +---------------------------------- + +But what about the data that need to be passed in and out of a module? No longer relying +on virtual files, we need to pass memory locations. There are some complications here too: + +#. Some environments may wish to pass a matrix for table input while others may wish + to pass a set of column vectors. While many environments are able to combine column + vectors into a matrix easily, other (e.g., C) cannot. +#. A few modules may wish to give a filename instead of reading the data in first, + as explained during our discussion of model **B**. +#. Many use cases want to receive the data back to the calling environment but some may + wish to write to a file directly (again, because it may just be too big). +#. While passing in a table of data is not complicated, once we bring in grids there is + the complication of how a grid is represented in the environment. Perhaps it is a matrix. + Then, there has to be additional variables to inform on region and grid spacing, for instance. + Is the matrix a set of rows or a set of columns? What type is the grid (float, integer, char)? + How much of this variability should be support at this API level? + +Taking a lesson from how we handled settings, the simplest scheme for input and output is +to use a structure to pass the various items. Unlike the complexity of the myriad of +module-specific options and sub-structures, for input and output we may be able to use a standard +IO structure:: + + struct GMT_IO { + unsigned int mode; + unsigned int type; + void *data; + }; + +where the *type* integer controls *what* type of data is stored via the *data* pointer, +and the *mode* integer controls *how* it is stored. I suggest a separate structure is +used for input and output since for some modules we may need to pass NULL for one or +both of them to maintain the same interface with four arguments across all modules. +The loading of the IO structure can be done via functions as well: + +:: + + gmt_put_input (API, struct GMT_IO item, unsigned int type, unsigned int mode); + +with an example of passing a matrix to surface via a struct GMT_IO pointer looking like this:: + + gmt_define_io (API, input, GMT_DOUBLE, GMT_IS_MATRIX); From 59d3556bd6b2ff22f586e83b55c9eb97676140e8 Mon Sep 17 00:00:00 2001 From: Paul Wessel Date: Wed, 20 Jan 2021 07:31:56 -1000 Subject: [PATCH 5/5] WIll move the docs update to master --- doc/rst/source/devdocs.rst | 4 +- doc/rst/source/devdocs/refactoring.rst | 276 ------------------------- 2 files changed, 1 insertion(+), 279 deletions(-) delete mode 100644 doc/rst/source/devdocs/refactoring.rst diff --git a/doc/rst/source/devdocs.rst b/doc/rst/source/devdocs.rst index 22f30122909..3ca9194d7f4 100644 --- a/doc/rst/source/devdocs.rst +++ b/doc/rst/source/devdocs.rst @@ -5,12 +5,10 @@ Developer Documentation This section contains low-level documentation for how some aspects of GMT have been implemented or are in the planning-stages for design and implementation. Users with ideas for simplifications and general improvements are encouraged to open an issue and -share their thoughts with us. Descriptions in this document may change with no advance -notice! +share their thoughts with us. .. toctree:: :maxdepth: 1 :numbered: devdocs/long_options - devdocs/refactoring diff --git a/doc/rst/source/devdocs/refactoring.rst b/doc/rst/source/devdocs/refactoring.rst deleted file mode 100644 index aa9379a403f..00000000000 --- a/doc/rst/source/devdocs/refactoring.rst +++ /dev/null @@ -1,276 +0,0 @@ -.. _Refactoring: - -Proposed Refactoring Implementation -=================================== - -These are the rambling thoughts of Paul Wessel on the C refactoring. I start with explaining -how the present API came into being and why, what the problems are, and what the -challenges will be in doing a refactoring, including what that might mean. - -What is the problem? --------------------- - -The GMT API as of GMT 6.x is an unusual beast as far as APIs go. Let me explain. -When GMT 5 was released we had repackaged all the GMT programs (e.g., pscoast.c, surface.c) -into API modules. Those source files were still there, but instead of having a main function -required for all C programs, they instead defined module functions called GMT_pscoast, -GMT_surface, etc. These all had the same argument list, e.g. - -:: - - int GMT_surface (void *API, int mode, void *args) - -where all the arguments necessary for surface to run were passed via the *args* pointer. -This module, as all others, would have the following common work flow: - -#. Create a linked list of GMT options from the two last arguments -#. Parse any GMT common options given to the module -#. Parse the module-specific options given to the module -#. Read the input data (if required) -#. Perform the calculation -#. Write the output (data or plotting) - -Since there no longer is a separate **surface** program, for command-line work we rely on the -single program gmt.c which does the following: - -#. Creates a new GMT session and return API pointer to internal structure -#. Calls the selected module via GMT_Call_Module and pass the arguments -#. Destroys the GMT session - -The only price users had to pay when moving from GMT 4 to 5 was to get used to running modules as argument to -the **gmt** program, e.g. - -:: - - gmt surface -R0/5/0/6 -I5m my_table.txt -Gmy_grid.grd - -and if that was just too hard to get used to we added aliases and functions for shells -as well as symbolic links to gmt in the name of all the modules, so if you compiled GMT -yourself you could pretend to be in GMT 4 forever and run - -:: - - surface -R0/5/0/6 -I5m my_table.txt -Gmy_grid.grd - -However, some modules are built slightly differently as far as i/o goes. They instead follow this model: - -1. Create a linked list of GMT options from the two last arguments -2. Parse any GMT common options given to the module -3. Parse the module-specific options given to the module -4. Loop over input records until EOF - - 4.1. Update the main calculation - -5. Finalize calculation -6. Write the output (data or plotting) - -Lets call this method **B** while the first one we introduced is method **A**. We will return -to the question of why we have two methods in the first place. - -Thus, all the work with the API basically resulted in no change as far as users go. So -that begs two questions: - -#. Why did we develop an API if it made no difference to users? -#. Why did we use the above design? - -The answer to the first is this: We wanted to make it possible for a wide range of -users and developers to take advantage of GMT's capabilities from different environments, -such as - -#. Custom C, C++, and even FORTRAN programs -#. Passing data to and from GMT modules in MATLAB -#. Passing data to and from GMT modules in Python -#. Passing data to and from GMT modules in ??? - - -There had been people who had used GMT from MATLAB and Python before, but -apart from a basic read/write module for grids in MATLAB, any real work would require -writing data to a temp file, use a system call to run a GMT program, and then -read the result back into MATLAB or Python. The API was designed to avoid the -system calls and instead call GMT API functions directly, avoiding temp files. - -However, there was a problem. The original GMT programs were all stand-along -programs: As discussed above, they would deal with option parsing, the i/o, and the calculations or plotting. -In contrast, MATLAB or Python users may already have data in memory and want -that to be used in the modules *without* temporary files. We faced a big decision -and not knowing all the ramifications picked the simplest one that would most -quickly get us to a working system: We simply converted the main function of -the C programs to a callable function. That meant all the steps that a module does listed -above now happens inside an API function: parsing of arguments, i/o of data, and -calculations. An immediate dilemma then was how do you pass data residing in memory into -a module that expects to read from files? The solution that was selected was -what we now call *virtual files*. These are special filenames that contain meta-data -about memory locations and are passed as regular filenames. However, deep in the -under-belly of the GMT i/o layer we know what these filenames mean and instead of -reading a file (e.g., a grid) into a memory structure for a grid we instead -simply pass the memory pointer to the grid we registered as a virtual file directly. So no -reading, just passing memory, and the module is completely unaware of this. -Because of this scheme we were able to convert all ~150 modules -to an API. The downside came later when we started working on the GMT/MATLAB wrapper -first and later the GMT/Python wrapper (PyGMT). Unhappy with MATLAB in general, -Joaquim then got excited about the new language Julia, and building on the experience -with GMT/MATLAB he quickly developed a wrapper around the GMT C API to provide a -modern keyword/value interface to GMT from Julia. - -What are the refactoring goals? -------------------------------- - -The goal of the C refactoring is to produce a second-generation API that -separates the i/o part from the calculation part. Specifically, it would provide -access to a new set of functions that expect input and parameters among the function -arguments and that return output back via other arguments. For now, we will -focus on modules that do data processing and not plotting, since the latter tend -to be used for adding map layers and there are additional complications here (for instance, -we have no particular interest in receiving an incomplete layer of *PostScript* code). -So, the goal is simple to state. However, there are complications with this scheme: - -1. All modules require a combination of common GMT settings as well as module-specific - settings. These can range from a few to many. How complicated do we want the - function interface to be? Internally, each module loops over the command-line - options (step 1 for each model) and assigns a myriad of internal variables that - are used in the execution of the algorithm. If there is no parsing of any command - line, how are these set? -2. The input is expected to be passed via function arguments. However, remember - methods **A** and **B**. In case of **B** we never have the input data in memory - as it is processed record by record from a file. This was done for two reasons: - - 2.1. The algorithm lends itself to this (e.g., sum up values, plot a single point) - so it was naturally implemented that way -- there was no reason to actually hold - all the data in memory at the same time. - - 2.2. Some of the processing tools are expected - to handle very large amounts of data records, beyond what most computers would be - able to hold in memory. That, of course, is an ever-moving target, but the concrete - example we cite is trying to compute spatial averages on a 15x15 arc-second grid - for all the multi-beam bathymetry data ever collected, pushing a billion records. - This may require over 100 Gb of RAM, which in 2021 is still too large. How can we - handle these special cases? - - 2.3. There may not be a simple way to add OpenMP to speed up calculations even if we - had the data in memory. - -Of course, in most cases the amount of data is manageable with method **A** and we -may eventually convert the few model **B** modules to follow model **A**. Perhaps it -is possible to let a few modules have both models implemented just in case. - -How will we pass all those module settings? -------------------------------------------- - -Because the GMT API is written in C we are limited to what that language has to offer. -I think there are four ways to pass module options into a new function that can -be called from various environments: - -#. Have a long list of possible variables that may be set. In that case the number - of variables is fixed even if not all are set. It may look like something like this:: - - gmtlib_surface (API, region, increment, registration, tension, input, output); - - where region is an array of *xmin, xmax, ymin, ymax* and increment holds the *xinc* and - *yinc* increments, etc., etc. However, :doc:`/surface` has many more settings that a user may - wish to access, so the number of arguments will quickly grow. Having a function with - 10-20 arguments is not friendly. Furthermore, as soon as it is released, the developers - decide to add yet another argument and now we must break the interface in the next release. -#. Another scheme relies on collecting all the possible options into a structure and then - just pass the pointer to the structure to the function. That way, we can still add new - settings as new members of the structure without breaking the function interface. We - do something like this when GMT communicates with GDAL. -#. The C language allows for a function with a variable-length argument list. This makes it possible to implement a - rudimentary keyword, value system where each string keyword is followed by a void pointer - to the argument. Internal parsing will use the keyword to know what its argument should - be and obtain the value. This avoids the breaking of the interface since optional arguments - do not have to be listed. It may look something like this:: - - gmtlib_surface (API, input, output, "region", wesn, "increment", incs, NULL); - - where we have changed the location of the input and output arguments so that the variable - list of keyword-value pairs can end with a NULL pointer. -#. Finally, we could imagine some combination of the single structure pointer and the - NULL-terminated list of keyword/argument pairs. This is the one I am leaning towards. - - -But first, in considering these four schemes I think we can exclude the first: it is simply unwieldy -given the many options each module may offer. The second scheme (structure) seems at first -promising: In fact, all modules already have such a structure that we use when we parse the -options internally. For instance, the surface module has *struct SURFACE_CTRL* with sub-structures -for each of the module-specific settings (tension, iteration count, over-relaxation, etc.). -While currently a private structure only used by the module, one can imagine making it an -external structure so that users could directly create a copy and fill in the values for -the sub-structures (options) they wish. Alas, this is not a simple procedure. The regular -parser functions (one which there is one for each module) often do considerable work in translating a user -text-string option into internal parameters. It is often not possible to simply enter the -value of the resulting parameter directly because intermediate steps are required that -calculates the final parameter. Hence, this scheme seems too low-level to be a flexible -alternative to the current high-level (but limited) options strings, although we should look -at this in detail before making that final determination. With that caveat, this leaves us with -the last two schemes. I think either would allow workable solutions. However, to ensure we -are able to document the functions, maintain the current command-line option parsers, and -allow for the long-option expansion discussed previously, I suggest the best solution may -be a combination: We create a keyword-argument basic structure and wrap an array of these -under a new GMT dictionary structure. It is this single pointer that we pass in for the -settings, but it is not filled out like the low-level structure above but populated from -a keyword-argument interface. - -Here is a sequence of commands that demonstrate how this might work. I am using C -syntax here but I anticipate simple wrappers around these if other languages need some -variations on the theme:: - - dict = gmt_create_dictionary (API); /* Create a new and empty GMT dictionary */ - gmt_put_option (API, dict, "region", GMT_DOUBLE, 4, wesn); /* Pass wesn array */ - gmt_put_modifier (API, dict, "region", "selection", GMT_CHAR, 0, "diagonal"); /* Pass modifier +r or +selection=diagonal */ - gmt_put_option (API, dict, "increment", GMT_DOUBLE, 2, incs); /* Pass incs array */ - gmt_put_option (API, dict, "registration", GMT_INT, 1, GMT_GRID_PIXEL_REG); - gmt_put_option (API, dict, "tension", GMT_DOUBLE, 1, tension); - -We can now call the module:: - - gmtlib_surface (API, dict, input, output); - -Internally, we simply use the dictionary to create a linked list of GMT options and let the -standard module parser translate the dictionary settings to the internal structure parameters. -By using the long-options format for options and modifiers we simply use that as the documentation -for both the command-line options/modifiers and the dictionary settings, and we can then easily -produce one from the other. - -How will we pass input and output? ----------------------------------- - -But what about the data that need to be passed in and out of a module? No longer relying -on virtual files, we need to pass memory locations. There are some complications here too: - -#. Some environments may wish to pass a matrix for table input while others may wish - to pass a set of column vectors. While many environments are able to combine column - vectors into a matrix easily, other (e.g., C) cannot. -#. A few modules may wish to give a filename instead of reading the data in first, - as explained during our discussion of model **B**. -#. Many use cases want to receive the data back to the calling environment but some may - wish to write to a file directly (again, because it may just be too big). -#. While passing in a table of data is not complicated, once we bring in grids there is - the complication of how a grid is represented in the environment. Perhaps it is a matrix. - Then, there has to be additional variables to inform on region and grid spacing, for instance. - Is the matrix a set of rows or a set of columns? What type is the grid (float, integer, char)? - How much of this variability should be support at this API level? - -Taking a lesson from how we handled settings, the simplest scheme for input and output is -to use a structure to pass the various items. Unlike the complexity of the myriad of -module-specific options and sub-structures, for input and output we may be able to use a standard -IO structure:: - - struct GMT_IO { - unsigned int mode; - unsigned int type; - void *data; - }; - -where the *type* integer controls *what* type of data is stored via the *data* pointer, -and the *mode* integer controls *how* it is stored. I suggest a separate structure is -used for input and output since for some modules we may need to pass NULL for one or -both of them to maintain the same interface with four arguments across all modules. -The loading of the IO structure can be done via functions as well: - -:: - - gmt_put_input (API, struct GMT_IO item, unsigned int type, unsigned int mode); - -with an example of passing a matrix to surface via a struct GMT_IO pointer looking like this:: - - gmt_define_io (API, input, GMT_DOUBLE, GMT_IS_MATRIX);