Skip to content

compiler: fix missing parallel omp flag for nested #2932

Open
mloubout wants to merge 1 commit into
main from
patch-nested-parallel
Open

compiler: fix missing parallel omp flag for nested #2932
mloubout wants to merge 1 commit into
main from
patch-nested-parallel

Conversation

@mloubout
Copy link
Copy Markdown
Contributor

No description provided.

@codecov
Copy link
Copy Markdown

codecov Bot commented May 14, 2026

Codecov Report

✅ All modified and coverable lines are covered by tests.
✅ Project coverage is 83.35%. Comparing base (ddb2459) to head (13f9357).

Additional details and impacted files
@@           Coverage Diff           @@
##             main    #2932   +/-   ##
=======================================
  Coverage   83.35%   83.35%           
=======================================
  Files         248      248           
  Lines       51734    51734           
  Branches     4463     4463           
=======================================
+ Hits        43122    43124    +2     
+ Misses       7859     7858    -1     
+ Partials      753      752    -1     
Flag Coverage Δ
pytest-gpu-aomp-amdgpuX 68.70% <ø> (+0.01%) ⬆️
pytest-gpu-gcc- 78.04% <ø> (-0.02%) ⬇️
pytest-gpu-icx- 77.96% <ø> (-0.01%) ⬇️
pytest-gpu-nvc-nvidiaX 69.24% <ø> (ø)

Flags with carried forward coverage won't be shown. Click here to find out more.

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@JDBetteridge
Copy link
Copy Markdown
Contributor

Hmmm, I'm still getting a fail with this branch...

@JDBetteridge
Copy link
Copy Markdown
Contributor

It's possible that it is unrelated

tests/test_dle.py::TestNodeParallelism::test_dynamic_nthreads

now passes, but

tests/test_dle.py::TestNodeParallelism::test_incr_perfect_sparse_outer

is failing here

@JDBetteridge
Copy link
Copy Markdown
Contributor

Generated code:

print(op.ccode)
/* Devito generated code for Operator `Kernel` */

#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;
#define MAX(a,b) (((a) > (b)) ? (a) : (b))

#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "omp.h"

/* Runtime descriptor for a Devito function's data: the raw buffer plus
 * per-dimension size/halo/offset metadata used to build the typed array
 * views inside Kernel (see the casts of ->size and ->data there). */
struct dataobj
{
  void *restrict data;      /* raw contiguous data buffer */
  int * size;               /* per-dimension extents; indexed as size[1..3] by Kernel */
  unsigned long nbytes;     /* total buffer size in bytes */
  unsigned long * npsize;   /* NOTE(review): presumably padded per-dimension sizes — confirm against Devito runtime */
  unsigned long * dsize;    /* NOTE(review): presumably domain (no-halo) sizes — confirm */
  int * hsize;              /* NOTE(review): presumably halo sizes per dimension — confirm */
  int * hofs;               /* NOTE(review): presumably halo offsets — confirm */
  int * oofs;               /* NOTE(review): presumably owned-region offsets — confirm */
  void * dmap;              /* opaque device mapping handle; unused in this kernel */
} ;

/* Per-section wall-clock accumulators, written by the STOP macro. */
struct profiler
{
  double section0;  /* elapsed seconds of the time loop's section0 */
} ;


/*
 * Sparse injection kernel: for each source point p_u, trilinearly spreads
 * its value into the 8 surrounding grid cells of u, accumulating with an
 * atomic update. The outer point loop is dynamically scheduled across
 * nthreads_nonaffine; the inner 2x2 interpolation sub-loop nest is itself
 * parallelized with nthreads_nested (the nested `parallel` flag this PR
 * restores).
 *
 * NOTE(review): the pasted listing declared BOTH the sparse-source array and
 * the grid array as `u` (and both parameters as `u_vec`), which is invalid C
 * (duplicate parameter name / redefinition). The sparse pair is renamed here
 * to `usrc_vec`/`usrc`; the actual Devito-generated names may differ —
 * confirm against the real op.ccode output.
 */
int Kernel(struct dataobj *restrict usrc_vec, struct dataobj *restrict u_vec, struct dataobj *restrict u_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_u_M, const int p_u_m, const int time_M, const int time_m, const int nthreads_nested, const int nthreads_nonaffine, struct profiler * timers)
{
  /* Typed, 64-byte-aligned array views over the raw data buffers. */
  float (*restrict usrc)[usrc_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[usrc_vec->size[1]]) usrc_vec->data;
  float (*restrict u)[u_vec->size[1]][u_vec->size[2]][u_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[u_vec->size[1]][u_vec->size[2]][u_vec->size[3]]) u_vec->data;
  float (*restrict u_coords)[u_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[u_coords_vec->size[1]]) u_coords_vec->data;

  /* t0 cycles over the 2 time buffers of u. */
  for (int time = time_m, t0 = (time)%(2); time <= time_M; time += 1, t0 = (time)%(2))
  {
    START(section0)
    #pragma omp parallel num_threads(nthreads_nonaffine)
    {
      /* Chunk so each thread gets ~3 chunks of the point range (load balance). */
      int chunk_size = (int)(MAX(1, (int)((1.0/3.0)*(p_u_M - p_u_m + 1)/nthreads_nonaffine)));
      #pragma omp for schedule(dynamic,chunk_size)
      for (int p_u = p_u_m; p_u <= p_u_M; p_u += 1)
      {
        for (int rp_ux = 0; rp_ux <= 1; rp_ux += 1)
        {
          /* Nested parallel region over the remaining 2x2 corner offsets. */
          #pragma omp parallel for collapse(2) schedule(static,1) num_threads(nthreads_nested)
          for (int rp_uy = 0; rp_uy <= 1; rp_uy += 1)
          {
            for (int rp_uz = 0; rp_uz <= 1; rp_uz += 1)
            {
              /* Grid cell containing the source point, and fractional offsets. */
              int posx = (int)(floorf((-o_x + u_coords[p_u][0])/h_x));
              int posy = (int)(floorf((-o_y + u_coords[p_u][1])/h_y));
              int posz = (int)(floorf((-o_z + u_coords[p_u][2])/h_z));
              float px = -floorf((-o_x + u_coords[p_u][0])/h_x) + (-o_x + u_coords[p_u][0])/h_x;
              float py = -floorf((-o_y + u_coords[p_u][1])/h_y) + (-o_y + u_coords[p_u][1])/h_y;
              float pz = -floorf((-o_z + u_coords[p_u][2])/h_z) + (-o_z + u_coords[p_u][2])/h_z;
              /* Skip corners that fall outside the (halo-extended) domain. */
              if (rp_ux + posx >= x_m - 1 && rp_uy + posy >= y_m - 1 && rp_uz + posz >= z_m - 1 && rp_ux + posx <= x_M + 1 && rp_uy + posy <= y_M + 1 && rp_uz + posz <= z_M + 1)
              {
                /* Trilinear weight for this corner times the source value. */
                float r0 = (rp_ux*px + (1 - rp_ux)*(1 - px))*(rp_uy*py + (1 - rp_uy)*(1 - py))*(rp_uz*pz + (1 - rp_uz)*(1 - pz))*usrc[time][p_u];
                /* Atomic: distinct points may inject into the same cell. */
                #pragma omp atomic update
                u[t0][rp_ux + posx + 1][rp_uy + posy + 1][rp_uz + posz + 1] += r0;
              }
            }
          }
        }
      }
    }
    STOP(section0,timers)
  }

  return 0;
}

@mloubout
Copy link
Copy Markdown
Contributor Author

Ok so for this one, it makes sense that it fails on ppc since it checks the pragmas (which only ppc supports with the nested). So maybe a skipif('ppc'). Can you list all the tests that fail on ppc on top of this one?

Copy link
Copy Markdown
Contributor

@FabioLuporini FabioLuporini left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

incredible, but if it passes all tests -- and there are many -- I guess it's OK...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants