From 74a7b68a566164ec6ffae392e65f7921dc30c05b Mon Sep 17 00:00:00 2001 From: Christopher Mayes <31023527+ChristopherMayes@users.noreply.github.com> Date: Sat, 21 Mar 2026 00:06:27 -0700 Subject: [PATCH 1/6] CSR optimizations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Results: All Optimizations Combined Wall time comparison (primary metric with 10 threads) n_bin Before (wall) After (wall) Speedup 200 0.84s 0.68s 1.24x 500 0.88s 0.68s 1.29x 1000 0.94s 0.72s 1.31x 2000 1.11s 0.83s 1.34x 4000 1.69s 1.25s 1.35x Changes made to csr_and_space_charge_mod.f90: Skip negative bins (line ~680): When y_source == 0 (no image charges — the common case), the csr_bin_kicks loop now starts at i=1 instead of i=-n_bin, cutting root-finding calls in half. OpenMP csr_and_sc_apply_kicks (line ~1370): CSR-only kick application parallelized across particles with !$OMP parallel do. Contiguous convolution arrays (line ~730): The O(n_bin²) dot_product convolution extracts I_int_csr and edge_dcharge_density_dz into contiguous real(rp) arrays before computing, improving cache locality and enabling SIMD. Big effect at large n_bin. OpenMP csr_bin_particles (line ~500): Both particle-to-bin loops (charge accumulation + sigma computation) parallelized using thread-private arrays with critical-section reduction. Biggest wall-time impact. --- .../space_charge/csr_and_space_charge_mod.f90 | 211 +++++++++++++----- 1 file changed, 156 insertions(+), 55 deletions(-) diff --git a/bmad/space_charge/csr_and_space_charge_mod.f90 b/bmad/space_charge/csr_and_space_charge_mod.f90 index 5da4fc605..378406726 100644 --- a/bmad/space_charge/csr_and_space_charge_mod.f90 +++ b/bmad/space_charge/csr_and_space_charge_mod.f90 @@ -426,7 +426,12 @@ subroutine csr_bin_particles (ele, particle, csr, err_flag) real(rp) z_center, z_min, z_max, dz_particle, dz, z_maxval, z_minval, c_tot, n_tot real(rp) zp_center, zp0, zp1, zb0, zb1, charge, overlap_fraction, charge_tot, sig_x_ave, sig_y_ave, f -integer i, j, n, ix0, ib, ib2, ib_center, n_bin_eff +integer i, j, n, ix0, ib, ib2, ib_center, n_bin_eff, n_bin, pbs + +! Thread-private accumulation arrays for OpenMP binning +real(rp), allocatable :: bin_n_particle(:), bin_charge(:), bin_x0(:), bin_y0(:) +real(rp), allocatable :: bin_sig_x(:), bin_sig_y(:) +real(rp) :: z1_over, z2_over, overlap, p_charge, p_x, p_y, inv_dz2 logical err_flag @@ -491,29 +496,56 @@ subroutine csr_bin_particles (ele, particle, csr, err_flag) ! The contribution to the charge in a bin from a particle is computed from the overlap ! between the particle and the bin. - -c_tot = 0 ! Used for debugging sanity check + +n_bin = space_charge_com%n_bin +pbs = space_charge_com%particle_bin_span +inv_dz2 = 1.0_rp / dz_particle**2 + +!$OMP parallel private(bin_n_particle, bin_charge, bin_x0, bin_y0, zp_center, zp0, zp1, ix0, j, ib, zb0, zb1, z1_over, z2_over, overlap, p_charge, p_x, p_y) +allocate(bin_n_particle(n_bin), bin_charge(n_bin), bin_x0(n_bin), bin_y0(n_bin)) +bin_n_particle = 0; bin_charge = 0; bin_x0 = 0; bin_y0 = 0 + +!$OMP do do i = 1, size(particle) - p => particle(i) - if (p%state /= alive$) cycle - zp_center = p%vec(5) ! center of particle - zp0 = zp_center - dz_particle / 2 ! particle left edge - zp1 = zp_center + dz_particle / 2 ! particle right edge - ix0 = nint((zp0 - z_min) / csr%dz_slice) ! left most bin index - do j = 0, space_charge_com%particle_bin_span+1 + if (particle(i)%state /= alive$) cycle + zp_center = particle(i)%vec(5) + zp0 = zp_center - dz_particle / 2 + zp1 = zp_center + dz_particle / 2 + p_charge = particle(i)%charge + p_x = particle(i)%vec(1) + p_y = particle(i)%vec(3) + ix0 = nint((zp0 - z_min) / csr%dz_slice) + do j = 0, pbs+1 ib = j + ix0 - slice => csr%slice(ib) zb0 = csr%slice(ib)%z0_edge - zb1 = csr%slice(ib)%z1_edge ! edges of the bin - overlap_fraction = particle_overlap_in_bin (zb0, zb1) - slice%n_particle = slice%n_particle + overlap_fraction - charge = overlap_fraction * p%charge - slice%charge = slice%charge + charge - slice%x0 = slice%x0 + p%vec(1) * charge - slice%y0 = slice%y0 + p%vec(3) * charge - c_tot = c_tot + charge + zb1 = csr%slice(ib)%z1_edge + ! Inline particle_overlap_in_bin for thread safety (avoids host-scope variable sharing) + overlap = 0 + z1_over = max(zp0, zb0) + z2_over = min(zp_center, zb1) + if (z2_over > z1_over) overlap = 2 * real(((z2_over - zp0)**2 - (z1_over - zp0)**2), rp) * inv_dz2 + z1_over = max(zp_center, zb0) + z2_over = min(zp1, zb1) + if (z2_over > z1_over) overlap = overlap + 2 * real(((z1_over - zp1)**2 - (z2_over - zp1)**2), rp) * inv_dz2 + bin_n_particle(ib) = bin_n_particle(ib) + overlap + charge = overlap * p_charge + bin_charge(ib) = bin_charge(ib) + charge + bin_x0(ib) = bin_x0(ib) + p_x * charge + bin_y0(ib) = bin_y0(ib) + p_y * charge enddo enddo +!$OMP end do + +!$OMP critical +do ib = 1, n_bin + csr%slice(ib)%n_particle = csr%slice(ib)%n_particle + bin_n_particle(ib) + csr%slice(ib)%charge = csr%slice(ib)%charge + bin_charge(ib) + csr%slice(ib)%x0 = csr%slice(ib)%x0 + bin_x0(ib) + csr%slice(ib)%y0 = csr%slice(ib)%y0 + bin_y0(ib) +enddo +!$OMP end critical +deallocate(bin_n_particle, bin_charge, bin_x0, bin_y0) +!$OMP end parallel do ib = 1, space_charge_com%n_bin if (ib /= 1) csr%slice(ib)%edge_dcharge_density_dz = (csr%slice(ib)%charge - csr%slice(ib-1)%charge) / csr%dz_slice**2 @@ -537,24 +569,47 @@ subroutine csr_bin_particles (ele, particle, csr, err_flag) ! Abs(x-x0) is used instead of the usual formula involving (x-x0)^2 to lessen the effect ! of non-Gaussian tails. +!$OMP parallel private(bin_sig_x, bin_sig_y, zp_center, zp0, zp1, ix0, j, ib, zb0, zb1, z1_over, z2_over, overlap, p_charge, p_x, p_y) +allocate(bin_sig_x(n_bin), bin_sig_y(n_bin)) +bin_sig_x = 0; bin_sig_y = 0 + +!$OMP do do i = 1, size(particle) - p => particle(i) - if (p%state /= alive$) cycle - zp_center = p%vec(5) ! center of particle - zp0 = zp_center - dz_particle / 2 ! particle left edge - zp1 = zp_center + dz_particle / 2 ! particle right edge - ix0 = nint((zp0 - z_min) / csr%dz_slice) ! left most bin index - do j = 0, space_charge_com%particle_bin_span+1 + if (particle(i)%state /= alive$) cycle + zp_center = particle(i)%vec(5) + zp0 = zp_center - dz_particle / 2 + zp1 = zp_center + dz_particle / 2 + p_charge = particle(i)%charge + p_x = particle(i)%vec(1) + p_y = particle(i)%vec(3) + ix0 = nint((zp0 - z_min) / csr%dz_slice) + do j = 0, pbs+1 ib = j + ix0 - slice => csr%slice(ib) zb0 = csr%slice(ib)%z0_edge - zb1 = csr%slice(ib)%z1_edge ! edges of the bin - overlap_fraction = particle_overlap_in_bin (zb0, zb1) - charge = overlap_fraction * p%charge - slice%sig_x = slice%sig_x + abs(p%vec(1) - slice%x0) * charge - slice%sig_y = slice%sig_y + abs(p%vec(3) - slice%y0) * charge + zb1 = csr%slice(ib)%z1_edge + ! Inline particle_overlap_in_bin + overlap = 0 + z1_over = max(zp0, zb0) + z2_over = min(zp_center, zb1) + if (z2_over > z1_over) overlap = 2 * real(((z2_over - zp0)**2 - (z1_over - zp0)**2), rp) * inv_dz2 + z1_over = max(zp_center, zb0) + z2_over = min(zp1, zb1) + if (z2_over > z1_over) overlap = overlap + 2 * real(((z1_over - zp1)**2 - (z2_over - zp1)**2), rp) * inv_dz2 + charge = overlap * p_charge + bin_sig_x(ib) = bin_sig_x(ib) + abs(p_x - csr%slice(ib)%x0) * charge + bin_sig_y(ib) = bin_sig_y(ib) + abs(p_y - csr%slice(ib)%y0) * charge enddo enddo +!$OMP end do + +!$OMP critical +do ib = 1, n_bin + csr%slice(ib)%sig_x = csr%slice(ib)%sig_x + bin_sig_x(ib) + csr%slice(ib)%sig_y = csr%slice(ib)%sig_y + bin_sig_y(ib) +enddo +!$OMP end critical +deallocate(bin_sig_x, bin_sig_y) +!$OMP end parallel charge_tot = 0; sig_x_ave = 0; sig_y_ave = 0 f = sqrt(pi/2) ! This corrects for the fact that |x - x0| is used instead of (x - x0)^2 to compute the sigma. @@ -670,40 +725,54 @@ subroutine csr_bin_kicks (ele, ds_kick_pt, csr, err_flag) integer i, n_bin logical err_flag +! Contiguous arrays for vectorized convolution +real(rp), allocatable :: I_int_arr(:), edge_dcharge_arr(:), image_kick_arr(:), charge_arr(:) + character(16) :: r_name = 'csr_bin_kicks' ! The kick point P is fixed. ! Loop over all kick1 bins and compute the kick. +! When y_source == 0 (no image charges), only positive bins contribute to CSR kick +! (source behind kicked particle). Negative bins have dz <= 0 and I_csr returns 0. err_flag = .false. -do i = lbound(csr%kick1, 1), ubound(csr%kick1, 1) - - kick1 => csr%kick1(i) - kick1%dz_particles = i * csr%dz_slice +if (csr%y_source == 0) then + ! CSR only: skip negative bins since I_csr = 0 for dz_particles <= 0 + do i = 1, ubound(csr%kick1, 1) + kick1 => csr%kick1(i) + kick1%dz_particles = i * csr%dz_slice - if (i == lbound(csr%kick1, 1)) then - kick1%ix_ele_source = csr%ix_ele_kick ! Initial guess where source point is - dr_match = 0 ! Discontinuity factor for match element. See s_source_calc routine. - else - kick1%ix_ele_source = csr%kick1(i-1)%ix_ele_source - endif + if (i == 1) then + kick1%ix_ele_source = csr%ix_ele_kick + dr_match = 0 + else + kick1%ix_ele_source = csr%kick1(i-1)%ix_ele_source + endif - ! Calculate what element the source point is in. + kick1%s_chord_source = s_source_calc(kick1, csr, err_flag, dr_match) + if (err_flag) return + call I_csr (kick1, i, csr) + enddo - kick1%s_chord_source = s_source_calc(kick1, csr, err_flag, dr_match) - if (err_flag) return +else + ! Image charges: need full range for both positive and negative separations + do i = lbound(csr%kick1, 1), ubound(csr%kick1, 1) + kick1 => csr%kick1(i) + kick1%dz_particles = i * csr%dz_slice - ! calculate csr. - ! I_csr is only calculated for particles with y = 0 and not for image currents. + if (i == lbound(csr%kick1, 1)) then + kick1%ix_ele_source = csr%ix_ele_kick + dr_match = 0 + else + kick1%ix_ele_source = csr%kick1(i-1)%ix_ele_source + endif - if (csr%y_source == 0) then - call I_csr (kick1, i, csr) - else + kick1%s_chord_source = s_source_calc(kick1, csr, err_flag, dr_match) + if (err_flag) return call image_charge_kick_calc (kick1, csr) - endif - -enddo + enddo +endif ! @@ -712,19 +781,34 @@ subroutine csr_bin_kicks (ele, ds_kick_pt, csr, err_flag) n_bin = space_charge_com%n_bin ! CSR & Image charge kick +! Use contiguous arrays for vectorized convolution instead of strided struct access. if (csr%y_source == 0) then if (ele%csr_method == one_dim$) then + allocate(I_int_arr(n_bin), edge_dcharge_arr(n_bin)) + do i = 1, n_bin + I_int_arr(i) = csr%kick1(i)%I_int_csr + edge_dcharge_arr(i) = csr%slice(i)%edge_dcharge_density_dz + enddo do i = 1, n_bin - csr%slice(i)%kick_csr = coef * dot_product(csr%kick1(i:1:-1)%I_int_csr, csr%slice(1:i)%edge_dcharge_density_dz) + csr%slice(i)%kick_csr = coef * dot_product(I_int_arr(i:1:-1), edge_dcharge_arr(1:i)) enddo + deallocate(I_int_arr, edge_dcharge_arr) endif else ! Image charge + allocate(image_kick_arr(-n_bin:n_bin), charge_arr(n_bin)) + do i = -n_bin, n_bin + image_kick_arr(i) = csr%kick1(i)%image_kick_csr + enddo + do i = 1, n_bin + charge_arr(i) = csr%slice(i)%charge + enddo do i = 1, n_bin csr%slice(i)%kick_csr = csr%slice(i)%kick_csr + coef * & - dot_product(csr%kick1(i-1:i-n_bin:-1)%image_kick_csr, csr%slice(1:n_bin)%charge) + dot_product(image_kick_arr(i-1:i-n_bin:-1), charge_arr(1:n_bin)) enddo + deallocate(image_kick_arr, charge_arr) endif ! Longitudinal space charge kick @@ -1355,6 +1439,22 @@ subroutine csr_and_sc_apply_kicks (ele, csr, particle) ! We use a weighted average so that the integral varies smoothly as a function of particle%vec(5). if (ele%csr_method == one_dim$ .or. ele%space_charge_method == slice$) then + + ! CSR-only kick (no space charge): loop is thread-safe since each particle reads shared slice data independently. + if (ele%csr_method == one_dim$ .and. ele%space_charge_method /= slice$) then + !$OMP parallel do private(ip, zp, i0, r1, r0) + do ip = 1, size(particle) + if (particle(ip)%state /= alive$) cycle + zp = particle(ip)%vec(5) + i0 = int((zp - csr%slice(1)%z_center) / csr%dz_slice) + 1 + r1 = (zp - csr%slice(i0)%z_center) / csr%dz_slice + r0 = 1 - r1 + particle(ip)%vec(6) = particle(ip)%vec(6) + r0 * csr%slice(i0)%kick_csr + r1 * csr%slice(i0+1)%kick_csr + enddo + !$OMP end parallel do + + else + ! General case with possible space charge do ip = 1, size(particle) p => particle(ip) if (p%state /= alive$) cycle @@ -1441,6 +1541,7 @@ subroutine csr_and_sc_apply_kicks (ele, csr, particle) enddo + endif ! end of general case (else branch) endif ! Mesh Space charge kick From 061925ca820f281c90d8438aa1473a368ab8a7d8 Mon Sep 17 00:00:00 2001 From: Christopher Mayes <31023527+ChristopherMayes@users.noreply.github.com> Date: Sat, 21 Mar 2026 09:40:13 -0700 Subject: [PATCH 2/6] slice space charge + csr performance improvements --- .../space_charge/csr_and_space_charge_mod.f90 | 111 ++++++++++++++++-- 1 file changed, 101 insertions(+), 10 deletions(-) diff --git a/bmad/space_charge/csr_and_space_charge_mod.f90 b/bmad/space_charge/csr_and_space_charge_mod.f90 index 378406726..3cc2e1291 100644 --- a/bmad/space_charge/csr_and_space_charge_mod.f90 +++ b/bmad/space_charge/csr_and_space_charge_mod.f90 @@ -1094,11 +1094,13 @@ subroutine lsc_kick_params_calc (ele, csr) integer i, j, n_bin -! Vectorized working arrays for inner loop over source slices +! Shared read-only arrays (source slice properties, independent of kick index i) real(rp), allocatable :: sx_v(:), sy_v(:), a_v(:), b_v(:) real(rp), allocatable :: charge_v(:), dcdz_v(:), z_center_v(:) -real(rp), allocatable :: abs_z_v(:), z1_v(:), z2_v(:), rho0_v(:), drho_v(:) real(rp), allocatable :: radix_v(:), sr_v(:) + +! Thread-private work arrays (recomputed each iteration of the i loop) +real(rp), allocatable :: abs_z_v(:), z1_v(:), z2_v(:), rho0_v(:), drho_v(:) real(rp), allocatable :: b2cz1_v(:), b2cz2_v(:), abcz1_v(:), abcz2_v(:) real(rp), allocatable :: atz1_v(:), atz2_v(:), bcd_v(:), dk0_v(:) integer, allocatable :: sign_v(:) @@ -1118,14 +1120,11 @@ subroutine lsc_kick_params_calc (ele, csr) dz_half = csr%dz_slice / 2 ! Precompute j-dependent arrays (source slice properties, independent of kick index i) +! These are shared read-only across threads. allocate(sx_v(n_bin), sy_v(n_bin), a_v(n_bin), b_v(n_bin)) allocate(charge_v(n_bin), dcdz_v(n_bin), z_center_v(n_bin)) -allocate(abs_z_v(n_bin), z1_v(n_bin), z2_v(n_bin), rho0_v(n_bin), drho_v(n_bin)) allocate(radix_v(n_bin), sr_v(n_bin)) -allocate(b2cz1_v(n_bin), b2cz2_v(n_bin), abcz1_v(n_bin), abcz2_v(n_bin)) -allocate(atz1_v(n_bin), atz2_v(n_bin), bcd_v(n_bin), dk0_v(n_bin)) -allocate(sign_v(n_bin)) do j = 1, n_bin sx_v(j) = csr%slice(j)%sig_x @@ -1143,7 +1142,22 @@ subroutine lsc_kick_params_calc (ele, csr) ! Compute the kick at the center of each bin ! i = index of slice where kick is computed +! Each iteration writes only to csr%slice(i), so outer loop is parallelizable. +! The transverse dependence block (lsc_kick_transverse_dependence) uses DA2 accumulation +! with pointers to slice data, which is not thread-safe, so OpenMP is only used without it. + +if (.not. space_charge_com%lsc_kick_transverse_dependence) then + +!$OMP parallel default(none) & +!$OMP shared(n_bin, csr, z_center_v, charge_v, dcdz_v, a_v, b_v, radix_v, sr_v, c_val, dz_half, factor) & +!$OMP private(i, j, z_center_i, abs_z_v, z1_v, z2_v, rho0_v, drho_v, bcd_v, abcz1_v, abcz2_v, & +!$OMP b2cz1_v, b2cz2_v, atz1_v, atz2_v, dk0_v, sign_v) +allocate(abs_z_v(n_bin), z1_v(n_bin), z2_v(n_bin), rho0_v(n_bin), drho_v(n_bin)) +allocate(b2cz1_v(n_bin), b2cz2_v(n_bin), abcz1_v(n_bin), abcz2_v(n_bin)) +allocate(atz1_v(n_bin), atz2_v(n_bin), bcd_v(n_bin), dk0_v(n_bin)) +allocate(sign_v(n_bin)) +!$OMP do do i = 1, n_bin z_center_i = csr%slice(i)%z_center @@ -1198,6 +1212,75 @@ subroutine lsc_kick_params_calc (ele, csr) ! Accumulate kick_lsc (sum over all source slices) csr%slice(i)%kick_lsc = csr%slice(i)%kick_lsc + sum(sign_v * dk0_v) +enddo +!$OMP end do +deallocate(abs_z_v, z1_v, z2_v, rho0_v, drho_v) +deallocate(b2cz1_v, b2cz2_v, abcz1_v, abcz2_v, atz1_v, atz2_v, bcd_v, dk0_v) +deallocate(sign_v) +!$OMP end parallel + +else + ! Serial path when lsc_kick_transverse_dependence is enabled (uses DA2 pointer accumulation) + + allocate(abs_z_v(n_bin), z1_v(n_bin), z2_v(n_bin), rho0_v(n_bin), drho_v(n_bin)) + allocate(b2cz1_v(n_bin), b2cz2_v(n_bin), abcz1_v(n_bin), abcz2_v(n_bin)) + allocate(atz1_v(n_bin), atz2_v(n_bin), bcd_v(n_bin), dk0_v(n_bin)) + allocate(sign_v(n_bin)) + + do i = 1, n_bin + z_center_i = csr%slice(i)%z_center + + ! Vectorized computation of z-separations and signs + abs_z_v = abs(z_center_i - z_center_v) + + do j = 1, n_bin + if (z_center_i > z_center_v(j)) then + sign_v(j) = 1 + elseif (z_center_i < z_center_v(j)) then + sign_v(j) = -1 + else + sign_v(j) = 0 + endif + enddo + + ! General case: compute z1, z2, drho, rho0 for all source slices + z1_v = abs_z_v - dz_half + z2_v = abs_z_v + dz_half + drho_v = dcdz_v * sign_v + rho0_v = charge_v / csr%dz_slice - drho_v * abs_z_v + + ! Self-slice override (diagonal: i == j) + drho_v(i) = dcdz_v(i) + rho0_v(i) = 0 + z1_v(i) = 0 + z2_v(i) = dz_half + sign_v(i) = -2 ! Factor of 2 accounts for 1/2 we did not integrate over. + + ! Vectorized intermediate computations + bcd_v = 2 * c_val * rho0_v - b_v * drho_v + abcz1_v = a_v + b_v * z1_v + c_val * z1_v**2 + abcz2_v = a_v + b_v * z2_v + c_val * z2_v**2 + b2cz1_v = b_v + 2 * c_val * z1_v + b2cz2_v = b_v + 2 * c_val * z2_v + + ! Compute atan for all elements (always safe), then override with log where radix <= 0 + atz1_v = atan(b2cz1_v / sr_v) / sr_v + atz2_v = atan(b2cz2_v / sr_v) / sr_v + + do j = 1, n_bin + if (radix_v(j) <= 0) then + atz1_v(j) = log((b2cz1_v(j) - sr_v(j)) / (b2cz1_v(j) + sr_v(j))) / (2 * sr_v(j)) + atz2_v(j) = log((b2cz2_v(j) - sr_v(j)) / (b2cz2_v(j) + sr_v(j))) / (2 * sr_v(j)) + endif + enddo + + ! Vectorized kick computation + dk0_v = factor * ((2 * atz2_v * bcd_v + drho_v * log(abcz2_v)) - & + (2 * atz1_v * bcd_v + drho_v * log(abcz1_v))) / (2 * c_val) + + ! Accumulate kick_lsc (sum over all source slices) + csr%slice(i)%kick_lsc = csr%slice(i)%kick_lsc + sum(sign_v * dk0_v) + ! Transverse dependence: scalar loop required for DA2 accumulation if (space_charge_com%lsc_kick_transverse_dependence) then do j = 1, n_bin @@ -1246,12 +1329,16 @@ subroutine lsc_kick_params_calc (ele, csr) enddo endif -enddo + enddo ! end of serial i loop + + deallocate(abs_z_v, z1_v, z2_v, rho0_v, drho_v) + deallocate(b2cz1_v, b2cz2_v, abcz1_v, abcz2_v, atz1_v, atz2_v, bcd_v, dk0_v) + deallocate(sign_v) + +endif ! end of lsc_kick_transverse_dependence branch deallocate(sx_v, sy_v, a_v, b_v, charge_v, dcdz_v, z_center_v) -deallocate(abs_z_v, z1_v, z2_v, rho0_v, drho_v, radix_v, sr_v) -deallocate(b2cz1_v, b2cz2_v, abcz1_v, abcz2_v, atz1_v, atz2_v, bcd_v, dk0_v) -deallocate(sign_v) +deallocate(radix_v, sr_v) end subroutine lsc_kick_params_calc @@ -1455,6 +1542,9 @@ subroutine csr_and_sc_apply_kicks (ele, csr, particle) else ! General case with possible space charge + !$OMP parallel do default(none) & + !$OMP shared(particle, csr, ele, space_charge_com, global_com) & + !$OMP private(ip, p, slice, zp, i0, r1, r0, dpz, x, y, beta0, f0, f, dpx, dpy, nk, dnk) do ip = 1, size(particle) p => particle(ip) if (p%state /= alive$) cycle @@ -1541,6 +1631,7 @@ subroutine csr_and_sc_apply_kicks (ele, csr, particle) enddo + !$OMP end parallel do endif ! end of general case (else branch) endif From 427b0fd6fb54acfa92b74e6bc66a60a386302f8e Mon Sep 17 00:00:00 2001 From: Christopher Mayes <31023527+ChristopherMayes@users.noreply.github.com> Date: Sat, 21 Mar 2026 13:14:19 -0700 Subject: [PATCH 3/6] Simplify CSR code for maintainability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deduplicated LSC loop body — Parallel loop always runs for kick_lsc. Transverse dependence DA2 accumulation is now a separate serial post-pass that recomputes intermediates. Eliminates ~100 lines of duplicated code. default(none) on CSR-only parallel do — Compiler will now catch any missing private declarations if the loop body is modified. Deleted dead particle_overlap_in_bin — The contained function was no longer called after inlining for OpenMP. Replaced print * with out_io in !$OMP critical — Both the CSR-only and general-case error paths now use the thread-safe out_io inside a critical section. Added bounds check to CSR-only path — Same r1/i0 validation as the general-case path, consistent error handling. --- .../space_charge/csr_and_space_charge_mod.f90 | 86 +++++-------------- 1 file changed, 22 insertions(+), 64 deletions(-) diff --git a/bmad/space_charge/csr_and_space_charge_mod.f90 b/bmad/space_charge/csr_and_space_charge_mod.f90 index 3cc2e1291..4270380aa 100644 --- a/bmad/space_charge/csr_and_space_charge_mod.f90 +++ b/bmad/space_charge/csr_and_space_charge_mod.f90 @@ -658,36 +658,6 @@ subroutine csr_bin_particles (ele, particle, csr, err_flag) enddo -!--------------------------------------------------------------------------- -contains - -! computes the contribution to the charge in a bin from a given particle. -! z0_bin, z1_bin are the edge positions of the bin - -function particle_overlap_in_bin (z0_bin, z1_bin) result (overlap) - -real(rp) z0_bin, z1_bin, overlap, z1, z2 - -! Integrate over left triangular half of particle distribution - -z1 = max(zp0, z0_bin) ! left integration edge -z2 = min(zp_center, z1_bin) ! right integration edge -if (z2 > z1) then ! If left particle half is in bin ... - overlap = 2 * real(((z2 - zp0)**2 - (z1 - zp0)**2), rp) / dz_particle**2 -else - overlap = 0 -endif - -! Integrate over right triangular half of particle distribution - -z1 = max(zp_center, z0_bin) ! left integration edge -z2 = min(zp1, z1_bin) ! right integration edge -if (z2 > z1) then ! If right particle half is in bin ... - overlap = overlap + 2 * real(((z1 - zp1)**2 - (z2 - zp1)**2), rp) / dz_particle**2 -endif - -end function particle_overlap_in_bin - end subroutine csr_bin_particles !---------------------------------------------------------------------------- @@ -1140,13 +1110,10 @@ subroutine lsc_kick_params_calc (ele, csr) radix_v = -b_v**2 + 4 * a_v * c_val sr_v = sqrt(abs(radix_v)) -! Compute the kick at the center of each bin -! i = index of slice where kick is computed -! Each iteration writes only to csr%slice(i), so outer loop is parallelizable. -! The transverse dependence block (lsc_kick_transverse_dependence) uses DA2 accumulation -! with pointers to slice data, which is not thread-safe, so OpenMP is only used without it. - -if (.not. space_charge_com%lsc_kick_transverse_dependence) then +! Compute the kick at the center of each bin (parallelized). +! i = index of slice where kick is computed. +! Each iteration writes only to csr%slice(i)%kick_lsc, so the outer loop is parallelizable. +! Thread-private work arrays are allocated once per thread (inside the parallel region, outside the do loop). !$OMP parallel default(none) & !$OMP shared(n_bin, csr, z_center_v, charge_v, dcdz_v, a_v, b_v, radix_v, sr_v, c_val, dz_half, factor) & @@ -1161,7 +1128,6 @@ subroutine lsc_kick_params_calc (ele, csr) do i = 1, n_bin z_center_i = csr%slice(i)%z_center - ! Vectorized computation of z-separations and signs abs_z_v = abs(z_center_i - z_center_v) do j = 1, n_bin @@ -1174,7 +1140,6 @@ subroutine lsc_kick_params_calc (ele, csr) endif enddo - ! General case: compute z1, z2, drho, rho0 for all source slices z1_v = abs_z_v - dz_half z2_v = abs_z_v + dz_half drho_v = dcdz_v * sign_v @@ -1187,14 +1152,12 @@ subroutine lsc_kick_params_calc (ele, csr) z2_v(i) = dz_half sign_v(i) = -2 ! Factor of 2 accounts for 1/2 we did not integrate over. - ! Vectorized intermediate computations bcd_v = 2 * c_val * rho0_v - b_v * drho_v abcz1_v = a_v + b_v * z1_v + c_val * z1_v**2 abcz2_v = a_v + b_v * z2_v + c_val * z2_v**2 b2cz1_v = b_v + 2 * c_val * z1_v b2cz2_v = b_v + 2 * c_val * z2_v - ! Compute atan for all elements (always safe), then override with log where radix <= 0 atz1_v = atan(b2cz1_v / sr_v) / sr_v atz2_v = atan(b2cz2_v / sr_v) / sr_v @@ -1205,11 +1168,9 @@ subroutine lsc_kick_params_calc (ele, csr) endif enddo - ! Vectorized kick computation dk0_v = factor * ((2 * atz2_v * bcd_v + drho_v * log(abcz2_v)) - & (2 * atz1_v * bcd_v + drho_v * log(abcz1_v))) / (2 * c_val) - ! Accumulate kick_lsc (sum over all source slices) csr%slice(i)%kick_lsc = csr%slice(i)%kick_lsc + sum(sign_v * dk0_v) enddo @@ -1219,9 +1180,11 @@ subroutine lsc_kick_params_calc (ele, csr) deallocate(sign_v) !$OMP end parallel -else - ! Serial path when lsc_kick_transverse_dependence is enabled (uses DA2 pointer accumulation) +! Separate serial pass for transverse dependence DA2 coefficients. +! This recomputes the intermediates for each bin since the parallel loop above doesn't store them. +! The DA2 pointer accumulation (da2_div, da2_mult) is not thread-safe, so this must remain serial. +if (space_charge_com%lsc_kick_transverse_dependence) then allocate(abs_z_v(n_bin), z1_v(n_bin), z2_v(n_bin), rho0_v(n_bin), drho_v(n_bin)) allocate(b2cz1_v(n_bin), b2cz2_v(n_bin), abcz1_v(n_bin), abcz2_v(n_bin)) allocate(atz1_v(n_bin), atz2_v(n_bin), bcd_v(n_bin), dk0_v(n_bin)) @@ -1230,7 +1193,6 @@ subroutine lsc_kick_params_calc (ele, csr) do i = 1, n_bin z_center_i = csr%slice(i)%z_center - ! Vectorized computation of z-separations and signs abs_z_v = abs(z_center_i - z_center_v) do j = 1, n_bin @@ -1243,27 +1205,23 @@ subroutine lsc_kick_params_calc (ele, csr) endif enddo - ! General case: compute z1, z2, drho, rho0 for all source slices z1_v = abs_z_v - dz_half z2_v = abs_z_v + dz_half drho_v = dcdz_v * sign_v rho0_v = charge_v / csr%dz_slice - drho_v * abs_z_v - ! Self-slice override (diagonal: i == j) drho_v(i) = dcdz_v(i) rho0_v(i) = 0 z1_v(i) = 0 z2_v(i) = dz_half - sign_v(i) = -2 ! Factor of 2 accounts for 1/2 we did not integrate over. + sign_v(i) = -2 - ! Vectorized intermediate computations bcd_v = 2 * c_val * rho0_v - b_v * drho_v abcz1_v = a_v + b_v * z1_v + c_val * z1_v**2 abcz2_v = a_v + b_v * z2_v + c_val * z2_v**2 b2cz1_v = b_v + 2 * c_val * z1_v b2cz2_v = b_v + 2 * c_val * z2_v - ! Compute atan for all elements (always safe), then override with log where radix <= 0 atz1_v = atan(b2cz1_v / sr_v) / sr_v atz2_v = atan(b2cz2_v / sr_v) / sr_v @@ -1274,15 +1232,9 @@ subroutine lsc_kick_params_calc (ele, csr) endif enddo - ! Vectorized kick computation dk0_v = factor * ((2 * atz2_v * bcd_v + drho_v * log(abcz2_v)) - & (2 * atz1_v * bcd_v + drho_v * log(abcz1_v))) / (2 * c_val) - ! Accumulate kick_lsc (sum over all source slices) - csr%slice(i)%kick_lsc = csr%slice(i)%kick_lsc + sum(sign_v * dk0_v) - - ! Transverse dependence: scalar loop required for DA2 accumulation - if (space_charge_com%lsc_kick_transverse_dependence) then do j = 1, n_bin if (dk0_v(j) == 0) cycle @@ -1327,15 +1279,13 @@ subroutine lsc_kick_params_calc (ele, csr) f = da2_div(da2_mult(f1, f), f + f1) endif enddo - endif - enddo ! end of serial i loop + enddo deallocate(abs_z_v, z1_v, z2_v, rho0_v, drho_v) deallocate(b2cz1_v, b2cz2_v, abcz1_v, abcz2_v, atz1_v, atz2_v, bcd_v, dk0_v) deallocate(sign_v) - -endif ! end of lsc_kick_transverse_dependence branch +endif deallocate(sx_v, sy_v, a_v, b_v, charge_v, dcdz_v, z_center_v) deallocate(radix_v, sr_v) @@ -1527,15 +1477,21 @@ subroutine csr_and_sc_apply_kicks (ele, csr, particle) if (ele%csr_method == one_dim$ .or. ele%space_charge_method == slice$) then - ! CSR-only kick (no space charge): loop is thread-safe since each particle reads shared slice data independently. + ! CSR-only kick (no space charge): each particle reads shared slice data and updates only its own vec(6). if (ele%csr_method == one_dim$ .and. ele%space_charge_method /= slice$) then - !$OMP parallel do private(ip, zp, i0, r1, r0) + !$OMP parallel do default(none) shared(particle, csr, space_charge_com, global_com) private(ip, zp, i0, r1, r0) do ip = 1, size(particle) if (particle(ip)%state /= alive$) cycle zp = particle(ip)%vec(5) i0 = int((zp - csr%slice(1)%z_center) / csr%dz_slice) + 1 r1 = (zp - csr%slice(i0)%z_center) / csr%dz_slice r0 = 1 - r1 + if (r1 < -0.01_rp .or. r1 > 1.01_rp .or. i0 < 1 .or. i0 >= space_charge_com%n_bin) then + !$OMP critical + call out_io (s_error$, 'csr_and_sc_apply_kicks', 'CSR INTERNAL ERROR!') + !$OMP end critical + if (global_com%exit_on_error) call err_exit + endif particle(ip)%vec(6) = particle(ip)%vec(6) + r0 * csr%slice(i0)%kick_csr + r1 * csr%slice(i0+1)%kick_csr enddo !$OMP end parallel do @@ -1555,7 +1511,9 @@ subroutine csr_and_sc_apply_kicks (ele, csr, particle) ! r1 should be in [0,1] but allow for some round-off error if (r1 < -0.01_rp .or. r1 > 1.01_rp .or. i0 < 1 .or. i0 >= space_charge_com%n_bin) then - print *, 'CSR INTERNAL ERROR!' + !$OMP critical + call out_io (s_error$, 'csr_and_sc_apply_kicks', 'CSR INTERNAL ERROR!') + !$OMP end critical if (global_com%exit_on_error) call err_exit endif From 3ed8eabb58d48485e77297ae816b20e232e1b278 Mon Sep 17 00:00:00 2001 From: Christopher Mayes <31023527+ChristopherMayes@users.noreply.github.com> Date: Sat, 21 Mar 2026 19:47:44 -0700 Subject: [PATCH 4/6] Use FFT convolution --- .../space_charge/csr_and_space_charge_mod.f90 | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/bmad/space_charge/csr_and_space_charge_mod.f90 b/bmad/space_charge/csr_and_space_charge_mod.f90 index 4270380aa..ce9e5aa25 100644 --- a/bmad/space_charge/csr_and_space_charge_mod.f90 +++ b/bmad/space_charge/csr_and_space_charge_mod.f90 @@ -501,7 +501,7 @@ subroutine csr_bin_particles (ele, particle, csr, err_flag) pbs = space_charge_com%particle_bin_span inv_dz2 = 1.0_rp / dz_particle**2 -!$OMP parallel private(bin_n_particle, bin_charge, bin_x0, bin_y0, zp_center, zp0, zp1, ix0, j, ib, zb0, zb1, z1_over, z2_over, overlap, p_charge, p_x, p_y) +!$OMP parallel private(bin_n_particle, bin_charge, bin_x0, bin_y0, zp_center, zp0, zp1, ix0, j, ib, zb0, zb1, z1_over, z2_over, overlap, charge, p_charge, p_x, p_y) allocate(bin_n_particle(n_bin), bin_charge(n_bin), bin_x0(n_bin), bin_y0(n_bin)) bin_n_particle = 0; bin_charge = 0; bin_x0 = 0; bin_y0 = 0 @@ -569,7 +569,7 @@ subroutine csr_bin_particles (ele, particle, csr, err_flag) ! Abs(x-x0) is used instead of the usual formula involving (x-x0)^2 to lessen the effect ! of non-Gaussian tails. -!$OMP parallel private(bin_sig_x, bin_sig_y, zp_center, zp0, zp1, ix0, j, ib, zb0, zb1, z1_over, z2_over, overlap, p_charge, p_x, p_y) +!$OMP parallel private(bin_sig_x, bin_sig_y, zp_center, zp0, zp1, ix0, j, ib, zb0, zb1, z1_over, z2_over, overlap, charge, p_charge, p_x, p_y) allocate(bin_sig_x(n_bin), bin_sig_y(n_bin)) bin_sig_x = 0; bin_sig_y = 0 @@ -692,12 +692,15 @@ subroutine csr_bin_kicks (ele, ds_kick_pt, csr, err_flag) real(rp) ds_kick_pt, coef, dr_match(3) -integer i, n_bin +integer i, n_bin, m_fft logical err_flag ! Contiguous arrays for vectorized convolution real(rp), allocatable :: I_int_arr(:), edge_dcharge_arr(:), image_kick_arr(:), charge_arr(:) +! FFT workspace for O(n_bin * log(n_bin)) convolution +complex(rp), allocatable :: fft_a(:), fft_b(:) + character(16) :: r_name = 'csr_bin_kicks' ! The kick point P is fixed. @@ -760,10 +763,34 @@ subroutine csr_bin_kicks (ele, ds_kick_pt, csr, err_flag) I_int_arr(i) = csr%kick1(i)%I_int_csr edge_dcharge_arr(i) = csr%slice(i)%edge_dcharge_density_dz enddo + + ! FFT-based linear convolution: O(n_bin * log(n_bin)) instead of O(n_bin^2). + ! kick_csr(i) = coef * sum_{k=1}^{i} I_int_arr(k) * edge_dcharge_arr(i+1-k) + ! This equals the first n_bin elements of the linear convolution of I_int_arr with edge_dcharge_arr. + ! Zero-pad to avoid circular convolution artifacts. + m_fft = 1 + do while (m_fft < 2 * n_bin) + m_fft = m_fft * 2 + enddo + + allocate(fft_a(m_fft), fft_b(m_fft)) + fft_a = (0.0_rp, 0.0_rp) + fft_b = (0.0_rp, 0.0_rp) do i = 1, n_bin - csr%slice(i)%kick_csr = coef * dot_product(I_int_arr(i:1:-1), edge_dcharge_arr(1:i)) + fft_a(i) = cmplx(I_int_arr(i), 0.0_rp, rp) + fft_b(i) = cmplx(edge_dcharge_arr(i), 0.0_rp, rp) enddo - deallocate(I_int_arr, edge_dcharge_arr) + + call fft_1d(fft_a, -1) + call fft_1d(fft_b, -1) + fft_a = fft_a * fft_b + call fft_1d(fft_a, 1) + + do i = 1, n_bin + csr%slice(i)%kick_csr = coef * real(fft_a(i), rp) / m_fft + enddo + + deallocate(fft_a, fft_b, I_int_arr, edge_dcharge_arr) endif else ! Image charge @@ -1127,9 +1154,7 @@ subroutine lsc_kick_params_calc (ele, csr) !$OMP do do i = 1, n_bin z_center_i = csr%slice(i)%z_center - abs_z_v = abs(z_center_i - z_center_v) - do j = 1, n_bin if (z_center_i > z_center_v(j)) then sign_v(j) = 1 @@ -1139,7 +1164,6 @@ subroutine lsc_kick_params_calc (ele, csr) sign_v(j) = 0 endif enddo - z1_v = abs_z_v - dz_half z2_v = abs_z_v + dz_half drho_v = dcdz_v * sign_v @@ -1157,22 +1181,17 @@ subroutine lsc_kick_params_calc (ele, csr) abcz2_v = a_v + b_v * z2_v + c_val * z2_v**2 b2cz1_v = b_v + 2 * c_val * z1_v b2cz2_v = b_v + 2 * c_val * z2_v - atz1_v = atan(b2cz1_v / sr_v) / sr_v atz2_v = atan(b2cz2_v / sr_v) / sr_v - do j = 1, n_bin if (radix_v(j) <= 0) then atz1_v(j) = log((b2cz1_v(j) - sr_v(j)) / (b2cz1_v(j) + sr_v(j))) / (2 * sr_v(j)) atz2_v(j) = log((b2cz2_v(j) - sr_v(j)) / (b2cz2_v(j) + sr_v(j))) / (2 * sr_v(j)) endif enddo - dk0_v = factor * ((2 * atz2_v * bcd_v + drho_v * log(abcz2_v)) - & (2 * atz1_v * bcd_v + drho_v * log(abcz1_v))) / (2 * c_val) - csr%slice(i)%kick_lsc = csr%slice(i)%kick_lsc + sum(sign_v * dk0_v) - enddo !$OMP end do deallocate(abs_z_v, z1_v, z2_v, rho0_v, drho_v) From b7b8c2a10354a35f0e3fa112d45de83e003ab15d Mon Sep 17 00:00:00 2001 From: Christopher Mayes <31023527+ChristopherMayes@users.noreply.github.com> Date: Sat, 21 Mar 2026 21:12:13 -0700 Subject: [PATCH 5/6] Parallelize space_charge_com%lsc_kick_transverse_dependence --- bmad/space_charge/csr_and_space_charge_mod.f90 | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bmad/space_charge/csr_and_space_charge_mod.f90 b/bmad/space_charge/csr_and_space_charge_mod.f90 index ce9e5aa25..f5dd004bb 100644 --- a/bmad/space_charge/csr_and_space_charge_mod.f90 +++ b/bmad/space_charge/csr_and_space_charge_mod.f90 @@ -1199,16 +1199,22 @@ subroutine lsc_kick_params_calc (ele, csr) deallocate(sign_v) !$OMP end parallel -! Separate serial pass for transverse dependence DA2 coefficients. -! This recomputes the intermediates for each bin since the parallel loop above doesn't store them. -! The DA2 pointer accumulation (da2_div, da2_mult) is not thread-safe, so this must remain serial. +! Transverse dependence DA2 coefficients. +! Each outer iteration i writes only to csr%slice(i)%coef_lsc_plus/minus, so i is parallelizable. +! The inner j loop has a serial dependency (harmonic-mean accumulation), but different i are independent. if (space_charge_com%lsc_kick_transverse_dependence) then + !$OMP parallel default(none) & + !$OMP shared(n_bin, csr, z_center_v, charge_v, dcdz_v, a_v, b_v, radix_v, sr_v, sx_v, sy_v, c_val, dz_half, factor) & + !$OMP private(i, j, z_center_i, abs_z_v, z1_v, z2_v, rho0_v, drho_v, bcd_v, abcz1_v, abcz2_v, & + !$OMP b2cz1_v, b2cz2_v, atz1_v, atz2_v, dk0_v, sign_v, & + !$OMP f, f1, f00, sx2, sy2, g_z1_s, g_z2_s, h_z1_s, h_z2_s, alph, bet, ss) allocate(abs_z_v(n_bin), z1_v(n_bin), z2_v(n_bin), rho0_v(n_bin), drho_v(n_bin)) allocate(b2cz1_v(n_bin), b2cz2_v(n_bin), abcz1_v(n_bin), abcz2_v(n_bin)) allocate(atz1_v(n_bin), atz2_v(n_bin), bcd_v(n_bin), dk0_v(n_bin)) allocate(sign_v(n_bin)) + !$OMP do do i = 1, n_bin z_center_i = csr%slice(i)%z_center @@ -1300,10 +1306,11 @@ subroutine lsc_kick_params_calc (ele, csr) enddo enddo - + !$OMP end do deallocate(abs_z_v, z1_v, z2_v, rho0_v, drho_v) deallocate(b2cz1_v, b2cz2_v, abcz1_v, abcz2_v, atz1_v, atz2_v, bcd_v, dk0_v) deallocate(sign_v) + !$OMP end parallel endif deallocate(sx_v, sy_v, a_v, b_v, charge_v, dcdz_v, z_center_v) From cb16cf8a708ebdd8352034ec2e2a0b92503c244b Mon Sep 17 00:00:00 2001 From: Christopher Mayes <31023527+ChristopherMayes@users.noreply.github.com> Date: Sat, 21 Mar 2026 21:14:29 -0700 Subject: [PATCH 6/6] docstring typos --- bmad/space_charge/csr_and_space_charge_mod.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bmad/space_charge/csr_and_space_charge_mod.f90 b/bmad/space_charge/csr_and_space_charge_mod.f90 index f5dd004bb..57dc3b8b9 100644 --- a/bmad/space_charge/csr_and_space_charge_mod.f90 +++ b/bmad/space_charge/csr_and_space_charge_mod.f90 @@ -390,11 +390,11 @@ end subroutine track1_bunch_csr !---------------------------------------------------------------------------- !---------------------------------------------------------------------------- !+ -! Subroutine csr_bin_parcticles (ele, particle, csr) +! Subroutine csr_bin_particles (ele, particle, csr) ! ! Routine to bin the particles longitudinally in s. ! -! To avoid noise in the cacluation, every particle is considered to have a +! To avoid noise in the calculation, every particle is considered to have a ! triangular distribution with a base length given by ! space_charge_com%particle_bin_span * csr%dz_slice. ! That is, particles will, in general, overlap multiple bins.