Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
a042873
add external fluid simulation example
petbab May 17, 2026
a5b60d0
searcher integration wip
May 25, 2026
e341f0a
searcher integration, first prototype
May 29, 2026
ed19279
profile-searcher: graphing, speculative selection in OnInitialize()
Jun 1, 2026
062d44c
searcher integration wip
May 25, 2026
703682c
searcher integration, first prototype
May 29, 2026
114a837
profile-searcher: graphing, speculative selection in OnInitialize()
Jun 1, 2026
11d79ba
Merge remote-tracking branch 'origin/andrij/ml-profiling-searcher' in…
andrijrat Jun 2, 2026
57c38c2
profile-searcher: more proper README
andrijrat Jun 2, 2026
9b45876
profile-searcher: add launch.json configuration for graphing
andrijrat Jun 2, 2026
6ddc5d0
remove .clangd to not interfere with other people's setups
andrijrat Jun 2, 2026
f4ad16b
profile-searcher: logging
andrijrat Jun 2, 2026
bd240c0
profile-searcher: remove docopt dependency
andrijrat Jun 2, 2026
ae38692
profile-searcher: remove pyktt.so, it will not work for other people …
andrijrat Jun 2, 2026
f394e3f
remove .vscode/ and fix .gitignore
andrijrat Jun 3, 2026
83de0e4
profile-searcher: remove useless cupy requirement files
andrijrat Jun 3, 2026
123f8ec
scripts: slight code quality improvements
andrijrat Jun 3, 2026
ba8daf5
tutorials: cleanup of tutorial 08, rename to 08CustomPythonSearcher
andrijrat Jun 3, 2026
2193cf5
scripts: stringify use raw strings
andrijrat Jun 3, 2026
63e69f7
profile-searcher: move into Scripts/, separate old and new searchers
andrijrat Jun 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
.vs/*
.vscode/*
.vs/
.vscode/
Build/*
premake5.exe
premake5
*.swp
.cache

compile_commands.json
.clangd

# Python
env/
__pycache__/
*.pyc
*.pyo
2 changes: 2 additions & 0 deletions Scripts/ConvertXmlToCsv.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

import xml.etree.ElementTree as ET
import sys

Expand Down
85 changes: 85 additions & 0 deletions Scripts/GraphResults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env python3

import glob
import sys
from typing import cast
import xml.etree.ElementTree as ElementTree

import numpy
import pandas
from pandas import DataFrame
from seaborn import lineplot

import matplotlib.pyplot as plt
from matplotlib.figure import Figure


# Maps the legend label of each plotted series to the directory holding its
# tuning results; parseResults() reads the 'output-*.xml' files found there.
# Change these to where you store the profiling results, and keep the
# trailing '/' on every path.
RESULT_DIRECTORIES = {
    'New profile-based searcher': './logs/profiled/2080-1070/b10-n40-r100/',
    # 'Old profile-based searcher': './logs/old-profiled/2080/b10-n40-r100/',
    'Random searcher': './logs/random/2080/',
}


def parseResults(directory: str) -> DataFrame:
    """Load every tuning run in *directory* as best-so-far durations.

    Each ``output-*.xml`` file is treated as one tuning run. For every run
    the ``TotalDuration`` attribute of each ``./Results/KernelResult``
    element is read in document order and converted into a running minimum,
    i.e. the best duration seen up to and including that iteration.

    Args:
        directory: Directory containing the ``output-*.xml`` files. A
            trailing path separator is optional.

    Returns:
        A DataFrame with one row per (run, iteration) and two columns:
        ``iteration`` (1-based index within its run) and ``time`` (best
        duration observed so far in that run). Runs are concatenated in
        sorted file-name order.

    Raises:
        ValueError: If no matching file contains any kernel result.
    """
    # Function-scope import: the module's import block does not provide os.
    import os

    # os.path.join makes the trailing separator optional; sorting gives a
    # deterministic row order (glob itself returns files in arbitrary order).
    outputFiles = sorted(glob.glob(os.path.join(directory, 'output-*.xml')))

    iterationParts = []
    bestDurationParts = []
    for file in outputFiles:
        results = ElementTree.parse(file).findall('./Results/KernelResult')
        if not results:
            # A run without results contributes no rows; skip it instead of
            # failing on the first-element access.
            continue

        durations = numpy.array(
            [float(result.attrib['TotalDuration']) for result in results]
        )
        iterationParts.append(numpy.arange(1, len(durations) + 1))
        # Running minimum == best time found up to each iteration.
        bestDurationParts.append(numpy.minimum.accumulate(durations))

    if not bestDurationParts:
        raise ValueError(
            f"no kernel results found in '{directory}' "
            "(looked for 'output-*.xml' files)"
        )

    return DataFrame(
        {
            'iteration': numpy.concatenate(iterationParts),
            'time': numpy.concatenate(bestDurationParts),
        }
    )


def graphResults(outputPath: str | None):
    """Plot best-so-far tuning time per iteration for every configured searcher.

    Results are loaded with ``parseResults`` for each entry of
    ``RESULT_DIRECTORIES``, labelled with the entry's key and drawn as a
    seaborn line plot with one hue per label.

    Args:
        outputPath: File to save the figure to. When ``None`` the plot is
            shown in a blocking window instead of being saved.
    """
    labelledFrames = []
    for label, location in RESULT_DIRECTORIES.items():
        frame = parseResults(location)
        frame['name'] = label
        labelledFrames.append(frame)

    combined = pandas.concat(labelledFrames)
    axes = lineplot(data=combined, x='iteration', y='time', hue='name')

    if outputPath is not None:
        # Saving to file: 19.20 x 10.80 inches at 150 dpi.
        figure = cast(Figure, axes.get_figure())
        figure.set_dpi(150)
        figure.set_size_inches(19.20, 10.80)

        figure.savefig(outputPath)
        plt.clf()
        return

    plt.show(block=True)


if __name__ == '__main__':
    # Exactly one CLI argument selects the output file; with any other
    # argument count the plot is shown interactively.
    outputPath = None
    if len(sys.argv) == 2:
        outputPath = sys.argv[1]
    graphResults(outputPath)
1 change: 1 addition & 0 deletions Scripts/NewProfileSearcher/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pyktt.so
47 changes: 47 additions & 0 deletions Scripts/NewProfileSearcher/CudaKernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Direct Coulomb summation over a cubic grid of gridSize^3 points.
//
// For each grid point handled by the thread, accumulates the sum over atoms of
// w * rsqrt(dX^2 + dY^2 + dZ^2), where (dX, dY, dZ) is the offset between the
// grid point and the atom, and writes the result to energyGrid.
//
// Parameters:
//   atomInfo       - per-atom float4 (x, y, z, w); read only when USE_SOA != 1
//   atomInfoX/Y/Z/W - per-atom component arrays; read only when USE_SOA == 1
//   numberOfAtoms  - atom count; the atom loop runs numberOfAtoms / VECTOR_SIZE times
//   gridSpacing    - factor converting a grid index into a coordinate
//   gridSize       - number of grid points along each axis
//   energyGrid     - output, indexed as gridSize*gridSize*z + gridSize*y + x
//
// Z_ITERATIONS, VECTOR_SIZE, USE_SOA and INNER_UNROLL_FACTOR are not defined in
// this file; presumably they are injected as compile-time definitions by the
// tuner (ExampleUsage.py registers parameters with these names) - TODO confirm.
extern "C" __global__ void directCoulombSum(const float4* atomInfo, const float* atomInfoX, const float* atomInfoY, const float* atomInfoZ, const float* atomInfoW, int numberOfAtoms, float gridSpacing, int gridSize, float* energyGrid)
{
    // Each thread owns one (x, y) column and Z_ITERATIONS consecutive z slices
    // starting at zIndex.
    int xIndex = blockIdx.x*blockDim.x + threadIdx.x;
    int yIndex = blockIdx.y*blockDim.y + threadIdx.y;
    int zIndex = (blockIdx.z*blockDim.z + threadIdx.z) * Z_ITERATIONS;

    // Only the first z slice is checked here; the remaining slices are
    // bounds-checked individually when the results are written.
    if ((xIndex >= gridSize) || (yIndex >= gridSize) || (zIndex >= gridSize))
        return;

    // Number of elements in one z slice, and the index of this thread's first
    // output element.
    int sliceOffset = gridSize * gridSize;
    int outIndex = sliceOffset*zIndex + gridSize*yIndex + xIndex;

    // Coordinates of the first grid point handled by this thread.
    float coordX = gridSpacing * xIndex;
    float coordY = gridSpacing * yIndex;
    float coordZ = gridSpacing * zIndex;

    // One accumulator per z slice handled by this thread.
    float energyValue[Z_ITERATIONS];
    for (int i = 0; i < Z_ITERATIONS; i++)
        energyValue[i] = 0.0f;

    for (int i = 0; i < numberOfAtoms / VECTOR_SIZE; i++)
    {
        // Load the offset to atom i from either the per-component arrays or
        // the float4 array, depending on USE_SOA.
#if USE_SOA == 1
        float dX = coordX - atomInfoX[i];
        float dY = coordY - atomInfoY[i];
        float dZ = coordZ - atomInfoZ[i];
        float w = atomInfoW[i];
#else
        float dX = coordX - atomInfo[i].x;
        float dY = coordY - atomInfo[i].y;
        float dZ = coordZ - atomInfo[i].z;
        float w = atomInfo[i].w;
#endif /* USE_SOA */
        // An unroll pragma is emitted only for a positive factor.
#if INNER_UNROLL_FACTOR > 0
#pragma unroll INNER_UNROLL_FACTOR
#endif
        for (int j = 0; j < Z_ITERATIONS; j++) {
            float partialResult = rsqrt(dX * dX + dY * dY + dZ*dZ);
            energyValue[j] += w * partialResult;
            // Step to the next z slice: only the z offset changes.
            dZ += gridSpacing;
        }
    }

    // Store each slice's sum, skipping slices that fall outside the grid.
    for (int i = 0; i < Z_ITERATIONS; i++)
        if (zIndex + i < gridSize)
            energyGrid[outIndex + sliceOffset*i] = energyValue[i];
}
176 changes: 176 additions & 0 deletions Scripts/NewProfileSearcher/ExampleUsage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
#!/usr/bin/env python3

import os
import sys
import glob
from pathlib import Path

from ProfileBasedSearcher import ProfileBasedSearcher
from modules.logging import LoggingLevel
from modules.info import BatchInfo, ModelInfo

import lib.pyktt as ktt
import numpy

# Root directory for tuning result logs. getLogPath() extends it by plain
# string concatenation, so keep the trailing '/'.
LOG_DIRECTORY = './logs/profiled/'


def getLogPath(modelInfo: ModelInfo, batchInfo: BatchInfo) -> str:
    """Return the path (without extension) for the next tuning result log.

    The log directory is ``LOG_DIRECTORY`` followed by
    ``<runningGpu>-<profilingGpu>/b<batchSize>-n<neighborSize>-r<randomSize>/``.
    The two GPU names are taken from the file name of
    ``modelInfo.counterPath``: the part before the first ``'_'`` must have
    the form ``<runningGpu>-<profilingGpu>``.

    The returned file name is ``output-<N>``, where ``N`` is one greater than
    the highest index among the existing ``output-<index>.xml`` files in that
    directory (1 when there are none). Using the highest index rather than
    the file count avoids reusing the name of an existing log when the
    numbering has gaps or the directory holds unrelated XML files.

    Args:
        modelInfo: Model description; only ``counterPath`` is read.
        batchInfo: Batch settings; ``batchSize``, ``neighborSize`` and
            ``randomSize`` are read.

    Returns:
        The log path without a file extension. NOTE(review): the caller is
        presumably expected to pass it to ``tuner.SaveResults``, which would
        append ``.xml`` - confirm against the KTT API.

    Raises:
        ValueError: If the counter model file name does not start with
            ``<runningGpu>-<profilingGpu>``.
    """
    gpuPair = Path(modelInfo.counterPath).name.split('_')[0]
    gpuNames = gpuPair.split('-')
    if len(gpuNames) != 2:
        raise ValueError(
            'counter model file name must start with '
            f"'<runningGpu>-<profilingGpu>_', got '{gpuPair}'"
        )
    runningGpu, profilingGpu = gpuNames

    logPrefix = (
        LOG_DIRECTORY
        + f'{runningGpu}-{profilingGpu}/'
        + f'b{batchInfo.batchSize}-n{batchInfo.neighborSize}-'
        + f'r{batchInfo.randomSize}/'
    )

    # Escape the prefix so GPU names containing glob metacharacters are
    # matched literally.
    highestIndex = 0
    for existingLog in glob.glob(glob.escape(logPrefix) + 'output-*.xml'):
        suffix = Path(existingLog).stem[len('output-'):]
        if suffix.isdecimal():
            highestIndex = max(highestIndex, int(suffix))

    return logPrefix + f'output-{highestIndex + 1}'


def runTuning(deviceIndex: int, kernelFile: str):
    """Tune the ``directCoulombSum`` kernel with ``ProfileBasedSearcher``.

    Generates random atom data, registers the kernel definition, its
    arguments, tuning parameters, thread modifiers and constraints with a
    KTT tuner, attaches and configures the profile-based searcher, runs the
    tuning and saves the results as XML at the path given by ``getLogPath``.

    Args:
        deviceIndex: Device index, passed as the second argument of
            ``ktt.Tuner``.
        kernelFile: Path to the CUDA source file defining
            ``directCoulombSum``.
    """
    numberOfAtoms = 256
    gridSize = 256
    gridSpacing = 0.5
    gridDimensions = ktt.DimensionVector(gridSize, gridSize, gridSize)
    blockDimensions = ktt.DimensionVector(1, 1)

    # Four random float32 component arrays, drawn in x, y, z, w order.
    aX, aY, aZ, aW = (
        100.0 * numpy.random.rand(numberOfAtoms).astype('f') for _ in range(4)
    )

    # Interleaved copy (x0, y0, z0, w0, x1, ...) backing the float4 argument.
    aAll = numpy.zeros(numberOfAtoms * 4, dtype=numpy.single)
    for offset, component in enumerate((aX, aY, aZ, aW)):
        aAll[offset::4] = component

    energyGrid = numpy.zeros(gridSize**3, dtype=numpy.single)

    tuner = ktt.Tuner(0, deviceIndex, ktt.ComputeApi.CUDA)
    tuner.SetCompilerOptions('-use_fast_math')
    tuner.SetTimeUnit(ktt.TimeUnit.Microseconds)
    tuner.SetProfiling(False)

    definition = tuner.AddKernelDefinitionFromFile(
        'directCoulombSum', kernelFile, gridDimensions, blockDimensions
    )

    # Arguments are registered in the same sequence as before; the list
    # handed to SetArguments follows the kernel's parameter order.
    readOnly = ktt.ArgumentAccessType.ReadOnly
    aXId = tuner.AddArgumentVectorFloat(aX, readOnly)
    aYId = tuner.AddArgumentVectorFloat(aY, readOnly)
    aZId = tuner.AddArgumentVectorFloat(aZ, readOnly)
    aWId = tuner.AddArgumentVectorFloat(aW, readOnly)
    aAllId = tuner.AddArgumentVectorFloat(aAll, readOnly)
    numberOfAtomsId = tuner.AddArgumentScalarInt(numberOfAtoms)
    gridSpacingId = tuner.AddArgumentScalarFloat(gridSpacing)
    gridSizeId = tuner.AddArgumentScalarInt(gridSize)
    energyGridId = tuner.AddArgumentVectorFloat(
        energyGrid, ktt.ArgumentAccessType.WriteOnly
    )
    kernelArguments = [
        aAllId,
        aXId,
        aYId,
        aZId,
        aWId,
        numberOfAtomsId,
        gridSpacingId,
        gridSizeId,
        energyGridId,
    ]
    tuner.SetArguments(definition, kernelArguments)

    kernel = tuner.CreateSimpleKernel('directCoulombSum', definition)

    def addThreadModifier(modifierType, dimension, parameter, action):
        # Every modifier in this example targets the same kernel/definition.
        tuner.AddThreadModifier(
            kernel, [definition], modifierType, dimension, parameter, action
        )

    # The X and Y work-group sizes multiply the local size and divide the
    # global size along their own dimension.
    workGroupParameters = (
        (ktt.ModifierDimension.X, 'WORK_GROUP_SIZE_X', [16, 32]),
        (ktt.ModifierDimension.Y, 'WORK_GROUP_SIZE_Y', [1, 2, 4, 8]),
    )
    for dimension, parameter, values in workGroupParameters:
        tuner.AddParameter(kernel, parameter, values)
        addThreadModifier(
            ktt.ModifierType.Local,
            dimension,
            parameter,
            ktt.ModifierAction.Multiply,
        )
        addThreadModifier(
            ktt.ModifierType.Global,
            dimension,
            parameter,
            ktt.ModifierAction.Divide,
        )

    tuner.AddParameter(kernel, 'WORK_GROUP_SIZE_Z', [1])
    tuner.AddParameter(kernel, 'Z_ITERATIONS', [1, 2, 4, 8, 16, 32])
    addThreadModifier(
        ktt.ModifierType.Global,
        ktt.ModifierDimension.Z,
        'Z_ITERATIONS',
        ktt.ModifierAction.Divide,
    )
    tuner.AddParameter(kernel, 'INNER_UNROLL_FACTOR', [0, 1, 2, 4, 8, 16, 32])
    tuner.AddParameter(kernel, 'USE_CONSTANT_MEMORY', [0])
    tuner.AddParameter(kernel, 'USE_SOA', [0, 1])
    tuner.AddParameter(kernel, 'VECTOR_SIZE', [1])

    # The unroll factor must be smaller than the number of z iterations.
    tuner.AddConstraint(
        kernel,
        ['INNER_UNROLL_FACTOR', 'Z_ITERATIONS'],
        lambda values: values[0] < values[1],
    )
    # Work groups must contain at least 64 threads in X * Y.
    tuner.AddConstraint(
        kernel,
        ['WORK_GROUP_SIZE_X', 'WORK_GROUP_SIZE_Y'],
        lambda values: values[0] * values[1] >= 64,
    )

    # Attach the profile-based searcher first, then configure it.
    searcher = ProfileBasedSearcher()
    tuner.SetSearcher(kernel, searcher)

    modelInfo = ModelInfo(
        deltaPath='./models/1070_all_XGBRegressor.sav',
        spacePath='./models/1070_coulomb_XGBRegressor.sav',
        counterPath='./models/2080-1070_all_XGBRegressor.sav',
    )
    batchInfo = BatchInfo(batchSize=10, neighborSize=40, randomSize=100)
    searcher.Configure(tuner, modelInfo, batchInfo, LoggingLevel.DEBUG)

    # Run the tuning with a ktt.ConfigurationCount(50) stop condition.
    results = tuner.Tune(kernel, ktt.ConfigurationCount(50))

    # The log path is resolved after tuning; create its directory on demand.
    logPath = getLogPath(modelInfo, batchInfo)
    os.makedirs(Path(logPath).parent, exist_ok=True)

    tuner.SaveResults(results, logPath, ktt.OutputFormat.XML)


if __name__ == '__main__':
    # Optional positional CLI arguments: <deviceIndex> <kernelFile>.
    deviceIndex = 0 if len(sys.argv) < 2 else int(sys.argv[1])
    kernelFile = './CudaKernel.cu' if len(sys.argv) < 3 else sys.argv[2]

    runTuning(deviceIndex, kernelFile)
Loading