|
1 | 1 | # Appendix 01 - Troubleshooting device loss error
|
2 | 2 |
|
3 |
| -TBD |
| 3 | +In this appendix we will provide some insights on how to troubleshoot Vulkan device loss errors. The root cause of a device loss error can be hard to track down, so Vulkan provides some debugging extensions to help developers locate it. |
| 4 | + |
| 5 | +You can find the complete source code for this chapter [here](../../booksamples/appendix-01). |
| 6 | + |
| 7 | +## Enabling the extensions |
| 8 | + |
| 9 | +The specific extensions to be used will depend on the GPU family that you have. If you have an NVIDIA GPU, the extensions and functions to be used will be different than if you have an AMD GPU. In any case, the process is more or less similar: you store certain checkpoints or markers in a command buffer that you can check later on if something goes wrong. The last executed marker will give you a hint on where things started to fail. |
| 10 | + |
| 11 | +Therefore, the first step is to query the physical device to find out which type of extension is supported by the associated GPU. In the `PhysDevice` class we will create a new method to check the supported extension. This method will be invoked when creating the `PhysDevice` instance so that the result can be checked later on: |
| 12 | + |
| 13 | +```java |
| 14 | +public class PhysDevice { |
| 15 | + ... |
| 16 | + private final CheckPointExtension checkPointExtension; |
| 17 | + ... |
| 18 | + private PhysDevice(VkPhysicalDevice vkPhysicalDevice) { |
| 19 | + ... |
| 20 | + checkPointExtension = calcCheckPointExtension(vkDeviceExtensions); |
| 21 | + ... |
| 22 | + } |
| 23 | + ... |
| 24 | + private CheckPointExtension calcCheckPointExtension(VkExtensionProperties.Buffer vkDeviceExtensions) { |
| 25 | + var result = CheckPointExtension.NONE; |
| 26 | + |
| 27 | + int numExtensions = vkDeviceExtensions != null ? vkDeviceExtensions.capacity() : 0; |
| 28 | + for (int i = 0; i < numExtensions; i++) { |
| 29 | + String extensionName = vkDeviceExtensions.get(i).extensionNameString(); |
| 30 | + if (NVDeviceDiagnosticCheckpoints.VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME.equals(extensionName)) { |
| 31 | + result = CheckPointExtension.NVIDIA; |
| 32 | + break; |
| 33 | + } else if (AMDBufferMarker.VK_AMD_BUFFER_MARKER_EXTENSION_NAME.equals(extensionName)) { |
| 34 | + result = CheckPointExtension.AMD; |
| 35 | + break; |
| 36 | + } |
| 37 | + } |
| 38 | + return result; |
| 39 | + } |
| 40 | + ... |
| 41 | + public CheckPointExtension getCheckPointExtension() { |
| 42 | + return checkPointExtension; |
| 43 | + } |
| 44 | + ... |
| 45 | + public enum CheckPointExtension { |
| 46 | + NONE, NVIDIA, AMD; |
| 47 | + } |
| 48 | +} |
| 49 | +``` |
| 50 | + |
| 51 | +As you can see, we just iterate over the device extensions to check if it supports the required extension for NVIDIA (`NVDeviceDiagnosticCheckpoints.VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME`) or AMD (`AMDBufferMarker.VK_AMD_BUFFER_MARKER_EXTENSION_NAME`). |
| 52 | + |
| 53 | +We will create a new property to control if we want to enable the checkpoint / buffer marker extension: |
| 54 | +```java |
| 55 | +public class EngCfg { |
| 56 | + ... |
| 57 | + private boolean enableCheckPoints; |
| 58 | + ... |
| 59 | + private EngCfg() { |
| 60 | + ... |
| 61 | + enableCheckPoints = Boolean.parseBoolean(props.getOrDefault("enableCheckPoints", false).toString()); |
| 62 | + ... |
| 63 | + } |
| 64 | + ... |
| 65 | + public boolean isEnableCheckPoints() { |
| 66 | + return enableCheckPoints; |
| 67 | + } |
| 68 | + ... |
| 69 | +} |
| 70 | +``` |
| 71 | + |
| 72 | +In the `Device` class we will enable the checkpoint / buffer marker extension if it has been requested in the configuration and is supported by the GPU: |
| 73 | +```java |
| 74 | +public class Device { |
| 75 | + ... |
| 76 | + private static PointerBuffer createReqExtensions(PhysDevice physDevice, MemoryStack stack) { |
| 77 | + ... |
| 78 | + if (EngCfg.getInstance().isEnableCheckPoints()) { |
| 79 | + PhysDevice.CheckPointExtension checkPointExtension = physDevice.getCheckPointExtension(); |
| 80 | + if (checkPointExtension == PhysDevice.CheckPointExtension.NONE) { |
| 81 | + Logger.warn("Requested check point extensions but not supported by device"); |
| 82 | + } else if (checkPointExtension == PhysDevice.CheckPointExtension.NVIDIA) { |
| 83 | + extsList.add(stack.ASCII(NVDeviceDiagnosticCheckpoints.VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME)); |
| 84 | + } else { |
| 85 | + extsList.add(stack.ASCII(AMDBufferMarker.VK_AMD_BUFFER_MARKER_EXTENSION_NAME)); |
| 86 | + } |
| 87 | + } |
| 88 | + ... |
| 89 | + } |
| 90 | + ... |
| 91 | +} |
| 92 | +``` |
| 93 | + |
| 94 | +## Using the extensions |
| 95 | + |
| 96 | +Finally, we will need to add new methods to support the insertion of checkpoints / markers and to dump the results. We will do this in the `VkUtils` class: |
| 97 | +```java |
| 98 | +public class VkUtils { |
| 99 | + ... |
| 100 | + public static List<CheckPoint> dumpCheckPoints(Queue queue) { |
| 101 | + List<CheckPoint> result = new ArrayList<>(); |
| 102 | + |
| 103 | + try (MemoryStack stack = MemoryStack.stackPush()) { |
| 104 | + var count = stack.callocInt(1); |
| 105 | + vkGetQueueCheckpointDataNV(queue.getVkQueue(), count, null); |
| 106 | + int numCheckPoints = count.get(0); |
| 107 | + if (numCheckPoints > 0) { |
| 108 | + VkCheckpointDataNV.Buffer checkpointData = VkCheckpointDataNV.calloc(numCheckPoints, stack); |
| 109 | + checkpointData.stream().forEach(c -> c.sType(VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV)); |
| 110 | + |
| 111 | + vkGetQueueCheckpointDataNV(queue.getVkQueue(), count, checkpointData); |
| 112 | + checkpointData.forEach(c -> result.add(new CheckPoint(c.pCheckpointMarker(), c.stage()))); |
| 113 | + } |
| 114 | + } |
| 115 | + return result; |
| 116 | + } |
| 117 | + ... |
| 118 | + public static void insertBufferMarker(VkCtx vkCtx, CmdBuffer cmdBuff, int pipelineStage, VkBuffer dstBuffer, |
| 119 | + int offset, int marker) { |
| 120 | + PhysDevice.CheckPointExtension checkPointExtension = vkCtx.getPhysDevice().getCheckPointExtension(); |
| 121 | + if (checkPointExtension == PhysDevice.CheckPointExtension.AMD) { |
| 122 | + vkCmdWriteBufferMarkerAMD(cmdBuff.getVkCommandBuffer(), pipelineStage, dstBuffer.getBuffer(), offset, |
| 123 | + marker); |
| 124 | + } else { |
| 125 | + Logger.warn("Requested debug buffer marker in non supported device"); |
| 126 | + } |
| 127 | + } |
| 128 | + |
| 129 | + public static void insertDebugCheckPoint(VkCtx vkCtx, CmdBuffer cmdBuff, long checkPointMarker) { |
| 130 | + PhysDevice.CheckPointExtension checkPointExtension = vkCtx.getPhysDevice().getCheckPointExtension(); |
| 131 | + if (checkPointExtension == PhysDevice.CheckPointExtension.NVIDIA) { |
| 132 | + vkCmdSetCheckpointNV(cmdBuff.getVkCommandBuffer(), checkPointMarker); |
| 133 | + } else { |
| 134 | + Logger.warn("Requested debug check point in non supported device"); |
| 135 | + } |
| 136 | + } |
| 137 | + ... |
| 138 | + |
| 139 | + public record CheckPoint(long marker, int stage) { |
| 140 | + } |
| 141 | +} |
| 142 | +``` |
| 143 | + |
| 144 | +The `insertBufferMarker` method can be used to insert buffer markers into a command buffer for AMD GPUs. It calls the `vkCmdWriteBufferMarkerAMD` function, which receives the following parameters: |
| 145 | + |
| 146 | +- The command buffer into which the marker is recorded. |
| 147 | +- The pipeline stage whose completion triggers the marker write. |
| 148 | +- A buffer where the marker will be written to. |
| 149 | +- An offset to that buffer. |
| 150 | +- The marker itself, which is a 32 bit value. |
| 151 | + |
| 152 | +The `insertDebugCheckPoint` method is the equivalent one for NVIDIA GPUs. It calls the `vkCmdSetCheckpointNV` function, which just needs a command buffer and a checkpoint marker (a long value). Since in this case the markers are not written to another buffer, we need to dump the status of the checkpoints by calling the `dumpCheckPoints` method, which retrieves the most recent diagnostic checkpoints that were executed by the device. |
0 commit comments