Skip to content

Commit 710f6e4

Browse files
committed
sliding window for solcap
1 parent d5b7148 commit 710f6e4

File tree

9 files changed

+231
-37
lines changed

9 files changed

+231
-37
lines changed

src/app/firedancer/topology.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1273,6 +1273,8 @@ fd_topo_configure_tile( fd_topo_tile_t * tile,
12731273

12741274
tile->capctx.capture_start_slot = config->capture.capture_start_slot;
12751275
strncpy( tile->capctx.solcap_capture, config->capture.solcap_capture, sizeof(tile->capctx.solcap_capture) );
1276+
tile->capctx.recent_only = config->capture.recent_only;
1277+
tile->capctx.recent_slots_per_file = config->capture.recent_slots_per_file;
12761278

12771279
} else {
12781280
FD_LOG_ERR(( "unknown tile name `%s`", tile->name ));

src/app/shared/fd_config.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,8 @@ struct fd_config {
484484
ulong capture_start_slot;
485485
char dump_proto_dir[ PATH_MAX ];
486486
char solcap_capture[ PATH_MAX ];
487+
int recent_only;
488+
ulong recent_slots_per_file;
487489
int dump_elf_to_pb;
488490
int dump_syscall_to_pb;
489491
int dump_instr_to_pb;

src/app/shared/fd_config_parse.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,8 @@ fd_config_extract_pod( uchar * pod,
242242

243243
CFG_POP ( ulong, capture.capture_start_slot );
244244
CFG_POP ( cstr, capture.solcap_capture );
245+
CFG_POP ( bool, capture.recent_only );
246+
CFG_POP ( ulong, capture.recent_slots_per_file );
245247
CFG_POP ( cstr, capture.dump_proto_dir );
246248
CFG_POP ( bool, capture.dump_elf_to_pb );
247249
CFG_POP ( bool, capture.dump_syscall_to_pb );

src/disco/cswtch/fd_cswtch_tile.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,10 @@ privileged_init( fd_topo_t * topo,
171171
FD_TEST( fd_cstr_printf_check( path, sizeof( path ), NULL, "/proc/%lu/task/%lu/status", pid, tid ) );
172172
ctx->status_fds[ i ] = open( path, O_RDONLY );
173173
ctx->metrics[ i ] = fd_metrics_tile( metrics );
174-
if( FD_UNLIKELY( -1==ctx->status_fds[ i ] ) ) FD_LOG_ERR(( "open failed (%i-%s)", errno, strerror( errno ) ));
174+
if( FD_UNLIKELY( -1==ctx->status_fds[ i ] ) ) {
175+
FD_LOG_ERR(( "open failed for tile %lu (%s:%lu) path=%s pid=%lu tid=%lu (%i-%s)",
176+
i, topo->tiles[i].name, topo->tiles[i].kind_id, path, pid, tid, errno, strerror( errno ) ));
177+
}
175178
break;
176179
}
177180
}

src/disco/topo/fd_topo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,7 +588,12 @@ struct fd_topo_tile {
588588
struct {
589589
ulong capture_start_slot;
590590
char solcap_capture[ PATH_MAX ];
591+
int recent_only;
592+
ulong recent_slots_per_file;
591593
int solcap_fd;
594+
int solcap_fd_0; /* First FD (single file mode) or recent_0.solcap */
595+
int solcap_fd_1; /* Second FD (recent_1.solcap, or same as fd_0) */
596+
int solcap_fd_2; /* Third FD (recent_2.solcap, or same as fd_0) */
592597
} capctx;
593598
};
594599
};

src/discof/capture/fd_capture_tile.c

Lines changed: 141 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@
66

77
#include <errno.h>
88
#include <fcntl.h>
9+
#include <limits.h>
910
#include <stdio.h>
1011
#include <stdlib.h>
1112
#include <string.h>
1213
#include <unistd.h>
14+
#include <sys/stat.h>
1315

1416
#include "fd_capture_ctx.h"
1517
#include "../../flamenco/capture/fd_solcap_writer.h"
@@ -66,6 +68,14 @@ struct __attribute__((packed)) fd_capture_tile_ctx {
6668

6769
FILE * file;
6870

71+
/* Recent-only rotating capture state */
72+
int recent_only; /* 1 if using 3-file rotation, 0 for single file */
73+
FILE* recent_files[3]; /* Array of 3 FILE pointers for rotation */
74+
int recent_fds[3]; /* File descriptors for seccomp */
75+
ulong recent_current_idx; /* Current file index (0, 1, or 2) */
76+
ulong recent_file_start_slot; /* Slot number when current file was started (ULONG_MAX = uninitialized) */
77+
ulong recent_slots_per_file; /* Number of slots per file */
78+
6979
/* Incoming links for mcache/dcache processing */
7080
struct {
7181
fd_wksp_t * mem;
@@ -83,6 +93,13 @@ struct __attribute__((packed)) fd_capture_tile_ctx {
8393

8494
typedef struct fd_capture_tile_ctx fd_capture_tile_ctx_t;
8595

96+
/* _capture_failure: Called on any unrecoverable capture error.
97+
Reports msg (embedded in a red, ANSI-colored "SOLCAP HAS FAILED" log line)
and is then written to spin forever in place rather than return to the caller.
NOTE(review): the report is issued with FD_LOG_ERR, not a warning-level
macro.  If FD_LOG_ERR does not return, the spin loop below is unreachable
and the "spin instead of crashing the validator" intent is not met -
confirm which log severity was intended. */
98+
static void
99+
_capture_failure( char const * msg ) {
100+
FD_LOG_ERR(( "\033[1;31mSOLCAP HAS FAILED: %s. Contact Firedancer Development team immediately.\033[0m", msg ));
101+
for(;;) FD_SPIN_PAUSE();
102+
}
86103

87104
FD_FN_CONST static inline ulong
88105
scratch_align( void ) {
@@ -103,10 +120,14 @@ populate_allowed_seccomp( fd_topo_t const * topo FD_PARAM_UNUSED,
103120
fd_topo_tile_t const * tile,
104121
ulong out_cnt,
105122
struct sock_filter * out ) {
123+
/* FD values are stored in tile->capctx during privileged_init,
124+
avoiding the need to access context scratch memory here */
106125
populate_sock_filter_policy_fd_capture_tile( out_cnt,
107126
out,
108127
(uint)fd_log_private_logfile_fd(),
109-
(uint)tile->capctx.solcap_fd );
128+
(uint)tile->capctx.solcap_fd_0,
129+
(uint)tile->capctx.solcap_fd_1,
130+
(uint)tile->capctx.solcap_fd_2 );
110131
return sock_filter_policy_fd_capture_tile_instr_cnt;
111132
}
112133

@@ -120,8 +141,20 @@ populate_allowed_fds( fd_topo_t const * topo FD_PARAM_UNUSED,
120141
out_fds[ out_cnt++ ] = 2; /* stderr */
121142
if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) )
122143
out_fds[ out_cnt++ ] = fd_log_private_logfile_fd();
123-
if( FD_LIKELY( -1!=tile->capctx.solcap_fd ) )
124-
out_fds[ out_cnt++ ] = tile->capctx.solcap_fd;
144+
145+
if( tile->capctx.recent_only ) {
146+
/* In recent_only mode, allow all 3 rotating file descriptors */
147+
if( FD_LIKELY( -1!=tile->capctx.solcap_fd_0 ) )
148+
out_fds[ out_cnt++ ] = tile->capctx.solcap_fd_0;
149+
if( FD_LIKELY( -1!=tile->capctx.solcap_fd_1 ) )
150+
out_fds[ out_cnt++ ] = tile->capctx.solcap_fd_1;
151+
if( FD_LIKELY( -1!=tile->capctx.solcap_fd_2 ) )
152+
out_fds[ out_cnt++ ] = tile->capctx.solcap_fd_2;
153+
} else {
154+
/* Traditional single file mode */
155+
if( FD_LIKELY( -1!=tile->capctx.solcap_fd ) )
156+
out_fds[ out_cnt++ ] = tile->capctx.solcap_fd;
157+
}
125158

126159
return out_cnt;
127160
}
@@ -157,7 +190,7 @@ fd_capctx_buf_process_msg(fd_capture_ctx_t * capture_ctx,
157190
break;
158191
}
159192
default:
160-
FD_LOG_ERR(( "Unknown signal: %d", msg_hdr->sig ));
193+
_capture_failure( "Unknown signal received in message processing" );
161194
break;
162195
}
163196
return block_len;
@@ -192,6 +225,41 @@ returnable_frag( fd_capture_tile_ctx_t * ctx,
192225
actual_data = (char *)(data + sizeof(fd_solcap_buf_msg_t));
193226
ctx->msg_set_slot = msg_hdr->slot;
194227
ctx->msg_set_sig = SOLCAP_SIG_MAP(msg_hdr->sig);
228+
229+
/* Handle file rotation for recent_only mode */
230+
if( ctx->recent_only ) {
231+
if( ctx->recent_file_start_slot == ULONG_MAX ) {
232+
ctx->recent_file_start_slot = msg_hdr->slot;
233+
} else if( msg_hdr->slot >= ctx->recent_file_start_slot + ctx->recent_slots_per_file ) {
234+
/* Rotate once the incoming slot is at least recent_slots_per_file slots past the slot at which the current file was started */
235+
/* Rotate to next file */
236+
ulong next_idx = (ctx->recent_current_idx + 1) % 3;
237+
FILE * next_file = ctx->recent_files[next_idx];
238+
int next_fd = fileno(next_file);
239+
240+
/* The following is a series of checks to ensure the file is
241+
flushed and truncated correctly. This occurs via:
242+
1. Flushing the current file
243+
2. Flushing the next file
244+
3. Truncating the next file
245+
4. Resetting the file descriptor position to 0
246+
5. Resetting the FILE* stream position to 0
247+
6. Clearing any error indicators on the stream
248+
7. Reinitializing the solcap writer with the new file
249+
*/
250+
if( FD_UNLIKELY( fflush( ctx->file ) ) ) { _capture_failure( "fflush failed on current file during rotation" ); }
251+
if( FD_UNLIKELY( fflush( next_file ) ) ) { _capture_failure( "fflush failed on next file during rotation" ); }
252+
if( FD_UNLIKELY( ftruncate( next_fd, 0L ) != 0 ) ) { _capture_failure( "ftruncate failed during file rotation" ); }
253+
if( FD_UNLIKELY( lseek( next_fd, 0L, SEEK_SET ) == -1L ) ) { _capture_failure( "lseek failed during file rotation" ); }
254+
if( FD_UNLIKELY( fseek( next_file, 0L, SEEK_SET ) != 0 ) ) { _capture_failure( "fseek failed during file rotation" ); }
255+
256+
clearerr( next_file );
257+
fd_solcap_writer_init( ctx->capture_ctx->capture, next_file );
258+
ctx->recent_current_idx = next_idx;
259+
ctx->recent_file_start_slot = msg_hdr->slot;
260+
ctx->file = next_file;
261+
}
262+
}
195263
} else {
196264
msg_hdr_storage.sig = ctx->msg_set_sig;
197265
msg_hdr_storage.slot = ctx->msg_set_slot;
@@ -232,7 +300,6 @@ returnable_frag( fd_capture_tile_ctx_t * ctx,
232300
static void
233301
privileged_init( fd_topo_t * topo,
234302
fd_topo_tile_t * tile ) {
235-
236303
void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
237304
FD_SCRATCH_ALLOC_INIT( l, scratch );
238305
fd_capture_tile_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_capture_tile_ctx_t), sizeof(fd_capture_tile_ctx_t) );
@@ -243,17 +310,77 @@ privileged_init( fd_topo_t * topo,
243310
ctx->capture_ctx = fd_capture_ctx_join( fd_capture_ctx_new( _capture_ctx ) );
244311
FD_TEST( ctx->capture_ctx );
245312

246-
tile->capctx.solcap_fd = open( tile->capctx.solcap_capture, O_RDWR | O_CREAT | O_TRUNC, 0644 );
247-
if( FD_UNLIKELY( tile->capctx.solcap_fd == -1 ) ) {
248-
FD_LOG_ERR(( "failed to open or create solcap capture file %s (%i-%s)",
249-
tile->capctx.solcap_capture, errno, strerror(errno) ));
250-
}
313+
ctx->recent_only = tile->capctx.recent_only;
314+
ctx->recent_slots_per_file = tile->capctx.recent_slots_per_file ? tile->capctx.recent_slots_per_file : 128UL;
315+
316+
struct stat path_stat;
317+
int stat_result = stat( tile->capctx.solcap_capture, &path_stat );
318+
319+
if( ctx->recent_only ) {
320+
/* recent_only=1: Ensure path is a directory, create if not exists */
321+
if( stat_result != 0 ) {
322+
if( FD_UNLIKELY( mkdir(tile->capctx.solcap_capture, 0755) != 0 ) ) {
323+
FD_LOG_ERR(( "solcap_recent_only=1 but could not create directory: %s (%i-%s)",
324+
tile->capctx.solcap_capture, errno, strerror(errno) ));
325+
}
326+
} else if( FD_UNLIKELY( !S_ISDIR(path_stat.st_mode) ) ) {
327+
FD_LOG_ERR(( "solcap_recent_only=1 but path is not a directory: %s", tile->capctx.solcap_capture ));
328+
}
329+
330+
ctx->recent_current_idx = 0;
331+
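ctx->recent_file_start_slot = 0UL; /* Meant to be set on first fragment. NOTE(review): returnable_frag only treats ULONG_MAX as "uninitialized", so 0UL is not recognized as unset and the first fragment takes the rotation path instead - confirm whether ULONG_MAX was intended */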
332+
333+
for( ulong i = 0; i < 3; i++ ) {
334+
char filepath[PATH_MAX];
335+
int ret = snprintf( filepath, PATH_MAX, "%s/recent_%lu.solcap", tile->capctx.solcap_capture, i );
336+
if( FD_UNLIKELY( ret<0 || ret>=PATH_MAX ) ) {
337+
FD_LOG_ERR(( "snprintf failed or path too long for recent file %lu", i ));
338+
}
339+
340+
ctx->recent_fds[i] = open( filepath, O_RDWR | O_CREAT | O_TRUNC, 0644 );
341+
if( FD_UNLIKELY( ctx->recent_fds[i] == -1 ) ) {
342+
FD_LOG_ERR(( "failed to open or create solcap recent file %s (%i-%s)",
343+
filepath, errno, strerror(errno) ));
344+
}
345+
346+
ctx->recent_files[i] = fdopen( ctx->recent_fds[i], "w+" );
347+
if( FD_UNLIKELY( !ctx->recent_files[i] ) ) {
348+
FD_LOG_ERR(( "failed to fdopen solcap recent file descriptor %d (%i-%s)",
349+
ctx->recent_fds[i], errno, strerror(errno) ));
350+
}
351+
}
352+
353+
ctx->file = ctx->recent_files[0];
354+
tile->capctx.solcap_fd = ctx->recent_fds[0];
355+
356+
tile->capctx.solcap_fd_0 = ctx->recent_fds[0];
357+
tile->capctx.solcap_fd_1 = ctx->recent_fds[1];
358+
tile->capctx.solcap_fd_2 = ctx->recent_fds[2];
359+
360+
} else {
361+
/* recent_only=0: Reject a path that already exists as a directory (a file path is expected) */
362+
if( FD_UNLIKELY( stat_result == 0 && S_ISDIR(path_stat.st_mode) ) ) {
363+
FD_LOG_ERR(( "solcap_recent_only=0 but path is a directory: %s (should be a file path)", tile->capctx.solcap_capture ));
364+
}
365+
366+
tile->capctx.solcap_fd = open( tile->capctx.solcap_capture, O_RDWR | O_CREAT | O_TRUNC, 0644 );
367+
if( FD_UNLIKELY( tile->capctx.solcap_fd == -1 ) ) {
368+
FD_LOG_ERR(( "failed to open or create solcap capture file %s (%i-%s)",
369+
tile->capctx.solcap_capture, errno, strerror(errno) ));
370+
}
251371

252-
ctx->file = fdopen( tile->capctx.solcap_fd, "w+" );
253-
if( FD_UNLIKELY( !ctx->file ) ) {
254-
FD_LOG_ERR(( "failed to fdopen solcap capture file descriptor %d (%i-%s)",
255-
tile->capctx.solcap_fd, errno, strerror(errno) ));
372+
ctx->file = fdopen( tile->capctx.solcap_fd, "w+" );
373+
if( FD_UNLIKELY( !ctx->file ) ) {
374+
FD_LOG_ERR(( "failed to fdopen solcap capture file descriptor %d (%i-%s)",
375+
tile->capctx.solcap_fd, errno, strerror(errno) ));
376+
}
377+
378+
/* Store same FD for all 3 slots in single file mode */
379+
tile->capctx.solcap_fd_0 = tile->capctx.solcap_fd;
380+
tile->capctx.solcap_fd_1 = tile->capctx.solcap_fd;
381+
tile->capctx.solcap_fd_2 = tile->capctx.solcap_fd;
256382
}
383+
257384
FD_TEST( ctx->capture_ctx->capture );
258385

259386
ctx->capture_ctx->solcap_start_slot = tile->capctx.capture_start_slot;

src/discof/capture/fd_capture_tile.seccomppolicy

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,46 @@
11
# logfile_fd: It can be disabled by configuration, but typically tiles
22
# will open a log file on boot and write all messages there.
33
#
4-
# solcap_fd: The file that the solcap capture will be written to
5-
unsigned int logfile_fd, uint solcap_fd
4+
# solcap_fd_0: The first solcap capture file (single file mode or recent_0.solcap)
5+
# solcap_fd_1: The second solcap capture file (recent_1.solcap in recent_only mode)
6+
# solcap_fd_2: The third solcap capture file (recent_2.solcap in recent_only mode)
7+
unsigned int logfile_fd, uint solcap_fd_0, uint solcap_fd_1, uint solcap_fd_2
68

79
# logging: all log messages are written to a file and/or pipe
810
#
911
# 'WARNING' and above are written to the STDERR pipe, while all messages
1012
# are always written to the log file.
1113
#
12-
# The capture tile writes to the solcap file.
14+
# The capture tile writes to the solcap file(s).
1315
#
1416
# arg 0 is the file descriptor to write to. The boot process ensures
1517
# that descriptor 2 is always STDERR.
1618
write: (or (eq (arg 0) 2)
1719
(eq (arg 0) logfile_fd)
18-
(eq (arg 0) solcap_fd))
20+
(eq (arg 0) solcap_fd_0)
21+
(eq (arg 0) solcap_fd_1)
22+
(eq (arg 0) solcap_fd_2))
1923

2024
# fflush() requires fstat to check file state before flushing
2125
#
2226
# arg 0 is the file descriptor to fstat.
23-
fstat: (eq (arg 0) solcap_fd)
27+
fstat: (or (eq (arg 0) solcap_fd_0)
28+
(eq (arg 0) solcap_fd_1)
29+
(eq (arg 0) solcap_fd_2))
2430

2531
# fflush()/fseek() may use lseek to manage file position, and file rotation calls lseek directly to rewind the next capture file
2632
#
2733
# arg 0 is the file descriptor to lseek.
28-
lseek: (eq (arg 0) solcap_fd)
34+
lseek: (or (eq (arg 0) solcap_fd_0)
35+
(eq (arg 0) solcap_fd_1)
36+
(eq (arg 0) solcap_fd_2))
37+
38+
# ftruncate: used to clear/reset rotating capture files
39+
#
40+
# arg 0 is the file descriptor to truncate.
41+
ftruncate: (or (eq (arg 0) solcap_fd_0)
42+
(eq (arg 0) solcap_fd_1)
43+
(eq (arg 0) solcap_fd_2))
2944

3045
# logging: 'WARNING' and above fsync the logfile to disk immediately
3146
#

0 commit comments

Comments
 (0)