From 2c417fa0cb0294d1502041080a1dc51b4c905c42 Mon Sep 17 00:00:00 2001 From: droidnoob Date: Tue, 26 May 2026 01:51:35 +0530 Subject: [PATCH] fix(process): share ETXTBSY retry helper across git/bd/os spawns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Linux "Text file busy" race that the git module already handled (commit 2368c7a) also hits the brew-install probe in os.rs — the 0.8.1 CI surfaced it as `macos_with_brew_propagates_failure` panicking because the spawned brew stub raced its own write fd. Promote the retry helper to a shared `hew_core::process` module and route every in-tree spawn site through it: - hew_core::process::spawn_with_etxtbsy_retry — new shared module. pub(crate) so the binary doesn't see it but every hew-core module can. Exponential backoff up to ~150ms total, then a final attempt. - hew_core::git: drop the local copy, import from process. - hew_core::os::run_brew_install_git: use the helper (fixes the failing test). - hew_core::bd::RealBd::run / run_to_file_inner: use the helper for defense in depth — bd's tests don't stub-install today, but the same race applies any time the binary itself gets rewritten under a concurrent reader (hew init refresh during a parallel run). Verification: - Failing test `macos_with_brew_propagates_failure`: now passes 10/10 in a stress loop. - cargo test (default + --no-default-features): green. - cargo clippy --all-targets -- -D warnings: clean. - cargo fmt --all -- --check: clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- hew-core/src/bd.rs | 5 +++-- hew-core/src/git.rs | 30 ++------------------------ hew-core/src/lib.rs | 1 + hew-core/src/os.rs | 3 ++- hew-core/src/process.rs | 48 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 56 insertions(+), 31 deletions(-) create mode 100644 hew-core/src/process.rs diff --git a/hew-core/src/bd.rs b/hew-core/src/bd.rs index 68fa13f..4e7731d 100644 --- a/hew-core/src/bd.rs +++ b/hew-core/src/bd.rs @@ -17,6 +17,7 @@ use serde::{Deserialize, Serialize}; use tracing::debug; use crate::error::{HewError, Result}; +use crate::process::spawn_with_etxtbsy_retry; /// Soft default timeout for any single `bd` invocation. pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30); @@ -133,7 +134,7 @@ impl RealBd { let mut cmd = Command::new(&self.path); cmd.args(args).stdin(Stdio::null()).stdout(Stdio::piped()).stderr(Stdio::piped()); - let mut child = cmd.spawn()?; + let mut child = spawn_with_etxtbsy_retry(&mut cmd)?; let status = match child.wait_timeout(self.timeout)? { Some(s) => s, @@ -200,7 +201,7 @@ impl RealBd { let mut cmd = Command::new(&self.path); cmd.args(args).stdin(Stdio::null()).stdout(Stdio::from(file)).stderr(Stdio::piped()); - let mut child = cmd.spawn()?; + let mut child = spawn_with_etxtbsy_retry(&mut cmd)?; let status = match child.wait_timeout(self.timeout)? { Some(s) => s, diff --git a/hew-core/src/git.rs b/hew-core/src/git.rs index 64e10d8..484be6a 100644 --- a/hew-core/src/git.rs +++ b/hew-core/src/git.rs @@ -5,43 +5,17 @@ use std::ffi::{OsStr, OsString}; use std::path::PathBuf; -use std::process::{Child, Command, Stdio}; +use std::process::{Command, Stdio}; use std::time::Duration; use tracing::debug; use wait_timeout::ChildExt; use crate::error::{HewError, Result}; +use crate::process::spawn_with_etxtbsy_retry; const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30); -/// Spawn a `Command`, retrying on `ExecutableFileBusy` (Linux `ETXTBSY`). -/// -/// Background: when parallel threads each write+exec their own stub -/// binary (the test harness pattern), one thread's writable fd to its -/// stub temp file leaks into another thread's child via `fork()` even -/// when the fd is `O_CLOEXEC` — `O_CLOEXEC` only fires on `exec`, not -/// `fork`. The kernel then sees an outstanding writer on the inode and -/// the child's `exec` trips `ETXTBSY` (errno 26). Same race can happen -/// in production any time `hew init` rewrites bundled artifacts during -/// concurrent reads. Exponential backoff up to ~150ms total handles the -/// transient window without callers needing to care. -fn spawn_with_etxtbsy_retry(cmd: &mut Command) -> std::io::Result { - use std::io::ErrorKind; - let mut delay_ms = 5u64; - for _ in 0..5 { - match cmd.spawn() { - Ok(c) => return Ok(c), - Err(e) if e.kind() == ErrorKind::ExecutableFileBusy => { - std::thread::sleep(Duration::from_millis(delay_ms)); - delay_ms *= 2; - } - Err(e) => return Err(e), - } - } - cmd.spawn() -} - #[derive(Debug, Clone)] pub struct GitOutput { pub stdout: String, diff --git a/hew-core/src/lib.rs b/hew-core/src/lib.rs index 8b33ace..02d2a72 100644 --- a/hew-core/src/lib.rs +++ b/hew-core/src/lib.rs @@ -22,6 +22,7 @@ pub mod memories; pub mod notify; pub mod os; pub mod prime; +pub(crate) mod process; pub mod review; pub mod skills; pub mod slash; diff --git a/hew-core/src/os.rs b/hew-core/src/os.rs index b644082..0c299b9 100644 --- a/hew-core/src/os.rs +++ b/hew-core/src/os.rs @@ -13,6 +13,7 @@ use tracing::debug; use wait_timeout::ChildExt; use crate::error::{HewError, Result}; +use crate::process::spawn_with_etxtbsy_retry; const BREW_INSTALL_TIMEOUT: Duration = Duration::from_secs(600); @@ -137,7 +138,7 @@ fn run_brew_install_git(brew: &std::path::Path) -> Result<()> { .stdin(Stdio::null()) .stdout(Stdio::piped()) .stderr(Stdio::piped()); - let mut child = cmd.spawn()?; + let mut child = spawn_with_etxtbsy_retry(&mut cmd)?; let status = match child.wait_timeout(BREW_INSTALL_TIMEOUT)? { Some(s) => s, None => { diff --git a/hew-core/src/process.rs b/hew-core/src/process.rs new file mode 100644 index 0000000..fad4003 --- /dev/null +++ b/hew-core/src/process.rs @@ -0,0 +1,48 @@ +//! Subprocess spawn helpers shared across hew-core. +//! +//! The single helper here, [`spawn_with_etxtbsy_retry`], wraps +//! [`std::process::Command::spawn`] with a brief retry loop on +//! `ETXTBSY` (Linux's "Text file busy" — `ErrorKind::ExecutableFileBusy`). +//! +//! ## Why this exists +//! +//! When parallel threads each write+exec their own stub binary (the +//! test harness pattern used in `hew_core::testing::install_executable_stub`), +//! one thread's writable fd to its stub temp file leaks into another +//! thread's child via `fork()` even when the fd is `O_CLOEXEC` — +//! `O_CLOEXEC` only fires on `exec`, not `fork`. The kernel then sees +//! an outstanding writer on the inode and the child's `exec` trips +//! `ETXTBSY` (errno 26). +//! +//! Same race can happen in production any time `hew init` rewrites +//! bundled artifacts during concurrent reads. Exponential backoff up to +//! ~150ms total handles the transient window without callers needing +//! to care. +//! +//! Defense in depth: every spawn path in hew-core (git, bd, os +//! installer probes) should go through this helper. The cost on the +//! happy path is one extra `match`; the cost on the failing path is a +//! handful of millisecond-scale sleeps that resolve a race the caller +//! would otherwise surface as a confusing test flake. + +use std::process::{Child, Command}; +use std::time::Duration; + +/// Spawn `cmd`, retrying briefly on `ETXTBSY` (Linux "Text file busy"). +/// Returns the spawned `Child` on success or the underlying io error +/// on the final attempt. +pub(crate) fn spawn_with_etxtbsy_retry(cmd: &mut Command) -> std::io::Result { + use std::io::ErrorKind; + let mut delay_ms = 5u64; + for _ in 0..5 { + match cmd.spawn() { + Ok(c) => return Ok(c), + Err(e) if e.kind() == ErrorKind::ExecutableFileBusy => { + std::thread::sleep(Duration::from_millis(delay_ms)); + delay_ms *= 2; + } + Err(e) => return Err(e), + } + } + cmd.spawn() +}