link launch and sync conda/workspace locations #742

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Status: Open. Wants to merge 3 commits into base: main.
38 changes: 38 additions & 0 deletions monarch_conda/Cargo.toml
@@ -0,0 +1,38 @@
# @generated by autocargo from //monarch/monarch_conda:[conda-sync-cli,monarch_conda]

[package]
name = "monarch_conda"
version = "0.0.0"
authors = ["Meta"]
edition = "2021"
license = "BSD-3-Clause"

[[bin]]
name = "conda_sync_cli"
path = "src/main.rs"

[dependencies]
anyhow = "1.0.98"
async-tempfile = "0.7.0"
bincode = "1.3.3"
chrono = { version = "0.4.41", features = ["clock", "serde", "std"], default-features = false }
clap = { version = "4.5.41", features = ["derive", "env", "string", "unicode", "wrap_help"] }
dashmap = { version = "5.5.3", features = ["rayon", "serde"] }
digest = "0.10"
filetime = "0.2.25"
futures = { version = "0.3.31", features = ["async-await", "compat"] }
globset = { version = "0.4.13", features = ["serde1"] }
ignore = "0.4"
itertools = "0.14.0"
memchr = "2.7.5"
memmap2 = "0.9.5"
rattler_conda_types = "0.28.3"
serde = { version = "1.0.219", features = ["derive", "rc"] }
serde_json = { version = "1.0.140", features = ["alloc", "float_roundtrip", "unbounded_depth"] }
sha2 = "0.10.6"
tokio = { version = "1.46.1", features = ["full", "test-util", "tracing"] }
tokio-util = { version = "0.7.15", features = ["full"] }
walkdir = "2.3"

[dev-dependencies]
tempfile = "3.15"
158 changes: 158 additions & 0 deletions monarch_conda/src/diff.rs
@@ -0,0 +1,158 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

use std::collections::HashMap;
use std::path::Path;
use std::path::PathBuf;
use std::time::Duration;
use std::time::SystemTime;
use std::time::UNIX_EPOCH;

use anyhow::Context;
use anyhow::Result;
use anyhow::ensure;
use chrono::DateTime;
use chrono::Utc;
use digest::Digest;
use digest::Output;
use rattler_conda_types::PrefixRecord;
use rattler_conda_types::prefix_record::PathsEntry;
use serde::Deserialize;
use serde::Serialize;
use serde_json;
use sha2::Sha256;
use tokio::fs;
use walkdir::WalkDir;

use crate::hash_utils;
use crate::pack_meta::History;
use crate::pack_meta::Offsets;

/// Fingerprint of the conda-meta directory, used by `CondaFingerprint` below.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct CondaMetaFingerprint {
// TODO(agallagher): It might be worth storing more information about installed
// packages, so that we could print better error messages when we detect two
// envs are not equivalent.
hash: Output<Sha256>,
}

impl CondaMetaFingerprint {
async fn from_env(path: &Path) -> Result<Self> {
let mut hasher = Sha256::new();
hash_utils::hash_directory_tree(&path.join("conda-meta"), &mut hasher).await?;
Ok(Self {
hash: hasher.finalize(),
})
}
}

/// Fingerprint of the pack-meta directory, used by `CondaFingerprint` below.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct PackMetaFingerprint {
offsets: Output<Sha256>,
pub history: History,
}

impl PackMetaFingerprint {
async fn from_env(path: &Path) -> Result<Self> {
let pack_meta = path.join("pack-meta");

// Read the full history.jsonl file.
let contents = fs::read_to_string(pack_meta.join("history.jsonl")).await?;
let history = History::from_contents(&contents)?;

// Read the entire offsets.jsonl file, but avoid hashing the offsets themselves, which can change.
let mut hasher = Sha256::new();
let contents = fs::read_to_string(pack_meta.join("offsets.jsonl")).await?;
let offsets = Offsets::from_contents(&contents)?;
for ent in offsets.entries {
let contents = bincode::serialize(&(ent.path, ent.mode, ent.offsets.len()))?;
hasher.update(contents.len().to_le_bytes());
hasher.update(&contents);
}
let offsets = hasher.finalize();

Ok(Self { history, offsets })
}
}

/// A fingerprint of a conda environment, used to detect if two envs are similar enough to
/// facilitate mtime-based conda syncing.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct CondaFingerprint {
pub conda_meta: CondaMetaFingerprint,
pub pack_meta: PackMetaFingerprint,
}

impl CondaFingerprint {
pub async fn from_env(path: &Path) -> Result<Self> {
Ok(Self {
conda_meta: CondaMetaFingerprint::from_env(path).await?,
pack_meta: PackMetaFingerprint::from_env(path).await?,
})
}

/// Create a comparator to compare the mtimes of files from two "equivalent" conda envs.
/// In particular, this comparator is aware of spurious mtime changes that occur from
/// prefix replacement (via `meta-pack`), and will filter them out.
pub fn mtime_comparator(
a: &Self,
b: &Self,
) -> Result<Box<dyn Fn(&SystemTime, &SystemTime) -> std::cmp::Ordering + Send + Sync>> {
let (a_prefix, a_base) = a.pack_meta.history.first()?;
let (b_prefix, b_base) = b.pack_meta.history.first()?;
ensure!(a_prefix == b_prefix);

// NOTE(agallagher): There appears to be some mtime drift on some files after fbpkg creation,
// so account for that here.
let slop = Duration::from_secs(5 * 60);

// We load the timestamp from the first history entry, and use this to see if any
// files have been updated since the env was created.
let a_base = UNIX_EPOCH + Duration::from_secs(a_base) + slop;
let b_base = UNIX_EPOCH + Duration::from_secs(b_base) + slop;

// We also load the last prefix update window for each, as any mtimes from this window
// should be ignored.
let a_window = a
.pack_meta
.history
.prefix_and_last_update_window()?
.1
.map(|(s, e)| {
(
UNIX_EPOCH + Duration::from_secs(s),
UNIX_EPOCH + Duration::from_secs(e + 1),
)
});
let b_window = b
.pack_meta
.history
.prefix_and_last_update_window()?
.1
.map(|(s, e)| {
(
UNIX_EPOCH + Duration::from_secs(s),
UNIX_EPOCH + Duration::from_secs(e + 1),
)
});

Ok(Box::new(move |a: &SystemTime, b: &SystemTime| {
match (
*a > a_base && a_window.is_none_or(|(s, e)| *a < s || *a > e),
*b > b_base && b_window.is_none_or(|(s, e)| *b < s || *b > e),
) {
(true, false) => std::cmp::Ordering::Greater,
(false, true) => std::cmp::Ordering::Less,
(false, false) => std::cmp::Ordering::Equal,
(true, true) => a.cmp(b),
}
}))
}
}
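
To make the intended call pattern concrete, here is a minimal usage sketch, not part of this diff: it fingerprints two envs and uses the resulting comparator to decide whether a given file looks newer in one env than in the other, ignoring mtime noise from prefix replacement. The env paths and the `compare_file` helper are hypothetical, and the crate path assumes the library is consumed as `monarch_conda`; only `CondaFingerprint::from_env` and `mtime_comparator` come from this change.

use std::cmp::Ordering;
use std::path::Path;

use anyhow::Result;
use monarch_conda::diff::CondaFingerprint;

/// Hypothetical helper: compare the mtime of `rel_path` in env `a` against env `b`,
/// filtering out mtime noise introduced by prefix replacement.
async fn compare_file(a_env: &Path, b_env: &Path, rel_path: &str) -> Result<Ordering> {
    // Fingerprint both envs; `mtime_comparator` also checks they share the same base prefix.
    let a_fp = CondaFingerprint::from_env(a_env).await?;
    let b_fp = CondaFingerprint::from_env(b_env).await?;
    let cmp = CondaFingerprint::mtime_comparator(&a_fp, &b_fp)?;

    // Read the mtimes of the file from each env.
    let a_mtime = tokio::fs::metadata(a_env.join(rel_path)).await?.modified()?;
    let b_mtime = tokio::fs::metadata(b_env.join(rel_path)).await?.modified()?;

    // `Ordering::Greater` means the file in env `a` is considered newer than in env `b`.
    Ok(cmp(&a_mtime, &b_mtime))
}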
162 changes: 162 additions & 0 deletions monarch_conda/src/hash_utils.rs
@@ -0,0 +1,162 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

use std::path::Path;

use anyhow::Result;
use anyhow::bail;
use digest::Digest;
use tokio::fs;
use walkdir::WalkDir;

/// Compute a hash of a directory tree using the provided hasher.
///
/// This function traverses the directory tree deterministically (sorted by file name)
/// and includes both file paths and file contents in the hash computation.
///
/// # Arguments
/// * `dir` - The directory to hash
/// * `hasher` - A hasher implementing the Digest trait (e.g., Sha256::new())
///
/// # Returns
/// () - The hasher is updated with the directory tree data
pub async fn hash_directory_tree<D: Digest>(dir: &Path, hasher: &mut D) -> Result<()> {
// Iterate entries with deterministic ordering
for entry in WalkDir::new(dir).sort_by_file_name().into_iter() {
let entry = entry?;
let path = entry.path();
let relative_path = path.strip_prefix(dir)?;

// Hash the relative path (normalized to use forward slashes)
let path_str = relative_path.to_string_lossy().replace('\\', "/");
hasher.update(path_str.as_bytes());
hasher.update(b"\0"); // null separator

if entry.file_type().is_file() {
// Hash file type marker, size, and contents
hasher.update(b"FILE:");
let contents = fs::read(path).await?;
hasher.update(contents.len().to_le_bytes());
hasher.update(&contents);
} else if entry.file_type().is_dir() {
// For directories, hash a type marker
hasher.update(b"DIR:");
} else if entry.file_type().is_symlink() {
// For symlinks, hash type marker, target size, and target
hasher.update(b"SYMLINK:");
let target = fs::read_link(path).await?;
let target_string = target.to_string_lossy().into_owned();
let target_bytes = target_string.as_bytes();
hasher.update(target_bytes.len().to_le_bytes());
hasher.update(target_bytes);
} else {
// Unexpected file type
bail!("Unexpected file type for path: {}", path.display());
}

hasher.update(b"\n"); // entry separator
}

Ok(())
}

#[cfg(test)]
mod tests {
use sha2::Sha256;
use tempfile::TempDir;
use tokio::fs;

use super::*;

#[tokio::test]
async fn test_hash_directory_tree() -> Result<()> {
// Create a temporary directory with some test files
let temp_dir = TempDir::new()?;
let dir_path = temp_dir.path();

// Create test files
fs::write(dir_path.join("file1.txt"), "Hello, world!").await?;
fs::write(dir_path.join("file2.txt"), "Another file").await?;
fs::create_dir(dir_path.join("subdir")).await?;
fs::write(dir_path.join("subdir").join("file3.txt"), "Nested file").await?;

// Hash the directory
let mut hasher1 = Sha256::new();
let mut hasher2 = Sha256::new();
hash_directory_tree(dir_path, &mut hasher1).await?;
hash_directory_tree(dir_path, &mut hasher2).await?;

let hash1 = hasher1.finalize();
let hash2 = hasher2.finalize();

// Should be deterministic
assert_eq!(hash1, hash2);
assert_eq!(hash1.len(), 32); // SHA256 raw bytes length

Ok(())
}

#[tokio::test]
async fn test_no_hash_collision_between_file_and_dir() -> Result<()> {
// Test that a file containing "DIR:" and an empty directory don't collide
let temp_dir1 = TempDir::new()?;
let temp_dir2 = TempDir::new()?;

// Create a file with content that could collide with directory marker
fs::write(temp_dir1.path().join("test"), "DIR:").await?;

// Create an empty directory with the same name
fs::create_dir(temp_dir2.path().join("test")).await?;

// Hash both scenarios
let mut hasher_file = Sha256::new();
let mut hasher_dir = Sha256::new();
hash_directory_tree(temp_dir1.path(), &mut hasher_file).await?;
hash_directory_tree(temp_dir2.path(), &mut hasher_dir).await?;

let hash_file = hasher_file.finalize();
let hash_dir = hasher_dir.finalize();

// Should be different due to type prefixes
assert_ne!(hash_file, hash_dir);

Ok(())
}

#[tokio::test]
async fn test_no_structural_marker_collision() -> Result<()> {
// Test that files containing our structural markers don't cause collisions
let temp_dir1 = TempDir::new()?;
let temp_dir2 = TempDir::new()?;

// Create a file that could potentially collide without size prefixes:
// Path: "test1", Content: "foo\n"
// Without size prefixes: test1\0FILE:foo\n\n
fs::write(temp_dir1.path().join("test1"), "foo\n").await?;

// Create a file with path that includes our structural markers:
// Path: "test1\nFILE:", Content: "foo\n"
// Without size prefixes: test1\nFILE:\0FILE:foo\n\n
// This could potentially collide with the above
fs::write(temp_dir2.path().join("test1\nFILE:"), "foo\n").await?;

// Hash both scenarios
let mut hasher1 = Sha256::new();
let mut hasher2 = Sha256::new();
hash_directory_tree(temp_dir1.path(), &mut hasher1).await?;
hash_directory_tree(temp_dir2.path(), &mut hasher2).await?;

let hash1 = hasher1.finalize();
let hash2 = hasher2.finalize();

// Should be different - size prefixes prevent structural marker confusion
assert_ne!(hash1, hash2);

Ok(())
}
}
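
As a companion sketch, not part of this diff: one way a caller might render the directory hash as a printable digest, e.g. to log it or to compare environments across hosts. The `directory_digest_hex` helper is hypothetical, and the crate path assumes the library is consumed as `monarch_conda`; only `hash_directory_tree` comes from this change.

use std::path::Path;

use anyhow::Result;
use digest::Digest;
use sha2::Sha256;

use monarch_conda::hash_utils::hash_directory_tree;

/// Hypothetical helper: hash a directory tree and return the digest as a hex string.
async fn directory_digest_hex(dir: &Path) -> Result<String> {
    let mut hasher = Sha256::new();
    hash_directory_tree(dir, &mut hasher).await?;
    // Render the 32 raw digest bytes as 64 hex characters.
    Ok(hasher.finalize().iter().map(|b| format!("{:02x}", b)).collect())
}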
14 changes: 14 additions & 0 deletions monarch_conda/src/lib.rs
@@ -0,0 +1,14 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#![feature(once_cell_try)]

pub mod diff;
pub mod hash_utils;
pub mod pack_meta;
pub mod sync;