-
Notifications
You must be signed in to change notification settings - Fork 9
Remove HF_TOKEN dependency in E2E test #357
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
df5f067
aefad4a
0560ffb
5feb9dd
a0f68c1
dca490c
19da2b1
1cf724b
f23f46d
c3edee2
15ef3eb
0e59528
418f726
766afb3
91d71b8
14dc419
c728389
b0f7562
b33a9da
a57b6b1
aeef24a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
"""Utilities for preparing Hugging Face assets (models and tokenizers) for GCS.""" | ||
|
||
import logging | ||
import os | ||
import subprocess | ||
import tempfile | ||
from pathlib import Path | ||
|
||
from huggingface_hub import snapshot_download | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
# Glob patterns handed to `snapshot_download(allow_patterns=...)`.
# Tokenizer assets: covers both fast tokenizers (tokenizer.json) and
# slow/sentencepiece ones (*.model, vocab.txt, merges.txt).
TOKENIZER_PATTERNS = [
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "*.model",
    "vocab.txt",
    "merges.txt",
]

# Model weight and config assets; "*.safetensors*" also matches sharded
# weight files and their index (e.g. model.safetensors.index.json).
MODEL_PATTERNS = [
    "*.safetensors*",
    "config.json",
    "generation_config.json",
]
|
||
|
||
def _upload_directory_to_gcs(local_path: Path, gcs_path: str): | ||
"""Uploads the contents of a local directory to GCS using gsutil. | ||
Args: | ||
local_path: The local directory whose contents will be uploaded. | ||
gcs_path: The destination GCS path (e.g., 'gs://my-bucket/models/'). | ||
""" | ||
if not gcs_path.startswith("gs://"): | ||
raise ValueError("GCS path must start with gs://") | ||
|
||
logger.info(f"Uploading contents of '{local_path}' to '{gcs_path}'...") | ||
command = ["gsutil", "-m", "cp", "-r", f"{str(local_path).rstrip('/')}/*", gcs_path] | ||
try: | ||
subprocess.run(command, check=True, capture_output=True, text=True) | ||
logger.info(f"Successfully uploaded assets to {gcs_path}.") | ||
except subprocess.CalledProcessError as e: | ||
logger.error(f"Failed to upload {local_path} to {gcs_path}. Error: {e.stderr}") | ||
raise | ||
|
||
|
||
def save_hf_model_files_to_gcs( | ||
repo_id: str, | ||
gcs_path: str, | ||
file_type: str, | ||
temp_dir: str | None = None, | ||
): | ||
"""Downloads model or tokenizer files from Hugging Face and uploads them to GCS. | ||
This function uses `huggingface_hub.snapshot_download` to fetch specific | ||
files based on predefined patterns for models and tokenizers. The downloaded | ||
files are then uploaded to the specified GCS path. | ||
Args: | ||
repo_id: The ID of the Hugging Face repository (e.g., 'meta-llama/Llama-3-8B-hf'). | ||
gcs_path: The target GCS path for the files (e.g., 'gs://bucket/models/Llama-3-8B-hf'). | ||
file_type: The type of files to download. Must be one of 'tokenizer', | ||
'model', or 'all'. | ||
temp_dir: An optional path to a temporary directory for downloading. If | ||
None, the system's default temporary directory is used. | ||
Raises: | ||
ValueError: If an invalid `file_type` is provided. | ||
""" | ||
allow_patterns = [] | ||
if file_type in ("tokenizer", "all"): | ||
allow_patterns.extend(TOKENIZER_PATTERNS) | ||
if file_type in ("model", "all"): | ||
allow_patterns.extend(MODEL_PATTERNS) | ||
|
||
if not allow_patterns: | ||
raise ValueError("file_type must be one of 'tokenizer', 'model', or 'all'") | ||
|
||
with tempfile.TemporaryDirectory(dir=temp_dir) as tmpdir: | ||
logger.info(f"Created temporary directory: {tmpdir}") | ||
|
||
logger.info(f"Downloading files for '{repo_id}' with patterns: {allow_patterns}") | ||
snapshot_path = snapshot_download( | ||
repo_id=repo_id, | ||
cache_dir=str(tmpdir), | ||
token=os.environ.get("HF_TOKEN"), | ||
allow_patterns=allow_patterns, | ||
) | ||
|
||
logger.info(f"Files for '{repo_id}' downloaded locally to '{snapshot_path}'.") | ||
|
||
_upload_directory_to_gcs(Path(snapshot_path), gcs_path) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -113,6 +113,8 @@ def from_pretrained(self, model_path_or_repo: str): | |
Args: | ||
model_path_or_repo: Path to the local directory or Hugging Face Hub repository ID. | ||
""" | ||
model_path_or_repo = model_utils.copy_gcs_to_local(model_path_or_repo) | ||
|
||
if os.path.isdir(model_path_or_repo): | ||
model_dir = model_path_or_repo | ||
else: | ||
|
@@ -153,8 +155,13 @@ def _maybe_save_checkpoint(self, config: DictConfig) -> None: | |
# Step 3: Save the HF config files and tokenizer | ||
if xr.process_index() == 0: | ||
logger.info("Saving Hugging Face configs and tokenizer to %s", save_dir) | ||
model_utils.copy_hf_config_files(config.model.pretrained_model, save_dir) | ||
model_utils.save_hf_tokenizer(config.model.pretrained_model, save_dir) | ||
# Copy to local if in GCS | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you explain why it is necessary? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was needed because the GCS bucket that we load the tokenizer from is not mounted by gcsfuse. The bucket we mount in |
||
tokenizer_path_or_repo = model_utils.copy_gcs_to_local( | ||
config.model.tokenizer_name | ||
) | ||
model_path_or_repo = model_utils.copy_gcs_to_local(config.model.pretrained_model) | ||
model_utils.copy_hf_config_files(tokenizer_path_or_repo, save_dir) | ||
model_utils.save_hf_tokenizer(model_path_or_repo, save_dir) | ||
|
||
# Step 4: Initialize torch.distributed process group | ||
if not dist.is_initialized(): | ||
|
Uh oh!
There was an error while loading. Please reload this page.