-
Notifications
You must be signed in to change notification settings - Fork 7
feat(adapter/nemo): cleanup checkpoint container on_train_end #94
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
fc2251b
aaf42d0
1897d3d
66786fb
9afab16
cb755a1
3b11591
4069374
d6792bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -239,10 +239,17 @@ def delete_container(self, container_id: CheckpointContainerId) -> None: | |
| """ | ||
| container_id = str(container_id) | ||
| _LOGGER.info("Starting deletion process for container: '%s'", container_id) | ||
|
|
||
| def _onerror(func, path, exc_info): | ||
| exc = exc_info[1] | ||
| if isinstance(exc, FileNotFoundError): | ||
| return | ||
| raise exc | ||
|
|
||
| try: | ||
| if os.path.isdir(container_id): | ||
| # Use shutil.rmtree for recursive deletion. | ||
| shutil.rmtree(container_id) | ||
| shutil.rmtree(container_id, onerror=_onerror) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a note: this
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As we delete_container before it's fully finished (save + replication), so the transfer service might make changes to the dir at the same time (in the receiver, we save to a tmp file first and then rename it), so there could be file not found error. |
||
| _LOGGER.info("Successfully deleted container directory: '%s'", container_id) | ||
| else: | ||
| # This is not an error; the directory might have already been deleted. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,10 @@ | |
|
|
||
| #include "object_manager.h" | ||
|
|
||
| #include <sys/types.h> | ||
| #include <sys/wait.h> | ||
| #include <unistd.h> | ||
|
|
||
| #include <filesystem> | ||
| #include <future> | ||
| #include <iostream> | ||
|
|
@@ -27,13 +31,35 @@ namespace ml_flashpoint::checkpoint_object_manager::object_manager { | |
| namespace fs = std::filesystem; | ||
|
|
||
| namespace { | ||
| // We use a fork/exec approach calling 'rm -rf' here instead of | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. interesting, is this the only way to get around that? do we know why there was a seg fault? |
||
| // std::filesystem::remove_all to address a Segmentation Fault | ||
| // observed in multi-threaded environments. This should be safer | ||
| // and avoids the crash experienced with std::filesystem operations. | ||
| // | ||
| // The actual deletion logic | ||
| void delete_directories_task(const std::vector<std::string>& directories) { | ||
| for (const std::string& dir_path : directories) { | ||
| try { | ||
| if (fs::is_directory(dir_path)) { | ||
| LOG(INFO) << "Removing directory " << dir_path << " ..."; | ||
| fs::remove_all(dir_path); | ||
| LOG(INFO) << "Removing directory " << dir_path << " via fork/exec..."; | ||
| pid_t pid = fork(); | ||
| if (pid == 0) { | ||
| // Child process | ||
| execlp("rm", "rm", "-rf", dir_path.c_str(), (char*)NULL); | ||
| // If execlp returns, it failed | ||
| std::cerr << "Failed to exec rm -rf for " << dir_path << std::endl; | ||
| exit(1); | ||
| } else if (pid > 0) { | ||
| // Parent process | ||
| int status; | ||
| waitpid(pid, &status, 0); | ||
| if (status != 0) { | ||
| LOG(ERROR) << "rm -rf failed for " << dir_path << " with status " | ||
| << status; | ||
| } | ||
| } else { | ||
| LOG(ERROR) << "Failed to fork for deleting " << dir_path; | ||
| } | ||
| } | ||
| } catch (const fs::filesystem_error& e) { | ||
| // It's important to handle errors inside the thread, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,6 +37,7 @@ | |
| #include <optional> | ||
| #include <queue> | ||
| #include <string> | ||
| #include <unordered_set> | ||
|
|
||
| namespace ml_flashpoint::replication::transfer_service { | ||
|
|
||
|
|
@@ -116,7 +117,8 @@ class ConnectionPool { | |
| std::string peer_host_; | ||
| int peer_port_; | ||
| size_t max_size_; | ||
| std::queue<int> available_connections_; // Guarded by mtx_. | ||
| std::queue<int> available_connections_; // Guarded by mtx_. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you explain the rationale for using a queue here and unordered set below? specifically why FIFO order is relevant for available_connections. also are these two collections mutually exclusive?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, they should be be mutually exclusive. For the queue usage for available_connections_, ideally no difference if we using a queue or a stack, just a way to track them and get one to use quickly. adding active_connections_ as we want to destroy all the alive connection during shutdown |
||
| std::unordered_set<int> active_connections_; // Guarded by mtx_. | ||
| std::mutex mtx_; // Protects available_connections_ and stopping_. | ||
| std::condition_variable | ||
| cv_; // Signaled when a connection is released or the pool is stopping. | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.