quic
diff --git a/‎QEfficient/utils/constants.py
Lines changed: 1 addition & 0 deletions b/‎QEfficient/utils/constants.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎QEfficient/utils/device_utils.py
Lines changed: 105 additions & 24 deletions b/‎QEfficient/utils/device_utils.py
Lines changed: 105 additions & 24 deletions
diff --git a/‎tests/peft/lora/test_lora_model.py
Lines changed: 3 additions & 0 deletions b/‎tests/peft/lora/test_lora_model.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎tests/peft/test_peft_model.py
Lines changed: 2 additions & 0 deletions b/‎tests/peft/test_peft_model.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎tests/text_generation/test_text_generation.py
Lines changed: 6 additions & 8 deletions b/‎tests/text_generation/test_text_generation.py
Lines changed: 6 additions & 8 deletions
diff --git a/‎tests/transformers/models/test_causal_lm_models.py
Lines changed: 14 additions & 20 deletions b/‎tests/transformers/models/test_causal_lm_models.py
Lines changed: 14 additions & 20 deletions
diff --git a/‎tests/transformers/models/test_embedding_models.py
Lines changed: 3 additions & 2 deletions b/‎tests/transformers/models/test_embedding_models.py
Lines changed: 3 additions & 2 deletions
@@ -12,6 +12,7 @@
 QEFF_DIR = os.path.dirname(UTILS_DIR)
 ROOT_DIR = os.path.dirname(QEFF_DIR)
 QEFF_CACHE_DIR_NAME = "qeff_cache"
+LOCK_DIR = "/tmp/device_locks"
 
 ONNX_EXPORT_EXAMPLE_BATCH_SIZE = 1
 ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32
 
@@ -5,71 +5,152 @@
 #
 # -----------------------------------------------------------------------------
 
+import fcntl
 import math
+import os
 import re
 import subprocess
 import time
+from typing import Optional
 
-from QEfficient.utils.constants import Constants
+from QEfficient.utils.constants import LOCK_DIR, Constants
 from QEfficient.utils.logging_utils import logger
 
 
-def is_device_available(stdout: str) -> bool:
+def is_device_loaded(stdout: str) -> bool:
+    """
+    Report whether the given query output shows at least one loaded network.
+
+    Args:
+        stdout (str): Text to search for a ``Networks Loaded:<n>`` field.
+
+    Returns:
+        bool: True when the field is present with ``n > 0``; False when the
+        field is missing, is zero, or cannot be parsed.
+    """
     try:
         match = re.search(r"Networks Loaded:(\d+)", stdout)
         return int(match.group(1)) > 0 if match else False
+
     except (ValueError, AttributeError):
         return False
 
 
+def release_device_lock(lock_file):
+    """
+    Unlock and close a lock file previously locked with ``fcntl.flock``.
+
+    Errors are logged instead of raised, so callers can release on a
+    best-effort basis.
+
+    Args:
+        lock_file: Open file object that holds the lock.
+    """
+    try:
+        fcntl.flock(lock_file, fcntl.LOCK_UN)
+
+    except Exception as e:
+        logger.error(f"Error releasing lock: {e}")
+
+    finally:
+        # Close even when unlocking failed so the descriptor is not leaked;
+        # closing the descriptor also drops a flock that is still held on it.
+        try:
+            lock_file.close()
+        except Exception as e:
+            logger.error(f"Error releasing lock: {e}")
+
+
 def get_device_count():
+    """
+    Return the number of devices reported by ``qaic-util -q``.
+
+    The count is the highest ``QID <n>`` found in the tool output plus one.
+
+    Returns:
+        int: Device count, or 0 when no QID is reported or the tool cannot
+        be executed.
+    """
     command = ["/opt/qti-aic/tools/qaic-util", "-q"]
+
     try:
         result = subprocess.run(command, capture_output=True, text=True)
         qids = re.findall(r"QID (\d+)", result.stdout)
         return max(map(int, qids)) + 1 if qids else 0
+
     except OSError:
-        logger.warning("ERROR while fetching the device", command)
+        # The command is passed as a lazy %-argument, so the message needs a
+        # placeholder for it; without one the logging call fails to format.
+        logger.warning("ERROR while fetching the device: %s", command)
         return 0
 
 
-def get_available_device_id(max_retry_count: int = 50, wait_time: int = 5) -> list[int] | None:
+def ensure_lock_dir(lock_dir: str = LOCK_DIR):
+    """
+    Create the lock directory if it does not already exist.
+
+    Args:
+        lock_dir (str): Directory to create. Defaults to ``LOCK_DIR`` so the
+            function can also be called without arguments.
+    """
+    # exist_ok avoids the check-then-create race when several processes start
+    # together and all find the directory missing.
+    os.makedirs(lock_dir, exist_ok=True)
+
+
+def acquire_device_lock(retry_interval: int = 10, retry_duration: int = 300) -> Optional[object]:
     """
-    Find an available Cloud AI 100 device ID.
+    Attempt to acquire a non-blocking exclusive lock on the device-check lock file.
+    Retries every ``retry_interval`` seconds for up to ``retry_duration`` seconds.
 
     Args:
-    max_retry_count (int): Maximum number of retries.
-    wait_time (int): Seconds to wait between retries.
+        retry_interval (int): Seconds to sleep between attempts while the lock is held elsewhere.
+        retry_duration (int): Total number of seconds to keep retrying before giving up.
 
     Returns:
-    list[int] | None: List containing available device ID, or None if not found.
+        file object if lock is acquired, else None.
     """
+    # The lock directory must exist before the lock file can be opened in it.
+    ensure_lock_dir(LOCK_DIR)
+    lock_file_path = os.path.join(LOCK_DIR, "device_check.lock")
+    start_time = time.time()
 
+    while (time.time() - start_time) < retry_duration:
+        lock_file = open(lock_file_path, "w")
+
+        try:
+            fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            logger.debug("Lock acquired for device check")
+            return lock_file
+
+        except BlockingIOError:
+            # Another process holds the lock; drop this handle and try again later.
+            lock_file.close()
+            logger.debug(f"Device check is locked. Retrying in {retry_interval} seconds...")
+            time.sleep(retry_interval)
+
+        except Exception as e:
+            # Close the handle opened for this attempt so it is not leaked.
+            lock_file.close()
+            logger.error(f"Unexpected error acquiring lock for device check: {e}")
+            return None
+
+    logger.warning(f"Failed to acquire lock for device check after {retry_duration} seconds.")
+    return None
+
+
+def __fetch_device_id(device_count):
+    """
+    Query each device ID in ``range(device_count)`` and return the first usable one.
+
+    A device is skipped when the query output says the ID cannot be found,
+    contains ``Status:Error``, or when ``is_device_loaded`` is False for it.
+
+    Args:
+        device_count (int): Number of device IDs to probe, starting from 0.
+
+    Returns:
+        list[int] | None: Single-element list with the selected device ID, or
+        None when no device qualifies or an OSError occurs while querying.
+    """
+    for device_id in range(device_count):
+        try:
+            query = ["/opt/qti-aic/tools/qaic-util", "-q", "-d", str(device_id)]
+            output = subprocess.run(query, capture_output=True, text=True).stdout
+
+            if "Failed to find requested device ID" in output:
+                logger.warning(f"Device ID {device_id} not found.")
+            elif "Status:Error" in output or not is_device_loaded(output):
+                logger.debug(f"Device {device_id} is not available.")
+            else:
+                logger.info(f"Device ID {device_id} is available and locked.")
+                return [device_id]
+
+        except subprocess.TimeoutExpired:
+            # Logged only; the loop moves on to the next device ID.
+            logger.error(f"Timeout while querying device {device_id}.")
+        except OSError as e:
+            # Stops the whole scan rather than trying further device IDs.
+            logger.error(f"OSError while querying device {device_id}: {e}")
+            return None
+        except Exception as e:
+            logger.exception(f"Unexpected error while checking device {device_id}: {e}")
+    return None
+
+
+def get_available_device_id(retry_duration: int = 300, wait_time: int = 5) -> Optional[list[int]]:
+    """
+    Find an available Cloud AI 100 device ID using file-based locking.
+
+    The device-check lock is held while the devices are polled and is always
+    released before this function returns.
+
+    Args:
+        retry_duration (int): Total seconds to keep polling the devices once the lock is held.
+        wait_time (int): Seconds to wait between retries.
+
+    Returns:
+        list[int] | None: List containing available device ID, or None if not found.
+    """
     device_count = get_device_count()
+
     if device_count == 0:
-        logger.warning("No Cloud AI 100 devices found or platform sdk not installed.")
+        logger.warning("No Cloud AI 100 devices found or platform SDK not installed.")
         return None
 
-    for retry_count in range(max_retry_count):
-        for device_id in range(device_count):
-            command = ["/opt/qti-aic/tools/qaic-util", "-q", "-d", str(device_id)]
-            try:
-                result = subprocess.run(command, capture_output=True, text=True)
-            except OSError:
-                logger.warning("Failed while querying the AIC card", command)
-                return None
+    lock_file = acquire_device_lock()
 
-            if "Status:Error" in result.stdout or not is_device_available(result.stdout):
-                continue
+    if lock_file:
+        try:
+            start_time = time.time()
+
+            while (time.time() - start_time) < retry_duration:
+                device_id = __fetch_device_id(device_count)
+
+                if device_id:
+                    return device_id
 
-            elif "Status:Ready" in result.stdout:
-                logger.info(f"Device ID : {device_id} is available.")
-                return [device_id]
+                time.sleep(wait_time)
 
-            elif "Failed to find requested device ID" in result.stdout:
-                logger.warning("Device ID %d not found.", device_id)
+        finally:
+            # Single release point: runs on success, on timeout and if an
+            # exception escapes the polling loop.
+            release_device_lock(lock_file)
 
-        time.sleep(wait_time)
+    logger.warning("No available device found after all retries.")
     return None
 
 
 
@@ -17,6 +17,7 @@
 from QEfficient import QEffAutoPeftModelForCausalLM
 from QEfficient.peft.lora import QEffAutoLoraModelForCausalLM
 from QEfficient.utils import load_hf_tokenizer
+from QEfficient.utils.device_utils import get_available_device_id
 
 configs = [
     pytest.param(
@@ -235,6 +236,7 @@ def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate(
         tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
         prompts=prompts,
         prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
+        device_ids=get_available_device_id(),
     )
 
 
@@ -260,4 +262,5 @@ def test_auto_lora_model_for_causal_lm_cb_compile_generate(base_model_name, adap
         tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
         prompts=prompts,
         prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
+        device_ids=get_available_device_id(),
     )
@@ -16,6 +16,7 @@
 from transformers import AutoConfig, AutoModelForCausalLM
 
 from QEfficient import QEffAutoPeftModelForCausalLM
+from QEfficient.utils.device_utils import get_available_device_id
 
 configs = [
     pytest.param(
@@ -181,6 +182,7 @@ def test_auto_peft_model_for_causal_lm_compile_generate(base_config, adapter_con
             axis=1,
         ),
         max_new_tokens=10,
+        device_ids=get_available_device_id(),
     )
 
     start = perf_counter()
 
@@ -72,10 +72,6 @@ def test_generate_text_stream(
     qeff_model = QEFFAutoModelForCausalLM(model_hf)
 
     qeff_model.export()
-    device_id = get_available_device_id()
-
-    if not device_id:
-        pytest.skip("No available devices to run model on Cloud AI 100")
 
     qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
@@ -86,7 +82,9 @@ def test_generate_text_stream(
         full_batch_size=full_batch_size,
     )
 
-    exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR, generation_len=max_gen_len)
+    exec_info = qeff_model.generate(
+        tokenizer, prompts=Constants.INPUT_STR, generation_len=max_gen_len, device_ids=get_available_device_id()
+    )
     cloud_ai_100_tokens = exec_info.generated_ids[0]  # Because we always run for single input and single batch size
     cloud_ai_100_output = [tokenizer.decode(token, skip_special_tokens=True) for token in cloud_ai_100_tokens[0]]
 
@@ -100,7 +98,7 @@ def test_generate_text_stream(
     for decoded_tokens in text_generator.generate_stream_tokens(Constants.INPUT_STR, generation_len=max_gen_len):
         stream_tokens.extend(decoded_tokens)
 
-    assert cloud_ai_100_output == stream_tokens, (
-        f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}"
-    )
+    assert (
+        cloud_ai_100_output == stream_tokens
+    ), f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}"
     assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
@@ -127,19 +127,16 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
 
     pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)
 
-    assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (
-        "Tokens don't match for HF PyTorch model output and KV PyTorch model output"
-    )
+    assert (
+        pytorch_hf_tokens == pytorch_kv_tokens
+    ).all(), "Tokens don't match for HF PyTorch model output and KV PyTorch model output"
 
     onnx_model_path = qeff_model.export()
     ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm)
     gen_len = ort_tokens.shape[-1]
 
     assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output."
 
-    if not get_available_device_id():
-        pytest.skip("No available devices to run model on Cloud AI 100")
-
     qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
@@ -151,18 +148,18 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
         enable_qnn=enable_qnn,
         qnn_config=qnn_config,
     )
-    exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
+    exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR, device_ids=get_available_device_id())
     cloud_ai_100_tokens = exec_info.generated_ids[0][
         :, :gen_len
     ]  # Because we always run for single input and single batch size
     if prefill_only:
-        assert (ort_tokens[0][0] == cloud_ai_100_tokens[0][0]).all(), (
-            "prefill run output tokens don't match for ONNXRT output and Cloud AI 100 output."
-        )
+        assert (
+            ort_tokens[0][0] == cloud_ai_100_tokens[0][0]
+        ).all(), "prefill run output tokens don't match for ONNXRT output and Cloud AI 100 output."
     else:
-        assert (ort_tokens == cloud_ai_100_tokens).all(), (
-            "Tokens don't match for ONNXRT output and Cloud AI 100 output."
-        )
+        assert (
+            ort_tokens == cloud_ai_100_tokens
+        ).all(), "Tokens don't match for ONNXRT output and Cloud AI 100 output."
         assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
     if prefill_only is not None:
         return
@@ -188,9 +185,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     )
     onnx_model_path = qeff_model.export()
 
-    if not get_available_device_id():
-        pytest.skip("No available devices to run model on Cloud AI 100")
-
     # TODO: add prefill_only tests
     qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
@@ -203,7 +197,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
         enable_qnn=enable_qnn,
         qnn_config=qnn_config,
     )
-    exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)
+    exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts, device_ids=get_available_device_id())
 
     assert all(
         [
@@ -239,9 +233,9 @@ def test_causal_lm_export_with_deprecated_api(model_name):
     new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
     old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
 
-    assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
-        "New API output does not match old API output for ONNX export function"
-    )
+    assert (
+        new_api_ort_tokens == old_api_ort_tokens
+    ).all(), "New API output does not match old API output for ONNX export function"
 
 
 @pytest.mark.on_qaic
 
@@ -16,6 +16,7 @@
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModel
 from QEfficient.utils._utils import create_json
 from QEfficient.utils.constants import Constants, QnnConstants
+from QEfficient.utils.device_utils import get_available_device_id
 
 embed_test_models = [
     # model_name, architecture
@@ -48,7 +49,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     pt_embeddings = pt_outputs[0][0].detach().numpy()
     # Pytorch transformed model
     qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name)
-    qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
+    qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False, device_ids=get_available_device_id())
     qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
     mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
     print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
@@ -78,7 +79,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
         enable_qnn=enable_qnn,
         qnn_config=qnn_config,
     )
-    ai100_output = qeff_model.generate(inputs=inputs)
+    ai100_output = qeff_model.generate(inputs=inputs, device_ids=get_available_device_id())
 
     # Compare ONNX and AI 100 outputs
     mad = np.mean(np.abs(ai100_output - onnx_outputs[0]))