# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| 15 | +"""Upload to Azure Blob Storage.""" |

import urllib.parse
from typing import Optional

from azure.core.exceptions import ResourceNotFoundError  # type: ignore
from azure.identity import DefaultAzureCredential  # type: ignore
from azure.storage.blob import BlobClient, BlobServiceClient  # type: ignore

from streamer.cloud.base import CloudUploaderBase


# Each append_block() call is limited to 4 MiB in older Azure service
# versions, so we buffer up to 4 MiB of data before each append.
APPEND_BLOB_BUFFER_SIZE = (4 << 20)  # 4 MiB


class AzureStorageUploader(CloudUploaderBase):
  """See base class for interface docs."""

  def __init__(self, upload_location: str) -> None:
    # Parse the upload location (URL).
    # Expected format: azure://storageaccount.blob.core.windows.net/container/path
    url = urllib.parse.urlparse(upload_location)
    if not url.netloc:
      raise ValueError(f"Invalid Azure storage URL format: {upload_location}")

    # Extract the storage account host from the netloc.
    # netloc format: storageaccount.blob.core.windows.net
    account_url = f"https://{url.netloc}"

    # Initialize the BlobServiceClient with DefaultAzureCredential.
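    # (DefaultAzureCredential tries several mechanisms in turn, including
    # environment variables, managed identity, and Azure CLI credentials.)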
    try:
      credential = DefaultAzureCredential()
      self._blob_service_client = BlobServiceClient(
          account_url=account_url, credential=credential)
    except Exception as e:
      raise RuntimeError(
          f"Failed to initialize Azure credentials for {account_url}: {e}") from e

    # Extract the container name and base path from the URL path.
    # The first path component is the container; everything after is the base path.
    path_parts = url.path.strip('/').split('/', 1)
    if not path_parts or not path_parts[0]:
      raise ValueError(f"Container name not found in URL: {upload_location}")

    self._container_name = path_parts[0]
    # Base path within the container (everything after the container name).
    self._base_path = path_parts[1] if len(path_parts) > 1 else ''
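
    # For example (illustrative): azure://myaccount.blob.core.windows.net/media/live
    # yields account_url 'https://myaccount.blob.core.windows.net',
    # container 'media', and base path 'live'.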

    # State for chunked uploads:
    self._blob_client: Optional[BlobClient] = None
    self._data_buffer: bytes = b''

  def write_non_chunked(self, path: str, data: bytes) -> None:
    """Write the non-chunked data to the destination."""
    full_path = self._get_full_path(path)

    blob_client = self._blob_service_client.get_blob_client(
        container=self._container_name,
        blob=full_path
    )

    # Upload the blob, overwriting any existing blob at this path.
    blob_client.upload_blob(
        data=data,
        overwrite=True
    )

  def start_chunked(self, path: str) -> None:
    """Set up for a chunked transfer to the destination."""
    full_path = self._get_full_path(path)

    self._blob_client = self._blob_service_client.get_blob_client(
        container=self._container_name,
        blob=full_path
    )

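    # create_append_blob() creates a new, empty append blob, replacing any
    # existing blob at this path; subsequent append_block() calls add data.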
    self._blob_client.create_append_blob()

    # Reset state for a new chunked upload.
    self._data_buffer = b''

  def write_chunk(self, data: bytes, force: bool = False) -> None:
    """Handle a single chunk of data."""
    if not self._blob_client:
      raise RuntimeError("start_chunked() must be called before write_chunk()")

    # Accumulate data in the buffer.
    self._data_buffer += data

    # Flush in full buffer-sized blocks; a single append_block() call must
    # stay within the append blob block size limit, so slice large buffers.
    while len(self._data_buffer) >= APPEND_BLOB_BUFFER_SIZE:
      self._blob_client.append_block(
          data=self._data_buffer[:APPEND_BLOB_BUFFER_SIZE])
      self._data_buffer = self._data_buffer[APPEND_BLOB_BUFFER_SIZE:]

    # When forced (at the end of the stream), flush whatever remains.
    if force and self._data_buffer:
      self._blob_client.append_block(data=self._data_buffer)
      self._data_buffer = b''

  def end_chunked(self) -> None:
    """End the chunked transfer."""
    if not self._blob_client:
      raise RuntimeError("start_chunked() must be called before end_chunked()")

    # Upload any remaining data in the buffer.
    self.write_chunk(b'', force=True)

    # For append blobs, no additional commit operation is needed; the data
    # is already committed with each append_block() call.  Reset the state.
    self.reset()

  def delete(self, path: str) -> None:
    """Delete the file from cloud storage."""
    full_path = self._get_full_path(path)

    blob_client = self._blob_service_client.get_blob_client(
        container=self._container_name,
        blob=full_path
    )

    try:
      blob_client.delete_blob()
    except ResourceNotFoundError:
      # The blob doesn't exist, which is fine for a delete operation.
      pass

  def reset(self) -> None:
    """Reset any chunked output state."""
    self._blob_client = None
    self._data_buffer = b''

  def _get_full_path(self, path: str) -> str:
    """Construct the full blob path by combining base path and relative path.
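
    For example, with a base path of 'live' (illustrative):

      'video.mp4'  -> 'live/video.mp4'
      '/audio.mp4' -> 'live/audio.mp4'
    """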
    # Remove leading slashes to avoid empty path segments.
    clean_path = path.lstrip('/')

    if self._base_path:
      # Ensure proper path separation.
      base = self._base_path.rstrip('/')
      return f"{base}/{clean_path}" if clean_path else base
    else:
      return clean_path