Skip to content
333 changes: 333 additions & 0 deletions src/common/util/include/openvino/util/parallel_mem_streambuf.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <algorithm>
#include <chrono>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <streambuf>

#ifdef _WIN32
#    ifndef NOMINMAX
#        define NOMINMAX
#    endif
#    ifndef WIN32_LEAN_AND_MEAN
#        define WIN32_LEAN_AND_MEAN
#    endif
// clang-format off
// <windows.h> must be included before <psapi.h>: psapi.h uses core Win32
// types (HANDLE, DWORD, ...) and does not include <windows.h> itself, so
// the alphabetical order breaks the Windows build.
#    include <windows.h>
#    include <psapi.h>
// clang-format on
#else
#    include <sys/mman.h>
#endif

#include "openvino/core/parallel.hpp"
#include "openvino/util/log.hpp"
#include "openvino/util/parallel_read_streambuf.hpp"

#define ENABLE_BD_PROFILING_LOG 0

namespace ov {
namespace util {

/// @brief A std::streambuf that reads from an in-memory buffer using parallel
/// memcpy for large reads.
///
/// Intended for mmap-backed tensors: the tensor's raw memory is already mapped
/// into the process but pages may not yet be resident. For large reads,
/// splitting the copy across N threads triggers concurrent page faults, raising
/// the OS I/O queue depth and saturating NVMe bandwidth.
///
/// On Windows (and Linux), if the region is detected to be a file-backed
/// memory mapping, all reads are delegated to a ParallelReadStreamBuf over
/// the same file and offset (ReadFile / pread based), relieving the doubled
/// RAM pressure (mapped working set plus destination buffer) when loading
/// multi-GB models.
///
/// Usage:
/// @code
/// // In plugin::import_model(const ov::Tensor& model, ...):
/// ov::util::ParallelMemStreamBuf par_buf(model.data(), model.get_byte_size());
/// std::istream stream(&par_buf);
/// // pass stream to BinaryInputBuffer or any std::istream& consumer
/// @endcode
class ParallelMemStreamBuf : public std::streambuf {
public:
    /// Reads at or above this size are split across threads.
    static constexpr size_t DEFAULT_THRESHOLD = 4UL * 1024 * 1024;  // 4 MB

    /// @brief Wrap an in-memory region as a read-only stream buffer.
    /// @param data Pointer to the start of the memory region.
    /// @param size Total size of the memory region in bytes.
    /// @param threshold Minimum read size in bytes to engage parallel memcpy.
    ParallelMemStreamBuf(const void* data, size_t size, size_t threshold = DEFAULT_THRESHOLD)
        : m_begin(static_cast<const char*>(data)),
          m_end(static_cast<const char*>(data) + size),
          m_current(static_cast<const char*>(data)),
          m_threshold(threshold) {
#ifdef _WIN32
        // On Windows, detect whether this memory is a file-backed mmap region.
        // If so, build a ParallelReadStreamBuf over the same file+offset so
        // ReadFile is used instead of mmap+memcpy. This avoids the 2x RAM
        // pressure (mmap working-set + destination buffer) that causes
        // catastrophic working-set thrashing for multi-GB models, and
        // eliminates per-page PFN database lock contention.
        if (size >= threshold) {
            MEMORY_BASIC_INFORMATION mbi{};
            if (VirtualQuery(data, &mbi, sizeof(mbi)) && mbi.Type == MEM_MAPPED) {
                wchar_t dev_path[MAX_PATH] = {};
                // NOTE(review): GetMappedFileNameW fails (returns 0) when the
                // path exceeds MAX_PATH; such mappings simply fall back to the
                // prefetch+memcpy path below.
                if (GetMappedFileNameW(GetCurrentProcess(), const_cast<void*>(data), dev_path, MAX_PATH) > 0) {
                    // Convert device path (\Device\HarddiskVolume3\...) to Win32 path.
                    wchar_t win32_path[MAX_PATH] = {};
                    if (resolve_device_path(dev_path, win32_path, MAX_PATH)) {
                        // Compute the file offset: the pointer we received may
                        // not be the start of the mapped view. AllocationBase
                        // is the base address of the view.
                        const std::streamoff file_offset =
                            reinterpret_cast<const char*>(data) - reinterpret_cast<const char*>(mbi.AllocationBase);
                        m_file_buf = std::make_unique<ParallelReadStreamBuf>(std::filesystem::path(win32_path),
                                                                             file_offset,
                                                                             threshold);
                    }
                }
            }
        }
        // Fallback (non-file-backed memory, e.g. anonymous mmap or small
        // allocations): issue an upfront async prefetch for the entire region
        // so pages start arriving while the blob header is being parsed.
        if (!m_file_buf) {
            WIN32_MEMORY_RANGE_ENTRY prefetch_range{const_cast<void*>(data), size};
            PrefetchVirtualMemory(GetCurrentProcess(), 1, &prefetch_range, 0);
        }
#else
        // On Linux, detect file-backed mmap via /proc/self/maps.
        // If the pointer falls inside a file mapping, build a ParallelReadStreamBuf
        // (pread-based) to avoid mmap residency pressure and page fault
        // overhead that degrades throughput on multi-GB models.
        if (size >= threshold) {
            std::filesystem::path file_path;
            std::streamoff file_off = 0;
            if (get_mmap_file_info(data, file_path, file_off)) {
                m_file_buf = std::make_unique<ParallelReadStreamBuf>(file_path, file_off, threshold);
            }
        }
        // For non-file-backed memory (anonymous mmap, USM host buffers, etc.)
        // fall back to async prefetch + parallel memcpy.
        if (!m_file_buf) {
            madvise(const_cast<void*>(data), size, MADV_WILLNEED);
        }
#endif
    }

    ~ParallelMemStreamBuf() override = default;

    ParallelMemStreamBuf(const ParallelMemStreamBuf&) = delete;
    ParallelMemStreamBuf& operator=(const ParallelMemStreamBuf&) = delete;

protected:
    // -----------------------------------------------------------------------
    // xsgetn: hot path — called by sgetn() for all bulk reads
    // -----------------------------------------------------------------------
    std::streamsize xsgetn(char_type* dst, std::streamsize n) override {
        // If we detected a file-backed mmap, delegate to the ReadFile/pread
        // based streambuf which avoids PFN contention and 2x RAM pressure.
        if (m_file_buf) {
            return m_file_buf->sgetn(dst, n);
        }
        if (n <= 0 || m_current >= m_end) {
            return 0;
        }
        const std::streamsize avail = static_cast<std::streamsize>(m_end - m_current);
        const std::streamsize to_copy = std::min(n, avail);

        if (static_cast<size_t>(to_copy) >= m_threshold) {
            parallel_copy(dst, m_current, static_cast<size_t>(to_copy));
        } else {
            std::memcpy(dst, m_current, static_cast<size_t>(to_copy));
        }

        m_current += to_copy;
        return to_copy;
    }

    // -----------------------------------------------------------------------
    // underflow: single-char peek path (does not advance)
    // -----------------------------------------------------------------------
    int_type underflow() override {
        if (m_file_buf) {
            return m_file_buf->sgetc();
        }
        if (m_current >= m_end) {
            return traits_type::eof();
        }
        return traits_type::to_int_type(*m_current);
    }

    // uflow: single-char read path (advances by one)
    int_type uflow() override {
        if (m_file_buf) {
            return m_file_buf->sbumpc();
        }
        if (m_current >= m_end) {
            return traits_type::eof();
        }
        return traits_type::to_int_type(*m_current++);
    }

    // -----------------------------------------------------------------------
    // Seek support
    // -----------------------------------------------------------------------
    pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) override {
        if (m_file_buf) {
            return m_file_buf->pubseekoff(off, way, which);
        }
        // This buffer is read-only, so `which` needs no further dispatch.
        const char* new_pos = nullptr;
        if (way == std::ios_base::beg) {
            new_pos = m_begin + off;
        } else if (way == std::ios_base::cur) {
            new_pos = m_current + off;
        } else {
            new_pos = m_end + off;
        }

        if (new_pos < m_begin || new_pos > m_end) {
            return pos_type(off_type(-1));  // out-of-range seek fails
        }

        m_current = new_pos;
        return pos_type(static_cast<off_type>(m_current - m_begin));
    }

    pos_type seekpos(pos_type pos, std::ios_base::openmode which) override {
        if (m_file_buf) {
            return m_file_buf->pubseekpos(pos, which);
        }
        return seekoff(off_type(pos), std::ios_base::beg, std::ios_base::in);
    }

    std::streamsize showmanyc() override {
        if (m_file_buf) {
            return m_file_buf->in_avail();
        }
        const std::streamsize avail = static_cast<std::streamsize>(m_end - m_current);
        return avail > 0 ? avail : -1;
    }

private:
    /// Copy `size` bytes from `src` to `dst`, splitting the copy across
    /// threads so concurrent page faults raise the OS I/O queue depth.
    void parallel_copy(char* dst, const char* src, size_t size) {
        constexpr size_t MIN_CHUNK = 2UL * 1024 * 1024;  // 2 MB minimum per thread
#ifdef _WIN32
        // On Windows, mmap page faults require acquiring the PFN database lock
        // once per page. Too many concurrent threads cause severe kernel-level
        // serialization. Cap at 16 threads so PFN-lock contention is bounded
        // while still saturating NVMe queue depth via PrefetchVirtualMemory.
        constexpr size_t MAX_CHUNKS = 16;
        const size_t num_chunks = std::max(size_t{1}, std::min(size / MIN_CHUNK, MAX_CHUNKS));
#else
        const size_t num_chunks = std::max(size_t{1}, size / MIN_CHUNK);
#endif
        const size_t chunk_size = (size + num_chunks - 1) / num_chunks;

#ifdef _WIN32
        // Prefetch: trigger page faults up-front to maximise NVMe queue depth.
        WIN32_MEMORY_RANGE_ENTRY prefetch_range{const_cast<char*>(src), size};
        PrefetchVirtualMemory(GetCurrentProcess(), 1, &prefetch_range, 0);
#else
        // Ask the kernel to start async I/O for these mmap pages so they are
        // resident before the parallel memcpy threads access them.
        madvise(const_cast<char*>(src), size, MADV_WILLNEED);
#endif

#if ENABLE_BD_PROFILING_LOG
        const auto t0 = std::chrono::steady_clock::now();
#endif

        ov::parallel_for(num_chunks, [&](size_t i) {
            const size_t offset = i * chunk_size;
            // The last chunk absorbs the remainder when size is not an exact
            // multiple of chunk_size.
            const size_t copy_size = (i + 1 == num_chunks) ? (size - offset) : chunk_size;
            std::memcpy(dst + offset, src + offset, copy_size);
        });

#if ENABLE_BD_PROFILING_LOG
        {
            const auto t1 = std::chrono::steady_clock::now();
            const double elapsed_s = std::chrono::duration<double>(t1 - t0).count();
            const double bw_gbs =
                (elapsed_s > 0.0) ? (static_cast<double>(size) / elapsed_s / (1024.0 * 1024.0 * 1024.0)) : 0.0;
            std::cout << "[ParallelMemStreamBuf] parallel_copy: " << size / 1024.0 / 1024.0 << " MB, " << num_chunks
                      << " chunks, " << elapsed_s * 1e3 << " ms, " << bw_gbs << " GB/s" << std::endl;
        }
#endif
    }

    const char* m_begin;    // start of the wrapped region
    const char* m_end;      // one past the end of the region
    const char* m_current;  // current read position
    size_t m_threshold;     // minimum read size for parallel copy
    // Non-null when source is a file-backed mmap: delegates all I/O to
    // ReadFile (Windows) / pread (Linux) parallel reads, bypassing the
    // 2x RAM pressure and page-fault overhead of mmap+memcpy.
    std::unique_ptr<ParallelReadStreamBuf> m_file_buf;

#ifdef _WIN32
    // Convert a kernel device path (\Device\HarddiskVolume3\foo\bar) to a
    // Win32 drive path (C:\foo\bar). Returns false if no drive matches.
    static bool resolve_device_path(const wchar_t* dev_path, wchar_t* out, DWORD out_len) {
        wchar_t drives[512] = {};
        // Per API contract: 0 means failure, and a return value larger than
        // the buffer size means the buffer was too small and was NOT filled.
        const DWORD drives_len = GetLogicalDriveStringsW(512, drives);
        if (drives_len == 0 || drives_len > 512)
            return false;
        for (const wchar_t* d = drives; *d; d += wcslen(d) + 1) {
            wchar_t drive[3] = {d[0], d[1], L'\0'};
            wchar_t dev_name[MAX_PATH] = {};
            if (!QueryDosDeviceW(drive, dev_name, MAX_PATH))
                continue;
            const size_t dev_name_len = wcslen(dev_name);
            // Match only on a full path component boundary so e.g.
            // HarddiskVolume1 does not match HarddiskVolume10.
            if (wcsncmp(dev_path, dev_name, dev_name_len) == 0 &&
                (dev_path[dev_name_len] == L'\\' || dev_path[dev_name_len] == L'\0')) {
                swprintf_s(out, out_len, L"%s%s", drive, dev_path + dev_name_len);
                return true;
            }
        }
        return false;
    }
#else
    // Parse /proc/self/maps to find the file backing an mmap address.
    // Returns true and fills out_path/out_offset if the address is inside
    // a named, still-existing file mapping (i.e. not anonymous / [stack] /
    // [heap] / a deleted file).
    static bool get_mmap_file_info(const void* addr, std::filesystem::path& out_path, std::streamoff& out_offset) {
        std::ifstream maps_file("/proc/self/maps");
        if (!maps_file.is_open())
            return false;
        const auto addr_val = reinterpret_cast<uintptr_t>(addr);
        std::string line;
        while (std::getline(maps_file, line)) {
            // Format: start-end perms offset dev inode [pathname]
            std::istringstream iss(line);
            std::string addr_range, perms, offset_str, dev, inode_str;
            if (!(iss >> addr_range >> perms >> offset_str >> dev >> inode_str))
                continue;
            // Parse start-end
            const auto dash = addr_range.find('-');
            if (dash == std::string::npos)
                continue;
            const auto range_start = static_cast<uintptr_t>(std::stoull(addr_range.substr(0, dash), nullptr, 16));
            const auto range_end = static_cast<uintptr_t>(std::stoull(addr_range.substr(dash + 1), nullptr, 16));
            if (addr_val < range_start || addr_val >= range_end)
                continue;
            // The pathname is everything after the inode field; read to the
            // end of the line (not a single `>>` token) so paths containing
            // spaces survive intact.
            std::string path;
            std::getline(iss, path);
            const auto first = path.find_first_not_of(" \t");
            if (first == std::string::npos)
                return false;  // anonymous region, no backing file
            path.erase(0, first);
            if (path[0] != '/')
                return false;  // special region ([heap], [stack], ...), no benefit
            // A "(deleted)" mapping cannot be reopened by path — fall back to
            // the memcpy path rather than opening the wrong (or no) file.
            static constexpr char deleted_suffix[] = " (deleted)";
            constexpr size_t suffix_len = sizeof(deleted_suffix) - 1;
            if (path.size() >= suffix_len &&
                path.compare(path.size() - suffix_len, suffix_len, deleted_suffix) == 0)
                return false;
            out_path = path;
            // File offset of `addr` = mapping's file offset + offset of the
            // address within the mapped range.
            const auto map_offset = static_cast<std::streamoff>(std::stoull(offset_str, nullptr, 16));
            out_offset = map_offset + static_cast<std::streamoff>(addr_val - range_start);
            return true;
        }
        return false;
    }
#endif
};

} // namespace util
} // namespace ov
Loading
Loading