Skip to content
333 changes: 333 additions & 0 deletions src/common/util/include/openvino/util/parallel_mem_streambuf.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <algorithm>
#include <chrono>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <streambuf>

#ifdef _WIN32
#    ifndef NOMINMAX
#        define NOMINMAX
#    endif
#    ifndef WIN32_LEAN_AND_MEAN
#        define WIN32_LEAN_AND_MEAN
#    endif
// clang-format off
// <windows.h> must be included before <psapi.h>: psapi.h uses core Win32
// types (HANDLE, DWORD, ...) and does not include <windows.h> itself, so
// the alphabetical order breaks the Windows build.
#    include <windows.h>
#    include <psapi.h>
// clang-format on
#else
#    include <sys/mman.h>
#endif

#include "openvino/core/parallel.hpp"
#include "openvino/util/log.hpp"
#include "openvino/util/parallel_read_streambuf.hpp"

#define ENABLE_BD_PROFILING_LOG 0

namespace ov {
namespace util {

/// @brief A std::streambuf that reads from an in-memory buffer using parallel
/// memcpy for large reads.
///
/// Intended for mmap-backed tensors: the tensor's raw memory is already mapped
/// into the process but pages may not yet be resident. For large reads,
/// splitting the copy across N threads triggers concurrent page faults, raising
/// the OS I/O queue depth and saturating NVMe bandwidth.
///
/// On Windows (and Linux), if the region is detected to be a file-backed
/// memory mapping, all reads are delegated to a ParallelReadStreamBuf over
/// the same file and offset (ReadFile / pread based), relieving the doubled
/// RAM pressure (mapped working set plus destination buffer) when loading
/// multi-GB models.
///
/// Usage:
/// @code
/// // In plugin::import_model(const ov::Tensor& model, ...):
/// ov::util::ParallelMemStreamBuf par_buf(model.data(), model.get_byte_size());
/// std::istream stream(&par_buf);
/// // pass stream to BinaryInputBuffer or any std::istream& consumer
/// @endcode
class ParallelMemStreamBuf : public std::streambuf {
public:
    /// Reads at or above this size are split across threads.
    static constexpr size_t DEFAULT_THRESHOLD = 4UL * 1024 * 1024;  // 4 MB

    /// @brief Wrap an in-memory region as a read-only stream buffer.
    /// @param data Pointer to the start of the memory region.
    /// @param size Total size of the memory region in bytes.
    /// @param threshold Minimum read size in bytes to engage parallel memcpy.
    ParallelMemStreamBuf(const void* data, size_t size, size_t threshold = DEFAULT_THRESHOLD)
        : m_begin(static_cast<const char*>(data)),
          m_end(static_cast<const char*>(data) + size),
          m_current(static_cast<const char*>(data)),
          m_threshold(threshold) {
#ifdef _WIN32
        // On Windows, detect whether this memory is a file-backed mmap region.
        // If so, build a ParallelReadStreamBuf over the same file+offset so
        // ReadFile is used instead of mmap+memcpy. This avoids the 2x RAM
        // pressure (mmap working-set + destination buffer) that causes
        // catastrophic working-set thrashing for multi-GB models, and
        // eliminates per-page PFN database lock contention.
        if (size >= threshold) {
            MEMORY_BASIC_INFORMATION mbi{};
            if (VirtualQuery(data, &mbi, sizeof(mbi)) && mbi.Type == MEM_MAPPED) {
                wchar_t dev_path[MAX_PATH] = {};
                // NOTE(review): GetMappedFileNameW fails (returns 0) when the
                // path exceeds MAX_PATH; such mappings simply fall back to the
                // prefetch+memcpy path below.
                if (GetMappedFileNameW(GetCurrentProcess(), const_cast<void*>(data), dev_path, MAX_PATH) > 0) {
                    // Convert device path (\Device\HarddiskVolume3\...) to Win32 path.
                    wchar_t win32_path[MAX_PATH] = {};
                    if (resolve_device_path(dev_path, win32_path, MAX_PATH)) {
                        // Compute the file offset: the pointer we received may
                        // not be the start of the mapped view. AllocationBase
                        // is the base address of the view.
                        const std::streamoff file_offset =
                            reinterpret_cast<const char*>(data) - reinterpret_cast<const char*>(mbi.AllocationBase);
                        m_file_buf = std::make_unique<ParallelReadStreamBuf>(std::filesystem::path(win32_path),
                                                                             file_offset,
                                                                             threshold);
                    }
                }
            }
        }
        // Fallback (non-file-backed memory, e.g. anonymous mmap or small
        // allocations): issue an upfront async prefetch for the entire region
        // so pages start arriving while the blob header is being parsed.
        if (!m_file_buf) {
            WIN32_MEMORY_RANGE_ENTRY prefetch_range{const_cast<void*>(data), size};
            PrefetchVirtualMemory(GetCurrentProcess(), 1, &prefetch_range, 0);
        }
#else
        // On Linux, detect file-backed mmap via /proc/self/maps.
        // If the pointer falls inside a file mapping, build a ParallelReadStreamBuf
        // (pread-based) to avoid mmap residency pressure and page fault
        // overhead that degrades throughput on multi-GB models.
        if (size >= threshold) {
            std::filesystem::path file_path;
            std::streamoff file_off = 0;
            if (get_mmap_file_info(data, file_path, file_off)) {
                m_file_buf = std::make_unique<ParallelReadStreamBuf>(file_path, file_off, threshold);
            }
        }
        // For non-file-backed memory (anonymous mmap, USM host buffers, etc.)
        // fall back to async prefetch + parallel memcpy.
        if (!m_file_buf) {
            madvise(const_cast<void*>(data), size, MADV_WILLNEED);
        }
#endif
    }

    ~ParallelMemStreamBuf() override = default;

    ParallelMemStreamBuf(const ParallelMemStreamBuf&) = delete;
    ParallelMemStreamBuf& operator=(const ParallelMemStreamBuf&) = delete;

protected:
    // -----------------------------------------------------------------------
    // xsgetn: hot path — called by sgetn() for all bulk reads
    // -----------------------------------------------------------------------
    std::streamsize xsgetn(char_type* dst, std::streamsize n) override {
        // If we detected a file-backed mmap, delegate to the ReadFile/pread
        // based streambuf which avoids PFN contention and 2x RAM pressure.
        if (m_file_buf) {
            return m_file_buf->sgetn(dst, n);
        }
        if (n <= 0 || m_current >= m_end) {
            return 0;
        }
        const std::streamsize avail = static_cast<std::streamsize>(m_end - m_current);
        const std::streamsize to_copy = std::min(n, avail);

        if (static_cast<size_t>(to_copy) >= m_threshold) {
            parallel_copy(dst, m_current, static_cast<size_t>(to_copy));
        } else {
            std::memcpy(dst, m_current, static_cast<size_t>(to_copy));
        }

        m_current += to_copy;
        return to_copy;
    }

    // -----------------------------------------------------------------------
    // underflow: single-char peek path (does not advance)
    // -----------------------------------------------------------------------
    int_type underflow() override {
        if (m_file_buf) {
            return m_file_buf->sgetc();
        }
        if (m_current >= m_end) {
            return traits_type::eof();
        }
        return traits_type::to_int_type(*m_current);
    }

    // uflow: single-char read path (advances by one)
    int_type uflow() override {
        if (m_file_buf) {
            return m_file_buf->sbumpc();
        }
        if (m_current >= m_end) {
            return traits_type::eof();
        }
        return traits_type::to_int_type(*m_current++);
    }

    // -----------------------------------------------------------------------
    // Seek support
    // -----------------------------------------------------------------------
    pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) override {
        if (m_file_buf) {
            return m_file_buf->pubseekoff(off, way, which);
        }
        // This buffer is read-only, so `which` needs no further dispatch.
        const char* new_pos = nullptr;
        if (way == std::ios_base::beg) {
            new_pos = m_begin + off;
        } else if (way == std::ios_base::cur) {
            new_pos = m_current + off;
        } else {
            new_pos = m_end + off;
        }

        if (new_pos < m_begin || new_pos > m_end) {
            return pos_type(off_type(-1));  // out-of-range seek fails
        }

        m_current = new_pos;
        return pos_type(static_cast<off_type>(m_current - m_begin));
    }

    pos_type seekpos(pos_type pos, std::ios_base::openmode which) override {
        if (m_file_buf) {
            return m_file_buf->pubseekpos(pos, which);
        }
        return seekoff(off_type(pos), std::ios_base::beg, std::ios_base::in);
    }

    std::streamsize showmanyc() override {
        if (m_file_buf) {
            return m_file_buf->in_avail();
        }
        const std::streamsize avail = static_cast<std::streamsize>(m_end - m_current);
        return avail > 0 ? avail : -1;
    }

private:
    /// Copy `size` bytes from `src` to `dst`, splitting the copy across
    /// threads so concurrent page faults raise the OS I/O queue depth.
    void parallel_copy(char* dst, const char* src, size_t size) {
        constexpr size_t MIN_CHUNK = 2UL * 1024 * 1024;  // 2 MB minimum per thread
#ifdef _WIN32
        // On Windows, mmap page faults require acquiring the PFN database lock
        // once per page. Too many concurrent threads cause severe kernel-level
        // serialization. Cap at 16 threads so PFN-lock contention is bounded
        // while still saturating NVMe queue depth via PrefetchVirtualMemory.
        constexpr size_t MAX_CHUNKS = 16;
        const size_t num_chunks = std::max(size_t{1}, std::min(size / MIN_CHUNK, MAX_CHUNKS));
#else
        const size_t num_chunks = std::max(size_t{1}, size / MIN_CHUNK);
#endif
        const size_t chunk_size = (size + num_chunks - 1) / num_chunks;

#ifdef _WIN32
        // Prefetch: trigger page faults up-front to maximise NVMe queue depth.
        WIN32_MEMORY_RANGE_ENTRY prefetch_range{const_cast<char*>(src), size};
        PrefetchVirtualMemory(GetCurrentProcess(), 1, &prefetch_range, 0);
#else
        // Ask the kernel to start async I/O for these mmap pages so they are
        // resident before the parallel memcpy threads access them.
        madvise(const_cast<char*>(src), size, MADV_WILLNEED);
#endif

#if ENABLE_BD_PROFILING_LOG
        const auto t0 = std::chrono::steady_clock::now();
#endif

        ov::parallel_for(num_chunks, [&](size_t i) {
            const size_t offset = i * chunk_size;
            // The last chunk absorbs the remainder when size is not an exact
            // multiple of chunk_size.
            const size_t copy_size = (i + 1 == num_chunks) ? (size - offset) : chunk_size;
            std::memcpy(dst + offset, src + offset, copy_size);
        });

#if ENABLE_BD_PROFILING_LOG
        {
            const auto t1 = std::chrono::steady_clock::now();
            const double elapsed_s = std::chrono::duration<double>(t1 - t0).count();
            const double bw_gbs =
                (elapsed_s > 0.0) ? (static_cast<double>(size) / elapsed_s / (1024.0 * 1024.0 * 1024.0)) : 0.0;
            std::cout << "[ParallelMemStreamBuf] parallel_copy: " << size / 1024.0 / 1024.0 << " MB, " << num_chunks
                      << " chunks, " << elapsed_s * 1e3 << " ms, " << bw_gbs << " GB/s" << std::endl;
        }
#endif
    }

    const char* m_begin;    // start of the wrapped region
    const char* m_end;      // one past the end of the region
    const char* m_current;  // current read position
    size_t m_threshold;     // minimum read size for parallel copy
    // Non-null when source is a file-backed mmap: delegates all I/O to
    // ReadFile (Windows) / pread (Linux) parallel reads, bypassing the
    // 2x RAM pressure and page-fault overhead of mmap+memcpy.
    std::unique_ptr<ParallelReadStreamBuf> m_file_buf;

#ifdef _WIN32
    // Convert a kernel device path (\Device\HarddiskVolume3\foo\bar) to a
    // Win32 drive path (C:\foo\bar). Returns false if no drive matches.
    static bool resolve_device_path(const wchar_t* dev_path, wchar_t* out, DWORD out_len) {
        wchar_t drives[512] = {};
        // Per API contract: 0 means failure, and a return value larger than
        // the buffer size means the buffer was too small and was NOT filled.
        const DWORD drives_len = GetLogicalDriveStringsW(512, drives);
        if (drives_len == 0 || drives_len > 512)
            return false;
        for (const wchar_t* d = drives; *d; d += wcslen(d) + 1) {
            wchar_t drive[3] = {d[0], d[1], L'\0'};
            wchar_t dev_name[MAX_PATH] = {};
            if (!QueryDosDeviceW(drive, dev_name, MAX_PATH))
                continue;
            const size_t dev_name_len = wcslen(dev_name);
            // Match only on a full path component boundary so e.g.
            // HarddiskVolume1 does not match HarddiskVolume10.
            if (wcsncmp(dev_path, dev_name, dev_name_len) == 0 &&
                (dev_path[dev_name_len] == L'\\' || dev_path[dev_name_len] == L'\0')) {
                swprintf_s(out, out_len, L"%s%s", drive, dev_path + dev_name_len);
                return true;
            }
        }
        return false;
    }
#else
    // Parse /proc/self/maps to find the file backing an mmap address.
    // Returns true and fills out_path/out_offset if the address is inside
    // a named, still-existing file mapping (i.e. not anonymous / [stack] /
    // [heap] / a deleted file).
    static bool get_mmap_file_info(const void* addr, std::filesystem::path& out_path, std::streamoff& out_offset) {
        std::ifstream maps_file("/proc/self/maps");
        if (!maps_file.is_open())
            return false;
        const auto addr_val = reinterpret_cast<uintptr_t>(addr);
        std::string line;
        while (std::getline(maps_file, line)) {
            // Format: start-end perms offset dev inode [pathname]
            std::istringstream iss(line);
            std::string addr_range, perms, offset_str, dev, inode_str;
            if (!(iss >> addr_range >> perms >> offset_str >> dev >> inode_str))
                continue;
            // Parse start-end
            const auto dash = addr_range.find('-');
            if (dash == std::string::npos)
                continue;
            const auto range_start = static_cast<uintptr_t>(std::stoull(addr_range.substr(0, dash), nullptr, 16));
            const auto range_end = static_cast<uintptr_t>(std::stoull(addr_range.substr(dash + 1), nullptr, 16));
            if (addr_val < range_start || addr_val >= range_end)
                continue;
            // The pathname is everything after the inode field; read to the
            // end of the line (not a single `>>` token) so paths containing
            // spaces survive intact.
            std::string path;
            std::getline(iss, path);
            const auto first = path.find_first_not_of(" \t");
            if (first == std::string::npos)
                return false;  // anonymous region, no backing file
            path.erase(0, first);
            if (path[0] != '/')
                return false;  // special region ([heap], [stack], ...), no benefit
            // A "(deleted)" mapping cannot be reopened by path — fall back to
            // the memcpy path rather than opening the wrong (or no) file.
            static constexpr char deleted_suffix[] = " (deleted)";
            constexpr size_t suffix_len = sizeof(deleted_suffix) - 1;
            if (path.size() >= suffix_len &&
                path.compare(path.size() - suffix_len, suffix_len, deleted_suffix) == 0)
                return false;
            out_path = path;
            // File offset of `addr` = mapping's file offset + offset of the
            // address within the mapped range.
            const auto map_offset = static_cast<std::streamoff>(std::stoull(offset_str, nullptr, 16));
            out_offset = map_offset + static_cast<std::streamoff>(addr_val - range_start);
            return true;
        }
        return false;
    }
#endif
};

} // namespace util
} // namespace ov
Loading
Loading