review changes

wkazmierczak · wkazmierczak · commit 1bb457825f46 · 2026-03-19T16:43:50.000+01:00
diff --git a/rtmp/src/flv/ex_video.rs b/rtmp/src/flv/ex_video.rs
@@ -0,0 +1,245 @@
+use bytes::Bytes;
+
+use crate::{RtmpMessageSerializeError, error::FlvVideoTagParseError};
+
+use super::mod_ex::resolve_mod_ex;
+use super::video::{VideoTagFrameType, parse_composition_time};
+
+/// Parsed Enhanced RTMP video tag.
+#[derive(Debug, Clone)]
+pub enum ExVideoTag {
+    StartSeek,
+    EndSeek,
+    VideoBody {
+        four_cc: ExVideoFourCc,
+        packet: ExVideoPacket,
+        frame_type: VideoTagFrameType,
+        timestamp_offset_nanos: Option<u32>,
+    },
+}
+
+impl ExVideoTag {
+    pub fn frame_type(&self) -> VideoTagFrameType {
+        match self {
+            ExVideoTag::StartSeek | ExVideoTag::EndSeek => {
+                VideoTagFrameType::VideoInfoOrCommandFrame
+            }
+            ExVideoTag::VideoBody { frame_type, .. } => *frame_type,
+        }
+    }
+}
+
+/// FourCC video codec identifiers for Enhanced RTMP.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum ExVideoFourCc {
+    /// VP8 (`vp08`)
+    Vp08,
+    /// VP9 (`vp09`)
+    Vp09,
+    /// AV1 (`av01`)
+    Av01,
+    /// H.264/AVC (`avc1`)
+    Avc1,
+    /// H.265/HEVC (`hvc1`)
+    Hvc1,
+}
+
+impl ExVideoFourCc {
+    fn from_raw(bytes: [u8; 4]) -> Result<Self, FlvVideoTagParseError> {
+        match &bytes {
+            b"vp08" => Ok(Self::Vp08),
+            b"vp09" => Ok(Self::Vp09),
+            b"av01" => Ok(Self::Av01),
+            b"avc1" => Ok(Self::Avc1),
+            b"hvc1" => Ok(Self::Hvc1),
+            _ => Err(FlvVideoTagParseError::UnknownVideoFourCc(bytes)),
+        }
+    }
+
+    #[allow(unused)]
+    fn to_raw(self) -> [u8; 4] {
+        match self {
+            Self::Vp08 => *b"vp08",
+            Self::Vp09 => *b"vp09",
+            Self::Av01 => *b"av01",
+            Self::Avc1 => *b"avc1",
+            Self::Hvc1 => *b"hvc1",
+        }
+    }
+
+    /// Returns true if this codec uses SI24 CompositionTime in CodedFrames.
+    /// Per the spec, only AVC and HEVC carry composition time offset.
+    fn has_composition_time(self) -> bool {
+        matches!(self, Self::Avc1 | Self::Hvc1)
+    }
+}
+
+/// Semantic video packet type after parsing.
+///
+/// This represents the resolved body content. Wire-only signals (`ModEx`, `Multitrack`)
+/// are handled during parsing and do not appear here. `CodedFrames` and `CodedFramesX`
+/// from the wire are unified — `CodedFramesX` sets `composition_time = 0`.
+#[derive(Debug, Clone)]
+pub enum ExVideoPacket {
+    /// Decoder configuration record (SPS/PPS, VPS, etc.)
+    SequenceStart(Bytes),
+    /// Video frame data with composition time offset.
+    /// For codecs without composition time (VP8, VP9, AV1), `composition_time` is 0.
+    /// Encompasses both wire types `CodedFrames` (explicit SI24) and `CodedFramesX` (implicit 0).
+    CodedFrames { composition_time: i32, data: Bytes },
+    /// End of sequence marker. No payload.
+    SequenceEnd,
+    /// AMF-encoded metadata (e.g. HDR colorInfo).
+    Metadata(Bytes),
+    /// Carriage of bitstream in MPEG-2 TS format.
+    /// Mutually exclusive with SequenceStart.
+    Mpeg2TsSequenceStart(Bytes),
+}
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub(super) enum ExVideoPacketType {
+    SequenceStart,
+    CodedFrames,
+    SequenceEnd,
+    CodedFramesX,
+    Metadata,
+    Mpeg2TsSequenceStart,
+    Multitrack,
+    ModEx,
+}
+
+impl ExVideoPacketType {
+    pub(super) fn from_raw(value: u8) -> Result<Self, FlvVideoTagParseError> {
+        match value {
+            0 => Ok(Self::SequenceStart),
+            1 => Ok(Self::CodedFrames),
+            2 => Ok(Self::SequenceEnd),
+            3 => Ok(Self::CodedFramesX),
+            4 => Ok(Self::Metadata),
+            5 => Ok(Self::Mpeg2TsSequenceStart),
+            6 => Ok(Self::Multitrack),
+            7 => Ok(Self::ModEx),
+            _ => Err(FlvVideoTagParseError::UnknownExVideoPacketType(value)),
+        }
+    }
+
+    fn into_raw(self) -> u8 {
+        match self {
+            Self::SequenceStart => 0,
+            Self::CodedFrames => 1,
+            Self::SequenceEnd => 2,
+            Self::CodedFramesX => 3,
+            Self::Metadata => 4,
+            Self::Mpeg2TsSequenceStart => 5,
+            Self::Multitrack => 6,
+            Self::ModEx => 7,
+        }
+    }
+}
+
+impl ExVideoTag {
+    /// Parses Enhanced RTMP video tag.
+    /// First byte: `[isExHeader(1) | VideoFrameType(3 bits) | VideoPacketType(4 bits)]`
+    pub fn parse(data: Bytes) -> Result<Self, FlvVideoTagParseError> {
+        if data.is_empty() {
+            return Err(FlvVideoTagParseError::TooShort);
+        }
+
+        let frame_type = VideoTagFrameType::from_raw((data[0] & 0b01110000) >> 4)?;
+        let packet_type = ExVideoPacketType::from_raw(data[0] & 0b00001111)?;
+
+        // Process ModEx to resolve the final packet type and collect modifiers.
+        let (packet_type, rest, timestamp_offset_nanos) = if packet_type == ExVideoPacketType::ModEx
+        {
+            let result = resolve_mod_ex(data.slice(1..))?;
+            (
+                result.packet_type,
+                result.remaining,
+                result.timestamp_offset_nanos,
+            )
+        } else {
+            (packet_type, data.slice(1..), None)
+        };
+
+        // Per spec: if frame_type is Command and packet_type is not Metadata,
+        // the payload is a single UI8 VideoCommand with no FourCC or video body.
+        if frame_type == VideoTagFrameType::VideoInfoOrCommandFrame
+            && packet_type != ExVideoPacketType::Metadata
+        {
+            if rest.is_empty() {
+                return Err(FlvVideoTagParseError::TooShort);
+            }
+            let content = match rest[0] {
+                0 => ExVideoTag::StartSeek,
+                1 => ExVideoTag::EndSeek,
+                v => return Err(FlvVideoTagParseError::UnknownVideoCommand(v)),
+            };
+            return Ok(content);
+        }
+
+        // Read FourCC (4 bytes), present for all non-command packet types.
+        if rest.len() < 4 {
+            return Err(FlvVideoTagParseError::TooShort);
+        }
+        let four_cc = ExVideoFourCc::from_raw([rest[0], rest[1], rest[2], rest[3]])?;
+        let body_data = rest.slice(4..);
+
+        let packet = match packet_type {
+            ExVideoPacketType::SequenceStart => ExVideoPacket::SequenceStart(body_data),
+            ExVideoPacketType::CodedFrames => Self::parse_coded_frames(body_data, four_cc)?,
+            ExVideoPacketType::CodedFramesX => ExVideoPacket::CodedFrames {
+                composition_time: 0,
+                data: body_data,
+            },
+            ExVideoPacketType::SequenceEnd => ExVideoPacket::SequenceEnd,
+            ExVideoPacketType::Metadata => ExVideoPacket::Metadata(body_data),
+            ExVideoPacketType::Mpeg2TsSequenceStart => {
+                ExVideoPacket::Mpeg2TsSequenceStart(body_data)
+            }
+            ExVideoPacketType::Multitrack => {
+                // TODO: implement multitrack parsing (AvMultitrackType + per-track FourCC/data)
+                return Err(FlvVideoTagParseError::UnsupportedPacketType(
+                    packet_type.into_raw(),
+                ));
+            }
+            ExVideoPacketType::ModEx => {
+                unreachable!("ModEx should have been resolved above")
+            }
+        };
+
+        Ok(ExVideoTag::VideoBody {
+            four_cc,
+            packet,
+            frame_type,
+            timestamp_offset_nanos,
+        })
+    }
+
+    /// Parses CodedFrames body. AVC and HEVC include an SI24 composition
+    /// time prefix; other codecs do not (composition_time is set to 0
+    /// in the parsed representation).
+    fn parse_coded_frames(
+        data: Bytes,
+        four_cc: ExVideoFourCc,
+    ) -> Result<ExVideoPacket, FlvVideoTagParseError> {
+        if four_cc.has_composition_time() {
+            if data.len() < 3 {
+                return Err(FlvVideoTagParseError::TooShort);
+            }
+            let composition_time = parse_composition_time(&data[0..3]);
+            Ok(ExVideoPacket::CodedFrames {
+                composition_time,
+                data: data.slice(3..),
+            })
+        } else {
+            Ok(ExVideoPacket::CodedFrames {
+                composition_time: 0,
+                data,
+            })
+        }
+    }
+
+    pub fn serialize(&self) -> Result<Bytes, RtmpMessageSerializeError> {
+        unimplemented!()
+    }
+}
diff --git a/rtmp/src/flv/mod.rs b/rtmp/src/flv/mod.rs
@@ -1,7 +1,46 @@
 mod audio;
+mod ex_video;
+mod mod_ex;
 mod video;
-mod video_enhanced;
 
 pub use audio::*;
+use bytes::Bytes;
+pub use ex_video::*;
 pub use video::*;
-pub use video_enhanced::*;
+
+use crate::{FlvVideoTagParseError, RtmpMessageSerializeError};
+
+const EX_HEADER_BIT: u8 = 0b10000000;
+
+/// Top-level FLV video data, supporting both legacy and Enhanced RTMP formats.
+///
+/// Legacy format: <https://veovera.org/docs/legacy/video-file-format-v10-1-spec.pdf#page=74>
+/// Enhanced RTMP: <https://veovera.org/docs/enhanced/enhanced-rtmp-v2.pdf>
+#[derive(Debug, Clone)]
+pub enum FlvVideoData {
+    Legacy(VideoTag),
+    Enhanced(ExVideoTag),
+}
+
+impl FlvVideoData {
+    /// Parses flv `VIDEODATA`. Checks the IsExHeader bit in the first byte
+    /// and dispatches to either legacy or Enhanced RTMP parsing.
+    pub fn parse(data: Bytes) -> Result<Self, FlvVideoTagParseError> {
+        if data.is_empty() {
+            return Err(FlvVideoTagParseError::TooShort);
+        }
+
+        if data[0] & EX_HEADER_BIT != 0 {
+            ExVideoTag::parse(data).map(FlvVideoData::Enhanced)
+        } else {
+            VideoTag::parse(data).map(FlvVideoData::Legacy)
+        }
+    }
+
+    pub fn serialize(&self) -> Result<Bytes, RtmpMessageSerializeError> {
+        match self {
+            FlvVideoData::Legacy(tag) => tag.serialize(),
+            FlvVideoData::Enhanced(tag) => tag.serialize(),
+        }
+    }
+}
diff --git a/rtmp/src/flv/mod_ex.rs b/rtmp/src/flv/mod_ex.rs
@@ -0,0 +1,101 @@
+use bytes::Bytes;
+
+use crate::error::FlvVideoTagParseError;
+
+use super::ex_video::ExVideoPacketType;
+
+/// Enhanced RTMP ModEx sub-type for video packets.
+#[derive(Debug, Clone, Copy, PartialEq)]
+enum VideoPacketModExType {
+    /// Nanosecond precision timestamp offset (UI24, max 999,999 ns).
+    TimestampOffsetNano,
+}
+
+impl VideoPacketModExType {
+    fn from_raw(value: u8) -> Result<Self, FlvVideoTagParseError> {
+        match value {
+            0 => Ok(Self::TimestampOffsetNano),
+            _ => Err(FlvVideoTagParseError::UnknownVideoPacketModExType(value)),
+        }
+    }
+
+    #[allow(unused)]
+    fn into_raw(self) -> u8 {
+        match self {
+            Self::TimestampOffsetNano => 0,
+        }
+    }
+}
+
+/// Result of resolving ModEx prefixes from the wire.
+pub(super) struct ModExResult {
+    pub packet_type: ExVideoPacketType,
+    pub remaining: Bytes,
+    pub timestamp_offset_nanos: Option<u32>,
+}
+
+/// Processes the ModEx prefix loop, returning the resolved packet type,
+/// remaining data, and any collected modifiers (e.g. nanosecond timestamp offset).
+///
+/// Each ModEx iteration:
+/// 1. UI8 + 1 data size (if 256, use UI16 + 1)
+/// 2. ModEx data payload
+/// 3. `[VideoPacketModExType(4 bits) | ExVideoPacketType(4 bits)]`
+/// 4. Interpret data based on ModExType, then check if PacketType is another ModEx.
+pub(super) fn resolve_mod_ex(data: Bytes) -> Result<ModExResult, FlvVideoTagParseError> {
+    let mut offset: usize = 0;
+    let mut timestamp_offset_nanos: Option<u32> = None;
+
+    loop {
+        // Read ModEx data size: UI8 + 1 (range 1..=256)
+        if data.len() < offset + 1 {
+            return Err(FlvVideoTagParseError::TooShort);
+        }
+        let mut mod_ex_data_size = data[offset] as usize + 1;
+        offset += 1;
+
+        if mod_ex_data_size == 256 {
+            if data.len() < offset + 2 {
+                return Err(FlvVideoTagParseError::TooShort);
+            }
+            mod_ex_data_size = u16::from_be_bytes([data[offset], data[offset + 1]]) as usize + 1;
+            offset += 2;
+        }
+
+        if data.len() < offset + mod_ex_data_size {
+            return Err(FlvVideoTagParseError::TooShort);
+        }
+        let mod_ex_data_start = offset;
+        offset += mod_ex_data_size;
+
+        // Next byte: [VideoPacketModExType(4 bits) | ExVideoPacketType(4 bits)]
+        if data.len() < offset + 1 {
+            return Err(FlvVideoTagParseError::TooShort);
+        }
+        let mod_ex_type = VideoPacketModExType::from_raw((data[offset] & 0b11110000) >> 4)?;
+        let next_packet_type = ExVideoPacketType::from_raw(data[offset] & 0b00001111)?;
+        offset += 1;
+
+        match mod_ex_type {
+            VideoPacketModExType::TimestampOffsetNano => {
+                let mod_ex_data = &data[mod_ex_data_start..mod_ex_data_start + mod_ex_data_size];
+                if mod_ex_data.len() >= 3 {
+                    timestamp_offset_nanos = Some(u32::from_be_bytes([
+                        0,
+                        mod_ex_data[0],
+                        mod_ex_data[1],
+                        mod_ex_data[2],
+                    ]));
+                }
+            }
+        }
+
+        if next_packet_type != ExVideoPacketType::ModEx {
+            return Ok(ModExResult {
+                packet_type: next_packet_type,
+                remaining: data.slice(offset..),
+                timestamp_offset_nanos,
+            });
+        }
+    }
+}
diff --git a/rtmp/src/flv/video_enhanced.rs b/rtmp/src/flv/video_enhanced.rs