Add PyString::from_bytes (#5437)

elbaro · web-flow · commit e0fe042d5255 · 2025-09-16T09:11:56.000Z
* Add PyString::from_bytes

* add newsfragment

* gate the test behind not(any(Py_LIMITED_API, PyPy, GraalPy))

* Update the docstring

* Fix PyMemoryError not in scope

* style
diff --git a/newsfragments/5437.added.md b/newsfragments/5437.added.md
@@ -0,0 +1 @@
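+Add PyString::from_bytes, which creates a Python string directly from a byte slice. Callers holding raw bytes no longer need to validate them as UTF-8 in Rust first, because Python validates them itself when it builds the string.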
diff --git a/src/types/string.rs b/src/types/string.rs
@@ -168,6 +168,20 @@ impl PyString {
         }
     }
 
+    /// Creates a new Python string object from a byte slice that is expected to hold UTF-8.
+    ///
+    /// Returns `PyMemoryError` if out of memory.
+    /// Returns [`PyUnicodeDecodeError`] if the slice is not a valid UTF-8 string.
+    pub fn from_bytes<'py>(py: Python<'py>, s: &[u8]) -> PyResult<Bound<'py, PyString>> {
+        let ptr = s.as_ptr().cast(); // target pointer type is inferred from the FFI call below
+        let len = s.len() as ffi::Py_ssize_t; // a slice length is at most isize::MAX, so it fits
+        unsafe { // SAFETY: `ptr` and `len` come from the live slice `s`, borrowed for the whole call
+            ffi::PyUnicode_FromStringAndSize(ptr, len)
+                .assume_owned_or_err(py)
+                .cast_into_unchecked() // unchecked: on success the C API call yields a `str` object
+        }
+    }
+
     /// Intern the given string
     ///
     /// This will return a reference to the same Python string object if called repeatedly with the same string.
@@ -828,6 +842,20 @@ mod tests {
         });
     }
 
+    #[test]
+    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))] // skipped on limited-API, PyPy and GraalPy builds
+    fn test_pystring_from_bytes() { // covers the success path and the UTF-8 decode-error path
+        Python::attach(|py| {
+            let result = PyString::from_bytes(py, "\u{2122}".as_bytes()); // valid multi-byte UTF-8
+            assert!(result.is_ok());
+            let result = PyString::from_bytes(py, b"\x80"); // lone continuation byte: invalid UTF-8
+            assert!(result
+                .unwrap_err()
+                .get_type(py)
+                .is(py.get_type::<PyUnicodeDecodeError>()));
+        });
+    }
+
     #[test]
     fn test_intern_string() {
         Python::attach(|py| {

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
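	`1`	`+Add PyString::from_bytes, which creates a Python string directly from a byte slice. Callers holding raw bytes no longer need to validate them as UTF-8 in Rust first, because Python validates them itself when it builds the string.`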