diff --git a/pyproject.toml b/pyproject.toml index bbd9afd2..0ebcff81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ classifiers = [ keywords = ["data format", "HDF5", "neutron scattering", "x-ray scattering"] requires-python = ">=3.9" dependencies = [ + "chardet", "colored", "h5py", "hdf5plugin", diff --git a/src/nexusformat/nexus/completer.py b/src/nexusformat/nexus/completer.py index 128bdec6..b1f9000a 100644 --- a/src/nexusformat/nexus/completer.py +++ b/src/nexusformat/nexus/completer.py @@ -1,5 +1,5 @@ # ----------------------------------------------------------------------------- -# Copyright (c) 2019-2025, NeXpy Development Team. +# Copyright (c) 2019-2026, NeXpy Development Team. # # Author: Paul Kienzle, Ray Osborn # diff --git a/src/nexusformat/nexus/lock.py b/src/nexusformat/nexus/lock.py index 3881352f..c82c79a4 100644 --- a/src/nexusformat/nexus/lock.py +++ b/src/nexusformat/nexus/lock.py @@ -1,5 +1,5 @@ # ----------------------------------------------------------------------------- -# Copyright (c) 2019-2022, NeXpy Development Team. +# Copyright (c) 2019-2026, NeXpy Development Team. # # Author: Paul Kienzle, Ray Osborn # diff --git a/src/nexusformat/nexus/tree.py b/src/nexusformat/nexus/tree.py index 4e633e10..502243ff 100644 --- a/src/nexusformat/nexus/tree.py +++ b/src/nexusformat/nexus/tree.py @@ -266,16 +266,17 @@ def text(value): if isinstance(value, bytes): try: _text = value.decode(NX_CONFIG['encoding']) - except UnicodeDecodeError: - if NX_CONFIG['encoding'] == 'utf-8': - _text = value.decode('latin-1') - else: - _text = value.decode('utf-8') + except (UnicodeDecodeError, KeyError, LookupError): + import chardet + detected = chardet.detect(value) + encoding = detected['encoding'] + if not encoding: + encoding = 'latin-1' + _text = value.decode(encoding, errors='replace') else: _text = str(value) return _text.replace('\x00', '').rstrip() - def is_text(value): """ Return True if the value represents text. 
diff --git a/src/nexusformat/scripts/nexusformat.py b/src/nexusformat/scripts/nexusformat.py index e3f66ce9..1daa81bd 100755 --- a/src/nexusformat/scripts/nexusformat.py +++ b/src/nexusformat/scripts/nexusformat.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # ----------------------------------------------------------------------------- -# Copyright (c) 2019-2021, NeXpy Development Team. +# Copyright (c) 2019-2026, NeXpy Development Team. # # Distributed under the terms of the Modified BSD License. # diff --git a/src/nexusformat/scripts/nxcheck.py b/src/nexusformat/scripts/nxcheck.py index 40badb91..78e448e7 100755 --- a/src/nexusformat/scripts/nxcheck.py +++ b/src/nexusformat/scripts/nxcheck.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # ----------------------------------------------------------------------------- -# Copyright (c) 2025, NeXpy Development Team. +# Copyright (c) 2025-2026, NeXpy Development Team. # # Distributed under the terms of the Modified BSD License. # diff --git a/src/nexusformat/scripts/nxconsolidate.py b/src/nexusformat/scripts/nxconsolidate.py index bbdc29f2..4f8c65f4 100755 --- a/src/nexusformat/scripts/nxconsolidate.py +++ b/src/nexusformat/scripts/nxconsolidate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # ----------------------------------------------------------------------------- -# Copyright (c) 2022, NeXpy Development Team. +# Copyright (c) 2022-2026, NeXpy Development Team. # # Distributed under the terms of the Modified BSD License. # diff --git a/src/nexusformat/scripts/nxdir.py b/src/nexusformat/scripts/nxdir.py index be213976..57f18847 100755 --- a/src/nexusformat/scripts/nxdir.py +++ b/src/nexusformat/scripts/nxdir.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # ----------------------------------------------------------------------------- -# Copyright (c) 2019-2021, NeXpy Development Team. +# Copyright (c) 2019-2026, NeXpy Development Team. # # Distributed under the terms of the Modified BSD License. 
# diff --git a/src/nexusformat/scripts/nxduplicate.py b/src/nexusformat/scripts/nxduplicate.py index edc07f45..1ef14718 100755 --- a/src/nexusformat/scripts/nxduplicate.py +++ b/src/nexusformat/scripts/nxduplicate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # ----------------------------------------------------------------------------- -# Copyright (c) 2019-2021, NeXpy Development Team. +# Copyright (c) 2019-2026, NeXpy Development Team. # # Distributed under the terms of the Modified BSD License. # diff --git a/src/nexusformat/scripts/nxinspect.py b/src/nexusformat/scripts/nxinspect.py index a4d35049..4d4d6c0a 100755 --- a/src/nexusformat/scripts/nxinspect.py +++ b/src/nexusformat/scripts/nxinspect.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # ----------------------------------------------------------------------------- -# Copyright (c) 2025, NeXpy Development Team. +# Copyright (c) 2025-2026, NeXpy Development Team. # # Distributed under the terms of the Modified BSD License. # diff --git a/src/nexusformat/scripts/nxstack.py b/src/nexusformat/scripts/nxstack.py index 7fcc92c3..149dc635 100755 --- a/src/nexusformat/scripts/nxstack.py +++ b/src/nexusformat/scripts/nxstack.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # ----------------------------------------------------------------------------- -# Copyright (c) 2015-2022, NeXpy Development Team. +# Copyright (c) 2015-2026, NeXpy Development Team. # # Distributed under the terms of the Modified BSD License. 
import os

import pytest
from nexusformat.nexus import NXfield, NXentry, nxopen


def test_encoding_detection(tmpdir):
    """
    Round-trip a Latin-1 encoded byte string through a NeXus file.

    The text is encoded as Latin-1, stored in a field with dtype 'S',
    and read back through ``nxvalue``. The value read back is accepted
    either as raw bytes or as an already decoded object.
    """
    expected = "Café"
    path = os.path.join(tmpdir, "encoding_test.nxs")
    payload = expected.encode('latin-1')

    with nxopen(path, 'w') as root:
        root['entry'] = NXentry()
        root['entry/name'] = NXfield(payload, dtype='S')

    with nxopen(path, 'r') as root:
        stored = root['entry/name'].nxvalue

        if not isinstance(stored, bytes):
            # Value is not bytes: compare its string form directly.
            assert str(stored) == expected
        else:
            # Raw bytes: they must decode as Latin-1 but not as UTF-8.
            assert stored.decode('latin-1') == expected
            with pytest.raises(UnicodeDecodeError):
                stored.decode('utf-8')


@pytest.mark.parametrize("encoding", ["latin-1", "cp1252", "ascii"])
def test_multiple_encodings(tmpdir, encoding):
    """
    Store text encoded with each parametrized codec and read it back.

    The value read back through ``nxvalue`` must match the original
    text, either after decoding with the same codec (when bytes are
    returned) or through its string form.
    """
    expected = "Test_Data"
    path = os.path.join(tmpdir, f"test_{encoding}.nxs")
    payload = expected.encode(encoding)

    with nxopen(path, 'w') as root:
        root['entry'] = NXentry()
        root['entry/name'] = NXfield(payload, dtype='S')

    with nxopen(path, 'r') as root:
        stored = root['entry/name'].nxvalue

        if not isinstance(stored, bytes):
            assert str(stored) == expected
        else:
            assert stored.decode(encoding) == expected