Skip to content

Commit b75d7b2

Browse files
authored
Merge pull request #1015 from google/py-fix-1014
python bugfix: limit the number of bytes we read in case of an input with just many whitespaces
2 parents 9db5e75 + 744b071 commit b75d7b2

File tree

3 files changed

+62
-2
lines changed

3 files changed

+62
-2
lines changed

python/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ semver guidelines for more details about this.
1313
## [Unreleased]
1414

1515
- Mark python 3.13 as supported.
16+
- Bugfix: limit the number of bytes read when the input consists only of whitespace. ([#1015](https://github.com/google/magika/pull/1015))
1617

1718

1819
## [0.6.1] - 2025-03-19

python/src/magika/magika.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -875,7 +875,8 @@ def _get_result_or_features_from_bytes(
875875
# If the n-th token is padding, then it means that,
876876
# post-stripping, we do not have enough meaningful
877877
# bytes.
878-
result = self._get_result_from_few_bytes(content)
878+
bytes_to_read = min(len(content), self._model_config.block_size)
879+
result = self._get_result_from_few_bytes(content[0:bytes_to_read])
879880
return result, None
880881

881882
else:
@@ -938,8 +939,9 @@ def _get_result_or_features_from_stream(
938939
# If the n-th token is padding, then it means that,
939940
# post-stripping, we do not have enough meaningful
940941
# bytes.
942+
bytes_to_read = min(bytes_stream_size, self._model_config.block_size)
941943
stream.seek(0)
942-
content = stream.read()
944+
content = stream.read(bytes_to_read)
943945
result = self._get_result_from_few_bytes(content)
944946
return result, None
945947

python/tests/test_magika_python_module.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,63 @@ def test_magika_module_with_python_and_non_python_content() -> None:
222222
assert res.prediction.output.label == ContentTypeLabel.TXT
223223

224224

225+
def test_magika_module_with_whitespaces() -> None:
    """Check that whitespace-only inputs are handled by every identify API.

    For a range of input sizes made only of space characters, the bytes,
    stream, and path entry points must all return an ok result whose deep
    learning label is UNDEFINED and whose output label is TXT.

    The sizes sit one below, exactly at, and one above each size threshold
    read from the model config (and multiples of the block size), plus the
    single-byte case.
    """
    m = Magika()
    cfg = m._model_config

    # Thresholds around which the input size is probed.
    boundaries = (
        cfg.min_file_size_for_dl,
        cfg.beg_size,
        cfg.end_size,
        cfg.beg_size + cfg.end_size,
        cfg.block_size,
        2 * cfg.block_size,
        4 * cfg.block_size,
    )
    # A set removes sizes that coincide when two thresholds are close.
    ws_nums = sorted(
        {1} | {bound + delta for bound in boundaries for delta in (-1, 0, 1)}
    )

    # One temporary file is enough: it is rewritten for every size.
    with tempfile.TemporaryDirectory() as td:
        tf_path = Path(td) / "test.bin"

        for ws_num in ws_nums:
            print(f"Calling identify_bytes with {ws_num} whitespaces")
            content = b" " * ws_num
            tf_path.write_bytes(content)

            results = (
                ("identify_bytes", m.identify_bytes(content)),
                ("identify_stream", m.identify_stream(io.BytesIO(content))),
                ("identify_path", m.identify_path(tf_path)),
            )
            for api_name, res in results:
                # Separate asserts (res.ok first) so a failure names the
                # exact condition, API and size that broke.
                context = f"{api_name} with {ws_num} whitespaces"
                assert res.ok, context
                assert res.dl.label == ContentTypeLabel.UNDEFINED, context
                assert res.output.label == ContentTypeLabel.TXT, context
280+
281+
225282
def test_magika_module_with_different_prediction_modes() -> None:
226283
model_dir = utils.get_default_model_dir()
227284
m = Magika(model_dir=model_dir, prediction_mode=PredictionMode.BEST_GUESS)

0 commit comments

Comments
 (0)