Skip to content

Commit 26b3322

Browse files
committed
Simplify utf8 sanitization
1 parent 72082ef commit 26b3322

File tree

2 files changed

+12
-80
lines changed

2 files changed

+12
-80
lines changed

lib/diff_web/live/diff_live_view.ex

Lines changed: 1 addition & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ defmodule DiffWeb.DiffLiveView do
227227

228228
diff_data =
229229
Jason.encode!(%{
230-
"diff" => sanitize_utf8(raw_diff),
230+
"diff" => DiffWeb.LiveView.sanitize_utf8(raw_diff),
231231
"path_from" => path_from,
232232
"path_to" => path_to
233233
})
@@ -336,45 +336,4 @@ defmodule DiffWeb.DiffLiveView do
336336
end
337337

338338
# Builds the diff page path "/diff/<app>/<from>..<to>" from its three parts.
defp build_url(app, from, to) do
  path = "#{app}/#{from}..#{to}"
  "/diff/" <> path
end
339-
340-
# UTF-8 sanitization functions
341-
defp sanitize_utf8(content) when is_binary(content) do
342-
case String.valid?(content) do
343-
true ->
344-
content
345-
346-
false ->
347-
# Multiple fallback strategies for invalid UTF-8
348-
content
349-
|> sanitize_invalid_bytes()
350-
end
351-
end
352-
353-
defp sanitize_utf8(content), do: content
354-
355-
defp sanitize_invalid_bytes(content) do
356-
# Try different encoding conversions and fallbacks
357-
cond do
358-
# Try converting from Latin-1/ISO-8859-1 encoding
359-
latin1_result = safe_unicode_convert(content, :latin1, :utf8) ->
360-
latin1_result
361-
362-
# Last resort: replace invalid bytes with replacement character
363-
true ->
364-
content
365-
|> :binary.bin_to_list()
366-
# Replace high bytes with '?'
367-
|> Enum.map(fn byte -> if byte > 127, do: 63, else: byte end)
368-
|> :binary.list_to_bin()
369-
end
370-
end
371-
372-
defp safe_unicode_convert(content, from, to) do
373-
case :unicode.characters_to_binary(content, from, to) do
374-
result when is_binary(result) -> result
375-
_ -> nil
376-
end
377-
rescue
378-
_ -> nil
379-
end
380339
end

lib/diff_web/views/live_view.ex

Lines changed: 11 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ defmodule DiffWeb.LiveView do
1111
DiffWeb.TooLargeComponent.render(%{file: file_path})
1212
|> Phoenix.HTML.Safe.to_iodata()
1313
|> IO.iodata_to_binary()
14-
|> sanitize_utf8()
1514

1615
{:ok, %{"diff" => raw_diff, "path_from" => path_from, "path_to" => path_to}} ->
1716
case GitDiff.parse_patch(raw_diff, relative_from: path_from, relative_to: path_to) do
@@ -38,42 +37,16 @@ defmodule DiffWeb.LiveView do
3837
end
3938
end
4039

41-
defp sanitize_utf8(content) when is_binary(content) do
42-
case String.valid?(content) do
43-
true ->
44-
content
45-
46-
false ->
47-
# Multiple fallback strategies for invalid UTF-8
48-
sanitize_invalid_bytes(content)
49-
end
50-
end
51-
52-
defp sanitize_utf8(content), do: content
53-
54-
defp sanitize_invalid_bytes(content) do
55-
# Try different encoding conversions and fallbacks
56-
cond do
57-
# Try converting from Latin-1/ISO-8859-1 encoding
58-
latin1_result = safe_unicode_convert(content, :latin1, :utf8) ->
59-
latin1_result
60-
61-
# Last resort: replace invalid bytes with replacement character
62-
true ->
63-
content
64-
|> :binary.bin_to_list()
65-
# Replace high bytes with '?'
66-
|> Enum.map(fn byte -> if byte > 127, do: 63, else: byte end)
67-
|> :binary.list_to_bin()
68-
end
69-
end
70-
71-
defp safe_unicode_convert(content, from, to) do
72-
case :unicode.characters_to_binary(content, from, to) do
73-
result when is_binary(result) -> result
74-
_ -> nil
75-
end
76-
rescue
77-
_ -> nil
40+
@doc """
Replaces every byte of `content` that is not part of a valid UTF-8 sequence
with `?`, leaving the valid portions untouched.

Each invalid byte becomes exactly one `?`, so the byte size of the result
equals the byte size of the input. Binaries that are already valid UTF-8 are
returned unchanged. Non-binary values (for example `nil`) are returned as-is
instead of raising.

## Examples

    iex> DiffWeb.LiveView.sanitize_utf8("héllo")
    "héllo"

    iex> DiffWeb.LiveView.sanitize_utf8(<<"a", 0xFF, "b">>)
    "a?b"

    iex> DiffWeb.LiveView.sanitize_utf8(nil)
    nil

"""
@spec sanitize_utf8(term()) :: term()
def sanitize_utf8(content) when is_binary(content) do
  if String.valid?(content) do
    # Fast path: nothing to replace, so skip chunking and re-joining.
    content
  else
    content
    # Splits into alternating runs of valid and invalid bytes.
    |> String.chunk(:valid)
    |> Enum.map_join(fn chunk ->
      if String.valid?(chunk) do
        chunk
      else
        # One "?" per invalid byte keeps the output the same size as the input.
        String.duplicate("?", byte_size(chunk))
      end
    end)
  end
end

# Non-binary input is passed through untouched rather than raising.
def sanitize_utf8(content), do: content
7952
end

0 commit comments

Comments
 (0)