Commit 78fb312

Consistently treat \ followed by newlines as horizontal space
Closes #14713. Closes #14714.
1 parent 90e1826 commit 78fb312
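
For a quick feel of the change: a backslash at the very end of a line is now treated like plain horizontal space, so the expression simply continues on the next line. The expected terms below are copied from the new tests added in this commit; parse!/1 in that test file is presumably a thin wrapper around Code.string_to_quoted!/1, which is used here instead.

  # The escaped newline behaves as if a space were written there.
  Code.string_to_quoted!("f \\\n-var")
  #=> {:f, [ambiguous_op: nil, line: 1], [{:-, [line: 2], [{:var, [line: 2], nil}]}]}

  # Captures may also break the line right after the & operator.
  Code.string_to_quoted!("&\\\n+/2")
  #=> {:&, [line: 1], [{:/, [line: 2], [{:+, [line: 2], nil}, 2]}]}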

File tree

  lib/elixir/src/elixir_tokenizer.erl
  lib/elixir/test/elixir/kernel/parser_test.exs

2 files changed: +98 -48 lines

lib/elixir/src/elixir_tokenizer.erl

Lines changed: 62 additions & 46 deletions
@@ -353,11 +353,11 @@ tokenize("=>" ++ Rest, Line, Column, Scope, Tokens) ->
   tokenize(Rest, Line, Column + 2, Scope, add_token_with_eol(Token, Tokens));
 
 tokenize("..//" ++ Rest = String, Line, Column, Scope, Tokens) ->
-  case strip_horizontal_space(Rest, 0) of
-    {[$/ | _] = Remaining, Extra} ->
+  case strip_horizontal_space(Rest, Line, Column + 4, Scope) of
+    {[$/ | _] = Remaining, NewLine, NewColumn} ->
       Token = {identifier, {Line, Column, nil}, '..//'},
-      tokenize(Remaining, Line, Column + 4 + Extra, Scope, [Token | Tokens]);
-    {_, _} ->
+      tokenize(Remaining, NewLine, NewColumn, Scope, [Token | Tokens]);
+    {_, _, _} ->
       unexpected_token(String, Line, Column, Scope, Tokens)
   end;
 
@@ -464,17 +464,17 @@ tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?stab_op(T1, T2) ->
 
 tokenize([$& | Rest], Line, Column, Scope, Tokens) ->
   Kind =
-    case strip_horizontal_space(Rest, 0) of
-      {[Int | _], 0} when ?is_digit(Int) ->
+    case strip_horizontal_space(Rest, Line, 0, Scope) of
+      {[Int | _], Line, 0} when ?is_digit(Int) ->
         capture_int;
 
-      {[$/ | NewRest], _} ->
-        case strip_horizontal_space(NewRest, 0) of
-          {[$/ | _], _} -> capture_op;
-          {_, _} -> identifier
+      {[$/ | NewRest], _, _} ->
+        case strip_horizontal_space(NewRest, Line, 0, Scope) of
+          {[$/ | _], _, _} -> capture_op;
+          {_, _, _} -> identifier
         end;
 
-      {_, _} ->
+      {_, _, _} ->
         capture_op
     end,
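
Because strip_horizontal_space/4 now reports the line and column it stopped at, the & clause (and the ..// clause above) can look past an escaped newline when deciding between capture_int, capture_op, and identifier. Two illustrative inputs, with expected terms copied from the new tests (again using Code.string_to_quoted!/1 in place of the test helper):

  Code.string_to_quoted!("&..//\\\n/3")
  #=> {:&, [line: 1], [{:/, [line: 2], [{:..//, [line: 1], nil}, 3]}]}

  Code.string_to_quoted!("&+\\\n/2")
  #=> {:&, [line: 1], [{:/, [line: 2], [{:+, [line: 1], nil}, 2]}]}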

@@ -612,8 +612,8 @@ tokenize([H | T], Line, Column, Scope, Tokens) when ?is_digit(H) ->
 % Spaces
 
 tokenize([T | Rest], Line, Column, Scope, Tokens) when ?is_horizontal_space(T) ->
-  {Remaining, Stripped} = strip_horizontal_space(Rest, 0),
-  handle_space_sensitive_tokens(Remaining, Line, Column + 1 + Stripped, Scope, Tokens);
+  {Remaining, NewLine, NewColumn} = strip_horizontal_space(Rest, Line, Column + 1, Scope),
+  handle_space_sensitive_tokens(Remaining, NewLine, NewColumn, Scope, Tokens);
 
 % End of line
 
@@ -735,32 +735,39 @@ unexpected_token([T | Rest], Line, Column, Scope, Tokens) ->
   error({?LOC(Line, Column), "unexpected token: ", Message}, Rest, Scope, Tokens).
 
 tokenize_eol(Rest, Line, Scope, Tokens) ->
-  {StrippedRest, Column} = strip_horizontal_space(Rest, Scope#elixir_tokenizer.column),
-  IndentedScope = Scope#elixir_tokenizer{indentation=Column-1},
-  tokenize(StrippedRest, Line + 1, Column, IndentedScope, Tokens).
-
-strip_horizontal_space([H | T], Counter) when ?is_horizontal_space(H) ->
-  strip_horizontal_space(T, Counter + 1);
-strip_horizontal_space(T, Counter) ->
-  {T, Counter}.
+  {StrippedRest, NewLine, NewColumn} =
+    strip_horizontal_space(Rest, Line + 1, Scope#elixir_tokenizer.column, Scope),
+  IndentedScope = Scope#elixir_tokenizer{indentation=NewColumn-1},
+  tokenize(StrippedRest, NewLine, NewColumn, IndentedScope, Tokens).
+
+strip_horizontal_space([H | T], Line, Counter, Scope) when ?is_horizontal_space(H) ->
+  strip_horizontal_space(T, Line, Counter + 1, Scope);
+%% \\ at the end of lines is treated as horizontal whitespace
+%% except at the very end of the buffer, which we treat as incomplete
+strip_horizontal_space("\\\n" ++ T, Line, _Counter, Scope) when T /= [] ->
+  strip_horizontal_space(T, Line+1, Scope#elixir_tokenizer.column, Scope);
+strip_horizontal_space("\\\r\n" ++ T, Line, _Counter, Scope) when T /= [] ->
+  strip_horizontal_space(T, Line+1, Scope#elixir_tokenizer.column, Scope);
+strip_horizontal_space(T, Line, Counter, _Scope) ->
+  {T, Line, Counter}.
 
 tokenize_dot(T, Line, Column, DotInfo, Scope, Tokens) ->
-  case strip_horizontal_space(T, 0) of
-    {[$# | R], _} ->
+  case strip_horizontal_space(T, Line, Column, Scope) of
+    {[$# | R], NewLine, NewColumn} ->
       case tokenize_comment(R, [$#]) of
         {error, Char, Reason} ->
-          error_comment(Char, Reason, [$# | R], Line, Column, Scope, Tokens);
+          error_comment(Char, Reason, [$# | R], NewLine, NewColumn, Scope, Tokens);
 
         {Rest, Comment} ->
-          preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
-          tokenize_dot(Rest, Line, Scope#elixir_tokenizer.column, DotInfo, Scope, Tokens)
+          preserve_comments(NewLine, NewColumn, Tokens, Comment, Rest, Scope),
+          tokenize_dot(Rest, NewLine, Scope#elixir_tokenizer.column, DotInfo, Scope, Tokens)
       end;
-    {"\r\n" ++ Rest, _} ->
-      tokenize_dot(Rest, Line + 1, Scope#elixir_tokenizer.column, DotInfo, Scope, Tokens);
-    {"\n" ++ Rest, _} ->
-      tokenize_dot(Rest, Line + 1, Scope#elixir_tokenizer.column, DotInfo, Scope, Tokens);
-    {Rest, Length} ->
-      handle_dot([$. | Rest], Line, Column + Length, DotInfo, Scope, Tokens)
+    {"\r\n" ++ Rest, NewLine, _NewColumn} ->
+      tokenize_dot(Rest, NewLine + 1, Scope#elixir_tokenizer.column, DotInfo, Scope, Tokens);
+    {"\n" ++ Rest, NewLine, _NewColumn} ->
+      tokenize_dot(Rest, NewLine + 1, Scope#elixir_tokenizer.column, DotInfo, Scope, Tokens);
+    {Rest, NewLine, NewColumn} ->
+      handle_dot([$. | Rest], NewLine, NewColumn, DotInfo, Scope, Tokens)
   end.
 
 handle_char(0) -> {"\\0", "null byte"};
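
The core of the change is the strip_horizontal_space/4 rewrite above: besides spaces and tabs, it now consumes a backslash immediately followed by a newline (as long as something follows it in the buffer), bumping the line counter and resetting the column to the scope's base column. A rough Elixir rendering of those clauses, for illustration only: the real code works on charlists and an #elixir_tokenizer{} record, and base_col below stands in for Scope#elixir_tokenizer.column.

  defmodule StripSketch do
    # Plain horizontal space advances the column.
    def strip(<<h, rest::binary>>, line, col, base_col) when h in [?\s, ?\t],
      do: strip(rest, line, col + 1, base_col)

    # A backslash at the end of a line counts as horizontal space,
    # unless it sits at the very end of the buffer (treated as incomplete input).
    def strip("\\\n" <> rest, line, _col, base_col) when rest != "",
      do: strip(rest, line + 1, base_col, base_col)

    def strip("\\\r\n" <> rest, line, _col, base_col) when rest != "",
      do: strip(rest, line + 1, base_col, base_col)

    def strip(rest, line, col, _base_col), do: {rest, line, col}
  end

  # StripSketch.strip("  \\\nfoo", 1, 3, 1) returns {"foo", 2, 1}:
  # two spaces are skipped, then the escaped newline moves to line 2
  # and resets the column to the base column.
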
@@ -871,25 +878,25 @@ handle_unary_op([$: | Rest], Line, Column, _Kind, Length, Op, Scope, Tokens) whe
   tokenize(Rest, Line, Column + Length + 1, Scope, [Token | Tokens]);
 
 handle_unary_op(Rest, Line, Column, Kind, Length, Op, Scope, Tokens) ->
-  case strip_horizontal_space(Rest, 0) of
-    {[$/ | _] = Remaining, Extra} ->
+  case strip_horizontal_space(Rest, Line, Column + Length, Scope) of
+    {[$/ | _] = Remaining, NewLine, NewColumn} ->
       Token = {identifier, {Line, Column, nil}, Op},
-      tokenize(Remaining, Line, Column + Length + Extra, Scope, [Token | Tokens]);
-    {Remaining, Extra} ->
+      tokenize(Remaining, NewLine, NewColumn, Scope, [Token | Tokens]);
+    {Remaining, NewLine, NewColumn} ->
       Token = {Kind, {Line, Column, nil}, Op},
-      tokenize(Remaining, Line, Column + Length + Extra, Scope, [Token | Tokens])
+      tokenize(Remaining, NewLine, NewColumn, Scope, [Token | Tokens])
   end.
 
 handle_op([$: | Rest], Line, Column, _Kind, Length, Op, Scope, Tokens) when ?is_space(hd(Rest)) ->
   Token = {kw_identifier, {Line, Column, nil}, Op},
   tokenize(Rest, Line, Column + Length + 1, Scope, [Token | Tokens]);
 
 handle_op(Rest, Line, Column, Kind, Length, Op, Scope, Tokens) ->
-  case strip_horizontal_space(Rest, 0) of
-    {[$/ | _] = Remaining, Extra} ->
+  case strip_horizontal_space(Rest, Line, Column + Length, Scope) of
+    {[$/ | _] = Remaining, NewLine, NewColumn} ->
       Token = {identifier, {Line, Column, nil}, Op},
-      tokenize(Remaining, Line, Column + Length + Extra, Scope, [Token | Tokens]);
-    {Remaining, Extra} ->
+      tokenize(Remaining, NewLine, NewColumn, Scope, [Token | Tokens]);
+    {Remaining, NewLine, NewColumn} ->
       NewScope =
         %% TODO: Remove these deprecations on Elixir v2.0
         case Op of
@@ -910,7 +917,7 @@ handle_op(Rest, Line, Column, Kind, Length, Op, Scope, Tokens) ->
         end,
 
       Token = {Kind, {Line, Column, previous_was_eol(Tokens)}, Op},
-      tokenize(Remaining, Line, Column + Length + Extra, NewScope, add_token_with_eol(Token, Tokens))
+      tokenize(Remaining, NewLine, NewColumn, NewScope, add_token_with_eol(Token, Tokens))
   end.
 
 % ## Three Token Operators
@@ -996,12 +1003,21 @@ handle_call_identifier(Rest, Line, Column, DotInfo, Length, UnencodedOp, Scope,
 
 % ## Ambiguous unary/binary operators tokens
 % Keywords are not ambiguous operators
-handle_space_sensitive_tokens([Sign, $:, Space | _] = String, Line, Column, Scope, Tokens) when ?dual_op(Sign), ?is_space(Space) ->
+handle_space_sensitive_tokens([Sign, $:, Space | _] = String, Line, Column, Scope, Tokens) when
+    ?dual_op(Sign), ?is_space(Space) ->
   tokenize(String, Line, Column, Scope, Tokens);
 
 % But everything else, except other operators, are
 handle_space_sensitive_tokens([Sign, NotMarker | T], Line, Column, Scope, [{identifier, _, _} = H | Tokens]) when
-    ?dual_op(Sign), not(?is_space(NotMarker)), NotMarker =/= Sign, NotMarker =/= $/, NotMarker =/= $> ->
+    ?dual_op(Sign), not(?is_space(NotMarker)),
+    %% Do not match ++ or --
+    NotMarker =/= Sign,
+    %% Do not match +/2 or -/2
+    NotMarker =/= $/,
+    %% Do not match ->
+    NotMarker =/= $>,
+    %% Do not match +\\n or -\\n (it should be treated as if a space is there)
+    NotMarker =/= $\\ ->
   Rest = [NotMarker | T],
   DualOpToken = {dual_op, {Line, Column, nil}, list_to_atom([Sign])},
   tokenize(Rest, Line, Column + 1, Scope, [DualOpToken, setelement(1, H, op_identifier) | Tokens]);
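
The new NotMarker =/= $\\ guard above means that a + or - followed by an escaped newline is no longer captured as a dual (unary-looking) operator: it is treated as if a space followed, so the expression parses as a binary operation. Expected terms copied from the new tests, shown via Code.string_to_quoted!/1:

  Code.string_to_quoted!("f -\\\nvar")
  #=> {:-, [line: 1], [{:f, [line: 1], nil}, {:var, [line: 2], nil}]}

  Code.string_to_quoted!("f -\\\n var")
  #=> {:-, [line: 1], [{:f, [line: 1], nil}, {:var, [line: 2], nil}]}
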
@@ -1664,8 +1680,8 @@ tokenize_keyword(block, Rest, Line, Column, Atom, Length, Scope, Tokens) ->
 
 tokenize_keyword(Kind, Rest, Line, Column, Atom, Length, Scope, Tokens) ->
   NewTokens =
-    case strip_horizontal_space(Rest, 0) of
-      {[$/ | _], _} ->
+    case strip_horizontal_space(Rest, Line, Column, Scope) of
+      {[$/ | _], _, _} ->
         [{identifier, {Line, Column, nil}, Atom} | Tokens];
 
       _ ->
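
With the three-element return, keywords such as or that are followed by stripped space (now including an escaped newline) and then a / are still emitted as plain identifiers, which keeps keyword captures working across the break. Expected term copied from the new tests:

  Code.string_to_quoted!("&or\\\n/2")
  #=> {:&, [line: 1], [{:/, [line: 2], [{:or, [line: 1], nil}, 2]}]}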

lib/elixir/test/elixir/kernel/parser_test.exs

Lines changed: 36 additions & 2 deletions
@@ -64,6 +64,7 @@ defmodule Kernel.ParserTest do
     test "ambiguous ops in keywords" do
       assert parse!("f(+: :ok)") == {:f, [line: 1], [[+: :ok]]}
       assert parse!("f +: :ok") == {:f, [line: 1], [[+: :ok]]}
+      assert parse!("f +:\n:ok") == {:f, [line: 1], [[+: :ok]]}
     end
   end

@@ -107,6 +108,39 @@ defmodule Kernel.ParserTest do
     end
   end
 
+  describe "\\\\ + newline" do
+    test "with ambiguous ops" do
+      assert parse!("f \\\n-var") ==
+               {:f, [ambiguous_op: nil, line: 1], [{:-, [line: 2], [{:var, [line: 2], nil}]}]}
+
+      assert parse!("f \\\n- var") ==
+               {:-, [line: 2], [{:f, [line: 1], nil}, {:var, [line: 2], nil}]}
+
+      assert parse!("f -\\\nvar") ==
+               {:-, [line: 1], [{:f, [line: 1], nil}, {:var, [line: 2], nil}]}
+
+      assert parse!("f -\\\n var") ==
+               {:-, [line: 1], [{:f, [line: 1], nil}, {:var, [line: 2], nil}]}
+    end
+
+    test "with capture" do
+      assert parse!("&..//\\\n/3") ==
+               {:&, [line: 1], [{:/, [line: 2], [{:..//, [line: 1], nil}, 3]}]}
+
+      assert parse!("&\\\n+/2") == {:&, [line: 1], [{:/, [line: 2], [{:+, [line: 2], nil}, 2]}]}
+      assert parse!("&\\\n//2") == {:&, [line: 1], [{:/, [line: 2], [{:/, [line: 2], nil}, 2]}]}
+      assert parse!("&\\\nor/2") == {:&, [line: 1], [{:/, [line: 2], [{:or, [line: 2], nil}, 2]}]}
+
+      assert parse!("&+\\\n/2") == {:&, [line: 1], [{:/, [line: 2], [{:+, [line: 1], nil}, 2]}]}
+      assert parse!("&/\\\n/2") == {:&, [line: 1], [{:/, [line: 2], [{:/, [line: 1], nil}, 2]}]}
+      assert parse!("&or\\\n/2") == {:&, [line: 1], [{:/, [line: 2], [{:or, [line: 1], nil}, 2]}]}
+
+      assert parse!("&+/\\\n2") == {:&, [line: 1], [{:/, [line: 1], [{:+, [line: 1], nil}, 2]}]}
+      assert parse!("&//\\\n2") == {:&, [line: 1], [{:/, [line: 1], [{:/, [line: 1], nil}, 2]}]}
+      assert parse!("&or/\\\n2") == {:&, [line: 1], [{:/, [line: 1], [{:or, [line: 1], nil}, 2]}]}
+    end
+  end
+
   describe "identifier unicode normalization" do
     test "stops at ascii codepoints" do
       assert {:ok, {_, _, nil}} = Code.string_to_quoted("ç\n")
@@ -995,7 +1029,7 @@ defmodule Kernel.ParserTest do
       )
 
       assert_syntax_error(
-        ["nofile:1:5:", "invalid bidirectional formatting character in comment: \\u202A"],
+        ["nofile:1:6:", "invalid bidirectional formatting character in comment: \\u202A"],
         ~c"foo. # This is a \u202A"
       )
 
@@ -1023,7 +1057,7 @@ defmodule Kernel.ParserTest do
       )
 
      assert_syntax_error(
-        ["nofile:1:5:", "invalid line break character in comment: \\u2028"],
+        ["nofile:1:6:", "invalid line break character in comment: \\u2028"],
         ~c"foo. # This is a \u2028"
       )
