Skip to content

Commit e92705e

Browse files
dharanad and facebook-github-bot
authored and committed
refactor: Move Re2RegexpSplit to functions/prestosql (from functions/lib) (facebookincubator#13865)
Summary: Fixes facebookincubator#13673 Pull Request resolved: facebookincubator#13865 Reviewed By: kevinwilfong Differential Revision: D77382412 Pulled By: Yuhta fbshipit-source-id: 75e86c3a09afe5a46c41bd1eb188331bcbc1a700
1 parent 4f160ca commit e92705e

File tree

6 files changed

+159
-113
lines changed

6 files changed

+159
-113
lines changed

velox/functions/lib/Re2Functions.h

Lines changed: 0 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -384,70 +384,6 @@ struct Re2RegexpReplace {
384384
std::string result_;
385385
};
386386

387-
template <typename TExec>
388-
struct Re2RegexpSplit {
389-
Re2RegexpSplit() : cache_(0) {}
390-
391-
VELOX_DEFINE_FUNCTION_TYPES(TExec);
392-
393-
FOLLY_ALWAYS_INLINE void initialize(
394-
const std::vector<TypePtr>& /*inputTypes*/,
395-
const core::QueryConfig& config,
396-
const arg_type<Varchar>* /*string*/,
397-
const arg_type<Varchar>* /*pattern*/) {
398-
cache_.setMaxCompiledRegexes(config.exprMaxCompiledRegexes());
399-
}
400-
401-
static constexpr int32_t reuse_strings_from_arg = 0;
402-
403-
void call(
404-
out_type<Array<Varchar>>& out,
405-
const arg_type<Varchar>& string,
406-
const arg_type<Varchar>& pattern) {
407-
auto* re = cache_.findOrCompile(pattern);
408-
409-
const auto re2String = re2::StringPiece(string.data(), string.size());
410-
411-
size_t pos = 0;
412-
size_t lastEnd = 0;
413-
const char* start = string.data();
414-
415-
re2::StringPiece subMatches[1];
416-
while (re->Match(
417-
re2String,
418-
pos,
419-
string.size(),
420-
RE2::Anchor::UNANCHORED,
421-
subMatches,
422-
1)) {
423-
const auto fullMatch = subMatches[0];
424-
const auto offset = fullMatch.data() - start;
425-
const auto size = fullMatch.size();
426-
427-
out.add_item().setNoCopy(
428-
StringView(string.data() + lastEnd, offset - lastEnd));
429-
430-
lastEnd = offset + size;
431-
if (UNLIKELY(size == 0)) {
432-
pos = lastEnd + 1;
433-
} else {
434-
pos = lastEnd;
435-
}
436-
}
437-
438-
if (LIKELY(pos <= string.size())) {
439-
out.add_item().setNoCopy(
440-
StringView(string.data() + pos, string.size() - pos));
441-
} else {
442-
static const StringView kEmptyString(nullptr, 0);
443-
out.add_item().setNoCopy(kEmptyString);
444-
}
445-
}
446-
447-
private:
448-
detail::ReCache cache_;
449-
};
450-
451387
std::shared_ptr<exec::VectorFunction> makeRegexpReplaceWithLambda(
452388
const std::string& name,
453389
const std::vector<exec::VectorFunctionArg>& inputArgs,

velox/functions/lib/tests/Re2FunctionsTest.cpp

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,55 +1536,6 @@ TEST_F(Re2FunctionsTest, limit) {
15361536
ASSERT_NO_THROW(evaluate("regexp_like(c0, c2)", data));
15371537
}
15381538

1539-
// Exercises regexp_split over a fixed four-row input, first with separators
// that always consume characters and then with patterns that can match the
// empty string.
TEST_F(Re2FunctionsTest, split) {
  const auto data = makeRowVector({
      makeFlatVector<std::string>({
          "1a 2b 14m",
          "1a 2b 14",
          "",
          "a123b",
      }),
  });

  // Separator: one or more lowercase letters with optional whitespace around.
  {
    const auto actual = evaluate("regexp_split(c0, '\\s*[a-z]+\\s*')", data);
    const auto wanted = makeArrayVector<std::string>({
        {"1", "2", "14", ""},
        {"1", "2", "14"},
        {""},
        {"", "123", ""},
    });
    assertEqualVectors(wanted, actual);
  }

  // Separator: one or more digits with optional whitespace around.
  {
    const auto actual = evaluate("regexp_split(c0, '\\s*\\d+\\s*')", data);
    const auto wanted = makeArrayVector<std::string>({
        {"", "a", "b", "m"},
        {"", "a", "b", ""},
        {""},
        {"a", "b"},
    });
    assertEqualVectors(wanted, actual);
  }

  // Empty pattern: every match is empty.
  {
    const auto actual = evaluate("regexp_split(c0, '')", data);
    const auto wanted = makeArrayVector<std::string>({
        {"", "1", "a", " ", "2", "b", " ", "1", "4", "m", ""},
        {"", "1", "a", " ", "2", "b", " ", "1", "4", ""},
        {"", ""},
        {"", "a", "1", "2", "3", "b", ""},
    });
    assertEqualVectors(wanted, actual);
  }

  // Pattern that matches either a run of letters/whitespace or nothing.
  {
    const auto actual = evaluate("regexp_split(c0, '\\s*[a-z]*\\s*')", data);
    const auto wanted = makeArrayVector<std::string>({
        {"", "1", "", "2", "", "1", "4", "", ""},
        {"", "1", "", "2", "", "1", "4", ""},
        {"", ""},
        {"", "", "1", "2", "3", "", ""},
    });
    assertEqualVectors(wanted, actual);
  }
}
1587-
15881539
TEST_F(Re2FunctionsTest, parseSubstrings) {
15891540
auto test = [&](const std::string& input,
15901541
const std::vector<std::string>& expected) {
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
namespace facebook::velox::functions {
19+
template <typename TExec>
20+
struct Re2RegexpSplit {
21+
Re2RegexpSplit() : cache_(0) {}
22+
23+
VELOX_DEFINE_FUNCTION_TYPES(TExec);
24+
25+
FOLLY_ALWAYS_INLINE void initialize(
26+
const std::vector<TypePtr>& /*inputTypes*/,
27+
const core::QueryConfig& config,
28+
const arg_type<Varchar>* /*string*/,
29+
const arg_type<Varchar>* /*pattern*/) {
30+
cache_.setMaxCompiledRegexes(config.exprMaxCompiledRegexes());
31+
}
32+
33+
static constexpr int32_t reuse_strings_from_arg = 0;
34+
35+
void call(
36+
out_type<Array<Varchar>>& out,
37+
const arg_type<Varchar>& string,
38+
const arg_type<Varchar>& pattern) {
39+
auto* re = cache_.findOrCompile(pattern);
40+
41+
const auto re2String = re2::StringPiece(string.data(), string.size());
42+
43+
size_t pos = 0;
44+
size_t lastEnd = 0;
45+
const char* start = string.data();
46+
47+
re2::StringPiece subMatches[1];
48+
while (re->Match(
49+
re2String,
50+
pos,
51+
string.size(),
52+
RE2::Anchor::UNANCHORED,
53+
subMatches,
54+
1)) {
55+
const auto fullMatch = subMatches[0];
56+
const auto offset = fullMatch.data() - start;
57+
const auto size = fullMatch.size();
58+
59+
out.add_item().setNoCopy(
60+
StringView(string.data() + lastEnd, offset - lastEnd));
61+
62+
lastEnd = offset + size;
63+
if (UNLIKELY(size == 0)) {
64+
pos = lastEnd + 1;
65+
} else {
66+
pos = lastEnd;
67+
}
68+
}
69+
70+
if (LIKELY(pos <= string.size())) {
71+
out.add_item().setNoCopy(
72+
StringView(string.data() + pos, string.size() - pos));
73+
} else {
74+
static const StringView kEmptyString(nullptr, 0);
75+
out.add_item().setNoCopy(kEmptyString);
76+
}
77+
}
78+
79+
private:
80+
detail::ReCache cache_;
81+
};
82+
} // namespace facebook::velox::functions

velox/functions/prestosql/registration/StringFunctionsRegistration.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "velox/functions/Registerer.h"
1717
#include "velox/functions/lib/Re2Functions.h"
1818
#include "velox/functions/prestosql/RegexpReplace.h"
19+
#include "velox/functions/prestosql/RegexpSplit.h"
1920
#include "velox/functions/prestosql/SplitPart.h"
2021
#include "velox/functions/prestosql/SplitToMap.h"
2122
#include "velox/functions/prestosql/SplitToMultiMap.h"

velox/functions/prestosql/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ add_executable(
9292
ProbabilityTest.cpp
9393
RandTest.cpp
9494
ReduceTest.cpp
95+
RegexpSplitTest.cpp
9596
RegexpReplaceTest.cpp
9697
ReverseTest.cpp
9798
RoundTest.cpp
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"
18+
19+
namespace facebook::velox {
20+
21+
namespace {
22+
23+
// Fixture for the regexp_split tests; adds nothing on top of FunctionBaseTest.
class RegexpSplitTest : public functions::test::FunctionBaseTest {};
24+
25+
// Exercises regexp_split over a fixed four-row input, first with separators
// that always consume characters and then with patterns that can match the
// empty string.
TEST_F(RegexpSplitTest, split) {
  const auto data = makeRowVector({
      makeFlatVector<std::string>({
          "1a 2b 14m",
          "1a 2b 14",
          "",
          "a123b",
      }),
  });

  // Separator: one or more lowercase letters with optional whitespace around.
  {
    const auto actual = evaluate("regexp_split(c0, '\\s*[a-z]+\\s*')", data);
    const auto wanted = makeArrayVector<std::string>({
        {"1", "2", "14", ""},
        {"1", "2", "14"},
        {""},
        {"", "123", ""},
    });
    test::assertEqualVectors(wanted, actual);
  }

  // Separator: one or more digits with optional whitespace around.
  {
    const auto actual = evaluate("regexp_split(c0, '\\s*\\d+\\s*')", data);
    const auto wanted = makeArrayVector<std::string>({
        {"", "a", "b", "m"},
        {"", "a", "b", ""},
        {""},
        {"a", "b"},
    });
    test::assertEqualVectors(wanted, actual);
  }

  // Empty pattern: every match is empty.
  {
    const auto actual = evaluate("regexp_split(c0, '')", data);
    const auto wanted = makeArrayVector<std::string>({
        {"", "1", "a", " ", "2", "b", " ", "1", "4", "m", ""},
        {"", "1", "a", " ", "2", "b", " ", "1", "4", ""},
        {"", ""},
        {"", "a", "1", "2", "3", "b", ""},
    });
    test::assertEqualVectors(wanted, actual);
  }

  // Pattern that matches either a run of letters/whitespace or nothing.
  {
    const auto actual = evaluate("regexp_split(c0, '\\s*[a-z]*\\s*')", data);
    const auto wanted = makeArrayVector<std::string>({
        {"", "1", "", "2", "", "1", "4", "", ""},
        {"", "1", "", "2", "", "1", "4", ""},
        {"", ""},
        {"", "", "1", "2", "3", "", ""},
    });
    test::assertEqualVectors(wanted, actual);
  }
}
73+
74+
} // namespace
75+
} // namespace facebook::velox

0 commit comments

Comments
 (0)