Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion cpp/src/gandiva/function_registry.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,35 @@ Status FunctionRegistry::Add(NativeFunction func) {
pc_registry_.emplace_back(std::move(func));
const auto& last_func = pc_registry_.back();
for (const auto& func_signature : last_func.signatures()) {
pc_registry_map_.emplace(&func_signature, &last_func);
auto [it, inserted] = pc_registry_map_.emplace(&func_signature, &last_func);
if (!inserted) {
ARROW_LOG(ERROR) << "Duplicate function signature registered: "
<< func_signature.ToString()
<< " (existing pc_name=" << it->second->pc_name()
<< ", new pc_name=" << last_func.pc_name()
<< "); the new registration will be ignored on lookup.";
continue;
}
// Also flag the weaker collision where name+params match but the return type
// differs — these are technically distinct entries in the signature map, but
// they create ambiguity at the SQL surface (e.g. `day(timestamp)` resolving
// to either extractDay → int64 or truncateDay → timestamp).
auto shape_key = func_signature.CallShape();
auto shape_it = call_shape_map_.find(shape_key);
if (shape_it != call_shape_map_.end()) {
auto pc_it = pc_registry_map_.find(shape_it->second);
std::string existing_pc_name =
pc_it != pc_registry_map_.end() ? pc_it->second->pc_name() : "<unknown>";
ARROW_LOG(ERROR) << "Function alias collision: " << func_signature.ToString()
<< " has the same name and parameter types as "
<< shape_it->second->ToString()
<< " (existing pc_name=" << existing_pc_name
<< ", new pc_name=" << last_func.pc_name()
<< "); callers of this function will get different results "
"depending on the inferred return type.";
} else {
call_shape_map_.emplace(std::move(shape_key), &func_signature);
}
}
return arrow::Status::OK();
}
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/gandiva/function_registry.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ class GANDIVA_EXPORT FunctionRegistry {
private:
std::vector<NativeFunction> pc_registry_;
SignatureMap pc_registry_map_;
// Tracks name+param-types (return type ignored) to detect call-shape collisions
// where the same `name(args)` could resolve to two functions with different
// return types.
std::unordered_map<std::string, const FunctionSignature*> call_shape_map_;
std::vector<std::shared_ptr<arrow::Buffer>> bitcode_memory_buffers_;
std::vector<std::pair<NativeFunction, void*>> c_functions_;
FunctionHolderMakerRegistry holder_maker_registry_;
Expand Down
14 changes: 11 additions & 3 deletions cpp/src/gandiva/function_registry_datetime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

namespace gandiva {

#define DATE_EXTRACTION_TRUNCATION_FNS(INNER, name) \
#define DATE_EXTRACTION_FNS(INNER, name) \
DATE_TYPES(INNER, name##Millennium, {}), DATE_TYPES(INNER, name##Century, {}), \
DATE_TYPES(INNER, name##Decade, {}), DATE_TYPES(INNER, name##Year, {"year"}), \
DATE_TYPES(INNER, name##Quarter, ({"quarter"})), \
Expand All @@ -32,6 +32,14 @@ namespace gandiva {
DATE_TYPES(INNER, name##Minute, {"minute"}), \
DATE_TYPES(INNER, name##Second, {"second"})

#define DATE_TRUNCATION_FNS(INNER, name) \
DATE_TYPES(INNER, name##Millennium, {}), DATE_TYPES(INNER, name##Century, {}), \
DATE_TYPES(INNER, name##Decade, {}), DATE_TYPES(INNER, name##Year, {}), \
DATE_TYPES(INNER, name##Quarter, {}), DATE_TYPES(INNER, name##Month, {}), \
DATE_TYPES(INNER, name##Week, {}), DATE_TYPES(INNER, name##Day, {}), \
DATE_TYPES(INNER, name##Hour, {}), DATE_TYPES(INNER, name##Minute, {}), \
DATE_TYPES(INNER, name##Second, {})

#define TO_TIMESTAMP_SAFE_NULL_IF_NULL(NAME, ALIASES, TYPE) \
NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE()}, \
timestamp(), kResultNullIfNull, ARROW_STRINGIFY(NAME##_##TYPE))
Expand All @@ -51,8 +59,8 @@ std::vector<NativeFunction> GetDateTimeFunctionRegistry() {
static std::vector<NativeFunction> date_time_fn_registry_ = {
UNARY_SAFE_NULL_NEVER_BOOL(isnull, {}, day_time_interval),
UNARY_SAFE_NULL_NEVER_BOOL(isnull, {}, month_interval),
DATE_EXTRACTION_TRUNCATION_FNS(EXTRACT_SAFE_NULL_IF_NULL, extract),
DATE_EXTRACTION_TRUNCATION_FNS(TRUNCATE_SAFE_NULL_IF_NULL, date_trunc_),
DATE_EXTRACTION_FNS(EXTRACT_SAFE_NULL_IF_NULL, extract),
DATE_TRUNCATION_FNS(TRUNCATE_SAFE_NULL_IF_NULL, date_trunc_),

DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDoy, {}),
DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDow, {}),
Expand Down
9 changes: 4 additions & 5 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -353,13 +353,12 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "byte_substr_binary_int32_int32",
NativeFunction::kNeedsContext),

NativeFunction("convert_fromUTF8", {"convert_fromutf8"}, DataTypeVector{binary()},
utf8(), kResultNullIfNull, "convert_fromUTF8_binary",
NativeFunction("convert_fromUTF8", {}, DataTypeVector{binary()}, utf8(),
kResultNullIfNull, "convert_fromUTF8_binary",
NativeFunction::kNeedsContext),

NativeFunction("convert_replaceUTF8", {"convert_replaceutf8"},
DataTypeVector{binary(), utf8()}, utf8(), kResultNullIfNull,
"convert_replace_invalid_fromUTF8_binary",
NativeFunction("convert_replaceUTF8", {}, DataTypeVector{binary(), utf8()}, utf8(),
kResultNullIfNull, "convert_replace_invalid_fromUTF8_binary",
NativeFunction::kNeedsContext),

NativeFunction("convert_toDOUBLE", {}, DataTypeVector{float64()}, binary(),
Expand Down
111 changes: 111 additions & 0 deletions cpp/src/gandiva/function_registry_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>

#include "gandiva/tests/test_util.h"
Expand All @@ -39,6 +41,90 @@ class TestFunctionRegistry : public ::testing::Test {
}
};

TEST_F(TestFunctionRegistry, TestAddDuplicateSignatureKeepsFirstRegistration) {
auto registry = std::make_unique<FunctionRegistry>();
auto buffer = arrow::Buffer::FromString("");

NativeFunction first("foo", {}, DataTypeVector{arrow::int32()}, arrow::int32(),
kResultNullIfNull, "foo_first");
NativeFunction second("foo", {}, DataTypeVector{arrow::int32()}, arrow::int32(),
kResultNullIfNull, "foo_second");

ARROW_EXPECT_OK(registry->Register({first}, buffer));

testing::internal::CaptureStderr();
ARROW_EXPECT_OK(registry->Register({second}, buffer));
std::string log = testing::internal::GetCapturedStderr();

EXPECT_NE(log.find("Duplicate function signature registered"), std::string::npos)
<< "stderr was: " << log;
EXPECT_NE(log.find("foo_first"), std::string::npos) << "stderr was: " << log;
EXPECT_NE(log.find("foo_second"), std::string::npos) << "stderr was: " << log;

// The first registration wins; lookup returns the original pc_name.
FunctionSignature sig("foo", {arrow::int32()}, arrow::int32());
const NativeFunction* fn = registry->LookupSignature(sig);
ASSERT_NE(fn, nullptr);
EXPECT_EQ(fn->pc_name(), "foo_first");
}

TEST_F(TestFunctionRegistry, TestAddCallShapeCollisionLogsAndKeepsBoth) {
auto registry = std::make_unique<FunctionRegistry>();
auto buffer = arrow::Buffer::FromString("");

NativeFunction returns_int32("foo", {}, DataTypeVector{arrow::int32()}, arrow::int32(),
kResultNullIfNull, "foo_int32_to_int32");
NativeFunction returns_int64("foo", {}, DataTypeVector{arrow::int32()}, arrow::int64(),
kResultNullIfNull, "foo_int32_to_int64");

ARROW_EXPECT_OK(registry->Register({returns_int32}, buffer));

testing::internal::CaptureStderr();
ARROW_EXPECT_OK(registry->Register({returns_int64}, buffer));
std::string log = testing::internal::GetCapturedStderr();

EXPECT_NE(log.find("Function alias collision"), std::string::npos)
<< "stderr was: " << log;
EXPECT_NE(log.find("foo_int32_to_int32"), std::string::npos) << "stderr was: " << log;
EXPECT_NE(log.find("foo_int32_to_int64"), std::string::npos) << "stderr was: " << log;

// Both signatures remain reachable because they differ in return type.
FunctionSignature as_int32("foo", {arrow::int32()}, arrow::int32());
FunctionSignature as_int64("foo", {arrow::int32()}, arrow::int64());
const NativeFunction* fn_int32 = registry->LookupSignature(as_int32);
const NativeFunction* fn_int64 = registry->LookupSignature(as_int64);
ASSERT_NE(fn_int32, nullptr);
ASSERT_NE(fn_int64, nullptr);
EXPECT_EQ(fn_int32->pc_name(), "foo_int32_to_int32");
EXPECT_EQ(fn_int64->pc_name(), "foo_int32_to_int64");
}

TEST_F(TestFunctionRegistry, TestAddCallShapeCollisionAcrossDecimalPrecision) {
// FunctionSignature::operator== treats decimals as equal when byte_width matches,
// regardless of precision/scale. The alias-collision diagnostic must use the same
// identity rule, so two registrations differing only in decimal precision/scale
// should still be flagged.
auto registry = std::make_unique<FunctionRegistry>();
auto buffer = arrow::Buffer::FromString("");

NativeFunction precise("foo", {}, DataTypeVector{arrow::decimal128(10, 2)},
arrow::int32(), kResultNullIfNull, "foo_decimal128_precise");
NativeFunction wide("foo", {}, DataTypeVector{arrow::decimal128(18, 4)}, arrow::int64(),
kResultNullIfNull, "foo_decimal128_wide");

ARROW_EXPECT_OK(registry->Register({precise}, buffer));

testing::internal::CaptureStderr();
ARROW_EXPECT_OK(registry->Register({wide}, buffer));
std::string log = testing::internal::GetCapturedStderr();

EXPECT_NE(log.find("Function alias collision"), std::string::npos)
<< "stderr was: " << log;
EXPECT_NE(log.find("foo_decimal128_precise"), std::string::npos)
<< "stderr was: " << log;
EXPECT_NE(log.find("foo_decimal128_wide"), std::string::npos) << "stderr was: " << log;
}

TEST_F(TestFunctionRegistry, TestFound) {
FunctionSignature add_i32_i32("add", {arrow::int32(), arrow::int32()}, arrow::int32());

Expand Down Expand Up @@ -85,6 +171,11 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) {
std::unordered_set<std::string> native_func_duplicates;
std::unordered_set<std::string> func_sigs;
std::unordered_set<std::string> func_sig_duplicates;
// (name, param-types) -> ret_type seen first; ignores return type to detect
// call-shape ambiguity, where the same user-facing call could resolve to two
// functions with different return types.
std::unordered_map<std::string, std::string> call_shapes;
std::unordered_set<std::string> call_shape_duplicates;
for (const auto& native_func_it : *registry_) {
auto& first_sig = native_func_it.signatures().front();
auto pc_func_sig = FunctionSignature(native_func_it.pc_name(),
Expand All @@ -103,6 +194,16 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) {
} else {
func_sig_duplicates.insert(sig_str);
}

auto call_shape = sig.CallShape();
auto ret_str = sig.ret_type()->ToString();
auto it = call_shapes.find(call_shape);
if (it == call_shapes.end()) {
call_shapes.emplace(call_shape, ret_str);
} else if (it->second != ret_str) {
call_shape_duplicates.insert(call_shape + " resolves to both " + it->second +
" and " + ret_str);
}
}
}
std::ostringstream stream;
Expand All @@ -114,12 +215,22 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) {
"following precompiled functions:\n"
<< result;

stream.str("");
stream.clear();
std::copy(func_sig_duplicates.begin(), func_sig_duplicates.end(),
std::ostream_iterator<std::string>(stream, "\n"));
EXPECT_TRUE(func_sig_duplicates.empty())
<< "The following signatures are defined more than once possibly pointing to "
"different precompiled functions:\n"
<< stream.str();

std::ostringstream shape_stream;
std::copy(call_shape_duplicates.begin(), call_shape_duplicates.end(),
std::ostream_iterator<std::string>(shape_stream, "\n"));
EXPECT_TRUE(call_shape_duplicates.empty())
<< "The following calls have the same name and parameter types but different "
"return types, so callers will get different results depending on the inferred "
"return type:\n"
<< shape_stream.str();
}
} // namespace gandiva
3 changes: 0 additions & 3 deletions cpp/src/gandiva/function_registry_timestamp_arithmetic.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,6 @@ std::vector<NativeFunction> GetDateTimeArithmeticFunctionRegistry() {
DATE_ADD_FNS(date_add, {}),
DATE_ADD_FNS(add, {}),

NativeFunction("add", {}, DataTypeVector{date64(), int64()}, timestamp(),
kResultNullIfNull, "add_date64_int64"),

DATE_DIFF_FNS(date_sub, {}),
DATE_DIFF_FNS(subtract, {}),
DATE_DIFF_FNS(date_diff, {})};
Expand Down
19 changes: 19 additions & 0 deletions cpp/src/gandiva/function_signature.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,23 @@ std::string FunctionSignature::ToString() const {
return s.str();
}

std::string FunctionSignature::CallShape() const {
std::stringstream s;
s << AsciiToLower(base_name_) << "(";
for (uint32_t i = 0; i < param_types_.size(); i++) {
if (i > 0) {
s << ", ";
}
const auto& p = param_types_[i];
if (p->id() == arrow::Type::DECIMAL) {
// Precision/scale aren't part of signature identity (see operator==).
s << "decimal" << (p->byte_width() * 8);
} else {
s << p->ToString();
}
}
s << ")";
return s.str();
}

} // namespace gandiva
5 changes: 5 additions & 0 deletions cpp/src/gandiva/function_signature.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ class GANDIVA_EXPORT FunctionSignature {

std::string ToString() const;

/// String identifying the call shape: lowercased base name + parameter types
/// (return type omitted). Useful for detecting two functions that share the
/// same `name(args)` shape but differ in return type.
std::string CallShape() const;

private:
std::string base_name_;
DataTypeVector param_types_;
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/gandiva/precompiled/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,11 @@ gdv_float64 nvl_float64_float64(gdv_float64 in, gdv_boolean is_valid_in,
gdv_boolean nvl_boolean_boolean(gdv_boolean in, gdv_boolean is_valid_in,
gdv_boolean replace, gdv_boolean is_valid_value);

gdv_int64 date_add_int32_timestamp(gdv_int32, gdv_timestamp);
gdv_int64 add_int64_timestamp(gdv_int64, gdv_timestamp);
gdv_int64 add_int32_timestamp(gdv_int32, gdv_timestamp);
gdv_int64 date_add_int64_timestamp(gdv_int64, gdv_timestamp);
gdv_timestamp add_date64_int64(gdv_date64, gdv_int64);
gdv_timestamp date_add_int32_timestamp(gdv_int32, gdv_timestamp);
gdv_timestamp add_int64_timestamp(gdv_int64, gdv_timestamp);
gdv_timestamp add_int32_timestamp(gdv_int32, gdv_timestamp);
gdv_timestamp date_add_int64_timestamp(gdv_int64, gdv_timestamp);
gdv_date64 add_date64_int64(gdv_date64, gdv_int64);

gdv_timestamp to_timestamp_int32(gdv_int32);
gdv_timestamp to_timestamp_int64(gdv_int64);
Expand Down
Loading