diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 6fe3fa9a57ce..20aa3d710d41 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -189,7 +190,10 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, GANDIVA_EXPORT \ int64_t gdv_fn_crc_32_##TYPE(int64_t ctx, const char* input, int32_t input_len) { \ if (input_len < 0) { \ - gdv_fn_context_set_error_msg(ctx, "Input length can't be negative"); \ + char err_msg[96]; \ + snprintf(err_msg, sizeof(err_msg), \ + "CRC32: Input length can't be negative, got %d", input_len); \ + gdv_fn_context_set_error_msg(ctx, err_msg); \ return 0; \ } \ boost::crc_32_type result; \ @@ -233,7 +237,10 @@ GANDIVA_EXPORT const char* gdv_fn_base64_encode_binary(int64_t context, const char* in, int32_t in_len, int32_t* out_len) { if (in_len < 0) { - gdv_fn_context_set_error_msg(context, "Buffer length cannot be negative"); + char err_msg[96]; + snprintf(err_msg, sizeof(err_msg), + "BASE64: input length must be non-negative, got %d", in_len); + gdv_fn_context_set_error_msg(context, err_msg); *out_len = 0; return ""; } @@ -260,7 +267,10 @@ GANDIVA_EXPORT const char* gdv_fn_base64_decode_utf8(int64_t context, const char* in, int32_t in_len, int32_t* out_len) { if (in_len < 0) { - gdv_fn_context_set_error_msg(context, "Buffer length cannot be negative"); + char err_msg[96]; + snprintf(err_msg, sizeof(err_msg), + "UNBASE64: input length must be non-negative, got %d", in_len); + gdv_fn_context_set_error_msg(context, err_msg); *out_len = 0; return ""; } @@ -343,7 +353,10 @@ const char* gdv_fn_aes_encrypt(int64_t context, const char* data, int32_t data_l const char* key_data, int32_t key_data_len, int32_t* out_len) { if (data_len < 0) { - gdv_fn_context_set_error_msg(context, "Invalid data length to be encrypted"); + char err_msg[96]; + snprintf(err_msg, sizeof(err_msg), + "AES_ENCRYPT: data length can't be negative, got %d", data_len); + gdv_fn_context_set_error_msg(context, err_msg); *out_len = 0; return ""; } @@ -363,8 +376,7 @@ const char* gdv_fn_aes_encrypt(int64_t context, const char* data, int32_t data_l static_cast(arrow::bit_util::RoundUpToPowerOf2(data_len, kAesBlockSize)); char* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, *out_len)); if (ret == nullptr) { - std::string err_msg = - "Could not allocate memory for returning aes encrypt cypher text"; + std::string err_msg = "AES_ENCRYPT: could not allocate memory for ciphertext output"; gdv_fn_context_set_error_msg(context, err_msg.data()); *out_len = 0; return nullptr; @@ -387,7 +399,10 @@ const char* gdv_fn_aes_decrypt(int64_t context, const char* data, int32_t data_l const char* key_data, int32_t key_data_len, int32_t* out_len) { if (data_len < 0) { - gdv_fn_context_set_error_msg(context, "Invalid data length to be decrypted"); + char err_msg[96]; + snprintf(err_msg, sizeof(err_msg), + "AES_DECRYPT: data length can't be negative, got %d", data_len); + gdv_fn_context_set_error_msg(context, err_msg); *out_len = 0; return ""; } diff --git a/cpp/src/gandiva/gdv_function_stubs_test.cc b/cpp/src/gandiva/gdv_function_stubs_test.cc index 0b8d14914e82..2eb43689d80f 100644 --- a/cpp/src/gandiva/gdv_function_stubs_test.cc +++ b/cpp/src/gandiva/gdv_function_stubs_test.cc @@ -124,7 +124,8 @@ TEST(TestGdvFnStubs, TestBase64Encode) { value = gdv_fn_base64_encode_binary(ctx_ptr, "test", -5, &out_len); out_value = std::string(value, out_len); EXPECT_EQ(out_value, ""); - EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length cannot be negative")); + EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("BASE64")); + EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("non-negative")); ctx.Reset(); } @@ -153,7 +154,8 @@ TEST(TestGdvFnStubs, TestBase64Decode) { value = gdv_fn_base64_decode_utf8(ctx_ptr, "test", -5, &out_len); out_value = std::string(value, out_len); EXPECT_EQ(out_value, ""); - EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length cannot be negative")); + EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("UNBASE64")); + EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("non-negative")); ctx.Reset(); } diff --git a/cpp/src/gandiva/precompiled/arithmetic_ops.cc b/cpp/src/gandiva/precompiled/arithmetic_ops.cc index 2bd31bd78870..bebeb191a0f2 100644 --- a/cpp/src/gandiva/precompiled/arithmetic_ops.cc +++ b/cpp/src/gandiva/precompiled/arithmetic_ops.cc @@ -15,8 +15,10 @@ // specific language governing permissions and limitations // under the License. +#include #include #include +#include #include "arrow/util/basic_decimal.h" extern "C" { @@ -65,7 +67,7 @@ extern "C" { gdv_##OUT_TYPE NAME##_##IN_TYPE1##_##IN_TYPE2(int64_t context, gdv_##IN_TYPE1 left, \ gdv_##IN_TYPE2 right) { \ if (right == static_cast(0)) { \ - gdv_fn_context_set_error_msg(context, "divide by zero error"); \ + gdv_fn_context_set_error_msg(context, "PMOD: divide by zero error"); \ return static_cast(0); \ } \ double mod = fmod(static_cast(left), static_cast(right)); \ @@ -109,7 +111,8 @@ PMOD_OP(pmod, float64, float64, float64) gdv_float64 mod_float64_float64(int64_t context, gdv_float64 x, gdv_float64 y) { if (y == 0.0) { - const char* err_msg = "divide by zero error"; + char err_msg[96]; + snprintf(err_msg, sizeof(err_msg), "MOD: divide by zero error (dividend: %g)", x); gdv_fn_context_set_error_msg(context, err_msg); return 0.0; } @@ -351,7 +354,7 @@ NUMERIC_BOOL_DATE_FUNCTION(IS_NOT_DISTINCT_FROM) FORCE_INLINE \ gdv_##TYPE divide_##TYPE##_##TYPE(gdv_int64 context, gdv_##TYPE in1, gdv_##TYPE in2) { \ if (in2 == 0) { \ - const char* err_msg = "divide by zero error"; \ + const char* err_msg = "DIVIDE: divide by zero error"; \ gdv_fn_context_set_error_msg(context, err_msg); \ return 0; \ } \ @@ -376,14 +379,19 @@ NUMERIC_FUNCTION(POSITIVE) NUMERIC_FUNCTION_FOR_REAL(NEGATIVE) -#define NEGATIVE_INTEGER(TYPE, SIZE) \ - FORCE_INLINE \ - gdv_##TYPE negative_##TYPE(gdv_int64 context, gdv_##TYPE in) { \ - if (in <= INT##SIZE##_MIN) { \ - gdv_fn_context_set_error_msg(context, "Overflow in negative execution"); \ - return 0; \ - } \ - return -1 * in; \ +#define NEGATIVE_INTEGER(TYPE, SIZE) \ + FORCE_INLINE \ + gdv_##TYPE negative_##TYPE(gdv_int64 context, gdv_##TYPE in) { \ + if (in <= INT##SIZE##_MIN) { \ + char err_msg[96]; \ + snprintf(err_msg, sizeof(err_msg), \ + "NEGATIVE: Overflow in negative execution " \ + "(cannot negate INT" #SIZE "_MIN: %" PRId64 ")", \ + static_cast(in)); \ + gdv_fn_context_set_error_msg(context, err_msg); \ + return 0; \ + } \ + return -1 * in; \ } NEGATIVE_INTEGER(int32, 32) @@ -396,8 +404,12 @@ const int64_t INT_MIN_TO_NEGATIVE_INTERVAL_DAY_TIME = -9223372030412324863; gdv_int64 negative_daytimeinterval(gdv_int64 context, gdv_day_time_interval interval) { if (interval > INT_MAX_TO_NEGATIVE_INTERVAL_DAY_TIME || interval < INT_MIN_TO_NEGATIVE_INTERVAL_DAY_TIME) { - gdv_fn_context_set_error_msg( - context, "Interval day time is out of boundaries for the negative function"); + char err_msg[128]; + snprintf(err_msg, sizeof(err_msg), + "NEGATIVE: Interval day time is out of boundaries for the negative " + "function (value: %" PRId64 ")", + static_cast(interval)); + gdv_fn_context_set_error_msg(context, err_msg); return 0; } @@ -430,7 +442,7 @@ void negative_decimal(gdv_int64 context, int64_t high_bits, uint64_t low_bits, FORCE_INLINE \ gdv_##TYPE div_##TYPE##_##TYPE(gdv_int64 context, gdv_##TYPE in1, gdv_##TYPE in2) { \ if (in2 == 0) { \ - const char* err_msg = "divide by zero error"; \ + const char* err_msg = "DIV: divide by zero error"; \ gdv_fn_context_set_error_msg(context, err_msg); \ return 0; \ } \ @@ -448,7 +460,7 @@ DIV(uint64) FORCE_INLINE \ gdv_##TYPE div_##TYPE##_##TYPE(gdv_int64 context, gdv_##TYPE in1, gdv_##TYPE in2) { \ if (in2 == 0) { \ - const char* err_msg = "divide by zero error"; \ + const char* err_msg = "DIV: divide by zero error"; \ gdv_fn_context_set_error_msg(context, err_msg); \ return 0; \ } \ diff --git a/cpp/src/gandiva/precompiled/arithmetic_ops_test.cc b/cpp/src/gandiva/precompiled/arithmetic_ops_test.cc index 02fc68713b6b..8753e36faf4f 100644 --- a/cpp/src/gandiva/precompiled/arithmetic_ops_test.cc +++ b/cpp/src/gandiva/precompiled/arithmetic_ops_test.cc @@ -52,7 +52,7 @@ TEST(TestArithmeticOps, TestPmod) { EXPECT_EQ(pmod_int64_int64(ctx, 3, 0), 0); EXPECT_TRUE(context.has_error()); - EXPECT_EQ(context.get_error(), "divide by zero error"); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("divide by zero error")); context.Reset(); } @@ -65,7 +65,7 @@ TEST(TestArithmeticOps, TestMod) { EXPECT_DOUBLE_EQ(mod_float64_float64(reinterpret_cast(&context), 2.5, 0.0), 0.0); EXPECT_TRUE(context.has_error()); - EXPECT_EQ(context.get_error(), "divide by zero error"); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("divide by zero error")); context.Reset(); EXPECT_NEAR(mod_float64_float64(reinterpret_cast(&context), 2.5, 1.2), 0.1, @@ -219,8 +219,9 @@ TEST(TestArithmeticOps, TestNegativeIntervalTypes) { result = negative_daytimeinterval(ctx_ptr, INT64_MAX); EXPECT_EQ(ctx.has_error(), true); - EXPECT_EQ(ctx.get_error(), - "Interval day time is out of boundaries for the negative function"); + EXPECT_THAT(ctx.get_error(), + ::testing::HasSubstr( + "Interval day time is out of boundaries for the negative function")); ctx.Reset(); const int64_t INT_MIN_TO_NEGATIVE_INTERVAL_DAY_TIME = -9223372030412324863; @@ -229,8 +230,9 @@ TEST(TestArithmeticOps, TestNegativeIntervalTypes) { result = negative_daytimeinterval(ctx_ptr, INT64_MIN); EXPECT_EQ(ctx.has_error(), true); - EXPECT_EQ(ctx.get_error(), - "Interval day time is out of boundaries for the negative function"); + EXPECT_THAT(ctx.get_error(), + ::testing::HasSubstr( + "Interval day time is out of boundaries for the negative function")); ctx.Reset(); // Month interval @@ -251,7 +253,7 @@ TEST(TestArithmeticOps, TestDivide) { gandiva::ExecutionContext context; EXPECT_EQ(divide_int64_int64(reinterpret_cast(&context), 10, 0), 0); EXPECT_EQ(context.has_error(), true); - EXPECT_EQ(context.get_error(), "divide by zero error"); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("divide by zero error")); context.Reset(); EXPECT_EQ(divide_int64_int64(reinterpret_cast(&context), 10, 2), 5); @@ -262,7 +264,7 @@ TEST(TestArithmeticOps, TestDiv) { gandiva::ExecutionContext context; EXPECT_EQ(div_int64_int64(reinterpret_cast(&context), 101, 0), 0); EXPECT_EQ(context.has_error(), true); - EXPECT_EQ(context.get_error(), "divide by zero error"); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("divide by zero error")); context.Reset(); EXPECT_EQ(div_int64_int64(reinterpret_cast(&context), 101, 111), 0); @@ -278,7 +280,7 @@ TEST(TestArithmeticOps, TestDiv) { div_float64_float64(reinterpret_cast(&context), 1010.1010, 0.00000), 0.0); EXPECT_EQ(context.has_error(), true); - EXPECT_EQ(context.get_error(), "divide by zero error"); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("divide by zero error")); context.Reset(); EXPECT_EQ(div_float32_float32(reinterpret_cast(&context), 1010.1010f, 2.1f), diff --git a/cpp/src/gandiva/precompiled/decimal_ops.cc b/cpp/src/gandiva/precompiled/decimal_ops.cc index 68949680b7c0..9332fbfaa7cb 100644 --- a/cpp/src/gandiva/precompiled/decimal_ops.cc +++ b/cpp/src/gandiva/precompiled/decimal_ops.cc @@ -351,7 +351,7 @@ BasicDecimal128 Divide(int64_t context, const BasicDecimalScalar128& x, const BasicDecimalScalar128& y, int32_t out_precision, int32_t out_scale, bool* overflow) { if (y.value() == 0) { - const char* err_msg = "divide by zero error"; + const char* err_msg = "DIVIDE: divide by zero error (decimal)"; gdv_fn_context_set_error_msg(context, err_msg); return 0; } @@ -396,7 +396,7 @@ BasicDecimal128 Mod(int64_t context, const BasicDecimalScalar128& x, const BasicDecimalScalar128& y, int32_t out_precision, int32_t out_scale, bool* overflow) { if (y.value() == 0) { - const char* err_msg = "divide by zero error"; + const char* err_msg = "MOD: divide by zero error (decimal)"; gdv_fn_context_set_error_msg(context, err_msg); return 0; } diff --git a/cpp/src/gandiva/precompiled/decimal_ops_test.cc b/cpp/src/gandiva/precompiled/decimal_ops_test.cc index be8a1fe8ade4..95b0a874b94f 100644 --- a/cpp/src/gandiva/precompiled/decimal_ops_test.cc +++ b/cpp/src/gandiva/precompiled/decimal_ops_test.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include @@ -455,7 +456,7 @@ TEST_F(TestDecimalSql, DivideByZero) { DecimalScalar128{"201", 20, 3}, DecimalScalar128{"0", 20, 2}, result_precision, result_scale, &overflow); EXPECT_TRUE(context.has_error()); - EXPECT_EQ(context.get_error(), "divide by zero error"); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("divide by zero error")); // divide-by-nonzero should not cause an error. context.Reset(); @@ -472,7 +473,7 @@ TEST_F(TestDecimalSql, DivideByZero) { DecimalScalar128{"0", 20, 2}, result_precision, result_scale, &overflow); EXPECT_TRUE(context.has_error()); - EXPECT_EQ(context.get_error(), "divide by zero error"); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("divide by zero error")); // mod-by-nonzero should not cause an error. context.Reset(); diff --git a/cpp/src/gandiva/precompiled/extended_math_ops.cc b/cpp/src/gandiva/precompiled/extended_math_ops.cc index c29f8f2a8684..348d595faf49 100644 --- a/cpp/src/gandiva/precompiled/extended_math_ops.cc +++ b/cpp/src/gandiva/precompiled/extended_math_ops.cc @@ -24,6 +24,7 @@ extern "C" { +#include #include #include #include @@ -80,7 +81,7 @@ ENUMERIC_TYPES_UNARY(LOG10, float64) FORCE_INLINE void set_error_for_logbase(int64_t execution_context, double base) { - const char* prefix = "divide by zero error with log of base"; + const char* prefix = "LOG: divide by zero error with log of base"; int size = static_cast(strlen(prefix)) + 64; char* error = reinterpret_cast(malloc(size)); snprintf(error, size, "%s %f", prefix, base); @@ -251,20 +252,28 @@ static const int64_t kFactorialLookupTable[] = {1, 121645100408832000, 2432902008176640000}; -#define FACTORIAL(IN_TYPE) \ - FORCE_INLINE \ - gdv_int64 factorial_##IN_TYPE(gdv_int64 ctx, gdv_##IN_TYPE value) { \ - if (value < 0) { \ - gdv_fn_context_set_error_msg(ctx, "Factorial of negative number not exist!"); \ - return 0; \ - } \ - /* For numbers greater than 20 causes an overflow. */ \ - if (value > 20) { \ - gdv_fn_context_set_error_msg(ctx, "Numbers greater than 20 cause overflow!"); \ - return 0; \ - } \ - \ - return kFactorialLookupTable[static_cast(value)]; \ +#define FACTORIAL(IN_TYPE) \ + FORCE_INLINE \ + gdv_int64 factorial_##IN_TYPE(gdv_int64 ctx, gdv_##IN_TYPE value) { \ + if (value < 0) { \ + char err_msg[96]; \ + snprintf(err_msg, sizeof(err_msg), \ + "FACTORIAL: input must be non-negative, got %" PRId64, \ + static_cast(value)); \ + gdv_fn_context_set_error_msg(ctx, err_msg); \ + return 0; \ + } \ + /* For numbers greater than 20 causes an overflow. */ \ + if (value > 20) { \ + char err_msg[96]; \ + snprintf(err_msg, sizeof(err_msg), \ + "FACTORIAL: input %" PRId64 " exceeds maximum 20 (would overflow int64)", \ + static_cast(value)); \ + gdv_fn_context_set_error_msg(ctx, err_msg); \ + return 0; \ + } \ + \ + return kFactorialLookupTable[static_cast(value)]; \ } FACTORIAL(int32) diff --git a/cpp/src/gandiva/precompiled/extended_math_ops_test.cc b/cpp/src/gandiva/precompiled/extended_math_ops_test.cc index ad0cb78188c5..e398ff47f56c 100644 --- a/cpp/src/gandiva/precompiled/extended_math_ops_test.cc +++ b/cpp/src/gandiva/precompiled/extended_math_ops_test.cc @@ -19,6 +19,7 @@ # define M_PI 3.14159265358979323846 #endif +#include #include #include @@ -61,11 +62,12 @@ TEST(TestExtendedMathOps, TestFactorial) { } factorial_int32(ctx, 21); - EXPECT_TRUE(context.get_error().find("overflow") != std::string::npos); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("overflow")); context.Reset(); factorial_int32(ctx, -5); - EXPECT_TRUE(context.get_error().find("Factorial of negative") != std::string::npos); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("FACTORIAL")); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("non-negative")); context.Reset(); for (int64_t i = 0; i <= 20; ++i) { @@ -79,11 +81,12 @@ TEST(TestExtendedMathOps, TestFactorial) { } factorial_int64(ctx, 21); - EXPECT_TRUE(context.get_error().find("overflow") != std::string::npos); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("overflow")); context.Reset(); factorial_int64(ctx, -5); - EXPECT_TRUE(context.get_error().find("Factorial of negative") != std::string::npos); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("FACTORIAL")); + EXPECT_THAT(context.get_error(), ::testing::HasSubstr("non-negative")); context.Reset(); } diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 035d3c8c62e1..1850f5081cc7 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -27,6 +27,7 @@ extern "C" { #include #include #include +#include #include "./types.h" @@ -540,7 +541,9 @@ gdv_boolean compare_lower_strings(const char* base_str, gdv_int32 base_str_len, FORCE_INLINE gdv_boolean castBIT_utf8(gdv_int64 context, const char* data, gdv_int32 data_len) { if (data_len <= 0) { - gdv_fn_context_set_error_msg(context, "Invalid value for boolean."); + gdv_fn_context_set_error_msg(context, + "CAST_BIT: Invalid value for boolean: empty string " + "(expected 0, 1, true, false; case-insensitive)"); return false; } @@ -569,7 +572,10 @@ gdv_boolean castBIT_utf8(gdv_int64 context, const char* data, gdv_int32 data_len if (compare_lower_strings("false", 5, trimmed_data, trimmed_len)) return false; } // if no 'true', 'false', '0' or '1' value is found, set an error - gdv_fn_context_set_error_msg(context, "Invalid value for boolean."); + std::string err_msg = "CAST_BIT: Invalid value for boolean: '" + + std::string(data, data_len) + + "' (expected 0, 1, true, false; case-insensitive)"; + gdv_fn_context_set_error_msg(context, err_msg.c_str()); return false; } @@ -578,7 +584,10 @@ const char* castVARCHAR_bool_int64(gdv_int64 context, gdv_boolean value, gdv_int64 out_len, gdv_int32* out_length) { gdv_int32 len = static_cast(out_len); if (len < 0) { - gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); + char err_msg[96]; + snprintf(err_msg, sizeof(err_msg), + "CAST_VARCHAR: Output buffer length can't be negative, got %d", len); + gdv_fn_context_set_error_msg(context, err_msg); *out_length = 0; return ""; } @@ -592,90 +601,93 @@ const char* castVARCHAR_bool_int64(gdv_int64 context, gdv_boolean value, } // Truncates the string to given length -#define CAST_VARCHAR_FROM_VARLEN_TYPE(TYPE) \ - FORCE_INLINE \ - const char* castVARCHAR_##TYPE##_int64(gdv_int64 context, const char* data, \ - gdv_int32 data_len, int64_t out_len, \ - int32_t* out_length) { \ - int32_t len = static_cast(out_len); \ - \ - if (len < 0) { \ - gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); \ - *out_length = 0; \ - return ""; \ - } \ - \ - if (len >= data_len || len == 0) { \ - *out_length = data_len; \ - return data; \ - } \ - \ - int32_t remaining = len; \ - int32_t index = 0; \ - bool is_multibyte = false; \ - do { \ - /* In utf8, MSB of a single byte unicode char is always 0, \ - * whereas for a multibyte character the MSB of each byte is 1. \ - * So for a single byte char, a bitwise-and with x80 (10000000) will be 0 \ - * and it won't be 0 for bytes of a multibyte char. \ - */ \ - char* data_ptr = const_cast(data); \ - \ - /* advance byte by byte till the 8-byte boundary then advance 8 bytes */ \ - auto num_bytes = reinterpret_cast(data_ptr) & 0x07; \ - num_bytes = (8 - num_bytes) & 0x07; \ - while (num_bytes > 0) { \ - uint8_t* ptr = reinterpret_cast(data_ptr + index); \ - if ((*ptr & 0x80) != 0) { \ - is_multibyte = true; \ - break; \ - } \ - index++; \ - remaining--; \ - num_bytes--; \ - } \ - if (is_multibyte) break; \ - while (remaining >= 8) { \ - uint64_t* ptr = reinterpret_cast(data_ptr + index); \ - if ((*ptr & 0x8080808080808080) != 0) { \ - is_multibyte = true; \ - break; \ - } \ - index += 8; \ - remaining -= 8; \ - } \ - if (is_multibyte) break; \ - if (remaining >= 4) { \ - uint32_t* ptr = reinterpret_cast(data_ptr + index); \ - if ((*ptr & 0x80808080) != 0) break; \ - index += 4; \ - remaining -= 4; \ - } \ - while (remaining > 0) { \ - uint8_t* ptr = reinterpret_cast(data_ptr + index); \ - if ((*ptr & 0x80) != 0) { \ - is_multibyte = true; \ - break; \ - } \ - index++; \ - remaining--; \ - } \ - if (is_multibyte) break; \ - /* reached here; all are single byte characters */ \ - *out_length = len; \ - return data; \ - } while (false); \ - \ - /* detected multibyte utf8 characters; slow path */ \ - int32_t byte_pos = \ - utf8_byte_pos(context, data + index, data_len - index, len - index); \ - if (byte_pos < 0) { \ - *out_length = 0; \ - return ""; \ - } \ - \ - *out_length = index + byte_pos; \ - return data; \ +#define CAST_VARCHAR_FROM_VARLEN_TYPE(TYPE) \ + FORCE_INLINE \ + const char* castVARCHAR_##TYPE##_int64(gdv_int64 context, const char* data, \ + gdv_int32 data_len, int64_t out_len, \ + int32_t* out_length) { \ + int32_t len = static_cast(out_len); \ + \ + if (len < 0) { \ + char err_msg[96]; \ + snprintf(err_msg, sizeof(err_msg), \ + "CAST_VARCHAR: Output buffer length can't be negative, got %d", len); \ + gdv_fn_context_set_error_msg(context, err_msg); \ + *out_length = 0; \ + return ""; \ + } \ + \ + if (len >= data_len || len == 0) { \ + *out_length = data_len; \ + return data; \ + } \ + \ + int32_t remaining = len; \ + int32_t index = 0; \ + bool is_multibyte = false; \ + do { \ + /* In utf8, MSB of a single byte unicode char is always 0, \ + * whereas for a multibyte character the MSB of each byte is 1. \ + * So for a single byte char, a bitwise-and with x80 (10000000) will be 0 \ + * and it won't be 0 for bytes of a multibyte char. \ + */ \ + char* data_ptr = const_cast(data); \ + \ + /* advance byte by byte till the 8-byte boundary then advance 8 bytes */ \ + auto num_bytes = reinterpret_cast(data_ptr) & 0x07; \ + num_bytes = (8 - num_bytes) & 0x07; \ + while (num_bytes > 0) { \ + uint8_t* ptr = reinterpret_cast(data_ptr + index); \ + if ((*ptr & 0x80) != 0) { \ + is_multibyte = true; \ + break; \ + } \ + index++; \ + remaining--; \ + num_bytes--; \ + } \ + if (is_multibyte) break; \ + while (remaining >= 8) { \ + uint64_t* ptr = reinterpret_cast(data_ptr + index); \ + if ((*ptr & 0x8080808080808080) != 0) { \ + is_multibyte = true; \ + break; \ + } \ + index += 8; \ + remaining -= 8; \ + } \ + if (is_multibyte) break; \ + if (remaining >= 4) { \ + uint32_t* ptr = reinterpret_cast(data_ptr + index); \ + if ((*ptr & 0x80808080) != 0) break; \ + index += 4; \ + remaining -= 4; \ + } \ + while (remaining > 0) { \ + uint8_t* ptr = reinterpret_cast(data_ptr + index); \ + if ((*ptr & 0x80) != 0) { \ + is_multibyte = true; \ + break; \ + } \ + index++; \ + remaining--; \ + } \ + if (is_multibyte) break; \ + /* reached here; all are single byte characters */ \ + *out_length = len; \ + return data; \ + } while (false); \ + \ + /* detected multibyte utf8 characters; slow path */ \ + int32_t byte_pos = \ + utf8_byte_pos(context, data + index, data_len - index, len - index); \ + if (byte_pos < 0) { \ + *out_length = 0; \ + return ""; \ + } \ + \ + *out_length = index + byte_pos; \ + return data; \ } CAST_VARCHAR_FROM_VARLEN_TYPE(utf8) @@ -691,7 +703,10 @@ CAST_VARCHAR_FROM_VARLEN_TYPE(binary) int32_t* out_length) { \ int32_t len = static_cast(out_len); \ if (len < 0) { \ - gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); \ + char err_msg[96]; \ + snprintf(err_msg, sizeof(err_msg), \ + "CAST_VARBINARY: Output buffer length can't be negative, got %d", len); \ + gdv_fn_context_set_error_msg(context, err_msg); \ *out_length = 0; \ return ""; \ } \ @@ -839,13 +854,21 @@ const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_le } // if the repeat number is a negative number, an error is set on context if (repeat_number < 0) { - gdv_fn_context_set_error_msg(context, "Repeat number can't be negative"); + char err_msg[96]; + snprintf(err_msg, sizeof(err_msg), "REPEAT: Repeat number can't be negative, got %d", + repeat_number); + gdv_fn_context_set_error_msg(context, err_msg); *out_len = 0; return ""; } if (ARROW_PREDICT_FALSE( arrow::internal::MultiplyWithOverflow(repeat_number, in_len, out_len))) { - gdv_fn_context_set_error_msg(context, "Would overflow maximum output size"); + char err_msg[128]; + snprintf(err_msg, sizeof(err_msg), + "REPEAT: Would overflow maximum output size " + "(repeat count %d * input length %d)", + repeat_number, in_len); + gdv_fn_context_set_error_msg(context, err_msg); *out_len = 0; return ""; } @@ -1435,7 +1458,12 @@ const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char* int32_t char_to_replace_len, int32_t* out_len) { if (char_to_replace_len > 1) { - gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not supported"); + char err_msg[128]; + snprintf(err_msg, sizeof(err_msg), + "CONVERT_REPLACE_INVALID_FROM_UTF8: replacement must be a single byte, " + "got %d bytes", + char_to_replace_len); + gdv_fn_context_set_error_msg(context, err_msg); *out_len = 0; return ""; } @@ -1815,7 +1843,10 @@ gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len, const char* str, gdv_int32 str_len, gdv_int32 start_pos) { if (start_pos < 1) { - gdv_fn_context_set_error_msg(context, "Start position must be greater than 0"); + char err_msg[96]; + snprintf(err_msg, sizeof(err_msg), + "LOCATE: Start position must be greater than 0, got %d", start_pos); + gdv_fn_context_set_error_msg(context, err_msg); return 0; } @@ -1859,7 +1890,8 @@ const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* t for (; text_index <= text_len - from_str_len;) { if (memcmp(text + text_index, from_str, from_str_len) == 0) { if (out_index + text_index - last_match_index + to_str_len > max_length) { - gdv_fn_context_set_error_msg(context, "Buffer overflow for output string"); + gdv_fn_context_set_error_msg(context, + "REPLACE: Buffer overflow for output string"); *out_len = 0; return ""; } @@ -1894,7 +1926,7 @@ const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* t } if (out_index + text_len - last_match_index > max_length) { - gdv_fn_context_set_error_msg(context, "Buffer overflow for output string"); + gdv_fn_context_set_error_msg(context, "REPLACE: Buffer overflow for output string"); *out_len = 0; return ""; } @@ -2315,62 +2347,75 @@ const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_le return ret; } -#define CAST_INT_BIGINT_VARBINARY(OUT_TYPE, TYPE_NAME) \ - FORCE_INLINE \ - OUT_TYPE \ - cast##TYPE_NAME##_varbinary(gdv_int64 context, const char* in, int32_t in_len) { \ - if (in_len == 0) { \ - gdv_fn_context_set_error_msg(context, "Can't cast an empty string."); \ - return -1; \ - } \ - char sign = in[0]; \ - \ - bool negative = false; \ - if (sign == '-') { \ - negative = true; \ - /* Ignores the sign char in the hexadecimal string */ \ - in++; \ - in_len--; \ - } \ - \ - if (negative && in_len == 0) { \ - gdv_fn_context_set_error_msg(context, \ - "Can't cast hexadecimal with only a minus sign."); \ - return -1; \ - } \ - \ - OUT_TYPE result = 0; \ - int digit; \ - \ - int read_index = 0; \ - while (read_index < in_len) { \ - char c1 = in[read_index]; \ - if (isxdigit(c1)) { \ - digit = to_binary_from_hex(c1); \ - \ - OUT_TYPE next = result * 16 - digit; \ - \ - if (next > result) { \ - gdv_fn_context_set_error_msg(context, "Integer overflow."); \ - return -1; \ - } \ - result = next; \ - read_index++; \ - } else { \ - gdv_fn_context_set_error_msg(context, \ - "The hexadecimal given has invalid characters."); \ - return -1; \ - } \ - } \ - if (!negative) { \ - result *= -1; \ - \ - if (result < 0) { \ - gdv_fn_context_set_error_msg(context, "Integer overflow."); \ - return -1; \ - } \ - } \ - return result; \ +#define CAST_INT_BIGINT_VARBINARY(OUT_TYPE, TYPE_NAME) \ + FORCE_INLINE \ + OUT_TYPE \ + cast##TYPE_NAME##_varbinary(gdv_int64 context, const char* in, int32_t in_len) { \ + const char* in_original = in; \ + int32_t in_len_original = in_len; \ + if (in_len == 0) { \ + gdv_fn_context_set_error_msg( \ + context, "CAST_" #TYPE_NAME "_FROM_HEX: can't cast an empty string"); \ + return -1; \ + } \ + char sign = in[0]; \ + \ + bool negative = false; \ + if (sign == '-') { \ + negative = true; \ + /* Ignores the sign char in the hexadecimal string */ \ + in++; \ + in_len--; \ + } \ + \ + if (negative && in_len == 0) { \ + gdv_fn_context_set_error_msg( \ + context, "CAST_" #TYPE_NAME \ + "_FROM_HEX: can't cast hexadecimal with only a minus sign"); \ + return -1; \ + } \ + \ + OUT_TYPE result = 0; \ + int digit; \ + \ + int read_index = 0; \ + while (read_index < in_len) { \ + char c1 = in[read_index]; \ + if (isxdigit(c1)) { \ + digit = to_binary_from_hex(c1); \ + \ + OUT_TYPE next = result * 16 - digit; \ + \ + if (next > result) { \ + std::string err_msg = \ + "CAST_" #TYPE_NAME \ + "_FROM_HEX: integer overflow while reading hex value '" + \ + std::string(in_original, in_len_original) + "'"; \ + gdv_fn_context_set_error_msg(context, err_msg.c_str()); \ + return -1; \ + } \ + result = next; \ + read_index++; \ + } else { \ + std::string err_msg = "CAST_" #TYPE_NAME \ + "_FROM_HEX: invalid character in hex value '" + \ + std::string(in_original, in_len_original) + "'"; \ + gdv_fn_context_set_error_msg(context, err_msg.c_str()); \ + return -1; \ + } \ + } \ + if (!negative) { \ + result *= -1; \ + \ + if (result < 0) { \ + std::string err_msg = "CAST_" #TYPE_NAME \ + "_FROM_HEX: integer overflow while reading hex value '" + \ + std::string(in_original, in_len_original) + "'"; \ + gdv_fn_context_set_error_msg(context, err_msg.c_str()); \ + return -1; \ + } \ + } \ + return result; \ } CAST_INT_BIGINT_VARBINARY(int32_t, INT) diff --git a/cpp/src/gandiva/precompiled/time.cc b/cpp/src/gandiva/precompiled/time.cc index 8414d0ed37cf..ff3b849ef492 100644 --- a/cpp/src/gandiva/precompiled/time.cc +++ b/cpp/src/gandiva/precompiled/time.cc @@ -17,6 +17,8 @@ #include "./epoch_time_point.h" +#include + extern "C" { #define __STDC_FORMAT_MACROS @@ -268,7 +270,9 @@ static const int WEEK_LEN[] = {6, 6, 7, 9, 8, 6, 8}; } \ } \ if (dateSearch == 0) { \ - gdv_fn_context_set_error_msg(context, "The weekday in this entry is invalid"); \ + std::string err_msg = "NEXT_DAY: '" + std::string(in, in_len) + \ + "' is not a recognized day of the week"; \ + gdv_fn_context_set_error_msg(context, err_msg.c_str()); \ return 0; \ } \ \ @@ -1047,7 +1051,12 @@ CAST_NULLABLE_INTERVAL_DAY(int64) gdv_month_interval castNULLABLEINTERVALYEAR_##TYPE(int64_t context, gdv_##TYPE in) { \ gdv_month_interval value = static_cast(in); \ if (value != in) { \ - gdv_fn_context_set_error_msg(context, "Integer overflow"); \ + char err_msg[96]; \ + snprintf(err_msg, sizeof(err_msg), \ + "CAST_INTERVAL_YEAR: Integer overflow casting %" PRId64 \ + " to month interval", \ + static_cast(in)); \ + gdv_fn_context_set_error_msg(context, err_msg); \ } \ return value; \ } diff --git a/cpp/src/gandiva/precompiled/time_test.cc b/cpp/src/gandiva/precompiled/time_test.cc index 6cfa6acf579d..7aed68951cae 100644 --- a/cpp/src/gandiva/precompiled/time_test.cc +++ b/cpp/src/gandiva/precompiled/time_test.cc @@ -962,10 +962,80 @@ TEST(TestTime, TestNextDay) { ts = StringToTimestamp("2015-08-06 11:12:30"); out = next_day_from_timestamp(context_ptr, ts, "AHSRK", 5); - EXPECT_EQ(context.get_error(), "The weekday in this entry is invalid"); + EXPECT_NE(context.get_error().find("NEXT_DAY"), std::string::npos); + EXPECT_NE(context.get_error().find("AHSRK"), std::string::npos); context.Reset(); } +// Document that next_day's weekday-name matching is case-sensitive: +// the WEEK[] lookup table holds uppercase names ("MONDAY", "TUE", ...) and +// is_substr_utf8_utf8 does a byte-exact memcmp, so lowercase or mixed-case +// input does not match and produces a NEXT_DAY error. +TEST(TestTime, TestNextDayCaseSensitive) { + ExecutionContext context; + int64_t context_ptr = reinterpret_cast(&context); + + gdv_timestamp ts = StringToTimestamp("2021-11-08 10:20:34"); + + // Uppercase: matches. + auto out = next_day_from_timestamp(context_ptr, ts, "FRIDAY", 6); + EXPECT_EQ(StringToTimestamp("2021-11-12 00:00:00"), out); + EXPECT_FALSE(context.has_error()); + + // Lowercase: does NOT match (case-sensitive memcmp against uppercase WEEK[]). + out = next_day_from_timestamp(context_ptr, ts, "friday", 6); + EXPECT_TRUE(context.has_error()); + EXPECT_NE(context.get_error().find("NEXT_DAY"), std::string::npos); + EXPECT_NE(context.get_error().find("friday"), std::string::npos); + context.Reset(); + + // Mixed case: also does NOT match. + out = next_day_from_timestamp(context_ptr, ts, "Friday", 6); + EXPECT_TRUE(context.has_error()); + EXPECT_NE(context.get_error().find("NEXT_DAY"), std::string::npos); + EXPECT_NE(context.get_error().find("Friday"), std::string::npos); + context.Reset(); +} + +// Document that next_day's weekday-name matching is loose: it uses +// is_substr_utf8_utf8 to test whether the input is a *substring* of any +// WEEK[] entry, walking the array in the order +// SUNDAY, MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, SATURDAY +// and returning the FIRST match. This means single-letter prefixes 'S' and +// 'T' are ambiguous and silently resolve to SUNDAY / TUESDAY respectively +// (never SATURDAY / THURSDAY). Likewise, any substring shared across weekdays +// (e.g. "DAY") matches SUNDAY because it is first in the array. +TEST(TestTime, TestNextDayAmbiguousPrefix) { + ExecutionContext context; + int64_t context_ptr = reinterpret_cast(&context); + + gdv_timestamp ts = StringToTimestamp("2021-11-08 10:20:34"); // Mon + + // "S" -> Sunday (could have been Saturday). + auto out = next_day_from_timestamp(context_ptr, ts, "S", 1); + EXPECT_EQ(StringToTimestamp("2021-11-14 00:00:00"), out); // next Sunday + EXPECT_FALSE(context.has_error()); + + // "T" -> Tuesday (could have been Thursday). + out = next_day_from_timestamp(context_ptr, ts, "T", 1); + EXPECT_EQ(StringToTimestamp("2021-11-09 00:00:00"), out); // next Tuesday + EXPECT_FALSE(context.has_error()); + + // "DAY" appears in every weekday name -> matches SUNDAY (first in array). + out = next_day_from_timestamp(context_ptr, ts, "DAY", 3); + EXPECT_EQ(StringToTimestamp("2021-11-14 00:00:00"), out); // next Sunday + EXPECT_FALSE(context.has_error()); + + // Unambiguous 2-letter prefixes work as expected. + out = next_day_from_timestamp(context_ptr, ts, "SA", 2); + EXPECT_EQ(StringToTimestamp("2021-11-13 00:00:00"), out); // next Saturday + EXPECT_FALSE(context.has_error()); + + out = next_day_from_timestamp(context_ptr, ts, "TH", 2); + EXPECT_EQ(StringToTimestamp("2021-11-11 00:00:00"), out); // next Thursday + EXPECT_FALSE(context.has_error()); +} + TEST(TestTime, TestCastTimestampToTime) { gdv_timestamp ts = StringToTimestamp("2000-05-01 10:20:34"); auto expected_response = @@ -1172,7 +1242,7 @@ TEST(TestTime, TestCastNullableInterval) { EXPECT_EQ(castNULLABLEINTERVALYEAR_int64(context_ptr, 1201), 1201); // validate overflow error when using bigint as input castNULLABLEINTERVALYEAR_int64(context_ptr, INT64_MAX); - EXPECT_EQ(context.get_error(), "Integer overflow"); + EXPECT_NE(context.get_error().find("Integer overflow"), std::string::npos); context.Reset(); } diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc index 6c0c3d40f127..e6bd09be37ba 100644 --- a/cpp/src/gandiva/regex_functions_holder.cc +++ b/cpp/src/gandiva/regex_functions_holder.cc @@ -237,7 +237,11 @@ const char* ExtractHolder::operator()(ExecutionContext* ctx, const char* user_in int32_t user_input_len, int32_t extract_index, int32_t* out_length) { if (extract_index < 0 || extract_index >= num_groups_pattern_) { - ctx->set_error_msg("Index to extract out of range"); + std::string err_msg = "REGEXP_EXTRACT: invalid group_index '" + + std::to_string(extract_index) + "'; must be between 0 and " + + std::to_string(num_groups_pattern_ - 1) + + " (the number of capture groups in the pattern)"; + ctx->set_error_msg(err_msg.c_str()); *out_length = 0; return ""; }