Skip to content

Commit 93b2a9a

Browse files
authored
feat(sql): add AUTO datetime format detection (#19659)
* feat(sql): add AUTO datetime format detection Add `enable_auto_detect_datetime_format` for deterministic datetime format inference. Parsing boundaries: - Direct functions: - TIMESTAMP: ISO -> epoch -> dtparse(strict=0) -> auto - DATE: ISO -> numeric-day -> dtparse(strict=0) -> auto - VARIANT / COPY / field decoders: - TIMESTAMP: ISO -> epoch -> auto - DATE: ISO -> numeric-day -> auto Also: - keep DATE numeric strings aligned with existing `to_date(number)` semantics - document that `enable_strict_datetime_parser` only affects function-level fallback parsing - recommend explicit formats for string-to-date/timestamp conversion * refactor(sql): move AUTO datetime detection to common-expression and simplify eval functions Move auto-detect datetime parsing functions (int64_to_timestamp, parse_epoch_str, auto_detect_*, parse_*_with_auto, etc.) from databend-functions-scalar-datetime into a new module at databend-common-expression::utils::auto_detect_datetime. This fixes the inverted dependency where databend-common-formats depended on databend-functions-scalar-datetime. The formats crate now imports these utilities directly from common-expression. Also extract the 4-layer fallback logic (ISO -> epoch/auto -> dtparse) from eval_string_to_timestamp and eval_string_to_date into standalone parse_string_to_timestamp and parse_string_to_date functions, making the vectorized eval closures concise. Fix setting descriptions: mention enable_auto_detect_datetime_format interaction in enable_strict_datetime_parser, and add missing Unix date format to enable_auto_detect_datetime_format description.
1 parent b235140 commit 93b2a9a

File tree

19 files changed

+1159
-206
lines changed

19 files changed

+1159
-206
lines changed

Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/common/io/src/format_settings.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ pub struct InputFormatSettings {
122122

123123
pub is_rounding_mode: bool,
124124
pub disable_variant_check: bool,
125+
pub enable_auto_detect_datetime_format: bool,
125126
}
126127

127128
// only used for tests
@@ -133,6 +134,7 @@ impl Default for InputFormatSettings {
133134
binary_format: BinaryDisplayFormat::Hex,
134135
is_rounding_mode: true,
135136
disable_variant_check: false,
137+
enable_auto_detect_datetime_format: false,
136138
}
137139
}
138140
}

src/query/expression/src/function.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ pub struct FunctionContext {
142142
pub binary_output_format: BinaryDisplayFormat,
143143
pub parse_datetime_ignore_remainder: bool,
144144
pub enable_strict_datetime_parser: bool,
145+
pub enable_auto_detect_datetime_format: bool,
145146
pub random_function_seed: bool,
146147
pub week_start: u8,
147148
pub date_format_style: String,
@@ -161,6 +162,7 @@ impl Default for FunctionContext {
161162
binary_output_format: BinaryDisplayFormat::Utf8,
162163
parse_datetime_ignore_remainder: false,
163164
enable_strict_datetime_parser: true,
165+
enable_auto_detect_datetime_format: false,
164166
random_function_seed: false,
165167
week_start: 0,
166168
date_format_style: "oracle".to_string(),
Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
// Copyright 2021 Datafuse Labs
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use databend_common_column::types::timestamp_tz;
16+
use databend_common_exception::ErrorCode;
17+
use databend_common_timezone::fast_utc_from_local;
18+
use jiff::Timestamp;
19+
use jiff::fmt::strtime::BrokenDownTime;
20+
use jiff::tz::TimeZone;
21+
22+
use crate::serialize::uniform_date;
23+
use crate::types::date::clamp_date;
24+
use crate::types::date::string_to_date;
25+
use crate::types::timestamp::MICROS_PER_MILLI;
26+
use crate::types::timestamp::MICROS_PER_SEC;
27+
use crate::types::timestamp::TIMESTAMP_MAX;
28+
use crate::types::timestamp::TIMESTAMP_MIN;
29+
use crate::types::timestamp::clamp_timestamp;
30+
use crate::types::timestamp::string_to_timestamp;
31+
use crate::types::timestamp_tz::string_to_timestamp_tz;
32+
33+
// ---------------------------------------------------------------------------
34+
// AUTO datetime format detection
35+
// ---------------------------------------------------------------------------
36+
37+
/// Date-only layouts tried in order when auto-detecting a date string.
const AUTO_DATE_FORMATS: &[&str] = &[
    // DD-MON-YYYY
    "%d-%b-%Y",
    // MM/DD/YYYY
    "%m/%d/%Y",
];

/// Timestamp layouts tried in order when auto-detecting a timestamp string.
///
/// Within each family the variant with fractional seconds (`%.f`) is listed
/// before the one without, and variants carrying an explicit offset (`%z`)
/// are listed before the offset-less ones.
const AUTO_TS_FORMATS: &[&str] = &[
    // --- DD-MON-YYYY ---
    "%d-%b-%Y %H:%M:%S%.f",
    "%d-%b-%Y %H:%M:%S",
    "%d-%b-%Y",
    // --- MM/DD/YYYY ---
    "%m/%d/%Y %H:%M:%S%.f",
    "%m/%d/%Y %H:%M:%S",
    "%m/%d/%Y",
    // --- RFC 2822, 24-hour clock ---
    "%a, %d %b %Y %H:%M:%S%.f %z",
    "%a, %d %b %Y %H:%M:%S %z",
    "%a, %d %b %Y %H:%M:%S%.f",
    "%a, %d %b %Y %H:%M:%S",
    // --- RFC 2822, 12-hour clock with AM/PM ---
    "%a, %d %b %Y %I:%M:%S%.f %p %z",
    "%a, %d %b %Y %I:%M:%S %p %z",
    "%a, %d %b %Y %I:%M:%S%.f %p",
    "%a, %d %b %Y %I:%M:%S %p",
    // --- Unix `date` style ---
    "%a %b %d %H:%M:%S %z %Y",
];
61+
62+
/// Check if timestamp is within range, and return the timestamp in micros.
63+
#[inline]
64+
pub fn int64_to_timestamp(mut n: i64) -> i64 {
65+
if -31536000000 < n && n < 31536000000 {
66+
n * MICROS_PER_SEC
67+
} else if -31536000000000 < n && n < 31536000000000 {
68+
n * MICROS_PER_MILLI
69+
} else {
70+
clamp_timestamp(&mut n);
71+
n
72+
}
73+
}
74+
75+
/// calc int64 domain to timestamp domain
76+
#[inline]
77+
pub fn calc_int64_to_timestamp_domain(n: i64) -> i64 {
78+
if -31536000000 < n && n < 31536000000 {
79+
n * MICROS_PER_SEC
80+
} else if -31536000000000 < n && n < 31536000000000 {
81+
n * MICROS_PER_MILLI
82+
} else {
83+
n.clamp(TIMESTAMP_MIN, TIMESTAMP_MAX)
84+
}
85+
}
86+
87+
/// Try to parse a string as an epoch number and convert to microseconds.
88+
/// Reuses the same rules as `int64_to_timestamp` / `to_timestamp(number)`.
89+
pub fn parse_epoch_str(val: &str) -> Option<i64> {
90+
let n: i64 = val.parse().ok()?;
91+
Some(int64_to_timestamp(n))
92+
}
93+
94+
/// Core format-matching loop: tries each format, returns `(micros, offset_seconds)`.
95+
fn try_parse_formats(val: &str, tz: &TimeZone, formats: &[&str]) -> Option<(i64, i32)> {
96+
for fmt in formats {
97+
let (tm, consumed) = match BrokenDownTime::parse_prefix(fmt, val) {
98+
Ok(pair) => pair,
99+
Err(_) => continue,
100+
};
101+
if consumed != val.len() {
102+
continue;
103+
}
104+
match tm.offset() {
105+
Some(_) => {
106+
let zoned = match tm.to_zoned() {
107+
Ok(z) => z,
108+
Err(_) => continue,
109+
};
110+
return Some((zoned.timestamp().as_microsecond(), zoned.offset().seconds()));
111+
}
112+
None => {
113+
let micros = match fast_timestamp_from_tm(&tm, tz) {
114+
Some(m) => m,
115+
None => continue,
116+
};
117+
let ts = match Timestamp::from_microsecond(micros) {
118+
Ok(t) => t,
119+
Err(_) => continue,
120+
};
121+
let zoned = ts.to_zoned(tz.clone());
122+
return Some((micros, zoned.offset().seconds()));
123+
}
124+
}
125+
}
126+
None
127+
}
128+
129+
pub fn fast_timestamp_from_tm(tm: &BrokenDownTime, tz: &TimeZone) -> Option<i64> {
130+
let year = i32::from(tm.year()?);
131+
let month: u8 = tm.month()?.try_into().ok()?;
132+
let day: u8 = tm.day()?.try_into().ok()?;
133+
let hour: u8 = tm.hour().unwrap_or(0).try_into().ok()?;
134+
let minute: u8 = tm.minute().unwrap_or(0).try_into().ok()?;
135+
let second: u8 = tm.second().unwrap_or(0).try_into().ok()?;
136+
let nanos = tm.subsec_nanosecond().unwrap_or(0);
137+
let micro = (nanos / 1_000).max(0) as u32;
138+
fast_utc_from_local(tz, year, month, day, hour, minute, second, micro)
139+
}
140+
141+
/// Match `val` against `AUTO_TS_FORMATS` and return the resulting timestamp
/// in microseconds, passed through `clamp_timestamp`.
///
/// Returns `None` when no format matches.
pub fn auto_detect_timestamp(val: &str, tz: &TimeZone) -> Option<i64> {
    try_parse_formats(val, tz, AUTO_TS_FORMATS).map(|(mut micros, _offset)| {
        clamp_timestamp(&mut micros);
        micros
    })
}
146+
147+
pub fn auto_detect_date(val: &str) -> Option<i32> {
148+
for fmt in AUTO_DATE_FORMATS {
149+
let (tm, consumed) = match BrokenDownTime::parse_prefix(fmt, val) {
150+
Ok(pair) => pair,
151+
Err(_) => continue,
152+
};
153+
if consumed != val.len() {
154+
continue;
155+
}
156+
let dt = match tm.to_datetime() {
157+
Ok(dt) => dt,
158+
Err(_) => continue,
159+
};
160+
return Some(clamp_date(uniform_date(dt.date()) as i64));
161+
}
162+
None
163+
}
164+
165+
pub fn auto_detect_timestamp_tz(val: &str, tz: &TimeZone) -> Option<timestamp_tz> {
166+
let (mut micros, offset) = try_parse_formats(val, tz, AUTO_TS_FORMATS)?;
167+
clamp_timestamp(&mut micros);
168+
Some(timestamp_tz::new(micros, offset))
169+
}
170+
171+
/// Parse a date string with optional auto-detect fallback.
172+
/// Chain: ISO -> numeric-day -> auto (no dtparse).
173+
#[allow(clippy::result_large_err)]
174+
pub fn parse_date_with_auto(val: &str, tz: &TimeZone, enable_auto: bool) -> Result<i32, ErrorCode> {
175+
match string_to_date(val, tz) {
176+
Ok(d) => Ok(uniform_date(d)),
177+
Err(e) => {
178+
if enable_auto {
179+
if let Ok(days) = val.parse::<i64>() {
180+
return Ok(clamp_date(days));
181+
}
182+
if let Some(days) = auto_detect_date(val) {
183+
return Ok(days);
184+
}
185+
}
186+
Err(e)
187+
}
188+
}
189+
}
190+
191+
/// Parse a timestamp string with optional auto-detect fallback.
192+
/// Chain: ISO -> epoch -> auto (no dtparse).
193+
#[allow(clippy::result_large_err)]
194+
pub fn parse_timestamp_with_auto(
195+
val: &str,
196+
tz: &TimeZone,
197+
enable_auto: bool,
198+
) -> Result<i64, ErrorCode> {
199+
match string_to_timestamp(val, tz) {
200+
Ok(ts) => Ok(ts.timestamp().as_microsecond()),
201+
Err(e) => {
202+
if enable_auto {
203+
if let Some(mut micros) = parse_epoch_str(val) {
204+
clamp_timestamp(&mut micros);
205+
return Ok(micros);
206+
}
207+
if let Some(micros) = auto_detect_timestamp(val, tz) {
208+
return Ok(micros);
209+
}
210+
}
211+
Err(e)
212+
}
213+
}
214+
}
215+
216+
/// Parse a timestamp_tz string with optional auto-detect fallback.
217+
/// Chain: ISO -> epoch -> auto (no dtparse).
218+
#[allow(clippy::result_large_err)]
219+
pub fn parse_timestamp_tz_with_auto(
220+
val: &str,
221+
tz: &TimeZone,
222+
enable_auto: bool,
223+
) -> Result<timestamp_tz, ErrorCode> {
224+
match string_to_timestamp_tz(val.as_bytes(), || tz) {
225+
Ok(ts_tz) => Ok(ts_tz),
226+
Err(e) => {
227+
if enable_auto {
228+
if let Some(mut micros) = parse_epoch_str(val) {
229+
clamp_timestamp(&mut micros);
230+
if let Ok(ts) = Timestamp::from_microsecond(micros) {
231+
let offset = tz.to_offset(ts).seconds();
232+
return Ok(timestamp_tz::new(micros, offset));
233+
}
234+
}
235+
if let Some(ts_tz) = auto_detect_timestamp_tz(val, tz) {
236+
return Ok(ts_tz);
237+
}
238+
}
239+
Err(e)
240+
}
241+
}
242+
}

src/query/expression/src/utils/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
pub mod arithmetics_type;
1616
pub mod arrow;
17+
pub mod auto_detect_datetime;
1718
pub mod bitmap;
1819
pub mod block_debug;
1920
pub mod block_thresholds;

src/query/formats/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ databend-common-expression = { workspace = true }
1515
databend-common-io = { workspace = true }
1616
databend-common-meta-app = { workspace = true }
1717
databend-common-settings = { workspace = true }
18-
databend-functions-scalar-datetime = { workspace = true }
1918
databend-storages-common-blocks = { workspace = true }
2019
databend-storages-common-table-meta = { workspace = true }
2120

0 commit comments

Comments
 (0)