From ce21df89b487e63ed908dc9cccaa4fd8ae7b62d9 Mon Sep 17 00:00:00 2001 From: Puneet Dixit Date: Sun, 17 May 2026 04:20:59 +0530 Subject: [PATCH 1/9] perf: optimize arrays_zip perfect list zips --- .../functions-nested/benches/arrays_zip.rs | 2 +- datafusion/functions-nested/src/arrays_zip.rs | 182 ++++++++++++++++++ 2 files changed, 183 insertions(+), 1 deletion(-) diff --git a/datafusion/functions-nested/benches/arrays_zip.rs b/datafusion/functions-nested/benches/arrays_zip.rs index bc82b2978cc42..812e5e3dbec8a 100644 --- a/datafusion/functions-nested/benches/arrays_zip.rs +++ b/datafusion/functions-nested/benches/arrays_zip.rs @@ -109,7 +109,7 @@ fn bench_arrays_zip(c: &mut Criterion, name: &str, null_density: f64) { } fn criterion_benchmark(c: &mut Criterion) { - bench_arrays_zip(c, "arrays_zip_no_nulls_8192", 0.0); + bench_arrays_zip(c, "arrays_zip_perfect_zip_8192", 0.0); bench_arrays_zip(c, "arrays_zip_10pct_nulls_8192", 0.1); } diff --git a/datafusion/functions-nested/src/arrays_zip.rs b/datafusion/functions-nested/src/arrays_zip.rs index 5f1cb9dedf408..27cd8539d44f1 100644 --- a/datafusion/functions-nested/src/arrays_zip.rs +++ b/datafusion/functions-nested/src/arrays_zip.rs @@ -165,6 +165,10 @@ fn arrays_zip_inner(args: &[ArrayRef]) -> Result { let num_rows = args[0].len(); + if let Some(result) = try_perfect_list_zip(args)? { + return Ok(result); + } + // Build a type-erased ListColumnView for each argument. // None means the argument is Null-typed (all nulls, no backing data). let mut views: Vec> = Vec::with_capacity(args.len()); @@ -327,3 +331,181 @@ fn arrays_zip_inner(args: &[ArrayRef]) -> Result { Ok(Arc::new(result)) } + +fn try_perfect_list_zip(args: &[ArrayRef]) -> Result> { + let mut list_arrays = Vec::with_capacity(args.len()); + let mut struct_fields = Vec::with_capacity(args.len()); + + for (i, arg) in args.iter().enumerate() { + let arr = match arg.data_type() { + List(field) => { + struct_fields.push(Field::new( + format!("{}", i + 1), + field.data_type().clone(), + true, + )); + as_list_array(arg)? + } + _ => return Ok(None), + }; + + list_arrays.push(arr); + } + + let first = list_arrays[0]; + let num_rows = first.len(); + let offsets = first.offsets().clone(); + let values_len = first.values().len(); + + for arr in &list_arrays { + if arr.len() != num_rows || arr.values().len() != values_len { + return Ok(None); + } + } + + let nulls = if list_arrays.iter().any(|arr| arr.null_count() != 0) { + let mut null_builder = NullBufferBuilder::new(num_rows); + for row_idx in 0..num_rows { + let mut all_null = true; + + for arr in &list_arrays { + if arr.is_null(row_idx) { + if arr.offsets()[row_idx + 1] != arr.offsets()[row_idx] { + return Ok(None); + } + } else { + all_null = false; + } + } + + if all_null { + null_builder.append_null(); + } else { + null_builder.append_non_null(); + } + } + + null_builder.finish() + } else { + None + }; + + for arr in &list_arrays { + if arr.offsets() != &offsets { + return Ok(None); + } + } + + let struct_columns = list_arrays + .iter() + .map(|arr| Arc::clone(arr.values())) + .collect::>(); + let struct_array = + StructArray::try_new(Fields::from(struct_fields), struct_columns, None)?; + let result = ListArray::try_new( + Arc::new(Field::new_list_field( + struct_array.data_type().clone(), + true, + )), + offsets, + Arc::new(struct_array), + nulls, + )?; + + Ok(Some(Arc::new(result))) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::Int64Array; + use arrow::buffer::NullBuffer; + + fn list(values: Vec, offsets: Vec) -> Arc { + list_with_validity(values, offsets, None) + } + + fn list_with_validity( + values: Vec, + offsets: Vec, + valid: Option>, + ) -> Arc { + Arc::new( + ListArray::try_new( + Arc::new(Field::new_list_field(DataType::Int64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(Int64Array::from(values)), + valid.map(NullBuffer::from), + ) + .unwrap(), + ) + } + + #[test] + fn perfect_zip_reuses_input_values_and_offsets() { + let left = list(vec![1, 2, 3, 4, 5, 6], vec![0, 2, 3, 6]); + let right = list(vec![10, 20, 30, 40, 50, 60], vec![0, 2, 3, 6]); + + let result = arrays_zip_inner(&[ + Arc::clone(&left) as ArrayRef, + Arc::clone(&right) as ArrayRef, + ]) + .unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + let values = result + .values() + .as_any() + .downcast_ref::() + .unwrap(); + + assert!(result.offsets().ptr_eq(left.offsets())); + assert!(Arc::ptr_eq(values.column(0), left.values())); + assert!(Arc::ptr_eq(values.column(1), right.values())); + } + + #[test] + fn perfect_zip_reuses_zero_length_null_rows() { + let left = list_with_validity( + vec![1, 2, 3, 4], + vec![0, 2, 2, 4], + Some(vec![true, false, true]), + ); + let right = list_with_validity( + vec![10, 20, 30, 40], + vec![0, 2, 2, 4], + Some(vec![true, false, true]), + ); + + let result = arrays_zip_inner(&[ + Arc::clone(&left) as ArrayRef, + Arc::clone(&right) as ArrayRef, + ]) + .unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + + assert!(result.offsets().ptr_eq(left.offsets())); + assert!(result.is_null(1)); + } + + #[test] + fn null_row_with_hidden_values_uses_general_path() { + let left = + list_with_validity(vec![1, 2, 3, 4], vec![0, 2, 4], Some(vec![true, false])); + let right = list_with_validity( + vec![10, 20, 30, 40], + vec![0, 2, 4], + Some(vec![true, false]), + ); + + let result = arrays_zip_inner(&[ + Arc::clone(&left) as ArrayRef, + Arc::clone(&right) as ArrayRef, + ]) + .unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + + assert!(!result.offsets().ptr_eq(left.offsets())); + assert_eq!(result.value_offsets(), &[0, 2, 2]); + assert!(result.is_null(1)); + } +} From 7795270850316f82bcc28ff59165019a50a53eab Mon Sep 17 00:00:00 2001 From: Puneet Dixit Date: Sun, 17 May 2026 04:37:37 +0530 Subject: [PATCH 2/9] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- datafusion/functions-nested/src/arrays_zip.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-nested/src/arrays_zip.rs b/datafusion/functions-nested/src/arrays_zip.rs index 27cd8539d44f1..630ba60fafd8d 100644 --- a/datafusion/functions-nested/src/arrays_zip.rs +++ b/datafusion/functions-nested/src/arrays_zip.rs @@ -340,7 +340,7 @@ fn try_perfect_list_zip(args: &[ArrayRef]) -> Result> { let arr = match arg.data_type() { List(field) => { struct_fields.push(Field::new( - format!("{}", i + 1), + (i + 1).to_string(), field.data_type().clone(), true, )); From 61459f1936d3da277f957602db62d790a6f05ce5 Mon Sep 17 00:00:00 2001 From: Puneet Dixit Date: Sun, 17 May 2026 04:44:57 +0530 Subject: [PATCH 3/9] perf: reject non-perfect arrays_zip offsets earlier --- datafusion/functions-nested/src/arrays_zip.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/datafusion/functions-nested/src/arrays_zip.rs b/datafusion/functions-nested/src/arrays_zip.rs index 630ba60fafd8d..74d8c0ad88a93 100644 --- a/datafusion/functions-nested/src/arrays_zip.rs +++ b/datafusion/functions-nested/src/arrays_zip.rs @@ -358,7 +358,10 @@ fn try_perfect_list_zip(args: &[ArrayRef]) -> Result> { let values_len = first.values().len(); for arr in &list_arrays { - if arr.len() != num_rows || arr.values().len() != values_len { + if arr.len() != num_rows + || arr.values().len() != values_len + || arr.offsets() != &offsets + { return Ok(None); } } @@ -390,12 +393,6 @@ fn try_perfect_list_zip(args: &[ArrayRef]) -> Result> { None }; - for arr in &list_arrays { - if arr.offsets() != &offsets { - return Ok(None); - } - } - let struct_columns = list_arrays .iter() .map(|arr| Arc::clone(arr.values())) From b02aad1901c058e46e9a5c3ce53840f5de5f42fa Mon Sep 17 00:00:00 2001 From: Puneet Dixit Date: Sun, 17 May 2026 04:50:07 +0530 Subject: [PATCH 4/9] docs: explain arrays_zip perfect-list fast path --- datafusion/functions-nested/src/arrays_zip.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/functions-nested/src/arrays_zip.rs b/datafusion/functions-nested/src/arrays_zip.rs index 74d8c0ad88a93..64900a196487b 100644 --- a/datafusion/functions-nested/src/arrays_zip.rs +++ b/datafusion/functions-nested/src/arrays_zip.rs @@ -332,6 +332,9 @@ fn arrays_zip_inner(args: &[ArrayRef]) -> Result { Ok(Arc::new(result)) } +/// Fast path for regular List inputs whose existing buffers already match the +/// zipped output: all offsets and values lengths match, and null rows cover no +/// values. This lets us reuse offsets and child values instead of rebuilding. fn try_perfect_list_zip(args: &[ArrayRef]) -> Result> { let mut list_arrays = Vec::with_capacity(args.len()); let mut struct_fields = Vec::with_capacity(args.len()); From 973e3dc17014ab9b7b7b13bf17f82f5708f91daa Mon Sep 17 00:00:00 2001 From: puneetdixit200 <236133619+puneetdixit200@users.noreply.github.com> Date: Mon, 18 May 2026 19:25:31 +0530 Subject: [PATCH 5/9] Document arrays zip fast path constraints --- datafusion/functions-nested/src/arrays_zip.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/functions-nested/src/arrays_zip.rs b/datafusion/functions-nested/src/arrays_zip.rs index 64900a196487b..3d2864305aee3 100644 --- a/datafusion/functions-nested/src/arrays_zip.rs +++ b/datafusion/functions-nested/src/arrays_zip.rs @@ -360,6 +360,8 @@ fn try_perfect_list_zip(args: &[ArrayRef]) -> Result> { let offsets = first.offsets().clone(); let values_len = first.values().len(); + // Reusing the child arrays is only valid when every list uses the exact + // same row boundaries and exposes the same total number of child values. for arr in &list_arrays { if arr.len() != num_rows || arr.values().len() != values_len From 035689147f610fe7d793723651848ac0323475c2 Mon Sep 17 00:00:00 2001 From: Puneet Dixit Date: Thu, 21 May 2026 07:51:23 +0530 Subject: [PATCH 6/9] perf: pass arrays_zip field names to fast path --- datafusion/functions-nested/src/arrays_zip.rs | 58 ++++++++++++++++--- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/datafusion/functions-nested/src/arrays_zip.rs b/datafusion/functions-nested/src/arrays_zip.rs index 3d2864305aee3..85d825d1c309a 100644 --- a/datafusion/functions-nested/src/arrays_zip.rs +++ b/datafusion/functions-nested/src/arrays_zip.rs @@ -130,7 +130,7 @@ impl ScalarUDFImpl for ArraysZip { return exec_err!("arrays_zip expects array arguments, got {dt}"); } }; - fields.push(Field::new(format!("{}", i + 1), element_type, true)); + fields.push(Field::new(arrays_zip_field_name(i), element_type, true)); } Ok(List(Arc::new(Field::new_list_field( @@ -163,9 +163,10 @@ fn arrays_zip_inner(args: &[ArrayRef]) -> Result { return exec_err!("arrays_zip requires at least one argument"); } + let field_names = arrays_zip_field_names(args.len()); let num_rows = args[0].len(); - if let Some(result) = try_perfect_list_zip(args)? { + if let Some(result) = try_perfect_list_zip(args, &field_names)? { return Ok(result); } @@ -229,8 +230,8 @@ fn arrays_zip_inner(args: &[ArrayRef]) -> Result { let struct_fields: Fields = element_types .iter() - .enumerate() - .map(|(i, dt)| Field::new(format!("{}", i + 1), dt.clone(), true)) + .zip(field_names.iter()) + .map(|(dt, name)| Field::new(name.clone(), dt.clone(), true)) .collect::>() .into(); @@ -332,18 +333,31 @@ fn arrays_zip_inner(args: &[ArrayRef]) -> Result { Ok(Arc::new(result)) } +fn arrays_zip_field_name(index: usize) -> String { + (index + 1).to_string() +} + +fn arrays_zip_field_names(len: usize) -> Vec { + (0..len).map(arrays_zip_field_name).collect() +} + /// Fast path for regular List inputs whose existing buffers already match the /// zipped output: all offsets and values lengths match, and null rows cover no /// values. This lets us reuse offsets and child values instead of rebuilding. -fn try_perfect_list_zip(args: &[ArrayRef]) -> Result> { +fn try_perfect_list_zip( + args: &[ArrayRef], + field_names: &[String], +) -> Result> { + debug_assert_eq!(args.len(), field_names.len()); + let mut list_arrays = Vec::with_capacity(args.len()); let mut struct_fields = Vec::with_capacity(args.len()); - for (i, arg) in args.iter().enumerate() { + for (arg, field_name) in args.iter().zip(field_names) { let arr = match arg.data_type() { List(field) => { struct_fields.push(Field::new( - (i + 1).to_string(), + field_name.clone(), field.data_type().clone(), true, )); @@ -465,6 +479,36 @@ mod tests { assert!(Arc::ptr_eq(values.column(1), right.values())); } + #[test] + fn perfect_zip_uses_supplied_field_names() { + let left = list(vec![1, 2, 3], vec![0, 1, 3]); + let right = list(vec![10, 20, 30], vec![0, 1, 3]); + let field_names = vec!["left".to_string(), "right".to_string()]; + + let result = try_perfect_list_zip( + &[ + Arc::clone(&left) as ArrayRef, + Arc::clone(&right) as ArrayRef, + ], + &field_names, + ) + .unwrap() + .unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + let values = result + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let names = values + .fields() + .iter() + .map(|field| field.name().as_str()) + .collect::>(); + + assert_eq!(names, vec!["left", "right"]); + } + #[test] fn perfect_zip_reuses_zero_length_null_rows() { let left = list_with_validity( From b184e03c522f5c9cd10679ac3aef43fd6b520785 Mon Sep 17 00:00:00 2001 From: Puneet Dixit <236133619+puneetdixit200@users.noreply.github.com> Date: Tue, 26 May 2026 12:38:42 +0530 Subject: [PATCH 7/9] perf: refine arrays_zip fast path nulls --- datafusion/functions-nested/src/arrays_zip.rs | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/datafusion/functions-nested/src/arrays_zip.rs b/datafusion/functions-nested/src/arrays_zip.rs index 85d825d1c309a..1c36cfa8d4639 100644 --- a/datafusion/functions-nested/src/arrays_zip.rs +++ b/datafusion/functions-nested/src/arrays_zip.rs @@ -377,15 +377,16 @@ fn try_perfect_list_zip( // Reusing the child arrays is only valid when every list uses the exact // same row boundaries and exposes the same total number of child values. for arr in &list_arrays { - if arr.len() != num_rows - || arr.values().len() != values_len - || arr.offsets() != &offsets - { + if arr.values().len() != values_len || arr.offsets() != &offsets { return Ok(None); } } let nulls = if list_arrays.iter().any(|arr| arr.null_count() != 0) { + // Match the general path: arrays_zip only marks an output row null + // when every concrete input list is null. Mixed null and non-null + // empty lists still produce a non-null empty list, so this cannot use + // NullBuffer::union_many, which would make rows null if any input is. let mut null_builder = NullBufferBuilder::new(num_rows); for row_idx in 0..num_rows { let mut all_null = true; @@ -533,6 +534,26 @@ mod tests { assert!(result.is_null(1)); } + #[test] + fn perfect_zip_preserves_mixed_null_empty_rows() { + let left = + list_with_validity(vec![], vec![0, 0, 0, 0], Some(vec![false, true, false])); + let right = + list_with_validity(vec![], vec![0, 0, 0, 0], Some(vec![true, false, false])); + + let result = arrays_zip_inner(&[ + Arc::clone(&left) as ArrayRef, + Arc::clone(&right) as ArrayRef, + ]) + .unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + + assert!(result.offsets().ptr_eq(left.offsets())); + assert!(!result.is_null(0)); + assert!(!result.is_null(1)); + assert!(result.is_null(2)); + } + #[test] fn null_row_with_hidden_values_uses_general_path() { let left = From 6ea61846209aa92a50afef584ac2f8d4a5fcca1f Mon Sep 17 00:00:00 2001 From: Puneet Dixit <236133619+puneetdixit200@users.noreply.github.com> Date: Tue, 26 May 2026 12:50:12 +0530 Subject: [PATCH 8/9] perf: use null buffer union for aligned list nulls --- datafusion/functions-nested/src/arrays_zip.rs | 85 +++++++++++++------ 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/datafusion/functions-nested/src/arrays_zip.rs b/datafusion/functions-nested/src/arrays_zip.rs index 1c36cfa8d4639..8a89b1c829f00 100644 --- a/datafusion/functions-nested/src/arrays_zip.rs +++ b/datafusion/functions-nested/src/arrays_zip.rs @@ -22,7 +22,7 @@ use arrow::array::{ Array, ArrayRef, Capacities, ListArray, MutableArrayData, NullBufferBuilder, StructArray, new_null_array, }; -use arrow::buffer::OffsetBuffer; +use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::datatypes::DataType::{FixedSizeList, LargeList, List, Null}; use arrow::datatypes::{DataType, Field, Fields}; use datafusion_common::cast::{ @@ -44,7 +44,7 @@ struct ListColumnView { /// Pre-computed per-row start offsets (length = num_rows + 1). offsets: Vec, /// Null bitmap from the input array (None means no nulls). - nulls: Option, + nulls: Option, } impl ListColumnView { @@ -383,32 +383,37 @@ fn try_perfect_list_zip( } let nulls = if list_arrays.iter().any(|arr| arr.null_count() != 0) { - // Match the general path: arrays_zip only marks an output row null - // when every concrete input list is null. Mixed null and non-null - // empty lists still produce a non-null empty list, so this cannot use - // NullBuffer::union_many, which would make rows null if any input is. - let mut null_builder = NullBufferBuilder::new(num_rows); - for row_idx in 0..num_rows { - let mut all_null = true; - - for arr in &list_arrays { - if arr.is_null(row_idx) { - if arr.offsets()[row_idx + 1] != arr.offsets()[row_idx] { - return Ok(None); + let first_nulls = first.nulls(); + if list_arrays.iter().all(|arr| arr.nulls() == first_nulls) { + NullBuffer::union_many(list_arrays.iter().map(|arr| arr.nulls())) + } else { + // Match the general path: arrays_zip only marks an output row null + // when every concrete input list is null. Mixed null and non-null + // empty lists still produce a non-null empty list, but mixed null + // rows with values must fall back to preserve field-level nulls. + let mut null_builder = NullBufferBuilder::new(num_rows); + for row_idx in 0..num_rows { + let mut all_null = true; + + for arr in &list_arrays { + if arr.is_null(row_idx) { + if arr.offsets()[row_idx + 1] != arr.offsets()[row_idx] { + return Ok(None); + } + } else { + all_null = false; } + } + + if all_null { + null_builder.append_null(); } else { - all_null = false; + null_builder.append_non_null(); } } - if all_null { - null_builder.append_null(); - } else { - null_builder.append_non_null(); - } + null_builder.finish() } - - null_builder.finish() } else { None }; @@ -555,7 +560,7 @@ mod tests { } #[test] - fn null_row_with_hidden_values_uses_general_path() { + fn perfect_zip_reuses_null_rows_with_hidden_values() { let left = list_with_validity(vec![1, 2, 3, 4], vec![0, 2, 4], Some(vec![true, false])); let right = list_with_validity( @@ -571,8 +576,38 @@ mod tests { .unwrap(); let result = result.as_any().downcast_ref::().unwrap(); - assert!(!result.offsets().ptr_eq(left.offsets())); - assert_eq!(result.value_offsets(), &[0, 2, 2]); + assert!(result.offsets().ptr_eq(left.offsets())); + assert_eq!(result.value_offsets(), &[0, 2, 4]); assert!(result.is_null(1)); } + + #[test] + fn mixed_null_row_with_hidden_values_uses_general_path() { + let left = + list_with_validity(vec![1, 2, 3, 4], vec![0, 2, 4], Some(vec![true, false])); + let right = list_with_validity( + vec![10, 20, 30, 40], + vec![0, 2, 4], + Some(vec![true, true]), + ); + + let result = arrays_zip_inner(&[ + Arc::clone(&left) as ArrayRef, + Arc::clone(&right) as ArrayRef, + ]) + .unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + let values = result + .values() + .as_any() + .downcast_ref::() + .unwrap(); + + assert!(!result.offsets().ptr_eq(left.offsets())); + assert_eq!(result.value_offsets(), &[0, 2, 4]); + assert!(values.column(0).is_null(2)); + assert!(values.column(0).is_null(3)); + assert!(!values.column(1).is_null(2)); + assert!(!values.column(1).is_null(3)); + } } From 1aaae2c18355f58bac7ecb8e1a402e5a81b49b18 Mon Sep 17 00:00:00 2001 From: Puneet Dixit <236133619+puneetdixit200@users.noreply.github.com> Date: Tue, 26 May 2026 13:12:31 +0530 Subject: [PATCH 9/9] perf: avoid redundant null buffer union --- datafusion/functions-nested/src/arrays_zip.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-nested/src/arrays_zip.rs b/datafusion/functions-nested/src/arrays_zip.rs index 8a89b1c829f00..76b1b589f42f5 100644 --- a/datafusion/functions-nested/src/arrays_zip.rs +++ b/datafusion/functions-nested/src/arrays_zip.rs @@ -385,7 +385,7 @@ fn try_perfect_list_zip( let nulls = if list_arrays.iter().any(|arr| arr.null_count() != 0) { let first_nulls = first.nulls(); if list_arrays.iter().all(|arr| arr.nulls() == first_nulls) { - NullBuffer::union_many(list_arrays.iter().map(|arr| arr.nulls())) + first_nulls.cloned() } else { // Match the general path: arrays_zip only marks an output row null // when every concrete input list is null. Mixed null and non-null