Skip to content

Commit 0b0358b

Browse files
committed
HIVE-29557: Parquet Vectorization reads NULL values in Arrays and Map as Default Values
1 parent 6f063df commit 0b0358b

File tree

5 files changed, with 324 additions and 13 deletions.

ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ private void fillColumnVector(PrimitiveObjectInspector.PrimitiveCategory categor
272272
for (int i = 0; i < valueList.size(); i++) {
273273
if (valueList.get(i) == null) {
274274
lcv.child.isNull[i] = true;
275+
lcv.child.noNulls = false;
275276
} else {
276277
((LongColumnVector) lcv.child).vector[i] = ((List<Integer>) valueList).get(i);
277278
}
@@ -287,6 +288,7 @@ private void fillColumnVector(PrimitiveObjectInspector.PrimitiveCategory categor
287288
for (int i = 0; i < valueList.size(); i++) {
288289
if (valueList.get(i) == null) {
289290
lcv.child.isNull[i] = true;
291+
lcv.child.noNulls = false;
290292
} else {
291293
((LongColumnVector) lcv.child).vector[i] = ((List<Long>) valueList).get(i);
292294
}
@@ -297,6 +299,7 @@ private void fillColumnVector(PrimitiveObjectInspector.PrimitiveCategory categor
297299
for (int i = 0; i < valueList.size(); i++) {
298300
if (valueList.get(i) == null) {
299301
lcv.child.isNull[i] = true;
302+
lcv.child.noNulls = false;
300303
} else {
301304
((DoubleColumnVector) lcv.child).vector[i] = ((List<Double>) valueList).get(i);
302305
}
@@ -313,6 +316,7 @@ private void fillColumnVector(PrimitiveObjectInspector.PrimitiveCategory categor
313316
if (src == null) {
314317
((BytesColumnVector) lcv.child).setRef(i, src, 0, 0);
315318
lcv.child.isNull[i] = true;
319+
lcv.child.noNulls = false;
316320
} else {
317321
((BytesColumnVector) lcv.child).setRef(i, src, 0, src.length);
318322
}
@@ -323,6 +327,7 @@ private void fillColumnVector(PrimitiveObjectInspector.PrimitiveCategory categor
323327
for (int i = 0; i < valueList.size(); i++) {
324328
if (valueList.get(i) == null) {
325329
lcv.child.isNull[i] = true;
330+
lcv.child.noNulls = false;
326331
} else {
327332
((DoubleColumnVector) lcv.child).vector[i] = ((List<Float>) valueList).get(i);
328333
}
@@ -337,6 +342,7 @@ private void fillColumnVector(PrimitiveObjectInspector.PrimitiveCategory categor
337342
for (int i = 0; i < valueList.size(); i++) {
338343
if (valueList.get(i) == null) {
339344
lcv.child.isNull[i] = true;
345+
lcv.child.noNulls = false;
340346
} else {
341347
((DecimalColumnVector) lcv.child).vector[i].set(((List<byte[]>) valueList).get(i), scale);
342348
}

ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedMapColumnReader.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,6 @@ public void readBatch(int total, ColumnVector column, TypeInfo columnType) throw
6565
mapColumnVector.childCount = keyListColumnVector.childCount;
6666
mapColumnVector.isRepeating = keyListColumnVector.isRepeating
6767
&& valueListColumnVector.isRepeating;
68+
mapColumnVector.noNulls = keyListColumnVector.noNulls && valueListColumnVector.noNulls;
6869
}
6970
}
(New file; its path is missing from this capture. The contents are a Hive SQL test script.)

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
-- SORT_QUERY_RESULTS
2+
SET hive.vectorized.execution.enabled=true;
3+
set hive.vectorized.execution.reduce.enabled=true;
4+
SET hive.fetch.task.conversion=none;
5+
6+
CREATE TABLE test_parquet_array_nulls_bool (
7+
id INT,
8+
arr_prim ARRAY<BOOLEAN>
9+
) STORED AS PARQUET;
10+
11+
INSERT INTO test_parquet_array_nulls_bool VALUES
12+
(1, array(CAST(NULL AS BOOLEAN), CAST(NULL AS BOOLEAN))),
13+
(2, if(1=0, array(true, false), null)),
14+
(3, array(true, CAST(NULL AS BOOLEAN))),
15+
(4, array(true, false));
16+
17+
SELECT * FROM test_parquet_array_nulls_bool;
18+
19+
CREATE TABLE test_parquet_array_nulls_double (
20+
id INT,
21+
arr_prim ARRAY<DOUBLE>
22+
) STORED AS PARQUET;
23+
24+
INSERT INTO test_parquet_array_nulls_double
25+
SELECT 1, array(CAST(NULL AS DOUBLE), CAST(NULL AS DOUBLE))
26+
UNION ALL
27+
SELECT 2, CAST(NULL AS ARRAY<DOUBLE>)
28+
UNION ALL
29+
SELECT 3, array(CAST(3.3 AS DOUBLE), CAST(NULL AS DOUBLE))
30+
UNION ALL
31+
SELECT 4, array(CAST(4.4 AS DOUBLE), CAST(5.5 AS DOUBLE));
32+
33+
SELECT * FROM test_parquet_array_nulls_double;
34+
35+
CREATE TABLE test_parquet_array_nulls_varchar (
36+
id INT,
37+
arr_prim ARRAY<VARCHAR(20)>
38+
) STORED AS PARQUET;
39+
40+
SELECT 1, array(NULL, NULL)
41+
UNION ALL
42+
SELECT 2, CAST(NULL AS ARRAY<STRING>)
43+
UNION ALL
44+
SELECT 3, array('val3', NULL)
45+
UNION ALL
46+
SELECT 4, array('val4', 'val5');
47+
48+
SELECT * FROM test_parquet_array_nulls_varchar;
49+
50+
CREATE TABLE test_parquet_array_nulls_float (
51+
id INT,
52+
arr_prim ARRAY<FLOAT>
53+
) STORED AS PARQUET;
54+
55+
INSERT INTO test_parquet_array_nulls_float
56+
SELECT 1, array(CAST(NULL AS FLOAT), CAST(NULL AS FLOAT))
57+
UNION ALL
58+
SELECT 2, CAST(NULL AS ARRAY<FLOAT>)
59+
UNION ALL
60+
SELECT 3, array(CAST(3.3 AS FLOAT), CAST(NULL AS FLOAT))
61+
UNION ALL
62+
SELECT 4, array(CAST(4.4 AS FLOAT), CAST(5.5 AS FLOAT));
63+
64+
SELECT * FROM test_parquet_array_nulls_float;
65+
66+
SET hive.vectorized.execution.enabled=false;
67+
SELECT * FROM test_parquet_array_nulls_bool;
68+
SELECT * FROM test_parquet_array_nulls_double;
69+
SELECT * FROM test_parquet_array_nulls_varchar;
70+
SELECT * FROM test_parquet_array_nulls_float;
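(New file; its path is missing from this capture. The contents are the recorded PREHOOK/POSTHOOK output for the array-NULL test queries.)

Lines changed: 234 additions & 0 deletions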
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
PREHOOK: query: CREATE TABLE test_parquet_array_nulls_bool (
2+
id INT,
3+
arr_prim ARRAY<BOOLEAN>
4+
) STORED AS PARQUET
5+
PREHOOK: type: CREATETABLE
6+
PREHOOK: Output: database:default
7+
PREHOOK: Output: default@test_parquet_array_nulls_bool
8+
POSTHOOK: query: CREATE TABLE test_parquet_array_nulls_bool (
9+
id INT,
10+
arr_prim ARRAY<BOOLEAN>
11+
) STORED AS PARQUET
12+
POSTHOOK: type: CREATETABLE
13+
POSTHOOK: Output: database:default
14+
POSTHOOK: Output: default@test_parquet_array_nulls_bool
15+
PREHOOK: query: INSERT INTO test_parquet_array_nulls_bool VALUES
16+
(1, array(CAST(NULL AS BOOLEAN), CAST(NULL AS BOOLEAN))),
17+
(2, if(1=0, array(true, false), null)),
18+
(3, array(true, CAST(NULL AS BOOLEAN))),
19+
(4, array(true, false))
20+
PREHOOK: type: QUERY
21+
PREHOOK: Input: _dummy_database@_dummy_table
22+
PREHOOK: Output: default@test_parquet_array_nulls_bool
23+
POSTHOOK: query: INSERT INTO test_parquet_array_nulls_bool VALUES
24+
(1, array(CAST(NULL AS BOOLEAN), CAST(NULL AS BOOLEAN))),
25+
(2, if(1=0, array(true, false), null)),
26+
(3, array(true, CAST(NULL AS BOOLEAN))),
27+
(4, array(true, false))
28+
POSTHOOK: type: QUERY
29+
POSTHOOK: Input: _dummy_database@_dummy_table
30+
POSTHOOK: Output: default@test_parquet_array_nulls_bool
31+
POSTHOOK: Lineage: test_parquet_array_nulls_bool.arr_prim SCRIPT []
32+
POSTHOOK: Lineage: test_parquet_array_nulls_bool.id SCRIPT []
33+
PREHOOK: query: SELECT * FROM test_parquet_array_nulls_bool
34+
PREHOOK: type: QUERY
35+
PREHOOK: Input: default@test_parquet_array_nulls_bool
36+
#### A masked pattern was here ####
37+
POSTHOOK: query: SELECT * FROM test_parquet_array_nulls_bool
38+
POSTHOOK: type: QUERY
39+
POSTHOOK: Input: default@test_parquet_array_nulls_bool
40+
#### A masked pattern was here ####
41+
1 [null,null]
42+
2 NULL
43+
3 [true,null]
44+
4 [true,false]
45+
PREHOOK: query: CREATE TABLE test_parquet_array_nulls_double (
46+
id INT,
47+
arr_prim ARRAY<DOUBLE>
48+
) STORED AS PARQUET
49+
PREHOOK: type: CREATETABLE
50+
PREHOOK: Output: database:default
51+
PREHOOK: Output: default@test_parquet_array_nulls_double
52+
POSTHOOK: query: CREATE TABLE test_parquet_array_nulls_double (
53+
id INT,
54+
arr_prim ARRAY<DOUBLE>
55+
) STORED AS PARQUET
56+
POSTHOOK: type: CREATETABLE
57+
POSTHOOK: Output: database:default
58+
POSTHOOK: Output: default@test_parquet_array_nulls_double
59+
PREHOOK: query: INSERT INTO test_parquet_array_nulls_double
60+
SELECT 1, array(CAST(NULL AS DOUBLE), CAST(NULL AS DOUBLE))
61+
UNION ALL
62+
SELECT 2, CAST(NULL AS ARRAY<DOUBLE>)
63+
UNION ALL
64+
SELECT 3, array(CAST(3.3 AS DOUBLE), CAST(NULL AS DOUBLE))
65+
UNION ALL
66+
SELECT 4, array(CAST(4.4 AS DOUBLE), CAST(5.5 AS DOUBLE))
67+
PREHOOK: type: QUERY
68+
PREHOOK: Input: _dummy_database@_dummy_table
69+
PREHOOK: Output: default@test_parquet_array_nulls_double
70+
POSTHOOK: query: INSERT INTO test_parquet_array_nulls_double
71+
SELECT 1, array(CAST(NULL AS DOUBLE), CAST(NULL AS DOUBLE))
72+
UNION ALL
73+
SELECT 2, CAST(NULL AS ARRAY<DOUBLE>)
74+
UNION ALL
75+
SELECT 3, array(CAST(3.3 AS DOUBLE), CAST(NULL AS DOUBLE))
76+
UNION ALL
77+
SELECT 4, array(CAST(4.4 AS DOUBLE), CAST(5.5 AS DOUBLE))
78+
POSTHOOK: type: QUERY
79+
POSTHOOK: Input: _dummy_database@_dummy_table
80+
POSTHOOK: Output: default@test_parquet_array_nulls_double
81+
POSTHOOK: Lineage: test_parquet_array_nulls_double.arr_prim SCRIPT []
82+
POSTHOOK: Lineage: test_parquet_array_nulls_double.id SCRIPT []
83+
PREHOOK: query: SELECT * FROM test_parquet_array_nulls_double
84+
PREHOOK: type: QUERY
85+
PREHOOK: Input: default@test_parquet_array_nulls_double
86+
#### A masked pattern was here ####
87+
POSTHOOK: query: SELECT * FROM test_parquet_array_nulls_double
88+
POSTHOOK: type: QUERY
89+
POSTHOOK: Input: default@test_parquet_array_nulls_double
90+
#### A masked pattern was here ####
91+
1 [null,null]
92+
2 NULL
93+
3 [3.3,null]
94+
4 [4.4,5.5]
95+
PREHOOK: query: CREATE TABLE test_parquet_array_nulls_varchar (
96+
id INT,
97+
arr_prim ARRAY<VARCHAR(20)>
98+
) STORED AS PARQUET
99+
PREHOOK: type: CREATETABLE
100+
PREHOOK: Output: database:default
101+
PREHOOK: Output: default@test_parquet_array_nulls_varchar
102+
POSTHOOK: query: CREATE TABLE test_parquet_array_nulls_varchar (
103+
id INT,
104+
arr_prim ARRAY<VARCHAR(20)>
105+
) STORED AS PARQUET
106+
POSTHOOK: type: CREATETABLE
107+
POSTHOOK: Output: database:default
108+
POSTHOOK: Output: default@test_parquet_array_nulls_varchar
109+
PREHOOK: query: SELECT 1, array(NULL, NULL)
110+
UNION ALL
111+
SELECT 2, CAST(NULL AS ARRAY<STRING>)
112+
UNION ALL
113+
SELECT 3, array('val3', NULL)
114+
UNION ALL
115+
SELECT 4, array('val4', 'val5')
116+
PREHOOK: type: QUERY
117+
PREHOOK: Input: _dummy_database@_dummy_table
118+
#### A masked pattern was here ####
119+
POSTHOOK: query: SELECT 1, array(NULL, NULL)
120+
UNION ALL
121+
SELECT 2, CAST(NULL AS ARRAY<STRING>)
122+
UNION ALL
123+
SELECT 3, array('val3', NULL)
124+
UNION ALL
125+
SELECT 4, array('val4', 'val5')
126+
POSTHOOK: type: QUERY
127+
POSTHOOK: Input: _dummy_database@_dummy_table
128+
#### A masked pattern was here ####
129+
1 [null,null]
130+
2 NULL
131+
3 ["val3",null]
132+
4 ["val4","val5"]
133+
PREHOOK: query: SELECT * FROM test_parquet_array_nulls_varchar
134+
PREHOOK: type: QUERY
135+
PREHOOK: Input: default@test_parquet_array_nulls_varchar
136+
#### A masked pattern was here ####
137+
POSTHOOK: query: SELECT * FROM test_parquet_array_nulls_varchar
138+
POSTHOOK: type: QUERY
139+
POSTHOOK: Input: default@test_parquet_array_nulls_varchar
140+
#### A masked pattern was here ####
141+
PREHOOK: query: CREATE TABLE test_parquet_array_nulls_float (
142+
id INT,
143+
arr_prim ARRAY<FLOAT>
144+
) STORED AS PARQUET
145+
PREHOOK: type: CREATETABLE
146+
PREHOOK: Output: database:default
147+
PREHOOK: Output: default@test_parquet_array_nulls_float
148+
POSTHOOK: query: CREATE TABLE test_parquet_array_nulls_float (
149+
id INT,
150+
arr_prim ARRAY<FLOAT>
151+
) STORED AS PARQUET
152+
POSTHOOK: type: CREATETABLE
153+
POSTHOOK: Output: database:default
154+
POSTHOOK: Output: default@test_parquet_array_nulls_float
155+
PREHOOK: query: INSERT INTO test_parquet_array_nulls_float
156+
SELECT 1, array(CAST(NULL AS FLOAT), CAST(NULL AS FLOAT))
157+
UNION ALL
158+
SELECT 2, CAST(NULL AS ARRAY<FLOAT>)
159+
UNION ALL
160+
SELECT 3, array(CAST(3.3 AS FLOAT), CAST(NULL AS FLOAT))
161+
UNION ALL
162+
SELECT 4, array(CAST(4.4 AS FLOAT), CAST(5.5 AS FLOAT))
163+
PREHOOK: type: QUERY
164+
PREHOOK: Input: _dummy_database@_dummy_table
165+
PREHOOK: Output: default@test_parquet_array_nulls_float
166+
POSTHOOK: query: INSERT INTO test_parquet_array_nulls_float
167+
SELECT 1, array(CAST(NULL AS FLOAT), CAST(NULL AS FLOAT))
168+
UNION ALL
169+
SELECT 2, CAST(NULL AS ARRAY<FLOAT>)
170+
UNION ALL
171+
SELECT 3, array(CAST(3.3 AS FLOAT), CAST(NULL AS FLOAT))
172+
UNION ALL
173+
SELECT 4, array(CAST(4.4 AS FLOAT), CAST(5.5 AS FLOAT))
174+
POSTHOOK: type: QUERY
175+
POSTHOOK: Input: _dummy_database@_dummy_table
176+
POSTHOOK: Output: default@test_parquet_array_nulls_float
177+
POSTHOOK: Lineage: test_parquet_array_nulls_float.arr_prim SCRIPT []
178+
POSTHOOK: Lineage: test_parquet_array_nulls_float.id SCRIPT []
179+
PREHOOK: query: SELECT * FROM test_parquet_array_nulls_float
180+
PREHOOK: type: QUERY
181+
PREHOOK: Input: default@test_parquet_array_nulls_float
182+
#### A masked pattern was here ####
183+
POSTHOOK: query: SELECT * FROM test_parquet_array_nulls_float
184+
POSTHOOK: type: QUERY
185+
POSTHOOK: Input: default@test_parquet_array_nulls_float
186+
#### A masked pattern was here ####
187+
1 [null,null]
188+
2 NULL
189+
3 [3.3,null]
190+
4 [4.4,5.5]
191+
PREHOOK: query: SELECT * FROM test_parquet_array_nulls_bool
192+
PREHOOK: type: QUERY
193+
PREHOOK: Input: default@test_parquet_array_nulls_bool
194+
#### A masked pattern was here ####
195+
POSTHOOK: query: SELECT * FROM test_parquet_array_nulls_bool
196+
POSTHOOK: type: QUERY
197+
POSTHOOK: Input: default@test_parquet_array_nulls_bool
198+
#### A masked pattern was here ####
199+
1 [null,null]
200+
2 NULL
201+
3 [true,null]
202+
4 [true,false]
203+
PREHOOK: query: SELECT * FROM test_parquet_array_nulls_double
204+
PREHOOK: type: QUERY
205+
PREHOOK: Input: default@test_parquet_array_nulls_double
206+
#### A masked pattern was here ####
207+
POSTHOOK: query: SELECT * FROM test_parquet_array_nulls_double
208+
POSTHOOK: type: QUERY
209+
POSTHOOK: Input: default@test_parquet_array_nulls_double
210+
#### A masked pattern was here ####
211+
1 [null,null]
212+
2 NULL
213+
3 [3.3,null]
214+
4 [4.4,5.5]
215+
PREHOOK: query: SELECT * FROM test_parquet_array_nulls_varchar
216+
PREHOOK: type: QUERY
217+
PREHOOK: Input: default@test_parquet_array_nulls_varchar
218+
#### A masked pattern was here ####
219+
POSTHOOK: query: SELECT * FROM test_parquet_array_nulls_varchar
220+
POSTHOOK: type: QUERY
221+
POSTHOOK: Input: default@test_parquet_array_nulls_varchar
222+
#### A masked pattern was here ####
223+
PREHOOK: query: SELECT * FROM test_parquet_array_nulls_float
224+
PREHOOK: type: QUERY
225+
PREHOOK: Input: default@test_parquet_array_nulls_float
226+
#### A masked pattern was here ####
227+
POSTHOOK: query: SELECT * FROM test_parquet_array_nulls_float
228+
POSTHOOK: type: QUERY
229+
POSTHOOK: Input: default@test_parquet_array_nulls_float
230+
#### A masked pattern was here ####
231+
1 [null,null]
232+
2 NULL
233+
3 [3.3,null]
234+
4 [4.4,5.5]

0 commit comments

Comments
 (0)