vdk-oracle: Pass ingestion payload rows in uniform batches (#3194)

DeltaMichael · Dilyan Marinov · pre-commit-ci[bot] · web-flow · commit 7ccea041e8b9 · 2024-03-22T17:45:31.000+02:00
## Why?

The current ingestion implementation batches payload rows by column
keyset. Payloads with the same keyset are batched together and passed to
an executemany() call. This is not ideal because it can result in a
large number of executemany() calls.

## What?

Make each payload row uniform by identifying missing columns and filling
them out with null values.
Pass data rows to executemany() in uniform batches.

## How was this tested?

Functional tests

## What kind of change is this?

Feature/non-breaking

---------

Signed-off-by: Dilyan Marinov &lt;mdilyan@vmware.com&gt;
Co-authored-by: Dilyan Marinov &lt;mdilyan@vmware.com&gt;
Co-authored-by: pre-commit-ci[bot] &lt;66853113+pre-commit-ci[bot]@users.noreply.github.com&gt;
diff --git a/projects/vdk-plugins/vdk-oracle/README.md b/projects/vdk-plugins/vdk-oracle/README.md
@@ -29,6 +29,7 @@ pip install vdk-oracle
 | oracle_sid               | The SID of the Oracle database. Note: This gets overridden if oracle_connection_string is set.                                                                                                                                        | free                |
 | oracle_service_name      | The Service name of the Oracle database. Note: This gets overridden if oracle_connection_string is set.                                                                                                                               | free                |
 | oracle_thick_mode        | Python-oracledb is said to be in Thick mode when Oracle Client libraries are used. True by default. Set to False to disable Oracle Thick mode. More info: https://python-oracledb.readthedocs.io/en/latest/user_guide/appendix_b.html | True                |
+| oracle_ingest_batch_size | vdk-oracle splits ingestion payloads into batches. Change this config to control the batch size. Default is set to 100.                                                                                                               | 100                 |
 
 ### Example
 
diff --git a/projects/vdk-plugins/vdk-oracle/src/vdk/plugin/oracle/ingest_to_oracle.py b/projects/vdk-plugins/vdk-oracle/src/vdk/plugin/oracle/ingest_to_oracle.py
@@ -6,11 +6,9 @@
 import re
 from decimal import Decimal
 from typing import Any
-from typing import Collection
 from typing import Dict
 from typing import List
 from typing import Optional
-from typing import Set
 
 from vdk.api.plugin.plugin_input import PEP249Connection
 from vdk.internal.builtin_plugins.connection.impl.router import ManagedConnectionRouter
@@ -89,10 +87,13 @@ def table_exists(self, table: str) -> bool:
 
 
 class IngestToOracle(IIngesterPlugin):
-    def __init__(self, connections: ManagedConnectionRouter):
+    def __init__(
+        self, connections: ManagedConnectionRouter, ingest_batch_size: int = 100
+    ):
         self.conn: PEP249Connection = connections.open_connection("ORACLE").connect()
         self.cursor: ManagedCursor = self.conn.cursor()
         self.table_cache: TableCache = TableCache(self.cursor)  # New cache for columns
+        self.ingest_batch_size = ingest_batch_size
 
     @staticmethod
     def _get_oracle_type(value: Any) -> str:
@@ -191,40 +192,48 @@ def cast_string_to_type(db_type: str, payload_value: str) -> Any:
 
         return value
 
-    # TODO: Look into potential optimizations
-    # TODO: https://github.com/vmware/versatile-data-kit/issues/2931
     def _insert_data(self, table_name: str, payload: List[Dict[str, Any]]) -> None:
         if not payload:
             return
 
-        # group dicts by key set
-        batches = {}
-        for p in payload:
-            batch = frozenset(p.keys())
-            if batch not in batches:
-                batches[batch] = []
-            batches[batch].append(p)
-
-        # create queries for groups of dicts with the same key set
-        queries = []
-        batch_data = []
-        for column_names, batch in batches.items():
-            columns = list(column_names)
-            query_columns = [_escape_special_chars(col) for col in columns]
-            insert_sql = f"INSERT INTO {table_name} ({', '.join(query_columns)}) VALUES ({', '.join([':' + str(i + 1) for i in range(len(query_columns))])})"
-            queries.append(insert_sql)
-            temp_data = []
-            for row in batch:
-                temp = [
-                    self._cast_to_correct_type(table_name, col, row[col])
-                    for col in columns
-                ]
-                temp_data.append(temp)
-            batch_data.append(temp_data)
-
-        # batch execute queries for dicts with the same key set
-        for i in range(len(queries)):
-            self.cursor.executemany(queries[i], batch_data[i])
+        def split(lst, n):
+            """Yield successive n-sized chunks from lst."""
+            for i in range(0, len(lst), n):
+                yield lst[i : i + n]
+
+        query, params = self._populate_query_parameters_tuple(table_name, payload)
+        batches = list(split(params, self.ingest_batch_size))
+        for batch in batches:
+            self.cursor.executemany(query, batch)
+
+    def _populate_query_parameters_tuple(
+        self, destination_table: str, payload: List[dict]
+    ) -> (str, list):
+        """
+        Prepare the SQL query and parameters for bulk insertion.
+
+        Returns insert into destination table tuple of query and parameters;
+        E.g. for a table dest_table with columns val1, val2 and payload size 2, this method will return:
+        'INSERT INTO dest_table (val1, val2) VALUES (:0, :1)',
+        [('val1', 'val2'), ('val1', 'val2')]
+        """
+        columns = self.table_cache.get_columns(destination_table)
+        query_columns = [_escape_special_chars(col) for col in columns]
+
+        placeholders = ", ".join(f":{i}" for i in range(len(columns)))
+        query = f"INSERT INTO {destination_table} ({', '.join(query_columns)}) VALUES ({placeholders})"
+
+        parameters = []
+        for obj in payload:
+            row = tuple(
+                self._cast_to_correct_type(
+                    destination_table, column.lower(), obj.get(column.lower())
+                )
+                for column in columns
+            )
+            parameters.append(row)
+
+        return query, parameters
 
     def ingest_payload(
         self,
diff --git a/projects/vdk-plugins/vdk-oracle/src/vdk/plugin/oracle/oracle_configuration.py b/projects/vdk-plugins/vdk-oracle/src/vdk/plugin/oracle/oracle_configuration.py
@@ -19,6 +19,7 @@
 ORACLE_PORT = "ORACLE_PORT"
 ORACLE_SID = "ORACLE_SID"
 ORACLE_SERVICE_NAME = "ORACLE_SERVICE_NAME"
+ORACLE_INGEST_BATCH_SIZE = "ORACLE_INGEST_BATCH_SIZE"
 
 
 class OracleConfiguration:
@@ -55,6 +56,9 @@ def oracle_thick_mode(self) -> bool:
     def oracle_thick_mode_lib_dir(self) -> Optional[str]:
         return self.__config.get_value(ORACLE_THICK_MODE_LIB_DIR)
 
+    def oracle_ingest_batch_size(self) -> Optional[int]:
+        return int(self.__config.get_value(ORACLE_INGEST_BATCH_SIZE))
+
     @staticmethod
     def add_definitions(config_builder: ConfigurationBuilder):
         config_builder.add(
@@ -122,3 +126,8 @@ def add_definitions(config_builder: ConfigurationBuilder):
             "Before setting this follow instruction in "
             "https://python-oracledb.readthedocs.io/en/latest/user_guide/initialization.html#enablingthick ",
         )
+        config_builder.add(
+            key=ORACLE_INGEST_BATCH_SIZE,
+            default_value=100,
+            description="Batch size when ingesting records into Oracle.",
+        )
diff --git a/projects/vdk-plugins/vdk-oracle/src/vdk/plugin/oracle/oracle_plugin.py b/projects/vdk-plugins/vdk-oracle/src/vdk/plugin/oracle/oracle_plugin.py
@@ -50,7 +50,10 @@ def initialize_job(self, context: JobContext):
             ),
         )
         context.ingester.add_ingester_factory_method(
-            "oracle", (lambda: IngestToOracle(context.connections))
+            "oracle",
+            lambda: IngestToOracle(
+                context.connections, conf.oracle_ingest_batch_size()
+            ),
         )
 
 
diff --git a/projects/vdk-plugins/vdk-oracle/tests/jobs/oracle-ingest-data-frame-schema-inference/10_ingest.py b/projects/vdk-plugins/vdk-oracle/tests/jobs/oracle-ingest-data-frame-schema-inference/10_ingest.py
@@ -5,6 +5,6 @@
 
 
 def run(job_input: IJobInput):
-    df = DataFrame.from_dict({"A": [1], "B": [2], "C": [3]})
+    df = DataFrame.from_dict({"a": [1], "b": [2], "c": [3]})
 
     job_input.send_object_for_ingestion(payload=df, destination_table="test_table")
diff --git a/projects/vdk-plugins/vdk-oracle/tests/test_plugin.py b/projects/vdk-plugins/vdk-oracle/tests/test_plugin.py
@@ -178,85 +178,72 @@ def test_oracle_ingest_data_frame_schema_inference(self):
 
 def _verify_query_execution(runner):
     check_result = runner.invoke(["sql-query", "--query", "SELECT * FROM todoitem"])
-    expected = (
-        "  ID  DESCRIPTION      DONE\n"
-        "----  -------------  ------\n"
-        "   1  Task 1              1\n"
-    )
-    assert expected in check_result.output
+    expected = [
+        "  ID  DESCRIPTION      DONE\n",
+        "----  -------------  ------\n",
+        "   1  Task 1              1\n",
+    ]
+    for row in expected:
+        assert row in check_result.output
 
 
 def _verify_ingest_execution(runner):
     check_result = runner.invoke(["sql-query", "--query", "SELECT * FROM test_table"])
-    expected = (
-        "  ID  STR_DATA      INT_DATA    FLOAT_DATA    BOOL_DATA  "
-        "TIMESTAMP_DATA         DECIMAL_DATA\n"
-        "----  ----------  ----------  ------------  -----------  "
-        "-------------------  --------------\n"
-        "   5  string              12           1.2            1  2023-11-21 "
-        "08:12:53             0.1\n"
-    )
-    assert expected in check_result.output
+    expected = [
+        "  ID  STR_DATA      INT_DATA    FLOAT_DATA    BOOL_DATA  TIMESTAMP_DATA         DECIMAL_DATA\n",
+        "----  ----------  ----------  ------------  -----------  -------------------  --------------\n",
+        "   5  string              12           1.2            1  2023-11-21 08:12:53             0.1\n",
+    ]
+    for row in expected:
+        assert row in check_result.output
 
 
 def _verify_ingest_execution_special_chars(runner):
     check_result = runner.invoke(["sql-query", "--query", "SELECT * FROM test_table"])
-    expected = (
-        "  ID  @str_data      %int_data    *float*data*    BOOL_DATA  "
-        "TIMESTAMP_DATA         DECIMAL_DATA\n"
-        "----  -----------  -----------  --------------  -----------  "
-        "-------------------  --------------\n"
-        "   5  string                12             1.2            1  2023-11-21 "
-        "08:12:53             0.1\n"
-    )
-    assert expected in check_result.output
+    expected = [
+        "  ID  @str_data      %int_data    *float*data*    BOOL_DATA  TIMESTAMP_DATA         DECIMAL_DATA\n",
+        "----  -----------  -----------  --------------  -----------  -------------------  --------------\n",
+        "   5  string                12             1.2            1  2023-11-21 08:12:53             0.1\n",
+    ]
+    for row in expected:
+        assert row in check_result.output
 
 
 def _verify_ingest_execution_type_inference(runner):
     check_result = runner.invoke(["sql-query", "--query", "SELECT * FROM test_table"])
-    expected = (
-        "  ID  STR_DATA      INT_DATA  NAN_INT_DATA      FLOAT_DATA    BOOL_DATA  "
-        "TIMESTAMP_DATA         DECIMAL_DATA\n"
-        "----  ----------  ----------  --------------  ------------  -----------  "
-        "-------------------  --------------\n"
-        "   5  string              12                           1.2            1  "
-        "2023-11-21 08:12:53             0.1\n"
-    )
-    assert expected in check_result.output
+    expected = [
+        "  ID  STR_DATA      INT_DATA  NAN_INT_DATA      FLOAT_DATA    BOOL_DATA  TIMESTAMP_DATA         DECIMAL_DATA\n",
+        "----  ----------  ----------  --------------  ------------  -----------  -------------------  --------------\n",
+        "   5  string              12                           1.2            1  2023-11-21 08:12:53             0.1\n",
+    ]
+    for row in expected:
+        assert row in check_result.output
 
 
 def _verify_ingest_execution_no_table(runner):
     check_result = runner.invoke(["sql-query", "--query", "SELECT * FROM test_table"])
-    expected = (
-        "  ID  STR_DATA      INT_DATA    FLOAT_DATA    BOOL_DATA  "
-        "TIMESTAMP_DATA         DECIMAL_DATA\n"
-        "----  ----------  ----------  ------------  -----------  "
-        "-------------------  --------------\n"
-        "   0  string              12           1.2            1  "
-        "2023-11-21T08:12:53             1.1\n"
-        "   1  string              12           1.2            1  "
-        "2023-11-21T08:12:53             1.1\n"
-        "   2  string              12           1.2            1  "
-        "2023-11-21T08:12:53             1.1\n"
-    )
-    assert expected in check_result.output
+    expected = [
+        "  ID  STR_DATA      INT_DATA    FLOAT_DATA    BOOL_DATA  TIMESTAMP_DATA         DECIMAL_DATA\n",
+        "----  ----------  ----------  ------------  -----------  -------------------  --------------\n",
+        "   0  string              12           1.2            1  2023-11-21T08:12:53             1.1\n",
+        "   1  string              12           1.2            1  2023-11-21T08:12:53             1.1\n",
+        "   2  string              12           1.2            1  2023-11-21T08:12:53             1.1\n",
+    ]
+    for row in expected:
+        assert row in check_result.output
 
 
 def _verify_ingest_execution_no_table_special_chars(runner):
     check_result = runner.invoke(["sql-query", "--query", "SELECT * FROM test_table"])
-    expected = (
-        "  ID  @str_data      %int_data    *float*data*    BOOL_DATA  "
-        "TIMESTAMP_DATA         DECIMAL_DATA\n"
-        "----  -----------  -----------  --------------  -----------  "
-        "-------------------  --------------\n"
-        "   0  string                12             1.2            1  "
-        "2023-11-21T08:12:53             1.1\n"
-        "   1  string                12             1.2            1  "
-        "2023-11-21T08:12:53             1.1\n"
-        "   2  string                12             1.2            1  "
-        "2023-11-21T08:12:53             1.1\n"
-    )
-    assert expected in check_result.output
+    expected = [
+        "  ID  @str_data      %int_data    *float*data*    BOOL_DATA  TIMESTAMP_DATA         DECIMAL_DATA\n",
+        "----  -----------  -----------  --------------  -----------  -------------------  --------------\n",
+        "   0  string                12             1.2            1  2023-11-21T08:12:53             1.1\n",
+        "   1  string                12             1.2            1  2023-11-21T08:12:53             1.1\n",
+        "   2  string                12             1.2            1  2023-11-21T08:12:53             1.1\n",
+    ]
+    for row in expected:
+        assert row in check_result.output
 
 
 def _verify_ingest_execution_different_payloads_no_table(runner):
@@ -301,21 +288,20 @@ def _verify_ingest_execution_different_payloads_no_table_special_chars(runner):
 
 def _verify_ingest_execution_different_payloads(runner):
     check_result = runner.invoke(["sql-query", "--query", "SELECT * FROM test_table"])
-    expected = (
-        "  ID  STR_DATA      INT_DATA    FLOAT_DATA    BOOL_DATA  TIMESTAMP_DATA\n"
-        "----  ----------  ----------  ------------  -----------  "
-        "-------------------\n"
-        "   0\n"
-        "   1  string\n"
-        "   2  string              12\n"
-        "   3  string              12           1.2\n"
-        "   6  string              12           1.2\n"
-        "   4  string              12           1.2            1\n"
-        "   7  string              12           1.2            1\n"
-        "   5  string              12           1.2            1  2023-11-21 "
-        "08:12:53\n"
-    )
-    assert expected in check_result.output
+    expected = [
+        "  ID  STR_DATA      INT_DATA    FLOAT_DATA    BOOL_DATA  TIMESTAMP_DATA\n",
+        "----  ----------  ----------  ------------  -----------  -------------------\n",
+        "   0\n",
+        "   1  string\n",
+        "   2  string              12\n",
+        "   3  string              12           1.2\n",
+        "   4  string              12           1.2            1\n",
+        "   5  string              12           1.2            1  2023-11-21 08:12:53\n",
+        "   6  string              12           1.2\n",
+        "   7  string              12           1.2            1\n",
+    ]
+    for row in expected:
+        assert row in check_result.output
 
 
 def _verify_ingest_blob(runner):
@@ -352,5 +338,5 @@ def _verify_ingest_nan_and_none_execution(runner):
 
 def _verify_ingest_data_frame_schema_inference(runner):
     check_result = runner.invoke(["sql-query", "--query", "SELECT * FROM test_table"])
-    expected = "  A    B    C\n---  ---  ---\n  1    2    3\n"
+    expected = "A    B    C\n---  ---  ---\n  1    2    3\n"
     assert expected in check_result.output

Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,10 @@ def initialize_job(self, context: JobContext):`
`50`	`50`	`),`
`51`	`51`	`)`
`52`	`52`	`context.ingester.add_ingester_factory_method(`
`53`		`- "oracle", (lambda: IngestToOracle(context.connections))`
	`53`	`+ "oracle",`
	`54`	`+ lambda: IngestToOracle(`
	`55`	`+ context.connections, conf.oracle_ingest_batch_size()`
	`56`	`+ ),`
`54`	`57`	`)`
`55`	`58`
`56`	`59`