From 6b690aaf98052e9867b7876601b02e1a4ab3221a Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Thu, 5 Feb 2026 11:06:07 +0100 Subject: [PATCH] Fix ArrowStream --- src/duckdb_py/arrow/arrow_array_stream.cpp | 17 +++++++---------- .../duckdb_python/arrow/arrow_array_stream.hpp | 8 +++----- src/duckdb_py/python_replacement_scan.cpp | 13 ++++++------- src/duckdb_py/python_udf.cpp | 3 +-- 4 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/duckdb_py/arrow/arrow_array_stream.cpp b/src/duckdb_py/arrow/arrow_array_stream.cpp index f4ac2b58..6eef37a3 100644 --- a/src/duckdb_py/arrow/arrow_array_stream.cpp +++ b/src/duckdb_py/arrow/arrow_array_stream.cpp @@ -27,15 +27,15 @@ void VerifyArrowDatasetLoaded() { } } -py::object PythonTableArrowArrayStreamFactory::ProduceScanner(DBConfig &config, py::object &arrow_scanner, - py::handle &arrow_obj_handle, +py::object PythonTableArrowArrayStreamFactory::ProduceScanner(py::object &arrow_scanner, py::handle &arrow_obj_handle, ArrowStreamParameters ¶meters, const ClientProperties &client_properties) { D_ASSERT(!py::isinstance(arrow_obj_handle)); ArrowSchemaWrapper schema; PythonTableArrowArrayStreamFactory::GetSchemaInternal(arrow_obj_handle, schema); ArrowTableSchema arrow_table; - ArrowTableFunction::PopulateArrowTableSchema(config, arrow_table, schema.arrow_schema); + ArrowTableFunction::PopulateArrowTableSchema(*client_properties.client_context.get_mutable(), arrow_table, + schema.arrow_schema); auto filters = parameters.filters; auto &column_list = parameters.projected_columns.columns; @@ -86,26 +86,23 @@ unique_ptr PythonTableArrowArrayStreamFactory::Produce( auto arrow_dataset = import_cache.pyarrow.dataset().attr("dataset"); auto dataset = arrow_dataset(arrow_obj_handle); py::object arrow_scanner = dataset.attr("__class__").attr("scanner"); - scanner = ProduceScanner(factory->config, arrow_scanner, dataset, parameters, factory->client_properties); + scanner = ProduceScanner(arrow_scanner, dataset, parameters, factory->client_properties); break; } case PyArrowObjectType::RecordBatchReader: { - scanner = ProduceScanner(factory->config, arrow_batch_scanner, arrow_obj_handle, parameters, - factory->client_properties); + scanner = ProduceScanner(arrow_batch_scanner, arrow_obj_handle, parameters, factory->client_properties); break; } case PyArrowObjectType::Scanner: { // If it's a scanner we have to turn it to a record batch reader, and then a scanner again since we can't stack // scanners on arrow Otherwise pushed-down projections and filters will disappear like tears in the rain auto record_batches = arrow_obj_handle.attr("to_reader")(); - scanner = ProduceScanner(factory->config, arrow_batch_scanner, record_batches, parameters, - factory->client_properties); + scanner = ProduceScanner(arrow_batch_scanner, record_batches, parameters, factory->client_properties); break; } case PyArrowObjectType::Dataset: { py::object arrow_scanner = arrow_obj_handle.attr("__class__").attr("scanner"); - scanner = - ProduceScanner(factory->config, arrow_scanner, arrow_obj_handle, parameters, factory->client_properties); + scanner = ProduceScanner(arrow_scanner, arrow_obj_handle, parameters, factory->client_properties); break; } default: { diff --git a/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp b/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp index a5895b4a..3534f918 100644 --- a/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp +++ b/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp @@ -68,9 +68,8 @@ PyArrowObjectType GetArrowType(const py::handle &obj); class PythonTableArrowArrayStreamFactory { public: - explicit PythonTableArrowArrayStreamFactory(PyObject *arrow_table, const ClientProperties &client_properties_p, - DBConfig &config) - : arrow_object(arrow_table), client_properties(client_properties_p), config(config) {}; + explicit PythonTableArrowArrayStreamFactory(PyObject *arrow_table, const ClientProperties &client_properties_p) + : arrow_object(arrow_table), client_properties(client_properties_p) {}; //! Produces an Arrow Scanner, should be only called once when initializing Scan States static unique_ptr Produce(uintptr_t factory, ArrowStreamParameters ¶meters); @@ -83,10 +82,9 @@ class PythonTableArrowArrayStreamFactory { PyObject *arrow_object; const ClientProperties client_properties; - DBConfig &config; private: - static py::object ProduceScanner(DBConfig &config, py::object &arrow_scanner, py::handle &arrow_obj_handle, + static py::object ProduceScanner(py::object &arrow_scanner, py::handle &arrow_obj_handle, ArrowStreamParameters ¶meters, const ClientProperties &client_properties); }; } // namespace duckdb diff --git a/src/duckdb_py/python_replacement_scan.cpp b/src/duckdb_py/python_replacement_scan.cpp index ff5c8b5e..18338650 100644 --- a/src/duckdb_py/python_replacement_scan.cpp +++ b/src/duckdb_py/python_replacement_scan.cpp @@ -18,7 +18,7 @@ namespace duckdb { static void CreateArrowScan(const string &name, py::object entry, TableFunctionRef &table_function, vector> &children, ClientProperties &client_properties, - PyArrowObjectType type, DBConfig &config, DatabaseInstance &db) { + PyArrowObjectType type, DatabaseInstance &db) { shared_ptr external_dependency = make_shared_ptr(); if (type == PyArrowObjectType::MessageReader) { if (!db.ExtensionIsLoaded("nanoarrow")) { @@ -56,7 +56,7 @@ static void CreateArrowScan(const string &name, py::object entry, TableFunctionR type = PyArrowObjectType::PyCapsule; } - auto stream_factory = make_uniq(entry.ptr(), client_properties, config); + auto stream_factory = make_uniq(entry.ptr(), client_properties); auto stream_factory_produce = PythonTableArrowArrayStreamFactory::Produce; auto stream_factory_get_schema = PythonTableArrowArrayStreamFactory::GetSchema; @@ -113,7 +113,7 @@ unique_ptr PythonReplacementScan::TryReplacementObject(const py::objec if (PandasDataFrame::IsPyArrowBacked(entry)) { auto table = PandasDataFrame::ToArrowTable(entry); CreateArrowScan(name, table, *table_function, children, client_properties, PyArrowObjectType::Table, - DBConfig::GetConfig(context), *context.db); + *context.db); } else { string name = "df_" + StringUtil::GenerateRandomName(); auto new_df = PandasScanFunction::PandasReplaceCopiedNames(entry); @@ -143,17 +143,16 @@ unique_ptr PythonReplacementScan::TryReplacementObject(const py::objec } else if (PolarsDataFrame::IsDataFrame(entry)) { auto arrow_dataset = entry.attr("to_arrow")(); CreateArrowScan(name, arrow_dataset, *table_function, children, client_properties, PyArrowObjectType::Table, - DBConfig::GetConfig(context), *context.db); + *context.db); } else if (PolarsDataFrame::IsLazyFrame(entry)) { auto materialized = entry.attr("collect")(); auto arrow_dataset = materialized.attr("to_arrow")(); CreateArrowScan(name, arrow_dataset, *table_function, children, client_properties, PyArrowObjectType::Table, - DBConfig::GetConfig(context), *context.db); + *context.db); } else if (DuckDBPyConnection::GetArrowType(entry) != PyArrowObjectType::Invalid && !(DuckDBPyConnection::GetArrowType(entry) == PyArrowObjectType::MessageReader && !relation)) { arrow_type = DuckDBPyConnection::GetArrowType(entry); - CreateArrowScan(name, entry, *table_function, children, client_properties, arrow_type, - DBConfig::GetConfig(context), *context.db); + CreateArrowScan(name, entry, *table_function, children, client_properties, arrow_type, *context.db); } else if (DuckDBPyConnection::IsAcceptedNumpyObject(entry) != NumpyObjectType::INVALID) { numpytype = DuckDBPyConnection::IsAcceptedNumpyObject(entry); string np_name = "np_" + StringUtil::GenerateRandomName(); diff --git a/src/duckdb_py/python_udf.cpp b/src/duckdb_py/python_udf.cpp index ac2ae74e..97eea3b0 100644 --- a/src/duckdb_py/python_udf.cpp +++ b/src/duckdb_py/python_udf.cpp @@ -74,8 +74,7 @@ static void ConvertArrowTableToVector(const py::object &table, Vector &out, Clie D_ASSERT(py::gil_check()); py::gil_scoped_release gil; - auto stream_factory = - make_uniq(ptr, context.GetClientProperties(), DBConfig::GetConfig(context)); + auto stream_factory = make_uniq(ptr, context.GetClientProperties()); auto stream_factory_produce = PythonTableArrowArrayStreamFactory::Produce; auto stream_factory_get_schema = PythonTableArrowArrayStreamFactory::GetSchema;