PyArrow Functionality#

pandas can utilize PyArrow to extend functionality and improve the performance of various APIs. This includes:

  • More extensive data types compared to NumPy

  • Missing data support (NA) for all data types

  • Performant IO reader integration

  • Facilitate interoperability with other dataframe libraries based on the Apache Arrow specification (e.g. polars, cuDF)

To use this functionality, please ensure you have installed the minimum supported PyArrow version.

Data Structure Integration#

A Series, Index, or the columns of a DataFrame can be directly backed by a pyarrow.ChunkedArray, which is similar to a NumPy array. To construct these from the main pandas data structures, you can pass in a string of the type followed by [pyarrow], e.g. "int64[pyarrow]", into the dtype parameter.

In [1]: ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:496, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    493     index = ensure_index(index)
    495 if dtype is not None:
--> 496     dtype = self._validate_dtype(dtype)
    498 if data is None:
    499     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:519, in NDFrame._validate_dtype(cls, dtype)
    517 """validate the passed dtype"""
    518 if dtype is not None:
--> 519     dtype = pandas_dtype(dtype)
    521     # a compound dtype
    522     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
   1637     return StringDtype(na_value=np.nan)
   1639 # registered extension types
-> 1640 result = registry.find(dtype)
   1641 if result is not None:
   1642     if isinstance(result, type):
   1643         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
    574 for dtype_type in self.dtypes:
    575     try:
--> 576         return dtype_type.construct_from_string(dtype)
    577     except TypeError:
    578         pass

File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2251, in ArrowDtype.construct_from_string(cls, string)
   2249 base_type = string[:-9]  # get rid of "[pyarrow]"
   2250 try:
-> 2251     pa_dtype = pa.type_for_alias(base_type)
   2252 except ValueError as err:
   2253     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [2]: ser
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 ser

NameError: name 'ser' is not defined

In [3]: idx = pd.Index([True, None], dtype="bool[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 idx = pd.Index([True, None], dtype="bool[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/indexes/base.py:488, in Index.__new__(cls, data, dtype, copy, name, tupleize_cols)
    485 name = maybe_extract_name(name, data, cls)
    487 if dtype is not None:
--> 488     dtype = pandas_dtype(dtype)
    490 data_dtype = getattr(data, "dtype", None)
    492 refs = None

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
   1637     return StringDtype(na_value=np.nan)
   1639 # registered extension types
-> 1640 result = registry.find(dtype)
   1641 if result is not None:
   1642     if isinstance(result, type):
   1643         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
    574 for dtype_type in self.dtypes:
    575     try:
--> 576         return dtype_type.construct_from_string(dtype)
    577     except TypeError:
    578         pass

File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2251, in ArrowDtype.construct_from_string(cls, string)
   2249 base_type = string[:-9]  # get rid of "[pyarrow]"
   2250 try:
-> 2251     pa_dtype = pa.type_for_alias(base_type)
   2252 except ValueError as err:
   2253     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [4]: idx
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 idx

NameError: name 'idx' is not defined

In [5]: df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/frame.py:708, in DataFrame.__init__(self, data, index, columns, dtype, copy)
    706 allow_mgr = False
    707 if dtype is not None:
--> 708     dtype = self._validate_dtype(dtype)
    710 if isinstance(data, DataFrame):
    711     data = data._mgr

File /usr/lib/python3/dist-packages/pandas/core/generic.py:519, in NDFrame._validate_dtype(cls, dtype)
    517 """validate the passed dtype"""
    518 if dtype is not None:
--> 519     dtype = pandas_dtype(dtype)
    521     # a compound dtype
    522     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
   1637     return StringDtype(na_value=np.nan)
   1639 # registered extension types
-> 1640 result = registry.find(dtype)
   1641 if result is not None:
   1642     if isinstance(result, type):
   1643         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
    574 for dtype_type in self.dtypes:
    575     try:
--> 576         return dtype_type.construct_from_string(dtype)
    577     except TypeError:
    578         pass

File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2251, in ArrowDtype.construct_from_string(cls, string)
   2249 base_type = string[:-9]  # get rid of "[pyarrow]"
   2250 try:
-> 2251     pa_dtype = pa.type_for_alias(base_type)
   2252 except ValueError as err:
   2253     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [6]: df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 df

NameError: name 'df' is not defined

Note

The string alias "string[pyarrow]" maps to pd.StringDtype("pyarrow") which is not equivalent to specifying dtype=pd.ArrowDtype(pa.string()). Generally, operations on the data will behave similarly except pd.StringDtype("pyarrow") can return NumPy-backed nullable types while pd.ArrowDtype(pa.string()) will return ArrowDtype.

In [7]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[7], line 1
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [8]: data = list("abc")

In [9]: ser_sd = pd.Series(data, dtype="string[pyarrow]")
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[9], line 1
----> 1 ser_sd = pd.Series(data, dtype="string[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:496, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    493     index = ensure_index(index)
    495 if dtype is not None:
--> 496     dtype = self._validate_dtype(dtype)
    498 if data is None:
    499     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:519, in NDFrame._validate_dtype(cls, dtype)
    517 """validate the passed dtype"""
    518 if dtype is not None:
--> 519     dtype = pandas_dtype(dtype)
    521     # a compound dtype
    522     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
   1637     return StringDtype(na_value=np.nan)
   1639 # registered extension types
-> 1640 result = registry.find(dtype)
   1641 if result is not None:
   1642     if isinstance(result, type):
   1643         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
    574 for dtype_type in self.dtypes:
    575     try:
--> 576         return dtype_type.construct_from_string(dtype)
    577     except TypeError:
    578         pass

File /usr/lib/python3/dist-packages/pandas/core/arrays/string_.py:275, in StringDtype.construct_from_string(cls, string)
    273     return cls(storage="python")
    274 elif string == "string[pyarrow]":
--> 275     return cls(storage="pyarrow")
    276 elif string == "string[pyarrow_numpy]":
    277     # this is deprecated in the dtype __init__, remove this in pandas 3.0
    278     return cls(storage="pyarrow_numpy")

File /usr/lib/python3/dist-packages/pandas/core/arrays/string_.py:182, in StringDtype.__init__(self, storage, na_value)
    178     raise ValueError(
    179         f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
    180     )
    181 if storage == "pyarrow" and pa_version_under10p1:
--> 182     raise ImportError(
    183         "pyarrow>=10.0.1 is required for PyArrow backed StringArray."
    184     )
    186 if isinstance(na_value, float) and np.isnan(na_value):
    187     # when passed a NaN value, always set to np.nan to ensure we use
    188     # a consistent NaN value (and we can use `dtype.na_value is np.nan`)
    189     na_value = np.nan

ImportError: pyarrow>=10.0.1 is required for PyArrow backed StringArray.

In [10]: ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))

NameError: name 'pa' is not defined

In [11]: ser_ad.dtype == ser_sd.dtype
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 ser_ad.dtype == ser_sd.dtype

NameError: name 'ser_ad' is not defined

In [12]: ser_sd.str.contains("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 1
----> 1 ser_sd.str.contains("a")

NameError: name 'ser_sd' is not defined

In [13]: ser_ad.str.contains("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 1
----> 1 ser_ad.str.contains("a")

NameError: name 'ser_ad' is not defined

For PyArrow types that accept parameters, you can pass in a PyArrow type with those parameters into ArrowDtype to use in the dtype parameter.

In [14]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[14], line 1
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [15]: list_str_type = pa.list_(pa.string())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 1
----> 1 list_str_type = pa.list_(pa.string())

NameError: name 'pa' is not defined

In [16]: ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 1
----> 1 ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))

NameError: name 'list_str_type' is not defined

In [17]: ser
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[17], line 1
----> 1 ser

NameError: name 'ser' is not defined
In [18]: from datetime import time

In [19]: idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[19], line 1
----> 1 idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))

NameError: name 'pa' is not defined

In [20]: idx
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[20], line 1
----> 1 idx

NameError: name 'idx' is not defined
In [21]: from decimal import Decimal

In [22]: decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 1
----> 1 decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))

NameError: name 'pa' is not defined

In [23]: data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]]

In [24]: df = pd.DataFrame(data, dtype=decimal_type)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 1
----> 1 df = pd.DataFrame(data, dtype=decimal_type)

NameError: name 'decimal_type' is not defined

In [25]: df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 1
----> 1 df

NameError: name 'df' is not defined

If you already have a pyarrow.Array or pyarrow.ChunkedArray, you can pass it into arrays.ArrowExtensionArray to construct the associated Series, Index or DataFrame object.

In [26]: pa_array = pa.array(
   ....:     [{"1": "2"}, {"10": "20"}, None],
   ....:     type=pa.map_(pa.string(), pa.string()),
   ....: )
   ....: 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[26], line 1
----> 1 pa_array = pa.array(
      2     [{"1": "2"}, {"10": "20"}, None],
      3     type=pa.map_(pa.string(), pa.string()),
      4 )

NameError: name 'pa' is not defined

In [27]: ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 1
----> 1 ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))

NameError: name 'pa_array' is not defined

In [28]: ser
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[28], line 1
----> 1 ser

NameError: name 'ser' is not defined

To retrieve a pyarrow.ChunkedArray from a Series or Index, you can call the pyarrow array constructor (pa.array) on the Series or Index.

In [29]: ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:496, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    493     index = ensure_index(index)
    495 if dtype is not None:
--> 496     dtype = self._validate_dtype(dtype)
    498 if data is None:
    499     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:519, in NDFrame._validate_dtype(cls, dtype)
    517 """validate the passed dtype"""
    518 if dtype is not None:
--> 519     dtype = pandas_dtype(dtype)
    521     # a compound dtype
    522     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
   1637     return StringDtype(na_value=np.nan)
   1639 # registered extension types
-> 1640 result = registry.find(dtype)
   1641 if result is not None:
   1642     if isinstance(result, type):
   1643         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
    574 for dtype_type in self.dtypes:
    575     try:
--> 576         return dtype_type.construct_from_string(dtype)
    577     except TypeError:
    578         pass

File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2251, in ArrowDtype.construct_from_string(cls, string)
   2249 base_type = string[:-9]  # get rid of "[pyarrow]"
   2250 try:
-> 2251     pa_dtype = pa.type_for_alias(base_type)
   2252 except ValueError as err:
   2253     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [30]: pa.array(ser)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[30], line 1
----> 1 pa.array(ser)

NameError: name 'pa' is not defined

In [31]: idx = pd.Index(ser)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[31], line 1
----> 1 idx = pd.Index(ser)

NameError: name 'ser' is not defined

In [32]: pa.array(idx)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[32], line 1
----> 1 pa.array(idx)

NameError: name 'pa' is not defined

To convert a pyarrow.Table to a DataFrame, you can call the pyarrow.Table.to_pandas() method with types_mapper=pd.ArrowDtype.

In [33]: table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[33], line 1
----> 1 table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])

NameError: name 'pa' is not defined

In [34]: df = table.to_pandas(types_mapper=pd.ArrowDtype)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[34], line 1
----> 1 df = table.to_pandas(types_mapper=pd.ArrowDtype)

NameError: name 'table' is not defined

In [35]: df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[35], line 1
----> 1 df

NameError: name 'df' is not defined

In [36]: df.dtypes
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[36], line 1
----> 1 df.dtypes

NameError: name 'df' is not defined

Operations#

PyArrow data structure integration is implemented through pandas’ ExtensionArray interface; therefore, supported functionality exists where this interface is integrated within the pandas API. Additionally, this functionality is accelerated with PyArrow compute functions where available. This includes:

  • Numeric aggregations

  • Numeric arithmetic

  • Numeric rounding

  • Logical and comparison functions

  • String functionality

  • Datetime functionality

The following are just some examples of operations that are accelerated by native PyArrow compute functions.

In [37]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[37], line 1
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [38]: ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[38], line 1
----> 1 ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:496, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    493     index = ensure_index(index)
    495 if dtype is not None:
--> 496     dtype = self._validate_dtype(dtype)
    498 if data is None:
    499     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:519, in NDFrame._validate_dtype(cls, dtype)
    517 """validate the passed dtype"""
    518 if dtype is not None:
--> 519     dtype = pandas_dtype(dtype)
    521     # a compound dtype
    522     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
   1637     return StringDtype(na_value=np.nan)
   1639 # registered extension types
-> 1640 result = registry.find(dtype)
   1641 if result is not None:
   1642     if isinstance(result, type):
   1643         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
    574 for dtype_type in self.dtypes:
    575     try:
--> 576         return dtype_type.construct_from_string(dtype)
    577     except TypeError:
    578         pass

File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2251, in ArrowDtype.construct_from_string(cls, string)
   2249 base_type = string[:-9]  # get rid of "[pyarrow]"
   2250 try:
-> 2251     pa_dtype = pa.type_for_alias(base_type)
   2252 except ValueError as err:
   2253     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [39]: ser.mean()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[39], line 1
----> 1 ser.mean()

NameError: name 'ser' is not defined

In [40]: ser + ser
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[40], line 1
----> 1 ser + ser

NameError: name 'ser' is not defined

In [41]: ser > (ser + 1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[41], line 1
----> 1 ser > (ser + 1)

NameError: name 'ser' is not defined

In [42]: ser.dropna()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[42], line 1
----> 1 ser.dropna()

NameError: name 'ser' is not defined

In [43]: ser.isna()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[43], line 1
----> 1 ser.isna()

NameError: name 'ser' is not defined

In [44]: ser.fillna(0)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[44], line 1
----> 1 ser.fillna(0)

NameError: name 'ser' is not defined
In [45]: ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[45], line 1
----> 1 ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))

NameError: name 'pa' is not defined

In [46]: ser_str.str.startswith("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[46], line 1
----> 1 ser_str.str.startswith("a")

NameError: name 'ser_str' is not defined
In [47]: from datetime import datetime

In [48]: pa_type = pd.ArrowDtype(pa.timestamp("ns"))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[48], line 1
----> 1 pa_type = pd.ArrowDtype(pa.timestamp("ns"))

NameError: name 'pa' is not defined

In [49]: ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[49], line 1
----> 1 ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)

NameError: name 'pa_type' is not defined

In [50]: ser_dt.dt.strftime("%Y-%m")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[50], line 1
----> 1 ser_dt.dt.strftime("%Y-%m")

NameError: name 'ser_dt' is not defined

I/O Reading#

PyArrow also provides IO reading functionality that has been integrated into several pandas IO readers. The following functions provide an engine keyword that can dispatch to PyArrow to accelerate reading from an IO source: read_csv(), read_json(), read_orc(), and read_feather().

In [51]: import io

In [52]: data = io.StringIO("""a,b,c
   ....:    1,2.5,True
   ....:    3,4.5,False
   ....: """)
   ....: 

In [53]: df = pd.read_csv(data, engine="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:140, in import_optional_dependency(name, extra, errors, min_version)
    139 try:
--> 140     module = importlib.import_module(name)
    141 except ImportError:

File /usr/lib/python3.14/importlib/__init__.py:88, in import_module(name, package)
     87         level += 1
---> 88 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1406, in _gcd_import(name, package, level)
   1405     name = _resolve_name(name, package, level)
-> 1406 return _find_and_load(name, _gcd_import)

File <frozen importlib._bootstrap>:1371, in _find_and_load(name, import_)
   1370     if module is _NEEDS_LOADING:
-> 1371         return _find_and_load_unlocked(name, import_)
   1373 # Optimization: only call _bootstrap._lock_unlock_module() if
   1374 # module.__spec__._initializing is True.
   1375 # NOTE: because of this, initializing must be set *before*
   1376 # putting the new module in sys.modules.

File <frozen importlib._bootstrap>:1335, in _find_and_load_unlocked(name, import_)
   1334 if spec is None:
-> 1335     raise ModuleNotFoundError(f'{_ERR_MSG_PREFIX}{name!r}', name=name)
   1336 else:

ModuleNotFoundError: No module named 'pyarrow'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[53], line 1
----> 1 df = pd.read_csv(data, engine="pyarrow")

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:626, in _read(filepath_or_buffer, kwds)
    623     return parser
    625 with parser:
--> 626     return parser.read(nrows)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1911, in TextFileReader.read(self, nrows)
   1908 if self.engine == "pyarrow":
   1909     try:
   1910         # error: "ParserBase" has no attribute "read"
-> 1911         df = self._engine.read()  # type: ignore[attr-defined]
   1912     except Exception:
   1913         self.close()

File /usr/lib/python3/dist-packages/pandas/io/parsers/arrow_parser_wrapper.py:239, in ArrowParserWrapper.read(self)
    228 def read(self) -> DataFrame:
    229     """
    230     Reads the contents of a CSV file into a DataFrame and
    231     processes it according to the kwargs passed in the
   (...)    237         The DataFrame created from the CSV file.
    238     """
--> 239     pa = import_optional_dependency("pyarrow")
    240     pyarrow_csv = import_optional_dependency("pyarrow.csv")
    241     self._get_pyarrow_options()

File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:143, in import_optional_dependency(name, extra, errors, min_version)
    141 except ImportError:
    142     if errors == "raise":
--> 143         raise ImportError(msg)
    144     return None
    146 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [54]: df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[54], line 1
----> 1 df

NameError: name 'df' is not defined

By default, these functions and all other IO reader functions return NumPy-backed data. These readers can return PyArrow-backed data by specifying the parameter dtype_backend="pyarrow". A reader does not necessarily need to set engine="pyarrow" to return PyArrow-backed data.

In [55]: import io

In [56]: data = io.StringIO("""a,b,c,d,e,f,g,h,i
   ....:     1,2.5,True,a,,,,,
   ....:     3,4.5,False,b,6,7.5,True,a,
   ....: """)
   ....: 

In [57]: df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:140, in import_optional_dependency(name, extra, errors, min_version)
    139 try:
--> 140     module = importlib.import_module(name)
    141 except ImportError:

File /usr/lib/python3.14/importlib/__init__.py:88, in import_module(name, package)
     87         level += 1
---> 88 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1406, in _gcd_import(name, package, level)
   1405     name = _resolve_name(name, package, level)
-> 1406 return _find_and_load(name, _gcd_import)

File <frozen importlib._bootstrap>:1371, in _find_and_load(name, import_)
   1370     if module is _NEEDS_LOADING:
-> 1371         return _find_and_load_unlocked(name, import_)
   1373 # Optimization: only call _bootstrap._lock_unlock_module() if
   1374 # module.__spec__._initializing is True.
   1375 # NOTE: because of this, initializing must be set *before*
   1376 # putting the new module in sys.modules.

File <frozen importlib._bootstrap>:1335, in _find_and_load_unlocked(name, import_)
   1334 if spec is None:
-> 1335     raise ModuleNotFoundError(f'{_ERR_MSG_PREFIX}{name!r}', name=name)
   1336 else:

ModuleNotFoundError: No module named 'pyarrow'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[57], line 1
----> 1 df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1898, in TextFileReader._make_engine(self, f, engine)
   1895     raise ValueError(msg)
   1897 try:
-> 1898     return mapping[engine](f, **self.options)
   1899 except Exception:
   1900     if self.handles is not None:

File /usr/lib/python3/dist-packages/pandas/io/parsers/c_parser_wrapper.py:92, in CParserWrapper.__init__(self, src, **kwds)
     89     kwds["dtype_backend"] = "numpy"
     90 if kwds["dtype_backend"] == "pyarrow":
     91     # Fail here loudly instead of in cython after reading
---> 92     import_optional_dependency("pyarrow")
     93 self._reader = parsers.TextReader(src, **kwds)
     95 self.unnamed_cols = self._reader.unnamed_cols

File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:143, in import_optional_dependency(name, extra, errors, min_version)
    141 except ImportError:
    142     if errors == "raise":
--> 143         raise ImportError(msg)
    144     return None
    146 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [58]: df_pyarrow.dtypes
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[58], line 1
----> 1 df_pyarrow.dtypes

NameError: name 'df_pyarrow' is not defined

Several non-IO reader functions can also use the dtype_backend argument to return PyArrow-backed data including: