NVIDIA · leofang · May 18, 2026 · May 18, 2026 · leofang · May 18, 2026
diff --git a/README.md b/README.md
@@ -7,6 +7,7 @@ CUDA Python is the home for accessing NVIDIA’s CUDA platform from Python. It c
 * [cuda.pathfinder](https://nvidia.github.io/cuda-python/cuda-pathfinder/latest): Utilities for locating CUDA components installed in the user's Python environment
 * [cuda.coop](https://nvidia.github.io/cccl/unstable/python/coop.html): A Python module providing CCCL's reusable block-wide and warp-wide *device* primitives for use within Numba CUDA kernels
 * [cuda.compute](https://nvidia.github.io/cccl/unstable/python/compute/index.html): A Python module for easy access to CCCL's highly efficient and customizable parallel algorithms, like `sort`, `scan`, `reduce`, `transform`, etc. that are callable on the *host*
+* [numba-cuda-mlir](https://nvidia.github.io/numba-cuda-mlir/): An evolution of Numba CUDA that improves upon its technical foundation and performance to provide the future of CUDA Python JIT compilation. It can currently be used to develop CUDA **SIMT** kernels in Python, to provide Python bindings for accelerated device libraries, and to serve as a compiler for user-defined functions in accelerated libraries.
 * [numba.cuda](https://nvidia.github.io/numba-cuda/): A Python DSL that exposes CUDA **SIMT** programming model and compiles a restricted subset of Python code into CUDA kernels and device functions
 * [cuda.tile](https://docs.nvidia.com/cuda/cutile-python/): A new Python DSL that exposes CUDA **Tile** programming model and allows users to write NumPy-like code in CUDA kernels
 * [nvmath-python](https://docs.nvidia.com/cuda/nvmath-python/latest): Pythonic access to NVIDIA CPU & GPU Math Libraries, with [*host*](https://docs.nvidia.com/cuda/nvmath-python/latest/overview.html#host-apis), [*device*](https://docs.nvidia.com/cuda/nvmath-python/latest/overview.html#device-apis), and [*distributed*](https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/index.html) APIs. It also provides low-level Python bindings to host C APIs ([nvmath.bindings](https://docs.nvidia.com/cuda/nvmath-python/latest/bindings/index.html)).

diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml
@@ -36,8 +36,9 @@ dependencies = ["cuda-pathfinder >=1.4.2"]
 
 [project.optional-dependencies]
 all = [
-    "cuda-toolkit[nvrtc,nvjitlink,nvvm,nvfatbin,cudla]==13.*",
+    "cuda-toolkit[nvrtc,nvjitlink,nvvm,nvfatbin]==13.*",
     "cuda-toolkit[cufile]==13.*; sys_platform == 'linux'",
+    "cuda-toolkit==13.*",
 ]
 
 [dependency-groups]

diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
@@ -54,8 +54,8 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-cu12 = ["cuda-bindings[all]==12.*"]
-cu13 = ["cuda-bindings[all]==13.*"]
+cu12 = ["cuda-bindings[all]==12.*", "cuda-toolkit==12.*"]
+cu13 = ["cuda-bindings[all]==13.*", "cuda-toolkit==13.*"]
 
 [dependency-groups]
 test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "cloudpickle", "psutil", "cffi"]

diff --git a/cuda_python/DESCRIPTION.rst b/cuda_python/DESCRIPTION.rst
@@ -12,6 +12,7 @@ CUDA Python is the home for accessing NVIDIA's CUDA platform from Python. It con
 * `cuda.pathfinder <https://nvidia.github.io/cuda-python/cuda-pathfinder/latest>`_: Utilities for locating CUDA components installed in the user's Python environment
 * `cuda.coop <https://nvidia.github.io/cccl/unstable/python/coop.html>`_: A Python module providing CCCL's reusable block-wide and warp-wide *device* primitives for use within Numba CUDA kernels
 * `cuda.compute <https://nvidia.github.io/cccl/unstable/python/compute/index.html>`_: A Python module for easy access to CCCL's highly efficient and customizable parallel algorithms, like ``sort``, ``scan``, ``reduce``, ``transform``, etc. that are callable on the *host*
+* `numba-cuda-mlir <https://nvidia.github.io/numba-cuda-mlir/>`_: An evolution of Numba CUDA that improves upon its technical foundation and performance to provide the future of CUDA Python JIT compilation. It can currently be used to develop CUDA **SIMT** kernels in Python, to provide Python bindings for accelerated device libraries, and to serve as a compiler for user-defined functions in accelerated libraries.
 * `numba.cuda <https://nvidia.github.io/numba-cuda/>`_: A Python DSL that exposes CUDA **SIMT** programming model and compiles a restricted subset of Python code into CUDA kernels and device functions
 * `cuda.tile <https://docs.nvidia.com/cuda/cutile-python/>`_: A new Python DSL that exposes CUDA **Tile** programming model and allows users to write NumPy-like code in CUDA kernels
 * `nvmath-python <https://docs.nvidia.com/cuda/nvmath-python/latest>`_: Pythonic access to NVIDIA CPU & GPU Math Libraries, with `host <https://docs.nvidia.com/cuda/nvmath-python/latest/overview.html#host-apis>`_, `device <https://docs.nvidia.com/cuda/nvmath-python/latest/overview.html#device-apis>`_, and `distributed <https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/index.html>`_ APIs. It also provides low-level Python bindings to host C APIs (`nvmath.bindings <https://docs.nvidia.com/cuda/nvmath-python/latest/bindings/index.html>`_).

diff --git a/cuda_python/docs/source/index.rst b/cuda_python/docs/source/index.rst
@@ -12,6 +12,7 @@ multiple components:
 - `cuda.pathfinder`_: Utilities for locating CUDA components installed in the user's Python environment
 - `cuda.coop`_: A Python module providing CCCL's reusable block-wide and warp-wide *device* primitives for use within Numba CUDA kernels
 - `cuda.compute`_: A Python module for easy access to CCCL's highly efficient and customizable parallel algorithms, like ``sort``, ``scan``, ``reduce``, ``transform``, etc. that are callable on the *host*
+- `numba-cuda-mlir`_: An evolution of Numba CUDA that improves upon its technical foundation and performance to provide the future of CUDA Python JIT compilation. It can currently be used to develop CUDA **SIMT** kernels in Python, to provide Python bindings for accelerated device libraries, and to serve as a compiler for user-defined functions in accelerated libraries.
 - `numba.cuda`_: A Python DSL that exposes CUDA **SIMT** programming model and compiles a restricted subset of Python code into CUDA kernels and device functions
 - `cuda.tile`_: A new Python DSL that exposes CUDA **Tile** programming model and allows users to write NumPy-like code in CUDA kernels
 - `nvmath-python`_: Pythonic access to NVIDIA CPU & GPU Math Libraries, with `host`_, `device`_, and `distributed`_ APIs. It also provides low-level Python bindings to host C APIs (`nvmath.bindings`_).
@@ -22,6 +23,7 @@ multiple components:
 
 .. _cuda.coop: https://nvidia.github.io/cccl/unstable/python/coop.html
 .. _cuda.compute: https://nvidia.github.io/cccl/unstable/python/compute/index.html
+.. _numba-cuda-mlir: https://nvidia.github.io/numba-cuda-mlir/
 .. _numba.cuda: https://nvidia.github.io/numba-cuda/
 .. _cuda.tile: https://docs.nvidia.com/cuda/cutile-python/
 .. _nvmath-python: https://docs.nvidia.com/cuda/nvmath-python/latest
@@ -52,6 +54,7 @@ be available, please refer to the `cuda.bindings`_ documentation for installatio
    cuda.pathfinder <https://nvidia.github.io/cuda-python/cuda-pathfinder/latest>
    cuda.coop <https://nvidia.github.io/cccl/unstable/python/coop.html>
    cuda.compute <https://nvidia.github.io/cccl/unstable/python/compute/index.html>
+   numba-cuda-mlir <https://nvidia.github.io/numba-cuda-mlir/>
    numba.cuda <https://nvidia.github.io/numba-cuda/>
    cuda.tile <https://docs.nvidia.com/cuda/cutile-python/>
    nvmath-python <https://docs.nvidia.com/cuda/nvmath-python/>

diff --git a/cuda_python/setup.py b/cuda_python/setup.py
@@ -32,6 +32,8 @@
     version=version,
     install_requires=[
         f"cuda-bindings{matcher}{version}",
+        "cuda-core~=1.0.0",
+        "cuda-cccl~=1.0.0",
         "cuda-pathfinder~=1.1",
     ],
     extras_require={