From 1e5c6b3c3b4fbe443bdc23265135b1cfa4cba945 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:02:54 +0000
Subject: [PATCH 01/64] Allow setting CUDA compiler via CMAKE_CUDA_COMPILER
 envvar.

---
 ci/build_common.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/build_common.sh b/ci/build_common.sh
index 22bdf878ab..a70a61016a 100755
--- a/ci/build_common.sh
+++ b/ci/build_common.sh
@@ -6,12 +6,13 @@ set -eo pipefail
 cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
 
 # Script defaults
-CUDA_COMPILER=nvcc
+CUDA_COMPILER=${CMAKE_CUDA_COMPILER:-nvcc}
 
 # Check if the correct number of arguments has been provided
 function usage {
     echo "Usage: $0 [OPTIONS] <HOST_COMPILER> <CXX_STANDARD> <GPU_ARCHS>"
     echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores."
+    echo "The CMAKE_CUDA_COMPILER environment variable can be used to control the CUDA compiler. The -nvcc flag takes precedence.
     echo "Example: PARALLEL_LEVEL=8 $0 g++-8 14 \"70\" "
     echo "Example: $0 clang++-8 17 \"70;75;80-virtual\" "
     echo "Possible options: "

From 388af5bc315d2e364e668f1cad3d9e3392c095f7 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:08:20 +0000
Subject: [PATCH 02/64] Move nvcc version check to CUB script.

---
 ci/build_common.sh |  4 +---
 ci/build_cub.sh    | 13 ++++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/ci/build_common.sh b/ci/build_common.sh
index a70a61016a..a63c278b72 100755
--- a/ci/build_common.sh
+++ b/ci/build_common.sh
@@ -55,9 +55,7 @@ readonly CXX_STANDARD=$2
 
 # Replace spaces, commas and semicolons with semicolons for CMake list
 readonly GPU_ARCHS=$(echo $3 | tr ' ,' ';')
-
 readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)}
-readonly NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-)
 
 if [ -z ${DEVCONTAINER_NAME+x} ]; then
     BUILD_DIR=../build/local
@@ -84,7 +82,7 @@ COMMON_CMAKE_OPTIONS="
 echo "========================================"
 echo "Begin build"
 echo "pwd=$(pwd)"
-echo "NVCC_VERSION=$NVCC_VERSION"
+echo "CUDA_COMPILER=$CUDA_COMPILER"
 echo "HOST_COMPILER=$HOST_COMPILER"
 echo "CXX_STANDARD=$CXX_STANDARD"
 echo "GPU_ARCHS=$GPU_ARCHS"
diff --git a/ci/build_cub.sh b/ci/build_cub.sh
index f3cdd40546..fb720b85b9 100755
--- a/ci/build_cub.sh
+++ b/ci/build_cub.sh
@@ -2,7 +2,6 @@
 
 source "$(dirname "$0")/build_common.sh"
 
-
 # CUB benchmarks require at least CUDA nvcc 11.5 for int128
 # Returns "true" if the first version is greater than or equal to the second
 version_compare() {
@@ -12,12 +11,16 @@ version_compare() {
         echo "false"
     fi
 }
-readonly ENABLE_CUB_BENCHMARKS=${ENABLE_CUB_BENCHMARKS:=$(version_compare $NVCC_VERSION 11.5)}
 
-if [[ $ENABLE_CUB_BENCHMARKS == "true" ]]; then
-    echo "CUDA version is $NVCC_VERSION. Building CUB benchmarks."
+if [[ "$CUDA_COMPILER" == *nvcc* ]]; then
+    NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-)
+    if [[ $(version_compare $NVCC_VERSION 11.5) == "true" ]]; then
+        echo "nvcc version is $NVCC_VERSION. Building CUB benchmarks."
+    else
+        echo "nvcc version is $NVCC_VERSION. Not building CUB benchmarks because nvcc version is less than 11.5."
+    fi
 else
-    echo "CUDA version is $NVCC_VERSION. Not building CUB benchmarks because CUDA version is less than 11.5."
+    echo "nvcc version is not determined (likely using a non-NVCC compiler). Not building CUB benchmarks."
 fi
 
 CMAKE_OPTIONS="

From b625e93f2227191c6fce05e1d830ab68e9928d93 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:14:31 +0000
Subject: [PATCH 03/64] Add clang-cuda job to matrix.

---
 .github/actions/compute-matrix/compute-matrix.sh | 2 ++
 ci/matrix.yaml                                   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/.github/actions/compute-matrix/compute-matrix.sh b/.github/actions/compute-matrix/compute-matrix.sh
index f4fb489e4c..290feaead2 100755
--- a/.github/actions/compute-matrix/compute-matrix.sh
+++ b/.github/actions/compute-matrix/compute-matrix.sh
@@ -23,6 +23,8 @@ extract_matrix() {
   write_output "HOST_COMPILERS" "$(echo "$nvcc_full_matrix" | jq -cr '[.[] | .compiler.name] | unique')"
   write_output "PER_CUDA_COMPILER_MATRIX" "$(echo "$nvcc_full_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
   write_output "NVRTC_MATRIX" "$(echo "$matrix" | jq '.nvrtc' | explode_std_versions)"
+  local clang_cuda_matrix="$(echo "$matrix" | jq -cr '.["clang-cuda"]' | explode_std_versions)"
+  write_output "CLANG_CUDA_MATRIX" "$clang_cuda_matrix"
 }
 
 main() {
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 707c06c695..8be6e88a73 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -48,3 +48,5 @@ pull_request:
     - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'llvm', version: '16', exe: 'clang++'}, gpu_build_archs: '70', std: [11, 14, 17, 20], jobs: ['build', 'test']}
   nvrtc:
     - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', gpu_build_archs: '70', std: [11, 14, 17, 20]}
+  clang-cuda:
+    - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', gpu_build_archs: '70', std: [17, 20]}

From 260805f4377bf97166f6398e81716a67321efad2 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:26:56 +0000
Subject: [PATCH 04/64] Add compiler field to matrix for clang-cuda.

---
 ci/matrix.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 8be6e88a73..bd5bc2e611 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -49,4 +49,4 @@ pull_request:
   nvrtc:
     - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', gpu_build_archs: '70', std: [11, 14, 17, 20]}
   clang-cuda:
-    - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', gpu_build_archs: '70', std: [17, 20]}
+    - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'llvm', version: '16', exe: 'clang++'}, gpu_build_archs: '70', std: [17, 20]}

From 5ac06225d091f13e1755027033d5cad98581abee Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:27:08 +0000
Subject: [PATCH 05/64] Add Thrust clang-cuda job.

---
 .github/workflows/pr.yml | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index d7fe05d1a1..49d9341068 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -33,7 +33,7 @@ concurrency:
 
 jobs:
   compute-matrix:
-    name: Compute matrix 
+    name: Compute matrix
     runs-on: ubuntu-latest
     outputs:
       DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}}
@@ -49,7 +49,7 @@ jobs:
         id: set-outputs
         run: |
           .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request
-      
+
   nvrtc:
     name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
     needs: compute-matrix
@@ -111,6 +111,21 @@ jobs:
       test_script: "./ci/test_libcudacxx.sh"
       devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
 
+  thrust-clang-cuda:
+    name: Thrust CTK${{matrix.cuda_version}} clang-cuda ${{matrix.compiler.version}}
+    need: compute-matrix
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
+      uses: ./.github/workflows/run-as-coder.yml
+      with:
+        name: Thrust CTK${{matrix.cuda_version}} clang-cuda ${{matrix.compiler.version}}
+        runner: linux-${{matrix.cpu}}-cpu16
+        image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrux.cuda}}-${{matrix.os}}
+        command: |
+          CMAKE_CUDA_COMPILER=${{matrix.compiler.exe}} ./ci/build_thrust.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
+
   examples:
     name: CCCL Examples
     needs: compute-matrix

From a428b215de54810e9c476f3917bb5d335aa54044 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:31:10 +0000
Subject: [PATCH 06/64] Fix formatting.

---
 .github/workflows/pr.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 49d9341068..e9781a5be5 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -118,13 +118,13 @@ jobs:
       fail-fast: false
       matrix:
         include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
-      uses: ./.github/workflows/run-as-coder.yml
-      with:
-        name: Thrust CTK${{matrix.cuda_version}} clang-cuda ${{matrix.compiler.version}}
-        runner: linux-${{matrix.cpu}}-cpu16
-        image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrux.cuda}}-${{matrix.os}}
-        command: |
-          CMAKE_CUDA_COMPILER=${{matrix.compiler.exe}} ./ci/build_thrust.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
+    uses: ./.github/workflows/run-as-coder.yml
+    with:
+      name: Thrust CTK${{matrix.cuda_version}} clang-cuda ${{matrix.compiler.version}}
+      runner: linux-${{matrix.cpu}}-cpu16
+      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrux.cuda}}-${{matrix.os}}
+      command: |
+        CMAKE_CUDA_COMPILER=${{matrix.compiler.exe}} ./ci/build_thrust.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
 
   examples:
     name: CCCL Examples

From 3da71f4c8d0e5931d07ecfa7704e693d41762e8f Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:31:56 +0000
Subject: [PATCH 07/64] s/need/needs/

---
 .github/workflows/pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index e9781a5be5..ddbe24a4ad 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -113,7 +113,7 @@ jobs:
 
   thrust-clang-cuda:
     name: Thrust CTK${{matrix.cuda_version}} clang-cuda ${{matrix.compiler.version}}
-    need: compute-matrix
+    needs: compute-matrix
     strategy:
       fail-fast: false
       matrix:

From 25efa7c76c7463cedafb95fb09d22de4575143ef Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:32:33 +0000
Subject: [PATCH 08/64] Can't spell good.

---
 .github/workflows/pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index ddbe24a4ad..6a883dfdc3 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -122,7 +122,7 @@ jobs:
     with:
       name: Thrust CTK${{matrix.cuda_version}} clang-cuda ${{matrix.compiler.version}}
       runner: linux-${{matrix.cpu}}-cpu16
-      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrux.cuda}}-${{matrix.os}}
+      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
       command: |
         CMAKE_CUDA_COMPILER=${{matrix.compiler.exe}} ./ci/build_thrust.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
 

From ce622cf757c8b4dc241c991bbca1daae473d3420 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:33:21 +0000
Subject: [PATCH 09/64] [skip-tests] Add clang cuda job to status check job.

---
 .github/workflows/pr.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 6a883dfdc3..7f0cf31e1f 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -154,5 +154,6 @@ jobs:
       - nvrtc
       - thrust
       - examples
+      - thrust-clang-cuda
     steps:
       - run: echo "CI success"

From 13c95baefe4759d728bb0a74ea94a97e3f1b8316 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:34:08 +0000
Subject: [PATCH 10/64] Disable other jobs for now.

---
 .github/workflows/pr.yml | 148 +++++++++++++++++++--------------------
 1 file changed, 74 insertions(+), 74 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 7f0cf31e1f..750b1f6238 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -50,66 +50,66 @@ jobs:
         run: |
           .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request
 
-  nvrtc:
-    name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
-    needs: compute-matrix
-    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
-    uses: ./.github/workflows/run-as-coder.yml
-    strategy:
-      fail-fast: false
-      matrix:
-        include: ${{ fromJSON(needs.compute-matrix.outputs.NVRTC_MATRIX) }}
-    with:
-      name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
-      runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
-      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-gcc12-cuda${{matrix.cuda}}-${{matrix.os}}
-      command: |
-        ./ci/nvrtc_libcudacxx.sh g++ ${{matrix.std}} ${{matrix.gpu_build_archs}}
+  #nvrtc:
+  #  name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
+  #  needs: compute-matrix
+  #  if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
+  #  uses: ./.github/workflows/run-as-coder.yml
+  #  strategy:
+  #    fail-fast: false
+  #    matrix:
+  #      include: ${{ fromJSON(needs.compute-matrix.outputs.NVRTC_MATRIX) }}
+  #  with:
+  #    name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
+  #    runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
+  #    image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-gcc12-cuda${{matrix.cuda}}-${{matrix.os}}
+  #    command: |
+  #      ./ci/nvrtc_libcudacxx.sh g++ ${{matrix.std}} ${{matrix.gpu_build_archs}}
 
-  thrust:
-    name: Thrust CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
-    needs: compute-matrix
-    uses: ./.github/workflows/dispatch-build-and-test.yml
-    strategy:
-      fail-fast: false
-      matrix:
-        cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
-        compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
-    with:
-      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
-      build_script: "./ci/build_thrust.sh"
-      test_script: "./ci/test_thrust.sh"
-      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+  #thrust:
+  #  name: Thrust CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
+  #  needs: compute-matrix
+  #  uses: ./.github/workflows/dispatch-build-and-test.yml
+  #  strategy:
+  #    fail-fast: false
+  #    matrix:
+  #      cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
+  #      compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
+  #  with:
+  #    per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
+  #    build_script: "./ci/build_thrust.sh"
+  #    test_script: "./ci/test_thrust.sh"
+  #    devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
 
-  cub:
-    name: CUB CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
-    needs: compute-matrix
-    uses: ./.github/workflows/dispatch-build-and-test.yml
-    strategy:
-      fail-fast: false
-      matrix:
-        cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
-        compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
-    with:
-      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
-      build_script: "./ci/build_cub.sh"
-      test_script: "./ci/test_cub.sh"
-      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+  #cub:
+  #  name: CUB CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
+  #  needs: compute-matrix
+  #  uses: ./.github/workflows/dispatch-build-and-test.yml
+  #  strategy:
+  #    fail-fast: false
+  #    matrix:
+  #      cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
+  #      compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
+  #  with:
+  #    per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
+  #    build_script: "./ci/build_cub.sh"
+  #    test_script: "./ci/test_cub.sh"
+  #    devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
 
-  libcudacxx:
-    name: libcudacxx CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
-    needs: compute-matrix
-    uses: ./.github/workflows/dispatch-build-and-test.yml
-    strategy:
-      fail-fast: false
-      matrix:
-        cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
-        compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
-    with:
-      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
-      build_script: "./ci/build_libcudacxx.sh"
-      test_script: "./ci/test_libcudacxx.sh"
-      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+  #libcudacxx:
+  #  name: libcudacxx CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
+  #  needs: compute-matrix
+  #  uses: ./.github/workflows/dispatch-build-and-test.yml
+  #  strategy:
+  #    fail-fast: false
+  #    matrix:
+  #      cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
+  #      compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
+  #  with:
+  #    per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
+  #    build_script: "./ci/build_libcudacxx.sh"
+  #    test_script: "./ci/test_libcudacxx.sh"
+  #    devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
 
   thrust-clang-cuda:
     name: Thrust CTK${{matrix.cuda_version}} clang-cuda ${{matrix.compiler.version}}
@@ -126,23 +126,23 @@ jobs:
       command: |
         CMAKE_CUDA_COMPILER=${{matrix.compiler.exe}} ./ci/build_thrust.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
 
-  examples:
-    name: CCCL Examples
-    needs: compute-matrix
-    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include: ${{ fromJSON(needs.compute-matrix.outputs.NVCC_FULL_MATRIX) }}
-    uses: ./.github/workflows/run-as-coder.yml
-    with:
-      name: CCCL Examples CUDA${{matrix.cuda}} ${{matrix.compiler.name}}${{matrix.compiler.version}}
-      runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
-      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
-      command: |
-        cmake -S . --preset=examples -DCCCL_EXAMPLE_CPM_TAG=${GITHUB_SHA} -DCMAKE_CUDA_COMPILER=nvcc
-        ctest --preset=examples
-
+#  examples:
+#    name: CCCL Examples
+#    needs: compute-matrix
+#    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
+#    strategy:
+#      fail-fast: false
+#      matrix:
+#        include: ${{ fromJSON(needs.compute-matrix.outputs.NVCC_FULL_MATRIX) }}
+#    uses: ./.github/workflows/run-as-coder.yml
+#    with:
+#      name: CCCL Examples CUDA${{matrix.cuda}} ${{matrix.compiler.name}}${{matrix.compiler.version}}
+#      runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
+#      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
+#      command: |
+#        cmake -S . --preset=examples -DCCCL_EXAMPLE_CPM_TAG=${GITHUB_SHA} -DCMAKE_CUDA_COMPILER=nvcc
+#        ctest --preset=examples
+#
   # This job is the final job that runs after all other jobs and is used for branch protection status checks.
   # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks
   ci:

From 800913c35a82801e4b2d0e948bfdad29411e7896 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:38:54 +0000
Subject: [PATCH 11/64] Disable other jobs in status check.

---
 .github/workflows/pr.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 750b1f6238..1e7fa3e112 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -149,11 +149,11 @@ jobs:
     runs-on: ubuntu-latest
     name: CI
     needs:
-      - cub
-      - libcudacxx
-      - nvrtc
-      - thrust
-      - examples
       - thrust-clang-cuda
+      #- cub
+      #- libcudacxx
+      #- nvrtc
+      #- thrust
+      #- examples
     steps:
       - run: echo "CI success"

From 566798261301fc624572b5231c9115ffd24167b6 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:42:55 +0000
Subject: [PATCH 12/64] Add output to compute matrix job.

---
 .github/workflows/pr.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 1e7fa3e112..abe36ab033 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -42,6 +42,7 @@ jobs:
       HOST_COMPILERS: ${{steps.set-outputs.outputs.HOST_COMPILERS}}
       PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}}
       NVRTC_MATRIX: ${{steps.set-outputs.outputs.matrix}}
+      CLANG_CUDA_MATRIX: ${{steps.set-outputs.outputs.CLANG_CUDA_MATRIX}}
     steps:
       - name: Checkout repo
         uses: actions/checkout@v3

From 8e66201d5d6929fd8572996edc0e9baf1fc5f8b8 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:49:52 +0000
Subject: [PATCH 13/64] Missing quote.

---
 ci/build_common.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/build_common.sh b/ci/build_common.sh
index a63c278b72..b2bc9061ea 100755
--- a/ci/build_common.sh
+++ b/ci/build_common.sh
@@ -12,7 +12,7 @@ CUDA_COMPILER=${CMAKE_CUDA_COMPILER:-nvcc}
 function usage {
     echo "Usage: $0 [OPTIONS] <HOST_COMPILER> <CXX_STANDARD> <GPU_ARCHS>"
     echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores."
-    echo "The CMAKE_CUDA_COMPILER environment variable can be used to control the CUDA compiler. The -nvcc flag takes precedence.
+    echo "The CMAKE_CUDA_COMPILER environment variable can be used to control the CUDA compiler. The -nvcc flag takes precedence."
     echo "Example: PARALLEL_LEVEL=8 $0 g++-8 14 \"70\" "
     echo "Example: $0 clang++-8 17 \"70;75;80-virtual\" "
     echo "Possible options: "

From 8a2560b31ff76dbdc487eb1654003e5984e005ec Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:52:17 +0000
Subject: [PATCH 14/64] Fix logic for enabling CUB benchmarks.

---
 ci/build_cub.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/build_cub.sh b/ci/build_cub.sh
index fb720b85b9..12b7122a9b 100755
--- a/ci/build_cub.sh
+++ b/ci/build_cub.sh
@@ -12,9 +12,11 @@ version_compare() {
     fi
 }
 
+ENABLE_CUB_BENCHMARKS="false"
 if [[ "$CUDA_COMPILER" == *nvcc* ]]; then
     NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-)
     if [[ $(version_compare $NVCC_VERSION 11.5) == "true" ]]; then
+        ENABLE_CUB_BENCHMARKS="true"
         echo "nvcc version is $NVCC_VERSION. Building CUB benchmarks."
     else
         echo "nvcc version is $NVCC_VERSION. Not building CUB benchmarks because nvcc version is less than 11.5."

From dc550e75bcb23161dae7483858a62796e5f2ffe8 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 19:53:39 +0000
Subject: [PATCH 15/64] Fix reference to cuda version in job name.

---
 .github/workflows/pr.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index abe36ab033..9595f2d945 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -113,7 +113,7 @@ jobs:
   #    devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
 
   thrust-clang-cuda:
-    name: Thrust CTK${{matrix.cuda_version}} clang-cuda ${{matrix.compiler.version}}
+    name: Thrust CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
     needs: compute-matrix
     strategy:
       fail-fast: false
@@ -121,11 +121,11 @@ jobs:
         include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
     uses: ./.github/workflows/run-as-coder.yml
     with:
-      name: Thrust CTK${{matrix.cuda_version}} clang-cuda ${{matrix.compiler.version}}
+      name: Thrust CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
       runner: linux-${{matrix.cpu}}-cpu16
       image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
       command: |
-        CMAKE_CUDA_COMPILER=${{matrix.compiler.exe}} ./ci/build_thrust.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
+        CMAKE_CUDA_COMPILER="${{matrix.compiler.exe}}" ./ci/build_thrust.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
 
 #  examples:
 #    name: CCCL Examples

From afd8f1303eedc67751593400f569271e424e29af Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 20:43:32 +0000
Subject: [PATCH 16/64] Make clang-cuda job a matrix over libs.

---
 .github/workflows/pr.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 9595f2d945..82faac9e74 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -112,12 +112,13 @@ jobs:
   #    test_script: "./ci/test_libcudacxx.sh"
   #    devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
 
-  thrust-clang-cuda:
-    name: Thrust CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
+clang-cuda:
+    name: ${{matrix.lib}} CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
     needs: compute-matrix
     strategy:
       fail-fast: false
       matrix:
+        lib: [thrust, cub, libcudacxx]
         include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
     uses: ./.github/workflows/run-as-coder.yml
     with:

From abb82356410b4bdbd75cbaa5b2c586cce9749355 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 20:45:07 +0000
Subject: [PATCH 17/64] Fix build script to use matrix lib value.

---
 .github/workflows/pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 82faac9e74..6cb5d5487e 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -126,7 +126,7 @@ clang-cuda:
       runner: linux-${{matrix.cpu}}-cpu16
       image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
       command: |
-        CMAKE_CUDA_COMPILER="${{matrix.compiler.exe}}" ./ci/build_thrust.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
+        CMAKE_CUDA_COMPILER="${{matrix.compiler.exe}}" ./ci/build_${{matrix.lib}}.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
 
 #  examples:
 #    name: CCCL Examples

From 873db9e2e72b3028ce62d3365023023b7ec4a2c2 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 20:45:51 +0000
Subject: [PATCH 18/64] Fix job name in status check.

---
 .github/workflows/pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 6cb5d5487e..11a132d2d3 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -151,7 +151,7 @@ clang-cuda:
     runs-on: ubuntu-latest
     name: CI
     needs:
-      - thrust-clang-cuda
+      - clang-cuda
       #- cub
       #- libcudacxx
       #- nvrtc

From 93a10e59cec840f6bbef8782e10410ba14856f7a Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 20:50:49 +0000
Subject: [PATCH 19/64] Fix formatting.

---
 .github/workflows/pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 11a132d2d3..bfabe9cef0 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -112,7 +112,7 @@ jobs:
   #    test_script: "./ci/test_libcudacxx.sh"
   #    devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
 
-clang-cuda:
+  clang-cuda:
     name: ${{matrix.lib}} CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
     needs: compute-matrix
     strategy:

From 26938c2ecd2b782471e1d489d8c1239efdc16d69 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 20:53:47 +0000
Subject: [PATCH 20/64] Fix job name.

---
 .github/workflows/pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index bfabe9cef0..8d73a4c954 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -122,7 +122,7 @@ jobs:
         include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
     uses: ./.github/workflows/run-as-coder.yml
     with:
-      name: Thrust CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
+      name: ${{matrix.lib}} CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
       runner: linux-${{matrix.cpu}}-cpu16
       image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
       command: |

From 1356a4e4f4bc7ece1ad5ea9af7aab202f74918be Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Tue, 26 Sep 2023 21:53:13 +0000
Subject: [PATCH 21/64] Generate custom matrix with cartesian product of libs.

---
 .github/actions/compute-matrix/compute-matrix.sh | 6 +++++-
 .github/workflows/pr.yml                         | 1 -
 ci/matrix.yaml                                   | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/actions/compute-matrix/compute-matrix.sh b/.github/actions/compute-matrix/compute-matrix.sh
index 290feaead2..14586a2e67 100755
--- a/.github/actions/compute-matrix/compute-matrix.sh
+++ b/.github/actions/compute-matrix/compute-matrix.sh
@@ -12,6 +12,10 @@ explode_std_versions() {
   jq -cr 'map(. as $o | {std: $o.std[]} + del($o.std))'
 }
 
+explode_libs() {
+  jq -cr 'map(. as $o | {lib: $o.lib[]} + del($o.lib))'
+}
+
 extract_matrix() {
   local file="$1"
   local type="$2"
@@ -23,7 +27,7 @@ extract_matrix() {
   write_output "HOST_COMPILERS" "$(echo "$nvcc_full_matrix" | jq -cr '[.[] | .compiler.name] | unique')"
   write_output "PER_CUDA_COMPILER_MATRIX" "$(echo "$nvcc_full_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
   write_output "NVRTC_MATRIX" "$(echo "$matrix" | jq '.nvrtc' | explode_std_versions)"
-  local clang_cuda_matrix="$(echo "$matrix" | jq -cr '.["clang-cuda"]' | explode_std_versions)"
+  local clang_cuda_matrix="$(echo "$matrix" | jq -cr '.["clang-cuda"]' | explode_std_versions | explode_libs)"
   write_output "CLANG_CUDA_MATRIX" "$clang_cuda_matrix"
 }
 
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 8d73a4c954..0958d7271e 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -118,7 +118,6 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        lib: [thrust, cub, libcudacxx]
         include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
     uses: ./.github/workflows/run-as-coder.yml
     with:
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index bd5bc2e611..fefada3fd8 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -49,4 +49,4 @@ pull_request:
   nvrtc:
     - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', gpu_build_archs: '70', std: [11, 14, 17, 20]}
   clang-cuda:
-    - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'llvm', version: '16', exe: 'clang++'}, gpu_build_archs: '70', std: [17, 20]}
+    - {lib: ['thrust', 'cub', 'libcudacxx'], cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'llvm', version: '16', exe: 'clang++'}, gpu_build_archs: '70', std: [17, 20]}

From 74379130cfe38e2208a4ee44bc1af1fb1935185a Mon Sep 17 00:00:00 2001
From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
Date: Tue, 26 Sep 2023 15:16:52 -0700
Subject: [PATCH 22/64] Add hacks that allow clang-cuda to work.

---
 .../.upstream-tests/test/CMakeLists.txt       | 12 ++++++
 .../test/support/cuda_space_selector.h        |  4 ++
 .../utils/libcudacxx/compiler.py              |  3 ++
 .../utils/libcudacxx/test/config.py           | 38 +++++++++++--------
 libcudacxx/CMakeLists.txt                     |  1 +
 5 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/CMakeLists.txt b/libcudacxx/.upstream-tests/test/CMakeLists.txt
index b03563f09a..2bda7020d2 100644
--- a/libcudacxx/.upstream-tests/test/CMakeLists.txt
+++ b/libcudacxx/.upstream-tests/test/CMakeLists.txt
@@ -45,6 +45,18 @@ if (${CMAKE_CXX_COMPILER_ID} STREQUAL "IntelLLVM")
     --compiler-options=-fno-fast-math")
 endif()
 
+if ("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang")
+  # clang-cuda: extra compile flags for the test suite, plus explicit linking of libcuda/libcudart.
+  string(APPEND LIBCUDACXX_TEST_COMPILER_FLAGS
+     " -Xclang -fcuda-allow-variadic-functions"
+     " -Xclang -Wno-unused-parameter"
+     " -Wno-unknown-cuda-version")
+  # REQUIRED: a missing toolkit would leave CUDAToolkit_LIBRARY_DIR empty and yield a bare `-L` below.
+  find_package(CUDAToolkit REQUIRED)
+  string(APPEND LIBCUDACXX_TEST_LINKER_FLAGS
+    " -L${CUDAToolkit_LIBRARY_DIR} -lcuda -lcudart")
+endif()
+
 if (${CMAKE_CUDA_COMPILER_ID} STREQUAL "NVIDIA")
   set(LIBCUDACXX_TEST_COMPILER_FLAGS
     "${LIBCUDACXX_TEST_COMPILER_FLAGS} \
diff --git a/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h b/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h
index 5e58080d51..1ba4d41948 100644
--- a/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h
+++ b/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h
@@ -16,6 +16,10 @@
 
 #include "concurrent_agents.h"
 
+#if defined(__clang__) && defined(__CUDA__)
+# include <new>
+#endif
+
 #ifdef _LIBCUDACXX_COMPILER_NVRTC
 #define LAMBDA [=]
 #else
diff --git a/libcudacxx/.upstream-tests/utils/libcudacxx/compiler.py b/libcudacxx/.upstream-tests/utils/libcudacxx/compiler.py
index 436de868f4..267e5c0025 100644
--- a/libcudacxx/.upstream-tests/utils/libcudacxx/compiler.py
+++ b/libcudacxx/.upstream-tests/utils/libcudacxx/compiler.py
@@ -146,6 +146,9 @@ def _initTypeAndVersion(self):
         if self.type == 'nvcc':
             # Treat C++ as CUDA when the compiler is NVCC.
             self.source_lang = 'cu'
+        elif self.type == 'clang':
+            # Treat C++ as clang-cuda when the compiler is Clang.
+            self.source_lang = 'cu'
 
     def _basicCmd(self, source_files, out, mode=CM_Default, flags=[],
                   input_is_cxx=False):
diff --git a/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py b/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
index c7ae87c8eb..51015644bf 100644
--- a/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
+++ b/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
@@ -627,12 +627,17 @@ def configure_compile_flags(self):
             self.config.available_features.add("nvrtc")
         if self.cxx.type == 'nvcc':
             self.cxx.compile_flags += ['--extended-lambda']
+        real_arch_format = '-gencode=arch=compute_{0},code=sm_{0}'
+        virt_arch_format = '-gencode=arch=compute_{0},code=compute_{0}'
+        if self.cxx.type == 'clang':
+            real_arch_format = '--cuda-gpu-arch=sm_{0}'
+            virt_arch_format = '--cuda-gpu-arch=compute_{0}'
         pre_sm_32 = True
         pre_sm_60 = True
         pre_sm_70 = True
         pre_sm_80 = True
         pre_sm_90 = True
-        if compute_archs and self.cxx.type == 'nvcc':
+        if compute_archs and (self.cxx.type == 'nvcc' or self.cxx.type == 'clang'):
             pre_sm_32 = False
             pre_sm_60 = False
             pre_sm_70 = False
@@ -653,10 +658,9 @@ def configure_compile_flags(self):
                 if arch < 70: pre_sm_70 = True
                 if arch < 80: pre_sm_80 = True
                 if arch < 90: pre_sm_90 = True
+                arch_flag = real_arch_format.format(arch)
                 if mode.count("virtual"):
-                    arch_flag = '-gencode=arch=compute_{0},code=compute_{0}'.format(arch)
-                else:
-                    arch_flag = '-gencode=arch=compute_{0},code=sm_{0}'.format(arch)
+                    arch_flag = virt_arch_format.format(arch)
                 self.cxx.compile_flags += [arch_flag]
         if pre_sm_32:
             self.config.available_features.add("pre-sm-32")
@@ -818,8 +822,9 @@ def configure_compile_flags_header_includes(self):
                                  and self.cxx_stdlib_under_test != 'libc++'):
             self.lit_config.note('using the system cxx headers')
             return
-        if self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':
-            self.cxx.compile_flags += ['-nostdinc++']
+        # NOTE(review): '-nostdinc++' appears unnecessary -- removing it lets clang-cuda compile, and libcudacxx only supports building in CUDA modes. Confirm, then delete.
+        # if self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':
+        #    self.cxx.compile_flags += ['-nostdinc++']
         if cxx_headers is None:
             cxx_headers = os.path.join(self.libcudacxx_src_root, 'include')
         if not os.path.isdir(cxx_headers):
@@ -1058,16 +1063,17 @@ def configure_link_flags_cxx_library(self):
             self.cxx.link_flags += ['-lc++experimental']
         if self.link_shared:
             self.cxx.link_flags += ['-lc++']
-        elif self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':
-            cxx_library_root = self.get_lit_conf('cxx_library_root')
-            if cxx_library_root:
-                libname = self.make_static_lib_name('c++')
-                abs_path = os.path.join(cxx_library_root, libname)
-                assert os.path.exists(abs_path) and \
-                       "static libc++ library does not exist"
-                self.cxx.link_flags += [abs_path]
-            else:
-                self.cxx.link_flags += ['-lc++']
+        # Device code does not have binary components, don't link libc++
+        # elif self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':
+        #     cxx_library_root = self.get_lit_conf('cxx_library_root')
+        #     if cxx_library_root:
+        #         libname = self.make_static_lib_name('c++')
+        #         abs_path = os.path.join(cxx_library_root, libname)
+        #         assert os.path.exists(abs_path) and \
+        #                "static libc++ library does not exist"
+        #         self.cxx.link_flags += [abs_path]
+        #     else:
+        #         self.cxx.link_flags += ['-lc++']
 
     def configure_link_flags_abi_library(self):
         cxx_abi = self.get_lit_conf('cxx_abi', 'libcxxabi')
diff --git a/libcudacxx/CMakeLists.txt b/libcudacxx/CMakeLists.txt
index 784c57aa02..a5343d53db 100644
--- a/libcudacxx/CMakeLists.txt
+++ b/libcudacxx/CMakeLists.txt
@@ -1,4 +1,5 @@
 # 3.15 is the minimum for including the project with add_subdirectory.
+# 3.18 for C++17 + CUDA and clang-cuda support.
 # 3.21 is the minimum for the developer build.
 cmake_minimum_required(VERSION 3.15)
 

From 4d4616b5aa72650d553c9b80c623268359513f97 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Tue, 26 Sep 2023 16:30:32 -0700
Subject: [PATCH 23/64] Do not build RDC tests for Clang CUDA

---
 ci/build_cub.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ci/build_cub.sh b/ci/build_cub.sh
index 12b7122a9b..3b425f2dab 100755
--- a/ci/build_cub.sh
+++ b/ci/build_cub.sh
@@ -13,7 +13,9 @@ version_compare() {
 }
 
 ENABLE_CUB_BENCHMARKS="false"
+ENABLE_CUB_RDC="false"
 if [[ "$CUDA_COMPILER" == *nvcc* ]]; then
+    ENABLE_CUB_RDC="true"
     NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-)
     if [[ $(version_compare $NVCC_VERSION 11.5) == "true" ]]; then
         ENABLE_CUB_BENCHMARKS="true"
@@ -37,6 +39,7 @@ CMAKE_OPTIONS="
     -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=ON \
     -DCUB_IGNORE_DEPRECATED_CPP_DIALECT=ON \
     -DCUB_ENABLE_BENCHMARKS="$ENABLE_CUB_BENCHMARKS"\
+    -DCUB_ENABLE_RDC_TESTS="$ENABLE_CUB_RDC" \
 "
 
 configure_and_build "CUB" "$CMAKE_OPTIONS"

From 4e204b20622dfcd10d94f014e5dd82ea11a544b6 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Wed, 27 Sep 2023 00:45:00 +0000
Subject: [PATCH 24/64] Attempt to fix thrust::complex for Clang-CUDA

---
 .../include/cuda/std/detail/libcxx/include/cmath       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath
index ab25d7585e..2855220e72 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath
@@ -674,7 +674,7 @@ __constexpr_isfinite(_A1 __lcpp_x) noexcept
     return isfinite(__lcpp_x);
 }
 
-#if defined(_MSC_VER) || defined(__CUDACC_RTC__)
+#if defined(_MSC_VER) || defined(__CUDACC_RTC__) || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA)
 template <class _A1>
 _LIBCUDACXX_INLINE_VISIBILITY
 _A1 __constexpr_copysign(_A1 __x, _A1 __y) noexcept
@@ -708,7 +708,7 @@ _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 typename __enable_if_t<is_arithmetic<_A1>::val
 }
 #endif // !_MSC_VER
 
-#if defined(_MSC_VER) || defined(__CUDACC_RTC__)
+#if defined(_MSC_VER) || defined(__CUDACC_RTC__) || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA)
 template <class _A1>
 _LIBCUDACXX_INLINE_VISIBILITY
 _A1 __constexpr_fabs(_A1 __x) noexcept
@@ -738,7 +738,7 @@ _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 double __constexpr_fabs(_Tp __x) noexcept {
 }
 #endif // !_MSC_VER
 
-#if defined(_MSC_VER) || defined(__CUDACC_RTC__)
+#if defined(_MSC_VER) || defined(__CUDACC_RTC__) || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA)
 template <class _A1>
 _LIBCUDACXX_INLINE_VISIBILITY
 _A1 __constexpr_fmax(_A1 __x, _A1 __y) noexcept
@@ -797,7 +797,7 @@ __constexpr_fmax(_Tp __x, _Up __y) noexcept {
 }
 #endif // !_MSC_VER
 
-#if defined(_MSC_VER) || defined(__CUDACC_RTC__)
+#if defined(_MSC_VER) || defined(__CUDACC_RTC__) || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA)
 template <class _A1>
 _LIBCUDACXX_INLINE_VISIBILITY
 _A1 __constexpr_logb(_A1 __x)
@@ -834,7 +834,7 @@ _LIBCUDACXX_CONSTEXPR_AFTER_CXX11_COMPLEX _Tp __constexpr_logb(_Tp __x) {
 }
 #endif // !_MSVC
 
-#if defined(_MSC_VER) || defined(__CUDACC_RTC__)
+#if defined(_MSC_VER) || defined(__CUDACC_RTC__) || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA)
 template <class _Tp>
 inline _LIBCUDACXX_INLINE_VISIBILITY
 _Tp __constexpr_scalbn(_Tp __x, int __i) {

From 9acf004872a2ab5d104d56343c8884c79395b43b Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 08:12:49 +0000
Subject: [PATCH 25/64] Fix macro definitions that are nvcc specific

---
 libcudacxx/.upstream-tests/test/support/test_macros.h     | 8 +++++++-
 .../include/cuda/std/detail/libcxx/include/__config       | 2 +-
 libcudacxx/include/cuda/std/detail/libcxx/include/cstdlib | 5 +++--
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/support/test_macros.h b/libcudacxx/.upstream-tests/test/support/test_macros.h
index ea30effa3b..3d77a42137 100644
--- a/libcudacxx/.upstream-tests/test/support/test_macros.h
+++ b/libcudacxx/.upstream-tests/test/support/test_macros.h
@@ -86,6 +86,10 @@
 // compiler.
 # define TEST_COMPILER_NVCC
 # define TEST_COMPILER_EDG
+#elif defined(_NVHPC_CUDA)
+#  define TEST_COMPILER_NVHPC_CUDA
+#elif defined(__CUDA__) && defined(_LIBCUDACXX_COMPILER_CLANG)
+#  define TEST_COMPILER_CLANG_CUDA
 #endif
 
 #if defined(__apple_build_version__)
@@ -398,7 +402,9 @@ constexpr bool unused(T &&) {return true;}
 // Define a helper macro to properly suppress warnings
 #define _TEST_TOSTRING2(x) #x
 #define _TEST_TOSTRING(x) _TEST_TOSTRING2(x)
-#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#if defined(TEST_COMPILER_CLANG_CUDA)
+# define TEST_NV_DIAG_SUPPRESS(WARNING)
+#elif defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
 #if defined (TEST_COMPILER_MSVC)
 # define TEST_NV_DIAG_SUPPRESS(WARNING) __pragma(_TEST_TOSTRING(nv_diag_suppress WARNING))
 #else // ^^^ MSVC ^^^ / vvv not MSVC vvv
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config
index 29cf177366..93efdd983d 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config
@@ -864,7 +864,7 @@ extern "C++" {
 
 #define _LIBCUDACXX_TOSTRING2(_STR) #_STR
 #define _LIBCUDACXX_TOSTRING(_STR) _LIBCUDACXX_TOSTRING2(_STR)
-#if defined(_LIBCUDACXX_CUDACC)
+#if defined(_LIBCUDACXX_CUDACC) && !defined(_LIBCUDACXX_COMPILER_CLANG_CUDA)
 #  if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
 #    if defined(_LIBCUDACXX_COMPILER_MSVC)
 #      define _LIBCUDACXX_NV_DIAG_SUPPRESS(_WARNING) __pragma(_LIBCUDACXX_TOSTRING(nv_diag_suppress _WARNING))
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/cstdlib b/libcudacxx/include/cuda/std/detail/libcxx/include/cstdlib
index cd6768e0e2..f1659f5755 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/cstdlib
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/cstdlib
@@ -98,8 +98,9 @@ void *aligned_alloc(size_t alignment, size_t size);                       // C11
 #pragma GCC system_header
 #endif
 
-
-#ifdef __CUDA_ARCH__
+#if defined(_LIBCUDACXX_COMPILER_CLANG_CUDA)
+#  define _LIBCUDACXX_UNREACHABLE() __builtin_unreachable()
+#elif defined(__CUDA_ARCH__)
 #if defined(_LIBCUDACXX_CUDACC_BELOW_11_2)
 #  define _LIBCUDACXX_UNREACHABLE() __trap()
 #elif defined(_LIBCUDACXX_CUDACC_BELOW_11_3)

From ff7a43aa3bfaff413bd430962b350845ae5309e3 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 08:13:14 +0000
Subject: [PATCH 26/64] Add missing header that is otherwise coming from the
 cuda side

---
 .../cuda/std/detail/libcxx/include/__memory/construct_at.h    | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__memory/construct_at.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__memory/construct_at.h
index 44b747efd9..a0577fac22 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__memory/construct_at.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__memory/construct_at.h
@@ -27,6 +27,10 @@
 #include "../__utility/forward.h"
 #include "../__utility/move.h"
 
+#ifdef _LIBCUDACXX_COMPILER_CLANG_CUDA
+#include <new>
+#endif // _LIBCUDACXX_COMPILER_CLANG_CUDA
+
 #if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER)
 #pragma GCC system_header
 #endif

From 1f3e8c438005e6826fc47632b3ae1ab994908e01 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 08:13:33 +0000
Subject: [PATCH 27/64] Fix invalid initialization order in constructor

---
 libcudacxx/include/cuda/std/detail/libcxx/include/barrier | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier
index 2f0afb3d2b..ab4c0fb851 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier
@@ -302,7 +302,7 @@ public:
 
     _LIBCUDACXX_INLINE_VISIBILITY
     __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
-        : __phase(false), __expected(__expected), __arrived(__expected), __completion(__completion)
+        : __expected(__expected), __arrived(__expected), __completion(__completion), __phase(false)
     {
     }
 

From 7feab38147da4342f8ebb9bd50754ba1c5d00f2c Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 08:13:54 +0000
Subject: [PATCH 28/64] Fix clang-cuda being picky about attribute orders

---
 .../test/support/cuda_space_selector.h           |  2 +-
 .../libcxx/include/__expected/unexpected.h       | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h b/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h
index 1ba4d41948..cf145267ff 100644
--- a/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h
+++ b/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h
@@ -97,7 +97,7 @@ struct device_shared_memory_provider {
 
     __device__
     T * get() {
-        __shared__ alignas(T) char buffer[shared_offset];
+        alignas(T) __shared__ char buffer[shared_offset];
         return reinterpret_cast<T *>(buffer + prefix_size);
     }
 };
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__expected/unexpected.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__expected/unexpected.h
index 0f8770c800..297bebed1d 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__expected/unexpected.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__expected/unexpected.h
@@ -97,23 +97,23 @@ class unexpected {
   constexpr unexpected& operator=(unexpected&&)      = default;
 
   // [expected.un.obs]
-  _LIBCUDACXX_INLINE_VISIBILITY
-  _LIBCUDACXX_NODISCARD_ATTRIBUTE constexpr const _Err& error() const& noexcept {
+  _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
+  constexpr const _Err& error() const& noexcept {
     return __unex_;
   }
 
-  _LIBCUDACXX_INLINE_VISIBILITY
-  _LIBCUDACXX_NODISCARD_ATTRIBUTE constexpr _Err& error() & noexcept {
+  _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
+  constexpr _Err& error() & noexcept {
     return __unex_;
   }
 
-  _LIBCUDACXX_INLINE_VISIBILITY
-  _LIBCUDACXX_NODISCARD_ATTRIBUTE constexpr const _Err&& error() const&& noexcept {
+  _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
+  constexpr const _Err&& error() const&& noexcept {
     return _CUDA_VSTD::move(__unex_);
   }
 
-  _LIBCUDACXX_INLINE_VISIBILITY
-  _LIBCUDACXX_NODISCARD_ATTRIBUTE constexpr _Err&& error() && noexcept {
+  _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
+  constexpr _Err&& error() && noexcept {
     return _CUDA_VSTD::move(__unex_);
   }
 

From 9f70dc081f007d9254a2c5e0c29dd5a6de9d32c5 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 08:19:01 +0000
Subject: [PATCH 29/64] clang-cuda requires deduction guides to be marked as
 `__host__ __device__`

---
 .../include/__functional/reference_wrapper.h     |  2 +-
 .../std/detail/libcxx/include/__mdspan/extents.h |  2 +-
 .../std/detail/libcxx/include/__mdspan/mdspan.h  | 16 ++++++++--------
 .../std/detail/libcxx/include/__utility/pair.h   |  2 +-
 .../include/cuda/std/detail/libcxx/include/array |  3 +--
 .../cuda/std/detail/libcxx/include/optional      |  2 +-
 .../include/cuda/std/detail/libcxx/include/span  | 10 +++++-----
 .../include/cuda/std/detail/libcxx/include/tuple | 12 ++++++------
 8 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__functional/reference_wrapper.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__functional/reference_wrapper.h
index d9308b668a..a9cb9a37da 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__functional/reference_wrapper.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__functional/reference_wrapper.h
@@ -60,7 +60,7 @@ class _LIBCUDACXX_TEMPLATE_VIS reference_wrapper : public __weak_result_type<_Tp
 
 #if _LIBCUDACXX_STD_VER > 14 && !defined(_LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES)
 template <class _Tp>
-reference_wrapper(_Tp&) -> reference_wrapper<_Tp>;
+_LIBCUDACXX_HOST_DEVICE reference_wrapper(_Tp&) -> reference_wrapper<_Tp>;
 #endif
 
 template <class _Tp>
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/extents.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/extents.h
index 1f82f3a389..a827c49ce5 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/extents.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/extents.h
@@ -534,7 +534,7 @@ using dextents = typename __detail::__make_dextents<_IndexType, _Rank>::type;
 
 #if defined(__MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION)
 template <class... _IndexTypes>
-extents(_IndexTypes...)
+_LIBCUDACXX_HOST_DEVICE extents(_IndexTypes...)
   // Workaround for nvcc
   //-> extents<size_t, __detail::__make_dynamic_extent<_IndexTypes>()...>;
   // Adding "(void)" so that clang doesn't complain this is unused
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/mdspan.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/mdspan.h
index f5323a72ce..37cc94c732 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/mdspan.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/mdspan.h
@@ -421,41 +421,41 @@ __MDSPAN_TEMPLATE_REQUIRES(
   /* requires */ __MDSPAN_FOLD_AND(_LIBCUDACXX_TRAIT(is_integral, _SizeTypes) /* && ... */) &&
   (sizeof...(_SizeTypes) > 0)
 )
-explicit mdspan(_ElementType*, _SizeTypes...)
+_LIBCUDACXX_HOST_DEVICE explicit mdspan(_ElementType*, _SizeTypes...)
   -> mdspan<_ElementType, dextents<size_t, sizeof...(_SizeTypes)>>;
 
 __MDSPAN_TEMPLATE_REQUIRES(
   class _Pointer,
   (_LIBCUDACXX_TRAIT(is_pointer, _CUDA_VSTD::remove_reference_t<_Pointer>))
 )
-mdspan(_Pointer&&) -> mdspan<_CUDA_VSTD::remove_pointer_t<_CUDA_VSTD::remove_reference_t<_Pointer>>, extents<size_t>>;
+_LIBCUDACXX_HOST_DEVICE  mdspan(_Pointer&&) -> mdspan<_CUDA_VSTD::remove_pointer_t<_CUDA_VSTD::remove_reference_t<_Pointer>>, extents<size_t>>;
 __MDSPAN_TEMPLATE_REQUIRES(
   class _CArray,
   (_LIBCUDACXX_TRAIT(is_array, _CArray) && (rank_v<_CArray> == 1))
 )
-mdspan(_CArray&) -> mdspan<_CUDA_VSTD::remove_all_extents_t<_CArray>, extents<size_t, _CUDA_VSTD::extent_v<_CArray,0>>>;
+_LIBCUDACXX_HOST_DEVICE mdspan(_CArray&) -> mdspan<_CUDA_VSTD::remove_all_extents_t<_CArray>, extents<size_t, _CUDA_VSTD::extent_v<_CArray,0>>>;
 
 template <class _ElementType, class _SizeType, size_t _Np>
-mdspan(_ElementType*, const _CUDA_VSTD::array<_SizeType, _Np>&)
+_LIBCUDACXX_HOST_DEVICE mdspan(_ElementType*, const _CUDA_VSTD::array<_SizeType, _Np>&)
   -> mdspan<_ElementType, dextents<size_t, _Np>>;
 
 template <class _ElementType, class _SizeType, size_t _Np>
-mdspan(_ElementType*, _CUDA_VSTD::span<_SizeType, _Np>)
+_LIBCUDACXX_HOST_DEVICE mdspan(_ElementType*, _CUDA_VSTD::span<_SizeType, _Np>)
   -> mdspan<_ElementType, dextents<size_t, _Np>>;
 
 // This one is necessary because all the constructors take `data_handle_type`s, not
 // `_ElementType*`s, and `data_handle_type` is taken from `accessor_type::data_handle_type`, which
 // seems to throw off automatic deduction guides.
 template <class _ElementType, class _SizeType, size_t... _ExtentsPack>
-mdspan(_ElementType*, const extents<_SizeType, _ExtentsPack...>&)
+_LIBCUDACXX_HOST_DEVICE mdspan(_ElementType*, const extents<_SizeType, _ExtentsPack...>&)
   -> mdspan<_ElementType, extents<_SizeType, _ExtentsPack...>>;
 
 template <class _ElementType, class _MappingType>
-mdspan(_ElementType*, const _MappingType&)
+_LIBCUDACXX_HOST_DEVICE mdspan(_ElementType*, const _MappingType&)
   -> mdspan<_ElementType, typename _MappingType::extents_type, typename _MappingType::layout_type>;
 
 template <class _MappingType, class _AccessorType>
-mdspan(const typename _AccessorType::data_handle_type, const _MappingType&, const _AccessorType&)
+_LIBCUDACXX_HOST_DEVICE mdspan(const typename _AccessorType::data_handle_type, const _MappingType&, const _AccessorType&)
   -> mdspan<typename _AccessorType::element_type, typename _MappingType::extents_type, typename _MappingType::layout_type, _AccessorType>;
 #endif
 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__utility/pair.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__utility/pair.h
index 4e336f746a..bbaecb6ff0 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__utility/pair.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__utility/pair.h
@@ -564,7 +564,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair
 
 #if _LIBCUDACXX_STD_VER > 14 && !defined(_LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES)
 template<class _T1, class _T2>
-pair(_T1, _T2) -> pair<_T1, _T2>;
+_LIBCUDACXX_HOST_DEVICE pair(_T1, _T2) -> pair<_T1, _T2>;
 #endif // _LIBCUDACXX_STD_VER > 14 && !defined(_LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES)
 
 // [pairs.spec], specialized algorithms
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/array b/libcudacxx/include/cuda/std/detail/libcxx/include/array
index 0e9b9d7e64..73f2b2ca86 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/array
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/array
@@ -394,8 +394,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS array<_Tp, 0>
 template<class _Tp, class... _Args,
          class = __enable_if_t<(is_same_v<_Tp, _Args> && ...), void>
          >
-array(_Tp, _Args...)
-  -> array<_Tp, 1 + sizeof...(_Args)>;
+_LIBCUDACXX_HOST_DEVICE array(_Tp, _Args...) -> array<_Tp, 1 + sizeof...(_Args)>;
 #endif
 
 template <class _Tp, size_t _Size>
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/optional b/libcudacxx/include/cuda/std/detail/libcxx/include/optional
index 7c911fb17e..7b1df08de3 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/optional
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/optional
@@ -1148,7 +1148,7 @@ public:
 
 #if _LIBCUDACXX_STD_VER > 14 && !defined(_LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES)
 template<class _Tp>
-    optional(_Tp) -> optional<_Tp>;
+_LIBCUDACXX_HOST_DEVICE optional(_Tp) -> optional<_Tp>;
 #endif
 
 // Comparisons between optionals
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/span b/libcudacxx/include/cuda/std/detail/libcxx/include/span
index 60a8b74f14..999d05b463 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/span
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/span
@@ -554,19 +554,19 @@ auto as_writable_bytes(span<_Tp, _Extent> __s) noexcept
 
 //  Deduction guides
 template<class _Tp, size_t _Sz>
-    span(_Tp (&)[_Sz]) -> span<_Tp, _Sz>;
+_LIBCUDACXX_HOST_DEVICE span(_Tp (&)[_Sz]) -> span<_Tp, _Sz>;
 
 template<class _Tp, size_t _Sz>
-    span(array<_Tp, _Sz>&) -> span<_Tp, _Sz>;
+_LIBCUDACXX_HOST_DEVICE span(array<_Tp, _Sz>&) -> span<_Tp, _Sz>;
 
 template<class _Tp, size_t _Sz>
-    span(const array<_Tp, _Sz>&) -> span<const _Tp, _Sz>;
+_LIBCUDACXX_HOST_DEVICE span(const array<_Tp, _Sz>&) -> span<const _Tp, _Sz>;
 
 template<class _Container>
-    span(_Container&) -> span<typename _Container::value_type>;
+_LIBCUDACXX_HOST_DEVICE span(_Container&) -> span<typename _Container::value_type>;
 
 template<class _Container>
-    span(const _Container&) -> span<const typename _Container::value_type>;
+_LIBCUDACXX_HOST_DEVICE span(const _Container&) -> span<const typename _Container::value_type>;
 
 #endif // _LIBCUDACXX_STD_VER > 14
 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
index 6dac4ab7c4..53dcbd10d7 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
@@ -1029,16 +1029,16 @@ public:
 
 #ifndef _LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES
 template <class ..._Tp>
-tuple(_Tp...) -> tuple<_Tp...>;
+_LIBCUDACXX_HOST_DEVICE tuple(_Tp...) -> tuple<_Tp...>;
 template <class _Tp1, class _Tp2>
-tuple(pair<_Tp1, _Tp2>) -> tuple<_Tp1, _Tp2>;
+_LIBCUDACXX_HOST_DEVICE tuple(pair<_Tp1, _Tp2>) -> tuple<_Tp1, _Tp2>;
 template <class _Alloc, class ..._Tp>
-tuple(allocator_arg_t, _Alloc, _Tp...) -> tuple<_Tp...>;
+_LIBCUDACXX_HOST_DEVICE tuple(allocator_arg_t, _Alloc, _Tp...) -> tuple<_Tp...>;
 template <class _Alloc, class _Tp1, class _Tp2>
-tuple(allocator_arg_t, _Alloc, pair<_Tp1, _Tp2>) -> tuple<_Tp1, _Tp2>;
+_LIBCUDACXX_HOST_DEVICE tuple(allocator_arg_t, _Alloc, pair<_Tp1, _Tp2>) -> tuple<_Tp1, _Tp2>;
 template <class _Alloc, class ..._Tp>
-tuple(allocator_arg_t, _Alloc, tuple<_Tp...>) -> tuple<_Tp...>;
-#endif
+_LIBCUDACXX_HOST_DEVICE tuple(allocator_arg_t, _Alloc, tuple<_Tp...>) -> tuple<_Tp...>;
+#endif // _LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES
 
 template <class ..._Tp>
 inline _LIBCUDACXX_INLINE_VISIBILITY

From 621ca51f38014795dcbce67d33853c4b258f6c46 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 08:29:38 +0000
Subject: [PATCH 30/64] Fix some warnings about unused typedefs

---
 .../mdspan/mdspan.layout.left.cons/list_init.pass.cpp    | 3 ---
 .../views/mdspan/mdspan.layout.left.obs/compare.pass.cpp | 1 -
 .../mdspan/mdspan.layout.right.obs/compare.pass.cpp      | 1 -
 .../mdspan/mdspan.layout.stride.cons/list_init.pass.cpp  | 9 ---------
 .../mdspan/mdspan.layout.stride.obs/compare.pass.cpp     | 1 -
 .../mdspan.mdspan.cons/array_init_extents.pass.cpp       | 3 +--
 .../mdspan/mdspan.mdspan.cons/custom_layout.pass.cpp     | 2 --
 .../time.cal.ym/time.cal.ym.nonmembers/minus.pass.cpp    | 1 +
 .../plus_minus_equal_month.pass.cpp                      | 1 -
 .../time.cal.ymwd.members/plus_minus_equal_year.pass.cpp | 2 --
 .../time.cal.ymwdlast.members/op_sys_days.pass.cpp       | 2 --
 .../time.cal.ymwdlast.nonmembers/plus.pass.cpp           | 1 -
 12 files changed, 2 insertions(+), 25 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.cons/list_init.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.cons/list_init.pass.cpp
index d6d6bf9fd6..ff3990ba6a 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.cons/list_init.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.cons/list_init.pass.cpp
@@ -55,9 +55,6 @@ int main(int, char**)
 
     // TEST(TestLayoutLeftListInitialization, test_layout_left_extent_initialization)
     {
-        typedef int    data_t ;
-        typedef size_t index_t;
-
         cuda::std::layout_left::mapping<cuda::std::extents<size_t,dyn, dyn>> m{cuda::std::dextents<size_t,2>{16, 32}};
 
         static_assert( m.is_exhaustive()          == true, "" );
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.obs/compare.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.obs/compare.pass.cpp
index 7eb9d24379..adeb07be0b 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.obs/compare.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.obs/compare.pass.cpp
@@ -35,7 +35,6 @@ int main(int, char**)
     typed_test_compare_left();
 
     using index_t = size_t;
-    using ext1d_t = cuda::std::extents<index_t,dyn>;
     using ext2d_t = cuda::std::extents<index_t,dyn,dyn>;
 
     {
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.right.obs/compare.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.right.obs/compare.pass.cpp
index 099ff5fb8c..b1eabbf9d8 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.right.obs/compare.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.right.obs/compare.pass.cpp
@@ -35,7 +35,6 @@ int main(int, char**)
     typed_test_compare_right();
 
     using index_t = size_t;
-    using ext1d_t = cuda::std::extents<index_t,dyn>;
     using ext2d_t = cuda::std::extents<index_t,dyn,dyn>;
 
     {
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.cons/list_init.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.cons/list_init.pass.cpp
index 7b84d1656e..90a6e42016 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.cons/list_init.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.cons/list_init.pass.cpp
@@ -31,11 +31,6 @@ int main(int, char**)
 {
     // From a span
     {
-        typedef int    data_t ;
-        typedef size_t index_t;
-
-        using my_ext = typename cuda::std::extents<size_t,dyn>;
-
         cuda::std::array<int,2> a{1, 128};
         cuda::std::span <int,2> s(a.data(), 2);
         cuda::std::layout_stride::mapping<cuda::std::extents<size_t,dyn, dyn>> m{cuda::std::dextents<size_t,2>{16, 32}, s};
@@ -45,9 +40,6 @@ int main(int, char**)
 
     // TEST(TestLayoutStrideListInitialization, test_list_initialization)
     {
-        typedef int    data_t ;
-        typedef size_t index_t;
-
         cuda::std::layout_stride::mapping<cuda::std::extents<size_t,dyn, dyn>> m{cuda::std::dextents<size_t,2>{16, 32}, cuda::std::array<int,2>{1, 128}};
 
         CHECK_MAPPING(m);
@@ -55,7 +47,6 @@ int main(int, char**)
 
     // From another mapping
     {
-        typedef int    data_t ;
         typedef size_t index_t;
 
         cuda::std::layout_stride::mapping<cuda::std::extents<index_t,dyn, dyn>> m0{cuda::std::dextents<index_t,2>{16, 32}, cuda::std::array<int,2>{1, 128}};
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.obs/compare.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.obs/compare.pass.cpp
index 76ac945258..4d40d56884 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.obs/compare.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.obs/compare.pass.cpp
@@ -18,7 +18,6 @@ constexpr auto dyn = cuda::std::dynamic_extent;
 int main(int, char**)
 {
     using index_t = int;
-    using ext1d_t = cuda::std::extents<index_t,dyn>;
     using ext2d_t = cuda::std::extents<index_t,dyn,dyn>;
 
     {
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/array_init_extents.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/array_init_extents.pass.cpp
index 824b3281cf..61816bc921 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/array_init_extents.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/array_init_extents.pass.cpp
@@ -84,8 +84,7 @@ int main(int, char**)
 
     // Constraint: N == rank() || N == rank_dynamic() is true
     {
-        using      mdspan_t = cuda::std::mdspan< int, cuda::std::extents< int, dyn, dyn > >;
-        using other_index_t = int;
+        using mdspan_t = cuda::std::mdspan< int, cuda::std::extents< int, dyn, dyn > >;
 
         static_assert( is_array_cons_avail_v< mdspan_t, int *, int, 1 > == false, "" );
     }
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/custom_layout.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/custom_layout.pass.cpp
index 0fd8e2b06b..d82aea6f71 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/custom_layout.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/custom_layout.pass.cpp
@@ -19,8 +19,6 @@ constexpr auto dyn = cuda::std::dynamic_extent;
 
 int main(int, char**)
 {
-    using map_t = Foo::layout_foo::template mapping<cuda::std::dextents<size_t ,2>>;
-
     {
         using  data_t = int;
         using   lay_t = Foo::layout_foo;
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/minus.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/minus.pass.cpp
index 8ee30c387b..609d250946 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/minus.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/minus.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++98, c++03, c++11
+// UNSUPPORTED: clang && (!nvcc)
 // XFAIL: *
 
 // <chrono>
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_month.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_month.pass.cpp
index 9b93078d65..cd7965fa4c 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_month.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_month.pass.cpp
@@ -39,7 +39,6 @@ int main(int, char**)
 {
     using year               = cuda::std::chrono::year;
     using month              = cuda::std::chrono::month;
-    using weekday            = cuda::std::chrono::weekday;
     using weekday_indexed    = cuda::std::chrono::weekday_indexed;
     using year_month_weekday = cuda::std::chrono::year_month_weekday;
     using months             = cuda::std::chrono::months;
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_year.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_year.pass.cpp
index 3b486988e4..33948e7f17 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_year.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_year.pass.cpp
@@ -38,8 +38,6 @@ constexpr bool testConstexpr(D d1)
 int main(int, char**)
 {
     using year               = cuda::std::chrono::year;
-    using month              = cuda::std::chrono::month;
-    using weekday            = cuda::std::chrono::weekday;
     using weekday_indexed    = cuda::std::chrono::weekday_indexed;
     using year_month_weekday = cuda::std::chrono::year_month_weekday;
     using years              = cuda::std::chrono::years;
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.members/op_sys_days.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.members/op_sys_days.pass.cpp
index a9bbbf4eb0..a18d25e311 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.members/op_sys_days.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.members/op_sys_days.pass.cpp
@@ -24,11 +24,9 @@
 int main(int, char**)
 {
     using year                    = cuda::std::chrono::year;
-    using month                   = cuda::std::chrono::month;
     using year_month_weekday_last = cuda::std::chrono::year_month_weekday_last;
     using sys_days                = cuda::std::chrono::sys_days;
     using days                    = cuda::std::chrono::days;
-    using weekday                 = cuda::std::chrono::weekday;
     using weekday_last            = cuda::std::chrono::weekday_last;
 
     ASSERT_NOEXCEPT(                    static_cast<sys_days>(cuda::std::declval<const year_month_weekday_last>()));
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/plus.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/plus.pass.cpp
index 9c0ff4c9be..5d730546fb 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/plus.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/plus.pass.cpp
@@ -53,7 +53,6 @@ int main(int, char**)
 {
     using year                    = cuda::std::chrono::year;
     using month                   = cuda::std::chrono::month;
-    using weekday                 = cuda::std::chrono::weekday;
     using weekday_last            = cuda::std::chrono::weekday_last;
     using year_month_weekday_last = cuda::std::chrono::year_month_weekday_last;
     using years                   = cuda::std::chrono::years;

From 580242447bf514eaae14a0825f1145dc13cfdadb Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 08:41:11 +0000
Subject: [PATCH 31/64] Fix invalid ifdefs

We want to ifdef based on the available features, not on some compiler combination.
---
 .../meta/meta.rel/is_invocable.pass.cpp       | 19 ++++-
 .../meta.rel/is_nothrow_invocable.pass.cpp    | 10 ++-
 .../meta.trans.other/result_of.fail.cpp       |  3 +-
 .../meta.trans.other/result_of.pass.cpp       | 85 ++++++++++---------
 .../meta.trans.other/result_of11.pass.cpp     | 14 +--
 .../cuda/std/detail/libcxx/include/atomic     |  4 +-
 6 files changed, 82 insertions(+), 53 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp
index 9407deddda..64d574e557 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp
@@ -8,9 +8,6 @@
 
 // UNSUPPORTED: c++98, c++03, c++11
 
-// XFAIL: nvcc
-// FIXME: Triage and fix this.
-
 // type_traits
 
 // is_invocable
@@ -24,8 +21,12 @@
 #include <cuda/std/type_traits>
 // NOTE: These headers are not currently supported by libcu++.
 #include <cuda/std/functional>
+#ifdef _LIBCUDACXX_HAS_MEMORY
 #include <cuda/std/memory>
+#endif // _LIBCUDACXX_HAS_MEMORY
+#ifdef _LIBCUDACXX_HAS_VECTOR
 #include <cuda/std/vector>
+#endif // _LIBCUDACXX_HAS_VECTOR
 
 #include "test_macros.h"
 
@@ -74,9 +75,11 @@ int main(int, char**) {
     static_assert(!cuda::std::is_invocable<const int&>::value, "");
     static_assert(!cuda::std::is_invocable<int&&>::value, "");
 
+#ifdef _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_invocable<cuda::std::vector<int> >::value, "");
     static_assert(!cuda::std::is_invocable<cuda::std::vector<int*> >::value, "");
     static_assert(!cuda::std::is_invocable<cuda::std::vector<int**> >::value, "");
+#endif // _LIBCUDACXX_HAS_VECTOR
 
     static_assert(!cuda::std::is_invocable<AbominableFunc>::value, "");
 
@@ -109,9 +112,11 @@ int main(int, char**) {
     static_assert(!cuda::std::is_invocable_r<int, const int&>::value, "");
     static_assert(!cuda::std::is_invocable_r<int, int&&>::value, "");
 
+#ifdef _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_invocable_r<int, cuda::std::vector<int> >::value, "");
     static_assert(!cuda::std::is_invocable_r<int, cuda::std::vector<int*> >::value, "");
     static_assert(!cuda::std::is_invocable_r<int, cuda::std::vector<int**> >::value, "");
+#endif // _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_invocable_r<void, AbominableFunc>::value, "");
 
     //  with parameters
@@ -154,12 +159,16 @@ int main(int, char**) {
       using T = Tag*;
       using DT = DerFromTag*;
       using CT = const Tag*;
+#ifdef _LIBCUDACXX_HAS_MEMORY
       using ST = cuda::std::unique_ptr<Tag>;
+#endif // _LIBCUDACXX_HAS_MEMORY
       static_assert(cuda::std::is_invocable<Fn, T&, int>::value, "");
       static_assert(cuda::std::is_invocable<Fn, DT&, int>::value, "");
       static_assert(cuda::std::is_invocable<Fn, const T&, int>::value, "");
       static_assert(cuda::std::is_invocable<Fn, T&&, int>::value, "");
+#ifdef _LIBCUDACXX_HAS_MEMORY
       static_assert(cuda::std::is_invocable<Fn, ST, int>::value, "");
+#endif // _LIBCUDACXX_HAS_MEMORY
       static_assert(!cuda::std::is_invocable<Fn, CT&, int>::value, "");
       static_assert(!cuda::std::is_invocable<RFn, T, int>::value, "");
     }
@@ -191,12 +200,16 @@ int main(int, char**) {
       using T = Tag*;
       using DT = DerFromTag*;
       using CT = const Tag*;
+#ifdef _LIBCUDACXX_HAS_MEMORY
       using ST = cuda::std::unique_ptr<Tag>;
+#endif // _LIBCUDACXX_HAS_MEMORY
       static_assert(cuda::std::is_invocable<Fn, T&>::value, "");
       static_assert(cuda::std::is_invocable<Fn, DT&>::value, "");
       static_assert(cuda::std::is_invocable<Fn, const T&>::value, "");
       static_assert(cuda::std::is_invocable<Fn, T&&>::value, "");
+#ifdef _LIBCUDACXX_HAS_MEMORY
       static_assert(cuda::std::is_invocable<Fn, ST>::value, "");
+#endif // _LIBCUDACXX_HAS_MEMORY
       static_assert(cuda::std::is_invocable<Fn, CT&>::value, "");
     }
   }
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
index 6f486296c0..1c21592fd1 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
@@ -8,16 +8,14 @@
 
 // UNSUPPORTED: c++98, c++03, c++11
 
-// XFAIL: nvcc
-// FIXME: Triage and fix this.
-
 // type_traits
 
 // is_nothrow_invocable
 
 #include <cuda/std/type_traits>
-// NOTE: This header is not currently supported by libcu++.
+#ifdef _LIBCUDACXX_HAS_VECTOR
 #include <cuda/std/vector>
+#endif // _LIBCUDACXX_HAS_VECTOR
 
 #include "test_macros.h"
 
@@ -106,12 +104,14 @@ int main(int, char**) {
     static_assert(!cuda::std::is_nothrow_invocable<const int&>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable<int&&>::value, "");
 
+#ifdef _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_nothrow_invocable<int, cuda::std::vector<int> >::value,
                   "");
     static_assert(!cuda::std::is_nothrow_invocable<int, cuda::std::vector<int*> >::value,
                   "");
     static_assert(!cuda::std::is_nothrow_invocable<int, cuda::std::vector<int**> >::value,
                   "");
+#endif // _LIBCUDACXX_HAS_VECTOR
 
     static_assert(!cuda::std::is_nothrow_invocable<AbominableFunc>::value, "");
 
@@ -145,12 +145,14 @@ int main(int, char**) {
     static_assert(!cuda::std::is_nothrow_invocable_r<int, const int&>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable_r<int, int&&>::value, "");
 
+#ifdef _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_nothrow_invocable_r<int, cuda::std::vector<int> >::value,
                   "");
     static_assert(!cuda::std::is_nothrow_invocable_r<int, cuda::std::vector<int*> >::value,
                   "");
     static_assert(!cuda::std::is_nothrow_invocable_r<int, cuda::std::vector<int**> >::value,
                   "");
+#endif // _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_nothrow_invocable_r<void, AbominableFunc>::value,
                   "");
 
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.fail.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.fail.cpp
index 7a2079503e..f435a5246e 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.fail.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.fail.cpp
@@ -8,6 +8,7 @@
 
 // Mandates: invoke result must fail to compile when used with device lambdas.
 // UNSUPPORTED: nvrtc
+// UNSUPPORTED: clang && (!nvcc)
 
 // <cuda/std/functional>
 
@@ -31,7 +32,7 @@ void test_lambda(Fn &&)
 
 int main(int, char**)
 {
-#if defined(__NVCC__) 
+#if defined(TEST_COMPILER_NVCC)
     { // extended device lambda
     test_lambda<int>([] __device__ () -> int { return 42; });
     test_lambda<double>([] __device__ () -> double { return 42.0; });
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.pass.cpp
index 7dd4e49d63..aa3f40d753 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.pass.cpp
@@ -14,7 +14,9 @@
 #define _LIBCUDACXX_DISABLE_DEPRECATION_WARNINGS
 
 #include <cuda/std/type_traits>
-// #include <cuda/std/memory>
+#ifdef _LIBCUDACXX_HAS_MEMORY
+#include <cuda/std/memory>
+#endif // _LIBCUDACXX_HAS_MEMORY
 #include <cuda/std/cassert>
 #include <cuda/functional>
 #include "test_macros.h"
@@ -168,74 +170,80 @@ int main(int, char**)
     test_result_of<PMS0(                             S&),  int> ();
     test_result_of<PMS0(                             S*),  int> ();
     test_result_of<PMS0(                             S*&), int> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+
     test_result_of<PMS0(      cuda::std::reference_wrapper<S>),  int> ();
     test_result_of<PMS0(const cuda::std::reference_wrapper<S>&), int> ();
     test_result_of<PMS0(      cuda::std::reference_wrapper<SD>),  int> ();
     test_result_of<PMS0(const cuda::std::reference_wrapper<SD>&), int> ();
+
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS0(cuda::std::unique_ptr<S>),  int> ();
     test_result_of<PMS0(cuda::std::unique_ptr<SD>), int> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS0(const          S&)>();
     test_no_result<PMS0(volatile       S&)>();
     test_no_result<PMS0(const volatile S&)>();
     test_no_result<PMS0(ND &                           )>();
     test_no_result<PMS0(const ND&                      )>();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS0(cuda::std::unique_ptr<S const>       )>();
+    test_no_result<PMS0(cuda::std::unique_ptr<ND>            )>();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS0(cuda::std::reference_wrapper<S const>)>();
     test_no_result<PMS0(cuda::std::reference_wrapper<ND>     )>();
-    test_no_result<PMS0(cuda::std::unique_ptr<ND>            )>();
-#endif
 
     test_result_of<PMS1(                             S,   int), int*> ();
     test_result_of<PMS1(                             S&,  int), int*> ();
     test_result_of<PMS1(                             S*,  int), int*> ();
     test_result_of<PMS1(                             S*&, int), int*> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS1(cuda::std::unique_ptr<S>,               int), int*> ();
     test_result_of<PMS1(cuda::std::unique_ptr<SD>,              int), int*> ();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS1(cuda::std::reference_wrapper<S>,        int), int*> ();
     test_result_of<PMS1(const cuda::std::reference_wrapper<S>&, int), int*> ();
     test_result_of<PMS1(cuda::std::reference_wrapper<SD>,        int), int*> ();
     test_result_of<PMS1(const cuda::std::reference_wrapper<SD>&, int), int*> ();
-#endif
+
     test_no_result<PMS1(const          S&, int)>();
     test_no_result<PMS1(volatile       S&, int)>();
     test_no_result<PMS1(const volatile S&, int)>();
     test_no_result<PMS1(ND &,                            int)>();
     test_no_result<PMS1(const ND&,                       int)>();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS1(cuda::std::unique_ptr<S const>,        int)>();
+    test_no_result<PMS1(cuda::std::unique_ptr<ND>,             int)>();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS1(cuda::std::reference_wrapper<S const>, int)>();
     test_no_result<PMS1(cuda::std::reference_wrapper<ND>,      int)>();
-    test_no_result<PMS1(cuda::std::unique_ptr<ND>,             int)>();
-#endif
 
     test_result_of<PMS2(               S,   int, int), int&> ();
     test_result_of<PMS2(               S&,  int, int), int&> ();
     test_result_of<PMS2(               S*,  int, int), int&> ();
     test_result_of<PMS2(               S*&, int, int), int&> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS2(cuda::std::unique_ptr<S>, int, int), int&> ();
     test_result_of<PMS2(cuda::std::unique_ptr<SD>, int, int), int&> ();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS2(cuda::std::reference_wrapper<S>,         int, int), int&> ();
     test_result_of<PMS2(const cuda::std::reference_wrapper<S>&,  int, int), int&> ();
     test_result_of<PMS2(cuda::std::reference_wrapper<SD>,        int, int), int&> ();
     test_result_of<PMS2(const cuda::std::reference_wrapper<SD>&, int, int), int&> ();
-#endif
+
     test_no_result<PMS2(const          S&, int, int)>();
     test_no_result<PMS2(volatile       S&, int, int)>();
     test_no_result<PMS2(const volatile S&, int, int)>();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS2(cuda::std::unique_ptr<S const>,   int, int)>();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS2(cuda::std::reference_wrapper<S const>, int, int)>();
-#endif
+
     test_no_result<PMS2(const ND&,                  int, int)>();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
     test_no_result<PMS2(cuda::std::reference_wrapper<ND>, int, int)>();
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS2(cuda::std::unique_ptr<ND>,        int, int)>();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
 
     test_result_of<PMS3(S&, int), const int &>();
     test_result_of<PMS3(S&, int, long), const int &>();
@@ -251,9 +259,10 @@ int main(int, char**)
     test_result_of<PMS0C(const          S*),  int> ();
     test_result_of<PMS0C(               S*&), int> ();
     test_result_of<PMS0C(const          S*&), int> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS0C(cuda::std::unique_ptr<S>), int> ();
     test_result_of<PMS0C(cuda::std::unique_ptr<SD>), int> ();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS0C(cuda::std::reference_wrapper<S>              ), int> ();
     test_result_of<PMS0C(cuda::std::reference_wrapper<const S>        ), int> ();
     test_result_of<PMS0C(const cuda::std::reference_wrapper<S> &      ), int> ();
@@ -262,7 +271,7 @@ int main(int, char**)
     test_result_of<PMS0C(cuda::std::reference_wrapper<const SD>       ), int> ();
     test_result_of<PMS0C(const cuda::std::reference_wrapper<SD> &     ), int> ();
     test_result_of<PMS0C(const cuda::std::reference_wrapper<const SD> &), int> ();
-#endif
+
     test_no_result<PMS0C(volatile       S&)>();
     test_no_result<PMS0C(const volatile S&)>();
 
@@ -273,9 +282,9 @@ int main(int, char**)
     test_result_of<PMS1C(const          S*,  int), int*> ();
     test_result_of<PMS1C(               S*&, int), int*> ();
     test_result_of<PMS1C(const          S*&, int), int*> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS1C(cuda::std::unique_ptr<S>, int), int*> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS1C(volatile       S&, int)>();
     test_no_result<PMS1C(const volatile S&, int)>();
 
@@ -286,9 +295,9 @@ int main(int, char**)
     test_result_of<PMS2C(const          S*,  int, int), int&> ();
     test_result_of<PMS2C(               S*&, int, int), int&> ();
     test_result_of<PMS2C(const          S*&, int, int), int&> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS2C(cuda::std::unique_ptr<S>, int, int), int&> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS2C(volatile       S&, int, int)>();
     test_no_result<PMS2C(const volatile S&, int, int)>();
 
@@ -306,9 +315,9 @@ int main(int, char**)
     test_result_of<PMS0V(volatile       S*),  int> ();
     test_result_of<PMS0V(               S*&), int> ();
     test_result_of<PMS0V(volatile       S*&), int> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS0V(cuda::std::unique_ptr<S>), int> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS0V(const          S&)>();
     test_no_result<PMS0V(const volatile S&)>();
 
@@ -319,9 +328,9 @@ int main(int, char**)
     test_result_of<PMS1V(volatile       S*,  int), int*> ();
     test_result_of<PMS1V(               S*&, int), int*> ();
     test_result_of<PMS1V(volatile       S*&, int), int*> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS1V(cuda::std::unique_ptr<S>, int), int*> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS1V(const          S&, int)>();
     test_no_result<PMS1V(const volatile S&, int)>();
 
@@ -332,9 +341,9 @@ int main(int, char**)
     test_result_of<PMS2V(volatile       S*,  int, int), int&> ();
     test_result_of<PMS2V(               S*&, int, int), int&> ();
     test_result_of<PMS2V(volatile       S*&, int, int), int&> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS2V(cuda::std::unique_ptr<S>, int, int), int&> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS2V(const          S&, int, int)>();
     test_no_result<PMS2V(const volatile S&, int, int)>();
 
@@ -358,9 +367,9 @@ int main(int, char**)
     test_result_of<PMS0CV(const          S*&), int> ();
     test_result_of<PMS0CV(volatile       S*&), int> ();
     test_result_of<PMS0CV(const volatile S*&), int> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS0CV(cuda::std::unique_ptr<S>), int> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
 
     test_result_of<PMS1CV(               S,   int), int*> ();
     test_result_of<PMS1CV(               S&,  int), int*> ();
@@ -375,9 +384,9 @@ int main(int, char**)
     test_result_of<PMS1CV(const          S*&, int), int*> ();
     test_result_of<PMS1CV(volatile       S*&, int), int*> ();
     test_result_of<PMS1CV(const volatile S*&, int), int*> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS1CV(cuda::std::unique_ptr<S>, int), int*> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
 
     test_result_of<PMS2CV(               S,   int, int), int&> ();
     test_result_of<PMS2CV(               S&,  int, int), int&> ();
@@ -392,9 +401,9 @@ int main(int, char**)
     test_result_of<PMS2CV(const          S*&, int, int), int&> ();
     test_result_of<PMS2CV(volatile       S*&, int, int), int&> ();
     test_result_of<PMS2CV(const volatile S*&, int, int), int&> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS2CV(cuda::std::unique_ptr<S>, int, int), int&> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
 
     test_result_of<PMS3CV(S&, int), const int &>();
     test_result_of<PMS3CV(S&, int, long), const int &>();
@@ -414,15 +423,15 @@ int main(int, char**)
     test_result_of<PMD(SD const&), const char&>();
     test_result_of<PMD(SD*), char&>();
     test_result_of<PMD(const SD*), const char&>();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMD(cuda::std::unique_ptr<S>), char &>();
     test_result_of<PMD(cuda::std::unique_ptr<S const>), const char&>();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMD(cuda::std::reference_wrapper<S>), char&>();
     test_result_of<PMD(cuda::std::reference_wrapper<S const>), const char&>();
-#endif
     test_no_result<PMD(ND&)>();
     }
-#if defined(__NVCC__)
+#if defined(TEST_COMPILER_NVCC)
     { // extended lambda
     NV_IF_TARGET(NV_IS_DEVICE,(
         test_lambda<int>([] __host__ __device__ () -> int { return 42; });
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp
index d66353b059..e806d12193 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp
@@ -19,8 +19,10 @@
 #define _LIBCUDACXX_DISABLE_DEPRECATION_WARNINGS
 
 #include <cuda/std/type_traits>
-// #include <cuda/std/memory>
-// #include <cuda/std/utility>
+#ifdef _LIBCUDACXX_HAS_MEMORY
+#include <cuda/std/memory>
+#endif // _LIBCUDACXX_HAS_MEMORY
+#include <cuda/std/utility>
 #include "test_macros.h"
 
 TEST_NV_DIAG_SUPPRESS(3013) // a volatile function parameter is deprecated
@@ -101,17 +103,17 @@ int main(int, char**)
     test_result_of_imp<PMD(FD volatile       ), char &&>();
     test_result_of_imp<PMD(FD const volatile ), char &&>();
 
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#if defined(_LIBCUDACXX_HAS_MEMORY)
     test_result_of_imp<PMD(cuda::std::unique_ptr<F>),        char &>();
     test_result_of_imp<PMD(cuda::std::unique_ptr<F const>),  const char &>();
     test_result_of_imp<PMD(cuda::std::unique_ptr<FD>),       char &>();
     test_result_of_imp<PMD(cuda::std::unique_ptr<FD const>), const char &>();
+#endif // _LIBCUDACXX_HAS_MEMORY
 
     test_result_of_imp<PMD(cuda::std::reference_wrapper<F>),        char &>();
     test_result_of_imp<PMD(cuda::std::reference_wrapper<F const>),  const char &>();
     test_result_of_imp<PMD(cuda::std::reference_wrapper<FD>),       char &>();
     test_result_of_imp<PMD(cuda::std::reference_wrapper<FD const>), const char &>();
-#endif
     }
     {
     test_result_of_imp<int (F::* (F       &)) ()                &, int> ();
@@ -176,12 +178,12 @@ int main(int, char**)
     test_result_of_imp<int (F::* (FD const volatile )) () const volatile &&, int> ();
     }
     {
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
     test_result_of_imp<int (F::* (cuda::std::reference_wrapper<F>))       (),       int>();
     test_result_of_imp<int (F::* (cuda::std::reference_wrapper<const F>)) () const, int>();
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of_imp<int (F::* (cuda::std::unique_ptr<F>       ))       (),       int>();
     test_result_of_imp<int (F::* (cuda::std::unique_ptr<const F> ))       () const, int>();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     }
     test_result_of_imp<decltype(&wat::foo)(wat), void>();
 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic b/libcudacxx/include/cuda/std/detail/libcxx/include/atomic
index 37e3639027..45c2a5a558 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/atomic
@@ -684,7 +684,9 @@ typedef enum memory_order {
 
 template <typename _Tp> _LIBCUDACXX_INLINE_VISIBILITY
 bool __cxx_nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) {
-#if defined(_LIBCUDACXX_COMPILER_NVCC) || defined(_LIBCUDACXX_COMPILER_NVRTC)
+#if defined(_LIBCUDACXX_COMPILER_NVCC) \
+ || defined(_LIBCUDACXX_COMPILER_NVRTC) \
+ || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA)
     return __lhs == __rhs;
 #else
     return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0;

From 523535d00c6df6d2ad1783c05192bd5c89ad5b69 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 08:47:32 +0000
Subject: [PATCH 32/64] Rename shadowing typedefs

---
 .../intseq/intseq.general/integer_seq.pass.cpp   | 10 +++++-----
 .../intseq/intseq.intseq/integer_seq.pass.cpp    | 16 ++++++++--------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.general/integer_seq.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.general/integer_seq.pass.cpp
index b23d8f202a..f6c879d584 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.general/integer_seq.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.general/integer_seq.pass.cpp
@@ -27,7 +27,7 @@ __host__ __device__ auto extract ( const AtContainer &t, const cuda::std::intege
 int main(int, char**)
 {
 //  Make a couple of sequences
-    using int3    = cuda::std::make_integer_sequence<int, 3>;     // generates int:    0,1,2
+    using intseq3 = cuda::std::make_integer_sequence<int, 3>;     // generates int:    0,1,2
     using size7   = cuda::std::make_integer_sequence<size_t, 7>;  // generates size_t: 0,1,2,3,4,5,6
     using size4   = cuda::std::make_index_sequence<4>;            // generates size_t: 0,1,2,3
     using size2   = cuda::std::index_sequence_for<int, size_t>;   // generates size_t: 0,1
@@ -35,8 +35,8 @@ int main(int, char**)
     using sizemix = cuda::std::index_sequence<1, 1, 2, 3, 5>;     // generates size_t: 1,1,2,3,5
 
 //  Make sure they're what we expect
-    static_assert ( cuda::std::is_same<int3::value_type, int>::value, "int3 type wrong" );
-    static_assert ( int3::size () == 3, "int3 size wrong" );
+    static_assert ( cuda::std::is_same<intseq3::value_type, int>::value, "intseq3 type wrong" );
+    static_assert ( intseq3::size () == 3, "intseq3 size wrong" );
 
     static_assert ( cuda::std::is_same<size7::value_type, size_t>::value, "size7 type wrong" );
     static_assert ( size7::size () == 7, "size7 size wrong" );
@@ -56,8 +56,8 @@ int main(int, char**)
     auto tup = cuda::std::make_tuple ( 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 );
 
 //  Use them
-    auto t3 = extract ( tup, int3() );
-    static_assert ( cuda::std::tuple_size<decltype(t3)>::value == int3::size (), "t3 size wrong");
+    auto t3 = extract ( tup, intseq3() );
+    static_assert ( cuda::std::tuple_size<decltype(t3)>::value == intseq3::size (), "t3 size wrong");
     assert ( t3 == cuda::std::make_tuple ( 10, 11, 12 ));
 
     auto t7 = extract ( tup, size7 ());
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.intseq/integer_seq.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.intseq/integer_seq.pass.cpp
index 5d4544a61c..c7c9a154ad 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.intseq/integer_seq.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.intseq/integer_seq.pass.cpp
@@ -28,20 +28,20 @@
 int main(int, char**)
 {
 //  Make a few of sequences
-    using int3    = cuda::std::integer_sequence<int, 3, 2, 1>;
-    using size1   = cuda::std::integer_sequence<cuda::std::size_t, 7>;
-    using ushort2 = cuda::std::integer_sequence<unsigned short, 4, 6>;
-    using bool0   = cuda::std::integer_sequence<bool>;
+    using intseq3    = cuda::std::integer_sequence<int, 3, 2, 1>;
+    using size1      = cuda::std::integer_sequence<cuda::std::size_t, 7>;
+    using ushortseq2 = cuda::std::integer_sequence<unsigned short, 4, 6>;
+    using bool0      = cuda::std::integer_sequence<bool>;
 
 //  Make sure they're what we expect
-    static_assert ( cuda::std::is_same<int3::value_type, int>::value, "int3 type wrong" );
-    static_assert ( int3::size() == 3, "int3 size wrong" );
+    static_assert ( cuda::std::is_same<intseq3::value_type, int>::value, "intseq3 type wrong" );
+    static_assert ( intseq3::size() == 3, "intseq3 size wrong" );
 
     static_assert ( cuda::std::is_same<size1::value_type, cuda::std::size_t>::value, "size1 type wrong" );
     static_assert ( size1::size() == 1, "size1 size wrong" );
 
-    static_assert ( cuda::std::is_same<ushort2::value_type, unsigned short>::value, "ushort2 type wrong" );
-    static_assert ( ushort2::size() == 2, "ushort2 size wrong" );
+    static_assert ( cuda::std::is_same<ushortseq2::value_type, unsigned short>::value, "ushortseq2 type wrong" );
+    static_assert ( ushortseq2::size() == 2, "ushortseq2 size wrong" );
 
     static_assert ( cuda::std::is_same<bool0::value_type, bool>::value, "bool0 type wrong" );
     static_assert ( bool0::size() == 0, "bool0 size wrong" );

From 4085dd70a0e3b7087491f1447180b63f1c294f53 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 09:10:06 +0000
Subject: [PATCH 33/64] Work around compiler crash

---
 .../utilities/function.objects/func.not_fn/not_fn.pass.cpp    | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libcudacxx/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp
index 4bc730c89d..ca489bb096 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp
@@ -668,6 +668,7 @@ void call_operator_noexcept_test()
     }
 }
 
+#ifndef TEST_COMPILER_CLANG_CUDA // https://github.com/llvm/llvm-project/issues/67533
 __host__ __device__
 void test_lwg2767() {
     // See https://cplusplus.github.io/LWG/lwg-defects.html#2767
@@ -681,6 +682,7 @@ void test_lwg2767() {
         assert(b);
     }
 }
+#endif // TEST_COMPILER_CLANG_CUDA
 
 int main(int, char**)
 {
@@ -691,7 +693,9 @@ int main(int, char**)
     call_operator_sfinae_test(); // somewhat of an extension
     // call_operator_forwarding_test();
     call_operator_noexcept_test();
+#ifndef TEST_COMPILER_CLANG_CUDA
     test_lwg2767();
+#endif // TEST_COMPILER_CLANG_CUDA
 
   return 0;
 }

From 71b37bfabaa0a295f4ca55f5557766ef9f34d732 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 09:19:19 +0000
Subject: [PATCH 34/64] Ignore unused private member warning

---
 .../std/concepts/concepts.lang/concept.same/same_as.pass.cpp | 5 +++++
 .../indirect_binary_predicate.compile.pass.cpp               | 4 ++++
 .../indirect_equivalence_relation.compile.pass.cpp           | 4 ++++
 .../indirect_strict_weak_order.compile.pass.cpp              | 4 ++++
 .../indirect_unary_predicate.compile.pass.cpp                | 4 ++++
 5 files changed, 21 insertions(+)

diff --git a/libcudacxx/.upstream-tests/test/std/concepts/concepts.lang/concept.same/same_as.pass.cpp b/libcudacxx/.upstream-tests/test/std/concepts/concepts.lang/concept.same/same_as.pass.cpp
index b0c626e536..1065818852 100644
--- a/libcudacxx/.upstream-tests/test/std/concepts/concepts.lang/concept.same/same_as.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/concepts/concepts.lang/concept.same/same_as.pass.cpp
@@ -16,6 +16,8 @@
 #include <cuda/std/concepts>
 #include <cuda/std/type_traits>
 
+#include "test_macros.h"
+
 using cuda::std::same_as;
 
 struct S1 {};
@@ -35,6 +37,9 @@ struct S5 {
   int* p;
 };
 
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wunused-private-field"
+#endif // TEST_COMPILER_CLANG_CUDA
 class C1 {};
 class C2 {
   /* [[maybe_unused]] */ int i;
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp
index b5923cd7af..70b332be6e 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp
@@ -33,6 +33,10 @@ struct GoodPredicate {
 // Should work when all constraints are satisfied
 static_assert(cuda::std::indirect_binary_predicate<GoodPredicate<It1, It2>, It1, It2>);
 static_assert(cuda::std::indirect_binary_predicate<bool(*)(int, float), int*, float*>);
+
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif // TEST_COMPILER_CLANG_CUDA
 #ifndef __CUDA_ARCH__
 auto lambda = [](int i, long j) { return i == j; };
 static_assert(cuda::std::indirect_binary_predicate<decltype(lambda), int*, int*>);
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp
index cfc3211607..2906a7c42a 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp
@@ -48,6 +48,10 @@ struct GoodRelation {
 // Should work when all constraints are satisfied
 static_assert(cuda::std::indirect_equivalence_relation<GoodRelation<It1, It2>, It1, It2>);
 static_assert(cuda::std::indirect_equivalence_relation<bool(*)(int, long), int*, long*>);
+
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif // TEST_COMPILER_CLANG_CUDA
 #ifndef __CUDA_ARCH__
 auto lambda = [](int i, long j) { return i == j; };
 static_assert(cuda::std::indirect_equivalence_relation<decltype(lambda), int*, long*>);
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp
index 1ba50bc154..1defcd894e 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp
@@ -48,6 +48,10 @@ struct GoodOrder {
 // Should work when all constraints are satisfied
 static_assert(cuda::std::indirect_strict_weak_order<GoodOrder<It1, It2>, It1, It2>);
 static_assert(cuda::std::indirect_strict_weak_order<bool(*)(int, long), int*, long*>);
+
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif // TEST_COMPILER_CLANG_CUDA
 #ifndef __CUDA_ARCH__
 auto lambda = [](int i, long j) { return i == j; };
 static_assert(cuda::std::indirect_strict_weak_order<decltype(lambda), int*, long*>);
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp
index dcbf90dbc0..7f9ddf2792 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp
@@ -30,6 +30,10 @@ struct GoodPredicate {
 // Should work when all constraints are satisfied
 static_assert(cuda::std::indirect_unary_predicate<GoodPredicate<It>, It>);
 static_assert(cuda::std::indirect_unary_predicate<bool(*)(int), int*>);
+
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif // TEST_COMPILER_CLANG_CUDA
 #ifndef __CUDA_ARCH__
 auto lambda = [](int i) { return i % 2 == 0; };
 static_assert(cuda::std::indirect_unary_predicate<decltype(lambda), int*>);

From 0c8b4a04c203bdaf1d5e7448ba62f3440aaa53a5 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 09:28:23 +0000
Subject: [PATCH 35/64] Remove non-compiling code

---
 .../std/language.support/support.types/nullptr_t.pass.cpp     | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/.upstream-tests/test/std/language.support/support.types/nullptr_t.pass.cpp b/libcudacxx/.upstream-tests/test/std/language.support/support.types/nullptr_t.pass.cpp
index 65f9e0841f..8edb3f1a6d 100644
--- a/libcudacxx/.upstream-tests/test/std/language.support/support.types/nullptr_t.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/language.support/support.types/nullptr_t.pass.cpp
@@ -65,7 +65,9 @@ __host__ __device__
 void test_nullptr_conversions() {
 // GCC does not accept this due to CWG Defect #1423
 // http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1423
-#if defined(__clang__) && !defined(TEST_COMPILER_NVCC)
+#if  defined(TEST_COMPILER_CLANG) \
+ && !defined(TEST_COMPILER_NVCC) \
+ && !defined(TEST_COMPILER_CLANG_CUDA)
     {
         bool b = nullptr;
         assert(!b);

From afa9004c111afbf8404b0e2e0eca416614972715 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 12:58:55 +0200
Subject: [PATCH 36/64] Add missing includes

---
 libcudacxx/.upstream-tests/test/heterogeneous/helpers.h       | 1 +
 .../specialized.addressof/addressof.pass.cpp                  | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/helpers.h b/libcudacxx/.upstream-tests/test/heterogeneous/helpers.h
index d0d27200f1..4cf73c24b2 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/helpers.h
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/helpers.h
@@ -11,6 +11,7 @@
 
 #include <cuda/std/type_traits>
 
+#include <cstdio>
 #include <new>
 #include <thread>
 #include <vector>
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/memory/specialized.algorithms/specialized.addressof/addressof.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/memory/specialized.algorithms/specialized.addressof/addressof.pass.cpp
index dbf13dc050..34f156c4c3 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/memory/specialized.algorithms/specialized.addressof/addressof.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/memory/specialized.algorithms/specialized.addressof/addressof.pass.cpp
@@ -16,6 +16,10 @@
 
 #include "test_macros.h"
 
+#ifdef TEST_COMPILER_CLANG_CUDA
+#include <new>
+#endif // TEST_COMPILER_CLANG_CUDA
+
 struct A
 {
     __host__ __device__ void operator&() const {}

From 69fe6edaa2823eec2bf16ed80fc8e76597069b3b Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 11:46:45 +0000
Subject: [PATCH 37/64] Fix tests

---
 .../meta/meta.rel/is_nothrow_invocable.pass.cpp  | 16 ++++++++--------
 .../is_swappable_include_order.pass.cpp          |  3 ++-
 .../std/utilities/time/date.time/ctime.pass.cpp  |  2 ++
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
index 1c21592fd1..eec9c0c75e 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
@@ -22,42 +22,42 @@
 struct Tag {};
 
 struct Implicit {
-  Implicit(int) noexcept {}
+  __host__ __device__ Implicit(int) noexcept {}
 };
 
 struct ThrowsImplicit {
-  ThrowsImplicit(int) {}
+  __host__ __device__ ThrowsImplicit(int) {}
 };
 
 struct Explicit {
-  explicit Explicit(int) noexcept {}
+  __host__ __device__ explicit Explicit(int) noexcept {}
 };
 
 template <bool IsNoexcept, class Ret, class... Args>
 struct CallObject {
-  Ret operator()(Args&&...) const noexcept(IsNoexcept);
+  __host__ __device__ Ret operator()(Args&&...) const noexcept(IsNoexcept);
 };
 
 struct Sink {
   template <class... Args>
-  void operator()(Args&&...) const noexcept {}
+  __host__ __device__ void operator()(Args&&...) const noexcept {}
 };
 
 template <class Fn, class... Args>
-constexpr bool throws_invocable() {
+__host__ __device__ constexpr bool throws_invocable() {
   return cuda::std::is_invocable<Fn, Args...>::value &&
          !cuda::std::is_nothrow_invocable<Fn, Args...>::value;
 }
 
 template <class Ret, class Fn, class... Args>
-constexpr bool throws_invocable_r() {
+__host__ __device__ constexpr bool throws_invocable_r() {
   return cuda::std::is_invocable_r<Ret, Fn, Args...>::value &&
          !cuda::std::is_nothrow_invocable_r<Ret, Fn, Args...>::value;
 }
 
 // FIXME(EricWF) Don't test the where noexcept is *not* part of the type system
 // once implementations have caught up.
-void test_noexcept_function_pointers() {
+__host__ __device__ void test_noexcept_function_pointers() {
   struct Dummy {
     void foo() noexcept {}
     static void bar() noexcept {}
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp
index 408f8e7e6f..37666eb394 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp
@@ -23,7 +23,8 @@
 // This test checks that (1) and (2) see forward declarations
 // for (3).
 #include <cuda/std/type_traits>
-#include <cuda/std/algorithm>
+//#include <cuda/std/algorithm>
+#include <cuda/std/array>
 #include <cuda/std/utility>
 
 #include "test_macros.h"
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/date.time/ctime.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/date.time/ctime.pass.cpp
index 89c9c4efe8..e5c564312e 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/date.time/ctime.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/date.time/ctime.pass.cpp
@@ -51,7 +51,9 @@ int main(int, char**)
     unused(tmspec); // Prevent unused warning
 #endif
 
+#ifndef TEST_COMPILER_CLANG_CUDA
     static_assert((cuda::std::is_same<decltype(cuda::std::clock()), cuda::std::clock_t>::value), "");
+#endif // TEST_COMPILER_CLANG_CUDA
     static_assert((cuda::std::is_same<decltype(cuda::std::difftime(t,t)), double>::value), "");
     static_assert((cuda::std::is_same<decltype(cuda::std::mktime(&tm)), cuda::std::time_t>::value), "");
     static_assert((cuda::std::is_same<decltype(cuda::std::time(&t)), cuda::std::time_t>::value), "");

From c0872b979ff22c56abe6287d1e9db776df220b51 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 11:51:48 +0000
Subject: [PATCH 38/64] We need to force include `force_include.h`

---
 libcudacxx/.upstream-tests/test/CMakeLists.txt  | 6 ++++++
 libcudacxx/.upstream-tests/test/force_include.h | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/.upstream-tests/test/CMakeLists.txt b/libcudacxx/.upstream-tests/test/CMakeLists.txt
index cc49f2b2d4..dddc1df7e0 100644
--- a/libcudacxx/.upstream-tests/test/CMakeLists.txt
+++ b/libcudacxx/.upstream-tests/test/CMakeLists.txt
@@ -79,6 +79,12 @@ if (${CMAKE_CUDA_COMPILER_ID} STREQUAL "NVHPC")
     -stdpar")
 endif()
 
+if ("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang") # Clang as CUDA compiler: append the force-include flags; ID is quoted so an empty value compares as a plain string
+  set(LIBCUDACXX_TEST_COMPILER_FLAGS
+    "${LIBCUDACXX_TEST_COMPILER_FLAGS} \
+    ${LIBCUDACXX_FORCE_INCLUDE}")
+endif()
+
 set(LIBCUDACXX_COMPUTE_ARCHS_STRING
   "${CMAKE_CUDA_ARCHITECTURES}")
 
diff --git a/libcudacxx/.upstream-tests/test/force_include.h b/libcudacxx/.upstream-tests/test/force_include.h
index 9be658711e..fd7a644f8c 100644
--- a/libcudacxx/.upstream-tests/test/force_include.h
+++ b/libcudacxx/.upstream-tests/test/force_include.h
@@ -93,6 +93,6 @@ int main(int argc, char** argv)
     return ret;
 }
 
-#define main fake_main
+#define main __host__ __device__ fake_main
 
 #endif

From 11dd4889a34f8d22c5968632b3c1ba450ae1402b Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 14:56:48 +0200
Subject: [PATCH 39/64] Avoid signed / unsigned warnings

---
 .../.upstream-tests/test/cuda/memcpy_async/group_memcpy_async.h | 2 +-
 .../test/cuda/pipeline_divergent_threads.pass.cpp               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/cuda/memcpy_async/group_memcpy_async.h b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/group_memcpy_async.h
index 89c3850101..5302f0c660 100644
--- a/libcudacxx/.upstream-tests/test/cuda/memcpy_async/group_memcpy_async.h
+++ b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/group_memcpy_async.h
@@ -47,7 +47,7 @@ struct storage
     __host__ __device__
     friend bool operator==(const storage & lhs, const T & rhs) {
         for (cuda::std::size_t i = 0; i < size; ++i) {
-            if (lhs.data[i] != rhs + i) {
+            if (lhs.data[i] != static_cast<T>(rhs + i)) {
                 return false;
             }
         }
diff --git a/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
index 147b263e6c..7aeeca07fd 100644
--- a/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
@@ -30,7 +30,7 @@ constexpr size_t stages_count = 2; // Pipeline with two stages
 // Simply copy shared memory to global out
 __device__ __forceinline__ void compute(int* global_out, int const* shared_in){
     auto block = cooperative_groups::this_thread_block();
-    for (int i = 0; i < block.size(); ++i) {
+    for (int i = 0; i < static_cast<int>(block.size()); ++i) {
         global_out[i] = shared_in[i];
     }
 }

From 0be8b9d5e4b372404aba57f9d5586cfaca03794a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 14:56:59 +0200
Subject: [PATCH 40/64] Properly escape inside asm

---
 .../include/cuda/std/detail/libcxx/include/__cuda/barrier.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index c4354ec020..ba63ff796a 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -356,9 +356,9 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity;
         NV_DISPATCH_TARGET(
             NV_PROVIDES_SM_80, (
                 asm volatile ("{"
-                    ".reg .pred %p;"
-                    "mbarrier.test_wait.parity.shared.b64 %p, [%1], %2;"
-                    "selp.u16 %0, 1, 0, %p;"
+                    ".reg .pred %%p;"
+                    "mbarrier.test_wait.parity.shared.b64 %%p, [%1], %2;"
+                    "selp.u16 %0, 1, 0, %%p;"
                     "}"
                     : "=h"(__ready)
                     : "r"(static_cast<uint32_t>(__cvta_generic_to_shared(&__barrier))),

From c5e2d23c3ac911194c4abb19bf30ef5872cb013f Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 14:57:24 +0200
Subject: [PATCH 41/64] Consider clang-cuda for platform-test

---
 libcudacxx/.upstream-tests/test/cuda/test_platform.pass.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/.upstream-tests/test/cuda/test_platform.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/test_platform.pass.cpp
index 2ae5a470d5..658e33a634 100644
--- a/libcudacxx/.upstream-tests/test/cuda/test_platform.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/test_platform.pass.cpp
@@ -21,11 +21,13 @@
 #  define TEST_NVCC
 #elif defined(__NVCOMPILER)
 #  define TEST_NVCXX
+#elif defined(__CUDA__)
+#  define TEST_CLANG_CUDA
 #else
 #  define TEST_HOST
 #endif
 
-#if defined(TEST_NVCC)
+#if defined(TEST_NVCC) || defined(TEST_CLANG_CUDA)
 
 __host__ __device__ void test() {
 #if defined(__CUDA_ARCH__)

From 5583d2893e9389f160cec72450ebaf7d4088c3f3 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 16:57:43 +0200
Subject: [PATCH 42/64] clang can handle subsumption

---
 .../indirectly_copyable.subsumption.compile.pass.cpp            | 2 +-
 .../indirectly_movable.subsumption.compile.pass.cpp             | 2 +-
 .../iterator.concept.bidir/subsumption.compile.pass.cpp         | 2 +-
 .../iterator.concept.forward/subsumption.compile.pass.cpp       | 2 +-
 .../iterator.concept.inc/subsumption.compile.pass.cpp           | 2 +-
 .../iterator.concept.input/subsumption.compile.pass.cpp         | 2 +-
 .../iterator.concept.iterator/subsumption.compile.pass.cpp      | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.copy/indirectly_copyable.subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.copy/indirectly_copyable.subsumption.compile.pass.cpp
index 386feb269c..36e1a14e4b 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.copy/indirectly_copyable.subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.copy/indirectly_copyable.subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class In, class Out>
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.move/indirectly_movable.subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.move/indirectly_movable.subsumption.compile.pass.cpp
index ad70e0f1af..35bb8051ea 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.move/indirectly_movable.subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.move/indirectly_movable.subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class In, class Out>
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.bidir/subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.bidir/subsumption.compile.pass.cpp
index bdc44634f0..f20403ee8b 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.bidir/subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.bidir/subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class T>
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.forward/subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.forward/subsumption.compile.pass.cpp
index 2e9def18c7..19c90e7bc0 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.forward/subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.forward/subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // cuda::std::forward_iterator;
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.inc/subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.inc/subsumption.compile.pass.cpp
index 16a8f5a8dd..69bb112ea7 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.inc/subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.inc/subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class T>
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.input/subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.input/subsumption.compile.pass.cpp
index 4209a3901a..6012698155 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.input/subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.input/subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class T>
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.iterator/subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.iterator/subsumption.compile.pass.cpp
index 11cd6d3dbb..0f2de25eae 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.iterator/subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.iterator/subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class In>

From 0a2bd94e9d3c6407e624b1cf11bb815bf4eda421 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 16:58:24 +0200
Subject: [PATCH 43/64] Avoid more signed / unsigned issues

---
 .../include/cuda/std/detail/libcxx/include/__cuda/barrier.h     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index ba63ff796a..7096709124 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -226,7 +226,7 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity;
                 int __leader = __ffs(__active) - 1;
                 // All threads in mask synchronize here, establishing cummulativity to the __leader:
                 __syncwarp(__mask);
-                if(__leader == __laneid)
+                if(__leader == static_cast<int>(__laneid))
                 {
                     __token = __barrier.arrive(__inc);
                 }

From d07152067ed6d50b5f96f40c09394941c820642d Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 18:15:06 +0200
Subject: [PATCH 44/64] Escape all the things

---
 .../include/cuda/std/detail/libcxx/include/__cuda/barrier.h     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index 7096709124..7727ba7521 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -222,7 +222,7 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity;
                 int __inc = __popc(__active) * __update;
 
                 unsigned __laneid;
-                asm ("mov.u32 %0, %laneid;" : "=r"(__laneid));
+                asm ("mov.u32 %0, %%laneid;" : "=r"(__laneid));
                 int __leader = __ffs(__active) - 1;
                 // All threads in mask synchronize here, establishing cummulativity to the __leader:
                 __syncwarp(__mask);

From d634859fd3d26023d7316a3f98678f1418a1cf1b Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 18:15:21 +0200
Subject: [PATCH 45/64] Silence more warnings

---
 .../meta/meta.trans/meta.trans.other/result_of11.pass.cpp      | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp
index e806d12193..99b1bfbb66 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp
@@ -26,6 +26,9 @@
 #include "test_macros.h"
 
 TEST_NV_DIAG_SUPPRESS(3013) // a volatile function parameter is deprecated
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wdeprecated-volatile"
+#endif // TEST_COMPILER_CLANG_CUDA
 
 struct wat
 {

From 4223917afb0fc166bd54153aa875084dc4a55a76 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 18:15:40 +0200
Subject: [PATCH 46/64] Remove trailing commata

---
 .../test/heterogeneous/barrier_parity.std.pass.cpp              | 2 +-
 .../.upstream-tests/test/heterogeneous/cuda_atomic_ref.pass.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/barrier_parity.std.pass.cpp b/libcudacxx/.upstream-tests/test/heterogeneous/barrier_parity.std.pass.cpp
index 7446bd67d6..dfa1ed364e 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/barrier_parity.std.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/barrier_parity.std.pass.cpp
@@ -79,7 +79,7 @@ using aw_aw_pw1 = performer_list<
     barrier_arrive_and_wait,
     barrier_arrive_and_wait,
     async_tester_fence,
-    clear_token,
+    clear_token
 >;
 
 using aw_aw_pw2 = performer_list<
diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/cuda_atomic_ref.pass.cpp b/libcudacxx/.upstream-tests/test/heterogeneous/cuda_atomic_ref.pass.cpp
index ec5e00ad08..290d45ab82 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/cuda_atomic_ref.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/cuda_atomic_ref.pass.cpp
@@ -166,7 +166,7 @@ using arithmetic_atomic_testers = extend_tester_list<
     fetch_sub_tester<30, 21, 9>,
     fetch_min_tester<9, 5, 5>,
     fetch_max_tester<5, 9, 9>,
-    fetch_sub_tester<9, 17, -8>,
+    fetch_sub_tester<9, 17, -8>
 >;
 
 using bitwise_atomic_testers = extend_tester_list<

From 8f4beea58862d6618cc0574aa3ec86eab729dcbc Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 18:16:55 +0000
Subject: [PATCH 47/64] Fix two tests that should fail to compile

---
 .../iterators/iterator.container/empty.array.fail.cpp  | 10 +---------
 .../iterator.container/empty.container.fail.cpp        |  1 +
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.array.fail.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.array.fail.cpp
index abdc0e63b1..e93c8e98b6 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.array.fail.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.array.fail.cpp
@@ -16,8 +16,6 @@
 // UNSUPPORTED: c++98, c++03, c++11, c++14, c++17
 // UNSUPPORTED: clang-3.3, clang-3.4, clang-3.5, clang-3.6, clang-3.7, clang-3.8
 
-#if defined(_LIBCUDACXX_HAS_VECTOR)
-#include <cuda/std/vector>
 #include <cuda/std/iterator>
 
 #include "test_macros.h"
@@ -28,10 +26,4 @@ int main(int, char**)
     cuda::std::empty(c);  // expected-error {{ignoring return value of function declared with 'nodiscard' attribute}}
 
   return 0;
-}
-#else
-int main(int, char**)
-{
-  return 0;
-}
-#endif
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
index 1ccefe303a..18b89125c7 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
@@ -15,6 +15,7 @@
 
 // UNSUPPORTED: c++98, c++03, c++11, c++14, c++17
 // UNSUPPORTED: clang-3.3, clang-3.4, clang-3.5, clang-3.6, clang-3.7, clang-3.8
+// XFAIL: c++20
 
 #if defined(_LIBCUDACXX_HAS_VECTOR)
 #include <cuda/std/vector>

From 44febea7c9f9a00f9ddee61469c67b9ae04f034c Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 18:29:21 +0000
Subject: [PATCH 48/64] Fix pipeline divergent threads

clang-cuda really does not like initializing shared variables, so go through the indirection of a pointer instead
---
 .../test/cuda/pipeline_divergent_threads.pass.cpp         | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
index 7aeeca07fd..6490c62bab 100644
--- a/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
@@ -20,6 +20,7 @@
 #include <stdlib.h>
 
 #include "test_macros.h"
+#include "cuda_space_selector.h"
 
 TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)
 TEST_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero
@@ -46,8 +47,11 @@ __global__ void with_staging(int* global_out, int const* global_in, size_t size,
     size_t shared_offset[stages_count] = { 0, block.size() }; // Offsets to each batch
 
     // Allocate shared storage for a two-stage cuda::pipeline:
-    __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, stages_count> shared_state;
-    auto pipeline = cuda::make_pipeline(block, &shared_state);
+    using pipeline_state = cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, stages_count>;
+    __shared__ pipeline_state* shared_state;
+    shared_memory_selector<pipeline_state, constructor_initializer> sel;
+    shared_state = sel.construct();
+    auto pipeline = cuda::make_pipeline(block, shared_state);
 
     // Each thread processes `batch_sz` elements.
     // Compute offset of the batch `batch` of this thread block in global memory:

From 91145f433dd2fb36e8aa580e2c348fcfb2e7f64a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 18:38:21 +0000
Subject: [PATCH 49/64] Disable two tests that rely on managed variables

---
 libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp   | 1 +
 .../.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp b/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp
index 96d27dd5c7..b91177882c 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: nvrtc, pre-sm-70
+// UNSUPPORTED: clang && (!nvcc)
 
 // uncomment for a really verbose output detailing what test steps are being launched
 // #define DEBUG_TESTERS
diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp b/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp
index ae24f76ecf..35c1c593ac 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: nvrtc, pre-sm-70
+// UNSUPPORTED: clang && (!nvcc)
 
 // uncomment for a really verbose output detailing what test steps are being launched
 // #define DEBUG_TESTERS

From 0e2cb576397efbcc92ac1198d0ac59ccd7ef153a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 18:38:21 +0000
Subject: [PATCH 50/64] Mark two tests that rely on managed variables as XFAIL

---
 libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp   | 1 +
 .../.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp b/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp
index 96d27dd5c7..602567b529 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: nvrtc, pre-sm-70
+// XFAIL: clang && (!nvcc)
 
 // uncomment for a really verbose output detailing what test steps are being launched
 // #define DEBUG_TESTERS
diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp b/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp
index ae24f76ecf..04e1999a01 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: nvrtc, pre-sm-70
+// XFAIL: clang && (!nvcc)
 
 // uncomment for a really verbose output detailing what test steps are being launched
 // #define DEBUG_TESTERS

From 374ecb24a6fb6f36b320fbd53e85aadee8bd0fe8 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 18:45:02 +0000
Subject: [PATCH 51/64] Fix one more test for SM_80

---
 .../test/cuda/barrier/native_handle.pass.cpp             | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp
index 570630ed15..0b29135f95 100644
--- a/libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp
@@ -21,17 +21,18 @@ TEST_NV_DIAG_SUPPRESS(set_but_not_used)
 __device__
 void test()
 {
-    __shared__ cuda::barrier<cuda::thread_scope_block> b;
-    init(&b, 2);
+    __shared__ cuda::barrier<cuda::thread_scope_block>* b;
+    shared_memory_selector<cuda::barrier<cuda::thread_scope_block>, constructor_initializer> sel;
+    b = sel.construct(2);
 
     uint64_t token;
     asm volatile ("mbarrier.arrive.b64 %0, [%1];"
         : "=l"(token)
-        : "l"(cuda::device::barrier_native_handle(b))
+        : "l"(cuda::device::barrier_native_handle(*b))
         : "memory");
     (void)token;
 
-    b.arrive_and_wait();
+    b->arrive_and_wait();
 }
 
 int main(int argc, char ** argv)

From 0685884f493a6eca2e3e93ec0bdbd87a58727ed4 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 27 Sep 2023 19:10:10 +0000
Subject: [PATCH 52/64] Disable test that fails during runtime with an invalid
 launch parameter

---
 .../.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libcudacxx/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp
index 83431f1558..5534849a39 100644
--- a/libcudacxx/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: clang && (!nvcc)
 
 #define _LIBCUDACXX_CUDA_ABI_VERSION 2
 

From b228f262abe10cf92a4d364c41b7cb36dad577b3 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Thu, 28 Sep 2023 18:11:01 +0000
Subject: [PATCH 53/64] Re-enable other jobs.

---
 .github/workflows/pr.yml | 142 +++++++++++++++++++--------------------
 1 file changed, 71 insertions(+), 71 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index aefdcb7882..be35a4ba8e 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -51,66 +51,66 @@ jobs:
         run: |
           .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request
 
-  #nvrtc:
-  #  name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
-  #  needs: compute-matrix
-  #  if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
-  #  uses: ./.github/workflows/run-as-coder.yml
-  #  strategy:
-  #    fail-fast: false
-  #    matrix:
-  #      include: ${{ fromJSON(needs.compute-matrix.outputs.NVRTC_MATRIX) }}
-  #  with:
-  #    name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
-  #    runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
-  #    image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-gcc12-cuda${{matrix.cuda}}-${{matrix.os}}
-  #    command: |
-  #      ./ci/nvrtc_libcudacxx.sh g++ ${{matrix.std}} ${{matrix.gpu_build_archs}}
+  nvrtc:
+    name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
+    needs: compute-matrix
+    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
+    uses: ./.github/workflows/run-as-coder.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(needs.compute-matrix.outputs.NVRTC_MATRIX) }}
+    with:
+      name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
+      runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
+      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-gcc12-cuda${{matrix.cuda}}-${{matrix.os}}
+      command: |
+        ./ci/nvrtc_libcudacxx.sh g++ ${{matrix.std}} ${{matrix.gpu_build_archs}}
 
-  #thrust:
-  #  name: Thrust CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
-  #  needs: compute-matrix
-  #  uses: ./.github/workflows/dispatch-build-and-test.yml
-  #  strategy:
-  #    fail-fast: false
-  #    matrix:
-  #      cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
-  #      compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
-  #  with:
-  #    project_name: "thrust"
-  #    per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
-  #    devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
-  #    is_windows: ${{ contains(matrix.compiler, 'cl') }}
+  thrust:
+    name: Thrust CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
+    needs: compute-matrix
+    uses: ./.github/workflows/dispatch-build-and-test.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
+        compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
+    with:
+      project_name: "thrust"
+      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
+      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+      is_windows: ${{ contains(matrix.compiler, 'cl') }}
 
-  #cub:
-  #  name: CUB CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
-  #  needs: compute-matrix
-  #  uses: ./.github/workflows/dispatch-build-and-test.yml
-  #  strategy:
-  #    fail-fast: false
-  #    matrix:
-  #      cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
-  #      compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
-  #  with:
-  #    project_name: "cub"
-  #    per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
-  #    devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
-  #    is_windows: ${{ contains(matrix.compiler, 'cl') }}
+  cub:
+    name: CUB CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
+    needs: compute-matrix
+    uses: ./.github/workflows/dispatch-build-and-test.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
+        compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
+    with:
+      project_name: "cub"
+      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
+      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+      is_windows: ${{ contains(matrix.compiler, 'cl') }}
 
-  #libcudacxx:
-  #  name: libcudacxx CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
-  #  needs: compute-matrix
-  #  uses: ./.github/workflows/dispatch-build-and-test.yml
-  #  strategy:
-  #    fail-fast: false
-  #    matrix:
-  #      cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
-  #      compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
-  #  with:
-  #    project_name: "libcudacxx"
-  #    per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
-  #    devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
-  #    is_windows: ${{ contains(matrix.compiler, 'cl') }}
+  libcudacxx:
+    name: libcudacxx CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
+    needs: compute-matrix
+    uses: ./.github/workflows/dispatch-build-and-test.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
+        compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
+    with:
+      project_name: "libcudacxx"
+      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
+      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+      is_windows: ${{ contains(matrix.compiler, 'cl') }}
 
   clang-cuda:
     name: ${{matrix.lib}} CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
@@ -127,20 +127,20 @@ jobs:
       command: |
         CMAKE_CUDA_COMPILER="${{matrix.compiler.exe}}" ./ci/build_${{matrix.lib}}.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
 
-#  examples:
-#    name: CCCL Examples
-#    needs: compute-matrix
-#    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
-#    uses: ./.github/workflows/build-examples.yml
-#    strategy:
-#      fail-fast: false
-#      matrix:
-#        cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
-#        compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
-#    with:
-#      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
-#      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
-#      is_windows: ${{ contains(matrix.compiler, 'cl') }}
+  examples:
+    name: CCCL Examples
+    needs: compute-matrix
+    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
+    uses: ./.github/workflows/build-examples.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
+        compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
+    with:
+      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
+      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+      is_windows: ${{ contains(matrix.compiler, 'cl') }}
 
   # This job is the final job that runs after all other jobs and is used for branch protection status checks.
   # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks

From 4be1ed2598c63e69414124d8d9c132329dcb7e5a Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Thu, 28 Sep 2023 18:11:39 +0000
Subject: [PATCH 54/64] Re-enable other jobs in status check.

---
 .github/workflows/pr.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index be35a4ba8e..24ed932612 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -149,10 +149,10 @@ jobs:
     name: CI
     needs:
       - clang-cuda
-      #- cub
-      #- libcudacxx
-      #- nvrtc
-      #- thrust
-      #- examples
+      - cub
+      - libcudacxx
+      - nvrtc
+      - thrust
+      - examples
     steps:
       - run: echo "CI success"

From c8154ee637a42f57c02c8da6c7e2677fb3db14d5 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Thu, 28 Sep 2023 18:42:30 +0000
Subject: [PATCH 55/64] Update clang-cuda job names.

---
 .github/workflows/pr.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 24ed932612..40283c0608 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -113,7 +113,7 @@ jobs:
       is_windows: ${{ contains(matrix.compiler, 'cl') }}
 
   clang-cuda:
-    name: ${{matrix.lib}} CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
+    name: ${{matrix.lib}} ${{matrix.cpu}}/CTK${{matrix.cuda}}/clang-cuda
     needs: compute-matrix
     strategy:
       fail-fast: false
@@ -121,7 +121,7 @@ jobs:
         include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
     uses: ./.github/workflows/run-as-coder.yml
     with:
-      name: ${{matrix.lib}} CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
+      name: ${{matrix.lib}} CTK${{matrix.cuda}} clang-cuda${{matrix.compiler.version}}/${{matrix.std}}
       runner: linux-${{matrix.cpu}}-cpu16
       image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
       command: |

From 75017217faeb158865f8ad988eb096b5002db76a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Fri, 29 Sep 2023 12:37:53 +0000
Subject: [PATCH 56/64] Try not to add invalid flag to clang

---
 libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py b/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
index 5c310d7483..bbd145c69b 100644
--- a/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
+++ b/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
@@ -1181,7 +1181,8 @@ def addIfHostSupports(flag):
                 addIfHostSupports('-Wall')
                 addIfHostSupports('-Wextra')
                 addIfHostSupports('-Werror')
-                addIfHostSupports('-Wno-literal-suffix') # GCC warning about reserved UDLs
+                if 'gcc' in self.config.available_features:
+                    addIfHostSupports('-Wno-literal-suffix') # GCC warning about reserved UDLs
                 addIfHostSupports('-Wno-user-defined-literals') # Clang warning about reserved UDLs
                 addIfHostSupports('-Wno-unused-parameter')
                 addIfHostSupports('-Wno-unused-local-typedefs') # GCC warning local typdefs

From fa10123beb54dc6457d5b867a572f822e88e6f37 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Fri, 29 Sep 2023 12:38:10 +0000
Subject: [PATCH 57/64] try to fix `is_nothrow_invocable` test

---
 .../meta.rel/is_nothrow_invocable.pass.cpp    | 65 +++++++++++--------
 1 file changed, 39 insertions(+), 26 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
index eec9c0c75e..deab672a3a 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
@@ -6,13 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++98, c++03, c++11
+// UNSUPPORTED: c++03, c++11
 
 // type_traits
 
 // is_nothrow_invocable
 
 #include <cuda/std/type_traits>
+#include <cuda/std/functional>
 #ifdef _LIBCUDACXX_HAS_VECTOR
 #include <cuda/std/vector>
 #endif // _LIBCUDACXX_HAS_VECTOR
@@ -55,34 +56,22 @@ __host__ __device__ constexpr bool throws_invocable_r() {
          !cuda::std::is_nothrow_invocable_r<Ret, Fn, Args...>::value;
 }
 
-// FIXME(EricWF) Don't test the where noexcept is *not* part of the type system
-// once implementations have caught up.
 __host__ __device__ void test_noexcept_function_pointers() {
   struct Dummy {
-    void foo() noexcept {}
-    static void bar() noexcept {}
+    __host__ __device__ void foo() noexcept {}
+    __host__ __device__ static void bar() noexcept {}
   };
-#if !defined(__cpp_noexcept_function_type)
-  {
-    // Check that PMF's and function pointers *work*. is_nothrow_invocable will always
-    // return false because 'noexcept' is not part of the function type.
-    static_assert(throws_invocable<decltype(&Dummy::foo), Dummy&>(), "");
-    static_assert(throws_invocable<decltype(&Dummy::bar)>(), "");
-  }
-#else
-  {
-    // Check that PMF's and function pointers actually work and that
-    // is_nothrow_invocable returns true for noexcept PMF's and function
-    // pointers.
-    static_assert(
-        cuda::std::is_nothrow_invocable<decltype(&Dummy::foo), Dummy&>::value, "");
-    static_assert(cuda::std::is_nothrow_invocable<decltype(&Dummy::bar)>::value, "");
-  }
-#endif
+  // Check that PMF's and function pointers actually work and that
+  // is_nothrow_invocable returns true for noexcept PMF's and function
+  // pointers.
+  static_assert(cuda::std::is_nothrow_invocable<decltype(&Dummy::foo), Dummy&>::value, "");
+  static_assert(cuda::std::is_nothrow_invocable<decltype(&Dummy::bar)>::value, "");
 }
 
 int main(int, char**) {
+#if TEST_STD_VER >= 17
   using AbominableFunc = void(...) const noexcept;
+#endif // TEST_STD_VER >= 17
   //  Non-callable things
   {
     static_assert(!cuda::std::is_nothrow_invocable<void>::value, "");
@@ -104,23 +93,25 @@ int main(int, char**) {
     static_assert(!cuda::std::is_nothrow_invocable<const int&>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable<int&&>::value, "");
 
-#ifdef _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_nothrow_invocable<int, cuda::std::vector<int> >::value,
                   "");
     static_assert(!cuda::std::is_nothrow_invocable<int, cuda::std::vector<int*> >::value,
                   "");
     static_assert(!cuda::std::is_nothrow_invocable<int, cuda::std::vector<int**> >::value,
                   "");
-#endif // _LIBCUDACXX_HAS_VECTOR
 
+#if TEST_STD_VER >= 17
     static_assert(!cuda::std::is_nothrow_invocable<AbominableFunc>::value, "");
+#endif // TEST_STD_VER >= 17
 
     //  with parameters
     static_assert(!cuda::std::is_nothrow_invocable<int, int>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable<int, double, float>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable<int, char, float, double>::value,
                   "");
+#if TEST_STD_VER >= 17
     static_assert(!cuda::std::is_nothrow_invocable<Sink, AbominableFunc>::value, "");
+#endif // TEST_STD_VER >= 17
     static_assert(!cuda::std::is_nothrow_invocable<Sink, void>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable<Sink, const volatile void>::value,
                   "");
@@ -145,16 +136,16 @@ int main(int, char**) {
     static_assert(!cuda::std::is_nothrow_invocable_r<int, const int&>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable_r<int, int&&>::value, "");
 
-#ifdef _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_nothrow_invocable_r<int, cuda::std::vector<int> >::value,
                   "");
     static_assert(!cuda::std::is_nothrow_invocable_r<int, cuda::std::vector<int*> >::value,
                   "");
     static_assert(!cuda::std::is_nothrow_invocable_r<int, cuda::std::vector<int**> >::value,
                   "");
-#endif // _LIBCUDACXX_HAS_VECTOR
+#if TEST_STD_VER >= 17
     static_assert(!cuda::std::is_nothrow_invocable_r<void, AbominableFunc>::value,
                   "");
+#endif // TEST_STD_VER >= 17
 
     //  with parameters
     static_assert(!cuda::std::is_nothrow_invocable_r<int, int, int>::value, "");
@@ -162,8 +153,10 @@ int main(int, char**) {
                   "");
     static_assert(
         !cuda::std::is_nothrow_invocable_r<int, int, char, float, double>::value, "");
+#if TEST_STD_VER >= 17
     static_assert(
         !cuda::std::is_nothrow_invocable_r<void, Sink, AbominableFunc>::value, "");
+#endif // TEST_STD_VER >= 17
     static_assert(!cuda::std::is_nothrow_invocable_r<void, Sink, void>::value, "");
     static_assert(
         !cuda::std::is_nothrow_invocable_r<void, Sink, const volatile void>::value,
@@ -204,6 +197,26 @@ int main(int, char**) {
     static_assert(cuda::std::is_nothrow_invocable_r<Implicit, Fn, Tag&>::value, "");
     static_assert(throws_invocable_r<ThrowsImplicit, Fn, Tag&>(), "");
   }
+#if TEST_STD_VER >= 17
+  {
+    // Check that it's fine if the result type is non-moveable.
+    struct CantMove {
+      CantMove() = default;
+      __host__ __device__ CantMove(CantMove&&) = delete;
+    };
+
+    static_assert(!cuda::std::is_move_constructible_v<CantMove>, "");
+    static_assert(!cuda::std::is_copy_constructible_v<CantMove>, "");
+
+    using Fn = CantMove() noexcept;
+
+    static_assert(cuda::std::is_nothrow_invocable_r<CantMove, Fn>::value, "");
+    static_assert(!cuda::std::is_nothrow_invocable_r<CantMove, Fn, int>::value, "");
+
+    static_assert(cuda::std::is_nothrow_invocable_r_v<CantMove, Fn>, "");
+    static_assert(!cuda::std::is_nothrow_invocable_r_v<CantMove, Fn, int>, "");
+  }
+#endif // TEST_STD_VER >= 17
   {
     // Check for is_nothrow_invocable_v
     using Fn = CallObject<true, int>;

From 16b6e58f493d2ffd40a76c425ace725da9bd7c8b Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Fri, 29 Sep 2023 16:28:35 +0200
Subject: [PATCH 58/64] Mark is_swappable test as potentially passing

---
 .../meta.unary.prop/is_swappable_include_order.pass.cpp         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp
index 37666eb394..28dac6da10 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp
@@ -10,7 +10,7 @@
 
 // is_swappable
 
-// XFAIL: nvcc
+// XFAIL: nvcc && (!nvrtc) && (!c++20) && (!clang-16) && (!gcc-12)
 
 // If we're just building the test and not executing it, it should pass.
 // UNSUPPORTED: no_execute

From 34e270a6b81046364522cf57306fbe9aa2a80419 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Fri, 29 Sep 2023 16:30:05 +0200
Subject: [PATCH 59/64] Make MSVC pass

---
 .../std/iterators/iterator.container/empty.container.fail.cpp   | 2 +-
 .../std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp   | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
index 18b89125c7..48ba13b785 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
@@ -15,7 +15,7 @@
 
 // UNSUPPORTED: c++98, c++03, c++11, c++14, c++17
 // UNSUPPORTED: clang-3.3, clang-3.4, clang-3.5, clang-3.6, clang-3.7, clang-3.8
-// XFAIL: c++20
+// XFAIL: c++20 && (!msvc-19.36)
 
 #if defined(_LIBCUDACXX_HAS_VECTOR)
 #include <cuda/std/vector>
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
index deab672a3a..63569f089a 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
@@ -213,8 +213,10 @@ int main(int, char**) {
     static_assert(cuda::std::is_nothrow_invocable_r<CantMove, Fn>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable_r<CantMove, Fn, int>::value, "");
 
+#ifndef _LIBCUDACXX_COMPILER_MSVC_2017
     static_assert(cuda::std::is_nothrow_invocable_r_v<CantMove, Fn>, "");
     static_assert(!cuda::std::is_nothrow_invocable_r_v<CantMove, Fn, int>, "");
+#endif // _LIBCUDACXX_COMPILER_MSVC_2017
   }
 #endif // TEST_STD_VER >= 17
   {

From 894c9862e18d0da0213238d64e3efb046e3c1153 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Fri, 29 Sep 2023 16:51:44 +0200
Subject: [PATCH 60/64] Unfail test that seems to pass

---
 .../std/iterators/iterator.container/empty.container.fail.cpp    | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
index 48ba13b785..1ccefe303a 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
@@ -15,7 +15,6 @@
 
 // UNSUPPORTED: c++98, c++03, c++11, c++14, c++17
 // UNSUPPORTED: clang-3.3, clang-3.4, clang-3.5, clang-3.6, clang-3.7, clang-3.8
-// XFAIL: c++20 && (!msvc-19.36)
 
 #if defined(_LIBCUDACXX_HAS_VECTOR)
 #include <cuda/std/vector>

From b95c9e6fa0afee42142008b242e5821c7b49c537 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Sat, 30 Sep 2023 08:39:06 +0200
Subject: [PATCH 61/64] Fix test for nvrtc

---
 .../std/utilities/meta/meta.rel/is_invocable.pass.cpp  | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp
index 64d574e557..dc19b8e058 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp
@@ -34,21 +34,21 @@ struct Tag {};
 struct DerFromTag : Tag {};
 
 struct Implicit {
-  Implicit(int) {}
+  __host__ __device__ Implicit(int) {}
 };
 
 struct Explicit {
-  explicit Explicit(int) {}
+  __host__ __device__ explicit Explicit(int) {}
 };
 
 struct NotCallableWithInt {
-  int operator()(int) = delete;
-  int operator()(Tag) { return 42; }
+  __host__ __device__ int operator()(int) = delete;
+  __host__ __device__ int operator()(Tag) { return 42; }
 };
 
 struct Sink {
   template <class ...Args>
-  void operator()(Args&&...) const {}
+  __host__ __device__ void operator()(Args&&...) const {}
 };
 
 int main(int, char**) {

From 42519b6f3ff6dc50d4336b34c34a67d9ed2b81ac Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Sat, 30 Sep 2023 08:43:25 +0200
Subject: [PATCH 62/64] Fix fail test

---
 .../iterator.container/empty.container.fail.cpp       | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
index 1ccefe303a..2951a74a05 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
@@ -16,22 +16,15 @@
 // UNSUPPORTED: c++98, c++03, c++11, c++14, c++17
 // UNSUPPORTED: clang-3.3, clang-3.4, clang-3.5, clang-3.6, clang-3.7, clang-3.8
 
-#if defined(_LIBCUDACXX_HAS_VECTOR)
-#include <cuda/std/vector>
+#include <cuda/std/array>
 #include <cuda/std/iterator>
 
 #include "test_macros.h"
 
 int main(int, char**)
 {
-    cuda::std::vector<int> c;
+    cuda::std::array<int, 4> c;
     cuda::std::empty(c);  // expected-error {{ignoring return value of function declared with 'nodiscard' attribute}}
 
   return 0;
 }
-#else
-int main(int, char**)
-{
-  return 0;
-}
-#endif
\ No newline at end of file

From 995fea512115e94960b6e61526f589697879169f Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Tue, 10 Oct 2023 10:59:03 +0200
Subject: [PATCH 63/64] Address review comments

---
 libcudacxx/.upstream-tests/test/CMakeLists.txt       |  4 +++-
 .../.upstream-tests/utils/libcudacxx/test/config.py  | 12 ------------
 2 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/CMakeLists.txt b/libcudacxx/.upstream-tests/test/CMakeLists.txt
index dddc1df7e0..9fe4904d45 100644
--- a/libcudacxx/.upstream-tests/test/CMakeLists.txt
+++ b/libcudacxx/.upstream-tests/test/CMakeLists.txt
@@ -82,7 +82,9 @@ endif()
 if (${CMAKE_CUDA_COMPILER_ID} STREQUAL "Clang")
   set(LIBCUDACXX_TEST_COMPILER_FLAGS
     "${LIBCUDACXX_TEST_COMPILER_FLAGS} \
-    ${LIBCUDACXX_FORCE_INCLUDE}")
+    ${LIBCUDACXX_FORCE_INCLUDE} \
+    -I${libcudacxx_SOURCE_DIR}/include \
+    ${LIBCUDACXX_WARNING_LEVEL}")
 endif()
 
 set(LIBCUDACXX_COMPUTE_ARCHS_STRING
diff --git a/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py b/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
index bbd145c69b..fcc98acf98 100644
--- a/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
+++ b/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
@@ -1068,18 +1068,6 @@ def configure_link_flags_cxx_library(self):
             self.cxx.link_flags += ['-lc++experimental']
         if self.link_shared:
             self.cxx.link_flags += ['-lc++']
-        # Device code does not have binary components, don't link libc++
-        # elif self.cxx.type != 'nvcc' and self.cxx.type != 'pgi':
-        #     cxx_library_root = self.get_lit_conf('cxx_library_root')
-        #     if cxx_library_root:
-        #         libname = self.make_static_lib_name('c++')
-        #         abs_path = os.path.join(cxx_library_root, libname)
-        #         assert os.path.exists(abs_path) and \
-        #                "static libc++ library does not exist"
-        #         self.cxx.link_flags += [abs_path]
-        #     else:
-        #         self.cxx.link_flags += ['-lc++']
-
     def configure_link_flags_abi_library(self):
         cxx_abi = self.get_lit_conf('cxx_abi', 'libcxxabi')
         if cxx_abi == 'libstdc++':

From 60682bfcbf1172cc64524367cf35a91fc03ce569 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Tue, 10 Oct 2023 11:13:11 +0200
Subject: [PATCH 64/64] Do not pass nvcc-style warning flags to clang-cuda

---
 libcudacxx/.upstream-tests/test/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/CMakeLists.txt b/libcudacxx/.upstream-tests/test/CMakeLists.txt
index 9fe4904d45..ebbe975175 100644
--- a/libcudacxx/.upstream-tests/test/CMakeLists.txt
+++ b/libcudacxx/.upstream-tests/test/CMakeLists.txt
@@ -32,10 +32,10 @@ else() # NOT LIBCUDACXX_TEST_WITH_NVRTC
   set(LIBCUDACXX_CUDA_COMPILER "${CMAKE_CUDA_COMPILER}")
 endif()
 
-if (NOT MSVC)
+if (NOT MSVC AND NOT "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang")
   set(LIBCUDACXX_WARNING_LEVEL
     "--compiler-options=-Wall \
-     --compiler-options=-Wextra")
+    --compiler-options=-Wextra")
 endif()
 
 # sccache cannot handle the -Fd option generationg pdb files