diff --git a/.github/actions/compute-matrix/compute-matrix.sh b/.github/actions/compute-matrix/compute-matrix.sh
index f0a57aab5e..cba278bf0f 100755
--- a/.github/actions/compute-matrix/compute-matrix.sh
+++ b/.github/actions/compute-matrix/compute-matrix.sh
@@ -12,6 +12,10 @@ explode_std_versions() {
   jq -cr 'map(. as $o | {std: $o.std[]} + del($o.std))'
 }
 
+explode_libs() {
+  jq -cr 'map(. as $o | {lib: $o.lib[]} + del($o.lib))'
+}
+
 extract_matrix() {
   local file="$1"
   local type="$2"
@@ -23,6 +27,8 @@ extract_matrix() {
   write_output "HOST_COMPILERS" "$(echo "$nvcc_full_matrix" | jq -cr '[.[] | .compiler.name] | unique')"
   write_output "PER_CUDA_COMPILER_MATRIX" "$(echo "$nvcc_full_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
   write_output "NVRTC_MATRIX" "$(echo "$matrix" | jq '.nvrtc' | explode_std_versions)"
+  local clang_cuda_matrix; clang_cuda_matrix="$(echo "$matrix" | jq -cr '.["clang-cuda"]' | explode_std_versions | explode_libs)" # assigned separately so `local` cannot mask a failing pipeline
+  write_output "CLANG_CUDA_MATRIX" "$clang_cuda_matrix"
   write_output "EXAMPLES_MATRIX" "$(echo "$matrix" | jq -cr '.examples' )"
 }
 
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 972ef73cec..b9c610941b 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -41,6 +41,7 @@ jobs:
       HOST_COMPILERS: ${{steps.set-outputs.outputs.HOST_COMPILERS}}
       PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}}
       NVRTC_MATRIX: ${{steps.set-outputs.outputs.NVRTC_MATRIX}}
+      CLANG_CUDA_MATRIX: ${{steps.set-outputs.outputs.CLANG_CUDA_MATRIX}}
       EXAMPLES_MATRIX: ${{steps.set-outputs.outputs.EXAMPLES_MATRIX}}
     steps:
       - name: Checkout repo
@@ -111,9 +112,25 @@ jobs:
       devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
       is_windows: ${{ contains(matrix.compiler, 'cl') }}
 
+  clang-cuda:
+    name: ${{matrix.lib}} ${{matrix.cpu}}/CTK${{matrix.cuda}}/clang-cuda
+    needs: compute-matrix
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
+    uses: ./.github/workflows/run-as-coder.yml
+    with:
+      name: ${{matrix.lib}} CTK${{matrix.cuda}} clang-cuda${{matrix.compiler.version}}/${{matrix.std}}
+      runner: linux-${{matrix.cpu}}-cpu16
+      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
+      command: |
+        CMAKE_CUDA_COMPILER="${{matrix.compiler.exe}}" ./ci/build_${{matrix.lib}}.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}
+
   examples:
     name: CCCL Examples
     needs: compute-matrix
+    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
     strategy:
       fail-fast: false
       matrix:
@@ -137,6 +154,7 @@ jobs:
     runs-on: ubuntu-latest
     name: CI
     needs:
+      - clang-cuda
       - cub
       - libcudacxx
       - nvrtc
diff --git a/ci/build_common.sh b/ci/build_common.sh
index 81210f4090..93ca919521 100755
--- a/ci/build_common.sh
+++ b/ci/build_common.sh
@@ -6,12 +6,13 @@ set -eo pipefail
 cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
 
 # Script defaults
-CUDA_COMPILER=nvcc
+CUDA_COMPILER=${CMAKE_CUDA_COMPILER:-nvcc}
 
 # Check if the correct number of arguments has been provided
 function usage {
     echo "Usage: $0 [OPTIONS] <HOST_COMPILER> <CXX_STANDARD> <GPU_ARCHS>"
     echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores."
+    echo "The CMAKE_CUDA_COMPILER environment variable can be used to control the CUDA compiler. The -nvcc flag takes precedence."
     echo "Example: PARALLEL_LEVEL=8 $0 g++-8 14 \"70\" "
     echo "Example: $0 clang++-8 17 \"70;75;80-virtual\" "
     echo "Possible options: "
@@ -54,9 +55,7 @@ readonly CXX_STANDARD=$2
 
 # Replace spaces, commas and semicolons with semicolons for CMake list
 readonly GPU_ARCHS=$(echo $3 | tr ' ,' ';')
-
 readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)}
-readonly NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-)
 
 if [ -z ${DEVCONTAINER_NAME+x} ]; then
     BUILD_DIR=../build/local
@@ -83,7 +82,7 @@ COMMON_CMAKE_OPTIONS="
 echo "========================================"
 echo "Begin build"
 echo "pwd=$(pwd)"
-echo "NVCC_VERSION=$NVCC_VERSION"
+echo "CUDA_COMPILER=$CUDA_COMPILER"
 echo "HOST_COMPILER=$HOST_COMPILER"
 echo "CXX_STANDARD=$CXX_STANDARD"
 echo "GPU_ARCHS=$GPU_ARCHS"
diff --git a/ci/build_cub.sh b/ci/build_cub.sh
index f3cdd40546..3b425f2dab 100755
--- a/ci/build_cub.sh
+++ b/ci/build_cub.sh
@@ -2,7 +2,6 @@
 
 source "$(dirname "$0")/build_common.sh"
 
-
 # CUB benchmarks require at least CUDA nvcc 11.5 for int128
 # Returns "true" if the first version is greater than or equal to the second
 version_compare() {
@@ -12,12 +11,20 @@ version_compare() {
         echo "false"
     fi
 }
-readonly ENABLE_CUB_BENCHMARKS=${ENABLE_CUB_BENCHMARKS:=$(version_compare $NVCC_VERSION 11.5)}
 
-if [[ $ENABLE_CUB_BENCHMARKS == "true" ]]; then
-    echo "CUDA version is $NVCC_VERSION. Building CUB benchmarks."
+ENABLE_CUB_BENCHMARKS="false"  # only switched on below for nvcc >= 11.5
+ENABLE_CUB_RDC="false"         # only switched on below when the CUDA compiler is nvcc
+if [[ "$CUDA_COMPILER" == *nvcc* ]]; then
+    ENABLE_CUB_RDC="true"
+    NVCC_VERSION=$("$CUDA_COMPILER" --version | grep release | awk '{print $6}' | cut -c2-)
+    if [[ $(version_compare "$NVCC_VERSION" 11.5) == "true" ]]; then
+        ENABLE_CUB_BENCHMARKS="true"
+        echo "nvcc version is $NVCC_VERSION. Building CUB benchmarks."
+    else
+        echo "nvcc version is $NVCC_VERSION. Not building CUB benchmarks because nvcc version is less than 11.5."
+    fi
 else
-    echo "CUDA version is $NVCC_VERSION. Not building CUB benchmarks because CUDA version is less than 11.5."
+    echo "nvcc version is not determined (likely using a non-NVCC compiler). Not building CUB benchmarks."
 fi
 
 CMAKE_OPTIONS="
@@ -32,6 +39,7 @@ CMAKE_OPTIONS="
     -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=ON \
     -DCUB_IGNORE_DEPRECATED_CPP_DIALECT=ON \
     -DCUB_ENABLE_BENCHMARKS="$ENABLE_CUB_BENCHMARKS"\
+    -DCUB_ENABLE_RDC_TESTS="$ENABLE_CUB_RDC" \
 "
 
 configure_and_build "CUB" "$CMAKE_OPTIONS"
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index d94815177b..b7c63f5538 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -51,6 +51,8 @@ pull_request:
     - {cuda: *cuda_newest, os: 'windows2022', cpu: 'amd64', compiler: {name: 'cl',   version: '14.36', exe: 'cl++'}, gpu_build_archs: '70', std: [14, 17, 20],     jobs: ['build']}
   nvrtc:
     - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', gpu_build_archs: '70', std: [11, 14, 17, 20]}
+  clang-cuda:
+    - {lib: ['thrust', 'cub', 'libcudacxx'], cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'llvm', version: '16', exe: 'clang++'}, gpu_build_archs: '70', std: [17, 20]}
   examples:
     # Strategy: Oldest CUDA + Oldest Host compiler && Newest CUDA + Newest Host Compiler
     - {cuda: *cuda_oldest, os: 'ubuntu18.04', cpu: 'amd64', compiler: {name: 'gcc', version: '6', exe: 'g++'}}
diff --git a/libcudacxx/.upstream-tests/test/CMakeLists.txt b/libcudacxx/.upstream-tests/test/CMakeLists.txt
index 6024d60658..ebbe975175 100644
--- a/libcudacxx/.upstream-tests/test/CMakeLists.txt
+++ b/libcudacxx/.upstream-tests/test/CMakeLists.txt
@@ -32,10 +32,10 @@ else() # NOT LIBCUDACXX_TEST_WITH_NVRTC
   set(LIBCUDACXX_CUDA_COMPILER "${CMAKE_CUDA_COMPILER}")
 endif()
 
-if (NOT MSVC)
+if (NOT MSVC AND NOT "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang") # skip the --compiler-options wrapper when Clang is the CUDA compiler
   set(LIBCUDACXX_WARNING_LEVEL
     "--compiler-options=-Wall \
-     --compiler-options=-Wextra")
+    --compiler-options=-Wextra")
 endif()
 
 # sccache cannot handle the -Fd option generationg pdb files
@@ -50,6 +50,18 @@ if (${CMAKE_CXX_COMPILER_ID} STREQUAL "IntelLLVM")
     --compiler-options=-fno-fast-math")
 endif()
 
+if ("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang")
+  string(APPEND LIBCUDACXX_TEST_COMPILER_FLAGS
+     " -Xclang -fcuda-allow-variadic-functions"
+     " -Xclang -Wno-unused-parameter"
+     " -Wno-unknown-cuda-version")
+  # REQUIRED: the -L path below comes from CUDAToolkit_LIBRARY_DIR, so fail at configure time if the toolkit is not found.
+  find_package(CUDAToolkit REQUIRED)
+
+  string(APPEND LIBCUDACXX_TEST_LINKER_FLAGS
+    " -L${CUDAToolkit_LIBRARY_DIR} -lcuda -lcudart")
+endif()
+
 if (${CMAKE_CUDA_COMPILER_ID} STREQUAL "NVIDIA")
   set(LIBCUDACXX_TEST_COMPILER_FLAGS
     "${LIBCUDACXX_TEST_COMPILER_FLAGS} \
@@ -67,6 +79,14 @@ if (${CMAKE_CUDA_COMPILER_ID} STREQUAL "NVHPC")
     -stdpar")
 endif()
 
+if (${CMAKE_CUDA_COMPILER_ID} STREQUAL "Clang")
+  set(LIBCUDACXX_TEST_COMPILER_FLAGS
+    "${LIBCUDACXX_TEST_COMPILER_FLAGS} \
+    ${LIBCUDACXX_FORCE_INCLUDE} \
+    -I${libcudacxx_SOURCE_DIR}/include \
+    ${LIBCUDACXX_WARNING_LEVEL}")
+endif()
+
 set(LIBCUDACXX_COMPUTE_ARCHS_STRING
   "${CMAKE_CUDA_ARCHITECTURES}")
 
diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp
index 570630ed15..0b29135f95 100644
--- a/libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp
@@ -21,17 +21,18 @@ TEST_NV_DIAG_SUPPRESS(set_but_not_used)
 __device__
 void test()
 {
-    __shared__ cuda::barrier<cuda::thread_scope_block> b;
-    init(&b, 2);
+    __shared__ cuda::barrier<cuda::thread_scope_block>* b;
+    shared_memory_selector<cuda::barrier<cuda::thread_scope_block>, constructor_initializer> sel;
+    b = sel.construct(2);
 
     uint64_t token;
     asm volatile ("mbarrier.arrive.b64 %0, [%1];"
         : "=l"(token)
-        : "l"(cuda::device::barrier_native_handle(b))
+        : "l"(cuda::device::barrier_native_handle(*b))
         : "memory");
     (void)token;
 
-    b.arrive_and_wait();
+    b->arrive_and_wait();
 }
 
 int main(int argc, char ** argv)
diff --git a/libcudacxx/.upstream-tests/test/cuda/memcpy_async/group_memcpy_async.h b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/group_memcpy_async.h
index 89c3850101..5302f0c660 100644
--- a/libcudacxx/.upstream-tests/test/cuda/memcpy_async/group_memcpy_async.h
+++ b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/group_memcpy_async.h
@@ -47,7 +47,7 @@ struct storage
     __host__ __device__
     friend bool operator==(const storage & lhs, const T & rhs) {
         for (cuda::std::size_t i = 0; i < size; ++i) {
-            if (lhs.data[i] != rhs + i) {
+            if (lhs.data[i] != static_cast<T>(rhs + i)) {
                 return false;
             }
         }
diff --git a/libcudacxx/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp
index 83431f1558..5534849a39 100644
--- a/libcudacxx/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: clang && (!nvcc)
 
 #define _LIBCUDACXX_CUDA_ABI_VERSION 2
 
diff --git a/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
index 147b263e6c..6490c62bab 100644
--- a/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
@@ -20,6 +20,7 @@
 #include <stdlib.h>
 
 #include "test_macros.h"
+#include "cuda_space_selector.h"
 
 TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)
 TEST_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero
@@ -30,7 +31,7 @@ constexpr size_t stages_count = 2; // Pipeline with two stages
 // Simply copy shared memory to global out
 __device__ __forceinline__ void compute(int* global_out, int const* shared_in){
     auto block = cooperative_groups::this_thread_block();
-    for (int i = 0; i < block.size(); ++i) {
+    for (int i = 0; i < static_cast<int>(block.size()); ++i) {
         global_out[i] = shared_in[i];
     }
 }
@@ -46,8 +47,11 @@ __global__ void with_staging(int* global_out, int const* global_in, size_t size,
     size_t shared_offset[stages_count] = { 0, block.size() }; // Offsets to each batch
 
     // Allocate shared storage for a two-stage cuda::pipeline:
-    __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, stages_count> shared_state;
-    auto pipeline = cuda::make_pipeline(block, &shared_state);
+    using pipeline_state = cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, stages_count>;
+    __shared__ pipeline_state* shared_state;
+    shared_memory_selector<pipeline_state, constructor_initializer> sel;
+    shared_state = sel.construct();
+    auto pipeline = cuda::make_pipeline(block, shared_state);
 
     // Each thread processes `batch_sz` elements.
     // Compute offset of the batch `batch` of this thread block in global memory:
diff --git a/libcudacxx/.upstream-tests/test/cuda/test_platform.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/test_platform.pass.cpp
index 2ae5a470d5..658e33a634 100644
--- a/libcudacxx/.upstream-tests/test/cuda/test_platform.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/test_platform.pass.cpp
@@ -21,11 +21,13 @@
 #  define TEST_NVCC
 #elif defined(__NVCOMPILER)
 #  define TEST_NVCXX
+#elif defined(__CUDA__)
+#  define TEST_CLANG_CUDA
 #else
 #  define TEST_HOST
 #endif
 
-#if defined(TEST_NVCC)
+#if defined(TEST_NVCC) || defined(TEST_CLANG_CUDA)
 
 __host__ __device__ void test() {
 #if defined(__CUDA_ARCH__)
diff --git a/libcudacxx/.upstream-tests/test/force_include.h b/libcudacxx/.upstream-tests/test/force_include.h
index 9be658711e..fd7a644f8c 100644
--- a/libcudacxx/.upstream-tests/test/force_include.h
+++ b/libcudacxx/.upstream-tests/test/force_include.h
@@ -93,6 +93,6 @@ int main(int argc, char** argv)
     return ret;
 }
 
-#define main fake_main
+#define main __host__ __device__ fake_main
 
 #endif
diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp b/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp
index 96d27dd5c7..602567b529 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/barrier.pass.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: nvrtc, pre-sm-70
+// XFAIL: clang && (!nvcc)
 
 // uncomment for a really verbose output detailing what test steps are being launched
 // #define DEBUG_TESTERS
diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp b/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp
index ae24f76ecf..04e1999a01 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/barrier_abi_v2.pass.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: nvrtc, pre-sm-70
+// XFAIL: clang && (!nvcc)
 
 // uncomment for a really verbose output detailing what test steps are being launched
 // #define DEBUG_TESTERS
diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/barrier_parity.std.pass.cpp b/libcudacxx/.upstream-tests/test/heterogeneous/barrier_parity.std.pass.cpp
index 21172beb68..28695c6d61 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/barrier_parity.std.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/barrier_parity.std.pass.cpp
@@ -80,7 +80,7 @@ using aw_aw_pw1 = performer_list<
     barrier_arrive_and_wait,
     barrier_arrive_and_wait,
     async_tester_fence,
-    clear_token,
+    clear_token
 >;
 
 using aw_aw_pw2 = performer_list<
diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/cuda_atomic_ref.pass.cpp b/libcudacxx/.upstream-tests/test/heterogeneous/cuda_atomic_ref.pass.cpp
index ec5e00ad08..290d45ab82 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/cuda_atomic_ref.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/cuda_atomic_ref.pass.cpp
@@ -166,7 +166,7 @@ using arithmetic_atomic_testers = extend_tester_list<
     fetch_sub_tester<30, 21, 9>,
     fetch_min_tester<9, 5, 5>,
     fetch_max_tester<5, 9, 9>,
-    fetch_sub_tester<9, 17, -8>,
+    fetch_sub_tester<9, 17, -8>
 >;
 
 using bitwise_atomic_testers = extend_tester_list<
diff --git a/libcudacxx/.upstream-tests/test/heterogeneous/helpers.h b/libcudacxx/.upstream-tests/test/heterogeneous/helpers.h
index e23d00d30c..1330d167c0 100644
--- a/libcudacxx/.upstream-tests/test/heterogeneous/helpers.h
+++ b/libcudacxx/.upstream-tests/test/heterogeneous/helpers.h
@@ -11,6 +11,7 @@
 
 #include <cuda/std/type_traits>
 
+#include <cstdio>
 #include <new>
 #include <thread>
 #include <vector>
diff --git a/libcudacxx/.upstream-tests/test/std/concepts/concepts.lang/concept.same/same_as.pass.cpp b/libcudacxx/.upstream-tests/test/std/concepts/concepts.lang/concept.same/same_as.pass.cpp
index b0c626e536..1065818852 100644
--- a/libcudacxx/.upstream-tests/test/std/concepts/concepts.lang/concept.same/same_as.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/concepts/concepts.lang/concept.same/same_as.pass.cpp
@@ -16,6 +16,8 @@
 #include <cuda/std/concepts>
 #include <cuda/std/type_traits>
 
+#include "test_macros.h"
+
 using cuda::std::same_as;
 
 struct S1 {};
@@ -35,6 +37,9 @@ struct S5 {
   int* p;
 };
 
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wunused-private-field"
+#endif // TEST_COMPILER_CLANG_CUDA
 class C1 {};
 class C2 {
   /* [[maybe_unused]] */ int i;
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.cons/list_init.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.cons/list_init.pass.cpp
index d6d6bf9fd6..ff3990ba6a 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.cons/list_init.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.cons/list_init.pass.cpp
@@ -55,9 +55,6 @@ int main(int, char**)
 
     // TEST(TestLayoutLeftListInitialization, test_layout_left_extent_initialization)
     {
-        typedef int    data_t ;
-        typedef size_t index_t;
-
         cuda::std::layout_left::mapping<cuda::std::extents<size_t,dyn, dyn>> m{cuda::std::dextents<size_t,2>{16, 32}};
 
         static_assert( m.is_exhaustive()          == true, "" );
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.obs/compare.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.obs/compare.pass.cpp
index 7eb9d24379..adeb07be0b 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.obs/compare.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.left.obs/compare.pass.cpp
@@ -35,7 +35,6 @@ int main(int, char**)
     typed_test_compare_left();
 
     using index_t = size_t;
-    using ext1d_t = cuda::std::extents<index_t,dyn>;
     using ext2d_t = cuda::std::extents<index_t,dyn,dyn>;
 
     {
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.right.obs/compare.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.right.obs/compare.pass.cpp
index 099ff5fb8c..b1eabbf9d8 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.right.obs/compare.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.right.obs/compare.pass.cpp
@@ -35,7 +35,6 @@ int main(int, char**)
     typed_test_compare_right();
 
     using index_t = size_t;
-    using ext1d_t = cuda::std::extents<index_t,dyn>;
     using ext2d_t = cuda::std::extents<index_t,dyn,dyn>;
 
     {
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.cons/list_init.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.cons/list_init.pass.cpp
index 7b84d1656e..90a6e42016 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.cons/list_init.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.cons/list_init.pass.cpp
@@ -31,11 +31,6 @@ int main(int, char**)
 {
     // From a span
     {
-        typedef int    data_t ;
-        typedef size_t index_t;
-
-        using my_ext = typename cuda::std::extents<size_t,dyn>;
-
         cuda::std::array<int,2> a{1, 128};
         cuda::std::span <int,2> s(a.data(), 2);
         cuda::std::layout_stride::mapping<cuda::std::extents<size_t,dyn, dyn>> m{cuda::std::dextents<size_t,2>{16, 32}, s};
@@ -45,9 +40,6 @@ int main(int, char**)
 
     // TEST(TestLayoutStrideListInitialization, test_list_initialization)
     {
-        typedef int    data_t ;
-        typedef size_t index_t;
-
         cuda::std::layout_stride::mapping<cuda::std::extents<size_t,dyn, dyn>> m{cuda::std::dextents<size_t,2>{16, 32}, cuda::std::array<int,2>{1, 128}};
 
         CHECK_MAPPING(m);
@@ -55,7 +47,6 @@ int main(int, char**)
 
     // From another mapping
     {
-        typedef int    data_t ;
         typedef size_t index_t;
 
         cuda::std::layout_stride::mapping<cuda::std::extents<index_t,dyn, dyn>> m0{cuda::std::dextents<index_t,2>{16, 32}, cuda::std::array<int,2>{1, 128}};
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.obs/compare.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.obs/compare.pass.cpp
index 76ac945258..4d40d56884 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.obs/compare.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.layout.stride.obs/compare.pass.cpp
@@ -18,7 +18,6 @@ constexpr auto dyn = cuda::std::dynamic_extent;
 int main(int, char**)
 {
     using index_t = int;
-    using ext1d_t = cuda::std::extents<index_t,dyn>;
     using ext2d_t = cuda::std::extents<index_t,dyn,dyn>;
 
     {
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/array_init_extents.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/array_init_extents.pass.cpp
index 824b3281cf..61816bc921 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/array_init_extents.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/array_init_extents.pass.cpp
@@ -84,8 +84,7 @@ int main(int, char**)
 
     // Constraint: N == rank() || N == rank_dynamic() is true
     {
-        using      mdspan_t = cuda::std::mdspan< int, cuda::std::extents< int, dyn, dyn > >;
-        using other_index_t = int;
+        using mdspan_t = cuda::std::mdspan< int, cuda::std::extents< int, dyn, dyn > >;
 
         static_assert( is_array_cons_avail_v< mdspan_t, int *, int, 1 > == false, "" );
     }
diff --git a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/custom_layout.pass.cpp b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/custom_layout.pass.cpp
index 0fd8e2b06b..d82aea6f71 100644
--- a/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/custom_layout.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/containers/views/mdspan/mdspan.mdspan.cons/custom_layout.pass.cpp
@@ -19,8 +19,6 @@ constexpr auto dyn = cuda::std::dynamic_extent;
 
 int main(int, char**)
 {
-    using map_t = Foo::layout_foo::template mapping<cuda::std::dextents<size_t ,2>>;
-
     {
         using  data_t = int;
         using   lay_t = Foo::layout_foo;
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.array.fail.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.array.fail.cpp
index abdc0e63b1..e93c8e98b6 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.array.fail.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.array.fail.cpp
@@ -16,8 +16,6 @@
 // UNSUPPORTED: c++98, c++03, c++11, c++14, c++17
 // UNSUPPORTED: clang-3.3, clang-3.4, clang-3.5, clang-3.6, clang-3.7, clang-3.8
 
-#if defined(_LIBCUDACXX_HAS_VECTOR)
-#include <cuda/std/vector>
 #include <cuda/std/iterator>
 
 #include "test_macros.h"
@@ -28,10 +26,4 @@ int main(int, char**)
     cuda::std::empty(c);  // expected-error {{ignoring return value of function declared with 'nodiscard' attribute}}
 
   return 0;
-}
-#else
-int main(int, char**)
-{
-  return 0;
-}
-#endif
\ No newline at end of file
+}
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
index 1ccefe303a..2951a74a05 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.container/empty.container.fail.cpp
@@ -16,22 +16,15 @@
 // UNSUPPORTED: c++98, c++03, c++11, c++14, c++17
 // UNSUPPORTED: clang-3.3, clang-3.4, clang-3.5, clang-3.6, clang-3.7, clang-3.8
 
-#if defined(_LIBCUDACXX_HAS_VECTOR)
-#include <cuda/std/vector>
+#include <cuda/std/array>
 #include <cuda/std/iterator>
 
 #include "test_macros.h"
 
 int main(int, char**)
 {
-    cuda::std::vector<int> c;
+    cuda::std::array<int, 4> c;
     cuda::std::empty(c);  // expected-error {{ignoring return value of function declared with 'nodiscard' attribute}}
 
   return 0;
 }
-#else
-int main(int, char**)
-{
-  return 0;
-}
-#endif
\ No newline at end of file
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.copy/indirectly_copyable.subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.copy/indirectly_copyable.subsumption.compile.pass.cpp
index 386feb269c..36e1a14e4b 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.copy/indirectly_copyable.subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.copy/indirectly_copyable.subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class In, class Out>
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.move/indirectly_movable.subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.move/indirectly_movable.subsumption.compile.pass.cpp
index ad70e0f1af..35bb8051ea 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.move/indirectly_movable.subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/alg.req.ind.move/indirectly_movable.subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class In, class Out>
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp
index b5923cd7af..70b332be6e 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp
@@ -33,6 +33,10 @@ struct GoodPredicate {
 // Should work when all constraints are satisfied
 static_assert(cuda::std::indirect_binary_predicate<GoodPredicate<It1, It2>, It1, It2>);
 static_assert(cuda::std::indirect_binary_predicate<bool(*)(int, float), int*, float*>);
+
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif // TEST_COMPILER_CLANG_CUDA
 #ifndef __CUDA_ARCH__
 auto lambda = [](int i, long j) { return i == j; };
 static_assert(cuda::std::indirect_binary_predicate<decltype(lambda), int*, int*>);
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp
index cfc3211607..2906a7c42a 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp
@@ -48,6 +48,10 @@ struct GoodRelation {
 // Should work when all constraints are satisfied
 static_assert(cuda::std::indirect_equivalence_relation<GoodRelation<It1, It2>, It1, It2>);
 static_assert(cuda::std::indirect_equivalence_relation<bool(*)(int, long), int*, long*>);
+
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif // TEST_COMPILER_CLANG_CUDA
 #ifndef __CUDA_ARCH__
 auto lambda = [](int i, long j) { return i == j; };
 static_assert(cuda::std::indirect_equivalence_relation<decltype(lambda), int*, long*>);
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp
index 1ba50bc154..1defcd894e 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp
@@ -48,6 +48,10 @@ struct GoodOrder {
 // Should work when all constraints are satisfied
 static_assert(cuda::std::indirect_strict_weak_order<GoodOrder<It1, It2>, It1, It2>);
 static_assert(cuda::std::indirect_strict_weak_order<bool(*)(int, long), int*, long*>);
+
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif // TEST_COMPILER_CLANG_CUDA
 #ifndef __CUDA_ARCH__
 auto lambda = [](int i, long j) { return i == j; };
 static_assert(cuda::std::indirect_strict_weak_order<decltype(lambda), int*, long*>);
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp
index dcbf90dbc0..7f9ddf2792 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp
@@ -30,6 +30,10 @@ struct GoodPredicate {
 // Should work when all constraints are satisfied
 static_assert(cuda::std::indirect_unary_predicate<GoodPredicate<It>, It>);
 static_assert(cuda::std::indirect_unary_predicate<bool(*)(int), int*>);
+
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif // TEST_COMPILER_CLANG_CUDA
 #ifndef __CUDA_ARCH__
 auto lambda = [](int i) { return i % 2 == 0; };
 static_assert(cuda::std::indirect_unary_predicate<decltype(lambda), int*>);
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.bidir/subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.bidir/subsumption.compile.pass.cpp
index bdc44634f0..f20403ee8b 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.bidir/subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.bidir/subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class T>
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.forward/subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.forward/subsumption.compile.pass.cpp
index 2e9def18c7..19c90e7bc0 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.forward/subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.forward/subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // cuda::std::forward_iterator;
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.inc/subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.inc/subsumption.compile.pass.cpp
index 16a8f5a8dd..69bb112ea7 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.inc/subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.inc/subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class T>
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.input/subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.input/subsumption.compile.pass.cpp
index 4209a3901a..6012698155 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.input/subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.input/subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class T>
diff --git a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.iterator/subsumption.compile.pass.cpp b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.iterator/subsumption.compile.pass.cpp
index 11cd6d3dbb..0f2de25eae 100644
--- a/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.iterator/subsumption.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.iterator/subsumption.compile.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: c++20
+// XFAIL: c++20 && nvcc
 // nvbug 3885350
 
 // template<class In>
diff --git a/libcudacxx/.upstream-tests/test/std/language.support/support.types/nullptr_t.pass.cpp b/libcudacxx/.upstream-tests/test/std/language.support/support.types/nullptr_t.pass.cpp
index 65f9e0841f..8edb3f1a6d 100644
--- a/libcudacxx/.upstream-tests/test/std/language.support/support.types/nullptr_t.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/language.support/support.types/nullptr_t.pass.cpp
@@ -65,7 +65,9 @@ __host__ __device__
 void test_nullptr_conversions() {
 // GCC does not accept this due to CWG Defect #1423
 // http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1423
-#if defined(__clang__) && !defined(TEST_COMPILER_NVCC)
+#if  defined(TEST_COMPILER_CLANG) \
+ && !defined(TEST_COMPILER_NVCC) \
+ && !defined(TEST_COMPILER_CLANG_CUDA)
     {
         bool b = nullptr;
         assert(!b);
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp
index 4bc730c89d..ca489bb096 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp
@@ -668,6 +668,7 @@ void call_operator_noexcept_test()
     }
 }
 
+#ifndef TEST_COMPILER_CLANG_CUDA // https://github.com/llvm/llvm-project/issues/67533
 __host__ __device__
 void test_lwg2767() {
     // See https://cplusplus.github.io/LWG/lwg-defects.html#2767
@@ -681,6 +682,7 @@ void test_lwg2767() {
         assert(b);
     }
 }
+#endif // TEST_COMPILER_CLANG_CUDA
 
 int main(int, char**)
 {
@@ -691,7 +693,9 @@ int main(int, char**)
     call_operator_sfinae_test(); // somewhat of an extension
     // call_operator_forwarding_test();
     call_operator_noexcept_test();
+#ifndef TEST_COMPILER_CLANG_CUDA // https://github.com/llvm/llvm-project/issues/67533
     test_lwg2767();
+#endif // TEST_COMPILER_CLANG_CUDA
 
   return 0;
 }
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.general/integer_seq.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.general/integer_seq.pass.cpp
index b23d8f202a..f6c879d584 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.general/integer_seq.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.general/integer_seq.pass.cpp
@@ -27,7 +27,7 @@ __host__ __device__ auto extract ( const AtContainer &t, const cuda::std::intege
 int main(int, char**)
 {
 //  Make a couple of sequences
-    using int3    = cuda::std::make_integer_sequence<int, 3>;     // generates int:    0,1,2
+    using intseq3 = cuda::std::make_integer_sequence<int, 3>;     // generates int:    0,1,2
     using size7   = cuda::std::make_integer_sequence<size_t, 7>;  // generates size_t: 0,1,2,3,4,5,6
     using size4   = cuda::std::make_index_sequence<4>;            // generates size_t: 0,1,2,3
     using size2   = cuda::std::index_sequence_for<int, size_t>;   // generates size_t: 0,1
@@ -35,8 +35,8 @@ int main(int, char**)
     using sizemix = cuda::std::index_sequence<1, 1, 2, 3, 5>;     // generates size_t: 1,1,2,3,5
 
 //  Make sure they're what we expect
-    static_assert ( cuda::std::is_same<int3::value_type, int>::value, "int3 type wrong" );
-    static_assert ( int3::size () == 3, "int3 size wrong" );
+    static_assert ( cuda::std::is_same<intseq3::value_type, int>::value, "intseq3 type wrong" );
+    static_assert ( intseq3::size () == 3, "intseq3 size wrong" );
 
     static_assert ( cuda::std::is_same<size7::value_type, size_t>::value, "size7 type wrong" );
     static_assert ( size7::size () == 7, "size7 size wrong" );
@@ -56,8 +56,8 @@ int main(int, char**)
     auto tup = cuda::std::make_tuple ( 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 );
 
 //  Use them
-    auto t3 = extract ( tup, int3() );
-    static_assert ( cuda::std::tuple_size<decltype(t3)>::value == int3::size (), "t3 size wrong");
+    auto t3 = extract ( tup, intseq3() );
+    static_assert ( cuda::std::tuple_size<decltype(t3)>::value == intseq3::size (), "t3 size wrong");
     assert ( t3 == cuda::std::make_tuple ( 10, 11, 12 ));
 
     auto t7 = extract ( tup, size7 ());
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.intseq/integer_seq.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.intseq/integer_seq.pass.cpp
index 5d4544a61c..c7c9a154ad 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.intseq/integer_seq.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/intseq/intseq.intseq/integer_seq.pass.cpp
@@ -28,20 +28,20 @@
 int main(int, char**)
 {
-//  Make a few of sequences
+//  Make a few sequences
-    using int3    = cuda::std::integer_sequence<int, 3, 2, 1>;
-    using size1   = cuda::std::integer_sequence<cuda::std::size_t, 7>;
-    using ushort2 = cuda::std::integer_sequence<unsigned short, 4, 6>;
-    using bool0   = cuda::std::integer_sequence<bool>;
+    using intseq3    = cuda::std::integer_sequence<int, 3, 2, 1>;
+    using size1      = cuda::std::integer_sequence<cuda::std::size_t, 7>;
+    using ushortseq2 = cuda::std::integer_sequence<unsigned short, 4, 6>;
+    using bool0      = cuda::std::integer_sequence<bool>;
 
 //  Make sure they're what we expect
-    static_assert ( cuda::std::is_same<int3::value_type, int>::value, "int3 type wrong" );
-    static_assert ( int3::size() == 3, "int3 size wrong" );
+    static_assert ( cuda::std::is_same<intseq3::value_type, int>::value, "intseq3 type wrong" );
+    static_assert ( intseq3::size() == 3, "intseq3 size wrong" );
 
     static_assert ( cuda::std::is_same<size1::value_type, cuda::std::size_t>::value, "size1 type wrong" );
     static_assert ( size1::size() == 1, "size1 size wrong" );
 
-    static_assert ( cuda::std::is_same<ushort2::value_type, unsigned short>::value, "ushort2 type wrong" );
-    static_assert ( ushort2::size() == 2, "ushort2 size wrong" );
+    static_assert ( cuda::std::is_same<ushortseq2::value_type, unsigned short>::value, "ushortseq2 type wrong" );
+    static_assert ( ushortseq2::size() == 2, "ushortseq2 size wrong" );
 
     static_assert ( cuda::std::is_same<bool0::value_type, bool>::value, "bool0 type wrong" );
     static_assert ( bool0::size() == 0, "bool0 size wrong" );
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/memory/specialized.algorithms/specialized.addressof/addressof.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/memory/specialized.algorithms/specialized.addressof/addressof.pass.cpp
index dbf13dc050..34f156c4c3 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/memory/specialized.algorithms/specialized.addressof/addressof.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/memory/specialized.algorithms/specialized.addressof/addressof.pass.cpp
@@ -16,6 +16,10 @@
 
 #include "test_macros.h"
 
+#ifdef TEST_COMPILER_CLANG_CUDA
+#include <new>
+#endif // TEST_COMPILER_CLANG_CUDA
+
 struct A
 {
     __host__ __device__ void operator&() const {}
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp
index 9407deddda..dc19b8e058 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp
@@ -8,9 +8,6 @@
 
 // UNSUPPORTED: c++98, c++03, c++11
 
-// XFAIL: nvcc
-// FIXME: Triage and fix this.
-
 // type_traits
 
 // is_invocable
@@ -24,8 +21,12 @@
 #include <cuda/std/type_traits>
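-// NOTE: These headers are not currently supported by libcu++.
+// NOTE: <cuda/std/memory> and <cuda/std/vector> are only included when the _LIBCUDACXX_HAS_* guards below are defined.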
 #include <cuda/std/functional>
+#ifdef _LIBCUDACXX_HAS_MEMORY
 #include <cuda/std/memory>
+#endif // _LIBCUDACXX_HAS_MEMORY
+#ifdef _LIBCUDACXX_HAS_VECTOR
 #include <cuda/std/vector>
+#endif // _LIBCUDACXX_HAS_VECTOR
 
 #include "test_macros.h"
 
@@ -33,21 +34,21 @@ struct Tag {};
 struct DerFromTag : Tag {};
 
 struct Implicit {
-  Implicit(int) {}
+  __host__ __device__ Implicit(int) {}
 };
 
 struct Explicit {
-  explicit Explicit(int) {}
+  __host__ __device__ explicit Explicit(int) {}
 };
 
 struct NotCallableWithInt {
-  int operator()(int) = delete;
-  int operator()(Tag) { return 42; }
+  __host__ __device__ int operator()(int) = delete;
+  __host__ __device__ int operator()(Tag) { return 42; }
 };
 
 struct Sink {
   template <class ...Args>
-  void operator()(Args&&...) const {}
+  __host__ __device__ void operator()(Args&&...) const {}
 };
 
 int main(int, char**) {
@@ -74,9 +75,11 @@ int main(int, char**) {
     static_assert(!cuda::std::is_invocable<const int&>::value, "");
     static_assert(!cuda::std::is_invocable<int&&>::value, "");
 
+#ifdef _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_invocable<cuda::std::vector<int> >::value, "");
     static_assert(!cuda::std::is_invocable<cuda::std::vector<int*> >::value, "");
     static_assert(!cuda::std::is_invocable<cuda::std::vector<int**> >::value, "");
+#endif // _LIBCUDACXX_HAS_VECTOR
 
     static_assert(!cuda::std::is_invocable<AbominableFunc>::value, "");
 
@@ -109,9 +112,11 @@ int main(int, char**) {
     static_assert(!cuda::std::is_invocable_r<int, const int&>::value, "");
     static_assert(!cuda::std::is_invocable_r<int, int&&>::value, "");
 
+#ifdef _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_invocable_r<int, cuda::std::vector<int> >::value, "");
     static_assert(!cuda::std::is_invocable_r<int, cuda::std::vector<int*> >::value, "");
     static_assert(!cuda::std::is_invocable_r<int, cuda::std::vector<int**> >::value, "");
+#endif // _LIBCUDACXX_HAS_VECTOR
     static_assert(!cuda::std::is_invocable_r<void, AbominableFunc>::value, "");
 
     //  with parameters
@@ -154,12 +159,16 @@ int main(int, char**) {
       using T = Tag*;
       using DT = DerFromTag*;
       using CT = const Tag*;
+#ifdef _LIBCUDACXX_HAS_MEMORY
       using ST = cuda::std::unique_ptr<Tag>;
+#endif // _LIBCUDACXX_HAS_MEMORY
       static_assert(cuda::std::is_invocable<Fn, T&, int>::value, "");
       static_assert(cuda::std::is_invocable<Fn, DT&, int>::value, "");
       static_assert(cuda::std::is_invocable<Fn, const T&, int>::value, "");
       static_assert(cuda::std::is_invocable<Fn, T&&, int>::value, "");
+#ifdef _LIBCUDACXX_HAS_MEMORY
       static_assert(cuda::std::is_invocable<Fn, ST, int>::value, "");
+#endif // _LIBCUDACXX_HAS_MEMORY
       static_assert(!cuda::std::is_invocable<Fn, CT&, int>::value, "");
       static_assert(!cuda::std::is_invocable<RFn, T, int>::value, "");
     }
@@ -191,12 +200,16 @@ int main(int, char**) {
       using T = Tag*;
       using DT = DerFromTag*;
       using CT = const Tag*;
+#ifdef _LIBCUDACXX_HAS_MEMORY
       using ST = cuda::std::unique_ptr<Tag>;
+#endif // _LIBCUDACXX_HAS_MEMORY
       static_assert(cuda::std::is_invocable<Fn, T&>::value, "");
       static_assert(cuda::std::is_invocable<Fn, DT&>::value, "");
       static_assert(cuda::std::is_invocable<Fn, const T&>::value, "");
       static_assert(cuda::std::is_invocable<Fn, T&&>::value, "");
+#ifdef _LIBCUDACXX_HAS_MEMORY
       static_assert(cuda::std::is_invocable<Fn, ST>::value, "");
+#endif // _LIBCUDACXX_HAS_MEMORY
       static_assert(cuda::std::is_invocable<Fn, CT&>::value, "");
     }
   }
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
index 6f486296c0..63569f089a 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp
@@ -6,85 +6,72 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++98, c++03, c++11
-
-// XFAIL: nvcc
-// FIXME: Triage and fix this.
+// UNSUPPORTED: c++03, c++11
 
 // type_traits
 
 // is_nothrow_invocable
 
 #include <cuda/std/type_traits>
-// NOTE: This header is not currently supported by libcu++.
+#include <cuda/std/functional>
+#ifdef _LIBCUDACXX_HAS_VECTOR
 #include <cuda/std/vector>
+#endif // _LIBCUDACXX_HAS_VECTOR
 
 #include "test_macros.h"
 
 struct Tag {};
 
 struct Implicit {
-  Implicit(int) noexcept {}
+  __host__ __device__ Implicit(int) noexcept {}
 };
 
 struct ThrowsImplicit {
-  ThrowsImplicit(int) {}
+  __host__ __device__ ThrowsImplicit(int) {}
 };
 
 struct Explicit {
-  explicit Explicit(int) noexcept {}
+  __host__ __device__ explicit Explicit(int) noexcept {}
 };
 
 template <bool IsNoexcept, class Ret, class... Args>
 struct CallObject {
-  Ret operator()(Args&&...) const noexcept(IsNoexcept);
+  __host__ __device__ Ret operator()(Args&&...) const noexcept(IsNoexcept);
 };
 
 struct Sink {
   template <class... Args>
-  void operator()(Args&&...) const noexcept {}
+  __host__ __device__ void operator()(Args&&...) const noexcept {}
 };
 
 template <class Fn, class... Args>
-constexpr bool throws_invocable() {
+__host__ __device__ constexpr bool throws_invocable() {
   return cuda::std::is_invocable<Fn, Args...>::value &&
          !cuda::std::is_nothrow_invocable<Fn, Args...>::value;
 }
 
 template <class Ret, class Fn, class... Args>
-constexpr bool throws_invocable_r() {
+__host__ __device__ constexpr bool throws_invocable_r() {
   return cuda::std::is_invocable_r<Ret, Fn, Args...>::value &&
          !cuda::std::is_nothrow_invocable_r<Ret, Fn, Args...>::value;
 }
 
-// FIXME(EricWF) Don't test the where noexcept is *not* part of the type system
-// once implementations have caught up.
-void test_noexcept_function_pointers() {
+__host__ __device__ void test_noexcept_function_pointers() {
   struct Dummy {
-    void foo() noexcept {}
-    static void bar() noexcept {}
+    __host__ __device__ void foo() noexcept {}
+    __host__ __device__ static void bar() noexcept {}
   };
-#if !defined(__cpp_noexcept_function_type)
-  {
-    // Check that PMF's and function pointers *work*. is_nothrow_invocable will always
-    // return false because 'noexcept' is not part of the function type.
-    static_assert(throws_invocable<decltype(&Dummy::foo), Dummy&>(), "");
-    static_assert(throws_invocable<decltype(&Dummy::bar)>(), "");
-  }
-#else
-  {
-    // Check that PMF's and function pointers actually work and that
-    // is_nothrow_invocable returns true for noexcept PMF's and function
-    // pointers.
-    static_assert(
-        cuda::std::is_nothrow_invocable<decltype(&Dummy::foo), Dummy&>::value, "");
-    static_assert(cuda::std::is_nothrow_invocable<decltype(&Dummy::bar)>::value, "");
-  }
-#endif
+  // Check that PMFs (pointers to member functions) and function pointers
+  // actually work and that is_nothrow_invocable returns true for noexcept
+  // PMFs and function pointers.
+  static_assert(cuda::std::is_nothrow_invocable<decltype(&Dummy::foo), Dummy&>::value, "");
+  static_assert(cuda::std::is_nothrow_invocable<decltype(&Dummy::bar)>::value, "");
 }
 
 int main(int, char**) {
+#if TEST_STD_VER >= 17
   using AbominableFunc = void(...) const noexcept;
+#endif // TEST_STD_VER >= 17
   //  Non-callable things
   {
     static_assert(!cuda::std::is_nothrow_invocable<void>::value, "");
@@ -113,14 +100,18 @@ int main(int, char**) {
     static_assert(!cuda::std::is_nothrow_invocable<int, cuda::std::vector<int**> >::value,
                   "");
 
+#if TEST_STD_VER >= 17
     static_assert(!cuda::std::is_nothrow_invocable<AbominableFunc>::value, "");
+#endif // TEST_STD_VER >= 17
 
     //  with parameters
     static_assert(!cuda::std::is_nothrow_invocable<int, int>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable<int, double, float>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable<int, char, float, double>::value,
                   "");
+#if TEST_STD_VER >= 17
     static_assert(!cuda::std::is_nothrow_invocable<Sink, AbominableFunc>::value, "");
+#endif // TEST_STD_VER >= 17
     static_assert(!cuda::std::is_nothrow_invocable<Sink, void>::value, "");
     static_assert(!cuda::std::is_nothrow_invocable<Sink, const volatile void>::value,
                   "");
@@ -151,8 +142,10 @@ int main(int, char**) {
                   "");
     static_assert(!cuda::std::is_nothrow_invocable_r<int, cuda::std::vector<int**> >::value,
                   "");
+#if TEST_STD_VER >= 17
     static_assert(!cuda::std::is_nothrow_invocable_r<void, AbominableFunc>::value,
                   "");
+#endif // TEST_STD_VER >= 17
 
     //  with parameters
     static_assert(!cuda::std::is_nothrow_invocable_r<int, int, int>::value, "");
@@ -160,8 +153,10 @@ int main(int, char**) {
                   "");
     static_assert(
         !cuda::std::is_nothrow_invocable_r<int, int, char, float, double>::value, "");
+#if TEST_STD_VER >= 17
     static_assert(
         !cuda::std::is_nothrow_invocable_r<void, Sink, AbominableFunc>::value, "");
+#endif // TEST_STD_VER >= 17
     static_assert(!cuda::std::is_nothrow_invocable_r<void, Sink, void>::value, "");
     static_assert(
         !cuda::std::is_nothrow_invocable_r<void, Sink, const volatile void>::value,
@@ -202,6 +197,28 @@ int main(int, char**) {
     static_assert(cuda::std::is_nothrow_invocable_r<Implicit, Fn, Tag&>::value, "");
     static_assert(throws_invocable_r<ThrowsImplicit, Fn, Tag&>(), "");
   }
+#if TEST_STD_VER >= 17
+  {
+    // Check that it's fine if the result type is non-moveable.
+    struct CantMove {
+      CantMove() = default;
+      __host__ __device__ CantMove(CantMove&&) = delete;
+    };
+
+    static_assert(!cuda::std::is_move_constructible_v<CantMove>, "");
+    static_assert(!cuda::std::is_copy_constructible_v<CantMove>, "");
+
+    using Fn = CantMove() noexcept;
+
+    static_assert(cuda::std::is_nothrow_invocable_r<CantMove, Fn>::value, "");
+    static_assert(!cuda::std::is_nothrow_invocable_r<CantMove, Fn, int>::value, "");
+
+#ifndef _LIBCUDACXX_COMPILER_MSVC_2017
+    static_assert(cuda::std::is_nothrow_invocable_r_v<CantMove, Fn>, "");
+    static_assert(!cuda::std::is_nothrow_invocable_r_v<CantMove, Fn, int>, "");
+#endif // _LIBCUDACXX_COMPILER_MSVC_2017
+  }
+#endif // TEST_STD_VER >= 17
   {
     // Check for is_nothrow_invocable_v
     using Fn = CallObject<true, int>;
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.fail.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.fail.cpp
index 7a2079503e..f435a5246e 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.fail.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.fail.cpp
@@ -8,6 +8,7 @@
 
 // Mandates: invoke result must fail to compile when used with device lambdas.
 // UNSUPPORTED: nvrtc
+// UNSUPPORTED: clang && (!nvcc)
 
 // <cuda/std/functional>
 
@@ -31,7 +32,7 @@ void test_lambda(Fn &&)
 
 int main(int, char**)
 {
-#if defined(__NVCC__) 
+#if defined(TEST_COMPILER_NVCC)
     { // extended device lambda
     test_lambda<int>([] __device__ () -> int { return 42; });
     test_lambda<double>([] __device__ () -> double { return 42.0; });
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.pass.cpp
index 7dd4e49d63..aa3f40d753 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of.pass.cpp
@@ -14,7 +14,9 @@
 #define _LIBCUDACXX_DISABLE_DEPRECATION_WARNINGS
 
 #include <cuda/std/type_traits>
-// #include <cuda/std/memory>
+#ifdef _LIBCUDACXX_HAS_MEMORY
+#include <cuda/std/memory>
+#endif // _LIBCUDACXX_HAS_MEMORY
 #include <cuda/std/cassert>
 #include <cuda/functional>
 #include "test_macros.h"
@@ -168,74 +170,80 @@ int main(int, char**)
     test_result_of<PMS0(                             S&),  int> ();
     test_result_of<PMS0(                             S*),  int> ();
     test_result_of<PMS0(                             S*&), int> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+
     test_result_of<PMS0(      cuda::std::reference_wrapper<S>),  int> ();
     test_result_of<PMS0(const cuda::std::reference_wrapper<S>&), int> ();
     test_result_of<PMS0(      cuda::std::reference_wrapper<SD>),  int> ();
     test_result_of<PMS0(const cuda::std::reference_wrapper<SD>&), int> ();
+
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS0(cuda::std::unique_ptr<S>),  int> ();
     test_result_of<PMS0(cuda::std::unique_ptr<SD>), int> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS0(const          S&)>();
     test_no_result<PMS0(volatile       S&)>();
     test_no_result<PMS0(const volatile S&)>();
     test_no_result<PMS0(ND &                           )>();
     test_no_result<PMS0(const ND&                      )>();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS0(cuda::std::unique_ptr<S const>       )>();
+    test_no_result<PMS0(cuda::std::unique_ptr<ND>            )>();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS0(cuda::std::reference_wrapper<S const>)>();
     test_no_result<PMS0(cuda::std::reference_wrapper<ND>     )>();
-    test_no_result<PMS0(cuda::std::unique_ptr<ND>            )>();
-#endif
 
     test_result_of<PMS1(                             S,   int), int*> ();
     test_result_of<PMS1(                             S&,  int), int*> ();
     test_result_of<PMS1(                             S*,  int), int*> ();
     test_result_of<PMS1(                             S*&, int), int*> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS1(cuda::std::unique_ptr<S>,               int), int*> ();
     test_result_of<PMS1(cuda::std::unique_ptr<SD>,              int), int*> ();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS1(cuda::std::reference_wrapper<S>,        int), int*> ();
     test_result_of<PMS1(const cuda::std::reference_wrapper<S>&, int), int*> ();
     test_result_of<PMS1(cuda::std::reference_wrapper<SD>,        int), int*> ();
     test_result_of<PMS1(const cuda::std::reference_wrapper<SD>&, int), int*> ();
-#endif
+
     test_no_result<PMS1(const          S&, int)>();
     test_no_result<PMS1(volatile       S&, int)>();
     test_no_result<PMS1(const volatile S&, int)>();
     test_no_result<PMS1(ND &,                            int)>();
     test_no_result<PMS1(const ND&,                       int)>();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS1(cuda::std::unique_ptr<S const>,        int)>();
+    test_no_result<PMS1(cuda::std::unique_ptr<ND>,             int)>();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS1(cuda::std::reference_wrapper<S const>, int)>();
     test_no_result<PMS1(cuda::std::reference_wrapper<ND>,      int)>();
-    test_no_result<PMS1(cuda::std::unique_ptr<ND>,             int)>();
-#endif
 
     test_result_of<PMS2(               S,   int, int), int&> ();
     test_result_of<PMS2(               S&,  int, int), int&> ();
     test_result_of<PMS2(               S*,  int, int), int&> ();
     test_result_of<PMS2(               S*&, int, int), int&> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS2(cuda::std::unique_ptr<S>, int, int), int&> ();
     test_result_of<PMS2(cuda::std::unique_ptr<SD>, int, int), int&> ();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS2(cuda::std::reference_wrapper<S>,         int, int), int&> ();
     test_result_of<PMS2(const cuda::std::reference_wrapper<S>&,  int, int), int&> ();
     test_result_of<PMS2(cuda::std::reference_wrapper<SD>,        int, int), int&> ();
     test_result_of<PMS2(const cuda::std::reference_wrapper<SD>&, int, int), int&> ();
-#endif
+
     test_no_result<PMS2(const          S&, int, int)>();
     test_no_result<PMS2(volatile       S&, int, int)>();
     test_no_result<PMS2(const volatile S&, int, int)>();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS2(cuda::std::unique_ptr<S const>,   int, int)>();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS2(cuda::std::reference_wrapper<S const>, int, int)>();
-#endif
+
     test_no_result<PMS2(const ND&,                  int, int)>();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
     test_no_result<PMS2(cuda::std::reference_wrapper<ND>, int, int)>();
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS2(cuda::std::unique_ptr<ND>,        int, int)>();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
 
     test_result_of<PMS3(S&, int), const int &>();
     test_result_of<PMS3(S&, int, long), const int &>();
@@ -251,9 +259,10 @@ int main(int, char**)
     test_result_of<PMS0C(const          S*),  int> ();
     test_result_of<PMS0C(               S*&), int> ();
     test_result_of<PMS0C(const          S*&), int> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS0C(cuda::std::unique_ptr<S>), int> ();
     test_result_of<PMS0C(cuda::std::unique_ptr<SD>), int> ();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS0C(cuda::std::reference_wrapper<S>              ), int> ();
     test_result_of<PMS0C(cuda::std::reference_wrapper<const S>        ), int> ();
     test_result_of<PMS0C(const cuda::std::reference_wrapper<S> &      ), int> ();
@@ -262,7 +271,7 @@ int main(int, char**)
     test_result_of<PMS0C(cuda::std::reference_wrapper<const SD>       ), int> ();
     test_result_of<PMS0C(const cuda::std::reference_wrapper<SD> &     ), int> ();
     test_result_of<PMS0C(const cuda::std::reference_wrapper<const SD> &), int> ();
-#endif
+
     test_no_result<PMS0C(volatile       S&)>();
     test_no_result<PMS0C(const volatile S&)>();
 
@@ -273,9 +282,9 @@ int main(int, char**)
     test_result_of<PMS1C(const          S*,  int), int*> ();
     test_result_of<PMS1C(               S*&, int), int*> ();
     test_result_of<PMS1C(const          S*&, int), int*> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS1C(cuda::std::unique_ptr<S>, int), int*> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS1C(volatile       S&, int)>();
     test_no_result<PMS1C(const volatile S&, int)>();
 
@@ -286,9 +295,9 @@ int main(int, char**)
     test_result_of<PMS2C(const          S*,  int, int), int&> ();
     test_result_of<PMS2C(               S*&, int, int), int&> ();
     test_result_of<PMS2C(const          S*&, int, int), int&> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS2C(cuda::std::unique_ptr<S>, int, int), int&> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS2C(volatile       S&, int, int)>();
     test_no_result<PMS2C(const volatile S&, int, int)>();
 
@@ -306,9 +315,9 @@ int main(int, char**)
     test_result_of<PMS0V(volatile       S*),  int> ();
     test_result_of<PMS0V(               S*&), int> ();
     test_result_of<PMS0V(volatile       S*&), int> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS0V(cuda::std::unique_ptr<S>), int> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS0V(const          S&)>();
     test_no_result<PMS0V(const volatile S&)>();
 
@@ -319,9 +328,9 @@ int main(int, char**)
     test_result_of<PMS1V(volatile       S*,  int), int*> ();
     test_result_of<PMS1V(               S*&, int), int*> ();
     test_result_of<PMS1V(volatile       S*&, int), int*> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS1V(cuda::std::unique_ptr<S>, int), int*> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS1V(const          S&, int)>();
     test_no_result<PMS1V(const volatile S&, int)>();
 
@@ -332,9 +341,9 @@ int main(int, char**)
     test_result_of<PMS2V(volatile       S*,  int, int), int&> ();
     test_result_of<PMS2V(               S*&, int, int), int&> ();
     test_result_of<PMS2V(volatile       S*&, int, int), int&> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS2V(cuda::std::unique_ptr<S>, int, int), int&> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_no_result<PMS2V(const          S&, int, int)>();
     test_no_result<PMS2V(const volatile S&, int, int)>();
 
@@ -358,9 +367,9 @@ int main(int, char**)
     test_result_of<PMS0CV(const          S*&), int> ();
     test_result_of<PMS0CV(volatile       S*&), int> ();
     test_result_of<PMS0CV(const volatile S*&), int> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS0CV(cuda::std::unique_ptr<S>), int> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
 
     test_result_of<PMS1CV(               S,   int), int*> ();
     test_result_of<PMS1CV(               S&,  int), int*> ();
@@ -375,9 +384,9 @@ int main(int, char**)
     test_result_of<PMS1CV(const          S*&, int), int*> ();
     test_result_of<PMS1CV(volatile       S*&, int), int*> ();
     test_result_of<PMS1CV(const volatile S*&, int), int*> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS1CV(cuda::std::unique_ptr<S>, int), int*> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
 
     test_result_of<PMS2CV(               S,   int, int), int&> ();
     test_result_of<PMS2CV(               S&,  int, int), int&> ();
@@ -392,9 +401,9 @@ int main(int, char**)
     test_result_of<PMS2CV(const          S*&, int, int), int&> ();
     test_result_of<PMS2CV(volatile       S*&, int, int), int&> ();
     test_result_of<PMS2CV(const volatile S*&, int, int), int&> ();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMS2CV(cuda::std::unique_ptr<S>, int, int), int&> ();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
 
     test_result_of<PMS3CV(S&, int), const int &>();
     test_result_of<PMS3CV(S&, int, long), const int &>();
@@ -414,15 +423,15 @@ int main(int, char**)
     test_result_of<PMD(SD const&), const char&>();
     test_result_of<PMD(SD*), char&>();
     test_result_of<PMD(const SD*), const char&>();
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMD(cuda::std::unique_ptr<S>), char &>();
     test_result_of<PMD(cuda::std::unique_ptr<S const>), const char&>();
+#endif // _LIBCUDACXX_HAS_MEMORY
     test_result_of<PMD(cuda::std::reference_wrapper<S>), char&>();
     test_result_of<PMD(cuda::std::reference_wrapper<S const>), const char&>();
-#endif
     test_no_result<PMD(ND&)>();
     }
-#if defined(__NVCC__)
+#if defined(TEST_COMPILER_NVCC)
     { // extended lambda
     NV_IF_TARGET(NV_IS_DEVICE,(
         test_lambda<int>([] __host__ __device__ () -> int { return 42; });
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp
index d66353b059..99b1bfbb66 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.trans/meta.trans.other/result_of11.pass.cpp
@@ -19,11 +19,16 @@
 #define _LIBCUDACXX_DISABLE_DEPRECATION_WARNINGS
 
 #include <cuda/std/type_traits>
-// #include <cuda/std/memory>
-// #include <cuda/std/utility>
+#ifdef _LIBCUDACXX_HAS_MEMORY
+#include <cuda/std/memory>
+#endif // _LIBCUDACXX_HAS_MEMORY
+#include <cuda/std/utility>
 #include "test_macros.h"
 
 TEST_NV_DIAG_SUPPRESS(3013) // a volatile function parameter is deprecated
+#ifdef TEST_COMPILER_CLANG_CUDA
+#pragma clang diagnostic ignored "-Wdeprecated-volatile"
+#endif // TEST_COMPILER_CLANG_CUDA
 
 struct wat
 {
@@ -101,17 +106,17 @@ int main(int, char**)
     test_result_of_imp<PMD(FD volatile       ), char &&>();
     test_result_of_imp<PMD(FD const volatile ), char &&>();
 
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
+#if defined(_LIBCUDACXX_HAS_MEMORY)
     test_result_of_imp<PMD(cuda::std::unique_ptr<F>),        char &>();
     test_result_of_imp<PMD(cuda::std::unique_ptr<F const>),  const char &>();
     test_result_of_imp<PMD(cuda::std::unique_ptr<FD>),       char &>();
     test_result_of_imp<PMD(cuda::std::unique_ptr<FD const>), const char &>();
+#endif // _LIBCUDACXX_HAS_MEMORY
 
     test_result_of_imp<PMD(cuda::std::reference_wrapper<F>),        char &>();
     test_result_of_imp<PMD(cuda::std::reference_wrapper<F const>),  const char &>();
     test_result_of_imp<PMD(cuda::std::reference_wrapper<FD>),       char &>();
     test_result_of_imp<PMD(cuda::std::reference_wrapper<FD const>), const char &>();
-#endif
     }
     {
     test_result_of_imp<int (F::* (F       &)) ()                &, int> ();
@@ -176,12 +181,12 @@ int main(int, char**)
     test_result_of_imp<int (F::* (FD const volatile )) () const volatile &&, int> ();
     }
     {
-#if !(defined(__NVCC__) || defined(__CUDACC_RTC__))
     test_result_of_imp<int (F::* (cuda::std::reference_wrapper<F>))       (),       int>();
     test_result_of_imp<int (F::* (cuda::std::reference_wrapper<const F>)) () const, int>();
+#ifdef _LIBCUDACXX_HAS_MEMORY
     test_result_of_imp<int (F::* (cuda::std::unique_ptr<F>       ))       (),       int>();
     test_result_of_imp<int (F::* (cuda::std::unique_ptr<const F> ))       () const, int>();
-#endif
+#endif // _LIBCUDACXX_HAS_MEMORY
     }
     test_result_of_imp<decltype(&wat::foo)(wat), void>();
 
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp
index 408f8e7e6f..28dac6da10 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable_include_order.pass.cpp
@@ -10,7 +10,7 @@
 
 // is_swappable
 
-// XFAIL: nvcc
+// XFAIL: nvcc && (!nvrtc) && (!c++20) && (!clang-16) && (!gcc-12)
 
 // If we're just building the test and not executing it, it should pass.
 // UNSUPPORTED: no_execute
@@ -23,7 +23,8 @@
 // This test checks that (1) and (2) see forward declarations
 // for (3).
 #include <cuda/std/type_traits>
-#include <cuda/std/algorithm>
+//#include <cuda/std/algorithm>
+#include <cuda/std/array>
 #include <cuda/std/utility>
 
 #include "test_macros.h"
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/date.time/ctime.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/date.time/ctime.pass.cpp
index 89c9c4efe8..e5c564312e 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/date.time/ctime.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/date.time/ctime.pass.cpp
@@ -51,7 +51,9 @@ int main(int, char**)
     unused(tmspec); // Prevent unused warning
 #endif
 
+#ifndef TEST_COMPILER_CLANG_CUDA
     static_assert((cuda::std::is_same<decltype(cuda::std::clock()), cuda::std::clock_t>::value), "");
+#endif // TEST_COMPILER_CLANG_CUDA
     static_assert((cuda::std::is_same<decltype(cuda::std::difftime(t,t)), double>::value), "");
     static_assert((cuda::std::is_same<decltype(cuda::std::mktime(&tm)), cuda::std::time_t>::value), "");
     static_assert((cuda::std::is_same<decltype(cuda::std::time(&t)), cuda::std::time_t>::value), "");
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/minus.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/minus.pass.cpp
index 8ee30c387b..609d250946 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/minus.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/minus.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++98, c++03, c++11
+// UNSUPPORTED: clang && (!nvcc)
 // XFAIL: *
 
 // <chrono>
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_month.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_month.pass.cpp
index 9b93078d65..cd7965fa4c 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_month.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_month.pass.cpp
@@ -39,7 +39,6 @@ int main(int, char**)
 {
     using year               = cuda::std::chrono::year;
     using month              = cuda::std::chrono::month;
-    using weekday            = cuda::std::chrono::weekday;
     using weekday_indexed    = cuda::std::chrono::weekday_indexed;
     using year_month_weekday = cuda::std::chrono::year_month_weekday;
     using months             = cuda::std::chrono::months;
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_year.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_year.pass.cpp
index 3b486988e4..33948e7f17 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_year.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwd/time.cal.ymwd.members/plus_minus_equal_year.pass.cpp
@@ -38,8 +38,6 @@ constexpr bool testConstexpr(D d1)
 int main(int, char**)
 {
     using year               = cuda::std::chrono::year;
-    using month              = cuda::std::chrono::month;
-    using weekday            = cuda::std::chrono::weekday;
     using weekday_indexed    = cuda::std::chrono::weekday_indexed;
     using year_month_weekday = cuda::std::chrono::year_month_weekday;
     using years              = cuda::std::chrono::years;
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.members/op_sys_days.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.members/op_sys_days.pass.cpp
index a9bbbf4eb0..a18d25e311 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.members/op_sys_days.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.members/op_sys_days.pass.cpp
@@ -24,11 +24,9 @@
 int main(int, char**)
 {
     using year                    = cuda::std::chrono::year;
-    using month                   = cuda::std::chrono::month;
     using year_month_weekday_last = cuda::std::chrono::year_month_weekday_last;
     using sys_days                = cuda::std::chrono::sys_days;
     using days                    = cuda::std::chrono::days;
-    using weekday                 = cuda::std::chrono::weekday;
     using weekday_last            = cuda::std::chrono::weekday_last;
 
     ASSERT_NOEXCEPT(                    static_cast<sys_days>(cuda::std::declval<const year_month_weekday_last>()));
diff --git a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/plus.pass.cpp b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/plus.pass.cpp
index 9c0ff4c9be..5d730546fb 100644
--- a/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/plus.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/std/utilities/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/plus.pass.cpp
@@ -53,7 +53,6 @@ int main(int, char**)
 {
     using year                    = cuda::std::chrono::year;
     using month                   = cuda::std::chrono::month;
-    using weekday                 = cuda::std::chrono::weekday;
     using weekday_last            = cuda::std::chrono::weekday_last;
     using year_month_weekday_last = cuda::std::chrono::year_month_weekday_last;
     using years                   = cuda::std::chrono::years;
diff --git a/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h b/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h
index 5e58080d51..cf145267ff 100644
--- a/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h
+++ b/libcudacxx/.upstream-tests/test/support/cuda_space_selector.h
@@ -16,6 +16,10 @@
 
 #include "concurrent_agents.h"
 
+#if defined(__clang__) && defined(__CUDA__)
+# include <new>
+#endif
+
 #ifdef _LIBCUDACXX_COMPILER_NVRTC
 #define LAMBDA [=]
 #else
@@ -93,7 +97,7 @@ struct device_shared_memory_provider {
 
     __device__
     T * get() {
-        __shared__ alignas(T) char buffer[shared_offset];
+        alignas(T) __shared__ char buffer[shared_offset];
         return reinterpret_cast<T *>(buffer + prefix_size);
     }
 };
diff --git a/libcudacxx/.upstream-tests/test/support/test_macros.h b/libcudacxx/.upstream-tests/test/support/test_macros.h
index ea30effa3b..3d77a42137 100644
--- a/libcudacxx/.upstream-tests/test/support/test_macros.h
+++ b/libcudacxx/.upstream-tests/test/support/test_macros.h
@@ -86,6 +86,10 @@
 // compiler.
 # define TEST_COMPILER_NVCC
 # define TEST_COMPILER_EDG
+#elif defined(_NVHPC_CUDA)
+#  define TEST_COMPILER_NVHPC_CUDA
+#elif defined(__CUDA__) && defined(_LIBCUDACXX_COMPILER_CLANG)
+#  define TEST_COMPILER_CLANG_CUDA
 #endif
 
 #if defined(__apple_build_version__)
@@ -398,7 +402,9 @@ constexpr bool unused(T &&) {return true;}
 // Define a helper macro to properly suppress warnings
 #define _TEST_TOSTRING2(x) #x
 #define _TEST_TOSTRING(x) _TEST_TOSTRING2(x)
-#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#if defined(TEST_COMPILER_CLANG_CUDA)
+# define TEST_NV_DIAG_SUPPRESS(WARNING)
+#elif defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
 #if defined (TEST_COMPILER_MSVC)
 # define TEST_NV_DIAG_SUPPRESS(WARNING) __pragma(_TEST_TOSTRING(nv_diag_suppress WARNING))
 #else // ^^^ MSVC ^^^ / vvv not MSVC vvv
diff --git a/libcudacxx/.upstream-tests/utils/libcudacxx/compiler.py b/libcudacxx/.upstream-tests/utils/libcudacxx/compiler.py
index 32b1323a8a..430a573dd6 100644
--- a/libcudacxx/.upstream-tests/utils/libcudacxx/compiler.py
+++ b/libcudacxx/.upstream-tests/utils/libcudacxx/compiler.py
@@ -147,6 +147,9 @@ def _initTypeAndVersion(self):
         if self.type == 'nvcc':
             # Treat C++ as CUDA when the compiler is NVCC.
             self.source_lang = 'cu'
+        elif self.type == 'clang':
+            # Treat C++ as CUDA (clang-cuda) when the compiler is Clang.
+            self.source_lang = 'cu'
 
     def _basicCmd(self, source_files, out, mode=CM_Default, flags=[],
                   input_is_cxx=False):
diff --git a/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py b/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
index fd5790e767..fcc98acf98 100644
--- a/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
+++ b/libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
@@ -628,12 +628,17 @@ def configure_compile_flags(self):
             self.config.available_features.add("nvrtc")
         if self.cxx.type == 'nvcc':
             self.cxx.compile_flags += ['--extended-lambda']
+        real_arch_format = '-gencode=arch=compute_{0},code=sm_{0}'
+        virt_arch_format = '-gencode=arch=compute_{0},code=compute_{0}'
+        if self.cxx.type == 'clang':
+            real_arch_format = '--cuda-gpu-arch=sm_{0}'
+            virt_arch_format = '--cuda-gpu-arch=compute_{0}'
         pre_sm_32 = True
         pre_sm_60 = True
         pre_sm_70 = True
         pre_sm_80 = True
         pre_sm_90 = True
-        if compute_archs and self.cxx.type == 'nvcc':
+        if compute_archs and (self.cxx.type == 'nvcc' or self.cxx.type == 'clang'):
             pre_sm_32 = False
             pre_sm_60 = False
             pre_sm_70 = False
@@ -654,10 +659,9 @@ def configure_compile_flags(self):
                 if arch < 70: pre_sm_70 = True
                 if arch < 80: pre_sm_80 = True
                 if arch < 90: pre_sm_90 = True
+                arch_flag = real_arch_format.format(arch)
                 if mode.count("virtual"):
-                    arch_flag = '-gencode=arch=compute_{0},code=compute_{0}'.format(arch)
-                else:
-                    arch_flag = '-gencode=arch=compute_{0},code=sm_{0}'.format(arch)
+                    arch_flag = virt_arch_format.format(arch)
                 self.cxx.compile_flags += [arch_flag]
         if pre_sm_32:
             self.config.available_features.add("pre-sm-32")
@@ -820,8 +824,9 @@ def configure_compile_flags_header_includes(self):
                                  and self.cxx_stdlib_under_test != 'libc++'):
             self.lit_config.note('using the system cxx headers')
             return
-        if self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':
-            self.cxx.compile_flags += ['-nostdinc++']
+        # NOTE(review): '-nostdinc++' is intentionally no longer passed: dropping it lets clang-cuda compile, and libcudacxx is presumably only built in CUDA modes (TODO confirm).
+        # if self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':
+        #    self.cxx.compile_flags += ['-nostdinc++']
         if cxx_headers is None:
             cxx_headers = os.path.join(self.libcudacxx_src_root, 'include')
         if not os.path.isdir(cxx_headers):
@@ -1063,17 +1068,6 @@ def configure_link_flags_cxx_library(self):
             self.cxx.link_flags += ['-lc++experimental']
         if self.link_shared:
             self.cxx.link_flags += ['-lc++']
-        elif self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':
-            cxx_library_root = self.get_lit_conf('cxx_library_root')
-            if cxx_library_root:
-                libname = self.make_static_lib_name('c++')
-                abs_path = os.path.join(cxx_library_root, libname)
-                assert os.path.exists(abs_path) and \
-                       "static libc++ library does not exist"
-                self.cxx.link_flags += [abs_path]
-            else:
-                self.cxx.link_flags += ['-lc++']
-
     def configure_link_flags_abi_library(self):
         cxx_abi = self.get_lit_conf('cxx_abi', 'libcxxabi')
         if cxx_abi == 'libstdc++':
@@ -1175,7 +1169,8 @@ def addIfHostSupports(flag):
                 addIfHostSupports('-Wall')
                 addIfHostSupports('-Wextra')
                 addIfHostSupports('-Werror')
-                addIfHostSupports('-Wno-literal-suffix') # GCC warning about reserved UDLs
+                if 'gcc' in self.config.available_features:
+                    addIfHostSupports('-Wno-literal-suffix') # GCC warning about reserved UDLs
                 addIfHostSupports('-Wno-user-defined-literals') # Clang warning about reserved UDLs
                 addIfHostSupports('-Wno-unused-parameter')
                 addIfHostSupports('-Wno-unused-local-typedefs') # GCC warning local typdefs
diff --git a/libcudacxx/CMakeLists.txt b/libcudacxx/CMakeLists.txt
index 784c57aa02..a5343d53db 100644
--- a/libcudacxx/CMakeLists.txt
+++ b/libcudacxx/CMakeLists.txt
@@ -1,4 +1,5 @@
 # 3.15 is the minimum for including the project with add_subdirectory.
+# 3.18 is the minimum for C++17 with CUDA and for clang-cuda support.
 # 3.21 is the minimum for the developer build.
 cmake_minimum_required(VERSION 3.15)
 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config
index 7abb0267fa..de76e3ccad 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config
@@ -864,7 +864,7 @@ extern "C++" {
 
 #define _LIBCUDACXX_TOSTRING2(_STR) #_STR
 #define _LIBCUDACXX_TOSTRING(_STR) _LIBCUDACXX_TOSTRING2(_STR)
-#if defined(_LIBCUDACXX_CUDACC)
+#if defined(_LIBCUDACXX_CUDACC) && !defined(_LIBCUDACXX_COMPILER_CLANG_CUDA)
 #  if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
 #    if defined(_LIBCUDACXX_COMPILER_MSVC)
 #      define _LIBCUDACXX_NV_DIAG_SUPPRESS(_WARNING) __pragma(_LIBCUDACXX_TOSTRING(nv_diag_suppress _WARNING))
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index c4354ec020..7727ba7521 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -222,11 +222,11 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity;
                 int __inc = __popc(__active) * __update;
 
                 unsigned __laneid;
-                asm ("mov.u32 %0, %laneid;" : "=r"(__laneid));
+                asm ("mov.u32 %0, %%laneid;" : "=r"(__laneid));
                 int __leader = __ffs(__active) - 1;
                 // All threads in mask synchronize here, establishing cummulativity to the __leader:
                 __syncwarp(__mask);
-                if(__leader == __laneid)
+                if(__leader == static_cast<int>(__laneid))
                 {
                     __token = __barrier.arrive(__inc);
                 }
@@ -356,9 +356,9 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity;
         NV_DISPATCH_TARGET(
             NV_PROVIDES_SM_80, (
                 asm volatile ("{"
-                    ".reg .pred %p;"
-                    "mbarrier.test_wait.parity.shared.b64 %p, [%1], %2;"
-                    "selp.u16 %0, 1, 0, %p;"
+                    ".reg .pred %%p;"
+                    "mbarrier.test_wait.parity.shared.b64 %%p, [%1], %2;"
+                    "selp.u16 %0, 1, 0, %%p;"
                     "}"
                     : "=h"(__ready)
                     : "r"(static_cast<uint32_t>(__cvta_generic_to_shared(&__barrier))),
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__expected/unexpected.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__expected/unexpected.h
index 0f8770c800..297bebed1d 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__expected/unexpected.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__expected/unexpected.h
@@ -97,23 +97,23 @@ class unexpected {
   constexpr unexpected& operator=(unexpected&&)      = default;
 
   // [expected.un.obs]
-  _LIBCUDACXX_INLINE_VISIBILITY
-  _LIBCUDACXX_NODISCARD_ATTRIBUTE constexpr const _Err& error() const& noexcept {
+  _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
+  constexpr const _Err& error() const& noexcept {
     return __unex_;
   }
 
-  _LIBCUDACXX_INLINE_VISIBILITY
-  _LIBCUDACXX_NODISCARD_ATTRIBUTE constexpr _Err& error() & noexcept {
+  _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
+  constexpr _Err& error() & noexcept {
     return __unex_;
   }
 
-  _LIBCUDACXX_INLINE_VISIBILITY
-  _LIBCUDACXX_NODISCARD_ATTRIBUTE constexpr const _Err&& error() const&& noexcept {
+  _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
+  constexpr const _Err&& error() const&& noexcept {
     return _CUDA_VSTD::move(__unex_);
   }
 
-  _LIBCUDACXX_INLINE_VISIBILITY
-  _LIBCUDACXX_NODISCARD_ATTRIBUTE constexpr _Err&& error() && noexcept {
+  _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
+  constexpr _Err&& error() && noexcept {
     return _CUDA_VSTD::move(__unex_);
   }
 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__functional/reference_wrapper.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__functional/reference_wrapper.h
index d9308b668a..a9cb9a37da 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__functional/reference_wrapper.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__functional/reference_wrapper.h
@@ -60,7 +60,7 @@ class _LIBCUDACXX_TEMPLATE_VIS reference_wrapper : public __weak_result_type<_Tp
 
 #if _LIBCUDACXX_STD_VER > 14 && !defined(_LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES)
 template <class _Tp>
-reference_wrapper(_Tp&) -> reference_wrapper<_Tp>;
+_LIBCUDACXX_HOST_DEVICE reference_wrapper(_Tp&) -> reference_wrapper<_Tp>;
 #endif
 
 template <class _Tp>
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/extents.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/extents.h
index 1f82f3a389..a827c49ce5 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/extents.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/extents.h
@@ -534,7 +534,7 @@ using dextents = typename __detail::__make_dextents<_IndexType, _Rank>::type;
 
 #if defined(__MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION)
 template <class... _IndexTypes>
-extents(_IndexTypes...)
+_LIBCUDACXX_HOST_DEVICE extents(_IndexTypes...)
   // Workaround for nvcc
   //-> extents<size_t, __detail::__make_dynamic_extent<_IndexTypes>()...>;
   // Adding "(void)" so that clang doesn't complain this is unused
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/mdspan.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/mdspan.h
index f5323a72ce..37cc94c732 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/mdspan.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__mdspan/mdspan.h
@@ -421,41 +421,41 @@ __MDSPAN_TEMPLATE_REQUIRES(
   /* requires */ __MDSPAN_FOLD_AND(_LIBCUDACXX_TRAIT(is_integral, _SizeTypes) /* && ... */) &&
   (sizeof...(_SizeTypes) > 0)
 )
-explicit mdspan(_ElementType*, _SizeTypes...)
+_LIBCUDACXX_HOST_DEVICE explicit mdspan(_ElementType*, _SizeTypes...)
   -> mdspan<_ElementType, dextents<size_t, sizeof...(_SizeTypes)>>;
 
 __MDSPAN_TEMPLATE_REQUIRES(
   class _Pointer,
   (_LIBCUDACXX_TRAIT(is_pointer, _CUDA_VSTD::remove_reference_t<_Pointer>))
 )
-mdspan(_Pointer&&) -> mdspan<_CUDA_VSTD::remove_pointer_t<_CUDA_VSTD::remove_reference_t<_Pointer>>, extents<size_t>>;
+_LIBCUDACXX_HOST_DEVICE  mdspan(_Pointer&&) -> mdspan<_CUDA_VSTD::remove_pointer_t<_CUDA_VSTD::remove_reference_t<_Pointer>>, extents<size_t>>;
 __MDSPAN_TEMPLATE_REQUIRES(
   class _CArray,
   (_LIBCUDACXX_TRAIT(is_array, _CArray) && (rank_v<_CArray> == 1))
 )
-mdspan(_CArray&) -> mdspan<_CUDA_VSTD::remove_all_extents_t<_CArray>, extents<size_t, _CUDA_VSTD::extent_v<_CArray,0>>>;
+_LIBCUDACXX_HOST_DEVICE mdspan(_CArray&) -> mdspan<_CUDA_VSTD::remove_all_extents_t<_CArray>, extents<size_t, _CUDA_VSTD::extent_v<_CArray,0>>>;
 
 template <class _ElementType, class _SizeType, size_t _Np>
-mdspan(_ElementType*, const _CUDA_VSTD::array<_SizeType, _Np>&)
+_LIBCUDACXX_HOST_DEVICE mdspan(_ElementType*, const _CUDA_VSTD::array<_SizeType, _Np>&)
   -> mdspan<_ElementType, dextents<size_t, _Np>>;
 
 template <class _ElementType, class _SizeType, size_t _Np>
-mdspan(_ElementType*, _CUDA_VSTD::span<_SizeType, _Np>)
+_LIBCUDACXX_HOST_DEVICE mdspan(_ElementType*, _CUDA_VSTD::span<_SizeType, _Np>)
   -> mdspan<_ElementType, dextents<size_t, _Np>>;
 
 // This one is necessary because all the constructors take `data_handle_type`s, not
 // `_ElementType*`s, and `data_handle_type` is taken from `accessor_type::data_handle_type`, which
 // seems to throw off automatic deduction guides.
 template <class _ElementType, class _SizeType, size_t... _ExtentsPack>
-mdspan(_ElementType*, const extents<_SizeType, _ExtentsPack...>&)
+_LIBCUDACXX_HOST_DEVICE mdspan(_ElementType*, const extents<_SizeType, _ExtentsPack...>&)
   -> mdspan<_ElementType, extents<_SizeType, _ExtentsPack...>>;
 
 template <class _ElementType, class _MappingType>
-mdspan(_ElementType*, const _MappingType&)
+_LIBCUDACXX_HOST_DEVICE mdspan(_ElementType*, const _MappingType&)
   -> mdspan<_ElementType, typename _MappingType::extents_type, typename _MappingType::layout_type>;
 
 template <class _MappingType, class _AccessorType>
-mdspan(const typename _AccessorType::data_handle_type, const _MappingType&, const _AccessorType&)
+_LIBCUDACXX_HOST_DEVICE mdspan(const typename _AccessorType::data_handle_type, const _MappingType&, const _AccessorType&)
   -> mdspan<typename _AccessorType::element_type, typename _MappingType::extents_type, typename _MappingType::layout_type, _AccessorType>;
 #endif
 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__memory/construct_at.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__memory/construct_at.h
index 44b747efd9..a0577fac22 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__memory/construct_at.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__memory/construct_at.h
@@ -27,6 +27,10 @@
 #include "../__utility/forward.h"
 #include "../__utility/move.h"
 
+#ifdef _LIBCUDACXX_COMPILER_CLANG_CUDA
+#include <new>
+#endif // _LIBCUDACXX_COMPILER_CLANG_CUDA
+
 #if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER)
 #pragma GCC system_header
 #endif
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__utility/pair.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__utility/pair.h
index bc6869ee4e..c4f66a6207 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__utility/pair.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__utility/pair.h
@@ -564,7 +564,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair
 
 #if _LIBCUDACXX_STD_VER > 14 && !defined(_LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES)
 template<class _T1, class _T2>
-pair(_T1, _T2) -> pair<_T1, _T2>;
+_LIBCUDACXX_HOST_DEVICE pair(_T1, _T2) -> pair<_T1, _T2>;
 #endif // _LIBCUDACXX_STD_VER > 14 && !defined(_LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES)
 
 // [pairs.spec], specialized algorithms
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/array b/libcudacxx/include/cuda/std/detail/libcxx/include/array
index 0e9b9d7e64..73f2b2ca86 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/array
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/array
@@ -394,8 +394,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS array<_Tp, 0>
 template<class _Tp, class... _Args,
          class = __enable_if_t<(is_same_v<_Tp, _Args> && ...), void>
          >
-array(_Tp, _Args...)
-  -> array<_Tp, 1 + sizeof...(_Args)>;
+_LIBCUDACXX_HOST_DEVICE array(_Tp, _Args...) -> array<_Tp, 1 + sizeof...(_Args)>;
 #endif
 
 template <class _Tp, size_t _Size>
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic b/libcudacxx/include/cuda/std/detail/libcxx/include/atomic
index 37e3639027..45c2a5a558 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/atomic
@@ -684,7 +684,9 @@ typedef enum memory_order {
 
 template <typename _Tp> _LIBCUDACXX_INLINE_VISIBILITY
 bool __cxx_nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) {
-#if defined(_LIBCUDACXX_COMPILER_NVCC) || defined(_LIBCUDACXX_COMPILER_NVRTC)
+#if defined(_LIBCUDACXX_COMPILER_NVCC) \
+ || defined(_LIBCUDACXX_COMPILER_NVRTC) \
+ || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA) // clang-cuda also compares with operator== instead of memcmp
     return __lhs == __rhs;
 #else
     return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0;
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier
index 2f0afb3d2b..ab4c0fb851 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier
@@ -302,7 +302,7 @@ public:
 
     _LIBCUDACXX_INLINE_VISIBILITY
     __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
-        : __phase(false), __expected(__expected), __arrived(__expected), __completion(__completion)
+        : __expected(__expected), __arrived(__expected), __completion(__completion), __phase(false) // NOTE(review): __phase moved last, presumably to match member declaration order - confirm
     {
     }
 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath
index ab25d7585e..2855220e72 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath
@@ -674,7 +674,7 @@ __constexpr_isfinite(_A1 __lcpp_x) noexcept
     return isfinite(__lcpp_x);
 }
 
-#if defined(_MSC_VER) || defined(__CUDACC_RTC__)
+#if defined(_MSC_VER) || defined(__CUDACC_RTC__) || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA) // clang-cuda shares the MSVC/NVRTC __constexpr_copysign path
 template <class _A1>
 _LIBCUDACXX_INLINE_VISIBILITY
 _A1 __constexpr_copysign(_A1 __x, _A1 __y) noexcept
@@ -708,7 +708,7 @@ _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 typename __enable_if_t<is_arithmetic<_A1>::val
 }
 #endif // !_MSC_VER
 
-#if defined(_MSC_VER) || defined(__CUDACC_RTC__)
+#if defined(_MSC_VER) || defined(__CUDACC_RTC__) || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA) // clang-cuda shares the MSVC/NVRTC __constexpr_fabs path
 template <class _A1>
 _LIBCUDACXX_INLINE_VISIBILITY
 _A1 __constexpr_fabs(_A1 __x) noexcept
@@ -738,7 +738,7 @@ _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 double __constexpr_fabs(_Tp __x) noexcept {
 }
 #endif // !_MSC_VER
 
-#if defined(_MSC_VER) || defined(__CUDACC_RTC__)
+#if defined(_MSC_VER) || defined(__CUDACC_RTC__) || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA) // clang-cuda shares the MSVC/NVRTC __constexpr_fmax path
 template <class _A1>
 _LIBCUDACXX_INLINE_VISIBILITY
 _A1 __constexpr_fmax(_A1 __x, _A1 __y) noexcept
@@ -797,7 +797,7 @@ __constexpr_fmax(_Tp __x, _Up __y) noexcept {
 }
 #endif // !_MSC_VER
 
-#if defined(_MSC_VER) || defined(__CUDACC_RTC__)
+#if defined(_MSC_VER) || defined(__CUDACC_RTC__) || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA) // clang-cuda shares the MSVC/NVRTC __constexpr_logb path
 template <class _A1>
 _LIBCUDACXX_INLINE_VISIBILITY
 _A1 __constexpr_logb(_A1 __x)
@@ -834,7 +834,7 @@ _LIBCUDACXX_CONSTEXPR_AFTER_CXX11_COMPLEX _Tp __constexpr_logb(_Tp __x) {
 }
 #endif // !_MSVC
 
-#if defined(_MSC_VER) || defined(__CUDACC_RTC__)
+#if defined(_MSC_VER) || defined(__CUDACC_RTC__) || defined(_LIBCUDACXX_COMPILER_CLANG_CUDA) // clang-cuda shares the MSVC/NVRTC __constexpr_scalbn path
 template <class _Tp>
 inline _LIBCUDACXX_INLINE_VISIBILITY
 _Tp __constexpr_scalbn(_Tp __x, int __i) {
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/cstdlib b/libcudacxx/include/cuda/std/detail/libcxx/include/cstdlib
index cd6768e0e2..f1659f5755 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/cstdlib
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/cstdlib
@@ -98,8 +98,9 @@ void *aligned_alloc(size_t alignment, size_t size);                       // C11
 #pragma GCC system_header
 #endif
 
-
-#ifdef __CUDA_ARCH__
+#if defined(_LIBCUDACXX_COMPILER_CLANG_CUDA) // checked before __CUDA_ARCH__, so clang-cuda uses the builtin whether or not __CUDA_ARCH__ is defined
+#  define _LIBCUDACXX_UNREACHABLE() __builtin_unreachable()
+#elif defined(__CUDA_ARCH__)
 #if defined(_LIBCUDACXX_CUDACC_BELOW_11_2)
 #  define _LIBCUDACXX_UNREACHABLE() __trap()
 #elif defined(_LIBCUDACXX_CUDACC_BELOW_11_3)
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/optional b/libcudacxx/include/cuda/std/detail/libcxx/include/optional
index 7c911fb17e..7b1df08de3 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/optional
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/optional
@@ -1148,7 +1148,7 @@ public:
 
 #if _LIBCUDACXX_STD_VER > 14 && !defined(_LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES)
 template<class _Tp>
-    optional(_Tp) -> optional<_Tp>;
+_LIBCUDACXX_HOST_DEVICE optional(_Tp) -> optional<_Tp>; // guide now carries the host/device annotation
 #endif
 
 // Comparisons between optionals
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/span b/libcudacxx/include/cuda/std/detail/libcxx/include/span
index 60a8b74f14..999d05b463 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/span
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/span
@@ -554,19 +554,19 @@ auto as_writable_bytes(span<_Tp, _Extent> __s) noexcept
 
 //  Deduction guides
 template<class _Tp, size_t _Sz>
-    span(_Tp (&)[_Sz]) -> span<_Tp, _Sz>;
+_LIBCUDACXX_HOST_DEVICE span(_Tp (&)[_Sz]) -> span<_Tp, _Sz>; // built-in array: extent taken from the array bound
 
 template<class _Tp, size_t _Sz>
-    span(array<_Tp, _Sz>&) -> span<_Tp, _Sz>;
+_LIBCUDACXX_HOST_DEVICE span(array<_Tp, _Sz>&) -> span<_Tp, _Sz>; // mutable array: extent taken from the array size
 
 template<class _Tp, size_t _Sz>
-    span(const array<_Tp, _Sz>&) -> span<const _Tp, _Sz>;
+_LIBCUDACXX_HOST_DEVICE span(const array<_Tp, _Sz>&) -> span<const _Tp, _Sz>; // const array: element type becomes const _Tp
 
 template<class _Container>
-    span(_Container&) -> span<typename _Container::value_type>;
+_LIBCUDACXX_HOST_DEVICE span(_Container&) -> span<typename _Container::value_type>; // container: element type from value_type, extent left to span's default
 
 template<class _Container>
-    span(const _Container&) -> span<const typename _Container::value_type>;
+_LIBCUDACXX_HOST_DEVICE span(const _Container&) -> span<const typename _Container::value_type>; // const container: element type becomes const value_type
 
 #endif // _LIBCUDACXX_STD_VER > 14
 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
index 566f827e89..272a49be02 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
@@ -896,16 +896,16 @@ public:
 
 #ifndef _LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES
 template <class... _Tp>
-tuple(_Tp...) -> tuple<_Tp...>;
+_LIBCUDACXX_HOST_DEVICE tuple(_Tp...) -> tuple<_Tp...>; // element types come straight from the arguments
 template <class _Tp1, class _Tp2>
-tuple(pair<_Tp1, _Tp2>) -> tuple<_Tp1, _Tp2>;
+_LIBCUDACXX_HOST_DEVICE tuple(pair<_Tp1, _Tp2>) -> tuple<_Tp1, _Tp2>; // a pair deduces a two-element tuple
 template <class _Alloc, class... _Tp>
-tuple(allocator_arg_t, _Alloc, _Tp...) -> tuple<_Tp...>;
+_LIBCUDACXX_HOST_DEVICE tuple(allocator_arg_t, _Alloc, _Tp...) -> tuple<_Tp...>; // allocator tag and allocator add no element types
 template <class _Alloc, class _Tp1, class _Tp2>
-tuple(allocator_arg_t, _Alloc, pair<_Tp1, _Tp2>) -> tuple<_Tp1, _Tp2>;
+_LIBCUDACXX_HOST_DEVICE tuple(allocator_arg_t, _Alloc, pair<_Tp1, _Tp2>) -> tuple<_Tp1, _Tp2>; // allocator-tagged form of the pair guide
 template <class _Alloc, class... _Tp>
-tuple(allocator_arg_t, _Alloc, tuple<_Tp...>) -> tuple<_Tp...>;
-#endif
+_LIBCUDACXX_HOST_DEVICE tuple(allocator_arg_t, _Alloc, tuple<_Tp...>) -> tuple<_Tp...>; // allocator-tagged copy from another tuple keeps its element types
+#endif // !_LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES
 
 template <class... _Tp>
 inline _LIBCUDACXX_INLINE_VISIBILITY __enable_if_t<_And<__is_swappable<_Tp>...>::value, void>