git: c77c1b5c4847 - stable/13 - Merge llvm-project release/13.x llvmorg-13.0.0-rc2-43-gf56129fe78d5

From: Dimitry Andric <dim_at_FreeBSD.org>
Date: Mon, 06 Dec 2021 16:33:15 UTC
The branch stable/13 has been updated by dim:

URL: https://cgit.FreeBSD.org/src/commit/?id=c77c1b5c48476e0b0f6d3f4ea9dbf2c744eb1765

commit c77c1b5c48476e0b0f6d3f4ea9dbf2c744eb1765
Author:     Dimitry Andric <dim@FreeBSD.org>
AuthorDate: 2021-09-09 20:15:13 +0000
Commit:     Dimitry Andric <dim@FreeBSD.org>
CommitDate: 2021-12-06 16:30:04 +0000

    Merge llvm-project release/13.x llvmorg-13.0.0-rc2-43-gf56129fe78d5
    
    This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
    openmp to llvmorg-13.0.0-rc2-43-gf56129fe78d5.
    
    PR:             258209
    
    (cherry picked from commit 69ade1e033e478ec426cafc0ec2104d672de294a)
---
 contrib/llvm-project/clang/lib/AST/ASTContext.cpp  |   8 +
 .../llvm-project/clang/lib/Basic/Targets/M68k.cpp  |   4 +-
 .../clang/lib/Basic/Targets/OSTargets.h            |   5 +
 contrib/llvm-project/clang/lib/Driver/Driver.cpp   |   1 -
 .../clang/lib/Driver/ToolChains/AMDGPU.cpp         |  35 +++
 .../clang/lib/Driver/ToolChains/AMDGPU.h           |   5 +
 .../clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp   |  32 ++-
 .../clang/lib/Driver/ToolChains/AMDGPUOpenMP.h     |  14 +-
 .../clang/lib/Driver/ToolChains/Clang.cpp          |   3 +-
 .../clang/lib/Driver/ToolChains/CommonArgs.cpp     |   3 +-
 .../clang/lib/Driver/ToolChains/HIP.cpp            |  33 +--
 .../clang/lib/Driver/ToolChains/OpenBSD.cpp        |   7 +
 .../lib/Headers/__clang_cuda_device_functions.h    | 276 +++++++++++++--------
 .../clang/lib/Headers/__clang_hip_cmath.h          | 188 ++++++++------
 .../clang/lib/Headers/__clang_hip_math.h           |  50 +++-
 .../__clang_openmp_device_functions.h              |  32 ++-
 .../clang/lib/Headers/openmp_wrappers/cmath        |  54 ++++
 .../clang/lib/Headers/openmp_wrappers/math.h       |  10 +
 .../clang/lib/Sema/SemaTemplateInstantiateDecl.cpp |   2 +-
 .../llvm-project/clang/lib/Sema/TreeTransform.h    |   2 +-
 .../clang/lib/Serialization/ASTReader.cpp          |   2 +
 .../compiler-rt/lib/profile/InstrProfilingFile.c   |  19 +-
 .../lib/profile/InstrProfilingPlatformFuchsia.c    |   5 +-
 .../lib/profile/InstrProfilingPlatformLinux.c      |  11 +-
 contrib/llvm-project/libcxx/include/cwctype        |   2 +
 contrib/llvm-project/libcxx/include/string         |  19 ++
 contrib/llvm-project/libcxx/include/vector         |  20 ++
 contrib/llvm-project/libcxx/include/wctype.h       |  10 +
 .../llvm-project/libunwind/src/Unwind-EHABI.cpp    |   2 +
 .../llvm/include/llvm/Analysis/LazyCallGraph.h     |   2 +-
 .../llvm/include/llvm/Analysis/LoopInfo.h          |   2 +-
 .../llvm/include/llvm/Analysis/LoopNestAnalysis.h  |   2 +-
 .../include/llvm/Analysis/TargetTransformInfo.h    |   1 -
 .../llvm/include/llvm/CodeGen/MachineFunction.h    |   2 +-
 .../llvm-project/llvm/include/llvm/IR/Function.h   |   3 +-
 contrib/llvm-project/llvm/include/llvm/IR/Module.h |   6 +-
 .../llvm/lib/Analysis/ScalarEvolution.cpp          |   2 +-
 .../llvm/lib/Analysis/TargetTransformInfo.cpp      |   1 -
 .../lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp    |   5 +-
 .../llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h |   8 +-
 .../llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp     |  31 +--
 .../llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h       |   3 -
 .../llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp      |   9 +-
 .../llvm/lib/CodeGen/HardwareLoops.cpp             |   5 +-
 .../llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  |   3 +-
 .../CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp  |   7 +-
 .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp   |   6 +-
 .../lib/CodeGen/SelectionDAG/TargetLowering.cpp    |   7 +-
 .../llvm-project/llvm/lib/Linker/LinkModules.cpp   |  22 +-
 .../llvm-project/llvm/lib/Passes/PassBuilder.cpp   |   5 +-
 .../lib/Target/AArch64/AArch64ISelLowering.cpp     |   2 +
 .../llvm/lib/Target/AArch64/SMEInstrFormats.td     |   2 +-
 .../lib/Target/M68k/AsmParser/M68kAsmParser.cpp    |  28 +--
 .../llvm/lib/Target/M68k/M68kTargetMachine.cpp     |  12 +-
 .../llvm/lib/Target/RISCV/RISCVInstrInfo.cpp       |   2 +-
 .../lib/Target/WebAssembly/WebAssemblyFastISel.cpp |  20 +-
 .../lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp    |  21 ++
 .../Transforms/Scalar/AlignmentFromAssumptions.cpp |   4 +
 .../lib/Transforms/Scalar/LoopIdiomRecognize.cpp   |   5 +
 .../llvm/tools/llvm-cov/CoverageExporterLcov.cpp   |   2 +-
 .../llvm/tools/llvm-objdump/llvm-objdump.cpp       |   9 +-
 lib/clang/include/VCSVersion.inc                   |   8 +-
 lib/clang/include/llvm/Support/VCSRevision.h       |   2 +-
 63 files changed, 752 insertions(+), 351 deletions(-)

diff --git a/contrib/llvm-project/clang/lib/AST/ASTContext.cpp b/contrib/llvm-project/clang/lib/AST/ASTContext.cpp
index fdba204fbe7f..0e163f3161a3 100644
--- a/contrib/llvm-project/clang/lib/AST/ASTContext.cpp
+++ b/contrib/llvm-project/clang/lib/AST/ASTContext.cpp
@@ -9653,11 +9653,19 @@ static QualType mergeEnumWithInteger(ASTContext &Context, const EnumType *ET,
 QualType ASTContext::mergeTypes(QualType LHS, QualType RHS,
                                 bool OfBlockPointer,
                                 bool Unqualified, bool BlockReturnType) {
+  // For C++ we will not reach this code with reference types (see below),
+  // for OpenMP variant call overloading we might.
+  //
   // C++ [expr]: If an expression initially has the type "reference to T", the
   // type is adjusted to "T" prior to any further analysis, the expression
   // designates the object or function denoted by the reference, and the
   // expression is an lvalue unless the reference is an rvalue reference and
   // the expression is a function call (possibly inside parentheses).
+  if (LangOpts.OpenMP && LHS->getAs<ReferenceType>() &&
+      RHS->getAs<ReferenceType>() && LHS->getTypeClass() == RHS->getTypeClass())
+    return mergeTypes(LHS->getAs<ReferenceType>()->getPointeeType(),
+                      RHS->getAs<ReferenceType>()->getPointeeType(),
+                      OfBlockPointer, Unqualified, BlockReturnType);
   if (LHS->getAs<ReferenceType>() || RHS->getAs<ReferenceType>())
     return {};
 
diff --git a/contrib/llvm-project/clang/lib/Basic/Targets/M68k.cpp b/contrib/llvm-project/clang/lib/Basic/Targets/M68k.cpp
index 31cb36d37636..c0cd8fa90ed6 100644
--- a/contrib/llvm-project/clang/lib/Basic/Targets/M68k.cpp
+++ b/contrib/llvm-project/clang/lib/Basic/Targets/M68k.cpp
@@ -37,8 +37,8 @@ M68kTargetInfo::M68kTargetInfo(const llvm::Triple &Triple,
   // FIXME how to wire it with the used object format?
   Layout += "-m:e";
 
-  // M68k pointers are always 32 bit wide even for 16 bit cpus
-  Layout += "-p:32:32";
+  // M68k pointers are always 32 bit wide even for 16-bit CPUs
+  Layout += "-p:32:16:32";
 
   // M68k integer data types
   Layout += "-i8:8:8-i16:16:16-i32:16:32";
diff --git a/contrib/llvm-project/clang/lib/Basic/Targets/OSTargets.h b/contrib/llvm-project/clang/lib/Basic/Targets/OSTargets.h
index e24fb5cf082d..3fe39ed64d9c 100644
--- a/contrib/llvm-project/clang/lib/Basic/Targets/OSTargets.h
+++ b/contrib/llvm-project/clang/lib/Basic/Targets/OSTargets.h
@@ -460,6 +460,11 @@ protected:
       Builder.defineMacro("_REENTRANT");
     if (this->HasFloat128)
       Builder.defineMacro("__FLOAT128__");
+
+    if (Opts.C11) {
+      Builder.defineMacro("__STDC_NO_ATOMICS__");
+      Builder.defineMacro("__STDC_NO_THREADS__");
+    }
   }
 
 public:
diff --git a/contrib/llvm-project/clang/lib/Driver/Driver.cpp b/contrib/llvm-project/clang/lib/Driver/Driver.cpp
index 5c323cb6ea23..94a7553e273b 100644
--- a/contrib/llvm-project/clang/lib/Driver/Driver.cpp
+++ b/contrib/llvm-project/clang/lib/Driver/Driver.cpp
@@ -5568,7 +5568,6 @@ llvm::StringRef clang::driver::getDriverMode(StringRef ProgName,
     if (!Arg.startswith(OptName))
       continue;
     Opt = Arg;
-    break;
   }
   if (Opt.empty())
     Opt = ToolChain::getTargetAndModeFromProgramName(ProgName).DriverMode;
diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPU.cpp b/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPU.cpp
index d63c5e12c4af..4a7413112b55 100644
--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -893,3 +893,38 @@ bool AMDGPUToolChain::shouldSkipArgument(const llvm::opt::Arg *A) const {
     return true;
   return false;
 }
+
+llvm::SmallVector<std::string, 12>
+ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs,
+                                       const std::string &GPUArch) const {
+  auto Kind = llvm::AMDGPU::parseArchAMDGCN(GPUArch);
+  const StringRef CanonArch = llvm::AMDGPU::getArchNameAMDGCN(Kind);
+
+  std::string LibDeviceFile = RocmInstallation.getLibDeviceFile(CanonArch);
+  if (LibDeviceFile.empty()) {
+    getDriver().Diag(diag::err_drv_no_rocm_device_lib) << 1 << GPUArch;
+    return {};
+  }
+
+  // If --hip-device-lib is not set, add the default bitcode libraries.
+  // TODO: There are way too many flags that change this. Do we need to check
+  // them all?
+  bool DAZ = DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero,
+                                options::OPT_fno_gpu_flush_denormals_to_zero,
+                                getDefaultDenormsAreZeroForTarget(Kind));
+  bool FiniteOnly = DriverArgs.hasFlag(
+      options::OPT_ffinite_math_only, options::OPT_fno_finite_math_only, false);
+  bool UnsafeMathOpt =
+      DriverArgs.hasFlag(options::OPT_funsafe_math_optimizations,
+                         options::OPT_fno_unsafe_math_optimizations, false);
+  bool FastRelaxedMath = DriverArgs.hasFlag(options::OPT_ffast_math,
+                                            options::OPT_fno_fast_math, false);
+  bool CorrectSqrt = DriverArgs.hasFlag(
+      options::OPT_fhip_fp32_correctly_rounded_divide_sqrt,
+      options::OPT_fno_hip_fp32_correctly_rounded_divide_sqrt);
+  bool Wave64 = isWave64(DriverArgs, Kind);
+
+  return RocmInstallation.getCommonBitcodeLibs(
+      DriverArgs, LibDeviceFile, Wave64, DAZ, FiniteOnly, UnsafeMathOpt,
+      FastRelaxedMath, CorrectSqrt);
+}
\ No newline at end of file
diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPU.h b/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPU.h
index 50ed3b3ded9a..a4bcf315ca76 100644
--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPU.h
+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPU.h
@@ -136,6 +136,11 @@ public:
   addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                         llvm::opt::ArgStringList &CC1Args,
                         Action::OffloadKind DeviceOffloadKind) const override;
+
+  // Returns a list of device library names shared by different languages
+  llvm::SmallVector<std::string, 12>
+  getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs,
+                          const std::string &GPUArch) const;
 };
 
 } // end namespace toolchains
diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
index fe1d19c2dd67..135e3694434d 100644
--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -9,12 +9,14 @@
 #include "AMDGPUOpenMP.h"
 #include "AMDGPU.h"
 #include "CommonArgs.h"
+#include "ToolChains/ROCm.h"
 #include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Options.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormatAdapters.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -84,14 +86,34 @@ static bool checkSystemForAMDGPU(const ArgList &Args, const AMDGPUToolChain &TC,
 } // namespace
 
 const char *AMDGCN::OpenMPLinker::constructLLVMLinkCommand(
-    Compilation &C, const JobAction &JA, const InputInfoList &Inputs,
-    const ArgList &Args, StringRef SubArchName,
-    StringRef OutputFilePrefix) const {
+    const toolchains::AMDGPUOpenMPToolChain &AMDGPUOpenMPTC, Compilation &C,
+    const JobAction &JA, const InputInfoList &Inputs, const ArgList &Args,
+    StringRef SubArchName, StringRef OutputFilePrefix) const {
   ArgStringList CmdArgs;
 
   for (const auto &II : Inputs)
     if (II.isFilename())
       CmdArgs.push_back(II.getFilename());
+
+  if (Args.hasArg(options::OPT_l)) {
+    auto Lm = Args.getAllArgValues(options::OPT_l);
+    bool HasLibm = false;
+    for (auto &Lib : Lm) {
+      if (Lib == "m") {
+        HasLibm = true;
+        break;
+      }
+    }
+
+    if (HasLibm) {
+      SmallVector<std::string, 12> BCLibs =
+          AMDGPUOpenMPTC.getCommonDeviceLibNames(Args, SubArchName.str());
+      llvm::for_each(BCLibs, [&](StringRef BCFile) {
+        CmdArgs.push_back(Args.MakeArgString(BCFile));
+      });
+    }
+  }
+
   // Add an intermediate output file.
   CmdArgs.push_back("-o");
   const char *OutputFileName =
@@ -180,8 +202,8 @@ void AMDGCN::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA,
   assert(Prefix.length() && "no linker inputs are files ");
 
   // Each command outputs different files.
-  const char *LLVMLinkCommand =
-      constructLLVMLinkCommand(C, JA, Inputs, Args, GPUArch, Prefix);
+  const char *LLVMLinkCommand = constructLLVMLinkCommand(
+      AMDGPUOpenMPTC, C, JA, Inputs, Args, GPUArch, Prefix);
 
   // Produce readable assembly if save-temps is enabled.
   if (C.getDriver().isSaveTempsEnabled())
diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h b/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
index effca7e212cc..233256bf7378 100644
--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
@@ -16,6 +16,10 @@
 namespace clang {
 namespace driver {
 
+namespace toolchains {
+class AMDGPUOpenMPToolChain;
+}
+
 namespace tools {
 
 namespace AMDGCN {
@@ -35,11 +39,11 @@ public:
 
 private:
   /// \return llvm-link output file name.
-  const char *constructLLVMLinkCommand(Compilation &C, const JobAction &JA,
-                                       const InputInfoList &Inputs,
-                                       const llvm::opt::ArgList &Args,
-                                       llvm::StringRef SubArchName,
-                                       llvm::StringRef OutputFilePrefix) const;
+  const char *constructLLVMLinkCommand(
+      const toolchains::AMDGPUOpenMPToolChain &AMDGPUOpenMPTC, Compilation &C,
+      const JobAction &JA, const InputInfoList &Inputs,
+      const llvm::opt::ArgList &Args, llvm::StringRef SubArchName,
+      llvm::StringRef OutputFilePrefix) const;
 
   /// \return llc output file name.
   const char *constructLlcCommand(Compilation &C, const JobAction &JA,
diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/Clang.cpp b/contrib/llvm-project/clang/lib/Driver/ToolChains/Clang.cpp
index 4a7dc3a33a5f..cb38ab51327c 100644
--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1255,7 +1255,8 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
   // If we are offloading to a target via OpenMP we need to include the
   // openmp_wrappers folder which contains alternative system headers.
   if (JA.isDeviceOffloading(Action::OFK_OpenMP) &&
-      getToolChain().getTriple().isNVPTX()){
+      (getToolChain().getTriple().isNVPTX() ||
+       getToolChain().getTriple().isAMDGCN())) {
     if (!Args.hasArg(options::OPT_nobuiltininc)) {
       // Add openmp_wrappers/* to our system include path.  This lets us wrap
       // standard library headers.
diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/CommonArgs.cpp b/contrib/llvm-project/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 83cab3ac00cb..0ffe95795381 100644
--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -775,7 +775,8 @@ void tools::linkSanitizerRuntimeDeps(const ToolChain &TC,
     CmdArgs.push_back("-ldl");
   // Required for backtrace on some OSes
   if (TC.getTriple().isOSFreeBSD() ||
-      TC.getTriple().isOSNetBSD())
+      TC.getTriple().isOSNetBSD() ||
+      TC.getTriple().isOSOpenBSD())
     CmdArgs.push_back("-lexecinfo");
 }
 
diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/HIP.cpp b/contrib/llvm-project/clang/lib/Driver/ToolChains/HIP.cpp
index 59d58aadb687..c4e840de86e1 100644
--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/HIP.cpp
+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/HIP.cpp
@@ -395,35 +395,8 @@ HIPToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const {
     }
     StringRef GpuArch = getGPUArch(DriverArgs);
     assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
-    (void)GpuArch;
-    auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch);
-    const StringRef CanonArch = llvm::AMDGPU::getArchNameAMDGCN(Kind);
-
-    std::string LibDeviceFile = RocmInstallation.getLibDeviceFile(CanonArch);
-    if (LibDeviceFile.empty()) {
-      getDriver().Diag(diag::err_drv_no_rocm_device_lib) << 1 << GpuArch;
-      return {};
-    }
 
     // If --hip-device-lib is not set, add the default bitcode libraries.
-    // TODO: There are way too many flags that change this. Do we need to check
-    // them all?
-    bool DAZ = DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero,
-                                  options::OPT_fno_gpu_flush_denormals_to_zero,
-                                  getDefaultDenormsAreZeroForTarget(Kind));
-    bool FiniteOnly =
-        DriverArgs.hasFlag(options::OPT_ffinite_math_only,
-                           options::OPT_fno_finite_math_only, false);
-    bool UnsafeMathOpt =
-        DriverArgs.hasFlag(options::OPT_funsafe_math_optimizations,
-                           options::OPT_fno_unsafe_math_optimizations, false);
-    bool FastRelaxedMath = DriverArgs.hasFlag(
-        options::OPT_ffast_math, options::OPT_fno_fast_math, false);
-    bool CorrectSqrt = DriverArgs.hasFlag(
-        options::OPT_fhip_fp32_correctly_rounded_divide_sqrt,
-        options::OPT_fno_hip_fp32_correctly_rounded_divide_sqrt);
-    bool Wave64 = isWave64(DriverArgs, Kind);
-
     if (DriverArgs.hasFlag(options::OPT_fgpu_sanitize,
                            options::OPT_fno_gpu_sanitize, false)) {
       auto AsanRTL = RocmInstallation.getAsanRTLPath();
@@ -442,10 +415,8 @@ HIPToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const {
     // Add the HIP specific bitcode library.
     BCLibs.push_back(RocmInstallation.getHIPPath().str());
 
-    // Add the generic set of libraries.
-    BCLibs.append(RocmInstallation.getCommonBitcodeLibs(
-        DriverArgs, LibDeviceFile, Wave64, DAZ, FiniteOnly, UnsafeMathOpt,
-        FastRelaxedMath, CorrectSqrt));
+    // Add common device libraries like ocml etc.
+    BCLibs.append(getCommonDeviceLibNames(DriverArgs, GpuArch.str()));
 
     // Add instrument lib.
     auto InstLib =
diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/OpenBSD.cpp b/contrib/llvm-project/clang/lib/Driver/ToolChains/OpenBSD.cpp
index e162165b2561..89828fbb6f5f 100644
--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/OpenBSD.cpp
+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/OpenBSD.cpp
@@ -174,6 +174,11 @@ void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA);
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
+    // Use the static OpenMP runtime with -static-openmp
+    bool StaticOpenMP = Args.hasArg(options::OPT_static_openmp) &&
+                        !Args.hasArg(options::OPT_static);
+    addOpenMPRuntime(CmdArgs, ToolChain, Args, StaticOpenMP);
+
     if (D.CCCIsCXX()) {
       if (ToolChain.ShouldLinkCXXStdlib(Args))
         ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs);
@@ -221,6 +226,8 @@ void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtend)));
   }
 
+  ToolChain.addProfileRTLibs(Args, CmdArgs);
+
   const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath());
   C.addCommand(std::make_unique<Command>(JA, *this,
                                          ResponseFileSupport::AtFileCurCP(),
diff --git a/contrib/llvm-project/clang/lib/Headers/__clang_cuda_device_functions.h b/contrib/llvm-project/clang/lib/Headers/__clang_cuda_device_functions.h
index f801e5426aa4..cc4e1a4dd96a 100644
--- a/contrib/llvm-project/clang/lib/Headers/__clang_cuda_device_functions.h
+++ b/contrib/llvm-project/clang/lib/Headers/__clang_cuda_device_functions.h
@@ -34,10 +34,12 @@ __DEVICE__ unsigned long long __brevll(unsigned long long __a) {
   return __nv_brevll(__a);
 }
 #if defined(__cplusplus)
-__DEVICE__ void __brkpt() { asm volatile("brkpt;"); }
+__DEVICE__ void __brkpt() { __asm__ __volatile__("brkpt;"); }
 __DEVICE__ void __brkpt(int __a) { __brkpt(); }
 #else
-__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { asm volatile("brkpt;"); }
+__DEVICE__ void __attribute__((overloadable)) __brkpt(void) {
+  __asm__ __volatile__("brkpt;");
+}
 __DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); }
 #endif
 __DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b,
@@ -507,7 +509,7 @@ __DEVICE__ float __powf(float __a, float __b) {
 }
 
 // Parameter must have a known integer value.
-#define __prof_trigger(__a) asm __volatile__("pmevent \t%0;" ::"i"(__a))
+#define __prof_trigger(__a) __asm__ __volatile__("pmevent \t%0;" ::"i"(__a))
 __DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); }
 __DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) {
   return __nv_sad(__a, __b, __c);
@@ -526,7 +528,7 @@ __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
 __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
 __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
 __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
-__DEVICE__ void __trap(void) { asm volatile("trap;"); }
+__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
 __DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
   return __nvvm_atom_add_gen_i((int *)__p, __v);
 }
@@ -1051,122 +1053,136 @@ __DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) {
 }
 __DEVICE__ unsigned int __vabs2(unsigned int __a) {
   unsigned int r;
-  asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabs4(unsigned int __a) {
   unsigned int r;
-  asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 
 __DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabsss2(unsigned int __a) {
   unsigned int r;
-  asm("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabsss4(unsigned int __a) {
   unsigned int r;
-  asm("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd2.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd2.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd4.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd4.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd2.u32.u32.u32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd2.u32.u32.u32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd4.u32.u32.u32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd4.u32.u32.u32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.eq %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
@@ -1174,7 +1190,9 @@ __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.eq %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
@@ -1182,7 +1200,9 @@ __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
@@ -1190,7 +1210,9 @@ __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
@@ -1198,7 +1220,9 @@ __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
@@ -1206,7 +1230,9 @@ __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
@@ -1214,7 +1240,9 @@ __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
@@ -1222,7 +1250,9 @@ __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
@@ -1230,7 +1260,9 @@ __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
@@ -1238,7 +1270,9 @@ __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
@@ -1246,7 +1280,9 @@ __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
@@ -1254,7 +1290,9 @@ __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
@@ -1262,7 +1300,9 @@ __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
@@ -1270,7 +1310,9 @@ __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
@@ -1278,7 +1320,9 @@ __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
@@ -1286,7 +1330,9 @@ __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
@@ -1294,7 +1340,9 @@ __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
@@ -1302,7 +1350,9 @@ __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
@@ -1310,7 +1360,9 @@ __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.ne %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
@@ -1318,7 +1370,9 @@ __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.ne %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) {
@@ -1345,94 +1399,112 @@ __DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) {
     unsigned mask = __vcmpgts2(__a, __b);
     r = (__a & mask) | (__b & ~mask);
   } else {
-    asm("vmax2.s32.s32.s32 %0,%1,%2,%3;"
-        : "=r"(r)
-        : "r"(__a), "r"(__b), "r"(0));
+    __asm__("vmax2.s32.s32.s32 %0,%1,%2,%3;"
+            : "=r"(r)
+            : "r"(__a), "r"(__b), "r"(0));
   }
   return r;
 }
 __DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmax4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmax2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmax4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 
 __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); }
 
 __DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); }
 __DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vsub2.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub2.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vnegss2(unsigned int __a) {
@@ -1440,9 +1512,9 @@ __DEVICE__ unsigned int __vnegss2(unsigned int __a) {
 }
 __DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) {
   unsigned int r;
*** 1813 LINES SKIPPED ***