git: cadd2ca21765 - main - Merge commit d0be944aa511 from llvm-project (by Simon Pilgrim):

From: Dimitry Andric <dim_at_FreeBSD.org>
Date: Sat, 25 May 2024 20:18:03 UTC
The branch main has been updated by dim:

URL: https://cgit.FreeBSD.org/src/commit/?id=cadd2ca21765ebcb95b77ec94977b4e74e1edc1b

commit cadd2ca21765ebcb95b77ec94977b4e74e1edc1b
Author:     Dimitry Andric <dim@FreeBSD.org>
AuthorDate: 2024-05-25 17:52:15 +0000
Commit:     Dimitry Andric <dim@FreeBSD.org>
CommitDate: 2024-05-25 19:12:29 +0000

    Merge commit d0be944aa511 from llvm-project (by Simon Pilgrim):
    
      [X86] Add slow div64 tuning flag to Nehalem target (#91129)
    
      This appears to have been missed because later cpus don't inherit from Nehalem tuning much.
    
      Noticed while cleaning up for #90985
    
    Merge commit 8b400de79eff from llvm-project (by Simon Pilgrim):
    
      [X86] Enable TuningSlowDivide64 on Barcelona/Bobcat/Bulldozer/Ryzen Families (#91277)
    
      Despite most AMD cpus having a lower latency for i64 divisions that converge early, we are still better off testing for values representable as i32 and performing a i32 division if possible.
    
      All AMD cpus appear to have been missed when we added the "idivq-to-divl" attribute - this patch now matches Intel cpu behaviour (and the x86-64/v2/3/4 levels).
    
      Unfortunately the difference in code scheduling means I've had to stop using the update_llc_test_checks script and just use old-fashioned CHECK-DAG checks for divl/divq pairs.
    
      Fixes #90985
    
    This fixes possibly worse runtime performance on AMD Zen hardware, when
    using -march=znver4 (or any other znver), as opposed to -march=x86-64-v4
    or the baseline -march=x86-64. A similar fix is applied for Nehalem.
    
    PR:             278908
    MFC after:      3 days
---
 contrib/llvm-project/llvm/lib/Target/X86/X86.td | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86.td b/contrib/llvm-project/llvm/lib/Target/X86/X86.td
index e89ddcc570c9..1aff5f9fad97 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86.td
@@ -867,6 +867,7 @@ def ProcessorFeatures {
   // Nehalem
   list<SubtargetFeature> NHMFeatures = X86_64V2Features;
   list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
+                                      TuningSlowDivide64,
                                       TuningInsertVZEROUPPER,
                                       TuningNoDomainDelayMov];
 
@@ -1336,6 +1337,7 @@ def ProcessorFeatures {
                                               FeatureCMOV,
                                               FeatureX86_64];
   list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
+                                            TuningSlowDivide64,
                                             TuningSlowSHLD,
                                             TuningSBBDepBreaking,
                                             TuningInsertVZEROUPPER];
@@ -1358,6 +1360,7 @@ def ProcessorFeatures {
   list<SubtargetFeature> BtVer1Tuning = [TuningFast15ByteNOP,
                                          TuningFastScalarShiftMasks,
                                          TuningFastVectorShiftMasks,
+                                         TuningSlowDivide64,
                                          TuningSlowSHLD,
                                          TuningSBBDepBreaking,
                                          TuningInsertVZEROUPPER];
@@ -1380,6 +1383,7 @@ def ProcessorFeatures {
                                          TuningFastVectorShiftMasks,
                                          TuningFastMOVBE,
                                          TuningSBBDepBreaking,
+                                         TuningSlowDivide64,
                                          TuningSlowSHLD];
   list<SubtargetFeature> BtVer2Features =
     !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1404,6 +1408,7 @@ def ProcessorFeatures {
                                            FeatureLWP,
                                            FeatureLAHFSAHF64];
   list<SubtargetFeature> BdVer1Tuning = [TuningSlowSHLD,
+                                         TuningSlowDivide64,
                                          TuningFast11ByteNOP,
                                          TuningFastScalarShiftMasks,
                                          TuningBranchFusion,
@@ -1483,6 +1488,7 @@ def ProcessorFeatures {
                                      TuningFastScalarShiftMasks,
                                      TuningFastVariablePerLaneShuffle,
                                      TuningFastMOVBE,
+                                     TuningSlowDivide64,
                                      TuningSlowSHLD,
                                      TuningSBBDepBreaking,
                                      TuningInsertVZEROUPPER,