Auto merge of rust-lang#127143 - matthiaskrgr:rollup-72eqqik, r=matth…

…iaskrgr Rollup of 7 pull requests Successful merges: - rust-lang#123778 (Improve autovectorization of to_lowercase / to_uppercase functions) - rust-lang#126705 (Updated docs on `#[panic_handler]` in `library/core/src/lib.rs`) - rust-lang#126876 (Add `.ignore` file to make `config.toml` searchable in vscode) - rust-lang#126906 (Small fixme in core now that split_first has no codegen issues) - rust-lang#127127 (rustdoc: update to pulldown-cmark 0.11) - rust-lang#127131 (Remove unused `rustc_trait_selection` dependencies) - rust-lang#127134 (Print `TypeId` as a `u128` for `Debug`) r? `@ghost` `@rustbot` modify labels: rollup
rust-lang-ci · Jun 30, 2024 · f9ae9f7 · f9ae9f7
2 parents 716752e + 865788e
commit f9ae9f7
Show file tree

Hide file tree

Showing 26 changed files with 526 additions and 164 deletions.
diff --git a/.ignore b/.ignore
@@ -0,0 +1,2 @@
+# Make vscode *not* count `config.toml` as ignored, so it is included in search
+!/config.toml
diff --git a/.reuse/dep5 b/.reuse/dep5
@@ -36,6 +36,7 @@ Files: compiler/*
        .gitignore
        .gitmodules
        .mailmap
+       .ignore
 Copyright: The Rust Project Developers (see https://thanks.rust-lang.org)
 License: MIT or Apache-2.0
 

diff --git a/Cargo.lock b/Cargo.lock
@@ -3141,7 +3141,19 @@ dependencies = [
  "bitflags 2.5.0",
  "getopts",
  "memchr",
- "pulldown-cmark-escape",
+ "pulldown-cmark-escape 0.10.1",
+ "unicase",
+]
+
+[[package]]
+name = "pulldown-cmark"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8746739f11d39ce5ad5c2520a9b75285310dbfe78c541ccf832d38615765aec0"
+dependencies = [
+ "bitflags 2.5.0",
+ "memchr",
+ "pulldown-cmark-escape 0.11.0",
  "unicase",
 ]
 
@@ -3151,6 +3163,12 @@ version = "0.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bd348ff538bc9caeda7ee8cad2d1d48236a1f443c1fa3913c6a02fe0043b1dd3"
 
+[[package]]
+name = "pulldown-cmark-escape"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae"
+
 [[package]]
 name = "pulldown-cmark-to-cmark"
 version = "13.0.0"
@@ -4604,7 +4622,7 @@ name = "rustc_resolve"
 version = "0.0.0"
 dependencies = [
  "bitflags 2.5.0",
- "pulldown-cmark 0.9.6",
+ "pulldown-cmark 0.11.0",
  "rustc_arena",
  "rustc_ast",
  "rustc_ast_pretty",
@@ -4760,8 +4778,6 @@ checksum = "8ba09476327c4b70ccefb6180f046ef588c26a24cf5d269a9feba316eb4f029f"
 name = "rustc_trait_selection"
 version = "0.0.0"
 dependencies = [
- "bitflags 2.5.0",
- "derivative",
  "itertools",
  "rustc_ast",
  "rustc_ast_ir",
@@ -4770,7 +4786,6 @@ dependencies = [
  "rustc_errors",
  "rustc_fluent_macro",
  "rustc_hir",
- "rustc_index",
  "rustc_infer",
  "rustc_macros",
  "rustc_middle",
@@ -4783,7 +4798,6 @@ dependencies = [
  "rustc_target",
  "rustc_transmute",
  "rustc_type_ir",
- "rustc_type_ir_macros",
  "smallvec",
  "tracing",
 ]
@@ -4887,6 +4901,7 @@ dependencies = [
  "indexmap",
  "itertools",
  "minifier",
+ "pulldown-cmark 0.9.6",
  "regex",
  "rustdoc-json-types",
  "serde",

diff --git a/compiler/rustc_resolve/Cargo.toml b/compiler/rustc_resolve/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
 [dependencies]
 # tidy-alphabetical-start
 bitflags = "2.4.1"
-pulldown-cmark = { version = "0.9.6", default-features = false }
+pulldown-cmark = { version = "0.11", features = ["html"], default-features = false }
 rustc_arena = { path = "../rustc_arena" }
 rustc_ast = { path = "../rustc_ast" }
 rustc_ast_pretty = { path = "../rustc_ast_pretty" }

diff --git a/compiler/rustc_resolve/src/rustdoc.rs b/compiler/rustc_resolve/src/rustdoc.rs
@@ -1,4 +1,6 @@
-use pulldown_cmark::{BrokenLink, CowStr, Event, LinkType, Options, Parser, Tag};
+use pulldown_cmark::{
+    BrokenLink, BrokenLinkCallback, CowStr, Event, LinkType, Options, Parser, Tag,
+};
 use rustc_ast as ast;
 use rustc_ast::util::comments::beautify_doc_string;
 use rustc_data_structures::fx::FxHashMap;
@@ -427,7 +429,9 @@ fn parse_links<'md>(doc: &'md str) -> Vec<Box<str>> {
 
     while let Some(event) = event_iter.next() {
         match event {
-            Event::Start(Tag::Link(link_type, dest, _)) if may_be_doc_link(link_type) => {
+            Event::Start(Tag::Link { link_type, dest_url, title: _, id: _ })
+                if may_be_doc_link(link_type) =>
+            {
                 if matches!(
                     link_type,
                     LinkType::Inline
@@ -441,7 +445,7 @@ fn parse_links<'md>(doc: &'md str) -> Vec<Box<str>> {
                     }
                 }
 
-                links.push(preprocess_link(&dest));
+                links.push(preprocess_link(&dest_url));
             }
             _ => {}
         }
@@ -451,8 +455,8 @@ fn parse_links<'md>(doc: &'md str) -> Vec<Box<str>> {
 }
 
 /// Collects additional data of link.
-fn collect_link_data<'input, 'callback>(
-    event_iter: &mut Parser<'input, 'callback>,
+fn collect_link_data<'input, F: BrokenLinkCallback<'input>>(
+    event_iter: &mut Parser<'input, F>,
 ) -> Option<Box<str>> {
     let mut display_text: Option<String> = None;
     let mut append_text = |text: CowStr<'_>| {

diff --git a/compiler/rustc_trait_selection/Cargo.toml b/compiler/rustc_trait_selection/Cargo.toml
@@ -5,8 +5,6 @@ edition = "2021"
 
 [dependencies]
 # tidy-alphabetical-start
-bitflags = "2.4.1"
-derivative = "2.2.0"
 itertools = "0.12"
 rustc_ast = { path = "../rustc_ast" }
 rustc_ast_ir = { path = "../rustc_ast_ir" }
@@ -15,7 +13,6 @@ rustc_data_structures = { path = "../rustc_data_structures" }
 rustc_errors = { path = "../rustc_errors" }
 rustc_fluent_macro = { path = "../rustc_fluent_macro" }
 rustc_hir = { path = "../rustc_hir" }
-rustc_index = { path = "../rustc_index" }
 rustc_infer = { path = "../rustc_infer" }
 rustc_macros = { path = "../rustc_macros" }
 rustc_middle = { path = "../rustc_middle" }
@@ -28,7 +25,6 @@ rustc_span = { path = "../rustc_span" }
 rustc_target = { path = "../rustc_target" }
 rustc_transmute = { path = "../rustc_transmute", features = ["rustc"] }
 rustc_type_ir = { path = "../rustc_type_ir" }
-rustc_type_ir_macros = { path = "../rustc_type_ir_macros" }
 smallvec = { version = "1.8.1", features = ["union", "may_dangle"] }
 tracing = "0.1"
 # tidy-alphabetical-end
diff --git a/library/alloc/benches/str.rs b/library/alloc/benches/str.rs
@@ -347,3 +347,5 @@ make_test!(rsplitn_space_char, s, s.rsplitn(10, ' ').count());
 
 make_test!(split_space_str, s, s.split(" ").count());
 make_test!(split_ad_str, s, s.split("ad").count());
+
+make_test!(to_lowercase, s, s.to_lowercase());
diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs
@@ -10,6 +10,7 @@
 use core::borrow::{Borrow, BorrowMut};
 use core::iter::FusedIterator;
 use core::mem;
+use core::mem::MaybeUninit;
 use core::ptr;
 use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher};
 use core::unicode::conversions;
@@ -367,14 +368,9 @@ impl str {
                   without modifying the original"]
     #[stable(feature = "unicode_case_mapping", since = "1.2.0")]
     pub fn to_lowercase(&self) -> String {
-        let out = convert_while_ascii(self.as_bytes(), u8::to_ascii_lowercase);
+        let (mut s, rest) = convert_while_ascii(self, u8::to_ascii_lowercase);
 
-        // Safety: we know this is a valid char boundary since
-        // out.len() is only progressed if ascii bytes are found
-        let rest = unsafe { self.get_unchecked(out.len()..) };
-
-        // Safety: We have written only valid ASCII to our vec
-        let mut s = unsafe { String::from_utf8_unchecked(out) };
+        let prefix_len = s.len();
 
         for (i, c) in rest.char_indices() {
             if c == 'Σ' {
@@ -383,8 +379,7 @@ impl str {
                 // in `SpecialCasing.txt`,
                 // so hard-code it rather than have a generic "condition" mechanism.
                 // See https://github.com/rust-lang/rust/issues/26035
-                let out_len = self.len() - rest.len();
-                let sigma_lowercase = map_uppercase_sigma(&self, i + out_len);
+                let sigma_lowercase = map_uppercase_sigma(self, prefix_len + i);
                 s.push(sigma_lowercase);
             } else {
                 match conversions::to_lower(c) {
@@ -460,14 +455,7 @@ impl str {
                   without modifying the original"]
     #[stable(feature = "unicode_case_mapping", since = "1.2.0")]
     pub fn to_uppercase(&self) -> String {
-        let out = convert_while_ascii(self.as_bytes(), u8::to_ascii_uppercase);
-
-        // Safety: we know this is a valid char boundary since
-        // out.len() is only progressed if ascii bytes are found
-        let rest = unsafe { self.get_unchecked(out.len()..) };
-
-        // Safety: We have written only valid ASCII to our vec
-        let mut s = unsafe { String::from_utf8_unchecked(out) };
+        let (mut s, rest) = convert_while_ascii(self, u8::to_ascii_uppercase);
 
         for c in rest.chars() {
             match conversions::to_upper(c) {
@@ -616,50 +604,83 @@ pub unsafe fn from_boxed_utf8_unchecked(v: Box<[u8]>) -> Box<str> {
     unsafe { Box::from_raw(Box::into_raw(v) as *mut str) }
 }
 
-/// Converts the bytes while the bytes are still ascii.
+/// Converts leading ascii bytes in `s` by calling the `convert` function.
+///
 /// For better average performance, this happens in chunks of `2*size_of::<usize>()`.
-/// Returns a vec with the converted bytes.
+///
+/// Returns a tuple of the converted prefix and the remainder starting from
+/// the first non-ascii character.
 #[inline]
 #[cfg(not(test))]
 #[cfg(not(no_global_oom_handling))]
-fn convert_while_ascii(b: &[u8], convert: fn(&u8) -> u8) -> Vec<u8> {
-    let mut out = Vec::with_capacity(b.len());
+fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &str) {
+    // Process the input in chunks of 16 bytes to enable auto-vectorization.
+    // Previously the chunk size depended on the size of `usize`,
+    // but on 32-bit platforms with sse or neon is also the better choice.
+    // The only downside on other platforms would be a bit more loop-unrolling.
+    const N: usize = 16;
+
+    let mut slice = s.as_bytes();
+    let mut out = Vec::with_capacity(slice.len());
+    let mut out_slice = out.spare_capacity_mut();
+
+    let mut ascii_prefix_len = 0_usize;
+    let mut is_ascii = [false; N];
+
+    while slice.len() >= N {
+        // Safety: checked in loop condition
+        let chunk = unsafe { slice.get_unchecked(..N) };
+        // Safety: out_slice has at least same length as input slice and gets sliced with the same offsets
+        let out_chunk = unsafe { out_slice.get_unchecked_mut(..N) };
+
+        for j in 0..N {
+            is_ascii[j] = chunk[j] <= 127;
+        }
 
-    const USIZE_SIZE: usize = mem::size_of::<usize>();
-    const MAGIC_UNROLL: usize = 2;
-    const N: usize = USIZE_SIZE * MAGIC_UNROLL;
-    const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; USIZE_SIZE]);
+        // Auto-vectorization for this check is a bit fragile, sum and comparing against the chunk
+        // size gives the best result, specifically a pmovmsk instruction on x86.
+        // There is a codegen test in `issue-123712-str-to-lower-autovectorization.rs` which should
+        // be updated when this method is changed.
+        // See also https://github.com/llvm/llvm-project/issues/96395
+        if is_ascii.iter().map(|x| *x as u8).sum::<u8>() as usize != N {
+            break;
+        }
 
-    let mut i = 0;
-    unsafe {
-        while i + N <= b.len() {
-            // Safety: we have checks the sizes `b` and `out` to know that our
-            let in_chunk = b.get_unchecked(i..i + N);
-            let out_chunk = out.spare_capacity_mut().get_unchecked_mut(i..i + N);
-
-            let mut bits = 0;
-            for j in 0..MAGIC_UNROLL {
-                // read the bytes 1 usize at a time (unaligned since we haven't checked the alignment)
-                // safety: in_chunk is valid bytes in the range
-                bits |= in_chunk.as_ptr().cast::<usize>().add(j).read_unaligned();
-            }
-            // if our chunks aren't ascii, then return only the prior bytes as init
-            if bits & NONASCII_MASK != 0 {
-                break;
-            }
+        for j in 0..N {
+            out_chunk[j] = MaybeUninit::new(convert(&chunk[j]));
+        }
 
-            // perform the case conversions on N bytes (gets heavily autovec'd)
-            for j in 0..N {
-                // safety: in_chunk and out_chunk is valid bytes in the range
-                let out = out_chunk.get_unchecked_mut(j);
-                out.write(convert(in_chunk.get_unchecked(j)));
-            }
+        ascii_prefix_len += N;
+        slice = unsafe { slice.get_unchecked(N..) };
+        out_slice = unsafe { out_slice.get_unchecked_mut(N..) };
+    }
 
-            // mark these bytes as initialised
-            i += N;
+    // handle the remainder as individual bytes
+    while slice.len() > 0 {
+        let byte = slice[0];
+        if byte > 127 {
+            break;
         }
-        out.set_len(i);
+        // Safety: out_slice has same length as input slice and gets sliced with the same offsets
+        unsafe {
+            *out_slice.get_unchecked_mut(0) = MaybeUninit::new(convert(&byte));
+        }
+        ascii_prefix_len += 1;
+        slice = unsafe { slice.get_unchecked(1..) };
+        out_slice = unsafe { out_slice.get_unchecked_mut(1..) };
     }
 
-    out
+    unsafe {
+        // SAFETY: ascii_prefix_len bytes have been initialized above
+        out.set_len(ascii_prefix_len);
+
+        // SAFETY: We have written only valid ascii to the output vec
+        let ascii_string = String::from_utf8_unchecked(out);
+
+        // SAFETY: we know this is a valid char boundary
+        // since we only skipped over leading ascii bytes
+        let rest = core::str::from_utf8_unchecked(slice);
+
+        (ascii_string, rest)
+    }
 }
diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs
@@ -1849,7 +1849,10 @@ fn to_lowercase() {
     assert_eq!("ΑΣ''Α".to_lowercase(), "ασ''α");
 
     // https://github.com/rust-lang/rust/issues/124714
+    // input lengths around the boundary of the chunk size used by the ascii prefix optimization
+    assert_eq!("abcdefghijklmnoΣ".to_lowercase(), "abcdefghijklmnoς");
     assert_eq!("abcdefghijklmnopΣ".to_lowercase(), "abcdefghijklmnopς");
+    assert_eq!("abcdefghijklmnopqΣ".to_lowercase(), "abcdefghijklmnopqς");
 
     // a really long string that has it's lowercase form
     // even longer. this tests that implementations don't assume

diff --git a/library/core/src/any.rs b/library/core/src/any.rs
@@ -602,7 +602,7 @@ impl dyn Any + Send + Sync {
 /// While `TypeId` implements `Hash`, `PartialOrd`, and `Ord`, it is worth
 /// noting that the hashes and ordering will vary between Rust releases. Beware
 /// of relying on them inside of your code!
-#[derive(Clone, Copy, Debug, Eq, PartialOrd, Ord)]
+#[derive(Clone, Copy, Eq, PartialOrd, Ord)]
 #[stable(feature = "rust1", since = "1.0.0")]
 pub struct TypeId {
     // We avoid using `u128` because that imposes higher alignment requirements on many platforms.
@@ -644,6 +644,10 @@ impl TypeId {
         let t2 = t as u64;
         TypeId { t: (t1, t2) }
     }
+
+    fn as_u128(self) -> u128 {
+        u128::from(self.t.0) << 64 | u128::from(self.t.1)
+    }
 }
 
 #[stable(feature = "rust1", since = "1.0.0")]
@@ -666,6 +670,13 @@ impl hash::Hash for TypeId {
     }
 }
 
+#[stable(feature = "rust1", since = "1.0.0")]
+impl fmt::Debug for TypeId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+        f.debug_tuple("TypeId").field(&self.as_u128()).finish()
+    }
+}
+
 /// Returns the name of a type as a string slice.
 ///
 /// # Note

diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs
@@ -34,12 +34,9 @@
 //!   Rust user code is to call the functions provided by this library instead (such as
 //!   `ptr::copy`).
 //!
-//! * `rust_begin_panic` - This function takes four arguments, a
-//!   `fmt::Arguments`, a `&'static str`, and two `u32`'s. These four arguments
-//!   dictate the panic message, the file at which panic was invoked, and the
-//!   line and column inside the file. It is up to consumers of this core
+//! * Panic handler - This function takes one argument, a `&panic::PanicInfo`. It is up to consumers of this core
 //!   library to define this panic function; it is only required to never
-//!   return. This requires a `lang` attribute named `panic_impl`.
+//!   return. You should mark your implementation using `#[panic_handler]`.
 //!
 //! * `rust_eh_personality` - is used by the failure mechanisms of the
 //!    compiler. This is often mapped to GCC's personality function, but crates