Skip to content

Commit

Permalink
Split impl Debug for str into ASCII/Unicode chunks
Browse files Browse the repository at this point in the history
Instead of having a single loop that walks the string one `char` at a time,
this splits the implementation into separate loops for ASCII-only and non-ASCII (Unicode) chunks,
and uses more optimized, byte-wise code for the ASCII-only case.
  • Loading branch information
Swatinem committed May 12, 2024
1 parent 51ecc64 commit 1b43527
Showing 1 changed file with 58 additions and 16 deletions.
74 changes: 58 additions & 16 deletions library/core/src/fmt/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2399,26 +2399,68 @@ impl Display for bool {
// NOTE(review): this listing appears to be a diff hunk rendered without +/- markers and with
// indentation stripped. Lines that use `from` / `char_indices` seem to belong to the previous
// single-loop implementation and are interleaved with the new `printable_range` implementation,
// so the block is not expected to compile exactly as shown — confirm against the real file.
impl Debug for str {
// Writes the string wrapped in double quotes. Runs of bytes that need no escaping are tracked
// as a byte range into `self` and written in bulk; characters that need escaping are written
// through `escape_debug_ext`. Any error from the formatter is propagated with `?`.
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
f.write_char('"')?;
// NOTE(review): `from` is only used by the `char_indices` loop below (likely removed lines).
let mut from = 0;
for (i, c) in self.char_indices() {
// a fast path for ASCII chars that do not need escapes:
if matches!(c, ' '..='~') && !matches!(c, '\\' | '\"') {
continue;
}

let esc = c.escape_debug_ext(EscapeDebugExtArgs {
escape_grapheme_extended: true,
escape_single_quote: false,
escape_double_quote: true,
});
// If char needs escaping, flush backlog so far and write, else skip
if esc.len() != 1 {
f.write_str(&self[from..i])?;
// substring we know is printable
let mut printable_range = 0..0;

// the outer loop here splits the string into ASCII-only, and Unicode-only chunks,
// which are then processed separately, to enable a fast path for the ASCII-only chunk.
// `rest` is the not-yet-processed tail of the string, viewed as raw bytes.
let mut rest = self.as_bytes();
while rest.len() > 0 {
let mut ascii_bytes: &[u8];
let unicode_bytes: &[u8];

// first, handle an ascii-only prefix
// everything before the first byte >= 0x80 is ASCII; if there is none, the whole rest is.
let non_ascii_position = rest.iter().position(|&b| b >= 0x80).unwrap_or(rest.len());
// SAFETY: the position was derived from an iterator, so is known to be within bounds, and at a char boundary
(ascii_bytes, rest) = unsafe { rest.split_at_unchecked(non_ascii_position) };

// True for bytes that cannot be written verbatim: control bytes (< 0x20), DEL and
// anything above `~` (> 0x7E), the backslash, and the double quote.
fn needs_escape(b: u8) -> bool {
b > 0x7E || b < 0x20 || b == b'\\' || b == b'"'
}
// Repeatedly find the next byte to escape inside the current ASCII run.
while let Some(escape_position) = ascii_bytes.iter().position(|&b| needs_escape(b)) {
// extend the pending printable range up to (not including) that byte, then flush it
printable_range.end += escape_position;
f.write_str(&self[printable_range.clone()])?;

// the byte comes from the ASCII-only run, so the cast to `char` keeps its value
let c = ascii_bytes[escape_position] as char;
let esc = c.escape_debug_ext(EscapeDebugExtArgs {
escape_grapheme_extended: true,
escape_single_quote: false,
escape_double_quote: true,
});
Display::fmt(&esc, f)?;
from = i + c.len_utf8();

// continue after the escaped byte, with an empty printable range starting just past it
ascii_bytes = &ascii_bytes[escape_position + 1..];
printable_range = (printable_range.end + 1)..(printable_range.end + 1);
}
// the remainder of the ASCII run needs no escaping: keep it pending instead of writing it now
printable_range.end += ascii_bytes.len();

// then, handle a unicode-only prefix
// the non-ASCII run extends up to the next byte < 0x80 (or to the end of the string)
let ascii_position = rest.iter().position(|&b| b < 0x80).unwrap_or(rest.len());
// SAFETY: the position was derived from an iterator, so is known to be within bounds, and at a char boundary
(unicode_bytes, rest) = unsafe { rest.split_at_unchecked(ascii_position) };
// SAFETY: prefix is a valid utf8 sequence, and at a char boundary
let unicode_prefix = unsafe { crate::str::from_utf8_unchecked(unicode_bytes) };

for c in unicode_prefix.chars() {
// SAFETY: we know that our slice only contains unicode chars
unsafe { crate::hint::assert_unchecked(c as u32 >= 0x80) };
let esc = c.escape_debug_ext(EscapeDebugExtArgs {
escape_grapheme_extended: true,
escape_single_quote: false,
escape_double_quote: true,
});
// `esc.len() != 1` is treated as "this char needs escaping": flush the pending
// printable range, write the escape, and restart the range after this char.
if esc.len() != 1 {
f.write_str(&self[printable_range.clone()])?;
Display::fmt(&esc, f)?;
printable_range.start = printable_range.end + c.len_utf8();
}
// advance past this char; after an escape this leaves the range empty (start == end)
printable_range.end += c.len_utf8();
}
}
f.write_str(&self[from..])?;

// flush whatever printable text is still pending at the end of the string
f.write_str(&self[printable_range])?;

f.write_char('"')
}
}
Expand Down

0 comments on commit 1b43527

Please sign in to comment.