From e1c927bc5875e12332d10fd079d18e2dd45bb5ee Mon Sep 17 00:00:00 2001 From: David Wood Date: Tue, 16 Jan 2018 20:41:00 +0000 Subject: [PATCH] Replaced multi-byte character handling in end_point with potentially more performant variant. --- src/libsyntax/codemap.rs | 82 +++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index e74066da0ac83..76050f8dc09f0 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -610,38 +610,74 @@ impl CodeMap { /// Returns a new span representing just the end-point of this span pub fn end_point(&self, sp: Span) -> Span { - let hi = sp.hi().0.checked_sub(1).unwrap_or(sp.hi().0); - let hi = self.get_start_of_char_bytepos(BytePos(hi)); - let lo = cmp::max(hi.0, sp.lo().0); - sp.with_lo(BytePos(lo)) + let pos = sp.hi().0; + + let width = self.find_width_of_character_at_span(sp, false); + let corrected_end_position = pos.checked_sub(width).unwrap_or(pos); + + let end_point = BytePos(cmp::max(corrected_end_position, sp.lo().0)); + sp.with_lo(end_point) } /// Returns a new span representing the next character after the end-point of this span pub fn next_point(&self, sp: Span) -> Span { - let hi = sp.lo().0.checked_add(1).unwrap_or(sp.lo().0); - let hi = self.get_start_of_char_bytepos(BytePos(hi)); - let lo = cmp::max(sp.hi().0, hi.0); - Span::new(BytePos(lo), BytePos(lo), sp.ctxt()) + let pos = sp.lo().0; + + let width = self.find_width_of_character_at_span(sp, true); + let corrected_next_position = pos.checked_add(width).unwrap_or(pos); + + let next_point = BytePos(cmp::max(sp.hi().0, corrected_next_position)); + Span::new(next_point, next_point, sp.ctxt()) } - fn get_start_of_char_bytepos(&self, bpos: BytePos) -> BytePos { - let idx = self.lookup_filemap_idx(bpos); - let files = self.files.borrow(); - let map = &(*files)[idx]; + /// Finds the width of a character, either before or after the provided span. + fn find_width_of_character_at_span(&self, sp: Span, forwards: bool) -> u32 { + // Disregard malformed spans and assume a one-byte wide character. + if sp.lo() > sp.hi() { + return 1; + } - for mbc in map.multibyte_chars.borrow().iter() { - if mbc.pos < bpos { - if bpos.to_usize() >= mbc.pos.to_usize() + mbc.bytes { - // If we do, then return the start of the character. - return mbc.pos; - } - } else { - break; - } + let local_begin = self.lookup_byte_offset(sp.lo()); + let local_end = self.lookup_byte_offset(sp.hi()); + + let start_index = local_begin.pos.to_usize(); + let end_index = local_end.pos.to_usize(); + + // Disregard indexes that are at the start or end of their spans, they can't fit bigger + // characters. + if (!forwards && end_index == usize::min_value()) || + (forwards && start_index == usize::max_value()) { + return 1; + } + + let source_len = (local_begin.fm.end_pos - local_begin.fm.start_pos).to_usize(); + // Ensure indexes are also not malformed. + if start_index > end_index || end_index > source_len { + return 1; } - // If this isn't a multibyte character, return the original position. - return bpos; + // We need to extend the snippet to the end of the src rather than to end_index so when + // searching forwards for boundaries we've got somewhere to search. + let snippet = if let Some(ref src) = local_begin.fm.src { + let len = src.len(); + (&src[start_index..len]).to_string() + } else if let Some(src) = local_begin.fm.external_src.borrow().get_source() { + let len = src.len(); + (&src[start_index..len]).to_string() + } else { + return 1; + }; + + let mut target = if forwards { end_index + 1 } else { end_index - 1 }; + while !snippet.is_char_boundary(target - start_index) { + target = if forwards { target + 1 } else { target - 1 }; + } + + if forwards { + (target - end_index) as u32 + } else { + (end_index - target) as u32 + } } pub fn get_filemap(&self, filename: &FileName) -> Option> {