diff --git a/README.md b/README.md index 74c88396..e4a2448a 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ Lexical is highly customizable, and contains numerous other optional features: - **f16**:   Add support for numeric conversions to-and-from 16-bit floats.
Adds f16, a half-precision IEEE-754 floating-point type, and bf16, the Brain Float 16 type, and numeric conversions to-and-from these floats. Note that since these are storage formats, and therefore do not have native arithmetic operations, all conversions are done using an intermediate f32.
-To ensure the safety when bounds checking is disabled, we extensively fuzz the all numeric conversion routines. See the [Safety](#safety) section below for more information. +To ensure memory safety, we extensively fuzz the all numeric conversion routines. See the [Safety](#safety) section below for more information. Lexical also places a heavy focus on code bloat: with algorithms both optimized for performance and size. By default, this focuses on performance, however, using the `compact` feature, you can also opt-in to reduced code size at the cost of performance. The compact algorithms minimize the use of pre-computed tables and other optimizations at a major cost to performance. @@ -304,9 +304,7 @@ A benchmark on writing floats generated via a random-number generator and parsed ## Safety -Due to the use of memory unsafe code in the integer and float writers, we extensively fuzz our float writers and parsers. The fuzz harnesses may be found under [fuzz](https://github.com/Alexhuszagh/rust-lexical/tree/main/fuzz), and are run continuously. So far, we've parsed and written over 72 billion floats. - -Due to the simple logic of the integer writers, and the lack of memory safety in the integer parsers, we minimally fuzz both, and test it with edge-cases, which has shown no memory safety issues to date. +Due to the use of memory unsafe code in the library, we extensively fuzz our float writers and parsers. The fuzz harnesses may be found under [fuzz](https://github.com/Alexhuszagh/rust-lexical/tree/main/fuzz), and are run continuously. So far, we've parsed and written over 72 billion floats. ## Platform Support diff --git a/lexical-parse-float/src/parse.rs b/lexical-parse-float/src/parse.rs index 21dc309d..cc704c03 100644 --- a/lexical-parse-float/src/parse.rs +++ b/lexical-parse-float/src/parse.rs @@ -485,7 +485,7 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( let decimal_point = options.decimal_point(); let exponent_character = options.exponent(); debug_assert!(format.is_valid()); - debug_assert!(!byte.is_done()); + debug_assert!(!byte.is_buffer_empty()); let bits_per_digit = shared::log2(format.mantissa_radix()) as i64; let bits_per_base = shared::log2(format.exponent_base()) as i64; @@ -503,7 +503,7 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( // We must have a format like `0x`, `0d`, `0o`. Note: is_prefix = true; if iter.read_if_value(base_prefix, format.case_sensitive_base_prefix()).is_some() - && iter.is_done() + && iter.is_buffer_empty() { return Err(Error::Empty(iter.cursor())); } @@ -553,7 +553,6 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( let mut implicit_exponent: i64; let int_end = n_digits as i64; let mut fraction_digits = None; - // TODO: Change this to something different from read_if_value but same idea if byte.first_is_cased(decimal_point) { // SAFETY: byte cannot be empty due to first_is unsafe { byte.step_unchecked() }; @@ -596,23 +595,23 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( let is_exponent = byte .first_is(exponent_character, format.case_sensitive_exponent() && cfg!(feature = "format")); if is_exponent { + // SAFETY: byte cannot be empty due to `first_is` from `is_exponent`.` + unsafe { byte.step_unchecked() }; + // Check float format syntax checks. #[cfg(feature = "format")] { + // NOTE: We've overstepped for the safety invariant before. if format.no_exponent_notation() { - return Err(Error::InvalidExponent(byte.cursor())); + return Err(Error::InvalidExponent(byte.cursor() - 1)); } // Check if we have no fraction but we required exponent notation. if format.no_exponent_without_fraction() && fraction_digits.is_none() { - return Err(Error::ExponentWithoutFraction(byte.cursor())); + return Err(Error::ExponentWithoutFraction(byte.cursor() - 1)); } } - // SAFETY: byte cannot be empty due to `first_is` from `is_exponent`.` - // TODO: Fix: we need a read_if for bytes themselves? - unsafe { byte.step_unchecked() }; let is_negative = parse_exponent_sign(&mut byte)?; - let before = byte.current_count(); parse_digits::<_, _, FORMAT>(byte.exponent_iter(), |digit| { if explicit_exponent < 0x10000000 { @@ -679,7 +678,6 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( n_digits = n_digits.saturating_sub(1); } if zeros.first_is_cased(decimal_point) { - // TODO: Fix with some read_if like logic // SAFETY: safe since zeros cannot be empty due to first_is unsafe { zeros.step_unchecked() }; } diff --git a/lexical-parse-float/src/slow.rs b/lexical-parse-float/src/slow.rs index 2822b309..f614afa8 100644 --- a/lexical-parse-float/src/slow.rs +++ b/lexical-parse-float/src/slow.rs @@ -685,7 +685,7 @@ pub fn compare_bytes( let mut integer = number.integer.bytes::<{ FORMAT }>(); let mut integer_iter = integer.integer_iter(); integer_iter.skip_zeros(); - if integer_iter.is_done() { + if integer_iter.is_buffer_empty() { // Cannot be empty, since we must have at least **some** significant digits. let mut fraction = number.fraction.unwrap().bytes::<{ FORMAT }>(); let mut fraction_iter = fraction.fraction_iter(); diff --git a/lexical-parse-integer/src/algorithm.rs b/lexical-parse-integer/src/algorithm.rs index c41943a3..014e62a6 100644 --- a/lexical-parse-integer/src/algorithm.rs +++ b/lexical-parse-integer/src/algorithm.rs @@ -116,19 +116,17 @@ macro_rules! fmt_invalid_digit { // NOTE: If we're using the `take_n` optimization where it can't // be the end, then the iterator cannot be done. So, in that case, // we need to end. - if is_suffix && $is_end && $iter.is_done() { + if is_suffix && $is_end && $iter.is_buffer_empty() { // Break out of the loop, we've finished parsing. break; - } else if is_suffix { + } else if !$iter.is_buffer_empty() { // Haven't finished parsing, so we're going to call // invalid_digit!. Need to ensure we include the // base suffix in that. - // TODO: This isn't localized and needs to be fixed - // SAFETY: safe since the iterator is not empty, as checked - // in `$iter.is_done()` and `$is_end`. If `$is_end` is false, - // then we have more elements in our + // in `$iter.is_buffer_empty()`. Adding in the check hopefully + // will be elided since it's a known constant. unsafe { $iter.step_unchecked() }; } } @@ -580,7 +578,7 @@ macro_rules! algorithm { let is_negative = parse_sign::(&mut byte)?; let mut iter = byte.integer_iter(); - if iter.is_done() { + if iter.is_buffer_empty() { return into_error!(Empty, iter.cursor()); } @@ -603,7 +601,7 @@ macro_rules! algorithm { // We must have a format like `0x`, `0d`, `0o`. Note: if iter.read_if_value(base_prefix, format.case_sensitive_base_prefix()).is_some() { is_prefix = true; - if iter.is_done() { + if iter.is_buffer_empty() { return into_error!(Empty, iter.cursor()); } else { start_index += 1; diff --git a/lexical-util/src/iterator.rs b/lexical-util/src/iterator.rs index d14c5aa5..c70e9116 100644 --- a/lexical-util/src/iterator.rs +++ b/lexical-util/src/iterator.rs @@ -20,16 +20,8 @@ pub use crate::skip::{AsBytes, Bytes}; /// methods for reading data and accessing underlying data. The readers /// can either be contiguous or non-contiguous, although performance and /// some API methods may not be available for both. -/// -/// # Safety -/// -/// // TODO: FIX CORRECTNESS DOCUMENTATION -/// This trait is effectively safe but the implementor must guarantee that -/// `is_empty` is implemented correctly. For most implementations, this can -/// be `self.as_slice().is_empty()`, where `as_slice` is implemented as -/// `&self.bytes[self.index..]`. #[cfg(feature = "parse")] -pub unsafe trait Iter<'a> { +pub trait Iter<'a> { /// Determine if the buffer is contiguous in memory. const IS_CONTIGUOUS: bool; @@ -59,6 +51,17 @@ pub unsafe trait Iter<'a> { self.get_buffer().len() } + /// Get if no bytes are available in the buffer. + /// + /// This operators on the underlying buffer: that is, + /// it returns if [as_slice] would return an empty slice. + /// + /// [as_slice]: Iter::as_slice + #[inline(always)] + fn is_buffer_empty(&self) -> bool { + self.cursor() >= self.get_buffer().len() + } + /// Get the current index of the iterator in the slice. fn cursor(&self) -> usize; @@ -86,42 +89,22 @@ pub unsafe trait Iter<'a> { /// non-contiguous iterators this can be smaller. fn current_count(&self) -> usize; - // TODO: DOCUMENT // PROPERTIES - // TODO: Fix this naming convention - /// Get if no bytes are available in the buffer. - /// - /// This operators on the underlying buffer: that is, - /// it returns if [as_slice] would return an empty slice. - /// - /// [as_slice]: Iter::as_slice - #[inline(always)] - fn is_empty(&self) -> bool { - self.as_slice().is_empty() - } - /// Determine if the buffer is contiguous. #[inline(always)] fn is_contiguous(&self) -> bool { Self::IS_CONTIGUOUS } - // TODO: Ensure we get this **RIGHT** - /// Get the next value available without consuming it. /// /// This does **NOT** skip digits, and directly fetches the item /// from the underlying buffer. - /// - /// # Safety - /// - /// An implementor must implement `is_empty` correctly in - /// order to guarantee the traitt is safe: `is_empty` **MUST** - /// ensure that one value remains, if the iterator is non- - /// contiguous this means advancing the iterator to the next - /// position. - fn first(&self) -> Option<&'a u8>; + #[inline(always)] + fn first(&self) -> Option<&'a u8> { + self.get_buffer().get(self.cursor()) + } /// Check if the next element is a given value. #[inline(always)] @@ -249,14 +232,6 @@ pub unsafe trait Iter<'a> { /// A default implementation is provided for slice iterators. /// This trait **should never** return `null` from `as_ptr`, or be /// implemented for non-contiguous data. -/// -/// # Safety -/// -/// TODO: Fix the safety documentation here... -/// The safe methods are sound as long as the caller ensures that -/// the methods for `read_32`, `read_64`, etc. check the bounds -/// of the underlying contiguous buffer and is only called on -/// contiguous buffers. pub trait DigitsIter<'a>: Iterator + Iter<'a> { // TODO: Fix the documentation /// Get if the iterator cannot return any more elements. @@ -278,10 +253,9 @@ pub trait DigitsIter<'a>: Iterator + Iter<'a> { /// ensure [peek] is always safe. /// /// If you would like to see if the cursor is at the end of the buffer, - /// see [is_done] or [is_empty] instead. + /// see [is_buffer_empty] instead. /// - /// [is_done]: DigitsIter::is_done - /// [is_empty]: Iter::is_empty + /// [is_buffer_empty]: Iter::is_buffer_empty /// [peek]: DigitsIter::peek #[inline(always)] #[allow(clippy::wrong_self_convention)] @@ -289,17 +263,6 @@ pub trait DigitsIter<'a>: Iterator + Iter<'a> { self.peek().is_none() } - /// Get if the buffer underlying the iterator is empty. - /// - /// This might not be the same thing as [is_consumed]: [is_consumed] - /// checks if any more elements may be returned, which may require - /// peeking the next value. Consumed merely checks if the - /// iterator has an empty slice. It is effectively a cheaper, - /// but weaker variant of [is_consumed]. - /// - /// [is_consumed]: DigitsIter::is_consumed - fn is_done(&self) -> bool; - /// Peek the next value of the iterator, without consuming it. /// /// Note that this can modify the internal state, by skipping digits diff --git a/lexical-util/src/lib.rs b/lexical-util/src/lib.rs index e3eab8f5..7533e017 100644 --- a/lexical-util/src/lib.rs +++ b/lexical-util/src/lib.rs @@ -55,7 +55,7 @@ //! correctly. //! //! To see if the cursor is at the end of the buffer, use -//! [DigitsIter::is_done][is_done]. +//! [is_buffer_empty][crate::iterator::Iter::is_buffer_empty]. //! //! Any iterators must be peekable: you must be able to read and return the next //! value without advancing the iterator past that point. For iterators that @@ -66,7 +66,7 @@ //! like: //! //! ```rust,ignore -//! unsafe impl<_> DigitsIter<_> for MyIter { +//! impl<_> DigitsIter<_> for MyIter { //! fn peek(&mut self) -> Option { //! loop { //! let value = self.bytes.get(self.index)?; @@ -95,7 +95,7 @@ //! } //! ``` //! -//! [is_done]: iterator::DigitsIter::is_done +//! [is_buffer_empty]: iterator::Iter::is_buffer_empty //! [is_consumed]: iterator::DigitsIter::is_consumed //! [peek]: iterator::DigitsIter::peek diff --git a/lexical-util/src/noskip.rs b/lexical-util/src/noskip.rs index 7fdbb074..2af26191 100644 --- a/lexical-util/src/noskip.rs +++ b/lexical-util/src/noskip.rs @@ -49,10 +49,10 @@ impl<'a, const __: u128> Bytes<'a, __> { } } - // TODO: Move to `Iter` as a trait along with `new` as well` /// Initialize the slice from raw parts. /// /// # Safety + /// /// This is safe if and only if the index is <= slc.len(). /// For this reason, since it's easy to get wrong, we only /// expose it to `DigitsIterator` and nothing else. @@ -67,13 +67,6 @@ impl<'a, const __: u128> Bytes<'a, __> { } } - /// Get if the buffer underlying the iterator is empty. - /// Same as `is_consumed`. - #[inline(always)] - pub fn is_done(&self) -> bool { - self.index >= self.slc.len() - } - /// Get iterator over integer digits. #[inline(always)] pub fn integer_iter<'b>(&'b mut self) -> DigitsIterator<'a, 'b, __> { @@ -107,7 +100,7 @@ impl<'a, const __: u128> Bytes<'a, __> { } } -unsafe impl<'a, const __: u128> Iter<'a> for Bytes<'a, __> { +impl<'a, const __: u128> Iter<'a> for Bytes<'a, __> { const IS_CONTIGUOUS: bool = true; #[inline(always)] @@ -138,12 +131,6 @@ unsafe impl<'a, const __: u128> Iter<'a> for Bytes<'a, __> { self.index } - // TODO: Rename - #[inline(always)] - fn is_empty(&self) -> bool { - self.as_slice().is_empty() - } - #[inline(always)] fn first(&self) -> Option<&'a u8> { self.slc.get(self.index) @@ -212,7 +199,7 @@ impl<'a: 'b, 'b, const __: u128> DigitsIterator<'a, 'b, __> { } } -unsafe impl<'a: 'b, 'b, const __: u128> Iter<'a> for DigitsIterator<'a, 'b, __> { +impl<'a: 'b, 'b, const __: u128> Iter<'a> for DigitsIterator<'a, 'b, __> { const IS_CONTIGUOUS: bool = Bytes::<'a, __>::IS_CONTIGUOUS; #[inline(always)] @@ -237,16 +224,6 @@ unsafe impl<'a: 'b, 'b, const __: u128> Iter<'a> for DigitsIterator<'a, 'b, __> self.byte.current_count() } - #[inline(always)] - fn is_empty(&self) -> bool { - self.byte.is_done() - } - - #[inline(always)] - fn first(&self) -> Option<&'a u8> { - self.byte.first() - } - #[inline(always)] unsafe fn step_by_unchecked(&mut self, count: usize) { debug_assert!(self.as_slice().len() >= count); @@ -265,12 +242,7 @@ unsafe impl<'a: 'b, 'b, const __: u128> Iter<'a> for DigitsIterator<'a, 'b, __> impl<'a: 'b, 'b, const FORMAT: u128> DigitsIter<'a> for DigitsIterator<'a, 'b, FORMAT> { #[inline(always)] fn is_consumed(&mut self) -> bool { - Self::is_done(self) - } - - #[inline(always)] - fn is_done(&self) -> bool { - self.byte.is_done() + self.is_buffer_empty() } #[inline(always)] diff --git a/lexical-util/src/skip.rs b/lexical-util/src/skip.rs index 2b45dafe..3aa62776 100644 --- a/lexical-util/src/skip.rs +++ b/lexical-util/src/skip.rs @@ -383,33 +383,6 @@ impl<'a, const FORMAT: u128> Bytes<'a, FORMAT> { } } - // TODO: Move this to our iter trait - - /// Get if the buffer underlying the iterator is empty. - /// - /// This might not be the same thing as `is_consumed`: `is_consumed` - /// checks if any more elements may be returned, which may require - /// peeking the next value. Consumed merely checks if the - /// iterator has an empty slice. It is effectively a cheaper, - /// but weaker variant of `is_consumed()`. - #[inline(always)] - pub fn is_done(&self) -> bool { - self.index >= self.slc.len() - } - - /// Check if the next element is a given value. - // TODO: Remove the peek methods, these shouldn't be on `bytes`. - #[inline(always)] - pub fn peek_is_cased(&mut self, value: u8) -> bool { - // Don't assert not a digit separator, since this can occur when - // a different component does not allow digit separators there. - if let Some(&c) = self.first() { - c == value - } else { - false - } - } - /// Get iterator over integer digits. #[inline(always)] pub fn integer_iter<'b>(&'b mut self) -> IntegerDigitsIterator<'a, 'b, FORMAT> { @@ -443,7 +416,7 @@ impl<'a, const FORMAT: u128> Bytes<'a, FORMAT> { } } -unsafe impl<'a, const FORMAT: u128> Iter<'a> for Bytes<'a, FORMAT> { +impl<'a, const FORMAT: u128> Iter<'a> for Bytes<'a, FORMAT> { /// If each yielded value is adjacent in memory. const IS_CONTIGUOUS: bool = NumberFormat::<{ FORMAT }>::DIGIT_SEPARATOR == 0; @@ -481,17 +454,6 @@ unsafe impl<'a, const FORMAT: u128> Iter<'a> for Bytes<'a, FORMAT> { } } - // TODO: Rename - #[inline(always)] - fn is_empty(&self) -> bool { - self.index >= self.slc.len() - } - - #[inline(always)] - fn first(&self) -> Option<&'a u8> { - self.slc.get(self.index) - } - #[inline(always)] unsafe fn step_by_unchecked(&mut self, count: usize) { if Self::IS_CONTIGUOUS { @@ -655,16 +617,6 @@ macro_rules! skip_iterator_iter_base { self.byte.current_count() } - #[inline(always)] - fn is_empty(&self) -> bool { - self.byte.is_done() - } - - #[inline(always)] - fn first(&self) -> Option<&'a u8> { - self.byte.first() - } - #[inline(always)] unsafe fn step_by_unchecked(&mut self, count: usize) { debug_assert!(self.as_slice().len() >= count); @@ -688,18 +640,13 @@ macro_rules! skip_iterator_digits_iter_base { fn is_consumed(&mut self) -> bool { self.peek().is_none() } - - #[inline(always)] - fn is_done(&self) -> bool { - self.byte.is_done() - } }; } /// Create impl ByteIter block for skip iterator. macro_rules! skip_iterator_bytesiter_impl { ($iterator:ident, $mask:ident, $i:ident, $l:ident, $t:ident, $c:ident) => { - unsafe impl<'a: 'b, 'b, const FORMAT: u128> Iter<'a> for $iterator<'a, 'b, FORMAT> { + impl<'a: 'b, 'b, const FORMAT: u128> Iter<'a> for $iterator<'a, 'b, FORMAT> { skip_iterator_iter_base!(FORMAT, $mask); } @@ -816,7 +763,7 @@ impl<'a: 'b, 'b, const FORMAT: u128> SpecialDigitsIterator<'a, 'b, FORMAT> { is_digit_separator!(FORMAT); } -unsafe impl<'a: 'b, 'b, const FORMAT: u128> Iter<'a> for SpecialDigitsIterator<'a, 'b, FORMAT> { +impl<'a: 'b, 'b, const FORMAT: u128> Iter<'a> for SpecialDigitsIterator<'a, 'b, FORMAT> { skip_iterator_iter_base!(FORMAT, SPECIAL_DIGIT_SEPARATOR); } diff --git a/lexical-util/tests/iterator_tests.rs b/lexical-util/tests/iterator_tests.rs index 44ca4a61..1882ea6b 100644 --- a/lexical-util/tests/iterator_tests.rs +++ b/lexical-util/tests/iterator_tests.rs @@ -20,7 +20,7 @@ fn digits_iterator_test() { assert_eq!(iter.as_slice(), &digits[..]); assert_eq!(iter.as_ptr(), digits.as_ptr()); assert_eq!(iter.is_consumed(), false); - assert_eq!(iter.is_done(), false); + assert_eq!(iter.is_buffer_empty(), false); assert_eq!(u32::from_le(iter.peek_u32().unwrap()), 0x34333231); assert_eq!(iter.buffer_length(), 5); assert_eq!(iter.cursor(), 0); @@ -83,7 +83,7 @@ fn skip_iterator_test() { assert_eq!(iter.as_slice(), &digits[..]); assert_eq!(iter.as_ptr(), digits.as_ptr()); assert_eq!(iter.is_consumed(), false); - assert_eq!(iter.is_done(), false); + assert_eq!(iter.is_buffer_empty(), false); assert_eq!(iter.buffer_length(), 6); assert_eq!(iter.cursor(), 0); assert_eq!(iter.current_count(), 0); diff --git a/lexical-write-float/src/write.rs b/lexical-write-float/src/write.rs index 1cc63f6d..3a680ee5 100644 --- a/lexical-write-float/src/write.rs +++ b/lexical-write-float/src/write.rs @@ -70,19 +70,10 @@ where /// Write float trait. pub trait WriteFloat: RawFloat + FormattedSize { - /// Forward write integer parameters to an unoptimized backend. + /// Forward float writing parameters and write the float. /// - /// # Safety - /// - /// # TODO: This is now safe effectively - /// Safe as long as the buffer can hold [`FORMATTED_SIZE`] elements - /// (or [`FORMATTED_SIZE_DECIMAL`] for decimal). If using custom digit - /// precision control (such as specifying a minimum number of significant - /// digits), or disabling scientific notation, then more digits may be - /// required (up to `1075` for the leading or trailing zeros, `1` for - /// the sign and `1` for the decimal point). So, - /// `1077 + min_significant_digits.max(52)`, so ~1200 for a reasonable - /// threshold. + /// This abstracts away handling different optimizations and radices into + /// a single API. /// /// # Panics /// diff --git a/lexical/src/lib.rs b/lexical/src/lib.rs index c852d743..fc4913d1 100644 --- a/lexical/src/lib.rs +++ b/lexical/src/lib.rs @@ -350,8 +350,6 @@ pub use lexical_core::{ToLexical, ToLexicalWithOptions}; #[inline] #[cfg(feature = "write")] pub fn to_string(n: N) -> String { - // TODO: Change to use our `MaybeUnint` API and a `with_capacity` to - // avoid the overhead of the calloc here. let mut buf = vec![0u8; N::FORMATTED_SIZE_DECIMAL]; let len = lexical_core::write(n, buf.as_mut_slice()).len();