diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index e4dd7ea80a2b..df07e1fd3be6 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -1374,6 +1374,305 @@ template auto to_decimal(T x) noexcept -> decimal_fp { return ret_value; } } // namespace dragonbox + +template +FMT_CONSTEXPR20 auto format_float(Float value, int precision, + const format_specs& specs, bool binary32, + buffer& buf) -> int { + // float is passed as double to reduce the number of instantiations. + static_assert(!std::is_same::value, ""); + auto converted_value = convert_float(value); + + const bool fixed = specs.type() == presentation_type::fixed; + if (value == 0) { + if (precision <= 0 || !fixed) { + buf.push_back('0'); + return 0; + } + buf.try_resize(to_unsigned(precision)); + fill_n(buf.data(), precision, '0'); + return -precision; + } + + int exp = 0; + bool use_dragon = true; + unsigned dragon_flags = 0; + if (!is_fast_float() || is_constant_evaluated()) { + const auto inv_log2_10 = 0.3010299956639812; // 1 / log2(10) + using info = dragonbox::float_info; + const auto f = basic_fp(converted_value); + // Compute exp, an approximate power of 10, such that + // 10^(exp - 1) <= value < 10^exp or 10^exp <= value < 10^(exp + 1). + // This is based on log10(value) == log2(value) / log2(10) and approximation + // of log2(value) by e + num_fraction_bits idea from double-conversion. + auto e = (f.e + count_digits<1>(f.f) - 1) * inv_log2_10 - 1e-10; + exp = static_cast(e); + if (e > exp) ++exp; // Compute ceil. + dragon_flags = dragon::fixup; + } else { + // Extract significand bits and exponent bits. + using info = dragonbox::float_info; + auto br = bit_cast(static_cast(value)); + + const uint64_t significand_mask = + (static_cast(1) << num_significand_bits()) - 1; + uint64_t significand = (br & significand_mask); + int exponent = static_cast((br & exponent_mask()) >> + num_significand_bits()); + + if (exponent != 0) { // Check if normal. + exponent -= exponent_bias() + num_significand_bits(); + significand |= + (static_cast(1) << num_significand_bits()); + significand <<= 1; + } else { + // Normalize subnormal inputs. + FMT_ASSERT(significand != 0, "zeros should not appear here"); + int shift = countl_zero(significand); + FMT_ASSERT(shift >= num_bits() - num_significand_bits(), + ""); + shift -= (num_bits() - num_significand_bits() - 2); + exponent = (std::numeric_limits::min_exponent - + num_significand_bits()) - + shift; + significand <<= shift; + } + + // Compute the first several nonzero decimal significand digits. + // We call the number we get the first segment. + const int k = info::kappa - dragonbox::floor_log10_pow2(exponent); + exp = -k; + const int beta = exponent + dragonbox::floor_log2_pow10(k); + uint64_t first_segment; + bool has_more_segments; + int digits_in_the_first_segment; + { + const auto r = dragonbox::umul192_upper128( + significand << beta, dragonbox::get_cached_power(k)); + first_segment = r.high(); + has_more_segments = r.low() != 0; + + // The first segment can have 18 ~ 19 digits. + if (first_segment >= 1000000000000000000ULL) { + digits_in_the_first_segment = 19; + } else { + // When it is of 18-digits, we align it to 19-digits by adding a bogus + // zero at the end. + digits_in_the_first_segment = 18; + first_segment *= 10; + } + } + + // Compute the actual number of decimal digits to print. + if (fixed) adjust_precision(precision, exp + digits_in_the_first_segment); + + // Use Dragon4 only when there might be not enough digits in the first + // segment. + if (digits_in_the_first_segment > precision) { + use_dragon = false; + + if (precision <= 0) { + exp += digits_in_the_first_segment; + + if (precision < 0) { + // Nothing to do, since all we have are just leading zeros. + buf.try_resize(0); + } else { + // We may need to round-up. + buf.try_resize(1); + if ((first_segment | static_cast(has_more_segments)) > + 5000000000000000000ULL) { + buf[0] = '1'; + } else { + buf[0] = '0'; + } + } + } // precision <= 0 + else { + exp += digits_in_the_first_segment - precision; + + // When precision > 0, we divide the first segment into three + // subsegments, each with 9, 9, and 0 ~ 1 digits so that each fits + // in 32-bits which usually allows faster calculation than in + // 64-bits. Since some compiler (e.g. MSVC) doesn't know how to optimize + // division-by-constant for large 64-bit divisors, we do it here + // manually. The magic number 7922816251426433760 below is equal to + // ceil(2^(64+32) / 10^10). + const uint32_t first_subsegment = static_cast( + dragonbox::umul128_upper64(first_segment, 7922816251426433760ULL) >> + 32); + const uint64_t second_third_subsegments = + first_segment - first_subsegment * 10000000000ULL; + + uint64_t prod; + uint32_t digits; + bool should_round_up; + int number_of_digits_to_print = precision > 9 ? 9 : precision; + + // Print a 9-digits subsegment, either the first or the second. + auto print_subsegment = [&](uint32_t subsegment, char* buffer) { + int number_of_digits_printed = 0; + + // If we want to print an odd number of digits from the subsegment, + if ((number_of_digits_to_print & 1) != 0) { + // Convert to 64-bit fixed-point fractional form with 1-digit + // integer part. The magic number 720575941 is a good enough + // approximation of 2^(32 + 24) / 10^8; see + // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case + // for details. + prod = ((subsegment * static_cast(720575941)) >> 24) + 1; + digits = static_cast(prod >> 32); + *buffer = static_cast('0' + digits); + number_of_digits_printed++; + } + // If we want to print an even number of digits from the + // first_subsegment, + else { + // Convert to 64-bit fixed-point fractional form with 2-digits + // integer part. The magic number 450359963 is a good enough + // approximation of 2^(32 + 20) / 10^7; see + // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case + // for details. + prod = ((subsegment * static_cast(450359963)) >> 20) + 1; + digits = static_cast(prod >> 32); + write2digits(buffer, digits); + number_of_digits_printed += 2; + } + + // Print all digit pairs. + while (number_of_digits_printed < number_of_digits_to_print) { + prod = static_cast(prod) * static_cast(100); + digits = static_cast(prod >> 32); + write2digits(buffer + number_of_digits_printed, digits); + number_of_digits_printed += 2; + } + }; + + // Print first subsegment. + print_subsegment(first_subsegment, buf.data()); + + // Perform rounding if the first subsegment is the last subsegment to + // print. + if (precision <= 9) { + // Rounding inside the subsegment. + // We round-up if: + // - either the fractional part is strictly larger than 1/2, or + // - the fractional part is exactly 1/2 and the last digit is odd. + // We rely on the following observations: + // - If fractional_part >= threshold, then the fractional part is + // strictly larger than 1/2. + // - If the MSB of fractional_part is set, then the fractional part + // must be at least 1/2. + // - When the MSB of fractional_part is set, either + // second_third_subsegments being nonzero or has_more_segments + // being true means there are further digits not printed, so the + // fractional part is strictly larger than 1/2. + if (precision < 9) { + uint32_t fractional_part = static_cast(prod); + should_round_up = + fractional_part >= fractional_part_rounding_thresholds( + 8 - number_of_digits_to_print) || + ((fractional_part >> 31) & + ((digits & 1) | (second_third_subsegments != 0) | + has_more_segments)) != 0; + } + // Rounding at the subsegment boundary. + // In this case, the fractional part is at least 1/2 if and only if + // second_third_subsegments >= 5000000000ULL, and is strictly larger + // than 1/2 if we further have either second_third_subsegments > + // 5000000000ULL or has_more_segments == true. + else { + should_round_up = second_third_subsegments > 5000000000ULL || + (second_third_subsegments == 5000000000ULL && + ((digits & 1) != 0 || has_more_segments)); + } + } + // Otherwise, print the second subsegment. + else { + // Compilers are not aware of how to leverage the maximum value of + // second_third_subsegments to find out a better magic number which + // allows us to eliminate an additional shift. 1844674407370955162 = + // ceil(2^64/10) < ceil(2^64*(10^9/(10^10 - 1))). + const uint32_t second_subsegment = + static_cast(dragonbox::umul128_upper64( + second_third_subsegments, 1844674407370955162ULL)); + const uint32_t third_subsegment = + static_cast(second_third_subsegments) - + second_subsegment * 10; + + number_of_digits_to_print = precision - 9; + print_subsegment(second_subsegment, buf.data() + 9); + + // Rounding inside the subsegment. + if (precision < 18) { + // The condition third_subsegment != 0 implies that the segment was + // of 19 digits, so in this case the third segment should be + // consisting of a genuine digit from the input. + uint32_t fractional_part = static_cast(prod); + should_round_up = + fractional_part >= fractional_part_rounding_thresholds( + 8 - number_of_digits_to_print) || + ((fractional_part >> 31) & + ((digits & 1) | (third_subsegment != 0) | + has_more_segments)) != 0; + } + // Rounding at the subsegment boundary. + else { + // In this case, the segment must be of 19 digits, thus + // the third subsegment should be consisting of a genuine digit from + // the input. + should_round_up = third_subsegment > 5 || + (third_subsegment == 5 && + ((digits & 1) != 0 || has_more_segments)); + } + } + + // Round-up if necessary. + if (should_round_up) { + ++buf[precision - 1]; + for (int i = precision - 1; i > 0 && buf[i] > '9'; --i) { + buf[i] = '0'; + ++buf[i - 1]; + } + if (buf[0] > '9') { + buf[0] = '1'; + if (fixed) + buf[precision++] = '0'; + else + ++exp; + } + } + buf.try_resize(to_unsigned(precision)); + } + } // if (digits_in_the_first_segment > precision) + else { + // Adjust the exponent for its use in Dragon4. + exp += digits_in_the_first_segment - 1; + } + } + if (use_dragon) { + auto f = basic_fp(); + bool is_predecessor_closer = binary32 ? f.assign(static_cast(value)) + : f.assign(converted_value); + if (is_predecessor_closer) dragon_flags |= dragon::predecessor_closer; + if (fixed) dragon_flags |= dragon::fixed; + // Limit precision to the maximum possible number of significant digits in + // an IEEE754 double because we don't need to generate zeros. + const int max_double_digits = 767; + if (precision > max_double_digits) precision = max_double_digits; + format_dragon(f, dragon_flags, precision, buf, exp); + } + if (!fixed && !specs.alt()) { + // Remove trailing zeros. + auto num_digits = buf.size(); + while (num_digits > 0 && buf[num_digits - 1] == '0') { + --num_digits; + ++exp; + } + buf.try_resize(num_digits); + } + return exp; +} } // namespace detail template <> struct formatter { diff --git a/include/fmt/format.h b/include/fmt/format.h index 45cab4cd8677..c22a4b8db780 100644 --- a/include/fmt/format.h +++ b/include/fmt/format.h @@ -3159,301 +3159,7 @@ constexpr auto fractional_part_rounding_thresholds(int index) -> uint32_t { template FMT_CONSTEXPR20 auto format_float(Float value, int precision, const format_specs& specs, bool binary32, - buffer& buf) -> int { - // float is passed as double to reduce the number of instantiations. - static_assert(!std::is_same::value, ""); - auto converted_value = convert_float(value); - - const bool fixed = specs.type() == presentation_type::fixed; - if (value == 0) { - if (precision <= 0 || !fixed) { - buf.push_back('0'); - return 0; - } - buf.try_resize(to_unsigned(precision)); - fill_n(buf.data(), precision, '0'); - return -precision; - } - - int exp = 0; - bool use_dragon = true; - unsigned dragon_flags = 0; - if (!is_fast_float() || is_constant_evaluated()) { - const auto inv_log2_10 = 0.3010299956639812; // 1 / log2(10) - using info = dragonbox::float_info; - const auto f = basic_fp(converted_value); - // Compute exp, an approximate power of 10, such that - // 10^(exp - 1) <= value < 10^exp or 10^exp <= value < 10^(exp + 1). - // This is based on log10(value) == log2(value) / log2(10) and approximation - // of log2(value) by e + num_fraction_bits idea from double-conversion. - auto e = (f.e + count_digits<1>(f.f) - 1) * inv_log2_10 - 1e-10; - exp = static_cast(e); - if (e > exp) ++exp; // Compute ceil. - dragon_flags = dragon::fixup; - } else { - // Extract significand bits and exponent bits. - using info = dragonbox::float_info; - auto br = bit_cast(static_cast(value)); - - const uint64_t significand_mask = - (static_cast(1) << num_significand_bits()) - 1; - uint64_t significand = (br & significand_mask); - int exponent = static_cast((br & exponent_mask()) >> - num_significand_bits()); - - if (exponent != 0) { // Check if normal. - exponent -= exponent_bias() + num_significand_bits(); - significand |= - (static_cast(1) << num_significand_bits()); - significand <<= 1; - } else { - // Normalize subnormal inputs. - FMT_ASSERT(significand != 0, "zeros should not appear here"); - int shift = countl_zero(significand); - FMT_ASSERT(shift >= num_bits() - num_significand_bits(), - ""); - shift -= (num_bits() - num_significand_bits() - 2); - exponent = (std::numeric_limits::min_exponent - - num_significand_bits()) - - shift; - significand <<= shift; - } - - // Compute the first several nonzero decimal significand digits. - // We call the number we get the first segment. - const int k = info::kappa - dragonbox::floor_log10_pow2(exponent); - exp = -k; - const int beta = exponent + dragonbox::floor_log2_pow10(k); - uint64_t first_segment; - bool has_more_segments; - int digits_in_the_first_segment; - { - const auto r = dragonbox::umul192_upper128( - significand << beta, dragonbox::get_cached_power(k)); - first_segment = r.high(); - has_more_segments = r.low() != 0; - - // The first segment can have 18 ~ 19 digits. - if (first_segment >= 1000000000000000000ULL) { - digits_in_the_first_segment = 19; - } else { - // When it is of 18-digits, we align it to 19-digits by adding a bogus - // zero at the end. - digits_in_the_first_segment = 18; - first_segment *= 10; - } - } - - // Compute the actual number of decimal digits to print. - if (fixed) adjust_precision(precision, exp + digits_in_the_first_segment); - - // Use Dragon4 only when there might be not enough digits in the first - // segment. - if (digits_in_the_first_segment > precision) { - use_dragon = false; - - if (precision <= 0) { - exp += digits_in_the_first_segment; - - if (precision < 0) { - // Nothing to do, since all we have are just leading zeros. - buf.try_resize(0); - } else { - // We may need to round-up. - buf.try_resize(1); - if ((first_segment | static_cast(has_more_segments)) > - 5000000000000000000ULL) { - buf[0] = '1'; - } else { - buf[0] = '0'; - } - } - } // precision <= 0 - else { - exp += digits_in_the_first_segment - precision; - - // When precision > 0, we divide the first segment into three - // subsegments, each with 9, 9, and 0 ~ 1 digits so that each fits - // in 32-bits which usually allows faster calculation than in - // 64-bits. Since some compiler (e.g. MSVC) doesn't know how to optimize - // division-by-constant for large 64-bit divisors, we do it here - // manually. The magic number 7922816251426433760 below is equal to - // ceil(2^(64+32) / 10^10). - const uint32_t first_subsegment = static_cast( - dragonbox::umul128_upper64(first_segment, 7922816251426433760ULL) >> - 32); - const uint64_t second_third_subsegments = - first_segment - first_subsegment * 10000000000ULL; - - uint64_t prod; - uint32_t digits; - bool should_round_up; - int number_of_digits_to_print = precision > 9 ? 9 : precision; - - // Print a 9-digits subsegment, either the first or the second. - auto print_subsegment = [&](uint32_t subsegment, char* buffer) { - int number_of_digits_printed = 0; - - // If we want to print an odd number of digits from the subsegment, - if ((number_of_digits_to_print & 1) != 0) { - // Convert to 64-bit fixed-point fractional form with 1-digit - // integer part. The magic number 720575941 is a good enough - // approximation of 2^(32 + 24) / 10^8; see - // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case - // for details. - prod = ((subsegment * static_cast(720575941)) >> 24) + 1; - digits = static_cast(prod >> 32); - *buffer = static_cast('0' + digits); - number_of_digits_printed++; - } - // If we want to print an even number of digits from the - // first_subsegment, - else { - // Convert to 64-bit fixed-point fractional form with 2-digits - // integer part. The magic number 450359963 is a good enough - // approximation of 2^(32 + 20) / 10^7; see - // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case - // for details. - prod = ((subsegment * static_cast(450359963)) >> 20) + 1; - digits = static_cast(prod >> 32); - write2digits(buffer, digits); - number_of_digits_printed += 2; - } - - // Print all digit pairs. - while (number_of_digits_printed < number_of_digits_to_print) { - prod = static_cast(prod) * static_cast(100); - digits = static_cast(prod >> 32); - write2digits(buffer + number_of_digits_printed, digits); - number_of_digits_printed += 2; - } - }; - - // Print first subsegment. - print_subsegment(first_subsegment, buf.data()); - - // Perform rounding if the first subsegment is the last subsegment to - // print. - if (precision <= 9) { - // Rounding inside the subsegment. - // We round-up if: - // - either the fractional part is strictly larger than 1/2, or - // - the fractional part is exactly 1/2 and the last digit is odd. - // We rely on the following observations: - // - If fractional_part >= threshold, then the fractional part is - // strictly larger than 1/2. - // - If the MSB of fractional_part is set, then the fractional part - // must be at least 1/2. - // - When the MSB of fractional_part is set, either - // second_third_subsegments being nonzero or has_more_segments - // being true means there are further digits not printed, so the - // fractional part is strictly larger than 1/2. - if (precision < 9) { - uint32_t fractional_part = static_cast(prod); - should_round_up = - fractional_part >= fractional_part_rounding_thresholds( - 8 - number_of_digits_to_print) || - ((fractional_part >> 31) & - ((digits & 1) | (second_third_subsegments != 0) | - has_more_segments)) != 0; - } - // Rounding at the subsegment boundary. - // In this case, the fractional part is at least 1/2 if and only if - // second_third_subsegments >= 5000000000ULL, and is strictly larger - // than 1/2 if we further have either second_third_subsegments > - // 5000000000ULL or has_more_segments == true. - else { - should_round_up = second_third_subsegments > 5000000000ULL || - (second_third_subsegments == 5000000000ULL && - ((digits & 1) != 0 || has_more_segments)); - } - } - // Otherwise, print the second subsegment. - else { - // Compilers are not aware of how to leverage the maximum value of - // second_third_subsegments to find out a better magic number which - // allows us to eliminate an additional shift. 1844674407370955162 = - // ceil(2^64/10) < ceil(2^64*(10^9/(10^10 - 1))). - const uint32_t second_subsegment = - static_cast(dragonbox::umul128_upper64( - second_third_subsegments, 1844674407370955162ULL)); - const uint32_t third_subsegment = - static_cast(second_third_subsegments) - - second_subsegment * 10; - - number_of_digits_to_print = precision - 9; - print_subsegment(second_subsegment, buf.data() + 9); - - // Rounding inside the subsegment. - if (precision < 18) { - // The condition third_subsegment != 0 implies that the segment was - // of 19 digits, so in this case the third segment should be - // consisting of a genuine digit from the input. - uint32_t fractional_part = static_cast(prod); - should_round_up = - fractional_part >= fractional_part_rounding_thresholds( - 8 - number_of_digits_to_print) || - ((fractional_part >> 31) & - ((digits & 1) | (third_subsegment != 0) | - has_more_segments)) != 0; - } - // Rounding at the subsegment boundary. - else { - // In this case, the segment must be of 19 digits, thus - // the third subsegment should be consisting of a genuine digit from - // the input. - should_round_up = third_subsegment > 5 || - (third_subsegment == 5 && - ((digits & 1) != 0 || has_more_segments)); - } - } - - // Round-up if necessary. - if (should_round_up) { - ++buf[precision - 1]; - for (int i = precision - 1; i > 0 && buf[i] > '9'; --i) { - buf[i] = '0'; - ++buf[i - 1]; - } - if (buf[0] > '9') { - buf[0] = '1'; - if (fixed) - buf[precision++] = '0'; - else - ++exp; - } - } - buf.try_resize(to_unsigned(precision)); - } - } // if (digits_in_the_first_segment > precision) - else { - // Adjust the exponent for its use in Dragon4. - exp += digits_in_the_first_segment - 1; - } - } - if (use_dragon) { - auto f = basic_fp(); - bool is_predecessor_closer = binary32 ? f.assign(static_cast(value)) - : f.assign(converted_value); - if (is_predecessor_closer) dragon_flags |= dragon::predecessor_closer; - if (fixed) dragon_flags |= dragon::fixed; - // Limit precision to the maximum possible number of significant digits in - // an IEEE754 double because we don't need to generate zeros. - const int max_double_digits = 767; - if (precision > max_double_digits) precision = max_double_digits; - format_dragon(f, dragon_flags, precision, buf, exp); - } - if (!fixed && !specs.alt()) { - // Remove trailing zeros. - auto num_digits = buf.size(); - while (num_digits > 0 && buf[num_digits - 1] == '0') { - --num_digits; - ++exp; - } - buf.try_resize(num_digits); - } - return exp; -} + buffer& buf) -> int; template FMT_CONSTEXPR20 auto write_float(OutputIt out, T value, format_specs specs,