Skip to content

Commit

Permalink
Single pass date format guessing
Browse files Browse the repository at this point in the history
  • Loading branch information
jas88 committed Jul 16, 2024
1 parent 8d3a137 commit 5ea663f
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 38 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

- Single-pass, non-copying date format guessing

## [1.2.6] - 2024-07-16

- Throw exceptions on invalid conversion attempts
Expand Down
87 changes: 49 additions & 38 deletions TypeGuesser/Deciders/DateTimeTypeDecider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace TypeGuesser.Deciders;
/// Creates a new instance for detecting/parsing <see cref="DateTime"/> strings according to the <paramref name="cultureInfo"/>
/// </remarks>
/// <param name="cultureInfo"></param>
public class DateTimeTypeDecider(CultureInfo cultureInfo) : DecideTypesForStrings<DateTime>(cultureInfo, TypeCompatibilityGroup.Exclusive, typeof(DateTime))
public class DateTimeTypeDecider(CultureInfo cultureInfo):DecideTypesForStrings<DateTime>(cultureInfo,TypeCompatibilityGroup.Exclusive,typeof(DateTime))
{
private readonly TimeSpanTypeDecider _timeSpanTypeDecider = new(cultureInfo);
private readonly DecimalTypeDecider _decimalChecker = new(cultureInfo);
Expand Down Expand Up @@ -71,11 +71,11 @@ static DateTimeTypeDecider()
foreach (var d in DayFormats)
foreach (var dateSeparator in DateSeparators)
{
dateFormatsMd.Add(string.Join(dateSeparator, m, d, y));
dateFormatsMd.Add(string.Join(dateSeparator, y, m, d));
dateFormatsMd.Add(string.Join(dateSeparator,m,d,y));
dateFormatsMd.Add(string.Join(dateSeparator,y,m,d));

dateFormatsDm.Add(string.Join(dateSeparator, d, m, y));
dateFormatsMd.Add(string.Join(dateSeparator, y, m, d));
dateFormatsDm.Add(string.Join(dateSeparator,d,m,y));
dateFormatsMd.Add(string.Join(dateSeparator,y,m,d));
}

//then all the times
Expand All @@ -84,13 +84,13 @@ static DateTimeTypeDecider()
foreach (var h in HourFormats)
foreach (var m in MinuteFormats)
{
timeFormats.Add(string.Join(timeSeparator, h, m));
timeFormats.Add($"{string.Join(timeSeparator, h, m)} {suffix}");
timeFormats.Add(string.Join(timeSeparator,h,m));
timeFormats.Add($"{string.Join(timeSeparator,h,m)} {suffix}");

foreach (var s in SecondFormats)
{
timeFormats.Add(string.Join(timeSeparator, h, m, s));
timeFormats.Add($"{string.Join(timeSeparator, h, m, s)} {suffix}");
timeFormats.Add(string.Join(timeSeparator,h,m,s));
timeFormats.Add($"{string.Join(timeSeparator,h,m,s)} {suffix}");
}
}
DateFormatsDM = [.. dateFormatsDm];
Expand Down Expand Up @@ -161,11 +161,11 @@ protected override object ParseImpl(ReadOnlySpan<char> value)
{
// if user has specified a specific format that we are to use, use it
if (Settings.ExplicitDateFormats != null)
return DateTime.ParseExact(value, Settings.ExplicitDateFormats, _culture, DateTimeStyles.None);
return DateTime.ParseExact(value,Settings.ExplicitDateFormats,_culture,DateTimeStyles.None);

// otherwise parse a value using any of the valid culture formats
if (!TryBruteParse(value, out var dt))
throw new FormatException(string.Format(SR.DateTimeTypeDecider_ParseImpl_Could_not_parse___0___to_a_valid_DateTime, value.ToString()));
if (!TryBruteParse(value,out var dt))
throw new FormatException(string.Format(SR.DateTimeTypeDecider_ParseImpl_Could_not_parse___0___to_a_valid_DateTime,value.ToString()));

return dt;
}
Expand All @@ -180,56 +180,67 @@ protected override object ParseImpl(ReadOnlySpan<char> value)
/// </summary>
public void GuessDateFormat(IEnumerable<string> samples)
{
if (!AllowCultureGuessing)
return;

samples = samples.Where(static s => !string.IsNullOrWhiteSpace(s)).ToList();
var total = 0;
var simple = 0;
var m = 0;
var d = 0;

//if they are all valid anyway
if (samples.All(s => DateTime.TryParse(s, Culture, DateTimeStyles.None, out _)))
if (!AllowCultureGuessing)
return;

_dateFormatToUse = DateFormatsDM;
var countDm = samples.Count(s => TryBruteParse(s, out _));
_dateFormatToUse = DateFormatsMD;
var countMd = samples.Count(s => TryBruteParse(s, out _));
foreach (var sSample in samples.Where(static sSample => !string.IsNullOrWhiteSpace(sSample)))
{
var sample = sSample.AsSpan();
total++;
if (DateTime.TryParse(sample,Culture,DateTimeStyles.None, out _))
simple++;
else
{
_dateFormatToUse = DateFormatsDM;
if (TryBruteParse(sample, out _))
d++;
_dateFormatToUse = DateFormatsMD;
if (TryBruteParse(sample, out _))
m++;
}
}

if (countDm >= countMd)
if (simple < total && d > m)
_dateFormatToUse = DateFormatsDM;
}

/// <inheritdoc />
public override bool IsAcceptableAsType(ReadOnlySpan<char> candidateString, IDataTypeSize? size)
public override bool IsAcceptableAsType(ReadOnlySpan<char> candidateString,IDataTypeSize? size)
{
return IsExplicitDate(candidateString) || base.IsAcceptableAsType(candidateString, size);
return IsExplicitDate(candidateString) || base.IsAcceptableAsType(candidateString,size);
}

/// <inheritdoc/>
protected override bool IsAcceptableAsTypeImpl(ReadOnlySpan<char> candidateString, IDataTypeSize? sizeRecord)
protected override bool IsAcceptableAsTypeImpl(ReadOnlySpan<char> candidateString,IDataTypeSize? sizeRecord)
{
//if it's a decimal number then it isn't a date; without this check C# would happily treat "1.1" as the first of January
if (_decimalChecker.IsAcceptableAsType(candidateString, sizeRecord))
if (_decimalChecker.IsAcceptableAsType(candidateString,sizeRecord))
return false;

//likewise, if it is just the time portion of a date then we have a column with mixed dates and times, which SQL will not deal with well in the end database
//(e.g. it will set the date portion of times to today's date, which will be very confusing)
if (_timeSpanTypeDecider.IsAcceptableAsType(candidateString, sizeRecord))
if (_timeSpanTypeDecider.IsAcceptableAsType(candidateString,sizeRecord))
return false;

try
{
return TryBruteParse(candidateString, out _);
return TryBruteParse(candidateString,out _);
}
catch (Exception)
{
return false;
}
}

private bool TryBruteParse(ReadOnlySpan<char> s, out DateTime dt)
private bool TryBruteParse(ReadOnlySpan<char> s,out DateTime dt)
{
//if it's legit according to the current culture
if (DateTime.TryParse(s, Culture, DateTimeStyles.None, out dt))
if (DateTime.TryParse(s,Culture,DateTimeStyles.None,out dt))
return true;

//if there are no tokens
Expand All @@ -244,29 +255,29 @@ private bool TryBruteParse(ReadOnlySpan<char> s, out DateTime dt)
//if there is only one token it is assumed to be either a time or a date
if (sPoint == -1)
{
return TryGetTime(s, out dt) || TryGetDate(s, out dt);
return TryGetTime(s,out dt) || TryGetDate(s,out dt);
}

//if there are 2+ tokens then first token should be a date then the rest (concatenated) should be a time
//e.g. "28/2/1993 5:36:27 AM" gets evaluated as "28/2/1993" and then "5:36:27 AM"

if (TryGetDate(s[..sPoint], out dt) && TryGetTime(s[(sPoint+1)..], out var time))
if (TryGetDate(s[..sPoint],out dt) && TryGetTime(s[(sPoint+1)..],out var time))
{
dt = new DateTime(dt.Year, dt.Month, dt.Day, time.Hour, time.Minute, time.Second, time.Millisecond);
dt = new DateTime(dt.Year,dt.Month,dt.Day,time.Hour,time.Minute,time.Second,time.Millisecond);
return true;
}

dt = DateTime.MinValue;
return false;
}

private bool TryGetDate(ReadOnlySpan<char> v, out DateTime date)
private bool TryGetDate(ReadOnlySpan<char> v,out DateTime date)
{
return DateTime.TryParseExact(v, _dateFormatToUse, Culture, DateTimeStyles.AllowInnerWhite, out date);
return DateTime.TryParseExact(v,_dateFormatToUse,Culture,DateTimeStyles.AllowInnerWhite,out date);
}

private bool TryGetTime(ReadOnlySpan<char> v, out DateTime time)
private bool TryGetTime(ReadOnlySpan<char> v,out DateTime time)
{
return DateTime.TryParseExact(v, TimeFormats, Culture, DateTimeStyles.AllowInnerWhite, out time);
return DateTime.TryParseExact(v,TimeFormats,Culture,DateTimeStyles.AllowInnerWhite,out time);
}
}

0 comments on commit 5ea663f

Please sign in to comment.