-
-
Notifications
You must be signed in to change notification settings - Fork 48
/
JaroWinkler.fs
76 lines (63 loc) · 2.77 KB
/
JaroWinkler.fs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
namespace Algorithms.Strings
open Microsoft.FSharp.Collections
module JaroWinkler =
/// <summary>
/// Jaro–Winkler distance is a string metric measuring an edit distance between two
/// sequences.
/// Output value is between 0.0 and 1.0.
/// </summary>
/// <param name="str1"></param>
/// <param name="str2"></param>
/// <returns></returns>
let jaroWinkler (str1: string, str2: string): float =
let getMatchedCharacters (_str1: string, _str2: string): string =
let mutable istr1 = _str1
let mutable istr2 = _str2
let mutable matched = []
let limit =
int (floor (double (min _str1.Length str2.Length) / 2.0))
istr1
|> Seq.iteri
(fun i l ->
let left = int(max 0 (i - limit))
let right = int(min (i + limit + 1) istr2.Length)
if (istr2.[left..right - 1]).Contains(l) then
matched <- List.append matched [ (string) l ]
let myIndex = (istr2.IndexOf(l))
istr2 <- $"{istr2.[0..istr2.IndexOf(l) - 1]} {istr2.[istr2.IndexOf(l) + 1..]}")
matched |> List.fold (+) ""
// matching characters
let matching1 = getMatchedCharacters (str1, str2)
let matching2 = getMatchedCharacters (str2, str1)
let matchCount = matching1.Length
let mutable jaro = 0.0
// Transposition
let transpositions =
floor (
double (
(double)
[ for c1, c2 in List.zip [ matching1 ] [ matching2 ] do if c1 <> c2 then (c1, c2) ]
.Length
)
)
if matchCount = 0 then
jaro <- 0.0
else
jaro <-
1.0 / 3.0
* ((double) matchCount / (double) str1.Length
+ (double) matchCount / (double) str2.Length
+ ((double) matchCount - transpositions)
/ (double) matchCount)
// Common prefix up to 4 characters
let mutable prefixLen = 0
let mutable c1C2BoolList : bool list = []
if str1.Length = str2.Length then
for c1, c2 in Array.zip (str1.[..4].ToCharArray()) (str2.[..4].ToCharArray()) do
if c1 = c2 then
c1C2BoolList <- List.append c1C2BoolList [true]
else
c1C2BoolList <- List.append c1C2BoolList [false]
if (c1C2BoolList |> List.exists(fun x -> (not x))) then
prefixLen <- prefixLen + (c1C2BoolList |> List.findIndex(fun x -> (not x)))
jaro + 0.1 * (double) prefixLen * (1.0 - jaro)